search.py

'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

import threading
import re
import searx.poolrequests as requests_lib
from itertools import izip_longest, chain
from operator import itemgetter
from Queue import Queue
from time import time
from urlparse import urlparse, unquote
from searx import settings
from searx.engines import (
    categories, engines
)
from searx.languages import language_codes
from searx.utils import gen_useragent, get_blocked_engines
from searx.query import Query
from searx import logger

logger = logger.getChild('search')

number_of_searches = 0


def search_request_wrapper(fn, url, engine_name, **kwargs):
    try:
        return fn(url, **kwargs)
    except:
        # increase error stats
        engines[engine_name].stats['errors'] += 1

        # log engine name and specific error message
        logger.exception('engine crash: {0}'.format(engine_name))
        return


def threaded_requests(requests):
    timeout_limit = max(r[2]['timeout'] for r in requests)
    search_start = time()
    for fn, url, request_args, engine_name in requests:
        request_args['timeout'] = timeout_limit
        th = threading.Thread(
            target=search_request_wrapper,
            args=(fn, url, engine_name),
            kwargs=request_args,
            name='search_request',
        )
        th._engine_name = engine_name
        th.start()
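
    # wait for every engine thread, but never longer than the shared
    # timeout_limit measured from search_start; threads still alive
    # afterwards are logged as engine timeouts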
    for th in threading.enumerate():
        if th.name == 'search_request':
            remaining_time = max(0.0, timeout_limit - (time() - search_start))
            th.join(remaining_time)
            if th.isAlive():
                logger.warning('engine timeout: {0}'.format(th._engine_name))


# get default request parameters
def default_request_params():
    return {
        'method': 'GET',
        'headers': {},
        'data': {},
        'url': '',
        'cookies': {},
        'verify': True
    }


# create a callback wrapper for the search engine results
def make_callback(engine_name, results_queue, callback, params):

    # creating a callback wrapper for the search engine results
    def process_callback(response, **kwargs):
        # compare is_redirect to the True value explicitly, because resp
        # can be a Mock object, and any attribute name returns something
        if response.is_redirect is True:
            logger.debug('{0} redirect on: {1}'.format(engine_name, response))
            return

        response.search_params = params

        timeout_overhead = 0.2  # seconds
        search_duration = time() - params['started']
        timeout_limit = engines[engine_name].timeout + timeout_overhead
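        # if the engine answered after its timeout (plus a small overhead),
        # count the request as an error and drop its results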
        if search_duration > timeout_limit:
            engines[engine_name].stats['page_load_time'] += timeout_limit
            engines[engine_name].stats['errors'] += 1
            return

        # callback
        search_results = callback(response)

        # add results
        for result in search_results:
            result['engine'] = engine_name

        results_queue.put_nowait((engine_name, search_results))

        # update stats with current page-load-time
        engines[engine_name].stats['page_load_time'] += search_duration

    return process_callback


# return the meaningful length of the content for a result
def content_result_len(content):
    if isinstance(content, basestring):
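        # strip punctuation and spaces so only meaningful characters count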
        content = re.sub('[,;:!?\./\\\\ ()_-]', '', content)
        return len(content)
    else:
        return 0


# score results and remove duplications
def score_results(results):
    # calculate scoring parameters
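    # interleave the per-engine result lists round-robin and drop the
    # None padding added by izip_longest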
    flat_res = filter(
        None, chain.from_iterable(izip_longest(*results.values())))
    flat_len = len(flat_res)
    engines_len = len(results)

    results = []

    # pass 1: deduplication + scoring
    for i, res in enumerate(flat_res):

        res['parsed_url'] = urlparse(res['url'])
        res['host'] = res['parsed_url'].netloc

        if res['host'].startswith('www.'):
            res['host'] = res['host'].replace('www.', '', 1)

        res['engines'] = [res['engine']]
        weight = 1.0

        # strip multiple spaces and carriage returns from content
        if res.get('content'):
            res['content'] = re.sub(' +', ' ',
                                    res['content'].strip().replace('\n', ''))

        # get weight of this engine if possible
        if hasattr(engines[res['engine']], 'weight'):
            weight = float(engines[res['engine']].weight)

        # calculate score for that engine
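        # (earlier positions in the interleaved list get a higher positional
        # score, scaled by the engine weight)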
        score = int((flat_len - i) / engines_len) * weight + 1

        # check for duplicates
        duplicated = False
        for new_res in results:
            # remove / from the end of the url if required
            p1 = res['parsed_url'].path[:-1]\
                if res['parsed_url'].path.endswith('/')\
                else res['parsed_url'].path
            p2 = new_res['parsed_url'].path[:-1]\
                if new_res['parsed_url'].path.endswith('/')\
                else new_res['parsed_url'].path

            # check if that result is a duplicate
            if res['host'] == new_res['host'] and\
               unquote(p1) == unquote(p2) and\
               res['parsed_url'].query == new_res['parsed_url'].query and\
               res.get('template') == new_res.get('template'):
                duplicated = new_res
                break

        # merge duplicates together
        if duplicated:
            # using content with more text
            if content_result_len(res.get('content', '')) >\
                    content_result_len(duplicated.get('content', '')):
                duplicated['content'] = res['content']

            # increase result-score
            duplicated['score'] += score

            # add engine to list of result-engines
            duplicated['engines'].append(res['engine'])

            # using https if possible
            if duplicated['parsed_url'].scheme == 'https':
                continue
            elif res['parsed_url'].scheme == 'https':
                duplicated['url'] = res['parsed_url'].geturl()
                duplicated['parsed_url'] = res['parsed_url']

        # if there is no duplicate found, append result
        else:
            res['score'] = score
            # if the result has no scheme, use http as default
            if res['parsed_url'].scheme == '':
                res['parsed_url'] = res['parsed_url']._replace(scheme="http")
            results.append(res)

    results = sorted(results, key=itemgetter('score'), reverse=True)

    # pass 2 : group results by category and template
    gresults = []
    categoryPositions = {}

    for i, res in enumerate(results):

        # FIXME : handle more than one category per engine
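        # grouping key: the result's template name when one is set,
        # otherwise the engine's first category followed by ':'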
        category = engines[res['engine']].categories[0] + ':' + ''\
            if 'template' not in res\
            else res['template']

        current = None if category not in categoryPositions\
            else categoryPositions[category]

        # group with previous results using the same category
        # if the group can accept more results and is not too far
        # from the current position
        if current is not None and (current['count'] > 0)\
                and (len(gresults) - current['index'] < 20):
            # insert this result next to the previous results
            # of the same category
            index = current['index']
            gresults.insert(index, res)

            # update every index after the current one
            # (including the current one)
            for k in categoryPositions:
                v = categoryPositions[k]['index']
                if v >= index:
                    categoryPositions[k]['index'] = v + 1

            # update this category
            current['count'] -= 1

        else:
            # start a new group for this category at the end
            gresults.append(res)

            # update categoryIndex
            categoryPositions[category] = {'index': len(gresults), 'count': 8}

    # return gresults
    return gresults
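

# merge the urls, attributes and content of infobox2 into infobox1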
def merge_two_infoboxes(infobox1, infobox2):
    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []
            infobox1['urls'] = urls1

        urlSet = set()
        for url in infobox1.get('urls', []):
            urlSet.add(url.get('url', None))

        for url in infobox2.get('urls', []):
            if url.get('url', None) not in urlSet:
                urls1.append(url)

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        for attribute in infobox2.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if content_result_len(content2) > content_result_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2
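

# merge infoboxes that share the same id into a single infobox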
def merge_infoboxes(infoboxes):
    results = []
    infoboxes_id = {}
    for infobox in infoboxes:
        add_infobox = True
        infobox_id = infobox.get('id', None)
        if infobox_id is not None:
            existingIndex = infoboxes_id.get(infobox_id, None)
            if existingIndex is not None:
                merge_two_infoboxes(results[existingIndex], infobox)
                add_infobox = False

        if add_infobox:
            results.append(infobox)
            infoboxes_id[infobox_id] = len(results) - 1

    return results


class Search(object):

    """Search information container"""

    def __init__(self, request):
        # init vars
        super(Search, self).__init__()
        self.query = None
        self.engines = []
        self.categories = []
        self.paging = False
        self.pageno = 1
        self.lang = 'all'

        # set blocked engines
        self.blocked_engines = get_blocked_engines(engines, request.cookies)

        self.results = []
        self.suggestions = set()
        self.answers = set()
        self.infoboxes = []
        self.request_data = {}

        # set specific language if set
        if request.cookies.get('language')\
           and request.cookies['language'] in (x[0] for x in language_codes):
            self.lang = request.cookies['language']

        # set request method
        if request.method == 'POST':
            self.request_data = request.form
        else:
            self.request_data = request.args

        # TODO better exceptions
        if not self.request_data.get('q'):
            raise Exception('noquery')

        # set pagenumber
        pageno_param = self.request_data.get('pageno', '1')
        if not pageno_param.isdigit() or int(pageno_param) < 1:
            raise Exception('wrong pagenumber')

        self.pageno = int(pageno_param)

        # parse query, if tags are set which change
        # the search engine or search language
        query_obj = Query(self.request_data['q'], self.blocked_engines)
        query_obj.parse_query()

        # set query
        self.query = query_obj.getSearchQuery()

        # get last selected language in query, if possible
        # TODO support search with multiple languages
        if len(query_obj.languages):
            self.lang = query_obj.languages[-1]

        self.engines = query_obj.engines

        self.categories = []

        # if engines are calculated from query,
        # set categories by using that information
        if self.engines and query_obj.specific:
            self.categories = list(set(engine['category']
                                       for engine in self.engines))

        # otherwise, use the defined categories to
        # calculate which engines should be used
        else:
            # set categories/engines
            load_default_categories = True
            for pd_name, pd in self.request_data.items():
                if pd_name == 'categories':
                    self.categories.extend(categ for categ in
                                           map(unicode.strip, pd.split(','))
                                           if categ in categories)
                elif pd_name == 'engines':
                    pd_engines = [{'category': engines[engine].categories[0],
                                   'name': engine}
                                  for engine in map(unicode.strip, pd.split(','))
                                  if engine in engines]
                    if pd_engines:
                        self.engines.extend(pd_engines)
                        load_default_categories = False
                elif pd_name.startswith('category_'):
                    category = pd_name[9:]

                    # if category is not found in list, skip
                    if category not in categories:
                        continue

                    if pd != 'off':
                        # add category to list
                        self.categories.append(category)
                    elif category in self.categories:
                        # remove category from list if property is set to 'off'
                        self.categories.remove(category)

            if not load_default_categories:
                return

            # if no category is specified for this search,
            # use the user-defined default configuration
            # (stored in a cookie)
            if not self.categories:
                cookie_categories = request.cookies.get('categories', '')
                cookie_categories = cookie_categories.split(',')
                for ccateg in cookie_categories:
                    if ccateg in categories:
                        self.categories.append(ccateg)

            # if still no category is specified, use 'general'
            # as the default category
            if not self.categories:
                self.categories = ['general']

            # use all engines for this search which are
            # declared under the selected categories
            for categ in self.categories:
                self.engines.extend({'category': categ,
                                     'name': engine.name}
                                    for engine in categories[categ]
                                    if (engine.name, categ) not in self.blocked_engines)

    # do search-request
    def search(self, request):
        global number_of_searches

        # init vars
        requests = []
        results_queue = Queue()
        results = {}

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # start search-request for all selected engines
        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # if paging is not supported, skip
            if self.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if self.lang != 'all' and not engine.language_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = time()
            request_params['pageno'] = self.pageno

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = self.lang

            try:
                # 0 = None, 1 = Moderate, 2 = Strict
                request_params['safesearch'] = int(request.cookies.get('safesearch'))
            except Exception:
                request_params['safesearch'] = settings['search']['safe_search']

            # update request parameters dependent on
            # search-engine (contained in engines folder)
            engine.request(self.query.encode('utf-8'), request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(
                selected_engine['name'],
                results_queue,
                engine.response,
                request_params)

            # create a dictionary which contains all
            # information about the request
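            # (the 'response' hook makes the underlying requests call invoke
            # process_callback as soon as the engine answers)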
            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout,
                verify=request_params['verify']
            )

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignoring empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'],
                             request_args,
                             selected_engine['name']))

        if not requests:
            return self
        # send all search-requests
        threaded_requests(requests)

        while not results_queue.empty():
            engine_name, engine_results = results_queue.get_nowait()

            # TODO type checks
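            # the comprehensions below collect suggestions/answers and, as a
            # side effect, strip those entries out of engine_results
            # (list.remove() returns None, so the final clause is always true)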
            [self.suggestions.add(x['suggestion'])
             for x in list(engine_results)
             if 'suggestion' in x
             and engine_results.remove(x) is None]

            [self.answers.add(x['answer'])
             for x in list(engine_results)
             if 'answer' in x
             and engine_results.remove(x) is None]

            self.infoboxes.extend(x for x in list(engine_results)
                                  if 'infobox' in x
                                  and engine_results.remove(x) is None)

            results[engine_name] = engine_results

        # update engine-specific stats
        for engine_name, engine_results in results.items():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(engine_results)

        # score results and remove duplications
        self.results = score_results(results)

        # merge infoboxes according to their ids
        self.infoboxes = merge_infoboxes(self.infoboxes)

        # update engine stats, using calculated score
        for result in self.results:
            for res_engine in result['engines']:
                engines[result['engine']]\
                    .stats['score_count'] += result['score']

        # return results, suggestions, answers and infoboxes
        return self