search.py

'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

import threading
import re
import searx.poolrequests as requests_lib
from itertools import izip_longest, chain
from operator import itemgetter
from Queue import Queue
from time import time
from urlparse import urlparse, unquote
from searx import settings
from searx.engines import (
    categories, engines
)
from searx.languages import language_codes
from searx.utils import gen_useragent, get_blocked_engines
from searx.query import Query
from searx import logger

logger = logger.getChild('search')

number_of_searches = 0


def search_request_wrapper(fn, url, engine_name, **kwargs):
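    """Perform one engine HTTP request, never letting an exception escape.

    ``fn`` is expected to be a function from ``searx.poolrequests`` (GET or
    POST).  On failure the engine's error counter is incremented, the
    exception is logged and ``None`` is returned instead of a response.
    """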
    try:
        return fn(url, **kwargs)
    except:
        # increase errors stats
        engines[engine_name].stats['errors'] += 1

        # log engine name and specific error message
        logger.exception('engine crash: {0}'.format(engine_name))
        return


def threaded_requests(requests):
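    """Fire off all engine requests in parallel threads and wait for them.

    Every thread gets the same timeout (the largest per-engine timeout in
    the batch).  Threads that are still alive once that budget is used up
    are only logged as timed out; they are not killed.
    """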
    timeout_limit = max(r[2]['timeout'] for r in requests)
    search_start = time()
    for fn, url, request_args, engine_name in requests:
        request_args['timeout'] = timeout_limit
        th = threading.Thread(
            target=search_request_wrapper,
            args=(fn, url, engine_name),
            kwargs=request_args,
            name='search_request',
        )
        th._engine_name = engine_name
        th.start()

    for th in threading.enumerate():
        if th.name == 'search_request':
            remaining_time = max(0.0, timeout_limit - (time() - search_start))
            th.join(remaining_time)
            if th.isAlive():
                logger.warning('engine timeout: {0}'.format(th._engine_name))


# get default request parameters
def default_request_params():
    return {
        'method': 'GET',
        'headers': {},
        'data': {},
        'url': '',
        'cookies': {},
        'verify': True
    }


# create a callback wrapper for the search engine results
def make_callback(engine_name, results_queue, callback, params):
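    """Wrap an engine's ``response`` callback for use as a requests hook.

    The returned ``process_callback`` drops redirects and responses that
    arrive after the engine's timeout (plus a small overhead); otherwise it
    parses the response via ``callback``, tags every result with the engine
    name, pushes the result list onto ``results_queue`` and updates the
    engine's page-load-time statistics.
    """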

    # create a callback wrapper for the search engine results
    def process_callback(response, **kwargs):
        # compare the redirect flag to the True value explicitly, because
        # resp can be a Mock object, and any attribute access on it
        # returns something
        if response.is_redirect is True:
            logger.debug('{0} redirect on: {1}'.format(engine_name, response))
            return

        response.search_params = params

        timeout_overhead = 0.2  # seconds
        search_duration = time() - params['started']
        timeout_limit = engines[engine_name].timeout + timeout_overhead
        if search_duration > timeout_limit:
            engines[engine_name].stats['page_load_time'] += timeout_limit
            engines[engine_name].stats['errors'] += 1
            return

        # callback
        search_results = callback(response)

        # add results
        for result in search_results:
            result['engine'] = engine_name

        results_queue.put_nowait((engine_name, search_results))

        # update stats with current page-load-time
        engines[engine_name].stats['page_load_time'] += search_duration

    return process_callback


# return the meaningful length of the content for a result
def content_result_len(content):
    if isinstance(content, basestring):
        content = re.sub('[,;:!?\./\\\\ ()-_]', '', content)
        return len(content)
    else:
        return 0


# score results and remove duplications
def score_results(results):
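    """Merge the per-engine result lists into one scored, ordered list.

    Pass 1 interleaves the engine result lists, merges duplicates (same
    host, path, query string and template), sums their scores, collects
    their engine names and prefers the https variant of a URL.  A result
    at interleaved position ``i`` contributes
    ``int((flat_len - i) / engines_len) * weight + 1`` to its score.
    Pass 2 regroups the score-sorted list so results sharing a
    category/template stay close together: each group accepts at most 8
    follow-up results, and only while its anchor is fewer than 20
    positions back.
    """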
    # calculate scoring parameters
    flat_res = filter(
        None, chain.from_iterable(izip_longest(*results.values())))
    flat_len = len(flat_res)
    engines_len = len(results)

    results = []

    # pass 1: deduplication + scoring
    for i, res in enumerate(flat_res):

        res['parsed_url'] = urlparse(res['url'])

        res['host'] = res['parsed_url'].netloc

        if res['host'].startswith('www.'):
            res['host'] = res['host'].replace('www.', '', 1)

        res['engines'] = [res['engine']]

        weight = 1.0

        # strip multiple spaces and carriage returns from content
        if res.get('content'):
            res['content'] = re.sub(' +', ' ',
                                    res['content'].strip().replace('\n', ''))

        # get weight of this engine if possible
        if hasattr(engines[res['engine']], 'weight'):
            weight = float(engines[res['engine']].weight)

        # calculate score for that engine
        score = int((flat_len - i) / engines_len) * weight + 1

        # check for duplicates
        duplicated = False
        for new_res in results:
            # remove / from the end of the url if required
            p1 = res['parsed_url'].path[:-1]\
                if res['parsed_url'].path.endswith('/')\
                else res['parsed_url'].path
            p2 = new_res['parsed_url'].path[:-1]\
                if new_res['parsed_url'].path.endswith('/')\
                else new_res['parsed_url'].path

            # check if that result is a duplicate
            if res['host'] == new_res['host'] and\
               unquote(p1) == unquote(p2) and\
               res['parsed_url'].query == new_res['parsed_url'].query and\
               res.get('template') == new_res.get('template'):
                duplicated = new_res
                break

        # merge duplicates together
        if duplicated:
            # use the content with more text
            if content_result_len(res.get('content', '')) >\
                    content_result_len(duplicated.get('content', '')):
                duplicated['content'] = res['content']

            # increase result score
            duplicated['score'] += score

            # add engine to list of result engines
            duplicated['engines'].append(res['engine'])

            # use https if possible
            if duplicated['parsed_url'].scheme == 'https':
                continue
            elif res['parsed_url'].scheme == 'https':
                duplicated['url'] = res['parsed_url'].geturl()
                duplicated['parsed_url'] = res['parsed_url']

        # if no duplicate is found, append the result
        else:
            res['score'] = score
            # if the result has no scheme, use http as default
            if res['parsed_url'].scheme == '':
                res['parsed_url'] = res['parsed_url']._replace(scheme="http")
            results.append(res)

    results = sorted(results, key=itemgetter('score'), reverse=True)

    # pass 2: group results by category and template
    gresults = []
    categoryPositions = {}

    for i, res in enumerate(results):
        # FIXME : handle more than one category per engine
        category = engines[res['engine']].categories[0] + ':' + ''\
            if 'template' not in res\
            else res['template']

        current = None if category not in categoryPositions\
            else categoryPositions[category]

        # group with previous results using the same category
        # if the group can accept more results and is not too far
        # from the current position
        if current is not None and (current['count'] > 0)\
                and (len(gresults) - current['index'] < 20):
            # group this result with the previous results
            # of the same category
            index = current['index']
            gresults.insert(index, res)

            # update every index after the current one
            # (including the current one)
            for k in categoryPositions:
                v = categoryPositions[k]['index']
                if v >= index:
                    categoryPositions[k]['index'] = v + 1

            # update this category
            current['count'] -= 1

        else:
            # start a new group for this category
            gresults.append(res)

            # update categoryPositions
            categoryPositions[category] = {'index': len(gresults), 'count': 8}

    # return gresults
    return gresults


def merge_two_infoboxes(infobox1, infobox2):
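    """Merge ``infobox2`` into ``infobox1`` in place.

    URLs and attribute labels that ``infobox1`` does not already have are
    appended from ``infobox2``, and whichever 'content' string is longer
    (by meaningful length) is kept.
    """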
    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []
            infobox1['urls'] = urls1

        urlSet = set()
        for url in infobox1.get('urls', []):
            urlSet.add(url.get('url', None))

        for url in infobox2.get('urls', []):
            if url.get('url', None) not in urlSet:
                urls1.append(url)

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        # only add attributes whose label is not already present
        for attribute in infobox2.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if content_result_len(content2) > content_result_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2


def merge_infoboxes(infoboxes):
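    """Collapse a list of infoboxes so that each distinct id appears once.

    Infoboxes sharing an id are merged via ``merge_two_infoboxes``;
    infoboxes without an id are always kept as they are.
    """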
    results = []
    infoboxes_id = {}

    for infobox in infoboxes:
        add_infobox = True
        infobox_id = infobox.get('id', None)
        if infobox_id is not None:
            existingIndex = infoboxes_id.get(infobox_id, None)
            if existingIndex is not None:
                merge_two_infoboxes(results[existingIndex], infobox)
                add_infobox = False

        if add_infobox:
            results.append(infobox)
            infoboxes_id[infobox_id] = len(results) - 1

    return results


class Search(object):

    """Search information container"""

    def __init__(self, request):
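        """Build the search state from an incoming HTTP request.

        Reads the query, page number, language, categories and engine
        selection from the request parameters and cookies, honouring the
        user's blocked engines, and falls back to the 'general' category
        when nothing else is selected.
        """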
        # init vars
        super(Search, self).__init__()
        self.query = None
        self.engines = []
        self.categories = []
        self.paging = False
        self.pageno = 1
        self.lang = 'all'

        # set blocked engines
        self.blocked_engines = get_blocked_engines(engines, request.cookies)

        self.results = []
        self.suggestions = set()
        self.answers = set()
        self.infoboxes = []
        self.request_data = {}

        # set specific language if set
        if request.cookies.get('language')\
           and request.cookies['language'] in (x[0] for x in language_codes):
            self.lang = request.cookies['language']

        # set request method
        if request.method == 'POST':
            self.request_data = request.form
        else:
            self.request_data = request.args

        # TODO better exceptions
        if not self.request_data.get('q'):
            raise Exception('noquery')

        # set page number
        pageno_param = self.request_data.get('pageno', '1')
        if not pageno_param.isdigit() or int(pageno_param) < 1:
            raise Exception('wrong pagenumber')

        self.pageno = int(pageno_param)

        # parse the query for tags which change the
        # search engine or the search language
        query_obj = Query(self.request_data['q'], self.blocked_engines)
        query_obj.parse_query()

        # set query
        self.query = query_obj.getSearchQuery()

        # get last selected language in query, if possible
        # TODO support search with multiple languages
        if len(query_obj.languages):
            self.lang = query_obj.languages[-1]

        self.engines = query_obj.engines

        self.categories = []

        # if engines are calculated from the query,
        # set categories by using that information
        if self.engines and query_obj.specific:
            self.categories = list(set(engine['category']
                                       for engine in self.engines))

        # otherwise, use the defined categories to
        # calculate which engines should be used
        else:
            # set categories/engines
            load_default_categories = True
            for pd_name, pd in self.request_data.items():
                if pd_name == 'categories':
                    self.categories.extend(
                        categ for categ in map(unicode.strip, pd.split(','))
                        if categ in categories)
                elif pd_name == 'engines':
                    pd_engines = [{'category': engines[engine].categories[0],
                                   'name': engine}
                                  for engine in map(unicode.strip,
                                                    pd.split(','))
                                  if engine in engines]
                    if pd_engines:
                        self.engines.extend(pd_engines)
                        load_default_categories = False
                elif pd_name.startswith('category_'):
                    category = pd_name[9:]

                    # if the category is not found in the list, skip it
                    if category not in categories:
                        continue

                    if pd != 'off':
                        # add category to list
                        self.categories.append(category)
                    elif category in self.categories:
                        # remove category from list if property is set to 'off'
                        self.categories.remove(category)

            if not load_default_categories:
                if not self.categories:
                    self.categories = list(set(engine['category']
                                               for engine in self.engines))
                return

            # if no category is specified for this search,
            # use the user-defined default configuration
            # (which is stored in a cookie)
            if not self.categories:
                cookie_categories = request.cookies.get('categories', '')
                cookie_categories = cookie_categories.split(',')
                for ccateg in cookie_categories:
                    if ccateg in categories:
                        self.categories.append(ccateg)

            # if still no category is specified, use 'general'
            # as the default category
            if not self.categories:
                self.categories = ['general']

            # use all engines for this search which are
            # declared under the selected categories
            for categ in self.categories:
                self.engines.extend({'category': categ,
                                     'name': engine.name}
                                    for engine in categories[categ]
                                    if (engine.name, categ) not in self.blocked_engines)

    # do the search request
    def search(self, request):
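        """Send the query to all selected engines and collect the results.

        Builds one HTTP request per usable engine (respecting paging and
        language support), runs them in parallel via ``threaded_requests``,
        then separates suggestions, answers and infoboxes from the regular
        results, scores and deduplicates the latter with ``score_results``
        and updates the per-engine statistics.  Returns ``self``.
        """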
        global number_of_searches

        # init vars
        requests = []
        results_queue = Queue()
        results = {}

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # start search request for all selected engines
        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # if paging is not supported, skip
            if self.pageno > 1 and not engine.paging:
                continue

            # if a search language is set and the engine does not
            # provide language support, skip
            if self.lang != 'all' and not engine.language_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = time()
            request_params['pageno'] = self.pageno

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = self.lang

            try:
                # 0 = None, 1 = Moderate, 2 = Strict
                request_params['safesearch'] = int(
                    request.cookies.get('safesearch'))
            except Exception:
                request_params['safesearch'] = settings['search']['safe_search']

            # update request parameters dependent on
            # the search engine (contained in the engines folder)
            engine.request(self.query.encode('utf-8'), request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(
                selected_engine['name'],
                results_queue,
                engine.response,
                request_params)

            # create a dictionary which contains all
            # information about the request
            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout,
                verify=request_params['verify']
            )

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignore empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'],
                             request_args,
                             selected_engine['name']))

        if not requests:
            return self
        # send all search requests
        threaded_requests(requests)

        while not results_queue.empty():
            engine_name, engine_results = results_queue.get_nowait()

            # TODO type checks
            [self.suggestions.add(x['suggestion'])
             for x in list(engine_results)
             if 'suggestion' in x
             and engine_results.remove(x) is None]

            [self.answers.add(x['answer'])
             for x in list(engine_results)
             if 'answer' in x
             and engine_results.remove(x) is None]

            self.infoboxes.extend(x for x in list(engine_results)
                                  if 'infobox' in x
                                  and engine_results.remove(x) is None)

            results[engine_name] = engine_results

        # update engine-specific stats
        for engine_name, engine_results in results.items():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(engine_results)

        # score results and remove duplications
        self.results = score_results(results)

        # merge infoboxes according to their ids
        self.infoboxes = merge_infoboxes(self.infoboxes)

        # update engine stats, using calculated score
        for result in self.results:
            for res_engine in result['engines']:
                engines[res_engine]\
                    .stats['score_count'] += result['score']

        # return results, suggestions, answers and infoboxes
        return self
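

# Usage sketch: in searx the Flask webapp layer constructs a Search from the
# incoming request and then runs it.  `flask_request` below is only an
# illustrative name for that Flask request object.
#
#     search = Search(flask_request)
#     search.search(flask_request)
#     results = search.results          # scored, deduplicated result dicts
#     suggestions = search.suggestions  # set of suggestion strings
#     infoboxes = search.infoboxes      # merged infoboxes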