search.py

'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see <http://www.gnu.org/licenses/>.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''

import threading
import re
import searx.poolrequests as requests_lib
from itertools import izip_longest, chain
from operator import itemgetter
from Queue import Queue
from time import time
from urlparse import urlparse, unquote
from searx import settings
from searx.engines import (
    categories, engines
)
from searx.languages import language_codes
from searx.utils import gen_useragent, get_blocked_engines
from searx.query import Query
from searx import logger

logger = logger.getChild('search')

number_of_searches = 0
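

# run a single engine request; any exception only increments the engine's
# error counter and is logged, so a crashing engine cannot kill its thread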
def search_request_wrapper(fn, url, engine_name, **kwargs):
    try:
        return fn(url, **kwargs)
    except:
        # increase errors stats
        engines[engine_name].stats['errors'] += 1

        # print engine name and specific error message
        logger.exception('engine crash: {0}'.format(engine_name))
        return
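

# start one thread per engine request, then join each thread with whatever
# remains of the shared timeout budget; threads still alive afterwards are
# reported as engine timeouts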
def threaded_requests(requests):
    timeout_limit = max(r[2]['timeout'] for r in requests)
    search_start = time()
    for fn, url, request_args, engine_name in requests:
        request_args['timeout'] = timeout_limit
        th = threading.Thread(
            target=search_request_wrapper,
            args=(fn, url, engine_name),
            kwargs=request_args,
            name='search_request',
        )
        th._engine_name = engine_name
        th.start()

    for th in threading.enumerate():
        if th.name == 'search_request':
            remaining_time = max(0.0, timeout_limit - (time() - search_start))
            th.join(remaining_time)
            if th.isAlive():
                logger.warning('engine timeout: {0}'.format(th._engine_name))


# get default request parameters
def default_request_params():
    return {
        'method': 'GET',
        'headers': {},
        'data': {},
        'url': '',
        'cookies': {},
        'verify': True
    }


# create a callback wrapper for the search engine results
def make_callback(engine_name, results_queue, callback, params):

    # creating a callback wrapper for the search engine results
    def process_callback(response, **kwargs):
        # compare against the True value explicitly, because resp can be a
        # Mock object and any attribute access on it returns something truthy
        if response.is_redirect is True:
            logger.debug('{0} redirect on: {1}'.format(engine_name, response))
            return

        response.search_params = params

        timeout_overhead = 0.2  # seconds
        search_duration = time() - params['started']
        timeout_limit = engines[engine_name].timeout + timeout_overhead
        if search_duration > timeout_limit:
            engines[engine_name].stats['page_load_time'] += timeout_limit
            engines[engine_name].stats['errors'] += 1
            return

        # callback
        search_results = callback(response)

        # add results
        for result in search_results:
            result['engine'] = engine_name

        results_queue.put_nowait((engine_name, search_results))

        # update stats with current page-load-time
        engines[engine_name].stats['page_load_time'] += search_duration

    return process_callback


# return the meaningful length of the content for a result
def content_result_len(content):
    if isinstance(content, basestring):
        # strip punctuation before counting; the hyphen is kept at the end of
        # the character class so it is matched literally instead of as a range
        content = re.sub('[,;:!?\./\\\\ ()_-]', '', content)
        return len(content)
    else:
        return 0
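

# results arrive grouped per engine and are interleaved with izip_longest,
# then scored by position: score = int((flat_len - i) / engines_len) * weight + 1.
# for example, with 30 interleaved results from 3 engines the first result
# gets int(30 / 3) * 1.0 + 1 = 11 and scores shrink towards the tail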
# score results and remove duplications
def score_results(results):
    # calculate scoring parameters
    flat_res = filter(
        None, chain.from_iterable(izip_longest(*results.values())))
    flat_len = len(flat_res)
    engines_len = len(results)

    results = []

    # pass 1: deduplication + scoring
    for i, res in enumerate(flat_res):

        res['parsed_url'] = urlparse(res['url'])
        res['host'] = res['parsed_url'].netloc

        if res['host'].startswith('www.'):
            res['host'] = res['host'].replace('www.', '', 1)

        res['engines'] = [res['engine']]
        weight = 1.0

        # strip multiple spaces and carriage returns from content
        if res.get('content'):
            res['content'] = re.sub(' +', ' ',
                                    res['content'].strip().replace('\n', ''))

        # get weight of this engine if possible
        if hasattr(engines[res['engine']], 'weight'):
            weight = float(engines[res['engine']].weight)

        # calculate score for that engine
        score = int((flat_len - i) / engines_len) * weight + 1

        # check for duplicates
        duplicated = False
        for new_res in results:
            # remove / from the end of the url if required
            p1 = res['parsed_url'].path[:-1]\
                if res['parsed_url'].path.endswith('/')\
                else res['parsed_url'].path
            p2 = new_res['parsed_url'].path[:-1]\
                if new_res['parsed_url'].path.endswith('/')\
                else new_res['parsed_url'].path

            # check if that result is a duplicate
            if res['host'] == new_res['host'] and\
               unquote(p1) == unquote(p2) and\
               res['parsed_url'].query == new_res['parsed_url'].query and\
               res.get('template') == new_res.get('template'):
                duplicated = new_res
                break

        # merge duplicates together
        if duplicated:
            # use the content with more text
            if content_result_len(res.get('content', '')) >\
                    content_result_len(duplicated.get('content', '')):
                duplicated['content'] = res['content']

            # increase result-score
            duplicated['score'] += score

            # add engine to list of result-engines
            duplicated['engines'].append(res['engine'])

            # use https if possible
            if duplicated['parsed_url'].scheme == 'https':
                continue
            elif res['parsed_url'].scheme == 'https':
                duplicated['url'] = res['parsed_url'].geturl()
                duplicated['parsed_url'] = res['parsed_url']

        # if no duplicate is found, append the result
        else:
            res['score'] = score
            results.append(res)

    results = sorted(results, key=itemgetter('score'), reverse=True)

    # pass 2: group results by category and template
    gresults = []
    categoryPositions = {}

    for i, res in enumerate(results):

        # FIXME : handle more than one category per engine
        category = engines[res['engine']].categories[0] + ':' + ''\
            if 'template' not in res\
            else res['template']

        current = None if category not in categoryPositions\
            else categoryPositions[category]

        # group with previous results using the same category
        # if the group can accept more results and is not too far
        # from the current position
        if current is not None and (current['count'] > 0)\
                and (len(gresults) - current['index'] < 20):
            # group this result with the previous results
            # of the same category/template
            index = current['index']
            gresults.insert(index, res)

            # update every index after the current one
            # (including the current one)
            for k in categoryPositions:
                v = categoryPositions[k]['index']
                if v >= index:
                    categoryPositions[k]['index'] = v + 1

            # update this category
            current['count'] -= 1
        else:
            # start a new group for this category/template
            gresults.append(res)

            # update categoryPositions
            categoryPositions[category] = {'index': len(gresults), 'count': 8}

    return gresults
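

# merge infobox2 into infobox1: union the url and attribute lists (keyed on
# 'url' and 'label') and keep whichever 'content' string is longer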
def merge_two_infoboxes(infobox1, infobox2):
    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []
            infobox1['urls'] = urls1

        urlSet = set()
        for url in infobox1.get('urls', []):
            urlSet.add(url.get('url', None))

        for url in infobox2.get('urls', []):
            if url.get('url', None) not in urlSet:
                urls1.append(url)

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        for attribute in infobox2.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if content_result_len(content2) > content_result_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2
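

# collapse infoboxes that share the same 'id' into a single merged entry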
def merge_infoboxes(infoboxes):
    results = []
    infoboxes_id = {}
    for infobox in infoboxes:
        add_infobox = True
        infobox_id = infobox.get('id', None)
        if infobox_id is not None:
            existingIndex = infoboxes_id.get(infobox_id, None)
            if existingIndex is not None:
                merge_two_infoboxes(results[existingIndex], infobox)
                add_infobox = False

        if add_infobox:
            results.append(infobox)
            infoboxes_id[infobox_id] = len(results) - 1

    return results
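

# engine/category resolution in Search.__init__ happens in this order: tags
# parsed from the query string itself, then the 'categories'/'engines'
# request parameters and 'category_*' toggles, then the categories cookie,
# and finally the 'general' default category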
class Search(object):

    """Search information container"""

    def __init__(self, request):
        # init vars
        super(Search, self).__init__()
        self.query = None
        self.engines = []
        self.categories = []
        self.paging = False
        self.pageno = 1
        self.lang = 'all'

        # set blocked engines
        self.blocked_engines = get_blocked_engines(engines, request.cookies)

        self.results = []
        self.suggestions = set()
        self.answers = set()
        self.infoboxes = []
        self.request_data = {}

        # set specific language if set
        if request.cookies.get('language')\
           and request.cookies['language'] in (x[0] for x in language_codes):
            self.lang = request.cookies['language']

        # set request method
        if request.method == 'POST':
            self.request_data = request.form
        else:
            self.request_data = request.args

        # TODO better exceptions
        if not self.request_data.get('q'):
            raise Exception('noquery')

        # set pagenumber
        pageno_param = self.request_data.get('pageno', '1')
        if not pageno_param.isdigit() or int(pageno_param) < 1:
            raise Exception('wrong pagenumber')

        self.pageno = int(pageno_param)

        # parse query, if tags are set, which change
        # the search engine or search-language
        query_obj = Query(self.request_data['q'], self.blocked_engines)
        query_obj.parse_query()

        # set query
        self.query = query_obj.getSearchQuery()

        # get last selected language in query, if possible
        # TODO support search with multiple languages
        if len(query_obj.languages):
            self.lang = query_obj.languages[-1]

        self.engines = query_obj.engines

        self.categories = []

        # if engines are calculated from query,
        # set categories by using that information
        if self.engines and query_obj.specific:
            self.categories = list(set(engine['category']
                                       for engine in self.engines))

        # otherwise, use the configured categories to
        # calculate which engines should be used
        else:
            # set categories/engines
            load_default_categories = True
            for pd_name, pd in self.request_data.items():
                if pd_name == 'categories':
                    self.categories.extend(categ.strip()
                                           for categ in pd.split(',')
                                           if categ in categories)
                elif pd_name == 'engines':
                    pd_engines = [{'category': engines[engine].categories[0],
                                   'name': engine}
                                  for engine in map(str.strip, pd.split(','))
                                  if engine in engines]
                    if pd_engines:
                        self.engines.extend(pd_engines)
                        load_default_categories = False
                elif pd_name.startswith('category_'):
                    category = pd_name[9:]

                    # if category is not found in list, skip
                    if category not in categories:
                        continue

                    if pd != 'off':
                        # add category to list
                        self.categories.append(category)
                    elif category in self.categories:
                        # remove category from list if property is set to 'off'
                        self.categories.remove(category)

            if not load_default_categories:
                return

            # if no category is specified for this search,
            # use the user-defined default configuration
            # (stored in the categories cookie)
            if not self.categories:
                cookie_categories = request.cookies.get('categories', '')
                cookie_categories = cookie_categories.split(',')
                for ccateg in cookie_categories:
                    if ccateg in categories:
                        self.categories.append(ccateg)

            # if still no category is specified, use 'general'
            # as the default category
            if not self.categories:
                self.categories = ['general']

            # use all engines that are declared under the
            # selected categories for this search
            for categ in self.categories:
                self.engines.extend({'category': categ,
                                     'name': engine.name}
                                    for engine in categories[categ]
                                    if (engine.name, categ) not in self.blocked_engines)

    # do search-request
    def search(self, request):
        global number_of_searches

        # init vars
        requests = []
        results_queue = Queue()
        results = {}

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # start search requests for all selected engines
        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # if paging is not supported, skip
            if self.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if self.lang != 'all' and not engine.language_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = time()
            request_params['pageno'] = self.pageno

            if hasattr(engine, 'language'):
                request_params['language'] = engine.language
            else:
                request_params['language'] = self.lang

            try:
                # 0 = None, 1 = Moderate, 2 = Strict
                request_params['safesearch'] = int(request.cookies.get('safesearch'))
            except Exception:
                request_params['safesearch'] = settings['search']['safe_search']

            # update request parameters dependent on
            # search-engine (contained in engines folder)
            engine.request(self.query.encode('utf-8'), request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(
                selected_engine['name'],
                results_queue,
                engine.response,
                request_params)

            # create a dictionary which contains all
            # information about the request
            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout,
                verify=request_params['verify']
            )

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignore empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'],
                             request_args,
                             selected_engine['name']))

        if not requests:
            return self

        # send all search requests
        threaded_requests(requests)

        while not results_queue.empty():
            engine_name, engine_results = results_queue.get_nowait()

            # TODO type checks
            [self.suggestions.add(x['suggestion'])
             for x in list(engine_results)
             if 'suggestion' in x
             and engine_results.remove(x) is None]

            [self.answers.add(x['answer'])
             for x in list(engine_results)
             if 'answer' in x
             and engine_results.remove(x) is None]

            self.infoboxes.extend(x for x in list(engine_results)
                                  if 'infobox' in x
                                  and engine_results.remove(x) is None)

            results[engine_name] = engine_results

        # update engine-specific stats
        for engine_name, engine_results in results.items():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(engine_results)

        # score results and remove duplications
        self.results = score_results(results)

        # merge infoboxes according to their ids
        self.infoboxes = merge_infoboxes(self.infoboxes)

        # update engine stats, using calculated score
        # (credit each engine that contributed to a merged result)
        for result in self.results:
            for res_engine in result['engines']:
                engines[res_engine]\
                    .stats['score_count'] += result['score']

        # return results, suggestions, answers and infoboxes
        return self
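

# Illustrative usage only (a sketch, not part of this module): the caller is
# expected to pass a Flask-style request object exposing `cookies`, `method`,
# `form`/`args` and `headers`, roughly:
#
#     search = Search(flask_request)
#     search.search(flask_request)
#     # -> search.results, search.suggestions, search.answers, search.infoboxes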