search.py

'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
import requests as requests_lib
import threading
import re
from itertools import izip_longest, chain
from datetime import datetime
from operator import itemgetter
from Queue import Queue
from time import time
from urlparse import urlparse, unquote
from searx.engines import (
    categories, engines
)
from searx.languages import language_codes
from searx.utils import gen_useragent
from searx.query import Query


number_of_searches = 0

def threaded_requests(requests):
    timeout_limit = max(r[2]['timeout'] for r in requests)
    search_start = time()
    for fn, url, request_args in requests:
        th = threading.Thread(
            target=fn,
            args=(url,),
            kwargs=request_args,
            name='search_request',
        )
        th.start()

    for th in threading.enumerate():
        if th.name == 'search_request':
            remaining_time = max(0.0, timeout_limit - (time() - search_start))
            th.join(remaining_time)
            if th.isAlive():
                print('engine timeout')

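# Illustrative sketch (not part of the original module): threaded_requests()
# expects a list of (request_function, url, kwargs) tuples, one per engine,
# and joins every spawned thread until the slowest engine's timeout expires.
# A hypothetical call with two engines could look like this:
#
#   requests = [
#       (requests_lib.get, 'https://example-engine-a/search?q=searx',
#        {'headers': {}, 'cookies': {}, 'timeout': 2.0}),
#       (requests_lib.post, 'https://example-engine-b/search',
#        {'data': {'q': 'searx'}, 'headers': {}, 'cookies': {}, 'timeout': 5.0}),
#   ]
#   threaded_requests(requests)  # blocks for at most ~5 seconds (the max timeout)
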
# get default request parameters
def default_request_params():
    return {
        'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}

# create a callback wrapper for the search engine results
def make_callback(engine_name,
                  results_queue,
                  suggestions,
                  answers,
                  infoboxes,
                  callback,
                  params):

    # creating a callback wrapper for the search engine results
    def process_callback(response, **kwargs):
        cb_res = []
        response.search_params = params

        # callback
        try:
            search_results = callback(response)
        except Exception as e:
            # increase errors stats
            engines[engine_name].stats['errors'] += 1
            results_queue.put_nowait((engine_name, cb_res))

            # print engine name and specific error message
            print '[E] Error with engine "{0}":\n\t{1}'.format(
                engine_name, str(e))
            return

        # add results
        for result in search_results:
            result['engine'] = engine_name

            # if it is a suggestion, add it to list of suggestions
            if 'suggestion' in result:
                # TODO type checks
                suggestions.add(result['suggestion'])
                continue

            # if it is an answer, add it to list of answers
            if 'answer' in result:
                answers.add(result['answer'])
                continue

            # if it is an infobox, add it to list of infoboxes
            if 'infobox' in result:
                infoboxes.append(result)
                continue

            # append result
            cb_res.append(result)

        results_queue.put_nowait((engine_name, cb_res))

        # update stats with current page-load-time
        engines[engine_name].stats['page_load_time'] += \
            (datetime.now() - params['started']).total_seconds()

    return process_callback

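# Illustrative sketch (not part of the original module): an engine's response
# callback returns a list of dicts, and process_callback() routes each dict by
# its keys. Hypothetical results and where they end up:
#
#   {'suggestion': 'searx engines'}         -> added to the `suggestions` set
#   {'answer': '42'}                        -> added to the `answers` set
#   {'infobox': 'Searx', 'id': ..., ...}    -> appended to `infoboxes`
#   {'url': 'https://example.org', 'title': 'Example', 'content': '...'}
#                                           -> appended to the per-engine list
#                                              put on `results_queue`
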
# return the meaningful length of the content for a result
def content_result_len(content):
    if isinstance(content, basestring):
        content = re.sub('[,;:!?\./\\\\ ()_-]', '', content)
        return len(content)
    else:
        return 0

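# Worked example (illustrative, not part of the original module): punctuation,
# whitespace and separator characters are stripped before counting, so
#
#   content_result_len('Hello, world!')  # -> len('Helloworld') == 10
#   content_result_len(None)             # -> 0 (not a string)
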
# score results and remove duplicates
def score_results(results):
    # calculate scoring parameters
    flat_res = filter(
        None, chain.from_iterable(izip_longest(*results.values())))
    flat_len = len(flat_res)
    engines_len = len(results)

    results = []

    # pass 1: deduplication + scoring
    for i, res in enumerate(flat_res):

        res['parsed_url'] = urlparse(res['url'])
        res['host'] = res['parsed_url'].netloc

        if res['host'].startswith('www.'):
            res['host'] = res['host'].replace('www.', '', 1)

        res['engines'] = [res['engine']]
        weight = 1.0

        # strip multiple spaces and carriage returns from content
        if res.get('content'):
            res['content'] = re.sub(' +', ' ',
                                    res['content'].strip().replace('\n', ''))

        # get weight of this engine if possible
        if hasattr(engines[res['engine']], 'weight'):
            weight = float(engines[res['engine']].weight)

        # calculate score for that engine
        score = int((flat_len - i) / engines_len) * weight + 1

        # check for duplicates
        duplicated = False
        for new_res in results:
            # remove / from the end of the url if required
            p1 = res['parsed_url'].path[:-1]\
                if res['parsed_url'].path.endswith('/')\
                else res['parsed_url'].path
            p2 = new_res['parsed_url'].path[:-1]\
                if new_res['parsed_url'].path.endswith('/')\
                else new_res['parsed_url'].path

            # check if that result is a duplicate
            if res['host'] == new_res['host'] and\
               unquote(p1) == unquote(p2) and\
               res['parsed_url'].query == new_res['parsed_url'].query and\
               res.get('template') == new_res.get('template'):
                duplicated = new_res
                break

        # merge duplicates together
        if duplicated:
            # use the content with more text
            if content_result_len(res.get('content', '')) >\
                    content_result_len(duplicated.get('content', '')):
                duplicated['content'] = res['content']

            # increase result-score
            duplicated['score'] += score

            # add engine to list of result-engines
            duplicated['engines'].append(res['engine'])

            # use https if possible
            if duplicated['parsed_url'].scheme == 'https':
                continue
            elif res['parsed_url'].scheme == 'https':
                duplicated['url'] = res['parsed_url'].geturl()
                duplicated['parsed_url'] = res['parsed_url']

        # if there is no duplicate found, append result
        else:
            res['score'] = score
            results.append(res)

    results = sorted(results, key=itemgetter('score'), reverse=True)

    # pass 2: group results by category and template
    gresults = []
    categoryPositions = {}

    for i, res in enumerate(results):
        # FIXME: handle more than one category per engine
        category = engines[res['engine']].categories[0] + ':' + ''\
            if 'template' not in res\
            else res['template']

        current = None if category not in categoryPositions\
            else categoryPositions[category]

        # group with previous results using the same category
        # if the group can accept more results and is not too far
        # from the current position
        if current is not None and (current['count'] > 0)\
                and (len(gresults) - current['index'] < 20):
            # group with the previous results using
            # the same category with this one
            index = current['index']
            gresults.insert(index, res)

            # update every index after the current one
            # (including the current one)
            for k in categoryPositions:
                v = categoryPositions[k]['index']
                if v >= index:
                    categoryPositions[k]['index'] = v + 1

            # update this category
            current['count'] -= 1

        else:
            # start a new group for this category
            gresults.append(res)

            # update categoryPositions
            categoryPositions[category] = {'index': len(gresults), 'count': 8}

    # return gresults
    return gresults

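# Worked example (illustrative, hypothetical numbers): with 3 engines returning
# 30 interleaved results in total (flat_len == 30, engines_len == 3), the
# result at position i == 0 from an engine with weight 1.0 gets
#
#   score = int((30 - 0) / 3) * 1.0 + 1 == 11.0
#
# while the result at position i == 29 gets int(1 / 3) * 1.0 + 1 == 1.0.
# Duplicate URLs are merged and their scores added, so a hit returned by
# several engines ranks above a hit returned by only one.
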
def merge_two_infoboxes(infobox1, infobox2):
    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []
            infobox1['urls'] = urls1

        urlSet = set()
        for url in infobox1.get('urls', []):
            urlSet.add(url.get('url', None))

        for url in infobox2.get('urls', []):
            if url.get('url', None) not in urlSet:
                urls1.append(url)
    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        for attribute in infobox2.get('attributes', []):
            # only add attributes whose label is not already present
            if attribute.get('label', None) not in attributeSet:
                attributes1.append(attribute)
    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if content_result_len(content2) > content_result_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2

def merge_infoboxes(infoboxes):
    results = []
    infoboxes_id = {}
    for infobox in infoboxes:
        add_infobox = True
        infobox_id = infobox.get('id', None)
        if infobox_id is not None:
            existingIndex = infoboxes_id.get(infobox_id, None)
            if existingIndex is not None:
                merge_two_infoboxes(results[existingIndex], infobox)
                add_infobox = False

        if add_infobox:
            results.append(infobox)
            infoboxes_id[infobox_id] = len(results) - 1

    return results

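# Illustrative sketch (not part of the original module): infoboxes sharing the
# same 'id' (typically a canonical URL for the entity) are merged into one
# entry, keeping the longer content and the union of urls:
#
#   merged = merge_infoboxes([
#       {'id': 'https://en.wikipedia.org/wiki/Searx', 'content': 'short',
#        'urls': [{'title': 'Wikipedia',
#                  'url': 'https://en.wikipedia.org/wiki/Searx'}]},
#       {'id': 'https://en.wikipedia.org/wiki/Searx',
#        'content': 'a longer description',
#        'urls': [{'title': 'Homepage',
#                  'url': 'https://asciimoo.github.io/searx/'}]},
#   ])
#   # -> one infobox with content 'a longer description' and both urls
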
class Search(object):

    """Search information container"""

    def __init__(self, request):
        # init vars
        super(Search, self).__init__()
        self.query = None
        self.engines = []
        self.categories = []
        self.paging = False
        self.pageno = 1
        self.lang = 'all'

        # set blocked engines
        if request.cookies.get('blocked_engines'):
            self.blocked_engines = request.cookies['blocked_engines'].split(',')  # noqa
        else:
            self.blocked_engines = []

        self.results = []
        self.suggestions = []
        self.answers = []
        self.infoboxes = []
        self.request_data = {}

        # set specific language if set
        if request.cookies.get('language')\
           and request.cookies['language'] in (x[0] for x in language_codes):
            self.lang = request.cookies['language']

        # set request method
        if request.method == 'POST':
            self.request_data = request.form
        else:
            self.request_data = request.args

        # TODO better exceptions
        if not self.request_data.get('q'):
            raise Exception('noquery')

        # set page number
        pageno_param = self.request_data.get('pageno', '1')
        if not pageno_param.isdigit() or int(pageno_param) < 1:
            raise Exception('wrong pagenumber')

        self.pageno = int(pageno_param)

        # parse query, if tags are set, which change
        # the search engine or search-language
        query_obj = Query(self.request_data['q'], self.blocked_engines)
        query_obj.parse_query()

        # set query
        self.query = query_obj.getSearchQuery()

        # get last selected language in query, if possible
        # TODO support search with multiple languages
        if len(query_obj.languages):
            self.lang = query_obj.languages[-1]

        self.engines = query_obj.engines

        self.categories = []

        # if engines are calculated from query,
        # set categories by using that information
        if self.engines:
            self.categories = list(set(engine['category']
                                       for engine in self.engines))

        # otherwise, use the selected categories to
        # calculate which engines should be used
        else:
            # set used categories
            for pd_name, pd in self.request_data.items():
                if pd_name.startswith('category_'):
                    category = pd_name[9:]
                    # if category is not found in list, skip
                    if category not in categories:
                        continue

                    # add category to list
                    self.categories.append(category)

            # if no category is specified for this search,
            # use the user-defined default configuration
            # (which is stored in a cookie)
            if not self.categories:
                cookie_categories = request.cookies.get('categories', '')
                cookie_categories = cookie_categories.split(',')
                for ccateg in cookie_categories:
                    if ccateg in categories:
                        self.categories.append(ccateg)

            # if still no category is specified, use 'general'
            # as the default category
            if not self.categories:
                self.categories = ['general']

            # use all engines for this search that are
            # declared under the selected categories
            for categ in self.categories:
                self.engines.extend({'category': categ,
                                     'name': x.name}
                                    for x in categories[categ]
                                    if x.name not in self.blocked_engines)

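    # Illustrative sketch (not part of the original module): after __init__,
    # self.engines is a list of {'category': ..., 'name': ...} dicts, either
    # parsed out of the query itself by searx.query.Query or expanded from the
    # selected categories. For a plain query with only the 'general' category
    # selected it could look like (engine names hypothetical):
    #
    #   [{'category': 'general', 'name': 'duckduckgo'},
    #    {'category': 'general', 'name': 'wikipedia'}]
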
    # do search-request
    def search(self, request):
        global number_of_searches

        # init vars
        requests = []
        results_queue = Queue()
        suggestions = set()
        answers = set()
        infoboxes = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # start search request for all selected engines
        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # if paging is not supported, skip
            if self.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if self.lang != 'all' and not engine.language_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = datetime.now()
            request_params['pageno'] = self.pageno
            request_params['language'] = self.lang

            # update request parameters dependent on
            # search-engine (contained in engines folder)
            request_params = engine.request(self.query.encode('utf-8'),
                                            request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(
                selected_engine['name'],
                results_queue,
                suggestions,
                answers,
                infoboxes,
                engine.response,
                request_params
            )

            # create a dictionary which contains all
            # information about the request
            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout
            )

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignore empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'], request_args))

        # send all search requests
        threaded_requests(requests)

        results = {}
        while not results_queue.empty():
            engine_name, engine_results = results_queue.get_nowait()
            results[engine_name] = engine_results

        # update engine-specific stats
        for engine_name, engine_results in results.items():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(engine_results)

        # score results and remove duplicates
        results = score_results(results)

        # merge infoboxes according to their ids
        infoboxes = merge_infoboxes(infoboxes)
        # update engine stats, using calculated score
        for result in results:
            for res_engine in result['engines']:
                engines[res_engine]\
                    .stats['score_count'] += result['score']

        # return results, suggestions, answers and infoboxes
        return results, suggestions, answers, infoboxes
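
# Illustrative usage sketch (not part of the original module): the class is
# normally driven by searx's Flask webapp, which builds a Search from the
# incoming HTTP request and renders the returned values:
#
#   search = Search(flask.request)   # parses q, pageno, categories, cookies
#   results, suggestions, answers, infoboxes = search.search(flask.request)
#
# `results` is the scored, deduplicated, category-grouped list produced by
# score_results(); the other three collect engine suggestions, direct answers
# and merged infoboxes.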