results.py
import re
from collections import defaultdict
from operator import itemgetter
from threading import RLock
from urlparse import urlparse, unquote

from searx.engines import engines
  7. CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
  8. WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)
  9. # return the meaningful length of the content for a result
  10. def result_content_len(content):
  11. if isinstance(content, basestring):
  12. return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content))
  13. else:
  14. return 0
  15. def compare_urls(url_a, url_b):
  16. # ignore www. in comparison
  17. if url_a.netloc.startswith('www.'):
  18. host_a = url_a.netloc.replace('www.', '', 1)
  19. else:
  20. host_a = url_a.netloc
  21. if url_b.netloc.startswith('www.'):
  22. host_b = url_b.netloc.replace('www.', '', 1)
  23. else:
  24. host_b = url_b.netloc
  25. if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment:
  26. return False
  27. # remove / from the end of the url if required
  28. path_a = url_a.path[:-1]\
  29. if url_a.path.endswith('/')\
  30. else url_a.path
  31. path_b = url_b.path[:-1]\
  32. if url_b.path.endswith('/')\
  33. else url_b.path
  34. return unquote(path_a) == unquote(path_b)
  35. def merge_two_infoboxes(infobox1, infobox2):
  36. # get engines weights
  37. if hasattr(engines[infobox1['engine']], 'weight'):
  38. weight1 = engines[infobox1['engine']].weight
  39. else:
  40. weight1 = 1
  41. if hasattr(engines[infobox2['engine']], 'weight'):
  42. weight2 = engines[infobox2['engine']].weight
  43. else:
  44. weight2 = 1
  45. if weight2 > weight1:
  46. infobox1['engine'] = infobox2['engine']
  47. if 'urls' in infobox2:
  48. urls1 = infobox1.get('urls', None)
  49. if urls1 is None:
  50. urls1 = []
  51. for url2 in infobox2.get('urls', []):
  52. unique_url = True
  53. for url1 in infobox1.get('urls', []):
  54. if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))):
  55. unique_url = False
  56. break
  57. if unique_url:
  58. urls1.append(url2)
  59. infobox1['urls'] = urls1
  60. if 'img_src' in infobox2:
  61. img1 = infobox1.get('img_src', None)
  62. img2 = infobox2.get('img_src')
  63. if img1 is None:
  64. infobox1['img_src'] = img2
  65. elif weight2 > weight1:
  66. infobox1['img_src'] = img2
  67. if 'attributes' in infobox2:
  68. attributes1 = infobox1.get('attributes', None)
  69. if attributes1 is None:
  70. attributes1 = []
  71. infobox1['attributes'] = attributes1
  72. attributeSet = set()
  73. for attribute in infobox1.get('attributes', []):
  74. if attribute.get('label', None) not in attributeSet:
  75. attributeSet.add(attribute.get('label', None))
  76. for attribute in infobox2.get('attributes', []):
  77. if attribute.get('label', None) not in attributeSet:
  78. attributes1.append(attribute)
  79. if 'content' in infobox2:
  80. content1 = infobox1.get('content', None)
  81. content2 = infobox2.get('content', '')
  82. if content1 is not None:
  83. if result_content_len(content2) > result_content_len(content1):
  84. infobox1['content'] = content2
  85. else:
  86. infobox1['content'] = content2
  87. def result_score(result):
  88. weight = 1.0
  89. for result_engine in result['engines']:
  90. if hasattr(engines[result_engine], 'weight'):
  91. weight *= float(engines[result_engine].weight)
  92. occurences = len(result['positions'])
  93. return sum((occurences * weight) / position for position in result['positions'])
  94. class ResultContainer(object):
  95. """docstring for ResultContainer"""
  96. def __init__(self):
  97. super(ResultContainer, self).__init__()
  98. self.results = defaultdict(list)
  99. self._merged_results = []
  100. self.infoboxes = []
  101. self.suggestions = set()
  102. self.answers = set()
  103. self.corrections = set()
  104. self._number_of_results = []
  105. self._ordered = False
  106. self.paging = False
  107. def extend(self, engine_name, results):
  108. for result in list(results):
  109. result['engine'] = engine_name
  110. if 'suggestion' in result:
  111. self.suggestions.add(result['suggestion'])
  112. results.remove(result)
  113. elif 'answer' in result:
  114. self.answers.add(result['answer'])
  115. results.remove(result)
  116. elif 'correction' in result:
  117. self.corrections.add(result['correction'])
  118. results.remove(result)
  119. elif 'infobox' in result:
  120. self._merge_infobox(result)
  121. results.remove(result)
  122. elif 'number_of_results' in result:
  123. self._number_of_results.append(result['number_of_results'])
  124. results.remove(result)
  125. if engine_name in engines:
  126. with RLock():
  127. engines[engine_name].stats['search_count'] += 1
  128. engines[engine_name].stats['result_count'] += len(results)
  129. if not results:
  130. return
  131. self.results[engine_name].extend(results)
  132. if not self.paging and engine_name in engines and engines[engine_name].paging:
  133. self.paging = True
  134. for i, result in enumerate(results):
  135. try:
  136. result['url'] = result['url'].decode('utf-8')
  137. except:
  138. pass
  139. position = i + 1
  140. self._merge_result(result, position)
  141. def _merge_infobox(self, infobox):
  142. add_infobox = True
  143. infobox_id = infobox.get('id', None)
  144. if infobox_id is not None:
  145. for existingIndex in self.infoboxes:
  146. if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)):
  147. merge_two_infoboxes(existingIndex, infobox)
  148. add_infobox = False
  149. if add_infobox:
  150. self.infoboxes.append(infobox)
  151. def _merge_result(self, result, position):
  152. result['parsed_url'] = urlparse(result['url'])
  153. # if the result has no scheme, use http as default
  154. if not result['parsed_url'].scheme:
  155. result['parsed_url'] = result['parsed_url']._replace(scheme="http")
  156. result['url'] = result['parsed_url'].geturl()
  157. result['engines'] = [result['engine']]
  158. # strip multiple spaces and cariage returns from content
  159. if result.get('content'):
  160. result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])
  161. # check for duplicates
  162. duplicated = False
  163. for merged_result in self._merged_results:
  164. if compare_urls(result['parsed_url'], merged_result['parsed_url'])\
  165. and result.get('template') == merged_result.get('template'):
  166. duplicated = merged_result
  167. break
  168. # merge duplicates together
  169. if duplicated:
  170. # using content with more text
  171. if result_content_len(result.get('content', '')) >\
  172. result_content_len(duplicated.get('content', '')):
  173. duplicated['content'] = result['content']
  174. # add the new position
  175. duplicated['positions'].append(position)
  176. # add engine to list of result-engines
  177. duplicated['engines'].append(result['engine'])
  178. # using https if possible
  179. if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
  180. duplicated['url'] = result['parsed_url'].geturl()
  181. duplicated['parsed_url'] = result['parsed_url']
  182. # if there is no duplicate found, append result
  183. else:
  184. result['positions'] = [position]
  185. with RLock():
  186. self._merged_results.append(result)
  187. def order_results(self):
  188. for result in self._merged_results:
  189. score = result_score(result)
  190. result['score'] = score
  191. with RLock():
  192. for result_engine in result['engines']:
  193. engines[result_engine].stats['score_count'] += score
  194. results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)
  195. # pass 2 : group results by category and template
  196. gresults = []
  197. categoryPositions = {}
  198. for i, res in enumerate(results):
  199. # FIXME : handle more than one category per engine
  200. category = engines[res['engine']].categories[0] + ':' + ''\
  201. if 'template' not in res\
  202. else res['template']
  203. current = None if category not in categoryPositions\
  204. else categoryPositions[category]
  205. # group with previous results using the same category
  206. # if the group can accept more result and is not too far
  207. # from the current position
  208. if current is not None and (current['count'] > 0)\
  209. and (len(gresults) - current['index'] < 20):
  210. # group with the previous results using
  211. # the same category with this one
  212. index = current['index']
  213. gresults.insert(index, res)
  214. # update every index after the current one
  215. # (including the current one)
  216. for k in categoryPositions:
  217. v = categoryPositions[k]['index']
  218. if v >= index:
  219. categoryPositions[k]['index'] = v + 1
  220. # update this category
  221. current['count'] -= 1
  222. else:
  223. # same category
  224. gresults.append(res)
  225. # update categoryIndex
  226. categoryPositions[category] = {'index': len(gresults), 'count': 8}
  227. # update _merged_results
  228. self._ordered = True
  229. self._merged_results = gresults
  230. def get_ordered_results(self):
  231. if not self._ordered:
  232. self.order_results()
  233. return self._merged_results
  234. def results_length(self):
  235. return len(self._merged_results)
  236. def results_number(self):
  237. resultnum_sum = sum(self._number_of_results)
  238. if not resultnum_sum or not self._number_of_results:
  239. return 0
  240. return resultnum_sum / len(self._number_of_results)