results.py

import re
from collections import defaultdict
from operator import itemgetter
from threading import RLock
from urlparse import urlparse, unquote

from searx.engines import engines

# characters ignored when computing the meaningful length of a result's content
CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()\-_]', re.M | re.U)
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)


# return the meaningful length of the content for a result
def result_content_len(content):
    if isinstance(content, basestring):
        return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content))
    else:
        return 0


def compare_urls(url_a, url_b):
    """Return True when both parsed URLs point to the same resource."""
    # ignore www. in comparison
    if url_a.netloc.startswith('www.'):
        host_a = url_a.netloc.replace('www.', '', 1)
    else:
        host_a = url_a.netloc

    if url_b.netloc.startswith('www.'):
        host_b = url_b.netloc.replace('www.', '', 1)
    else:
        host_b = url_b.netloc

    if host_a != host_b or url_a.query != url_b.query:
        return False

    # remove / from the end of the url if required
    path_a = url_a.path[:-1]\
        if url_a.path.endswith('/')\
        else url_a.path
    path_b = url_b.path[:-1]\
        if url_b.path.endswith('/')\
        else url_b.path

    return unquote(path_a) == unquote(path_b)
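
# Illustrative comparisons (hypothetical URLs): a leading "www.", a trailing
# slash or percent-encoding in the path does not matter, a different query does:
#
#   compare_urls(urlparse('http://www.example.com/a%20b/'),
#                urlparse('http://example.com/a b'))        # -> True
#   compare_urls(urlparse('http://example.com/?q=1'),
#                urlparse('http://example.com/?q=2'))       # -> False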


def merge_two_infoboxes(infobox1, infobox2):
    """Merge the urls, image, attributes and content of infobox2 into infobox1."""
    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []

        for url2 in infobox2.get('urls', []):
            unique_url = True
            for url1 in infobox1.get('urls', []):
                if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))):
                    unique_url = False
                    break
            if unique_url:
                urls1.append(url2)

        infobox1['urls'] = urls1

    if 'img_src' in infobox2:
        img1 = infobox1.get('img_src', None)
        img2 = infobox2.get('img_src')
        if img1 is None:
            infobox1['img_src'] = img2

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        # only append attributes whose label is not already present
        for attribute in infobox2.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            # keep the content with more meaningful text
            if result_content_len(content2) > result_content_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2
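
# Illustrative merge (hypothetical infoboxes): merging an infobox whose content
# is 'a noticeably longer description' into one whose content is 'short'
# replaces the content, because result_content_len() rates the new text higher;
# urls and attributes are only appended when no existing entry matches.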


def result_score(result):
    """Score a merged result from its positions and the weights of its engines."""
    weight = 1.0

    for result_engine in result['engines']:
        if hasattr(engines[result_engine], 'weight'):
            weight *= float(engines[result_engine].weight)

    occurrences = len(result['positions'])

    return sum((occurrences * weight) / position for position in result['positions'])
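
# Illustrative calculation (hypothetical numbers): a result reported by two
# engines of weight 1.0 at positions [1, 3] has occurrences = 2 and scores
# 2/1 + 2/3 ≈ 2.67; earlier positions and more occurrences raise the score.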


class ResultContainer(object):
    """Collect, deduplicate and order the results returned by the engines."""

    def __init__(self):
        super(ResultContainer, self).__init__()
        self.results = defaultdict(list)
        self._merged_results = []
        self.infoboxes = []
        self.suggestions = set()
        self.answers = set()
        self._number_of_results = []

    def extend(self, engine_name, results):
        # pull suggestions, answers, infoboxes and result counts out of the
        # list before the remaining standard results are merged
        for result in list(results):
            if 'suggestion' in result:
                self.suggestions.add(result['suggestion'])
                results.remove(result)
            elif 'answer' in result:
                self.answers.add(result['answer'])
                results.remove(result)
            elif 'infobox' in result:
                self._merge_infobox(result)
                results.remove(result)
            elif 'number_of_results' in result:
                self._number_of_results.append(result['number_of_results'])
                results.remove(result)

        with RLock():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(results)

        if not results:
            return

        self.results[engine_name].extend(results)

        for i, result in enumerate(results):
            try:
                result['url'] = result['url'].decode('utf-8')
            except:
                pass
            position = i + 1
            self._merge_result(result, position)

    def _merge_infobox(self, infobox):
        add_infobox = True
        infobox_id = infobox.get('id', None)
        if infobox_id is not None:
            for existing_infobox in self.infoboxes:
                if compare_urls(urlparse(existing_infobox.get('id', '')), urlparse(infobox_id)):
                    merge_two_infoboxes(existing_infobox, infobox)
                    add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)

    def _merge_result(self, result, position):
        result['parsed_url'] = urlparse(result['url'])

        # if the result has no scheme, use http as default
        if not result['parsed_url'].scheme:
            result['parsed_url'] = result['parsed_url']._replace(scheme="http")
            result['url'] = result['parsed_url'].geturl()

        result['engines'] = [result['engine']]

        # strip multiple spaces and carriage returns from content
        if result.get('content'):
            result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

        # check for duplicates
        duplicated = False
        for merged_result in self._merged_results:
            if compare_urls(result['parsed_url'], merged_result['parsed_url'])\
               and result.get('template') == merged_result.get('template'):
                duplicated = merged_result
                break

        # merge duplicates together
        if duplicated:
            # using content with more text
            if result_content_len(result.get('content', '')) >\
                    result_content_len(duplicated.get('content', '')):
                duplicated['content'] = result['content']

            # add the new position
            duplicated['positions'].append(position)

            # add engine to list of result-engines
            duplicated['engines'].append(result['engine'])

            # using https if possible
            if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
                duplicated['url'] = result['parsed_url'].geturl()
                duplicated['parsed_url'] = result['parsed_url']

        # if there is no duplicate found, append result
        else:
            result['positions'] = [position]
            with RLock():
                self._merged_results.append(result)

    def get_ordered_results(self):
        for result in self._merged_results:
            score = result_score(result)
            result['score'] = score
            with RLock():
                for result_engine in result['engines']:
                    engines[result_engine].stats['score_count'] += score

        results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

        # pass 2 : group results by category and template
        gresults = []
        categoryPositions = {}

        for i, res in enumerate(results):
            # FIXME : handle more than one category per engine
            category = engines[res['engine']].categories[0]\
                + ':' + ('' if 'template' not in res else res['template'])

            current = None if category not in categoryPositions\
                else categoryPositions[category]

            # group with previous results using the same category
            # if the group can accept more results and is not too far
            # from the current position
            if current is not None and (current['count'] > 0)\
                    and (len(gresults) - current['index'] < 20):
                # group with the previous results using
                # the same category with this one
                index = current['index']
                gresults.insert(index, res)

                # update every index after the current one
                # (including the current one)
                for k in categoryPositions:
                    v = categoryPositions[k]['index']
                    if v >= index:
                        categoryPositions[k]['index'] = v + 1

                # update this category
                current['count'] -= 1

            else:
                # start a new group for this category
                gresults.append(res)

                # update categoryIndex
                categoryPositions[category] = {'index': len(gresults), 'count': 8}

        return gresults

    def results_length(self):
        return len(self._merged_results)

    def results_number(self):
        resultnum_sum = sum(self._number_of_results)
        if not resultnum_sum or not self._number_of_results:
            return 0
        return resultnum_sum / len(self._number_of_results)
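

if __name__ == '__main__':
    # Minimal usage sketch with a hypothetical engine entry: in searx the
    # `engines` dict is filled from the configured engine modules; here a stub
    # is registered only so the stats bookkeeping in extend() has something
    # to update.
    class _StubEngine(object):
        categories = ['general']
        weight = 1.0
        stats = {'search_count': 0, 'result_count': 0, 'score_count': 0}

    engines['stub'] = _StubEngine()

    container = ResultContainer()
    container.extend('stub', [
        {'url': 'http://www.example.com/page/', 'title': 'Example',
         'content': 'short', 'engine': 'stub'},
        {'url': 'https://example.com/page', 'title': 'Example',
         'content': 'a longer description of the page', 'engine': 'stub'},
    ])

    # the two urls above are considered duplicates: the https variant and the
    # longer content win, and the result is scored from both positions
    for res in container.get_ordered_results():
        print('%.2f %s' % (res['score'], res['url']))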