# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring

import re
from collections import defaultdict
from operator import itemgetter
from threading import RLock
from typing import List, NamedTuple, Set
from urllib.parse import urlparse, unquote

from searx import logger
from searx import utils
from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error

CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)


# return the meaningful length of the content for a result
def result_content_len(content):
    if isinstance(content, str):
        return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content))
    return 0
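
# Illustrative sketch (not part of the original module): result_content_len
# measures content length after stripping the characters matched by
# CONTENT_LEN_IGNORED_CHARS_REGEX, so e.g.
#
#   >>> result_content_len('foo, bar - baz!')
#   9
#
# while any non-string content counts as zero.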


def compare_urls(url_a, url_b):
    """Lazy comparison between two URLs.

    "www.example.com" and "example.com" are equal.
    "www.example.com/path/" and "www.example.com/path" are equal.
    "https://www.example.com/" and "http://www.example.com/" are equal.

    Args:
        url_a (ParseResult): first URL
        url_b (ParseResult): second URL

    Returns:
        bool: True if url_a and url_b are equal
    """
    # ignore www. in comparison
    if url_a.netloc.startswith('www.'):
        host_a = url_a.netloc.replace('www.', '', 1)
    else:
        host_a = url_a.netloc
    if url_b.netloc.startswith('www.'):
        host_b = url_b.netloc.replace('www.', '', 1)
    else:
        host_b = url_b.netloc

    if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment:
        return False

    # remove / from the end of the url if required
    path_a = url_a.path[:-1] if url_a.path.endswith('/') else url_a.path
    path_b = url_b.path[:-1] if url_b.path.endswith('/') else url_b.path

    return unquote(path_a) == unquote(path_b)
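
# Illustrative usage sketch (assumed inputs, not part of the original module):
# the arguments are parsed URLs, and "www.", a trailing slash and the scheme
# are ignored in the comparison:
#
#   >>> compare_urls(urlparse('https://www.example.com/path/'),
#   ...              urlparse('http://example.com/path'))
#   True
#   >>> compare_urls(urlparse('https://example.com/?q=a'),
#   ...              urlparse('https://example.com/?q=b'))
#   False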


def merge_two_infoboxes(infobox1, infobox2):  # pylint: disable=too-many-branches, too-many-statements
    # get engines weights
    if hasattr(engines[infobox1['engine']], 'weight'):
        weight1 = engines[infobox1['engine']].weight
    else:
        weight1 = 1
    if hasattr(engines[infobox2['engine']], 'weight'):
        weight2 = engines[infobox2['engine']].weight
    else:
        weight2 = 1

    if weight2 > weight1:
        infobox1['engine'] = infobox2['engine']

    infobox1['engines'] |= infobox2['engines']

    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []

        for url2 in infobox2.get('urls', []):
            unique_url = True
            parsed_url2 = urlparse(url2.get('url', ''))
            entity_url2 = url2.get('entity')
            for url1 in urls1:
                if (entity_url2 is not None and url1.get('entity') == entity_url2) or compare_urls(
                    urlparse(url1.get('url', '')), parsed_url2
                ):
                    unique_url = False
                    break
            if unique_url:
                urls1.append(url2)

        infobox1['urls'] = urls1

    if 'img_src' in infobox2:
        img1 = infobox1.get('img_src', None)
        img2 = infobox2.get('img_src')
        if img1 is None:
            infobox1['img_src'] = img2
        elif weight2 > weight1:
            infobox1['img_src'] = img2

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes')
        if attributes1 is None:
            infobox1['attributes'] = attributes1 = []

        attributeSet = set()
        for attribute in attributes1:
            label = attribute.get('label')
            if label not in attributeSet:
                attributeSet.add(label)
            entity = attribute.get('entity')
            if entity not in attributeSet:
                attributeSet.add(entity)

        for attribute in infobox2.get('attributes', []):
            if attribute.get('label') not in attributeSet and attribute.get('entity') not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if result_content_len(content2) > result_content_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2


def result_score(result, priority):
    weight = 1.0

    for result_engine in result['engines']:
        if hasattr(engines[result_engine], 'weight'):
            weight *= float(engines[result_engine].weight)

    weight *= len(result['positions'])
    score = 0

    for position in result['positions']:
        if priority == 'low':
            continue
        if priority == 'high':
            score += weight
        else:
            score += weight / position

    return score
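
# Worked example (illustrative assumption, not part of the original module):
# for a result returned by a single engine of weight 1.0 at positions [1, 3]
# with no special priority, weight becomes 1.0 * 2 = 2.0 and the score is
# 2.0/1 + 2.0/3 ≈ 2.67; with priority 'high' each hit adds the full weight
# (score 4.0), and with priority 'low' the score stays 0.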


class Timing(NamedTuple):  # pylint: disable=missing-class-docstring
    engine: str
    total: float
    load: float


class UnresponsiveEngine(NamedTuple):  # pylint: disable=missing-class-docstring
    engine: str
    error_type: str
    suspended: bool


class ResultContainer:
    """Container that collects the results returned by the engines for one
    search query, then merges, scores and orders them."""

    __slots__ = (
        '_merged_results',
        'infoboxes',
        'suggestions',
        'answers',
        'corrections',
        '_number_of_results',
        '_closed',
        'paging',
        'unresponsive_engines',
        'timings',
        'redirect_url',
        'engine_data',
        'on_result',
        '_lock',
    )

    def __init__(self):
        super().__init__()
        self._merged_results = []
        self.infoboxes = []
        self.suggestions = set()
        self.answers = {}
        self.corrections = set()
        self._number_of_results = []
        self.engine_data = defaultdict(dict)
        self._closed = False
        self.paging = False
        self.unresponsive_engines: Set[UnresponsiveEngine] = set()
        self.timings: List[Timing] = []
        self.redirect_url = None
        self.on_result = lambda _: True
        self._lock = RLock()

    def extend(self, engine_name, results):  # pylint: disable=too-many-branches
        if self._closed:
            return

        standard_result_count = 0
        error_msgs = set()
        for result in list(results):
            result['engine'] = engine_name
            if 'suggestion' in result and self.on_result(result):
                self.suggestions.add(result['suggestion'])
            elif 'answer' in result and self.on_result(result):
                self.answers[result['answer']] = result
            elif 'correction' in result and self.on_result(result):
                self.corrections.add(result['correction'])
            elif 'infobox' in result and self.on_result(result):
                self._merge_infobox(result)
            elif 'number_of_results' in result and self.on_result(result):
                self._number_of_results.append(result['number_of_results'])
            elif 'engine_data' in result and self.on_result(result):
                self.engine_data[engine_name][result['key']] = result['engine_data']
            elif 'url' in result:
                # standard result (url, title, content)
                if not self._is_valid_url_result(result, error_msgs):
                    continue
                # normalize the result
                self._normalize_url_result(result)
                # on_result calls searx.search.SearchWithPlugins._on_result,
                # which calls the plugins
                if not self.on_result(result):
                    continue
                self.__merge_url_result(result, standard_result_count + 1)
                standard_result_count += 1
            elif self.on_result(result):
                self.__merge_result_no_url(result, standard_result_count + 1)
                standard_result_count += 1

        if len(error_msgs) > 0:
            for msg in error_msgs:
                count_error(engine_name, 'some results are invalids: ' + msg, secondary=True)

        if engine_name in engines:
            histogram_observe(standard_result_count, 'engine', engine_name, 'result', 'count')

        if not self.paging and engine_name in engines and engines[engine_name].paging:
            self.paging = True
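
    # Illustrative sketch (assumed inputs, not part of the original module):
    # extend() dispatches on the keys of each result dict, so for instance
    # {'suggestion': 'searx'} ends up in self.suggestions, while
    # {'url': 'https://example.com/', 'title': 'Example', 'content': '...'}
    # is validated, normalized and merged as a standard URL result.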

    def _merge_infobox(self, infobox):
        add_infobox = True
        infobox_id = infobox.get('id', None)
        infobox['engines'] = set([infobox['engine']])
        if infobox_id is not None:
            parsed_url_infobox_id = urlparse(infobox_id)
            with self._lock:
                for existingIndex in self.infoboxes:
                    if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id):
                        merge_two_infoboxes(existingIndex, infobox)
                        add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)

    def _is_valid_url_result(self, result, error_msgs):
        if 'url' in result:
            if not isinstance(result['url'], str):
                logger.debug('result: invalid URL: %s', str(result))
                error_msgs.add('invalid URL')
                return False

        if 'title' in result and not isinstance(result['title'], str):
            logger.debug('result: invalid title: %s', str(result))
            error_msgs.add('invalid title')
            return False

        if 'content' in result:
            if not isinstance(result['content'], str):
                logger.debug('result: invalid content: %s', str(result))
                error_msgs.add('invalid content')
                return False

        return True

    def _normalize_url_result(self, result):
        """Normalize a standard result in place: parse its URL, drop content
        that duplicates the title, set a default template and collapse
        whitespace in the content."""
        result['parsed_url'] = urlparse(result['url'])

        # if the result has no scheme, use http as default
        if not result['parsed_url'].scheme:
            result['parsed_url'] = result['parsed_url']._replace(scheme="http")
            result['url'] = result['parsed_url'].geturl()

        # avoid duplicate content between the content and title fields
        if result.get('content') == result.get('title'):
            del result['content']

        # make sure there is a template
        if 'template' not in result:
            result['template'] = 'default.html'

        # strip multiple spaces and carriage returns from content
        if result.get('content'):
            result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])
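
    # Illustrative sketch (assumed input, not part of the original module):
    # after _normalize_url_result a result such as
    # {'url': 'https://example.com/a', 'title': 'A', 'content': 'x \t\n y'}
    # carries a parsed_url, the default template 'default.html', and its
    # content collapsed to 'x y'; a content equal to the title is dropped.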

    def __merge_url_result(self, result, position):
        result['engines'] = set([result['engine']])
        with self._lock:
            duplicated = self.__find_duplicated_http_result(result)
            if duplicated:
                self.__merge_duplicated_http_result(duplicated, result, position)
                return

            # if there is no duplicate found, append result
            result['positions'] = [position]
            self._merged_results.append(result)

    def __find_duplicated_http_result(self, result):
        result_template = result.get('template')
        for merged_result in self._merged_results:
            if 'parsed_url' not in merged_result:
                continue
            if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get(
                'template'
            ):
                if result_template != 'images.html':
                    # not an image, same template, same url: it's a duplicate
                    return merged_result

                # it's an image: only a duplicate if the img_src matches too
                if result.get('img_src', '') == merged_result.get('img_src', ''):
                    return merged_result
        return None
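
    # Illustrative note (hedged, not part of the original module): with this
    # check, standard results for 'http://example.com/a' and
    # 'https://www.example.com/a/' that share a template collapse into one
    # merged entry, whereas results using the 'images.html' template are only
    # merged when their 'img_src' matches as well.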

    def __merge_duplicated_http_result(self, duplicated, result, position):
        # using content with more text
        if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
            duplicated['content'] = result['content']

        # merge all result's parameters not found in duplicate
        for key in result.keys():
            if not duplicated.get(key):
                duplicated[key] = result.get(key)

        # add the new position
        duplicated['positions'].append(position)

        # add engine to list of result-engines
        duplicated['engines'].add(result['engine'])

        # using https if possible
        if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
            duplicated['url'] = result['parsed_url'].geturl()
            duplicated['parsed_url'] = result['parsed_url']

    def __merge_result_no_url(self, result, position):
        result['engines'] = set([result['engine']])
        result['positions'] = [position]
        with self._lock:
            self._merged_results.append(result)

    def close(self):
        self._closed = True

        for result in self._merged_results:
            result['score'] = result_score(result, result.get('priority'))
            # removing html content and whitespace duplications
            if result.get('content'):
                result['content'] = utils.html_to_text(result['content']).strip()
            if result.get('title'):
                result['title'] = ' '.join(utils.html_to_text(result['title']).strip().split())

            for result_engine in result['engines']:
                counter_add(result['score'], 'engine', result_engine, 'score')

        results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

        # pass 2 : group results by category and template
        gresults = []
        categoryPositions = {}

        for res in results:
            # do we need to handle more than one category per engine?
            engine = engines[res['engine']]
            res['category'] = engine.categories[0] if len(engine.categories) > 0 else ''

            # do we need to handle more than one category per engine?
            category = (
                res['category']
                + ':'
                + res.get('template', '')
                + ':'
                + ('img_src' if 'img_src' in res or 'thumbnail' in res else '')
            )

            current = None if category not in categoryPositions else categoryPositions[category]

            # group with previous results using the same category
            # if the group can accept more results and is not too far
            # from the current position
            if current is not None and (current['count'] > 0) and (len(gresults) - current['index'] < 20):
                # group with the previous results using
                # the same category with this one
                index = current['index']
                gresults.insert(index, res)

                # update every index after the current one
                # (including the current one)
                for k in categoryPositions:  # pylint: disable=consider-using-dict-items
                    v = categoryPositions[k]['index']
                    if v >= index:
                        categoryPositions[k]['index'] = v + 1

                # update this category
                current['count'] -= 1

            else:
                # start a new group for this category
                gresults.append(res)

                # update categoryIndex
                categoryPositions[category] = {'index': len(gresults), 'count': 8}

        # update _merged_results
        self._merged_results = gresults

    def get_ordered_results(self):
        if not self._closed:
            self.close()
        return self._merged_results

    def results_length(self):
        return len(self._merged_results)

    @property
    def number_of_results(self) -> int:
        """Return the average number of results reported by the engines, or
        zero if that average is smaller than the actual number of merged
        results."""
        with self._lock:
            if not self._closed:
                logger.error("call to ResultContainer.number_of_results before ResultContainer.close")
                return 0

            resultnum_sum = sum(self._number_of_results)
            if not resultnum_sum or not self._number_of_results:
                return 0

            average = int(resultnum_sum / len(self._number_of_results))
            if average < self.results_length():
                average = 0
            return average
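
    # Worked example (illustrative, not part of the original module): if two
    # engines report 1000 and 2000 total results, number_of_results is
    # int(3000 / 2) = 1500 once the container is closed; if that average were
    # below the number of merged results actually held, it would be reported
    # as 0 instead.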

    def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
        with self._lock:
            if self._closed:
                logger.error("call to ResultContainer.add_unresponsive_engine after ResultContainer.close")
                return
            if engines[engine_name].display_error_messages:
                self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))

    def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
        with self._lock:
            if self._closed:
                logger.error("call to ResultContainer.add_timing after ResultContainer.close")
                return
            self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time))

    def get_timings(self):
        with self._lock:
            if not self._closed:
                logger.error("call to ResultContainer.get_timings before ResultContainer.close")
                return []
            return self.timings