impl.py
# SPDX-License-Identifier: AGPL-3.0-or-later

import gc
import typing
import types
import functools
import itertools
from time import time
from timeit import default_timer
from urllib.parse import urlparse

import re
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException
import httpx

from searx import network, logger
from searx.utils import gen_useragent
from searx.results import ResultContainer
from searx.search.models import SearchQuery, EngineRef
from searx.search.processors import EngineProcessor
from searx.metrics import counter_inc

logger = logger.getChild('searx.search.checker')

HTML_TAGS = [
    # fmt: off
    'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script',
    'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite',
    'code', 'data', 'dfn', 'em', 'i', 'kbd', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small',
    'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt',
    'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input',
    'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet',
    'frame', 'frameset'
    # fmt: on
]


def get_check_no_html():
    rep = ['<' + tag + r'[^\>]*>' for tag in HTML_TAGS]
    rep += ['</' + tag + '>' for tag in HTML_TAGS]
    pattern = re.compile('|'.join(rep))

    def f(text):
        return pattern.search(text.lower()) is None

    return f


_check_no_html = get_check_no_html()
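
# Illustrative behaviour (example strings are mine, not from the original module):
#   _check_no_html('plain text, no markup')   # -> True  (no tag from HTML_TAGS found)
#   _check_no_html('<p>Hello</p>')            # -> False ('p' is in HTML_TAGS)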


def _is_url(url):
    try:
        result = urlparse(url)
    except ValueError:
        return False
    if result.scheme not in ('http', 'https'):
        return False
    return True
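
# Illustrative behaviour (example URLs are mine, not from the original module):
#   _is_url('https://example.org/')   # -> True
#   _is_url('ftp://example.org/')     # -> False (only http/https schemes are accepted)
#   _is_url('no scheme at all')       # -> False (urlparse yields an empty scheme)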


@functools.lru_cache(maxsize=8192)
def _download_and_check_if_image(image_url: str) -> bool:
    """Download a URL and check if the Content-Type starts with "image/".

    This function should not be called directly: use _is_url_image instead,
    otherwise the functools.lru_cache cache may end up holding (potentially huge)
    data: URLs.
    """
    retry = 2
    while retry > 0:
        a = time()
        try:
            # use "image_proxy" (avoid HTTP/2)
            network.set_context_network_name('image_proxy')
            r, stream = network.stream(
                'GET',
                image_url,
                timeout=10.0,
                allow_redirects=True,
                headers={
                    'User-Agent': gen_useragent(),
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Language': 'en-US;q=0.5,en;q=0.3',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'DNT': '1',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                    'Sec-GPC': '1',
                    'Cache-Control': 'max-age=0',
                },
            )
            r.close()
            if r.status_code == 200:
                is_image = r.headers.get('content-type', '').startswith('image/')
            else:
                is_image = False
            del r
            del stream
            return is_image
        except httpx.TimeoutException:
            logger.error('Timeout for %s: %i', image_url, int(time() - a))
            retry -= 1
        except httpx.HTTPError:
            logger.exception('Exception for %s', image_url)
            return False
    return False


def _is_url_image(image_url) -> bool:
    """Normalize image_url and check whether it actually points to an image."""
    if not isinstance(image_url, str):
        return False
    if image_url.startswith('//'):
        image_url = 'https:' + image_url
    if image_url.startswith('data:'):
        return image_url.startswith('data:image/')
    if not _is_url(image_url):
        return False
    return _download_and_check_if_image(image_url)
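
# Illustrative behaviour (example URLs are mine, not from the original module):
#   _is_url_image('data:image/png;base64,...')  # -> True, decided without any request
#   _is_url_image('//example.org/logo.png')     # -> 'https:' is prepended, then the URL is downloaded
#   _is_url_image(42)                            # -> False (not a string)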


def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]:
    return {
        'query': search_query.query,
        'lang': search_query.lang,
        'pageno': search_query.pageno,
        'safesearch': search_query.safesearch,
        'time_range': search_query.time_range,
    }


def _search_query_diff(
    sq1: SearchQuery, sq2: SearchQuery
) -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]:
    param1 = _search_query_to_dict(sq1)
    param2 = _search_query_to_dict(sq2)
    common = {}
    diff = {}
    for k, value1 in param1.items():
        value2 = param2[k]
        if value1 == value2:
            common[k] = value1
        else:
            diff[k] = (value1, value2)
    return (common, diff)
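
# Illustrative example (the query values are mine, not from the original module): for two
# SearchQuery objects that differ only in pageno, _search_query_diff returns something like
#   common = {'query': 'time', 'lang': 'en', 'safesearch': 0, 'time_range': None}
#   diff   = {'pageno': (1, 2)}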


class TestResults:

    __slots__ = 'errors', 'logs', 'languages'

    def __init__(self):
        self.errors: typing.Dict[str, typing.List[str]] = {}
        self.logs: typing.Dict[str, typing.List[typing.Any]] = {}
        self.languages: typing.Set[str] = set()

    def add_error(self, test, message, *args):
        # message to self.errors
        errors_for_test = self.errors.setdefault(test, [])
        if message not in errors_for_test:
            errors_for_test.append(message)
        # (message, *args) to self.logs
        logs_for_test = self.logs.setdefault(test, [])
        if (message, *args) not in logs_for_test:
            logs_for_test.append((message, *args))

    def add_language(self, language):
        self.languages.add(language)

    @property
    def successful(self):
        return len(self.errors) == 0

    def __iter__(self):
        for test_name, errors in self.errors.items():
            for error in sorted(errors):
                yield (test_name, error)
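
# Illustrative behaviour of TestResults.add_error (example values are mine, not from the
# original module):
#   tr = TestResults()
#   tr.add_error('basic', 'HTML in title', "'<b>x</b>'")
#   tr.add_error('basic', 'HTML in title', "'<b>y</b>'")
#   tr.errors  # -> {'basic': ['HTML in title']}   (error messages are de-duplicated)
#   tr.logs    # -> {'basic': [('HTML in title', "'<b>x</b>'"), ('HTML in title', "'<b>y</b>'")]}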


class ResultContainerTests:

    __slots__ = 'test_name', 'search_query', 'result_container', 'languages', 'stop_test', 'test_results'

    def __init__(
        self, test_results: TestResults, test_name: str, search_query: SearchQuery, result_container: ResultContainer
    ):
        self.test_name = test_name
        self.search_query = search_query
        self.result_container = result_container
        self.languages: typing.Set[str] = set()
        self.test_results = test_results
        self.stop_test = False

    @property
    def result_urls(self):
        results = self.result_container.get_ordered_results()
        return [result['url'] for result in results if 'url' in result]

    def _record_error(self, message: str, *args) -> None:
        sq = _search_query_to_dict(self.search_query)
        sqstr = ' '.join(['{}={!r}'.format(k, v) for k, v in sq.items()])
        self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')

    def _add_language(self, text: str) -> typing.Optional[str]:
        try:
            r = detect_langs(str(text))  # pylint: disable=E1101
        except LangDetectException:
            return None
        if len(r) > 0 and r[0].prob > 0.95:
            self.languages.add(r[0].lang)
            self.test_results.add_language(r[0].lang)
        return None

    def _check_result(self, result):
        if not _check_no_html(result.get('title', '')):
            self._record_error('HTML in title', repr(result.get('title', '')))
        if not _check_no_html(result.get('content', '')):
            self._record_error('HTML in content', repr(result.get('content', '')))
        if result.get('url') is None:
            self._record_error('url is None')
        self._add_language(result.get('title', ''))
        self._add_language(result.get('content', ''))
        template = result.get('template', 'default.html')
        if template == 'default.html':
            return
        if template == 'code.html':
            return
        if template == 'torrent.html':
            return
        if template == 'map.html':
            return
        if template == 'images.html':
            thumbnail_src = result.get('thumbnail_src')
            if thumbnail_src is not None:
                if not _is_url_image(thumbnail_src):
                    self._record_error('thumbnail_src URL is invalid', thumbnail_src)
            elif not _is_url_image(result.get('img_src')):
                self._record_error('img_src URL is invalid', result.get('img_src'))
        if template == 'videos.html' and not _is_url_image(result.get('thumbnail')):
            self._record_error('thumbnail URL is invalid', result.get('thumbnail'))

    def _check_results(self, results: list):
        for result in results:
            self._check_result(result)

    def _check_answers(self, answers):
        for answer in answers:
            if not _check_no_html(answer):
                self._record_error('HTML in answer', answer)

    def _check_infoboxes(self, infoboxes):
        for infobox in infoboxes:
            if not _check_no_html(infobox.get('content', '')):
                self._record_error('HTML in infobox content', infobox.get('content', ''))
            self._add_language(infobox.get('content', ''))
            for attribute in infobox.get('attributes', {}):
                if not _check_no_html(attribute.get('value', '')):
                    self._record_error('HTML in infobox attribute value', attribute.get('value', ''))

    def check_basic(self):
        if len(self.result_container.unresponsive_engines) > 0:
            for message in self.result_container.unresponsive_engines:
                self._record_error(message[1] + ' ' + (message[2] or ''))
            self.stop_test = True
            return

        results = self.result_container.get_ordered_results()
        if len(results) > 0:
            self._check_results(results)

        if len(self.result_container.answers) > 0:
            self._check_answers(self.result_container.answers)

        if len(self.result_container.infoboxes) > 0:
            self._check_infoboxes(self.result_container.infoboxes)

    def has_infobox(self):
        """Check that the ResultContainer has at least one infobox"""
        if len(self.result_container.infoboxes) == 0:
            self._record_error('No infobox')

    def has_answer(self):
        """Check that the ResultContainer has at least one answer"""
        if len(self.result_container.answers) == 0:
            self._record_error('No answer')

    def has_language(self, lang):
        """Check that at least one result title or content is written in `lang`.

        Detected using langdetect, so it may not be accurate."""
        if lang not in self.languages:
            self._record_error(lang + ' not found')

    def not_empty(self):
        """Check that the ResultContainer has at least one result, answer or infobox"""
        result_types = set()
        results = self.result_container.get_ordered_results()
        if len(results) > 0:
            result_types.add('results')

        if len(self.result_container.answers) > 0:
            result_types.add('answers')

        if len(self.result_container.infoboxes) > 0:
            result_types.add('infoboxes')

        if len(result_types) == 0:
            self._record_error('No result')

    def one_title_contains(self, title: str):
        """Check that at least one result title contains `title` (case-insensitive comparison)"""
        title = title.lower()
        for result in self.result_container.get_ordered_results():
            if title in result['title'].lower():
                return
        self._record_error('{!r} not found in the title'.format(title))


class CheckerTests:

    __slots__ = 'test_results', 'test_name', 'result_container_tests_list'

    def __init__(
        self, test_results: TestResults, test_name: str, result_container_tests_list: typing.List[ResultContainerTests]
    ):
        self.test_results = test_results
        self.test_name = test_name
        self.result_container_tests_list = result_container_tests_list

    def unique_results(self):
        """Check that the results of each ResultContainer are unique"""
        urls_list = [rct.result_urls for rct in self.result_container_tests_list]
        if len(urls_list[0]) > 0:
            # results on the first page
            for i, urls_i in enumerate(urls_list):
                for j, urls_j in enumerate(urls_list):
                    if i < j and urls_i == urls_j:
                        common, diff = _search_query_diff(
                            self.result_container_tests_list[i].search_query,
                            self.result_container_tests_list[j].search_query,
                        )
                        common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()])
                        diff1_str = ', '.join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()])
                        diff2_str = ', '.join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()])
                        self.test_results.add_error(
                            self.test_name,
                            'results are identical for {} and {} ({})'.format(diff1_str, diff2_str, common_str),
                        )
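
# Example of what unique_results flags (hypothetical values, not from the original module):
# if two SearchQuery objects differ only in pageno (1 vs. 2) but their ResultContainers
# return exactly the same URL list, an error like
# "results are identical for pageno=1 and pageno=2 (query='time' ...)" is recorded.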


class Checker:

    __slots__ = 'processor', 'tests', 'test_results'

    def __init__(self, processor: EngineProcessor):
        self.processor = processor
        self.tests = self.processor.get_tests()
        self.test_results = TestResults()

    @property
    def engineref_list(self):
        engine_name = self.processor.engine_name
        engine_category = self.processor.engine.categories[0]
        return [EngineRef(engine_name, engine_category)]

    @staticmethod
    def search_query_matrix_iterator(engineref_list, matrix):
        p = []
        for name, values in matrix.items():
            if isinstance(values, (tuple, list)):
                l = [(name, value) for value in values]
            else:
                l = [(name, values)]
            p.append(l)

        for kwargs in itertools.product(*p):
            kwargs = {k: v for k, v in kwargs}
            query = kwargs['query']
            params = dict(kwargs)
            del params['query']
            yield SearchQuery(query, engineref_list, **params)
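
    # Illustrative expansion (the matrix values are mine, not from the original module):
    #   matrix = {'query': ['time', 'time zone'], 'lang': 'en'}
    #   -> SearchQuery('time', engineref_list, lang='en')
    #   -> SearchQuery('time zone', engineref_list, lang='en')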

    def call_test(self, obj, test_description):
        if isinstance(test_description, (tuple, list)):
            method, args = test_description[0], test_description[1:]
        else:
            method = test_description
            args = ()
        if isinstance(method, str) and hasattr(obj, method):
            getattr(obj, method)(*args)
        elif isinstance(method, types.FunctionType):
            method(*args)
        else:
            self.test_results.add_error(
                obj.test_name,
                'method {!r} ({}) not found for {}'.format(method, method.__class__.__name__, obj.__class__.__name__),
            )
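
    # A test description is either a method name or a (method, *args) sequence, e.g.
    # (the argument values are mine, not from the original module):
    #   self.call_test(rct, 'not_empty')
    #   self.call_test(rct, ('one_title_contains', 'wikipedia'))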

    def call_tests(self, obj, test_descriptions):
        for test_description in test_descriptions:
            self.call_test(obj, test_description)

    def search(self, search_query: SearchQuery) -> ResultContainer:
        result_container = ResultContainer()
        engineref_category = search_query.engineref_list[0].category
        params = self.processor.get_params(search_query, engineref_category)
        if params is not None:
            counter_inc('engine', search_query.engineref_list[0].name, 'search', 'count', 'sent')
            self.processor.search(search_query.query, params, result_container, default_timer(), 5)
        return result_container

    def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests:
        result_container = self.search(search_query)
        result_container_check = ResultContainerTests(self.test_results, test_name, search_query, result_container)
        result_container_check.check_basic()
        return result_container_check

    def run_test(self, test_name):
        test_parameters = self.tests[test_name]
        search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix']))
        rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list]
        stop_test = False
        if 'result_container' in test_parameters:
            for rct in rct_list:
                stop_test = stop_test or rct.stop_test
                if not rct.stop_test:
                    self.call_tests(rct, test_parameters['result_container'])
        if not stop_test:
            if 'test' in test_parameters:
                checker_tests = CheckerTests(self.test_results, test_name, rct_list)
                self.call_tests(checker_tests, test_parameters['test'])

    def run(self):
        for test_name in self.tests:
            self.run_test(test_name)
            # clear cache
            _download_and_check_if_image.cache_clear()
            # force a garbage collection
            gc.collect()
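

# Rough usage sketch (hypothetical caller, not part of the original module; `processor`
# is assumed to be an already configured EngineProcessor):
#   checker = Checker(processor)
#   checker.run()
#   if not checker.test_results.successful:
#       for test_name, error in checker.test_results:
#           logger.warning('%s: %s', test_name, error)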