impl.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. import typing
  2. import types
  3. import functools
  4. import itertools
  5. from time import time
  6. from urllib.parse import urlparse
  7. import re
  8. import cld3
  9. import requests.exceptions
  10. from searx import poolrequests, logger
  11. from searx.results import ResultContainer
  12. from searx.search import SearchQuery, EngineRef
  13. from searx.search.processors import EngineProcessor
  14. HTML_TAGS = [
  15. 'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script',
  16. 'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite',
  17. 'code', 'data', 'dfn', 'em', 'i', 'kdb', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small',
  18. 'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt',
  19. 'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input',
  20. 'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet',
  21. 'frame', 'frameset'
  22. ]
  23. def get_check_no_html():
  24. rep = ['<' + tag + '[^\>]*>' for tag in HTML_TAGS]
  25. rep += ['</' + tag + '>' for tag in HTML_TAGS]
  26. pattern = re.compile('|'.join(rep))
  27. def f(text):
  28. return pattern.search(text.lower()) is None
  29. return f
# Shared predicate built once at import time; returns True when a text contains no HTML tag.
_check_no_html = get_check_no_html()
  31. def _is_url(url):
  32. try:
  33. result = urlparse(url)
  34. except ValueError:
  35. return False
  36. if result.scheme not in ('http', 'https'):
  37. return False
  38. return True
  39. @functools.lru_cache(maxsize=8192)
  40. def _is_url_image(image_url):
  41. if not isinstance(image_url, str):
  42. return False
  43. if image_url.startswith('//'):
  44. image_url = 'https:' + image_url
  45. if image_url.startswith('data:'):
  46. return image_url.startswith('data:image/')
  47. if not _is_url(image_url):
  48. return False
  49. retry = 2
  50. while retry > 0:
  51. a = time()
  52. try:
  53. poolrequests.set_timeout_for_thread(10.0, time())
  54. r = poolrequests.get(image_url, timeout=10.0, allow_redirects=True, headers={
  55. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
  56. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  57. 'Accept-Language': 'en-US;q=0.5,en;q=0.3',
  58. 'Accept-Encoding': 'gzip, deflate, br',
  59. 'DNT': '1',
  60. 'Connection': 'keep-alive',
  61. 'Upgrade-Insecure-Requests': '1',
  62. 'Sec-GPC': '1',
  63. 'Cache-Control': 'max-age=0'
  64. })
  65. if r.headers["content-type"].startswith('image/'):
  66. return True
  67. return False
  68. except requests.exceptions.Timeout:
  69. logger.error('Timeout for %s: %i', image_url, int(time() - a))
  70. retry -= 1
  71. except requests.exceptions.RequestException:
  72. logger.exception('Exception for %s', image_url)
  73. return False
  74. def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]:
  75. return {
  76. 'query': search_query.query,
  77. 'lang': search_query.lang,
  78. 'pageno': search_query.pageno,
  79. 'safesearch': search_query.safesearch,
  80. 'time_range': search_query.time_range,
  81. }
  82. def _search_query_diff(sq1: SearchQuery, sq2: SearchQuery)\
  83. -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]:
  84. param1 = _search_query_to_dict(sq1)
  85. param2 = _search_query_to_dict(sq2)
  86. common = {}
  87. diff = {}
  88. for k, value1 in param1.items():
  89. value2 = param2[k]
  90. if value1 == value2:
  91. common[k] = value1
  92. else:
  93. diff[k] = (value1, value2)
  94. return (common, diff)
  95. class TestResults:
  96. __slots__ = 'errors', 'broken_urls'
  97. def __init__(self):
  98. self.errors: typing.Dict[str, typing.List[str]] = {}
  99. self.broken_urls = []
  100. def add_error(self, test, message):
  101. errors_for_test = self.errors.setdefault(test, [])
  102. if message not in errors_for_test:
  103. errors_for_test.append(message)
  104. def add_broken_url(self, url):
  105. if url not in self.broken_urls:
  106. self.broken_urls.append(url)
  107. @property
  108. def succesfull(self):
  109. return len(self.errors) == 0
  110. def __iter__(self):
  111. for test_name, errors in self.errors.items():
  112. for error in sorted(errors):
  113. yield (test_name, error)
  114. class ResultContainerTests:
  115. __slots__ = 'test_name', 'search_query', 'result_container', 'languages', 'stop_test', 'test_results'
  116. def __init__(self,
  117. test_results: TestResults,
  118. test_name: str,
  119. search_query: SearchQuery,
  120. result_container: ResultContainer):
  121. self.test_name = test_name
  122. self.search_query = search_query
  123. self.result_container = result_container
  124. self.languages: typing.Set[str] = set()
  125. self.test_results = test_results
  126. self.stop_test = False
  127. @property
  128. def result_urls(self):
  129. results = self.result_container.get_ordered_results()
  130. return [result['url'] for result in results]
  131. def _record_error(self, message: str) -> None:
  132. self.test_results.add_error(self.test_name, message)
  133. def _add_language(self, text: str) -> typing.Optional[str]:
  134. r = cld3.get_language(str(text)) # pylint: disable=E1101
  135. if r is not None and r.probability >= 0.9 and r.is_reliable:
  136. self.languages.add(r.language)
  137. return None
  138. def _check_result(self, result):
  139. if not _check_no_html(result.get('title', '')):
  140. self._record_error('HTML in title')
  141. if not _check_no_html(result.get('content', '')):
  142. self._record_error('HTML in content')
  143. self._add_language(result.get('title', ''))
  144. self._add_language(result.get('content', ''))
  145. template = result.get('template', 'default.html')
  146. if template == 'default.html':
  147. return
  148. if template == 'code.html':
  149. return
  150. if template == 'torrent.html':
  151. return
  152. if template == 'map.html':
  153. return
  154. if template == 'images.html':
  155. thumbnail_src = result.get('thumbnail_src')
  156. if thumbnail_src is not None:
  157. if not _is_url_image(thumbnail_src):
  158. self.test_results.add_broken_url(thumbnail_src)
  159. self._record_error('thumbnail_src URL is invalid')
  160. elif not _is_url_image(result.get('img_src')):
  161. self.test_results.add_broken_url(result.get('img_src'))
  162. self._record_error('img_src URL is invalid')
  163. if template == 'videos.html' and not _is_url_image(result.get('thumbnail')):
  164. self._record_error('thumbnail URL is invalid')
  165. def _check_results(self, results: list):
  166. for result in results:
  167. self._check_result(result)
  168. def _check_answers(self, answers):
  169. for answer in answers:
  170. if not _check_no_html(answer):
  171. self._record_error('HTML in answer')
  172. def _check_infoboxes(self, infoboxes):
  173. for infobox in infoboxes:
  174. if not _check_no_html(infobox.get('content', '')):
  175. self._record_error('HTML in infobox content')
  176. self._add_language(infobox.get('content', ''))
  177. for attribute in infobox.get('attributes', {}):
  178. if not _check_no_html(attribute.get('value', '')):
  179. self._record_error('HTML in infobox attribute value')
  180. def check_basic(self):
  181. if len(self.result_container.unresponsive_engines) > 0:
  182. for message in self.result_container.unresponsive_engines:
  183. self._record_error(message[1] + ' ' + (message[2] or ''))
  184. self.stop_test = True
  185. return
  186. results = self.result_container.get_ordered_results()
  187. if len(results) > 0:
  188. self._check_results(results)
  189. if len(self.result_container.answers) > 0:
  190. self._check_answers(self.result_container.answers)
  191. if len(self.result_container.infoboxes) > 0:
  192. self._check_infoboxes(self.result_container.infoboxes)
  193. def has_infobox(self):
  194. if len(self.result_container.infoboxes) == 0:
  195. self._record_error('No infobox')
  196. def has_answer(self):
  197. if len(self.result_container.answers) == 0:
  198. self._record_error('No answer')
  199. def has_language(self, lang):
  200. if lang not in self.languages:
  201. self._record_error(lang + ' not found')
  202. def not_empty(self):
  203. result_types = set()
  204. results = self.result_container.get_ordered_results()
  205. if len(results) > 0:
  206. result_types.add('results')
  207. if len(self.result_container.answers) > 0:
  208. result_types.add('answers')
  209. if len(self.result_container.infoboxes) > 0:
  210. result_types.add('infoboxes')
  211. if len(result_types) == 0:
  212. self._record_error('No result')
  213. def one_title_contains(self, title: str):
  214. title = title.lower()
  215. for result in self.result_container.get_ordered_results():
  216. if title in result['title'].lower():
  217. return
  218. self._record_error(('{!r} not found in the title'.format(title)))
  219. class CheckerTests:
  220. __slots__ = 'test_results', 'test_name', 'result_container_tests_list'
  221. def __init__(self,
  222. test_results: TestResults,
  223. test_name: str,
  224. result_container_tests_list: typing.List[ResultContainerTests]):
  225. self.test_results = test_results
  226. self.test_name = test_name
  227. self.result_container_tests_list = result_container_tests_list
  228. def unique_results(self):
  229. urls_list = [rct.result_urls for rct in self.result_container_tests_list]
  230. if len(urls_list[0]) > 0:
  231. # results on the first page
  232. for i, urls_i in enumerate(urls_list):
  233. for j, urls_j in enumerate(urls_list):
  234. if i < j and urls_i == urls_j:
  235. common, diff = _search_query_diff(self.result_container_tests_list[i].search_query,
  236. self.result_container_tests_list[j].search_query)
  237. common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()])
  238. diff1_str = ', ' .join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()])
  239. diff2_str = ', ' .join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()])
  240. self.test_results.add_error(self.test_name,
  241. 'results are identitical for {} and {} ({})'
  242. .format(diff1_str, diff2_str, common_str))
  243. class Checker:
  244. __slots__ = 'processor', 'tests', 'test_results'
  245. def __init__(self, processor: EngineProcessor):
  246. self.processor = processor
  247. self.tests = self.processor.get_tests()
  248. self.test_results = TestResults()
  249. @property
  250. def engineref_list(self):
  251. engine_name = self.processor.engine_name
  252. engine_category = self.processor.engine.categories[0]
  253. return [EngineRef(engine_name, engine_category)]
  254. @staticmethod
  255. def search_query_matrix_iterator(engineref_list, matrix):
  256. p = []
  257. for name, values in matrix.items():
  258. if isinstance(values, (tuple, list)):
  259. l = [(name, value) for value in values]
  260. else:
  261. l = [(name, values)]
  262. p.append(l)
  263. for kwargs in itertools.product(*p):
  264. kwargs = {k: v for k, v in kwargs}
  265. query = kwargs['query']
  266. params = dict(kwargs)
  267. del params['query']
  268. yield SearchQuery(query, engineref_list, **params)
  269. def call_test(self, obj, test_description):
  270. if isinstance(test_description, (tuple, list)):
  271. method, args = test_description[0], test_description[1:]
  272. else:
  273. method = test_description
  274. args = ()
  275. if isinstance(method, str) and hasattr(obj, method):
  276. getattr(obj, method)(*args)
  277. elif isinstance(method, types.FunctionType):
  278. method(*args)
  279. else:
  280. self.test_results.add_error(obj.test_name,
  281. 'method {!r} ({}) not found for {}'
  282. .format(method, method.__class__.__name__, obj.__class__.__name__))
  283. def call_tests(self, obj, test_descriptions):
  284. for test_description in test_descriptions:
  285. self.call_test(obj, test_description)
  286. def search(self, search_query: SearchQuery) -> ResultContainer:
  287. result_container = ResultContainer()
  288. engineref_category = search_query.engineref_list[0].category
  289. params = self.processor.get_params(search_query, engineref_category)
  290. if params is not None:
  291. self.processor.search(search_query.query, params, result_container, time(), 5)
  292. return result_container
  293. def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests:
  294. result_container = self.search(search_query)
  295. result_container_check = ResultContainerTests(self.test_results, test_name, search_query, result_container)
  296. result_container_check.check_basic()
  297. return result_container_check
  298. def run_test(self, test_name):
  299. test_parameters = self.tests[test_name]
  300. search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix']))
  301. rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list]
  302. stop_test = False
  303. if 'result_container' in test_parameters:
  304. for rct in rct_list:
  305. stop_test = stop_test or rct.stop_test
  306. if not rct.stop_test:
  307. self.call_tests(rct, test_parameters['result_container'])
  308. if not stop_test:
  309. if 'test' in test_parameters:
  310. checker_tests = CheckerTests(self.test_results, test_name, rct_list)
  311. self.call_tests(checker_tests, test_parameters['test'])
  312. def run(self):
  313. for test_name in self.tests:
  314. self.run_test(test_name)