online.py

# SPDX-License-Identifier: AGPL-3.0-or-later
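"""Processor for engines of type ``online``: build the outgoing HTTP request
for an engine, send it through searx.network, hand the response back to the
engine module for parsing, and keep track of timing, errors and engine
suspension."""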
from time import time
import threading
import asyncio

import httpx

import searx.network
from searx.engines import settings
from searx import logger
from searx.utils import gen_useragent
from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException,
                              SearxEngineTooManyRequestsException,)
from searx.metrology.error_recorder import record_exception, record_error

from searx.search.processors.abstract import EngineProcessor

logger = logger.getChild('search.processor.online')


def default_request_params():
    return {
        'method': 'GET',
        'headers': {},
        'data': {},
        'url': '',
        'cookies': {},
        'verify': True,
        'auth': None
    }


class OnlineProcessor(EngineProcessor):

    engine_type = 'online'

    def get_params(self, search_query, engine_category):
        params = super().get_params(search_query, engine_category)
        if params is None:
            return None

        # skip suspended engines
        if self.engine.suspend_end_time >= time():
            logger.debug('Engine currently suspended: %s', self.engine_name)
            return None

        # add default params
        params.update(default_request_params())

        # add a user agent
        params['headers']['User-Agent'] = gen_useragent()

        return params

    def _send_http_request(self, params):
        # create a dictionary which contains all
        # information about the request
        request_args = dict(
            headers=params['headers'],
            cookies=params['cookies'],
            verify=params['verify'],
            auth=params['auth']
        )

        # max_redirects
        max_redirects = params.get('max_redirects')
        if max_redirects:
            request_args['max_redirects'] = max_redirects

        # allow_redirects
        if 'allow_redirects' in params:
            request_args['allow_redirects'] = params['allow_redirects']

        # soft_max_redirects
        soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0)

        # raise_for_status
        request_args['raise_for_httperror'] = params.get('raise_for_httperror', True)

        # specific type of request (GET or POST)
        if params['method'] == 'GET':
            req = searx.network.get
        else:
            req = searx.network.post
            request_args['data'] = params['data']

        # send the request
        response = req(params['url'], **request_args)

        # check soft limit of the redirect count
        if len(response.history) > soft_max_redirects:
            # unexpected redirect: record an error
            # but the engine might still return valid results.
            status_code = str(response.status_code or '')
            reason = response.reason_phrase or ''
            hostname = response.url.host
            record_error(self.engine_name,
                         '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
                         (status_code, reason, hostname))

        return response

    def _search_basic(self, query, params):
        # update the request parameters, depending on the
        # search engine (module in the searx/engines folder)
        self.engine.request(query, params)

        # ignore empty urls
        if params['url'] is None:
            return None

        if not params['url']:
            return None

        # send the request
        response = self._send_http_request(params)

        # parse the response
        response.search_params = params
        return self.engine.response(response)
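
    # Run one query against this engine: send the HTTP request, hand the
    # response to the engine module, push the parsed results into
    # result_container, update timing and error statistics, and suspend the
    # engine when it keeps failing.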
    def search(self, query, params, result_container, start_time, timeout_limit):
        # set timeout for all HTTP requests
        searx.network.set_timeout_for_thread(timeout_limit, start_time=start_time)
        # reset the HTTP total time
        searx.network.reset_time_for_thread()
        # set the network
        searx.network.set_context_network_name(self.engine_name)

        # suppose everything will be alright
        http_exception = False
        suspended_time = None

        try:
            # send requests and parse the results
            search_results = self._search_basic(query, params)

            # check if the engine accepted the request
            if search_results is not None:
                # yes, so add results
                result_container.extend(self.engine_name, search_results)

                # update engine time when there is no exception
                engine_time = time() - start_time
                page_load_time = searx.network.get_time_for_thread()
                result_container.add_timing(self.engine_name, engine_time, page_load_time)
                with threading.RLock():
                    self.engine.stats['engine_time'] += engine_time
                    self.engine.stats['engine_time_count'] += 1
                    # update stats with the total HTTP time
                    self.engine.stats['page_load_time'] += page_load_time
                    self.engine.stats['page_load_count'] += 1
        except Exception as e:
            record_exception(self.engine_name, e)

            # Timing
            engine_time = time() - start_time
            page_load_time = searx.network.get_time_for_thread()
            result_container.add_timing(self.engine_name, engine_time, page_load_time)

            # Record the errors
            with threading.RLock():
                self.engine.stats['errors'] += 1

            if (issubclass(e.__class__, (httpx.TimeoutException, asyncio.TimeoutError))):
                result_container.add_unresponsive_engine(self.engine_name, 'HTTP timeout')
                # requests timeout (connect or read)
                logger.error("engine {0} : HTTP requests timeout "
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(self.engine_name, engine_time, timeout_limit,
                                     e.__class__.__name__))
                http_exception = True
            elif (issubclass(e.__class__, (httpx.HTTPError, httpx.StreamError))):
                result_container.add_unresponsive_engine(self.engine_name, 'HTTP error')
                # other requests exception
                logger.exception("engine {0} : requests exception "
                                 "(search duration : {1} s, timeout: {2} s) : {3}"
                                 .format(self.engine_name, engine_time, timeout_limit, e))
                http_exception = True
            elif (issubclass(e.__class__, SearxEngineCaptchaException)):
                result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required')
                logger.exception('engine {0} : CAPTCHA'.format(self.engine_name))
                suspended_time = e.suspended_time  # pylint: disable=no-member
            elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)):
                result_container.add_unresponsive_engine(self.engine_name, 'too many requests')
                logger.exception('engine {0} : Too many requests'.format(self.engine_name))
                suspended_time = e.suspended_time  # pylint: disable=no-member
            elif (issubclass(e.__class__, SearxEngineAccessDeniedException)):
                result_container.add_unresponsive_engine(self.engine_name, 'blocked')
                logger.exception('engine {0} : Searx is blocked'.format(self.engine_name))
                suspended_time = e.suspended_time  # pylint: disable=no-member
            else:
                result_container.add_unresponsive_engine(self.engine_name, 'unexpected crash')
                # other errors
                logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))
        else:
            if getattr(threading.current_thread(), '_timeout', False):
                record_error(self.engine_name, 'Timeout')

        # suspend the engine if there is an HTTP error
        # or suspended_time is defined
        with threading.RLock():
            if http_exception or suspended_time:
                # update continuous_errors / suspend_end_time
                self.engine.continuous_errors += 1
                if suspended_time is None:
                    suspended_time = min(settings['search']['max_ban_time_on_fail'],
                                         self.engine.continuous_errors * settings['search']['ban_time_on_fail'])
                self.engine.suspend_end_time = time() + suspended_time
            else:
                # reset the suspend variables
                self.engine.continuous_errors = 0
                self.engine.suspend_end_time = 0
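
    # Describe generic self-test queries for this engine: which queries to run
    # and which properties the result container is expected to have.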
    def get_default_tests(self):
        tests = {}

        tests['simple'] = {
            'matrix': {'query': ('life', 'computer')},
            'result_container': ['not_empty'],
        }

        if getattr(self.engine, 'paging', False):
            tests['paging'] = {
                'matrix': {'query': 'time',
                           'pageno': (1, 2, 3)},
                'result_container': ['not_empty'],
                'test': ['unique_results']
            }
            if 'general' in self.engine.categories:
                # avoid documentation about HTML tags (<time> and <input type="time">)
                tests['paging']['matrix']['query'] = 'news'

        if getattr(self.engine, 'time_range', False):
            tests['time_range'] = {
                'matrix': {'query': 'news',
                           'time_range': (None, 'day')},
                'result_container': ['not_empty'],
                'test': ['unique_results']
            }

        if getattr(self.engine, 'supported_languages', []):
            tests['lang_fr'] = {
                'matrix': {'query': 'paris', 'lang': 'fr'},
                'result_container': ['not_empty', ('has_language', 'fr')],
            }
            tests['lang_en'] = {
                'matrix': {'query': 'paris', 'lang': 'en'},
                'result_container': ['not_empty', ('has_language', 'en')],
            }

        if getattr(self.engine, 'safesearch', False):
            tests['safesearch'] = {
                'matrix': {'query': 'porn',
                           'safesearch': (0, 2)},
                'test': ['unique_results']
            }

        return tests
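

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the processor above: the two hooks an
# engine module must provide so that OnlineProcessor._search_basic() can
# build a request and parse the response.  The engine URL and the result
# fields below are hypothetical placeholders.
#
# def request(query, params):
#     # fill in at least params['url']; method, headers, data, cookies, ...
#     # may also be adjusted before _send_http_request() uses them
#     params['url'] = 'https://example.org/search?q=' + query
#     return params
#
# def response(resp):
#     # parse the httpx response and return a list of result dicts that
#     # result_container.extend() will collect
#     return [{'url': 'https://example.org/item', 'title': 'Example', 'content': ''}]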