# SPDX-License-Identifier: AGPL-3.0-or-later
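"""Processor for engines of type ``online``: build the outgoing HTTP request
for an engine, send it through the request pool and hand the response back to
the engine for parsing."""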
from urllib.parse import urlparse
from time import time
from copy import deepcopy
import threading

import requests.exceptions

import searx.poolrequests as poolrequests
from searx.engines import settings
from searx import logger
from searx.utils import gen_useragent
from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException,
                              SearxEngineTooManyRequestsException,)
from searx.metrology.error_recorder import record_exception, record_error
from searx.search.processors.abstract import EngineProcessor

logger = logger.getChild('search.processor.online')

# module level lock protecting the per-engine stats: ``with threading.RLock():``
# would create and lock a brand new lock on every call, which synchronizes nothing
stats_lock = threading.RLock()
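
# base request parameters shared by every ``online`` engine; ``get_params()``
# merges a deep copy, so the nested ``headers`` and ``data`` dicts are never
# shared between requests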
DEFAULT_PARAMS = {
    'method': 'GET',
    'headers': {},
    'data': {},
    'url': '',
    'cookies': {},
    'verify': True,
    'auth': None
}


class OnlineProcessor(EngineProcessor):
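    """Process a query against a single engine that is queried over HTTP."""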

    engine_type = 'online'

    def get_params(self, search_query, engine_category):
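        """Return the request parameters for this engine, or ``None`` when the
        query can not be processed (no usable parameters, engine suspended)."""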
        params = super().get_params(search_query, engine_category)
        if params is None:
            return None

        # skip suspended engines
        if self.engine.suspend_end_time >= time():
            logger.debug('Engine currently suspended: %s', self.engine_name)
            return None

        # add default params; a deep copy keeps the nested ``headers`` and
        # ``data`` dicts of DEFAULT_PARAMS from being shared (and mutated)
        # across requests
        params.update(deepcopy(DEFAULT_PARAMS))

        # add a user agent
        params['headers']['User-Agent'] = gen_useragent()

        return params

    def _send_http_request(self, params):
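        """Send the HTTP request described by *params* through the request
        pool and return the response."""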
        # create a dictionary that contains all
        # information about the request
        request_args = dict(
            headers=params['headers'],
            cookies=params['cookies'],
            verify=params['verify'],
            auth=params['auth']
        )
        # set engine-specific proxies
        if hasattr(self.engine, 'proxies'):
            request_args['proxies'] = poolrequests.get_proxies(self.engine.proxies)

        # max_redirects
        max_redirects = params.get('max_redirects')
        if max_redirects:
            request_args['max_redirects'] = max_redirects

        # soft_max_redirects
        soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0)

        # raise_for_status
        request_args['raise_for_httperror'] = params.get('raise_for_httperror', False)

        # specific type of request (GET or POST)
        if params['method'] == 'GET':
            req = poolrequests.get
        else:
            req = poolrequests.post
            request_args['data'] = params['data']

        # send the request
        response = req(params['url'], **request_args)
        # check the soft limit of the redirect count
        if len(response.history) > soft_max_redirects:
            # unexpected redirect: record an error
            # but the engine might still return valid results.
            status_code = str(response.status_code or '')
            reason = response.reason or ''
            hostname = str(urlparse(response.url or '').netloc)
            record_error(self.engine_name,
                         '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
                         (status_code, reason, hostname))

        return response

    def _search_basic(self, query, params):
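        """Let the engine fill in its request parameters, send the request and
        return the parsed results, or ``None`` when the engine sets no URL."""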
        # update request parameters dependent on
        # search-engine (contained in engines folder)
        self.engine.request(query, params)

        # ignore empty URLs (covers both ``None`` and ``''``)
        if not params['url']:
            return None

        # send request
        response = self._send_http_request(params)

        # parse the response
        response.search_params = params
        return self.engine.response(response)

    def search(self, query, params, result_container, start_time, timeout_limit):
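        """Run the query against this engine: store results and timings in
        *result_container*, record errors, and suspend the engine when the
        request fails or the engine asks for a suspension."""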
        # set timeout for all HTTP requests
        poolrequests.set_timeout_for_thread(timeout_limit, start_time=start_time)
        # reset the HTTP total time
        poolrequests.reset_time_for_thread()

        # suppose everything will be alright
        requests_exception = False
        suspended_time = None

        try:
            # send requests and parse the results
            search_results = self._search_basic(query, params)

            # check if the engine accepted the request
            if search_results is not None:
                # yes, so add results
                result_container.extend(self.engine_name, search_results)

                # update engine time when there is no exception
                engine_time = time() - start_time
                page_load_time = poolrequests.get_time_for_thread()
                result_container.add_timing(self.engine_name, engine_time, page_load_time)
                with stats_lock:
                    self.engine.stats['engine_time'] += engine_time
                    self.engine.stats['engine_time_count'] += 1
                    # update stats with the total HTTP time
                    self.engine.stats['page_load_time'] += page_load_time
                    self.engine.stats['page_load_count'] += 1

        except Exception as e:
            record_exception(self.engine_name, e)

            # timing
            engine_time = time() - start_time
            page_load_time = poolrequests.get_time_for_thread()
            result_container.add_timing(self.engine_name, engine_time, page_load_time)

            # record the errors
            with stats_lock:
                self.engine.stats['errors'] += 1

            if isinstance(e, requests.exceptions.Timeout):
                result_container.add_unresponsive_engine(self.engine_name, 'HTTP timeout')
                # requests timeout (connect or read)
                logger.error("engine {0} : HTTP requests timeout "
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(self.engine_name, engine_time, timeout_limit, e.__class__.__name__))
                requests_exception = True
            elif isinstance(e, requests.exceptions.RequestException):
                result_container.add_unresponsive_engine(self.engine_name, 'HTTP error')
                # other requests exception
                logger.exception("engine {0} : requests exception "
                                 "(search duration : {1} s, timeout: {2} s) : {3}"
                                 .format(self.engine_name, engine_time, timeout_limit, e))
                requests_exception = True
            elif isinstance(e, SearxEngineCaptchaException):
                result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required')
                logger.exception('engine {0} : CAPTCHA'.format(self.engine_name))
                suspended_time = e.suspended_time  # pylint: disable=no-member
            elif isinstance(e, SearxEngineTooManyRequestsException):
                result_container.add_unresponsive_engine(self.engine_name, 'too many requests')
                logger.exception('engine {0} : Too many requests'.format(self.engine_name))
                suspended_time = e.suspended_time  # pylint: disable=no-member
            elif isinstance(e, SearxEngineAccessDeniedException):
                result_container.add_unresponsive_engine(self.engine_name, 'blocked')
                logger.exception('engine {0} : Searx is blocked'.format(self.engine_name))
                suspended_time = e.suspended_time  # pylint: disable=no-member
            else:
                result_container.add_unresponsive_engine(self.engine_name, 'unexpected crash')
                # other errors
                logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))

        else:
            if getattr(threading.current_thread(), '_timeout', False):
                record_error(self.engine_name, 'Timeout')

        # suspend the engine if there is an HTTP error
        # or suspended_time is defined
        with stats_lock:
            if requests_exception or suspended_time:
                # update continuous_errors / suspend_end_time
                self.engine.continuous_errors += 1
                if suspended_time is None:
                    suspended_time = min(settings['search']['max_ban_time_on_fail'],
                                         self.engine.continuous_errors * settings['search']['ban_time_on_fail'])
                self.engine.suspend_end_time = time() + suspended_time
            else:
                # reset the suspend variables
                self.engine.continuous_errors = 0
                self.engine.suspend_end_time = 0
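
# A minimal usage sketch, not part of the module: names like ``engine_module``,
# ``search_query`` and ``result_container`` are assumed to come from the
# surrounding searx machinery (searx.engines, searx.search).
#
#   processor = OnlineProcessor(engine_module, 'example engine')
#   params = processor.get_params(search_query, 'general')
#   if params is not None:
#       processor.search(search_query.query, params, result_container,
#                        time(), search_query.timeout_limit)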