
[enh] add raise_for_httperror

check HTTP response:
* detect some common CAPTCHA challenges (no solving). In this case the engine is suspended for a long time.
* otherwise raise HTTPError as before

the check is done in poolrequests.py (it was previously in search.py).

update qwant, wikipedia, wikidata to use raise_for_httperror instead of raise_for_status
Alexandre Flament
commit d703119d3a
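
For context, a minimal sketch of the new flow from an engine's point of view (illustrative only: the URL is made up, and the snippet assumes the poolrequests get() wrapper):

    from searx import poolrequests
    from searx.exceptions import SearxEngineAccessDeniedException

    # poolrequests.request() now runs raise_for_httperror() on every
    # response by default, so HTTP errors and detected CAPTCHA challenges
    # surface as exceptions that carry a suspension duration
    try:
        resp = poolrequests.get('https://example.org/search?q=test')
    except SearxEngineAccessDeniedException as e:
        print('engine suspended for', e.suspended_time, 'seconds')

    # an engine that wants to inspect error responses itself opts out
    resp = poolrequests.get('https://example.org/search?q=test',
                            raise_for_httperror=False)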

+ 13 - 13
docs/dev/engine_overview.rst

@@ -134,19 +134,19 @@ The function ``def request(query, params):`` always returns the ``params``
 variable.  Inside searx, the following parameters can be used to specify a search
 request:
 
-================== =========== ==========================================================================
-argument           type        information
-================== =========== ==========================================================================
-url                string      requested url
-method             string      HTTP request method
-headers            set         HTTP header information
-data               set         HTTP data information (parsed if ``method != 'GET'``)
-cookies            set         HTTP cookies
-verify             boolean     Performing SSL-Validity check
-max_redirects      int         maximum redirects, hard limit
-soft_max_redirects int         maximum redirects, soft limit. Record an error but don't stop the engine
-raise_for_status   bool        True by default: raise an exception if the HTTP code of response is >= 300
-================== =========== ==========================================================================
+=================== =========== ==========================================================================
+argument            type        information
+=================== =========== ==========================================================================
+url                 string      requested url
+method              string      HTTP request method
+headers             set         HTTP header information
+data                set         HTTP data information (parsed if ``method != 'GET'``)
+cookies             set         HTTP cookies
+verify              boolean     Performing SSL-Validity check
+max_redirects       int         maximum redirects, hard limit
+soft_max_redirects  int         maximum redirects, soft limit. Record an error but don't stop the engine
+raise_for_httperror bool        True by default: raise an exception if the HTTP code of response is >= 300
+=================== =========== ==========================================================================
 
 
 example code
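
Below the updated table, a hedged example of how an engine's request() might use the renamed parameter (the engine and URL here are hypothetical, not from this commit):

    def request(query, params):
        # hypothetical API endpoint
        params['url'] = 'https://example.org/api?q=' + query
        # handle HTTP errors in response() instead of letting
        # poolrequests raise automatically
        params['raise_for_httperror'] = False
        params['soft_max_redirects'] = 1
        return params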

+ 6 - 2
searx/engines/__init__.py

@@ -281,8 +281,12 @@ def initialize_engines(engine_list):
     load_engines(engine_list)
 
     def engine_init(engine_name, init_fn):
-        init_fn(get_engine_from_settings(engine_name))
-        logger.debug('%s engine: Initialized', engine_name)
+        try:
+            init_fn(get_engine_from_settings(engine_name))
+        except Exception:
+            logger.exception('%s engine: Fail to initialize', engine_name)
+        else:
+            logger.debug('%s engine: Initialized', engine_name)
 
     for engine_name, engine in engines.items():
         if hasattr(engine, 'init'):

+ 16 - 11
searx/engines/qwant.py

@@ -14,6 +14,8 @@ from datetime import datetime
 from json import loads
 from urllib.parse import urlencode
 from searx.utils import html_to_text, match_language
+from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
+from searx.raise_for_httperror import raise_for_httperror
 
 
 # engine dependent config
@@ -24,8 +26,7 @@ supported_languages_url = 'https://qwant.com/region'
 
 category_to_keyword = {'general': 'web',
                        'images': 'images',
-                       'news': 'news',
-                       'social media': 'social'}
+                       'news': 'news'}
 
 # search-url
 url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4'
@@ -51,6 +52,7 @@ def request(query, params):
         params['url'] += '&locale=' + language.replace('-', '_').lower()
 
     params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
+    params['raise_for_httperror'] = False
     return params
 
 
@@ -58,8 +60,20 @@ def request(query, params):
 def response(resp):
     results = []
 
+    # According to https://www.qwant.com/js/app.js
+    if resp.status_code == 429:
+        raise SearxEngineCaptchaException()
+
+    # raise for other errors
+    raise_for_httperror(resp)
+
+    # load JSON result
     search_results = loads(resp.text)
 
+    # check for an API error
+    if search_results.get('status') != 'success':
+        raise SearxEngineAPIException('API error ' + str(search_results.get('error', '')))
+
     # return empty array if there are no results
     if 'data' not in search_results:
         return []
@@ -90,15 +104,6 @@ def response(resp):
                             'thumbnail_src': thumbnail_src,
                             'img_src': img_src})
 
-        elif category_to_keyword.get(categories[0], '') == 'social':
-            published_date = datetime.fromtimestamp(result['date'], None)
-            img_src = result.get('img', None)
-            results.append({'url': res_url,
-                            'title': title,
-                            'publishedDate': published_date,
-                            'content': content,
-                            'img_src': img_src})
-
         elif category_to_keyword.get(categories[0], '') == 'news':
             published_date = datetime.fromtimestamp(result['date'], None)
             media = result.get('media', [])
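
Note the division of labor in qwant.py: request() disables the automatic check with params['raise_for_httperror'] = False, so response() handles errors itself. It maps HTTP 429 to a CAPTCHA suspension, calls raise_for_httperror(resp) for every other error, and then checks the API-level status field.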

+ 0 - 3
searx/engines/wikidata.py

@@ -161,9 +161,6 @@ def request(query, params):
 
 def response(resp):
     results = []
-    if resp.status_code != 200:
-        logger.debug('SPARQL endpoint error %s', resp.content.decode())
-    resp.raise_for_status()
     jsonresponse = loads(resp.content.decode())
 
     language = resp.search_params['language'].lower()

+ 3 - 1
searx/engines/wikipedia.py

@@ -14,6 +14,7 @@ from urllib.parse import quote
 from json import loads
 from lxml.html import fromstring
 from searx.utils import match_language, searx_useragent
+from searx.raise_for_httperror import raise_for_httperror
 
 # search-url
 search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
@@ -37,7 +38,7 @@ def request(query, params):
                                       language=url_lang(params['language']))
 
     params['headers']['User-Agent'] = searx_useragent()
-    params['raise_for_status'] = False
+    params['raise_for_httperror'] = False
     params['soft_max_redirects'] = 2
 
     return params
@@ -47,6 +48,7 @@ def request(query, params):
 def response(resp):
     if resp.status_code == 404:
         return []
+    raise_for_httperror(resp)
 
     results = []
     api_result = loads(resp.text)

+ 27 - 2
searx/exceptions.py

@@ -64,8 +64,33 @@ class SearxEngineAPIException(SearxEngineResponseException):
     """The website has returned an application error"""
     """The website has returned an application error"""
 
 
 
 
-class SearxEngineCaptchaException(SearxEngineResponseException):
-    """The website has returned a CAPTCHA"""
+class SearxEngineAccessDeniedException(SearxEngineResponseException):
+    """The website is blocking the access"""
+
+    def __init__(self, suspended_time=24 * 3600, message='Access denied'):
+        super().__init__(message + ', suspended_time=' + str(suspended_time))
+        self.suspended_time = suspended_time
+        self.message = message
+
+
+class SearxEngineCaptchaException(SearxEngineAccessDeniedException):
+    """The website has returned a CAPTCHA
+
+    By default, searx stops sending requests to this engine for 1 day.
+    """
+
+    def __init__(self, suspended_time=24 * 3600, message='CAPTCHA'):
+        super().__init__(message=message, suspended_time=suspended_time)
+
+
+class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException):
+    """The website has returned a Too Many Request status code
+
+    By default, searx stops sending requests to this engine for 1 hour.
+    """
+
+    def __init__(self, suspended_time=3600, message='Too many request'):
+        super().__init__(message=message, suspended_time=suspended_time)
 
 
 class SearxEngineXPathException(SearxEngineResponseException):
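
A short usage sketch of the new hierarchy (assumed usage, not part of the diff): both subclasses inherit suspended_time and message, so callers can catch the base class and read the suspension duration:

    from searx.exceptions import (SearxEngineAccessDeniedException,
                                  SearxEngineTooManyRequestsException)

    try:
        # an engine may override the default one-hour suspension
        raise SearxEngineTooManyRequestsException(suspended_time=1800)
    except SearxEngineAccessDeniedException as e:
        print(e.message, e.suspended_time)  # prints: Too many request 1800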

+ 6 - 1
searx/metrology/error_recorder.py

@@ -4,7 +4,8 @@ import logging
 from json import JSONDecodeError
 from urllib.parse import urlparse
 from requests.exceptions import RequestException
-from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
+from searx.exceptions import (SearxXPathSyntaxException, SearxEngineXPathException, SearxEngineAPIException,
+                              SearxEngineAccessDeniedException)
 from searx import logger
 
 
@@ -100,6 +101,10 @@ def get_messages(exc, filename) -> typing.Tuple:
         return (exc.xpath_str, exc.message)
     if isinstance(exc, SearxEngineXPathException):
         return (exc.xpath_str, exc.message)
+    if isinstance(exc, SearxEngineAPIException):
+        return (str(exc.args[0]), )
+    if isinstance(exc, SearxEngineAccessDeniedException):
+        return (exc.message, )
     return ()
 
 

+ 11 - 0
searx/poolrequests.py

@@ -7,6 +7,7 @@ import requests
 
 from searx import settings
 from searx import logger
+from searx.raise_for_httperror import raise_for_httperror
 
 
 logger = logger.getChild('poolrequests')
@@ -156,6 +157,12 @@ def request(method, url, **kwargs):
         if timeout is not None:
             kwargs['timeout'] = timeout
 
+    # raise_for_error
+    check_for_httperror = True
+    if 'raise_for_httperror' in kwargs:
+        check_for_httperror = kwargs['raise_for_httperror']
+        del kwargs['raise_for_httperror']
+
     # do request
     response = session.request(method=method, url=url, **kwargs)
 
@@ -176,6 +183,10 @@ def request(method, url, **kwargs):
     if hasattr(threadLocal, 'total_time'):
         threadLocal.total_time += time_after_request - time_before_request
 
+    # raise an exception
+    if check_for_httperror:
+        raise_for_httperror(response)
+
     return response
 
 
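
Design note: the raise_for_httperror kwarg must be stripped out before session.request() is called, because requests does not accept unknown keyword arguments. The same pop-with-default can be written more compactly (an equivalent sketch, not the committed code):

    check_for_httperror = kwargs.pop('raise_for_httperror', True)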

+ 66 - 0
searx/raise_for_httperror.py

@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+Raise an exception when an HTTP response is an error.
+"""
+from searx.exceptions import (SearxEngineCaptchaException, SearxEngineTooManyRequestsException,
+                              SearxEngineAccessDeniedException)
+
+
+def is_cloudflare_challenge(resp):
+    if resp.status_code in [429, 503]:
+        if ('__cf_chl_jschl_tk__=' in resp.text)\
+           or ('/cdn-cgi/challenge-platform/' in resp.text
+               and 'orchestrate/jsch/v1' in resp.text
+               and 'window._cf_chl_enter(' in resp.text):
+            return True
+    if resp.status_code == 403 and '__cf_chl_captcha_tk__=' in resp.text:
+        return True
+    return False
+
+
+def is_cloudflare_firewall(resp):
+    return resp.status_code == 403 and '<span class="cf-error-code">1020</span>' in resp.text
+
+
+def raise_for_cloudflare_captcha(resp):
+    if resp.headers.get('Server', '').startswith('cloudflare'):
+        if is_cloudflare_challenge(resp):
+            # https://support.cloudflare.com/hc/en-us/articles/200170136-Understanding-Cloudflare-Challenge-Passage-Captcha-
+            # suspend for 2 weeks
+            raise SearxEngineCaptchaException(message='Cloudflare CAPTCHA', suspended_time=3600 * 24 * 15)
+
+        if is_cloudflare_firewall(resp):
+            raise SearxEngineAccessDeniedException(message='Cloudflare Firewall', suspended_time=3600 * 24)
+
+
+def raise_for_recaptcha(resp):
+    if resp.status_code == 503 \
+       and '"https://www.google.com/recaptcha/' in resp.text:
+        raise SearxEngineCaptchaException(message='ReCAPTCHA', suspended_time=3600 * 24 * 7)
+
+
+def raise_for_captcha(resp):
+    raise_for_cloudflare_captcha(resp)
+    raise_for_recaptcha(resp)
+
+
+def raise_for_httperror(resp):
+    """Raise exception for an HTTP response is an error.
+
+    Args:
+        resp (requests.Response): Response to check
+
+    Raises:
+        requests.HTTPError: raised by resp.raise_for_status()
+        searx.exceptions.SearxEngineAccessDeniedException: raised when the HTTP status code is 402 or 403.
+        searx.exceptions.SearxEngineTooManyRequestsException: raised when the HTTP status code is 429.
+        searx.exceptions.SearxEngineCaptchaException: raised when a CAPTCHA challenge is detected.
+    """
+    if resp.status_code and resp.status_code >= 400:
+        raise_for_captcha(resp)
+        if resp.status_code in (402, 403):
+            raise SearxEngineAccessDeniedException(message='HTTP error ' + str(resp.status_code),
+                                                   suspended_time=3600 * 24)
+        if resp.status_code == 429:
+            raise SearxEngineTooManyRequestsException()
+        resp.raise_for_status()
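
A minimal way to exercise these checks without any network traffic (illustrative only; a bare namespace stands in for requests.Response):

    from types import SimpleNamespace
    from searx.raise_for_httperror import raise_for_httperror
    from searx.exceptions import SearxEngineTooManyRequestsException

    # headers={} means the Cloudflare checks are skipped, so a plain 429
    # maps to SearxEngineTooManyRequestsException (1 hour by default)
    fake = SimpleNamespace(status_code=429, text='', headers={})
    try:
        raise_for_httperror(fake)
    except SearxEngineTooManyRequestsException as e:
        print('suspended for', e.suspended_time, 'seconds')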

+ 24 - 12
searx/search.py

@@ -32,7 +32,8 @@ from searx.utils import gen_useragent
 from searx.results import ResultContainer
 from searx import logger
 from searx.plugins import plugins
-from searx.exceptions import SearxEngineCaptchaException
+from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException,
+                              SearxEngineTooManyRequestsException,)
 from searx.metrology.error_recorder import record_exception, record_error
 
 
@@ -131,6 +132,9 @@ def send_http_request(engine, request_params):
     # soft_max_redirects
     soft_max_redirects = request_params.get('soft_max_redirects', max_redirects or 0)
 
+    # raise_for_httperror
+    request_args['raise_for_httperror'] = request_params.get('raise_for_httperror', False)
+
     # specific type of request (GET or POST)
     if request_params['method'] == 'GET':
         req = requests_lib.get
@@ -142,10 +146,6 @@ def send_http_request(engine, request_params):
     # send the request
     response = req(request_params['url'], **request_args)
 
-    # check HTTP status
-    if request_params.get('raise_for_status'):
-        response.raise_for_status()
-
     # check soft limit of the redirect count
     if len(response.history) > soft_max_redirects:
         # unexpected redirect : record an error
@@ -191,6 +191,7 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
 
     # suppose everything will be alright
     requests_exception = False
+    suspended_time = None
 
     try:
         # send requests and parse the results
@@ -240,6 +241,15 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
         elif (issubclass(e.__class__, SearxEngineCaptchaException)):
             result_container.add_unresponsive_engine(engine_name, 'CAPTCHA required')
             logger.exception('engine {0} : CAPTCHA')
+            suspended_time = e.suspended_time  # pylint: disable=no-member
+        elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)):
+            result_container.add_unresponsive_engine(engine_name, 'too many requests')
+            logger.exception('engine {0} : Too many requests')
+            suspended_time = e.suspended_time  # pylint: disable=no-member
+        elif (issubclass(e.__class__, SearxEngineAccessDeniedException)):
+            result_container.add_unresponsive_engine(engine_name, 'blocked')
+            logger.exception('engine {0} : Searx is blocked')
+            suspended_time = e.suspended_time  # pylint: disable=no-member
         else:
             result_container.add_unresponsive_engine(engine_name, 'unexpected crash')
             # others errors
@@ -248,16 +258,18 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
         if getattr(threading.current_thread(), '_timeout', False):
             record_error(engine_name, 'Timeout')
 
-    # suspend or not the engine if there are HTTP errors
+    # suspend the engine if there is an HTTP error
+    # or suspended_time is defined
     with threading.RLock():
-        if requests_exception:
+        if requests_exception or suspended_time:
             # update continuous_errors / suspend_end_time
             engine.continuous_errors += 1
-            engine.suspend_end_time = time() + min(settings['search']['max_ban_time_on_fail'],
-                                                   engine.continuous_errors * settings['search']['ban_time_on_fail'])
+            if suspended_time is None:
+                suspended_time = min(settings['search']['max_ban_time_on_fail'],
+                                     engine.continuous_errors * settings['search']['ban_time_on_fail'])
+            engine.suspend_end_time = time() + suspended_time
         else:
-            # no HTTP error (perhaps an engine error)
-            # anyway, reset the suspend variables
+            # reset the suspend variables
             engine.continuous_errors = 0
             engine.suspend_end_time = 0
 
@@ -342,7 +354,7 @@ def default_request_params():
         'cookies': {},
         'verify': True,
         'auth': None,
-        'raise_for_status': True
+        'raise_for_httperror': True
     }
 
 
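
A worked sketch of the suspension arithmetic above (ban_time_on_fail=5 and max_ban_time_on_fail=120 are assumed defaults from settings.yml):

    # mirrors the logic in search_one_http_request_safe
    def suspension_seconds(continuous_errors, suspended_time=None,
                           ban_time=5, max_ban_time=120):
        if suspended_time is not None:
            # an exception such as a CAPTCHA dictates its own duration
            return suspended_time
        # otherwise the ban grows linearly with consecutive failures
        return min(max_ban_time, continuous_errors * ban_time)

    print(suspension_seconds(3))                       # 15
    print(suspension_seconds(100))                     # capped at 120
    print(suspension_seconds(1, suspended_time=3600))  # 3600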

+ 7 - 11
searx/settings.yml

@@ -647,11 +647,6 @@ engines:
     shortcut : qwn
     categories : news
 
-  - name : qwant social
-    engine : qwant
-    shortcut : qws
-    categories : social media
-
 #  - name: library
 #    engine: recoll
 #    shortcut: lib
@@ -817,12 +812,13 @@ engines:
     # Or you can use the html non-stable engine, activated by default
     engine : youtube_noapi
 
-  - name : yggtorrent
-    engine : yggtorrent
-    shortcut : ygg
-    url: https://www2.yggtorrent.si/
-    disabled : True
-    timeout : 4.0
+  # tmp suspended: Cloudflare CAPTCHA
+  #- name : yggtorrent
+  #  engine : yggtorrent
+  #  shortcut : ygg
+  #  url: https://www2.yggtorrent.si/
+  #  disabled : True
+  #  timeout : 4.0
 
   - name : dailymotion
     engine : dailymotion