Browse Source

[fix] engine: duckduckgo - CAPTCHA detection

The previous implementation could not distinguish a CAPTCHA response from an
ordinary result list.  It treated a CAPTCHA page as a result list that
contained no items.

DDG does not block IPs.  Instead, it places a CAPTCHA wall in front of any
request it considers dubious.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 7 months ago
parent
commit
050451347b
2 changed files with 18 additions and 5 deletions
  1. 11 0
      searx/engines/duckduckgo.py
  2. 7 5
      searx/exceptions.py

+ 11 - 0
searx/engines/duckduckgo.py

@@ -25,6 +25,7 @@ from searx.network import get  # see https://github.com/searxng/searxng/issues/7
 from searx import redisdb
 from searx import redisdb
 from searx.enginelib.traits import EngineTraits
 from searx.enginelib.traits import EngineTraits
 from searx.utils import extr
 from searx.utils import extr
+from searx.exceptions import SearxEngineCaptchaException
 
 
 if TYPE_CHECKING:
 if TYPE_CHECKING:
     import logging
     import logging
@@ -292,6 +293,15 @@ def request(query, params):
     return params
     return params
 
 
 
 
+def detect_ddg_captcha(dom):
+    """In case of CAPTCHA ddg open its own *not a Robot* dialog and is
+    not redirected to CAPTCHA page.
+    """
+    if eval_xpath(dom, "//form[@id='challenge-form']"):
+        # set suspend time to zero is OK --> ddg does not block the IP
+        raise SearxEngineCaptchaException(suspended_time=0)
+
+
 def response(resp):
 def response(resp):
 
 
     if resp.status_code == 303:
     if resp.status_code == 303:
@@ -299,6 +309,7 @@ def response(resp):
 
 
     results = []
     results = []
     doc = lxml.html.fromstring(resp.text)
     doc = lxml.html.fromstring(resp.text)
+    detect_ddg_captcha(doc)
 
 
     result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
     result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
 
 

+ 7 - 5
searx/exceptions.py

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """Exception types raised by SearXNG modules.
 """Exception types raised by SearXNG modules.
 """
 """
+from __future__ import annotations
 
 
 from typing import Optional, Union
 from typing import Optional, Union
 
 
@@ -61,7 +62,7 @@ class SearxEngineAccessDeniedException(SearxEngineResponseException):
     """This settings contains the default suspended time (default 86400 sec / 1
     """This settings contains the default suspended time (default 86400 sec / 1
     day)."""
     day)."""
 
 
-    def __init__(self, suspended_time: int = None, message: str = 'Access denied'):
+    def __init__(self, suspended_time: int | None = None, message: str = 'Access denied'):
         """Generic exception to raise when an engine denies access to the results.
         """Generic exception to raise when an engine denies access to the results.
 
 
         :param suspended_time: How long the engine is going to be suspended in
         :param suspended_time: How long the engine is going to be suspended in
@@ -70,12 +71,13 @@ class SearxEngineAccessDeniedException(SearxEngineResponseException):
         :param message: Internal message.  Defaults to ``Access denied``
         :param message: Internal message.  Defaults to ``Access denied``
         :type message: str
         :type message: str
         """
         """
-        suspended_time = suspended_time or self._get_default_suspended_time()
+        if suspended_time is None:
+            suspended_time = self._get_default_suspended_time()
         super().__init__(message + ', suspended_time=' + str(suspended_time))
         super().__init__(message + ', suspended_time=' + str(suspended_time))
         self.suspended_time = suspended_time
         self.suspended_time = suspended_time
         self.message = message
         self.message = message
 
 
-    def _get_default_suspended_time(self):
+    def _get_default_suspended_time(self) -> int:
         from searx import get_setting  # pylint: disable=C0415
         from searx import get_setting  # pylint: disable=C0415
 
 
         return get_setting(self.SUSPEND_TIME_SETTING)
         return get_setting(self.SUSPEND_TIME_SETTING)
@@ -88,7 +90,7 @@ class SearxEngineCaptchaException(SearxEngineAccessDeniedException):
     """This settings contains the default suspended time (default 86400 sec / 1
     """This settings contains the default suspended time (default 86400 sec / 1
     day)."""
     day)."""
 
 
-    def __init__(self, suspended_time=None, message='CAPTCHA'):
+    def __init__(self, suspended_time: int | None = None, message='CAPTCHA'):
         super().__init__(message=message, suspended_time=suspended_time)
         super().__init__(message=message, suspended_time=suspended_time)
 
 
 
 
@@ -102,7 +104,7 @@ class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException):
     """This settings contains the default suspended time (default 3660 sec / 1
     """This settings contains the default suspended time (default 3660 sec / 1
     hour)."""
     hour)."""
 
 
-    def __init__(self, suspended_time=None, message='Too many request'):
+    def __init__(self, suspended_time: int | None = None, message='Too many request'):
         super().__init__(message=message, suspended_time=suspended_time)
         super().__init__(message=message, suspended_time=suspended_time)