Browse Source

[fix] duckduckgo extra: crashes and returns no results

Bnyro 6 months ago
parent
commit
66f6495a22
2 changed files with 66 additions and 50 deletions
  1. 54 44
      searx/engines/duckduckgo.py
  2. 12 6
      searx/engines/duckduckgo_extra.py

+ 54 - 44
searx/engines/duckduckgo.py

@@ -1,12 +1,14 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """
 """
-DuckDuckGo Lite
-~~~~~~~~~~~~~~~
+DuckDuckGo WEB
+~~~~~~~~~~~~~~
 """
 """
 
 
+from __future__ import annotations
+
 from typing import TYPE_CHECKING
 from typing import TYPE_CHECKING
 import re
 import re
-from urllib.parse import urlencode
+from urllib.parse import urlencode, quote_plus
 import json
 import json
 import babel
 import babel
 import lxml.html
 import lxml.html
@@ -18,12 +20,12 @@ from searx import (
 )
 )
 from searx.utils import (
 from searx.utils import (
     eval_xpath,
     eval_xpath,
+    extr,
     extract_text,
     extract_text,
 )
 )
 from searx.network import get  # see https://github.com/searxng/searxng/issues/762
 from searx.network import get  # see https://github.com/searxng/searxng/issues/762
 from searx import redisdb
 from searx import redisdb
 from searx.enginelib.traits import EngineTraits
 from searx.enginelib.traits import EngineTraits
-from searx.utils import extr
 from searx.exceptions import SearxEngineCaptchaException
 from searx.exceptions import SearxEngineCaptchaException
 
 
 if TYPE_CHECKING:
 if TYPE_CHECKING:
@@ -60,42 +62,30 @@ form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
 __CACHE = []
 __CACHE = []
 
 
 
 
-def _cache_key(data: dict):
-    return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{data['q']}//{data['kl']}")
+def _cache_key(query: str, region: str):
+    return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{query}//{region}")
 
 
 
 
-def cache_vqd(data: dict, value):
+def cache_vqd(query: str, region: str, value: str):
     """Caches a ``vqd`` value from a query."""
     """Caches a ``vqd`` value from a query."""
     c = redisdb.client()
     c = redisdb.client()
     if c:
     if c:
-        logger.debug("cache vqd value: %s", value)
-        c.set(_cache_key(data), value, ex=600)
+        logger.debug("VALKEY cache vqd value: %s (%s)", value, region)
+        c.set(_cache_key(query, region), value, ex=600)
 
 
     else:
     else:
-        logger.debug("MEM cache vqd value: %s", value)
+        logger.debug("MEM cache vqd value: %s (%s)", value, region)
         if len(__CACHE) > 100:  # cache vqd from last 100 queries
         if len(__CACHE) > 100:  # cache vqd from last 100 queries
             __CACHE.pop(0)
             __CACHE.pop(0)
-        __CACHE.append((_cache_key(data), value))
-
+        __CACHE.append((_cache_key(query, region), value))
 
 
-def get_vqd(data):
-    """Returns the ``vqd`` that fits to the *query* (``data`` from HTTP POST).
 
 
-    DDG's bot detection is sensitive to the ``vqd`` value.  For some search terms
-    (such as extremely long search terms that are often sent by bots), no ``vqd``
-    value can be determined.
+def get_vqd(query: str, region: str, force_request: bool = False):
+    """Returns the ``vqd`` value that matches the *query*.
 
 
-    If SearXNG cannot determine a ``vqd`` value, then no request should go out
-    to DDG:
-
-        A request with a wrong ``vqd`` value leads to DDG temporarily putting
-        SearXNG's IP on a block list.
-
-        Requests from IPs in this block list run into timeouts.
-
-    Not sure, but it seems the block list is a sliding window: to get my IP rid
-    from the bot list I had to cool down my IP for 1h (send no requests from
-    that IP to DDG).
+    :param query: The query term
+    :param region: DDG's region code
+    :param force_request: force a request to get a vqd value from DDG
 
 
     TL;DR; the ``vqd`` value is needed to pass DDG's bot protection and is used
     TL;DR; the ``vqd`` value is needed to pass DDG's bot protection and is used
     by all requests to DDG:
     by all requests to DDG:
@@ -106,23 +96,46 @@ def get_vqd(data):
     - DuckDuckGo Videos: ``https://duckduckgo.com/v.js?q=...&vqd=...``
     - DuckDuckGo Videos: ``https://duckduckgo.com/v.js?q=...&vqd=...``
     - DuckDuckGo News: ``https://duckduckgo.com/news.js?q=...&vqd=...``
     - DuckDuckGo News: ``https://duckduckgo.com/news.js?q=...&vqd=...``
 
 
+    DDG's bot detection is sensitive to the ``vqd`` value.  For some search terms
+    (such as extremely long search terms that are often sent by bots), no ``vqd``
+    value can be determined.
+
+    If SearXNG cannot determine a ``vqd`` value, then no request should go out
+    to DDG.
+
+    .. attention::
+
+       A request with a wrong ``vqd`` value leads to DDG temporarily putting
+       SearXNG's IP on a block list.
+
+    Requests from IPs on this block list run into timeouts.  Not sure, but it
+    seems the block list is a sliding window: to get my IP off the block list
+    I had to cool down my IP for 1h (send no requests from that IP to DDG).
     """
     """
+    key = _cache_key(query, region)
 
 
-    key = _cache_key(data)
-    value = None
     c = redisdb.client()
     c = redisdb.client()
     if c:
     if c:
         value = c.get(key)
         value = c.get(key)
         if value or value == b'':
         if value or value == b'':
-            value = value.decode('utf-8')
+            value = value.decode('utf-8')  # type: ignore
             logger.debug("re-use CACHED vqd value: %s", value)
             logger.debug("re-use CACHED vqd value: %s", value)
             return value
             return value
 
 
-    else:
-        for k, value in __CACHE:
-            if k == key:
-                logger.debug("MEM re-use CACHED vqd value: %s", value)
+    for k, value in __CACHE:
+        if k == key:
+            logger.debug("MEM re-use CACHED vqd value: %s", value)
+            return value
+
+    if force_request:
+        resp = get(f'https://duckduckgo.com/?q={quote_plus(query)}')
+        if resp.status_code == 200:  # type: ignore
+            value = extr(resp.text, 'vqd="', '"')  # type: ignore
+            if value:
+                logger.debug("vqd value from DDG request: %s", value)
+                cache_vqd(query, region, value)
                 return value
                 return value
+
     return None
     return None
 
 
 
 
@@ -251,7 +264,7 @@ def request(query, params):
             for x in query.split()
             for x in query.split()
         ]
         ]
     )
     )
-    eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
+    eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
     if eng_region == "wt-wt":
     if eng_region == "wt-wt":
         # https://html.duckduckgo.com/html sets an empty value for "all".
         # https://html.duckduckgo.com/html sets an empty value for "all".
         eng_region = ""
         eng_region = ""
@@ -310,10 +323,7 @@ def request(query, params):
         params['data']['v'] = form_data.get('v', 'l')
         params['data']['v'] = form_data.get('v', 'l')
         params['headers']['Referer'] = url
         params['headers']['Referer'] = url
 
 
-        # from here on no more params['data'] shuld be set, since this dict is
-        # needed to get a vqd value from the cache ..
-
-        vqd = get_vqd(params['data'])
+        vqd = get_vqd(query, eng_region, force_request=False)
 
 
         # Certain conditions must be met in order to call up one of the
         # Certain conditions must be met in order to call up one of the
         # following pages ...
         # following pages ...
@@ -362,7 +372,7 @@ def response(resp):
         form = form[0]
         form = form[0]
         form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
         form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
 
 
-        cache_vqd(resp.search_params["data"], form_vqd)
+        cache_vqd(resp.search_params['data']['q'], resp.search_params['data']['kl'], form_vqd)
 
 
     # just select "web-result" and ignore results of class "result--ad result--ad--small"
     # just select "web-result" and ignore results of class "result--ad result--ad--small"
     for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'):
     for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'):
@@ -379,7 +389,7 @@ def response(resp):
         results.append(item)
         results.append(item)
 
 
     zero_click_info_xpath = '//div[@id="zero_click_abstract"]'
     zero_click_info_xpath = '//div[@id="zero_click_abstract"]'
-    zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip()
+    zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip()  # type: ignore
 
 
     if zero_click and (
     if zero_click and (
         "Your IP address is" not in zero_click
         "Your IP address is" not in zero_click
@@ -432,7 +442,7 @@ def fetch_traits(engine_traits: EngineTraits):
     if not resp.ok:  # type: ignore
     if not resp.ok:  # type: ignore
         print("ERROR: response from DuckDuckGo is not OK.")
         print("ERROR: response from DuckDuckGo is not OK.")
 
 
-    js_code = extr(resp.text, 'regions:', ',snippetLengths')
+    js_code = extr(resp.text, 'regions:', ',snippetLengths')  # type: ignore
 
 
     regions = json.loads(js_code)
     regions = json.loads(js_code)
     for eng_tag, name in regions.items():
     for eng_tag, name in regions.items():
@@ -466,7 +476,7 @@ def fetch_traits(engine_traits: EngineTraits):
 
 
     engine_traits.custom['lang_region'] = {}
     engine_traits.custom['lang_region'] = {}
 
 
-    js_code = extr(resp.text, 'languages:', ',regions')
+    js_code = extr(resp.text, 'languages:', ',regions')  # type: ignore
 
 
     languages = js_variable_to_python(js_code)
     languages = js_variable_to_python(js_code)
     for eng_lang, name in languages.items():
     for eng_lang, name in languages.items():

+ 12 - 6
searx/engines/duckduckgo_extra.py

@@ -4,16 +4,15 @@ DuckDuckGo Extra (images, videos, news)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 """
 """
 
 
+from __future__ import annotations
+
 from datetime import datetime
 from datetime import datetime
 from typing import TYPE_CHECKING
 from typing import TYPE_CHECKING
 from urllib.parse import urlencode
 from urllib.parse import urlencode
 from searx.utils import get_embeded_stream_url
 from searx.utils import get_embeded_stream_url
 
 
 from searx.engines.duckduckgo import fetch_traits  # pylint: disable=unused-import
 from searx.engines.duckduckgo import fetch_traits  # pylint: disable=unused-import
-from searx.engines.duckduckgo import (
-    get_ddg_lang,
-    get_vqd,
-)
+from searx.engines.duckduckgo import get_ddg_lang, get_vqd
 from searx.enginelib.traits import EngineTraits
 from searx.enginelib.traits import EngineTraits
 
 
 if TYPE_CHECKING:
 if TYPE_CHECKING:
@@ -48,15 +47,16 @@ search_path_map = {'images': 'i', 'videos': 'v', 'news': 'news'}
 
 
 
 
 def request(query, params):
 def request(query, params):
+    eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
 
 
     # request needs a vqd argument
     # request needs a vqd argument
-    vqd = get_vqd(query)
+    vqd = get_vqd(query, eng_region, force_request=True)
+
     if not vqd:
     if not vqd:
         # some search terms do not have results and therefore no vqd value
         # some search terms do not have results and therefore no vqd value
         params['url'] = None
         params['url'] = None
         return params
         return params
 
 
-    eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
     eng_lang = get_ddg_lang(traits, params['searxng_locale'])
     eng_lang = get_ddg_lang(traits, params['searxng_locale'])
 
 
     args = {
     args = {
@@ -86,6 +86,12 @@ def request(query, params):
 
 
     params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}'
     params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}'
 
 
+    # sending these two headers prevents rate limiting for the query
+    params['headers'] = {
+        'Referer': 'https://duckduckgo.com/',
+        'X-Requested-With': 'XMLHttpRequest',
+    }
+
     return params
     return params