Browse Source

[refactor] duckduckgo engine: improve request logic and code structure (#4837)

Changes:
- Add trailing slash to base URL to prevent potential redirects
- Remove advanced search syntax filtering (no longer guarantees a CAPTCHA)
- Correct pagination offset calculation: Page 2 now starts at offset 10,
  subsequent pages use 10 + (n-2)*15 formula instead of the previous
  broken 20 + (n-2)*50 calculation that caused CAPTCHAs
- Restructure request parameter building to better match a real request
- "kt" cookie is no longer an empty string if the language/region is "all"
- Group related parameter assignments together
- Add header logging to debugging output

Related:

- https://github.com/searxng/searxng/issues/4824
useralias 1 month ago
parent
commit
4fa7de8033
1 changed files with 51 additions and 66 deletions
  1. 51 66
      searx/engines/duckduckgo.py

+ 51 - 66
searx/engines/duckduckgo.py

@@ -58,7 +58,7 @@ paging = True
 time_range_support = True
 safesearch = True  # user can't select but the results are filtered
 
-url = "https://html.duckduckgo.com/html"
+url = "https://html.duckduckgo.com/html/"
 
 time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
 form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
@@ -248,7 +248,6 @@ def quote_ddg_bangs(query):
 
 
 def request(query, params):
-
     query = quote_ddg_bangs(query)
 
     if len(query) >= 500:
@@ -256,93 +255,79 @@ def request(query, params):
         params["url"] = None
         return
 
-    # Advanced search syntax ends in CAPTCHA
-    # https://duckduckgo.com/duckduckgo-help-pages/results/syntax/
-    query = " ".join(
-        [
-            x.removeprefix("site:").removeprefix("intitle:").removeprefix("inurl:").removeprefix("filetype:")
-            for x in query.split()
-        ]
-    )
     eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale)  # type: ignore
-    if eng_region == "wt-wt":
-        # https://html.duckduckgo.com/html sets an empty value for "all".
-        eng_region = ""
-
-    params['data']['kl'] = eng_region
-    params['cookies']['kl'] = eng_region
 
-    # eng_lang = get_ddg_lang(traits, params['searxng_locale'])
+    # Note: The API is reverse-engineered from DuckDuckGo's HTML webpage
+    # (https://html.duckduckgo.com/html/) and may be subject to additional bot detection mechanisms
+    # and breaking changes in the future.
+    #
+    # The params['data'] dictionary can have the following key parameters, in this order:
+    # - q (str): Search query string
+    # - b (str): Beginning parameter - empty string for first page requests
+    # - s (int): Search offset for pagination
+    # - nextParams (str): Continuation parameters from previous page response, typically empty
+    # - v (str): Typically 'l' for subsequent pages
+    # - o (str): Output format, typically 'json'
+    # - dc (int): Display count - value equal to offset (s) + 1
+    # - api (str): API endpoint identifier, typically 'd.js'
+    # - vqd (str): Validation query digest
+    # - kl (str): Keyboard language/region code (e.g., 'en-us')
+    # - df (str): Time filter, maps to values like 'd' (day), 'w' (week), 'm' (month), 'y' (year)
 
-    params['url'] = url
-    params['method'] = 'POST'
     params['data']['q'] = query
 
-    # The API is not documented, so we do some reverse engineering and emulate
-    # what https://html.duckduckgo.com/html does when you press "next Page" link
-    # again and again ..
-
-    params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
-
-    params['headers']['Sec-Fetch-Dest'] = "document"
-    params['headers']['Sec-Fetch-Mode'] = "navigate"  # at least this one is used by ddg's bot detection
-    params['headers']['Sec-Fetch-Site'] = "same-origin"
-    params['headers']['Sec-Fetch-User'] = "?1"
-
-    # Form of the initial search page does have empty values in the form
     if params['pageno'] == 1:
-
         params['data']['b'] = ""
-
-    params['data']['df'] = ''
-    if params['time_range'] in time_range_dict:
-
-        params['data']['df'] = time_range_dict[params['time_range']]
-        params['cookies']['df'] = time_range_dict[params['time_range']]
-
-    if params['pageno'] == 2:
-
-        # second page does have an offset of 20
-        offset = (params['pageno'] - 1) * 20
+    elif params['pageno'] >= 2:
+        offset = 10 + (params['pageno'] - 2) * 15  # Page 2 = 10, Page 3+ = 10 + n*15
         params['data']['s'] = offset
-        params['data']['dc'] = offset + 1
-
-    elif params['pageno'] > 2:
-
-        # third and following pages do have an offset of 20 + n*50
-        offset = 20 + (params['pageno'] - 2) * 50
-        params['data']['s'] = offset
-        params['data']['dc'] = offset + 1
-
-    if params['pageno'] > 1:
-
-        # initial page does not have these additional data in the input form
-        params['data']['o'] = form_data.get('o', 'json')
-        params['data']['api'] = form_data.get('api', 'd.js')
         params['data']['nextParams'] = form_data.get('nextParams', '')
         params['data']['v'] = form_data.get('v', 'l')
-        params['headers']['Referer'] = url
+        params['data']['o'] = form_data.get('o', 'json')
+        params['data']['dc'] = offset + 1
+        params['data']['api'] = form_data.get('api', 'd.js')
 
+        # vqd is required to request other pages after the first one
         vqd = get_vqd(query, eng_region, force_request=False)
-
-        # Certain conditions must be met in order to call up one of the
-        # following pages ...
-
         if vqd:
-            params['data']['vqd'] = vqd  # follow up pages / requests needs a vqd argument
+            params['data']['vqd'] = vqd
         else:
-            # Don't try to call follow up pages without a vqd value.  DDG
-            # recognizes this as a request from a bot.  This lowers the
+            # Don't try to call follow up pages without a vqd value.
+            # DDG recognizes this as a request from a bot. This lowers the
             # reputation of the SearXNG IP and DDG starts to activate CAPTCHAs.
             params["url"] = None
             return
 
         if params['searxng_locale'].startswith("zh"):
-            # Some locales (at least China) do not have a "next page" button and ddg
+            # Some locales (at least China) do not have a "next page" button and DDG
             # will return a HTTP/2 403 Forbidden for a request of such a page.
             params["url"] = None
             return
 
+    # Put empty kl in form data if language/region set to all
+    if eng_region == "wt-wt":
+        params['data']['kl'] = ""
+    else:
+        params['data']['kl'] = eng_region
+
+    params['data']['df'] = ''
+    if params['time_range'] in time_range_dict:
+        params['data']['df'] = time_range_dict[params['time_range']]
+        params['cookies']['df'] = time_range_dict[params['time_range']]
+
+    params['cookies']['kl'] = eng_region
+
+    params['url'] = url
+    params['method'] = 'POST'
+
+    params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
+    params['headers']['Referer'] = url
+    params['headers']['Sec-Fetch-Dest'] = "document"
+    params['headers']['Sec-Fetch-Mode'] = "navigate"  # at least this one is used by ddg's bot detection
+    params['headers']['Sec-Fetch-Site'] = "same-origin"
+    params['headers']['Sec-Fetch-User'] = "?1"
+
+    logger.debug("param headers: %s", params['headers'])
     logger.debug("param data: %s", params['data'])
     logger.debug("param cookies: %s", params['cookies'])