
[fix] startpage engine: resolve instant CAPTCHA issues (#4890)

Changes:
- Improve log messages for better debugging of future CAPTCHA issues
- Fix erroneous get_sc_url, where the sc argument was always blank (when no cached value was available)
- Move Origin and Referer headers to request() function
- Add missing form parameters (abp, abd, abe) required by Startpage
  to avoid being flagged as automated requests
- Include segment parameter for paginated requests
- Clean up unnecessary commented-out headers
- Fix minor typos, e.g. "time-stamp" → "timestamp", "scrap" → "scrapes"

Related:
- https://github.com/searxng/searxng/issues/4673
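
For reference, a rough sketch of the request shape after this change, written
against plain httpx (SearXNG's network layer wraps httpx). The query, category,
page number, and sc value below are illustrative placeholders, and the
/sp/search path assumes the engine's search_url of base_url + '/sp/search':

import httpx

base_url = 'https://www.startpage.com'

form = {
    'query': 'example query',      # illustrative
    'cat': 'web',                  # illustrative category
    't': 'device',
    'sc': '<freshly scraped sc>',  # placeholder: must be scraped from the home page
    'with_date': '',
    'abp': '1',                    # form parameters added by this commit ...
    'abd': '1',
    'abe': '1',                    # ... to avoid being flagged as automated
    'page': 2,                     # pagination only
    'segment': 'startpage.udog',   # pagination only (added by this commit)
}

headers = {
    'Origin': base_url,            # moved into request() by this commit
    'Referer': base_url + '/',
}

resp = httpx.post(base_url + '/sp/search', data=form, headers=headers)
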
useralias committed 3 days ago · commit b73ac81815
1 changed file with 21 additions and 24 deletions

+ 21 - 24
searx/engines/startpage.py

@@ -183,23 +183,19 @@ def get_sc_code(searxng_locale, params):
     Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
     <search_form_xpath>`.  Without this argument Startpage considers the request
     is from a bot.  We do not know what is encoded in the value of the ``sc``
-    argument, but it seems to be a kind of a *time-stamp*.
+    argument, but it seems to be a kind of a *timestamp*.
 
     Startpage's search form generates a new sc-code on each request.  This
-    function scrap a new sc-code from Startpage's home page every
+    function scrapes a new sc-code from Startpage's home page every
     :py:obj:`sc_code_cache_sec` seconds."""
 
-    sc_code = CACHE.get("SC_CODE", "")
+    sc_code = CACHE.get("SC_CODE")
+
     if sc_code:
+        logger.debug("get_sc_code: using cached value: %s", sc_code)
         return sc_code
 
     headers = {**params['headers']}
-    headers['Origin'] = base_url
-    headers['Referer'] = base_url + '/'
-    # headers['Connection'] = 'keep-alive'
-    # headers['Accept-Encoding'] = 'gzip, deflate, br'
-    # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
-    # headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0'
 
     # add Accept-Language header
     if searxng_locale == 'all':
@@ -216,9 +212,9 @@ def get_sc_code(searxng_locale, params):
             )
         headers['Accept-Language'] = ac_lang
 
-    get_sc_url = base_url + '/?sc=%s' % (sc_code)
-    logger.debug("query new sc time-stamp ... %s", get_sc_url)
-    logger.debug("headers: %s", headers)
+    get_sc_url = base_url + '/'
+    logger.debug("get_sc_code: querying new sc timestamp @ %s", get_sc_url)
+    logger.debug("get_sc_code: request headers: %s", headers)
     resp = get(get_sc_url, headers=headers)
 
     # ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers)
@@ -237,7 +233,7 @@ def get_sc_code(searxng_locale, params):
     except IndexError as exc:
         logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
         raise SearxEngineCaptchaException(
-            message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url,  # type: ignore
+            message="get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url,  # type: ignore
         ) from exc
 
     sc_code = str(sc_code)
@@ -249,8 +245,8 @@ def get_sc_code(searxng_locale, params):
 def request(query, params):
     """Assemble a Startpage request.
 
-    To avoid CAPTCHA we need to send a well formed HTTP POST request with a
-    cookie.  We need to form a request that is identical to the request build by
+    To avoid CAPTCHAs we need to send a well formed HTTP POST request with a
+    cookie. We need to form a request that is identical to the request built by
     Startpage's search form:
 
     - in the cookie the **region** is selected
@@ -262,24 +258,30 @@ def request(query, params):
     engine_region = traits.get_region(params['searxng_locale'], 'en-US')
     engine_language = traits.get_language(params['searxng_locale'], 'en')
 
-    # build arguments
+    params['headers']['Origin'] = base_url
+    params['headers']['Referer'] = base_url + '/'
+
+    # Build form data
     args = {
         'query': query,
         'cat': startpage_categ,
         't': 'device',
-        'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers,
+        'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers
         'with_date': time_range_dict.get(params['time_range'], ''),
+        'abp': '1',
+        'abd': '1',
+        'abe': '1',
     }
 
     if engine_language:
         args['language'] = engine_language
         args['lui'] = engine_language
 
-    args['abp'] = '1'
     if params['pageno'] > 1:
         args['page'] = params['pageno']
+        args['segment'] = 'startpage.udog'
 
-    # build cookie
+    # Build cookie
     lang_homepage = 'en'
     cookie = OrderedDict()
     cookie['date_time'] = 'world'
@@ -304,15 +306,10 @@ def request(query, params):
     params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
     logger.debug('cookie preferences: %s', params['cookies']['preferences'])
 
-    # POST request
     logger.debug("data: %s", args)
     params['data'] = args
     params['method'] = 'POST'
     params['url'] = search_url
-    params['headers']['Origin'] = base_url
-    params['headers']['Referer'] = base_url + '/'
-    # is the Accept header needed?
-    # params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
 
     return params
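
The sc handling in get_sc_code above is a plain scrape-and-cache pattern. A
self-contained sketch, assuming a simple dict-based TTL cache and an
illustrative XPath (the engine's actual code uses its own CACHE helper,
sc_code_cache_sec setting, and search_form_xpath):

import time

import httpx
from lxml import html

SC_CACHE = {'value': None, 'expires': 0.0}
SC_CODE_CACHE_SEC = 30 * 60  # illustrative TTL, not the engine's setting

def get_sc_code(headers):
    # Serve the cached sc code while it is still fresh.
    if SC_CACHE['value'] and time.time() < SC_CACHE['expires']:
        return SC_CACHE['value']
    # Scrape a new sc code from the home page; per this commit the request
    # goes to base_url + '/' without a ?sc= argument.
    resp = httpx.get('https://www.startpage.com/', headers=headers)
    dom = html.fromstring(resp.text)
    sc = dom.xpath('//input[@name="sc"]/@value')  # illustrative XPath
    if not sc:
        # An empty result usually means Startpage served a CAPTCHA page.
        raise RuntimeError('get_sc_code: no sc code found in %s' % resp.url)
    SC_CACHE['value'] = sc[0]
    SC_CACHE['expires'] = time.time() + SC_CODE_CACHE_SEC
    return SC_CACHE['value']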
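
The preferences cookie assembled in request() joins key/value pairs with the
literal markers EEE and N1N; a quick illustration of the resulting string
(the second cookie entry is illustrative):

from collections import OrderedDict

cookie = OrderedDict()
cookie['date_time'] = 'world'
cookie['disable_family_filter'] = '1'  # illustrative entry

preferences = 'N1N'.join('%sEEE%s' % kv for kv in cookie.items())
# -> 'date_timeEEEworldN1Ndisable_family_filterEEE1'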