Browse Source

[mod] presearch: add language & region support

In Presearch there are languages for the UI and regions for narrowing down the
search.  With this change the SearXNG engine supports a search by region.  The
details can be found in the documentation of the source code.

To test, you can search terms like::

   !presearch bmw :zh-TW
   !presearch bmw :en-CA

1. You should get results corresponding to the region (Taiwan, Canada)
2. and in the language (Chinese, English).
3. The content of the info box is in the same language.

Exceptions:

1. Region or language is not supported by Presearch or
2. SearXNG user did not select a region tag, example::

    !presearch bmw :en

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 1 year ago
parent
commit
e560d7e373
2 changed files with 113 additions and 17 deletions
  1. 13 0
      docs/dev/engines/online/presearch.rst
  2. 100 17
      searx/engines/presearch.py

+ 13 - 0
docs/dev/engines/online/presearch.rst

@@ -0,0 +1,13 @@
+.. _engine presearch:
+
+================
+Presearch Engine
+================
+
+.. contents::
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+.. automodule:: searx.engines.presearch
+   :members:

+ 100 - 17
searx/engines/presearch.py

@@ -1,23 +1,72 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
 # lint: pylint
-"""Presearch (general, images, videos, news)
+"""Presearch supports the search types listed in :py:obj:`search_type` (general,
+images, videos, news).
 
 
-.. hint::
+Configured ``presearch`` engines:
+
+.. code:: yaml
+
+  - name: presearch
+    engine: presearch
+    search_type: search
+    categories: [general, web]
+
+  - name: presearch images
+    ...
+    search_type: images
+    categories: [images, web]
 
 
-   The results in the video category are most often links to pages that contain
-   a video, for instance many links from preasearch's video category link
-   content from facebook (aka Meta) or Twitter (aka X).  Since these are not
-   real links to video streams SearXNG can't use the video template for this and
-   if SearXNG can't use this template, then the user doesn't want to see these
-   hits in the videos category.
+  - name: presearch videos
+    ...
+    search_type: videos
+    categories: [general, web]
+
+  - name: presearch news
+    ...
+    search_type: news
+    categories: [news, web]
+
+.. hint::
 
 
-   TL;DR; by default presearch's video category is placed into categories::
+   By default Presearch's video category is intentionally placed into::
 
 
        categories: [general, web]
        categories: [general, web]
 
 
+
+Search type ``video``
+=====================
+
+The results in the video category are most often links to pages that contain a
+video, for instance many links from Presearch's video category link content
+from facebook (aka Meta) or Twitter (aka X).  Since these are not real links to
+video streams SearXNG can't use the video template for this and if SearXNG can't
+use this template, then the user doesn't want to see these hits in the videos
+category.
+
+
+Languages & Regions
+===================
+
+In Presearch there are languages for the UI and regions for narrowing down the
+search.  If we set "auto" for the region in the WEB-UI of Presearch and cookie
+``use_local_search_results=false``, then the defaults are set for both (the
+language and the region) from the ``Accept-Language`` header.
+
+Since the region is already "auto" by default, we only need to set the
+``use_local_search_results`` cookie and send the ``Accept-Language`` header.  We
+have to set these values in both requests we send to Presearch; in the first
+request to get the request-ID from Presearch and in the final request to get the
+result list (see ``send_accept_language_header``).
+
+
+Implementations
+===============
+
 """
 """
 
 
 from urllib.parse import urlencode
 from urllib.parse import urlencode
+from searx import locales
 from searx.network import get
 from searx.network import get
 from searx.utils import gen_useragent, html_to_text
 from searx.utils import gen_useragent, html_to_text
 
 
@@ -32,6 +81,7 @@ about = {
 paging = True
 paging = True
 safesearch = True
 safesearch = True
 time_range_support = True
 time_range_support = True
+send_accept_language_header = True
 categories = ["general", "web"]  # general, images, videos, news
 categories = ["general", "web"]  # general, images, videos, news
 
 
 search_type = "search"
 search_type = "search"
@@ -46,19 +96,43 @@ def init(_):
         raise ValueError(f'presearch search_type: {search_type}')
         raise ValueError(f'presearch search_type: {search_type}')
 
 
 
 
-def _get_request_id(query, page, time_range, safesearch_param):
+def _get_request_id(query, params):
+
     args = {
     args = {
         "q": query,
         "q": query,
-        "page": page,
+        "page": params["pageno"],
     }
     }
-    if time_range:
-        args["time"] = time_range
+
+    if params["time_range"]:
+        args["time"] = params["time_range"]
 
 
     url = f"{base_url}/{search_type}?{urlencode(args)}"
     url = f"{base_url}/{search_type}?{urlencode(args)}"
+
     headers = {
     headers = {
         'User-Agent': gen_useragent(),
         'User-Agent': gen_useragent(),
-        'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch_param]}",
+        'Cookie': (
+            f"b=1;"
+            f" presearch_session=;"
+            f" use_local_search_results=false;"
+            f" use_safe_search={safesearch_map[params['safesearch']]}"
+        ),
     }
     }
+    if params['searxng_locale'] != 'all':
+        l = locales.get_locale(params['searxng_locale'])
+
+        # Presearch narrows down the search by region.  In SearXNG when the user
+        # does not set a region (e.g. 'en-CA' / canada) we cannot hand over a
+        # region.
+
+        # We could possibly use searx.locales.get_official_locales to determine
+        # in which regions this language is an official one, but then we still
+        # wouldn't know which region should be given more weight / Presearch
+        # performs an IP-based geolocation of the user, we don't want that in
+        # SearXNG ;-)
+
+        if l.territory:
+            headers['Accept-Language'] = f"{l.language}-{l.territory},{l.language};" "q=0.9,*;" "q=0.5"
+
     resp_text = get(url, headers=headers).text  # type: ignore
     resp_text = get(url, headers=headers).text  # type: ignore
 
 
     for line in resp_text.split("\n"):
     for line in resp_text.split("\n"):
@@ -69,8 +143,7 @@ def _get_request_id(query, page, time_range, safesearch_param):
 
 
 
 
 def request(query, params):
 def request(query, params):
-    request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
-
+    request_id = _get_request_id(query, params)
     params["headers"]["Accept"] = "application/json"
     params["headers"]["Accept"] = "application/json"
     params["url"] = f"{base_url}/results?id={request_id}"
     params["url"] = f"{base_url}/results?id={request_id}"
 
 
@@ -109,7 +182,17 @@ def parse_search_query(json_results):
     if info:
     if info:
         attributes = []
         attributes = []
         for item in info.get('about', []):
         for item in info.get('about', []):
-            label, value = html_to_text(item).split(':', 1)
+
+            text = html_to_text(item)
+            if ':' in text:
+                # split text into key / value
+                label, value = text.split(':', 1)
+            else:
+                # In other languages (tested with zh-TW) a colon is represented
+                # by a different symbol --> then we split at the first space.
+                label, value = text.split(' ', 1)
+                label = label[:-1]
+
             value = _strip_leading_strings(value)
             value = _strip_leading_strings(value)
             attributes.append({'label': label, 'value': value})
             attributes.append({'label': label, 'value': value})
         content = []
         content = []