Browse Source

[mod] brave engines: add fetch_traits() / improve language support

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 1 year ago
parent
commit
b8352eca0c
2 changed files with 431 additions and 15 deletions
  1. 280 0
      searx/data/engine_traits.json
  2. 151 15
      searx/engines/brave.py

+ 280 - 0
searx/data/engine_traits.json

@@ -541,6 +541,286 @@
       "zh-TW": "zh-TW"
       "zh-TW": "zh-TW"
     }
     }
   },
   },
+  "brave": {
+    "all_locale": "all",
+    "custom": {
+      "ui_lang": {
+        "ca": "ca",
+        "de-DE": "de-de",
+        "en-CA": "en-ca",
+        "en-GB": "en-gb",
+        "en-US": "en-us",
+        "es": "es",
+        "fr-CA": "fr-ca",
+        "fr-FR": "fr-fr",
+        "ja-JP": "ja-jp",
+        "pt-BR": "pt-br",
+        "sq-AL": "sq-al"
+      }
+    },
+    "data_type": "traits_v1",
+    "languages": {},
+    "regions": {
+      "ar-SA": "sa",
+      "da-DK": "dk",
+      "de-AT": "at",
+      "de-BE": "be",
+      "de-CH": "ch",
+      "de-DE": "de",
+      "en-AU": "au",
+      "en-CA": "ca",
+      "en-GB": "gb",
+      "en-HK": "hk",
+      "en-IN": "in",
+      "en-NZ": "nz",
+      "en-PH": "ph",
+      "en-US": "us",
+      "en-ZA": "za",
+      "es-AR": "ar",
+      "es-CL": "cl",
+      "es-ES": "es",
+      "es-MX": "mx",
+      "fi-FI": "fi",
+      "fil-PH": "ph",
+      "fr-BE": "be",
+      "fr-CA": "ca",
+      "fr-CH": "ch",
+      "fr-FR": "fr",
+      "gsw-CH": "ch",
+      "hi-IN": "in",
+      "id-ID": "id",
+      "it-CH": "ch",
+      "it-IT": "it",
+      "ja-JP": "jp",
+      "ko-KR": "kr",
+      "mi-NZ": "nz",
+      "ms-MY": "my",
+      "nb-NO": "no",
+      "nl-BE": "be",
+      "nl-NL": "nl",
+      "nn-NO": "no",
+      "pl-PL": "pl",
+      "pt-BR": "br",
+      "pt-PT": "pt",
+      "ru-RU": "ru",
+      "sv-FI": "fi",
+      "sv-SE": "se",
+      "tr-TR": "tr",
+      "zh-CN": "cn",
+      "zh-HK": "hk",
+      "zh-TW": "tw"
+    }
+  },
+  "brave.images": {
+    "all_locale": "all",
+    "custom": {
+      "ui_lang": {
+        "ca": "ca",
+        "de-DE": "de-de",
+        "en-CA": "en-ca",
+        "en-GB": "en-gb",
+        "en-US": "en-us",
+        "es": "es",
+        "fr-CA": "fr-ca",
+        "fr-FR": "fr-fr",
+        "ja-JP": "ja-jp",
+        "pt-BR": "pt-br",
+        "sq-AL": "sq-al"
+      }
+    },
+    "data_type": "traits_v1",
+    "languages": {},
+    "regions": {
+      "ar-SA": "sa",
+      "da-DK": "dk",
+      "de-AT": "at",
+      "de-BE": "be",
+      "de-CH": "ch",
+      "de-DE": "de",
+      "en-AU": "au",
+      "en-CA": "ca",
+      "en-GB": "gb",
+      "en-HK": "hk",
+      "en-IN": "in",
+      "en-NZ": "nz",
+      "en-PH": "ph",
+      "en-US": "us",
+      "en-ZA": "za",
+      "es-AR": "ar",
+      "es-CL": "cl",
+      "es-ES": "es",
+      "es-MX": "mx",
+      "fi-FI": "fi",
+      "fil-PH": "ph",
+      "fr-BE": "be",
+      "fr-CA": "ca",
+      "fr-CH": "ch",
+      "fr-FR": "fr",
+      "gsw-CH": "ch",
+      "hi-IN": "in",
+      "id-ID": "id",
+      "it-CH": "ch",
+      "it-IT": "it",
+      "ja-JP": "jp",
+      "ko-KR": "kr",
+      "mi-NZ": "nz",
+      "ms-MY": "my",
+      "nb-NO": "no",
+      "nl-BE": "be",
+      "nl-NL": "nl",
+      "nn-NO": "no",
+      "pl-PL": "pl",
+      "pt-BR": "br",
+      "pt-PT": "pt",
+      "ru-RU": "ru",
+      "sv-FI": "fi",
+      "sv-SE": "se",
+      "tr-TR": "tr",
+      "zh-CN": "cn",
+      "zh-HK": "hk",
+      "zh-TW": "tw"
+    }
+  },
+  "brave.news": {
+    "all_locale": "all",
+    "custom": {
+      "ui_lang": {
+        "ca": "ca",
+        "de-DE": "de-de",
+        "en-CA": "en-ca",
+        "en-GB": "en-gb",
+        "en-US": "en-us",
+        "es": "es",
+        "fr-CA": "fr-ca",
+        "fr-FR": "fr-fr",
+        "ja-JP": "ja-jp",
+        "pt-BR": "pt-br",
+        "sq-AL": "sq-al"
+      }
+    },
+    "data_type": "traits_v1",
+    "languages": {},
+    "regions": {
+      "ar-SA": "sa",
+      "da-DK": "dk",
+      "de-AT": "at",
+      "de-BE": "be",
+      "de-CH": "ch",
+      "de-DE": "de",
+      "en-AU": "au",
+      "en-CA": "ca",
+      "en-GB": "gb",
+      "en-HK": "hk",
+      "en-IN": "in",
+      "en-NZ": "nz",
+      "en-PH": "ph",
+      "en-US": "us",
+      "en-ZA": "za",
+      "es-AR": "ar",
+      "es-CL": "cl",
+      "es-ES": "es",
+      "es-MX": "mx",
+      "fi-FI": "fi",
+      "fil-PH": "ph",
+      "fr-BE": "be",
+      "fr-CA": "ca",
+      "fr-CH": "ch",
+      "fr-FR": "fr",
+      "gsw-CH": "ch",
+      "hi-IN": "in",
+      "id-ID": "id",
+      "it-CH": "ch",
+      "it-IT": "it",
+      "ja-JP": "jp",
+      "ko-KR": "kr",
+      "mi-NZ": "nz",
+      "ms-MY": "my",
+      "nb-NO": "no",
+      "nl-BE": "be",
+      "nl-NL": "nl",
+      "nn-NO": "no",
+      "pl-PL": "pl",
+      "pt-BR": "br",
+      "pt-PT": "pt",
+      "ru-RU": "ru",
+      "sv-FI": "fi",
+      "sv-SE": "se",
+      "tr-TR": "tr",
+      "zh-CN": "cn",
+      "zh-HK": "hk",
+      "zh-TW": "tw"
+    }
+  },
+  "brave.videos": {
+    "all_locale": "all",
+    "custom": {
+      "ui_lang": {
+        "ca": "ca",
+        "de-DE": "de-de",
+        "en-CA": "en-ca",
+        "en-GB": "en-gb",
+        "en-US": "en-us",
+        "es": "es",
+        "fr-CA": "fr-ca",
+        "fr-FR": "fr-fr",
+        "ja-JP": "ja-jp",
+        "pt-BR": "pt-br",
+        "sq-AL": "sq-al"
+      }
+    },
+    "data_type": "traits_v1",
+    "languages": {},
+    "regions": {
+      "ar-SA": "sa",
+      "da-DK": "dk",
+      "de-AT": "at",
+      "de-BE": "be",
+      "de-CH": "ch",
+      "de-DE": "de",
+      "en-AU": "au",
+      "en-CA": "ca",
+      "en-GB": "gb",
+      "en-HK": "hk",
+      "en-IN": "in",
+      "en-NZ": "nz",
+      "en-PH": "ph",
+      "en-US": "us",
+      "en-ZA": "za",
+      "es-AR": "ar",
+      "es-CL": "cl",
+      "es-ES": "es",
+      "es-MX": "mx",
+      "fi-FI": "fi",
+      "fil-PH": "ph",
+      "fr-BE": "be",
+      "fr-CA": "ca",
+      "fr-CH": "ch",
+      "fr-FR": "fr",
+      "gsw-CH": "ch",
+      "hi-IN": "in",
+      "id-ID": "id",
+      "it-CH": "ch",
+      "it-IT": "it",
+      "ja-JP": "jp",
+      "ko-KR": "kr",
+      "mi-NZ": "nz",
+      "ms-MY": "my",
+      "nb-NO": "no",
+      "nl-BE": "be",
+      "nl-NL": "nl",
+      "nn-NO": "no",
+      "pl-PL": "pl",
+      "pt-BR": "br",
+      "pt-PT": "pt",
+      "ru-RU": "ru",
+      "sv-FI": "fi",
+      "sv-SE": "se",
+      "tr-TR": "tr",
+      "zh-CN": "cn",
+      "zh-HK": "hk",
+      "zh-TW": "tw"
+    }
+  },
   "dailymotion": {
   "dailymotion": {
     "all_locale": null,
     "all_locale": null,
     "custom": {},
     "custom": {},

+ 151 - 15
searx/engines/brave.py

@@ -31,12 +31,73 @@ Configured ``brave`` engines:
     brave_category: news
     brave_category: news
 
 
 
 
+.. _brave regions:
+
+Brave regions
+=============
+
+Brave uses two-digit tags for the regions like ``ca`` while SearXNG deals with
+locales.  To get a mapping, all *officatl de-facto* languages of the Brave
+region are mapped to regions in SearXNG (see :py:obj:`babel
+<babel.languages.get_official_languages>`):
+
+.. code:: python
+
+    "regions": {
+      ..
+      "en-CA": "ca",
+      "fr-CA": "ca",
+      ..
+     }
+
+
+.. note::
+
+   The language (aka region) support of Brave's index is limited to very basic
+   languages.  The search results for languages like Chinese or Arabic are of
+   low quality.
+
+
+.. _brave languages:
+
+Brave languages
+===============
+
+Brave's language support is limited to the UI (menues, area local notations,
+etc).  Brave's index only seems to support a locale, but it does not seem to
+support any languages in its index.  The choice of available languages is very
+small (and its not clear to me where the differencee in UI is when switching
+from en-us to en-ca or en-gb).
+
+In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
+UI languages are stored in a custom field named ``ui_lang``:
+
+.. code:: python
+
+    "custom": {
+      "ui_lang": {
+        "ca": "ca",
+        "de-DE": "de-de",
+        "en-CA": "en-ca",
+        "en-GB": "en-gb",
+        "en-US": "en-us",
+        "es": "es",
+        "fr-CA": "fr-ca",
+        "fr-FR": "fr-fr",
+        "ja-JP": "ja-jp",
+        "pt-BR": "pt-br",
+        "sq-AL": "sq-al"
+      }
+    },
+
 Implementations
 Implementations
 ===============
 ===============
 
 
 """
 """
-# pylint: disable=fixme
 
 
+from typing import TYPE_CHECKING
+
+import re
 from urllib.parse import (
 from urllib.parse import (
     urlencode,
     urlencode,
     urlparse,
     urlparse,
@@ -46,11 +107,20 @@ from urllib.parse import (
 import chompjs
 import chompjs
 from lxml import html
 from lxml import html
 
 
+from searx import locales
 from searx.utils import (
 from searx.utils import (
     extract_text,
     extract_text,
     eval_xpath_list,
     eval_xpath_list,
     eval_xpath_getindex,
     eval_xpath_getindex,
 )
 )
+from searx.enginelib.traits import EngineTraits
+
+if TYPE_CHECKING:
+    import logging
+
+    logger: logging.Logger
+
+traits: EngineTraits
 
 
 about = {
 about = {
     "website": 'https://search.brave.com/',
     "website": 'https://search.brave.com/',
@@ -118,23 +188,20 @@ def request(query, params):
 
 
     params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"
     params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"
 
 
-    # set preferences in cookie
-    params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
-
-    # ToDo: we need a fetch_traits(..) implementation / the ui_lang of Brave are
-    #       limited and the country handling has it quirks
+    # set properties in the cookies
 
 
-    eng_locale = params.get('searxng_locale')
-    params['cookies']['useLocation'] = '0'  # the useLocation is IP based, we use 'country'
+    params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
+    # the useLocation is IP based, we use cookie 'country' for the region
+    params['cookies']['useLocation'] = '0'
     params['cookies']['summarizer'] = '0'
     params['cookies']['summarizer'] = '0'
 
 
-    if not eng_locale or eng_locale == 'all':
-        params['cookies']['country'] = 'all'  # country=all
-    else:
-        params['cookies']['country'] = eng_locale.split('-')[-1].lower()
-        params['cookies']['ui_lang'] = eng_locale.split('-')[0].lower()
+    engine_region = traits.get_region(params['searxng_locale'], 'all')
+    params['cookies']['country'] = engine_region.split('-')[-1].lower()  # type: ignore
 
 
-    # logger.debug("cookies %s", params['cookies'])
+    ui_lang = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us')
+    params['cookies']['ui_lang'] = ui_lang
+
+    logger.debug("cookies %s", params['cookies'])
 
 
 
 
 def response(resp):
 def response(resp):
@@ -195,7 +262,7 @@ def _parse_search(resp):
         video_tag = eval_xpath_getindex(
         video_tag = eval_xpath_getindex(
             result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
             result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
         )
         )
-        if video_tag:
+        if video_tag is not None:
 
 
             # In my tests a video tag in the WEB search was mostoften not a
             # In my tests a video tag in the WEB search was mostoften not a
             # video, except the ones from youtube ..
             # video, except the ones from youtube ..
@@ -281,3 +348,72 @@ def _parse_videos(json_resp):
         result_list.append(item)
         result_list.append(item)
 
 
     return result_list
     return result_list
+
+
+def fetch_traits(engine_traits: EngineTraits):
+    """Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
+    regions>` from Brave."""
+
+    # pylint: disable=import-outside-toplevel
+
+    import babel.languages
+    from searx.locales import region_tag, language_tag
+    from searx.network import get  # see https://github.com/searxng/searxng/issues/762
+
+    engine_traits.custom["ui_lang"] = {}
+
+    headers = {
+        'Accept-Encoding': 'gzip, deflate',
+    }
+    lang_map = {'no': 'nb'}  # norway
+
+    # languages (UI)
+
+    resp = get('https://search.brave.com/settings', headers=headers)
+
+    if not resp.ok:  # type: ignore
+        print("ERROR: response from Brave is not OK.")
+    dom = html.fromstring(resp.text)  # type: ignore
+
+    for option in dom.xpath('//div[@id="language-select"]//option'):
+
+        ui_lang = option.get('value')
+        try:
+            if '-' in ui_lang:
+                sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-'))
+            else:
+                sxng_tag = language_tag(babel.Locale.parse(ui_lang))
+
+        except babel.UnknownLocaleError:
+            print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang)
+            continue
+
+        conflict = engine_traits.custom["ui_lang"].get(sxng_tag)
+        if conflict:
+            if conflict != ui_lang:
+                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang))
+            continue
+        engine_traits.custom["ui_lang"][sxng_tag] = ui_lang
+
+    # search regions of brave
+
+    engine_traits.all_locale = 'all'
+
+    for country in dom.xpath('//div[@id="sidebar"]//ul/li/div[contains(@class, "country")]'):
+
+        flag = country.xpath('./span[contains(@class, "flag")]')[0]
+        # country_name = extract_text(flag.xpath('./following-sibling::*')[0])
+        country_tag = re.search(r'flag-([^\s]*)\s', flag.xpath('./@class')[0]).group(1)  # type: ignore
+
+        # add offical languages of the country ..
+        for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):
+            lang_tag = lang_map.get(lang_tag, lang_tag)
+            sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper())))
+            # print("%-20s: %s <-- %s" % (country_name, country_tag, sxng_tag))
+
+            conflict = engine_traits.regions.get(sxng_tag)
+            if conflict:
+                if conflict != country_tag:
+                    print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, country_tag))
+                    continue
+            engine_traits.regions[sxng_tag] = country_tag