Browse Source

[mod] bing: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Bing engines.

.. note::

   Does not include migration of the request method from 'supported_languages'
   to the 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 2 years ago
parent
commit
d3aa690a7a

File diff suppressed because it is too large
+ 1176 - 6
searx/data/engine_traits.json


+ 97 - 0
searx/engines/bing.py

@@ -12,6 +12,10 @@ from lxml import html
 from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language, eval_xpath_getindex
 from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language, eval_xpath_getindex
 from searx.network import multi_requests, Request
 from searx.network import multi_requests, Request
 
 
+from searx.enginelib.traits import EngineTraits
+
+traits: EngineTraits
+
 about = {
 about = {
     "website": 'https://www.bing.com',
     "website": 'https://www.bing.com',
     "wikidata_id": 'Q182496',
     "wikidata_id": 'Q182496',
@@ -181,3 +185,96 @@ def _fetch_supported_languages(resp):
         lang_tags.add(tag)
         lang_tags.add(tag)
 
 
     return list(lang_tags)
     return list(lang_tags)
+
+
+def fetch_traits(engine_traits: EngineTraits):
+    """Fetch languages and regions from bing."""
+
+    # pylint: disable=import-outside-toplevel, disable=too-many-branches,
+    # pylint: disable=too-many-locals, too-many-statements
+
+    engine_traits.data_type = 'supported_languages'  # deprecated
+
+    import babel
+    import babel.languages
+    from searx import network
+    from searx.locales import get_offical_locales, language_tag, region_tag
+    from searx.utils import gen_useragent
+
+    headers = {
+        'User-Agent': gen_useragent(),
+        'Accept-Language': "en-US,en;q=0.5",  # bing needs to set the English language
+    }
+    resp = network.get('https://www.bing.com/account/general', headers=headers)
+
+    if not resp.ok:
+        print("ERROR: response from peertube is not OK.")
+
+    dom = html.fromstring(resp.text)
+
+    # Selector to get items from "Display language"
+
+    lang_map = {
+        'prs': 'fa',  # Persian
+        'pt_BR': 'pt',  # Portuguese (Brasil)
+        'pt_PT': 'pt',  # Portuguese (Portugal)
+        'ca-ES-VALENCIA': 'ca',  # Catalan (Spain, Valencian)
+    }
+
+    unknow_langs = [
+        'quc',  # K'iche'
+        'nso',  # Sesotho sa Leboa
+        'tn',  # Setswana
+    ]
+
+    for div in eval_xpath(dom, '//div[@id="limit-languages"]//input/..'):
+
+        eng_lang = eval_xpath(div, './/input/@value')[0]
+        if eng_lang in unknow_langs:
+            continue
+
+        eng_lang = lang_map.get(eng_lang, eng_lang)
+        label = extract_text(eval_xpath(div, './/label'))
+
+        # The 'language:xx' query string in the request function (above) does
+        # only support the language codes from the "Display languages" list.
+        # Examples of items from the "Display languages" not sopported in the
+        # query string: zh_Hans --> zh / sr_latn --> sr
+        #
+        # eng_lang = eng_lang.split('_')[0]
+
+        try:
+            sxng_tag = language_tag(babel.Locale.parse(eng_lang.replace('-', '_'), sep='_'))
+        except babel.UnknownLocaleError:
+            print("ERROR: %s (%s) is unknown by babel" % (label, eng_lang))
+            continue
+
+        conflict = engine_traits.languages.get(sxng_tag)
+        if conflict:
+            if conflict != eng_lang:
+                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
+            continue
+        engine_traits.languages[sxng_tag] = eng_lang
+
+    engine_traits.languages['zh'] = 'zh_Hans'
+
+    # regiones
+
+    for a in eval_xpath(dom, '//div[@id="region-section-content"]//li/a'):
+        href = eval_xpath(a, './/@href')[0]
+        # lang_name = extract_text(a)
+        query = urlparse(href)[4]
+        query = parse_qs(query, keep_blank_values=True)
+        cc = query.get('cc')[0]  # pylint:disable=invalid-name
+        if cc == 'clear':
+            continue
+
+        # Assert babel supports this locales
+        sxng_locales = get_offical_locales(cc.upper(), engine_traits.languages.keys())
+
+        if not sxng_locales:
+            # print("ERROR: can't map from bing country %s (%s) to a babel region." % (a.text_content().strip(), cc))
+            continue
+
+        for sxng_locale in sxng_locales:
+            engine_traits.regions[region_tag(sxng_locale)] = cc

+ 1 - 0
searx/engines/bing_images.py

@@ -13,6 +13,7 @@ from searx.utils import match_language
 from searx.engines.bing import language_aliases
 from searx.engines.bing import language_aliases
 from searx.engines.bing import (  # pylint: disable=unused-import
 from searx.engines.bing import (  # pylint: disable=unused-import
     _fetch_supported_languages,
     _fetch_supported_languages,
+    fetch_traits,
     supported_languages_url,
     supported_languages_url,
 )
 )
 
 

+ 1 - 0
searx/engines/bing_news.py

@@ -17,6 +17,7 @@ from searx.utils import match_language, eval_xpath_getindex
 from searx.engines.bing import (  # pylint: disable=unused-import
 from searx.engines.bing import (  # pylint: disable=unused-import
     language_aliases,
     language_aliases,
     _fetch_supported_languages,
     _fetch_supported_languages,
+    fetch_traits,
     supported_languages_url,
     supported_languages_url,
 )
 )
 
 

+ 1 - 0
searx/engines/bing_videos.py

@@ -14,6 +14,7 @@ from searx.engines.bing import language_aliases
 
 
 from searx.engines.bing import (  # pylint: disable=unused-import
 from searx.engines.bing import (  # pylint: disable=unused-import
     _fetch_supported_languages,
     _fetch_supported_languages,
+    fetch_traits,
     supported_languages_url,
     supported_languages_url,
 )
 )
 
 

Some files were not shown because too many files changed in this diff