#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/sxng_locales.py`

:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`:
  Persistence of engines traits, fetched from the engines.

:origin:`searx/sxng_locales.py`
  Is generated from the traits the engines support: only language and region
  tags that are supported by a minimum number of engines are kept.

The script :origin:`searxng_extra/update/update_engine_traits.py` is called in
the :origin:`CI Update data ... <.github/workflows/data-update.yml>`

"""

# pylint: disable=invalid-name
from collections import Counter
from pathlib import Path
from pprint import pformat
from typing import Optional, Set
from unicodedata import lookup

import babel

from searx import settings, searx_dir
from searx import network
from searx.engines import load_engines
from searx.enginelib.traits import EngineTraitsMap

# Output files.
languages_file = Path(searx_dir) / 'sxng_locales.py'

# Text written in front of the generated tuple items.
languages_file_header = """\
# SPDX-License-Identifier: AGPL-3.0-or-later
'''List of SearXNG's locale codes used for the search language/region.

.. hint::

   Don't modify this file, this file is generated by::

     ./manage data.traits
'''

sxng_locales = (
"""

# Text written after the generated tuple items: closes the tuple and documents
# it.  The string is *not* raw, the ``\\U...`` escapes below end up as real
# emoji characters in the generated file.
languages_file_footer = """,
)
'''
A list of five-digit tuples:

0. SearXNG's internal locale tag (a language or region tag)
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
   Empty string for language tags.
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
   are represented by a globe (\U0001F310)

.. code:: python

   ('en',    'English', '',              'English', '\U0001f310'),
   ('en-CA', 'English', 'Canada',        'English', '\U0001f1e8\U0001f1e6'),
   ('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
   ..
   ('fr',    'Français', '',             'French',  '\U0001f310'),
   ('fr-BE', 'Français', 'Belgique',     'French',  '\U0001f1e7\U0001f1ea'),
   ('fr-CA', 'Français', 'Canada',       'French',  '\U0001f1e8\U0001f1e6'),

:meta hide-value:
'''
"""

# Flags that are preferred over the globe / the generic regional indicator.
# The table is consulted twice by :py:obj:`get_unicode_flag`: first with the
# language code of the locale, then with the lower-cased territory code.
lang2emoji = {
    'ha': '\U0001F1F3\U0001F1EA',  # Hausa / Niger
    'bs': '\U0001F1E7\U0001F1E6',  # Bosnian / Bosnia & Herzegovina
    'jp': '\U0001F1EF\U0001F1F5',  # Japanese
    'ua': '\U0001F1FA\U0001F1E6',  # Ukrainian
    'he': '\U0001F1EE\U0001F1F1',  # Hebrew
}


def main():
    """Fetch the traits of all configured engines and regenerate the locale file."""
    load_engines(settings['engines'])
    # NOTE: to skip the network round trip while developing, the persisted
    # traits can be used instead:  traits_map = EngineTraitsMap.from_data()
    traits_map = fetch_traits_map()
    sxng_tag_list = filter_locales(traits_map)
    write_languages_file(sxng_tag_list)


def fetch_traits_map():
    """Fetches supported languages for each engine and writes json file with those.

    :return: the fetched :py:obj:`EngineTraitsMap` (already persisted by
      ``save_data()``).
    """
    network.set_timeout_for_thread(10.0)

    traits_map = EngineTraitsMap.fetch_traits(log=print)
    print(f"fetched properties from {len(traits_map)} engines")
    print(f"write json file: {traits_map.ENGINE_TRAITS_FILE}")
    traits_map.save_data()
    return traits_map


def filter_locales(
    traits_map: EngineTraitsMap, min_eng_per_region: int = 18, min_eng_per_lang: int = 22
) -> Set[str]:
    """Filter language & region tags by a threshold.

    :param traits_map: traits of the engines, each value provides the mappings
      ``regions`` and ``languages`` keyed by SearXNG's tag.
    :param min_eng_per_region: a region tag is kept when at least this many
      engines support it.
    :param min_eng_per_lang: a language tag is kept when at least this many
      engines support it.
    :return: set of the region tags that passed, the language part of these
      region tags and the language tags that passed.
    """
    region_count = Counter()
    lang_count = Counter()

    for eng in traits_map.values():
        # ``.keys()`` is needed here: Counter.update() would interpret a
        # mapping as "element --> count" instead of counting its keys.
        region_count.update(eng.regions.keys())
        # ignore script types like zh_Hant, zh_Hans or sr_Latin, pa_Arab (they
        # already counted by existence of 'zh' or 'sr', 'pa')
        lang_count.update(lang for lang in eng.languages.keys() if '_' not in lang)

    regions = {tag for tag, count in region_count.items() if count >= min_eng_per_region}
    lang_from_region = {tag.split('-')[0] for tag in regions}
    languages = {tag for tag, count in lang_count.items() if count >= min_eng_per_lang}

    return regions | lang_from_region | languages


def _locale_item(sxng_tag):
    """Build the five-item tuple of one locale tag for the generated file."""
    sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-')
    # get_unicode_flag() returns None when no flag can be determined
    flag = get_unicode_flag(sxng_locale) or ''
    english_name = sxng_locale.english_name

    return (
        sxng_tag,
        sxng_locale.get_language_name().title(),  # type: ignore
        sxng_locale.get_territory_name() or '',
        # strip the "(territory)" suffix from names like "English (Canada)"
        english_name.split(' (')[0] if english_name else '',
        UnicodeEscape(flag),
    )


def write_languages_file(sxng_tag_list):
    """Write the python module :py:obj:`languages_file`.

    :param sxng_tag_list: iterable of SearXNG locale tags (``en``, ``en-US``,
      ..), the items are written in sorted order.
    """
    language_codes = tuple(_locale_item(sxng_tag) for sxng_tag in sorted(sxng_tag_list))

    # pformat() renders "(   (...),\n    (...))", the surrounding parentheses
    # are stripped since header and footer already provide them.  The blank
    # between header and items restores the indentation of the first item.
    file_content = "{header} {language_codes}{footer}".format(
        header=languages_file_header,
        language_codes=pformat(language_codes, width=120, indent=4)[1:-1],
        footer=languages_file_footer,
    )

    # the context manager closes the file, no explicit close() needed
    with languages_file.open('w', encoding='utf-8') as new_file:
        new_file.write(file_content)


class UnicodeEscape(str):
    """Escape unicode string in :py:obj:`pprint.pformat`"""

    def __repr__(self):
        # 'unicode-escape' yields pure ASCII and already doubles backslashes;
        # single quotes are escaped too, so that the result is always a valid
        # single quoted python literal.
        escaped = self.encode('unicode-escape').decode('ascii')
        return "'" + escaped.replace("'", "\\'") + "'"


def get_unicode_flag(locale: babel.Locale) -> Optional[str]:
    """Determine a unicode flag (emoji) that fits to the ``locale``

    :return: the flag from :py:obj:`lang2emoji` if there is one, a globe for
      locales without a territory, otherwise the flag built from the regional
      indicator symbols of the territory.  ``None`` if the territory can't be
      mapped to regional indicator symbols (e.g. numeric territory codes).
    """
    emoji = lang2emoji.get(locale.language)
    if emoji:
        return emoji

    if not locale.territory:
        return '\U0001F310'

    # NOTE(review): the table is keyed by language-like codes but is looked up
    # by territory here, a territory 'BS' (Bahamas) would get the Bosnian flag.
    # Kept as is to not change the generated output -- to be confirmed.
    emoji = lang2emoji.get(locale.territory.lower())
    if emoji:
        return emoji

    try:
        c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[0])
        c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[1])
    except KeyError as exc:
        print(f"ERROR: {locale} --> {exc}")
        return None

    return c1 + c2


if __name__ == "__main__":
    main()