fetch_languages.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
# -*- coding: utf-8 -*-
# This script generates languages.py by intersecting each engine's supported languages.
#
# Output files (engines_languages.json and languages.py)
# are written in the current directory so that nothing is overwritten in case something goes wrong.
  6. import json
  7. from pprint import pformat
  8. from sys import path
  9. from babel import Locale, UnknownLocaleError
  10. from babel.languages import get_global
  11. path.append('../searx') # noqa
  12. from searx import settings
  13. from searx.engines import initialize_engines, engines
  14. # Output files.
  15. engines_languages_file = 'engines_languages.json'
  16. languages_file = 'languages.py'
  17. # Fetchs supported languages for each engine and writes json file with those.
  18. def fetch_supported_languages():
  19. engines_languages = dict()
  20. names = list(engines)
  21. names.sort()
  22. for engine_name in names:
  23. if hasattr(engines[engine_name], 'fetch_supported_languages'):
  24. engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
  25. print("fetched %s languages from engine %s" % (
  26. len(engines_languages[engine_name]), engine_name))
  27. if type(engines_languages[engine_name]) == list:
  28. engines_languages[engine_name] = sorted(engines_languages[engine_name])
  29. # write json file
  30. with open(engines_languages_file, 'w', encoding='utf-8') as f:
  31. json.dump(engines_languages, f, indent=2, sort_keys=True)
  32. return engines_languages
  33. # Get babel Locale object from lang_code if possible.
  34. def get_locale(lang_code):
  35. try:
  36. locale = Locale.parse(lang_code, sep='-')
  37. return locale
  38. except (UnknownLocaleError, ValueError):
  39. return None
  40. # Join all language lists.
  41. def join_language_lists(engines_languages):
  42. language_list = dict()
  43. for engine_name in engines_languages:
  44. for lang_code in engines_languages[engine_name]:
  45. # apply custom fixes if necessary
  46. if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values():
  47. lang_code = next(lc for lc, alias in engines[engine_name].language_aliases.items()
  48. if lang_code == alias)
  49. locale = get_locale(lang_code)
  50. # ensure that lang_code uses standard language and country codes
  51. if locale and locale.territory:
  52. lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory)
  53. short_code = lang_code.split('-')[0]
  54. # add language without country if not in list
  55. if short_code not in language_list:
  56. if locale:
  57. # get language's data from babel's Locale object
  58. language_name = locale.get_language_name().title()
  59. english_name = locale.english_name.split(' (')[0]
  60. elif short_code in engines_languages['wikipedia']:
  61. # get language's data from wikipedia if not known by babel
  62. language_name = engines_languages['wikipedia'][short_code]['name']
  63. english_name = engines_languages['wikipedia'][short_code]['english_name']
  64. else:
  65. language_name = None
  66. english_name = None
  67. # add language to list
  68. language_list[short_code] = {'name': language_name,
  69. 'english_name': english_name,
  70. 'counter': set(),
  71. 'countries': dict()}
  72. # add language with country if not in list
  73. if lang_code != short_code and lang_code not in language_list[short_code]['countries']:
  74. country_name = ''
  75. if locale:
  76. # get country name from babel's Locale object
  77. country_name = locale.get_territory_name()
  78. language_list[short_code]['countries'][lang_code] = {'country_name': country_name,
  79. 'counter': set()}
  80. # count engine for both language_country combination and language alone
  81. language_list[short_code]['counter'].add(engine_name)
  82. if lang_code != short_code:
  83. language_list[short_code]['countries'][lang_code]['counter'].add(engine_name)
  84. return language_list
  85. # Filter language list so it only includes the most supported languages and countries
  86. def filter_language_list(all_languages):
  87. min_engines_per_lang = 15
  88. min_engines_per_country = 10
  89. main_engines = [engine_name for engine_name in engines.keys()
  90. if 'general' in engines[engine_name].categories and
  91. engines[engine_name].supported_languages and
  92. not engines[engine_name].disabled]
  93. # filter list to include only languages supported by most engines or all default general engines
  94. filtered_languages = {code: lang for code, lang
  95. in all_languages.items()
  96. if (len(lang['counter']) >= min_engines_per_lang or
  97. all(main_engine in lang['counter']
  98. for main_engine in main_engines))}
  99. def _copy_lang_data(lang, country_name=None):
  100. new_dict = dict()
  101. new_dict['name'] = all_languages[lang]['name']
  102. new_dict['english_name'] = all_languages[lang]['english_name']
  103. if country_name:
  104. new_dict['country_name'] = country_name
  105. return new_dict
  106. def _country_count(i):
  107. return len(countries[sorted_countries[i]]['counter'])
  108. # for each language get country codes supported by most engines or at least one country code
  109. filtered_languages_with_countries = dict()
  110. for lang, lang_data in filtered_languages.items():
  111. countries = lang_data['countries']
  112. filtered_countries = dict()
  113. # get language's country codes with enough supported engines
  114. for lang_country, country_data in countries.items():
  115. if len(country_data['counter']) >= min_engines_per_country:
  116. filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name'])
  117. # add language without countries too if there's more than one country to choose from
  118. if len(filtered_countries) > 1:
  119. filtered_countries[lang] = _copy_lang_data(lang)
  120. elif len(filtered_countries) == 1:
  121. # if there's only one country per language, it's not necessary to show country name
  122. lang_country = next(iter(filtered_countries))
  123. filtered_countries[lang_country]['country_name'] = None
  124. # if no country has enough engines try to get most likely country code from babel
  125. if not filtered_countries:
  126. lang_country = None
  127. subtags = get_global('likely_subtags').get(lang)
  128. if subtags:
  129. country_code = subtags.split('_')[-1]
  130. if len(country_code) == 2:
  131. lang_country = "{lang}-{country}".format(lang=lang, country=country_code)
  132. if lang_country:
  133. filtered_countries[lang_country] = _copy_lang_data(lang)
  134. else:
  135. filtered_countries[lang] = _copy_lang_data(lang)
  136. filtered_languages_with_countries.update(filtered_countries)
  137. return filtered_languages_with_countries
  138. # Write languages.py.
  139. def write_languages_file(languages):
  140. file_headers = (
  141. "# -*- coding: utf-8 -*-",
  142. "# list of language codes",
  143. "# this file is generated automatically by utils/fetch_languages.py",
  144. "language_codes ="
  145. )
  146. language_codes = tuple([
  147. (
  148. code,
  149. languages[code]['name'].split(' (')[0],
  150. languages[code].get('country_name') or '',
  151. languages[code].get('english_name') or ''
  152. ) for code in sorted(languages)
  153. ])
  154. with open(languages_file, 'w') as new_file:
  155. file_content = "{file_headers} \\\n{language_codes}".format(
  156. file_headers='\n'.join(file_headers),
  157. language_codes=pformat(language_codes, indent=4)
  158. )
  159. new_file.write(file_content)
  160. new_file.close()
  161. if __name__ == "__main__":
  162. initialize_engines(settings['engines'])
  163. engines_languages = fetch_supported_languages()
  164. all_languages = join_language_lists(engines_languages)
  165. filtered_languages = filter_language_list(all_languages)
  166. write_languages_file(filtered_languages)