Browse Source

[fix] fix match_language issue to make zh-TW match to zh-Hant-TW

pybabel separates the subtags of a locale code with underscores (zh_Hant_TW),
but we use hyphens (zh-Hant-TW) everywhere babel doesn't directly touch
Marc Abonce Seguin 3 years ago
parent
commit
66b7be0965
3 changed files with 32 additions and 6 deletions
  1. 22 4
      searx/utils.py
  2. 8 0
      tests/unit/test_utils.py
  3. 2 2
      tests/unit/test_webapp.py

+ 22 - 4
searx/utils.py

@@ -369,6 +369,16 @@ def _get_lang_to_lc_dict(lang_list):
     return value
 
 
+# Look up lang_code in babel's get_global(key) table, translating our hyphenated codes to babel's underscored form and back.
+# get_global holds miscellaneous locale and territory data; see https://github.com/python-babel/babel/blob/master/babel/core.py
+def _get_from_babel(lang_code, key):
+    match = get_global(key).get(lang_code.replace('-', '_'))
+    # only str results are converted back to hyphens; for some keys, such as territory_aliases, match may be a list and is returned as-is
+    if isinstance(match, str):
+        return match.replace('_', '-')
+    return match
+
+
 def _match_language(lang_code, lang_list=[], custom_aliases={}):  # pylint: disable=W0102
     """auxiliary function to match lang_code in lang_list"""
     # replace language code with a custom alias if necessary
@@ -379,9 +389,11 @@ def _match_language(lang_code, lang_list=[], custom_aliases={}):  # pylint: disa
         return lang_code
 
     # try to get the most likely country for this language
-    subtags = get_global('likely_subtags').get(lang_code)
+    subtags = _get_from_babel(lang_code, 'likely_subtags')
     if subtags:
-        subtag_parts = subtags.split('_')
+        if subtags in lang_list:
+            return subtags
+        subtag_parts = subtags.split('-')
         new_code = subtag_parts[0] + '-' + subtag_parts[-1]
         if new_code in custom_aliases:
             new_code = custom_aliases[new_code]
@@ -402,16 +414,22 @@ def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US
     locale_parts = locale_code.split('-')
     lang_code = locale_parts[0]
 
+    # if locale_code has script, try matching without it
+    if len(locale_parts) > 2:
+        language = _match_language(lang_code + '-' + locale_parts[-1], lang_list, custom_aliases)
+        if language:
+            return language
+
     # try to get language using an equivalent country code
     if len(locale_parts) > 1:
-        country_alias = get_global('territory_aliases').get(locale_parts[-1])
+        country_alias = _get_from_babel(locale_parts[-1], 'territory_aliases')
         if country_alias:
             language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
             if language:
                 return language
 
     # try to get language using an equivalent language code
-    alias = get_global('language_aliases').get(lang_code)
+    alias = _get_from_babel(lang_code, 'language_aliases')
     if alias:
         language = _match_language(alias, lang_list, custom_aliases)
         if language:

+ 8 - 0
tests/unit/test_utils.py

@@ -92,6 +92,14 @@ class TestUtils(SearxTestCase):
         self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback')
         self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp')
 
+        # handle script tags
+        self.assertEqual(utils.match_language('zh-CN', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hans-CN')
+        self.assertEqual(utils.match_language('zh-TW', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hant-TW')
+        self.assertEqual(utils.match_language('zh-Hans-CN', ['zh-CN', 'zh-TW']), 'zh-CN')
+        self.assertEqual(utils.match_language('zh-Hant-TW', ['zh-CN', 'zh-TW']), 'zh-TW')
+        self.assertEqual(utils.match_language('zh-Hans', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-CN')
+        self.assertEqual(utils.match_language('zh-Hant', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-TW')
+
         aliases = {'en-GB': 'en-UK', 'he': 'iw'}
 
         # guess country

+ 2 - 2
tests/unit/test_webapp.py

@@ -211,12 +211,12 @@ class ViewsTestCase(SearxTestCase):
         result = self.app.get('/preferences', headers={'Accept-Language': 'zh-tw;q=0.8'})
         self.assertEqual(result.status_code, 200)
         self.assertIn(
-            b'<option value="zh_TW" selected="selected">',
+            b'<option value="zh_Hant_TW" selected="selected">',
             result.data,
             'Interface locale ignored browser preference.'
         )
         self.assertIn(
-            b'<option value="zh-TW" selected="selected">',
+            b'<option value="zh_Hant_TW" selected="selected">',
             result.data,
             'Search language ignored browser preference.'
         )