Browse Source

refactor engine's search language handling

Add match_language function in utils to match any user-given
language code against the list of languages supported by an engine.

Also add a language_aliases dict on each engine to translate
standard language codes into the custom codes used by that engine.
Marc Abonce Seguin 7 years ago
parent
commit
772c048d01
42 changed files with 273 additions and 169 deletions
  1. 0 0
      searx/data/engines_languages.json
  2. 20 1
      searx/engines/__init__.py
  3. 2 2
      searx/engines/archlinux.py
  4. 4 2
      searx/engines/bing.py
  5. 2 21
      searx/engines/bing_images.py
  6. 3 3
      searx/engines/bing_news.py
  7. 4 3
      searx/engines/bing_videos.py
  8. 2 1
      searx/engines/dailymotion.py
  9. 18 29
      searx/engines/duckduckgo.py
  10. 4 3
      searx/engines/duckduckgo_definitions.py
  11. 4 1
      searx/engines/duckduckgo_images.py
  12. 14 15
      searx/engines/google.py
  13. 4 2
      searx/engines/google_news.py
  14. 3 10
      searx/engines/qwant.py
  15. 3 5
      searx/engines/swisscows.py
  16. 3 2
      searx/engines/wikidata.py
  17. 2 7
      searx/engines/wikipedia.py
  18. 12 12
      searx/engines/yahoo.py
  19. 5 2
      searx/engines/yahoo_news.py
  20. 0 4
      searx/preferences.py
  21. 7 3
      searx/query.py
  22. 2 2
      searx/templates/oscar/preferences.html
  23. 61 0
      searx/utils.py
  24. 15 7
      searx/webapp.py
  25. 6 1
      tests/unit/engines/test_archlinux.py
  26. 1 0
      tests/unit/engines/test_bing.py
  27. 0 1
      tests/unit/engines/test_bing_images.py
  28. 2 1
      tests/unit/engines/test_bing_news.py
  29. 0 1
      tests/unit/engines/test_bing_videos.py
  30. 2 1
      tests/unit/engines/test_dailymotion.py
  31. 12 6
      tests/unit/engines/test_duckduckgo.py
  32. 1 0
      tests/unit/engines/test_duckduckgo_definitions.py
  33. 0 1
      tests/unit/engines/test_duckduckgo_images.py
  34. 7 0
      tests/unit/engines/test_google.py
  35. 1 0
      tests/unit/engines/test_google_news.py
  36. 1 1
      tests/unit/engines/test_qwant.py
  37. 1 0
      tests/unit/engines/test_swisscows.py
  38. 1 0
      tests/unit/engines/test_wikidata.py
  39. 14 3
      tests/unit/engines/test_yahoo.py
  40. 2 1
      tests/unit/engines/test_yahoo_news.py
  41. 25 0
      tests/unit/test_utils.py
  42. 3 15
      utils/fetch_languages.py

File diff suppressed because it is too large
+ 0 - 0
searx/data/engines_languages.json


+ 20 - 1
searx/engines/__init__.py

@@ -20,13 +20,14 @@ import sys
 import threading
 import threading
 from os.path import realpath, dirname
 from os.path import realpath, dirname
 from io import open
 from io import open
+from babel.localedata import locale_identifiers
 from flask_babel import gettext
 from flask_babel import gettext
 from operator import itemgetter
 from operator import itemgetter
 from json import loads
 from json import loads
 from requests import get
 from requests import get
 from searx import settings
 from searx import settings
 from searx import logger
 from searx import logger
-from searx.utils import load_module
+from searx.utils import load_module, match_language
 
 
 
 
 logger = logger.getChild('engines')
 logger = logger.getChild('engines')
@@ -38,6 +39,8 @@ engines = {}
 categories = {'general': []}
 categories = {'general': []}
 
 
 languages = loads(open(engine_dir + '/../data/engines_languages.json', 'r', encoding='utf-8').read())
 languages = loads(open(engine_dir + '/../data/engines_languages.json', 'r', encoding='utf-8').read())
+babel_langs = [lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0]
+               for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())]
 
 
 engine_shortcuts = {}
 engine_shortcuts = {}
 engine_default_args = {'paging': False,
 engine_default_args = {'paging': False,
@@ -97,6 +100,22 @@ def load_engine(engine_data):
     if engine_data['name'] in languages:
     if engine_data['name'] in languages:
         setattr(engine, 'supported_languages', languages[engine_data['name']])
         setattr(engine, 'supported_languages', languages[engine_data['name']])
 
 
+    # find custom aliases for non standard language codes
+    if hasattr(engine, 'supported_languages'):
+        if hasattr(engine, 'language_aliases'):
+            language_aliases = getattr(engine, 'language_aliases')
+        else:
+            language_aliases = {}
+
+        for engine_lang in getattr(engine, 'supported_languages'):
+            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
+            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
+               iso_lang not in getattr(engine, 'supported_languages'):
+                language_aliases[iso_lang] = engine_lang
+
+        if language_aliases:
+            setattr(engine, 'language_aliases', language_aliases)
+
     # assign language fetching method if auxiliary method exists
     # assign language fetching method if auxiliary method exists
     if hasattr(engine, '_fetch_supported_languages'):
     if hasattr(engine, '_fetch_supported_languages'):
         setattr(engine, 'fetch_supported_languages',
         setattr(engine, 'fetch_supported_languages',

+ 2 - 2
searx/engines/archlinux.py

@@ -99,13 +99,13 @@ supported_languages = dict(lang_urls, **main_langs)
 
 
 # do search-request
 # do search-request
 def request(query, params):
 def request(query, params):
-    # translate the locale (e.g. 'en_US') to language code ('en')
+    # translate the locale (e.g. 'en-US') to language code ('en')
     language = locale_to_lang_code(params['language'])
     language = locale_to_lang_code(params['language'])
 
 
     # if our language is hosted on the main site, we need to add its name
     # if our language is hosted on the main site, we need to add its name
     # to the query in order to narrow the results to that language
     # to the query in order to narrow the results to that language
     if language in main_langs:
     if language in main_langs:
-        query += '(' + main_langs[language] + ')'
+        query += b' (' + main_langs[language] + b')'
 
 
     # prepare the request parameters
     # prepare the request parameters
     query = urlencode({'search': query})
     query = urlencode({'search': query})

+ 4 - 2
searx/engines/bing.py

@@ -16,12 +16,14 @@
 from lxml import html
 from lxml import html
 from searx.engines.xpath import extract_text
 from searx.engines.xpath import extract_text
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['general']
 categories = ['general']
 paging = True
 paging = True
 language_support = True
 language_support = True
 supported_languages_url = 'https://www.bing.com/account/general'
 supported_languages_url = 'https://www.bing.com/account/general'
+language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
 
 
 # search-url
 # search-url
 base_url = 'https://www.bing.com/'
 base_url = 'https://www.bing.com/'
@@ -32,9 +34,9 @@ search_string = 'search?{query}&first={offset}'
 def request(query, params):
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
     offset = (params['pageno'] - 1) * 10 + 1
 
 
-    lang = params['language'].split('-')[0].upper()
+    lang = match_language(params['language'], supported_languages, language_aliases)
 
 
-    query = u'language:{} {}'.format(lang, query.decode('utf-8')).encode('utf-8')
+    query = u'language:{} {}'.format(lang.split('-')[0].upper(), query.decode('utf-8')).encode('utf-8')
 
 
     search_path = search_string.format(
     search_path = search_string.format(
         query=urlencode({'q': query}),
         query=urlencode({'q': query}),

+ 2 - 21
searx/engines/bing_images.py

@@ -19,6 +19,7 @@ from lxml import html
 from json import loads
 from json import loads
 import re
 import re
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['images']
 categories = ['images']
@@ -46,26 +47,6 @@ safesearch_types = {2: 'STRICT',
 _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
 _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
 
 
 
 
-# get supported region code
-def get_region_code(lang, lang_list=None):
-    region = None
-    if lang in (lang_list or supported_languages):
-        region = lang
-    elif lang.startswith('no'):
-        region = 'nb-NO'
-    else:
-        # try to get a supported country code with language
-        lang = lang.split('-')[0]
-        for lc in (lang_list or supported_languages):
-            if lang == lc.split('-')[0]:
-                region = lc
-                break
-    if region:
-        return region.lower()
-    else:
-        return 'en-us'
-
-
 # do search-request
 # do search-request
 def request(query, params):
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
     offset = (params['pageno'] - 1) * 10 + 1
@@ -74,7 +55,7 @@ def request(query, params):
         query=urlencode({'q': query}),
         query=urlencode({'q': query}),
         offset=offset)
         offset=offset)
 
 
-    language = get_region_code(params['language'])
+    language = match_language(params['language'], supported_languages).lower()
 
 
     params['cookies']['SRCHHPGUSR'] = \
     params['cookies']['SRCHHPGUSR'] = \
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')

+ 3 - 3
searx/engines/bing_news.py

@@ -14,8 +14,8 @@
 from datetime import datetime
 from datetime import datetime
 from dateutil import parser
 from dateutil import parser
 from lxml import etree
 from lxml import etree
-from searx.utils import list_get
-from searx.engines.bing import _fetch_supported_languages, supported_languages_url
+from searx.utils import list_get, match_language
+from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
 from searx.url_utils import urlencode, urlparse, parse_qsl
 from searx.url_utils import urlencode, urlparse, parse_qsl
 
 
 # engine dependent config
 # engine dependent config
@@ -71,7 +71,7 @@ def request(query, params):
 
 
     offset = (params['pageno'] - 1) * 10 + 1
     offset = (params['pageno'] - 1) * 10 + 1
 
 
-    language = params['language']
+    language = match_language(params['language'], supported_languages, language_aliases)
 
 
     params['url'] = _get_url(query, language, offset, params['time_range'])
     params['url'] = _get_url(query, language, offset, params['time_range'])
 
 

+ 4 - 3
searx/engines/bing_videos.py

@@ -12,9 +12,10 @@
 
 
 from json import loads
 from json import loads
 from lxml import html
 from lxml import html
-from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code
+from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url
 from searx.engines.xpath import extract_text
 from searx.engines.xpath import extract_text
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 
 
 categories = ['videos']
 categories = ['videos']
@@ -47,8 +48,8 @@ def request(query, params):
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
 
 
     # language cookie
     # language cookie
-    region = get_region_code(params['language'], lang_list=supported_languages)
-    params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1'
+    language = match_language(params['language'], supported_languages).lower()
+    params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1'
 
 
     # query and paging
     # query and paging
     params['url'] = search_url.format(query=urlencode({'q': query}),
     params['url'] = search_url.format(query=urlencode({'q': query}),

+ 2 - 1
searx/engines/dailymotion.py

@@ -15,6 +15,7 @@
 from json import loads
 from json import loads
 from datetime import datetime
 from datetime import datetime
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['videos']
 categories = ['videos']
@@ -32,7 +33,7 @@ supported_languages_url = 'https://api.dailymotion.com/languages'
 
 
 # do search-request
 # do search-request
 def request(query, params):
 def request(query, params):
-    locale = params['language']
+    locale = match_language(params['language'], supported_languages)
 
 
     params['url'] = search_url.format(
     params['url'] = search_url.format(
         query=urlencode({'search': query, 'localization': locale}),
         query=urlencode({'search': query, 'localization': locale}),

+ 18 - 29
searx/engines/duckduckgo.py

@@ -18,6 +18,7 @@ from json import loads
 from searx.engines.xpath import extract_text
 from searx.engines.xpath import extract_text
 from searx.poolrequests import get
 from searx.poolrequests import get
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['general']
 categories = ['general']
@@ -26,6 +27,16 @@ language_support = True
 supported_languages_url = 'https://duckduckgo.com/util/u172.js'
 supported_languages_url = 'https://duckduckgo.com/util/u172.js'
 time_range_support = True
 time_range_support = True
 
 
+language_aliases = {
+    'ar-SA': 'ar-XA',
+    'es-419': 'es-XL',
+    'ja': 'jp-JP',
+    'ko': 'kr-KR',
+    'sl-SI': 'sl-SL',
+    'zh-TW': 'tzh-TW',
+    'zh-HK': 'tzh-HK'
+}
+
 # search-url
 # search-url
 url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}'
 url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}'
 time_range_url = '&df={range}'
 time_range_url = '&df={range}'
@@ -42,34 +53,12 @@ content_xpath = './/a[@class="result__snippet"]'
 
 
 
 
 # match query's language to a region code that duckduckgo will accept
 # match query's language to a region code that duckduckgo will accept
-def get_region_code(lang, lang_list=None):
-    # custom fixes for languages
-    if lang[:2] == 'ja':
-        region_code = 'jp-jp'
-    elif lang[:2] == 'sl':
-        region_code = 'sl-sl'
-    elif lang == 'zh-TW':
-        region_code = 'tw-tzh'
-    elif lang == 'zh-HK':
-        region_code = 'hk-tzh'
-    elif lang[-2:] == 'SA':
-        region_code = 'xa-' + lang.split('-')[0]
-    elif lang[-2:] == 'GB':
-        region_code = 'uk-' + lang.split('-')[0]
-    else:
-        region_code = lang.split('-')
-        if len(region_code) == 2:
-            # country code goes first
-            region_code = region_code[1].lower() + '-' + region_code[0].lower()
-        else:
-            # tries to get a country code from language
-            region_code = region_code[0].lower()
-            for lc in (lang_list or supported_languages):
-                lc = lc.split('-')
-                if region_code == lc[0]:
-                    region_code = lc[1].lower() + '-' + lc[0].lower()
-                    break
-    return region_code
+def get_region_code(lang, lang_list=[]):
+    lang_code = match_language(lang, lang_list, language_aliases, 'wt-WT')
+    lang_parts = lang_code.split('-')
+
+    # country code goes first
+    return lang_parts[1].lower() + '-' + lang_parts[0].lower()
 
 
 
 
 # do search-request
 # do search-request
@@ -79,7 +68,7 @@ def request(query, params):
 
 
     offset = (params['pageno'] - 1) * 30
     offset = (params['pageno'] - 1) * 30
 
 
-    region_code = get_region_code(params['language'])
+    region_code = get_region_code(params['language'], supported_languages)
     params['url'] = url.format(
     params['url'] = url.format(
         query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
         query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
 
 

+ 4 - 3
searx/engines/duckduckgo_definitions.py

@@ -2,9 +2,9 @@ import json
 from lxml import html
 from lxml import html
 from re import compile
 from re import compile
 from searx.engines.xpath import extract_text
 from searx.engines.xpath import extract_text
-from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url
+from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
-from searx.utils import html_to_text
+from searx.utils import html_to_text, match_language
 
 
 url = 'https://api.duckduckgo.com/'\
 url = 'https://api.duckduckgo.com/'\
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
@@ -24,7 +24,8 @@ def result_to_text(url, text, htmlResult):
 
 
 def request(query, params):
 def request(query, params):
     params['url'] = url.format(query=urlencode({'q': query}))
     params['url'] = url.format(query=urlencode({'q': query}))
-    params['headers']['Accept-Language'] = params['language'].split('-')[0]
+    language = match_language(params['language'], supported_languages, language_aliases)
+    params['headers']['Accept-Language'] = language.split('-')[0]
     return params
     return params
 
 
 
 

+ 4 - 1
searx/engines/duckduckgo_images.py

@@ -15,7 +15,10 @@
 
 
 from json import loads
 from json import loads
 from searx.engines.xpath import extract_text
 from searx.engines.xpath import extract_text
-from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, get_region_code
+from searx.engines.duckduckgo import (
+    _fetch_supported_languages, supported_languages_url,
+    get_region_code, language_aliases
+)
 from searx.poolrequests import get
 from searx.poolrequests import get
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
 
 

+ 14 - 15
searx/engines/google.py

@@ -14,6 +14,7 @@ from lxml import html, etree
 from searx.engines.xpath import extract_text, extract_url
 from searx.engines.xpath import extract_text, extract_url
 from searx import logger
 from searx import logger
 from searx.url_utils import urlencode, urlparse, parse_qsl
 from searx.url_utils import urlencode, urlparse, parse_qsl
+from searx.utils import match_language
 
 
 logger = logger.getChild('google engine')
 logger = logger.getChild('google engine')
 
 
@@ -165,22 +166,20 @@ def extract_text_from_dom(result, xpath):
 def request(query, params):
 def request(query, params):
     offset = (params['pageno'] - 1) * 10
     offset = (params['pageno'] - 1) * 10
 
 
+    language = match_language(params['language'], supported_languages)
+    language_array = language.split('-')
+    if params['language'].find('-') > 0:
+        country = params['language'].split('-')[1]
+    elif len(language_array) == 2:
+        country = language_array[1]
+    else:
+        country = 'US'
+
     # temporary fix until a way of supporting en-US is found
     # temporary fix until a way of supporting en-US is found
-    if params['language'] == 'en-US':
-        params['language'] = 'en-GB'
+    if language == 'en-US':
+        country = 'GB'
 
 
-    if params['language'][:2] == 'jv':
-        language = 'jw'
-        country = 'ID'
-        url_lang = 'lang_jw'
-    else:
-        language_array = params['language'].lower().split('-')
-        if len(language_array) == 2:
-            country = language_array[1]
-        else:
-            country = 'US'
-        language = language_array[0] + ',' + language_array[0] + '-' + country
-        url_lang = 'lang_' + language_array[0]
+    url_lang = 'lang_' + language
 
 
     if use_locale_domain:
     if use_locale_domain:
         google_hostname = country_to_hostname.get(country.upper(), default_hostname)
         google_hostname = country_to_hostname.get(country.upper(), default_hostname)
@@ -196,7 +195,7 @@ def request(query, params):
     if params['time_range'] in time_range_dict:
     if params['time_range'] in time_range_dict:
         params['url'] += time_range_search.format(range=time_range_dict[params['time_range']])
         params['url'] += time_range_search.format(range=time_range_dict[params['time_range']])
 
 
-    params['headers']['Accept-Language'] = language
+    params['headers']['Accept-Language'] = language + ',' + language + '-' + country
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
 
 
     params['google_hostname'] = google_hostname
     params['google_hostname'] = google_hostname

+ 4 - 2
searx/engines/google_news.py

@@ -13,6 +13,7 @@
 from lxml import html
 from lxml import html
 from searx.engines.google import _fetch_supported_languages, supported_languages_url
 from searx.engines.google import _fetch_supported_languages, supported_languages_url
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # search-url
 # search-url
 categories = ['news']
 categories = ['news']
@@ -50,8 +51,9 @@ def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       search_options=urlencode(search_options))
                                       search_options=urlencode(search_options))
 
 
-    language_array = params['language'].lower().split('-')
-    params['url'] += '&lr=lang_' + language_array[0]
+    language = match_language(params['language'], supported_languages).split('-')[0]
+    if language:
+        params['url'] += '&lr=lang_' + language
 
 
     return params
     return params
 
 

+ 3 - 10
searx/engines/qwant.py

@@ -14,6 +14,7 @@ from datetime import datetime
 from json import loads
 from json import loads
 from searx.utils import html_to_text
 from searx.utils import html_to_text
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = None
 categories = None
@@ -45,16 +46,8 @@ def request(query, params):
                                    offset=offset)
                                    offset=offset)
 
 
     # add language tag
     # add language tag
-    if params['language'] == 'no' or params['language'].startswith('no-'):
-        params['language'] = params['language'].replace('no', 'nb', 1)
-    if params['language'].find('-') < 0:
-        # tries to get a country code from language
-        for lang in supported_languages:
-            lc = lang.split('-')
-            if params['language'] == lc[0]:
-                params['language'] = lang
-                break
-    params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
+    language = match_language(params['language'], supported_languages)
+    params['url'] += '&locale=' + language.replace('-', '_').lower()
 
 
     return params
     return params
 
 

+ 3 - 5
searx/engines/swisscows.py

@@ -14,6 +14,7 @@ from json import loads
 import re
 import re
 from lxml.html import fromstring
 from lxml.html import fromstring
 from searx.url_utils import unquote, urlencode
 from searx.url_utils import unquote, urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['general', 'images']
 categories = ['general', 'images']
@@ -35,11 +36,8 @@ regex_img_url_remove_start = re.compile(b'^https?://i\.swisscows\.ch/\?link=')
 
 
 # do search-request
 # do search-request
 def request(query, params):
 def request(query, params):
-    if params['language'].split('-')[0] == 'no':
-        region = 'nb-NO'
-    else:
-        region = params['language']
-        ui_language = params['language'].split('-')[0]
+    region = match_language(params['language'], supported_languages)
+    ui_language = region.split('-')[0]
 
 
     search_path = search_string.format(
     search_path = search_string.format(
         query=urlencode({'query': query, 'uiLanguage': ui_language, 'region': region}),
         query=urlencode({'query': query, 'uiLanguage': ui_language, 'region': region}),

+ 3 - 2
searx/engines/wikidata.py

@@ -16,6 +16,7 @@ from searx.poolrequests import get
 from searx.engines.xpath import extract_text
 from searx.engines.xpath import extract_text
 from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
 from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 from json import loads
 from json import loads
 from lxml.html import fromstring
 from lxml.html import fromstring
@@ -56,7 +57,7 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
 
 
 
 
 def request(query, params):
 def request(query, params):
-    language = params['language'].split('-')[0]
+    language = match_language(params['language'], supported_languages).split('-')[0]
 
 
     params['url'] = url_search.format(
     params['url'] = url_search.format(
         query=urlencode({'label': query, 'language': language}))
         query=urlencode({'label': query, 'language': language}))
@@ -68,7 +69,7 @@ def response(resp):
     html = fromstring(resp.text)
     html = fromstring(resp.text)
     wikidata_ids = html.xpath(wikidata_ids_xpath)
     wikidata_ids = html.xpath(wikidata_ids_xpath)
 
 
-    language = resp.search_params['language'].split('-')[0]
+    language = match_language(resp.search_params['language'], supported_languages).split('-')[0]
 
 
     # TODO: make requests asynchronous to avoid timeout when result_count > 1
     # TODO: make requests asynchronous to avoid timeout when result_count > 1
     for wikidata_id in wikidata_ids[:result_count]:
     for wikidata_id in wikidata_ids[:result_count]:

+ 2 - 7
searx/engines/wikipedia.py

@@ -13,6 +13,7 @@
 from json import loads
 from json import loads
 from lxml.html import fromstring
 from lxml.html import fromstring
 from searx.url_utils import quote, urlencode
 from searx.url_utils import quote, urlencode
+from searx.utils import match_language
 
 
 # search-url
 # search-url
 base_url = u'https://{language}.wikipedia.org/'
 base_url = u'https://{language}.wikipedia.org/'
@@ -30,13 +31,7 @@ supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
 
 
 # set language in base_url
 # set language in base_url
 def url_lang(lang):
 def url_lang(lang):
-    lang = lang.split('-')[0]
-    if lang not in supported_languages:
-        language = 'en'
-    else:
-        language = lang
-
-    return language
+    return match_language(lang, supported_languages).split('-')[0]
 
 
 
 
 # do search-request
 # do search-request

+ 12 - 12
searx/engines/yahoo.py

@@ -14,6 +14,7 @@
 from lxml import html
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url
 from searx.engines.xpath import extract_text, extract_url
 from searx.url_utils import unquote, urlencode
 from searx.url_utils import unquote, urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['general']
 categories = ['general']
@@ -39,6 +40,8 @@ time_range_dict = {'day': ['1d', 'd'],
                    'week': ['1w', 'w'],
                    'week': ['1w', 'w'],
                    'month': ['1m', 'm']}
                    'month': ['1m', 'm']}
 
 
+language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
+
 
 
 # remove yahoo-specific tracking-url
 # remove yahoo-specific tracking-url
 def parse_url(url_string):
 def parse_url(url_string):
@@ -70,23 +73,16 @@ def _get_url(query, offset, language, time_range):
                                         lang=language)
                                         lang=language)
 
 
 
 
-def _get_language(params):
-    if params['language'][:2] == 'zh':
-        if params['language'] == 'zh' or params['language'] == 'zh-CH':
-            return 'szh'
-        else:
-            return 'tzh'
-    else:
-        return params['language'].split('-')[0]
-
-
 # do search-request
 # do search-request
 def request(query, params):
 def request(query, params):
     if params['time_range'] and params['time_range'] not in time_range_dict:
     if params['time_range'] and params['time_range'] not in time_range_dict:
         return params
         return params
 
 
     offset = (params['pageno'] - 1) * 10 + 1
     offset = (params['pageno'] - 1) * 10 + 1
-    language = _get_language(params)
+    language = match_language(params['language'], supported_languages, language_aliases)
+    if language not in language_aliases.values():
+        language = language.split('-')[0]
+    language = language.replace('-', '_').lower()
 
 
     params['url'] = _get_url(query, offset, language, params['time_range'])
     params['url'] = _get_url(query, offset, language, params['time_range'])
 
 
@@ -145,7 +141,11 @@ def _fetch_supported_languages(resp):
     dom = html.fromstring(resp.text)
     dom = html.fromstring(resp.text)
     options = dom.xpath('//div[@id="yschlang"]/span/label/input')
     options = dom.xpath('//div[@id="yschlang"]/span/label/input')
     for option in options:
     for option in options:
-        code = option.xpath('./@value')[0][5:].replace('_', '-')
+        code_parts = option.xpath('./@value')[0][5:].split('_')
+        if len(code_parts) == 2:
+            code = code_parts[0] + '-' + code_parts[1].upper()
+        else:
+            code = code_parts[0]
         supported_languages.append(code)
         supported_languages.append(code)
 
 
     return supported_languages
     return supported_languages

+ 5 - 2
searx/engines/yahoo_news.py

@@ -13,9 +13,12 @@ import re
 from datetime import datetime, timedelta
 from datetime import datetime, timedelta
 from lxml import html
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url
 from searx.engines.xpath import extract_text, extract_url
-from searx.engines.yahoo import parse_url, _fetch_supported_languages, supported_languages_url
+from searx.engines.yahoo import (
+    parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
+)
 from dateutil import parser
 from dateutil import parser
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['news']
 categories = ['news']
@@ -38,7 +41,7 @@ suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
 def request(query, params):
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
     offset = (params['pageno'] - 1) * 10 + 1
 
 
-    language = params['language'].split('-')[0]
+    language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
 
 
     params['url'] = search_url.format(offset=offset,
     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'p': query}),
                                       query=urlencode({'p': query}),

+ 0 - 4
searx/preferences.py

@@ -115,10 +115,6 @@ class SearchLanguageSetting(EnumStringSetting):
                 pass
                 pass
             elif lang in self.choices:
             elif lang in self.choices:
                 data = lang
                 data = lang
-            elif data == 'nb-NO':
-                data = 'no-NO'
-            elif data == 'ar-XA':
-                data = 'ar-SA'
             else:
             else:
                 data = self.value
                 data = self.value
         self.value = data
         self.value = data

+ 7 - 3
searx/query.py

@@ -96,9 +96,13 @@ class RawTextQuery(object):
                                 break
                                 break
 
 
                 # user may set a valid, yet not selectable language
                 # user may set a valid, yet not selectable language
-                if not self.languages and VALID_LANGUAGE_CODE.match(lang):
-                    self.languages.append(lang)
-                    parse_next = True
+                if VALID_LANGUAGE_CODE.match(lang):
+                    lang_parts = lang.split('-')
+                    if len(lang_parts) > 1:
+                        lang = lang_parts[0].lower() + '-' + lang_parts[1].upper()
+                    if lang not in self.languages:
+                        self.languages.append(lang)
+                        parse_next = True
 
 
             # this force a engine or category
             # this force a engine or category
             if query_part[0] == '!' or query_part[0] == '?':
             if query_part[0] == '!' or query_part[0] == '?':

+ 2 - 2
searx/templates/oscar/preferences.html

@@ -187,7 +187,7 @@
                                     </td>
                                     </td>
                                     <th>{{ search_engine.name }}</th>
                                     <th>{{ search_engine.name }}</th>
 				    <td class="name">{{ shortcuts[search_engine.name] }}</td>
 				    <td class="name">{{ shortcuts[search_engine.name] }}</td>
-					<td>{{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }}</td>
+					<td>{{ support_toggle(stats[search_engine.name].supports_selected_language) }}</td>
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
@@ -197,7 +197,7 @@
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
-					<td>{{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }}</td>
+					<td>{{ support_toggle(stats[search_engine.name].supports_selected_language) }}</td>
 					<td>{{ shortcuts[search_engine.name] }}</td>
 					<td>{{ shortcuts[search_engine.name] }}</td>
                                     <th>{{ search_engine.name }}</th>
                                     <th>{{ search_engine.name }}</th>
                                     <td class="onoff-checkbox">
                                     <td class="onoff-checkbox">

+ 61 - 0
searx/utils.py

@@ -4,6 +4,7 @@ import hmac
 import os
 import os
 import re
 import re
 
 
+from babel.core import get_global
 from babel.dates import format_date
 from babel.dates import format_date
 from codecs import getincrementalencoder
 from codecs import getincrementalencoder
 from imp import load_source
 from imp import load_source
@@ -12,6 +13,7 @@ from os.path import splitext, join
 from random import choice
 from random import choice
 import sys
 import sys
 
 
+from searx import settings
 from searx.version import VERSION_STRING
 from searx.version import VERSION_STRING
 from searx.languages import language_codes
 from searx.languages import language_codes
 from searx import settings
 from searx import settings
@@ -322,6 +324,65 @@ def is_valid_lang(lang):
         return False
         return False
 
 
 
 
+# auxiliary function to match lang_code in lang_list
+def _match_language(lang_code, lang_list=[], custom_aliases={}):
+    # replace language code with a custom alias if necessary
+    if lang_code in custom_aliases:
+        lang_code = custom_aliases[lang_code]
+
+    if lang_code in lang_list:
+        return lang_code
+
+    # try to get the most likely country for this language
+    subtags = get_global('likely_subtags').get(lang_code)
+    if subtags:
+        subtag_parts = subtags.split('_')
+        new_code = subtag_parts[0] + '-' + subtag_parts[-1]
+        if new_code in custom_aliases:
+            new_code = custom_aliases[new_code]
+        if new_code in lang_list:
+            return new_code
+
+    # try to get any supported country for this language
+    for lc in lang_list:
+        if lang_code == lc.split('-')[0]:
+            return lc
+
+    return None
+
+
+# get the language code from lang_list that best matches locale_code
+def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):
+    # try to get language from given locale_code
+    language = _match_language(locale_code, lang_list, custom_aliases)
+    if language:
+        return language
+
+    locale_parts = locale_code.split('-')
+    lang_code = locale_parts[0]
+
+    # try to get language using an equivalent country code
+    if len(locale_parts) > 1:
+        country_alias = get_global('territory_aliases').get(locale_parts[-1])
+        if country_alias:
+            language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
+            if language:
+                return language
+
+    # try to get language using an equivalent language code
+    alias = get_global('language_aliases').get(lang_code)
+    if alias:
+        language = _match_language(alias, lang_list, custom_aliases)
+        if language:
+            return language
+
+    if lang_code != locale_code:
+        # try to get language from given language without giving the country
+        language = _match_language(lang_code, lang_list, custom_aliases)
+
+    return language or fallback
+
+
 def load_module(filename, module_dir):
 def load_module(filename, module_dir):
     modname = splitext(filename)[0]
     modname = splitext(filename)[0]
     if modname in sys.modules:
     if modname in sys.modules:

+ 15 - 7
searx/webapp.py

@@ -58,16 +58,16 @@ from searx.engines import (
 from searx.utils import (
 from searx.utils import (
     UnicodeWriter, highlight_content, html_to_text, get_resources_directory,
     UnicodeWriter, highlight_content, html_to_text, get_resources_directory,
     get_static_files, get_result_templates, get_themes, gen_useragent,
     get_static_files, get_result_templates, get_themes, gen_useragent,
-    dict_subset, prettify_url
+    dict_subset, prettify_url, match_language
 )
 )
 from searx.version import VERSION_STRING
 from searx.version import VERSION_STRING
-from searx.languages import language_codes
+from searx.languages import language_codes as languages
 from searx.search import SearchWithPlugins, get_search_query_from_webapp
 from searx.search import SearchWithPlugins, get_search_query_from_webapp
 from searx.query import RawTextQuery
 from searx.query import RawTextQuery
 from searx.autocomplete import searx_bang, backends as autocomplete_backends
 from searx.autocomplete import searx_bang, backends as autocomplete_backends
 from searx.plugins import plugins
 from searx.plugins import plugins
 from searx.plugins.oa_doi_rewrite import get_doi_resolver
 from searx.plugins.oa_doi_rewrite import get_doi_resolver
-from searx.preferences import Preferences, ValidationException
+from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
 from searx.answerers import answerers
 from searx.answerers import answerers
 from searx.url_utils import urlencode, urlparse, urljoin
 from searx.url_utils import urlencode, urlparse, urljoin
 from searx.utils import new_hmac
 from searx.utils import new_hmac
@@ -133,7 +133,7 @@ if not searx_debug \
 babel = Babel(app)
 babel = Babel(app)
 
 
 rtl_locales = ['ar', 'arc', 'bcc', 'bqi', 'ckb', 'dv', 'fa', 'glk', 'he',
 rtl_locales = ['ar', 'arc', 'bcc', 'bqi', 'ckb', 'dv', 'fa', 'glk', 'he',
-               'ku', 'mzn', 'pnb'', ''ps', 'sd', 'ug', 'ur', 'yi']
+               'ku', 'mzn', 'pnb', 'ps', 'sd', 'ug', 'ur', 'yi']
 
 
 # used when translating category names
 # used when translating category names
 _category_names = (gettext('files'),
 _category_names = (gettext('files'),
@@ -352,9 +352,11 @@ def render(template_name, override_theme=None, **kwargs):
 
 
     kwargs['safesearch'] = str(request.preferences.get_value('safesearch'))
     kwargs['safesearch'] = str(request.preferences.get_value('safesearch'))
 
 
-    kwargs['language_codes'] = language_codes
+    kwargs['language_codes'] = languages
     if 'current_language' not in kwargs:
     if 'current_language' not in kwargs:
-        kwargs['current_language'] = request.preferences.get_value('language')
+        kwargs['current_language'] = match_language(request.preferences.get_value('language'),
+                                                    LANGUAGE_CODES,
+                                                    fallback=settings['search']['language'])
 
 
     # override url_for function in templates
     # override url_for function in templates
     kwargs['url_for'] = url_for_theme
     kwargs['url_for'] = url_for_theme
@@ -590,7 +592,9 @@ def index():
         infoboxes=result_container.infoboxes,
         infoboxes=result_container.infoboxes,
         paging=result_container.paging,
         paging=result_container.paging,
         unresponsive_engines=result_container.unresponsive_engines,
         unresponsive_engines=result_container.unresponsive_engines,
-        current_language=search_query.lang,
+        current_language=match_language(search_query.lang,
+                                        LANGUAGE_CODES,
+                                        fallback=settings['search']['language']),
         base_url=get_base_url(),
         base_url=get_base_url(),
         theme=get_current_theme_name(),
         theme=get_current_theme_name(),
         favicons=global_favicons[themes.index(get_current_theme_name())]
         favicons=global_favicons[themes.index(get_current_theme_name())]
@@ -687,6 +691,10 @@ def preferences():
                              'warn_time': False}
                              'warn_time': False}
             if e.timeout > settings['outgoing']['request_timeout']:
             if e.timeout > settings['outgoing']['request_timeout']:
                 stats[e.name]['warn_timeout'] = True
                 stats[e.name]['warn_timeout'] = True
+            if match_language(request.preferences.get_value('language'),
+                              getattr(e, 'supported_languages', []),
+                              getattr(e, 'language_aliases', {}), None):
+                stats[e.name]['supports_selected_language'] = True
 
 
     # get first element [0], the engine time,
     # get first element [0], the engine time,
     # and then the second element [1] : the time (the first one is the label)
     # and then the second element [1] : the time (the first one is the label)

+ 6 - 1
tests/unit/engines/test_archlinux.py

@@ -19,12 +19,17 @@ class TestArchLinuxEngine(SearxTestCase):
         query = 'test_query'
         query = 'test_query'
         dic = defaultdict(dict)
         dic = defaultdict(dict)
         dic['pageno'] = 1
         dic['pageno'] = 1
-        dic['language'] = 'en_US'
+        dic['language'] = 'en-US'
         params = archlinux.request(query, dic)
         params = archlinux.request(query, dic)
         self.assertTrue('url' in params)
         self.assertTrue('url' in params)
         self.assertTrue(query in params['url'])
         self.assertTrue(query in params['url'])
         self.assertTrue('wiki.archlinux.org' in params['url'])
         self.assertTrue('wiki.archlinux.org' in params['url'])
 
 
+        for lang, name in archlinux.main_langs:
+            dic['language'] = lang
+            params = archlinux.request(query, dic)
+            self.assertTrue(name in params['url'])
+
         for lang, domain in domains.items():
         for lang, domain in domains.items():
             dic['language'] = lang
             dic['language'] = lang
             params = archlinux.request(query, dic)
             params = archlinux.request(query, dic)

+ 1 - 0
tests/unit/engines/test_bing.py

@@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
 class TestBingEngine(SearxTestCase):
 class TestBingEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        bing.supported_languages = ['en', 'fr', 'zh-CHS', 'zh-CHT', 'pt-PT', 'pt-BR']
         query = u'test_query'
         query = u'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 0
         dicto['pageno'] = 0

+ 0 - 1
tests/unit/engines/test_bing_images.py

@@ -9,7 +9,6 @@ class TestBingImagesEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
         bing_images.supported_languages = ['fr-FR', 'en-US']
         bing_images.supported_languages = ['fr-FR', 'en-US']
-
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1

+ 2 - 1
tests/unit/engines/test_bing_news.py

@@ -8,10 +8,11 @@ import lxml
 class TestBingNewsEngine(SearxTestCase):
 class TestBingNewsEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        bing_news.supported_languages = ['en', 'fr']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         dicto['time_range'] = ''
         dicto['time_range'] = ''
         params = bing_news.request(query, dicto)
         params = bing_news.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn('url', params)

+ 0 - 1
tests/unit/engines/test_bing_videos.py

@@ -9,7 +9,6 @@ class TestBingVideosEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
         bing_videos.supported_languages = ['fr-FR', 'en-US']
         bing_videos.supported_languages = ['fr-FR', 'en-US']
-
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1

+ 2 - 1
tests/unit/engines/test_dailymotion.py

@@ -8,10 +8,11 @@ from searx.testing import SearxTestCase
 class TestDailymotionEngine(SearxTestCase):
 class TestDailymotionEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        dailymotion.supported_languages = ['en', 'fr']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 0
         dicto['pageno'] = 0
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         params = dailymotion.request(query, dicto)
         params = dailymotion.request(query, dicto)
         self.assertTrue('url' in params)
         self.assertTrue('url' in params)
         self.assertTrue(query in params['url'])
         self.assertTrue(query in params['url'])

+ 12 - 6
tests/unit/engines/test_duckduckgo.py

@@ -1,18 +1,21 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 from collections import defaultdict
 from collections import defaultdict
 import mock
 import mock
-from searx.engines import duckduckgo
+from searx.engines import load_engine, duckduckgo
 from searx.testing import SearxTestCase
 from searx.testing import SearxTestCase
 
 
 
 
 class TestDuckduckgoEngine(SearxTestCase):
 class TestDuckduckgoEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        duckduckgo = load_engine({'engine': 'duckduckgo', 'name': 'duckduckgo'})
+
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
-        dicto['language'] = 'de-CH'
         dicto['time_range'] = ''
         dicto['time_range'] = ''
+
+        dicto['language'] = 'de-CH'
         params = duckduckgo.request(query, dicto)
         params = duckduckgo.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn('url', params)
         self.assertIn(query, params['url'])
         self.assertIn(query, params['url'])
@@ -20,16 +23,19 @@ class TestDuckduckgoEngine(SearxTestCase):
         self.assertIn('ch-de', params['url'])
         self.assertIn('ch-de', params['url'])
         self.assertIn('s=0', params['url'])
         self.assertIn('s=0', params['url'])
 
 
-        # when ddg uses non standard code
+        # when ddg uses non standard codes
+        dicto['language'] = 'zh-HK'
+        params = duckduckgo.request(query, dicto)
+        self.assertIn('hk-tzh', params['url'])
+
         dicto['language'] = 'en-GB'
         dicto['language'] = 'en-GB'
         params = duckduckgo.request(query, dicto)
         params = duckduckgo.request(query, dicto)
         self.assertIn('uk-en', params['url'])
         self.assertIn('uk-en', params['url'])
 
 
         # no country given
         # no country given
-        duckduckgo.supported_languages = ['de-CH', 'en-US']
-        dicto['language'] = 'de'
+        dicto['language'] = 'en'
         params = duckduckgo.request(query, dicto)
         params = duckduckgo.request(query, dicto)
-        self.assertIn('ch-de', params['url'])
+        self.assertIn('us-en', params['url'])
 
 
     def test_no_url_in_request_year_time_range(self):
     def test_no_url_in_request_year_time_range(self):
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)

+ 1 - 0
tests/unit/engines/test_duckduckgo_definitions.py

@@ -18,6 +18,7 @@ class TestDDGDefinitionsEngine(SearxTestCase):
         self.assertEqual(result, 'Text in link')
         self.assertEqual(result, 'Text in link')
 
 
     def test_request(self):
     def test_request(self):
+        duckduckgo_definitions.supported_languages = ['en-US', 'es-ES']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1

+ 0 - 1
tests/unit/engines/test_duckduckgo_images.py

@@ -9,7 +9,6 @@ class TestDuckduckgoImagesEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
         duckduckgo_images.supported_languages = ['de-CH', 'en-US']
         duckduckgo_images.supported_languages = ['de-CH', 'en-US']
-
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['is_test'] = True
         dicto['is_test'] = True

+ 7 - 0
tests/unit/engines/test_google.py

@@ -15,6 +15,8 @@ class TestGoogleEngine(SearxTestCase):
         return response
         return response
 
 
     def test_request(self):
     def test_request(self):
+        google.supported_languages = ['en', 'fr', 'zh-CN']
+
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
@@ -31,6 +33,11 @@ class TestGoogleEngine(SearxTestCase):
         self.assertIn('google.co', params['url'])
         self.assertIn('google.co', params['url'])
         self.assertIn('en', params['headers']['Accept-Language'])
         self.assertIn('en', params['headers']['Accept-Language'])
 
 
+        dicto['language'] = 'zh'
+        params = google.request(query, dicto)
+        self.assertIn('google.com', params['url'])
+        self.assertIn('zh-CN', params['headers']['Accept-Language'])
+
     def test_response(self):
     def test_response(self):
         self.assertRaises(AttributeError, google.response, None)
         self.assertRaises(AttributeError, google.response, None)
         self.assertRaises(AttributeError, google.response, [])
         self.assertRaises(AttributeError, google.response, [])

+ 1 - 0
tests/unit/engines/test_google_news.py

@@ -9,6 +9,7 @@ from searx.testing import SearxTestCase
 class TestGoogleNewsEngine(SearxTestCase):
 class TestGoogleNewsEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        google_news.supported_languages = ['en-US', 'fr-FR']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1

+ 1 - 1
tests/unit/engines/test_qwant.py

@@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
 class TestQwantEngine(SearxTestCase):
 class TestQwantEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        qwant.supported_languages = ['en-US', 'fr-CA', 'fr-FR']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 0
         dicto['pageno'] = 0
@@ -26,7 +27,6 @@ class TestQwantEngine(SearxTestCase):
         self.assertIn('en_us', params['url'])
         self.assertIn('en_us', params['url'])
         self.assertIn('news', params['url'])
         self.assertIn('news', params['url'])
 
 
-        qwant.supported_languages = ['en', 'fr-FR', 'fr-CA']
         dicto['language'] = 'fr'
         dicto['language'] = 'fr'
         params = qwant.request(query, dicto)
         params = qwant.request(query, dicto)
         self.assertIn('fr_fr', params['url'])
         self.assertIn('fr_fr', params['url'])

+ 1 - 0
tests/unit/engines/test_swisscows.py

@@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
 class TestSwisscowsEngine(SearxTestCase):
 class TestSwisscowsEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        swisscows.supported_languages = ['de-AT', 'de-DE']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1

+ 1 - 0
tests/unit/engines/test_wikidata.py

@@ -9,6 +9,7 @@ from searx.testing import SearxTestCase
 class TestWikidataEngine(SearxTestCase):
 class TestWikidataEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        wikidata.supported_languages = ['en', 'es']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['language'] = 'en-US'
         dicto['language'] = 'en-US'

+ 14 - 3
tests/unit/engines/test_yahoo.py

@@ -25,11 +25,12 @@ class TestYahooEngine(SearxTestCase):
         self.assertEqual('https://this.is.the.url/', url)
         self.assertEqual('https://this.is.the.url/', url)
 
 
     def test_request(self):
     def test_request(self):
+        yahoo.supported_languages = ['en', 'fr', 'zh-CHT', 'zh-CHS']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
         dicto['time_range'] = ''
         dicto['time_range'] = ''
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         params = yahoo.request(query, dicto)
         params = yahoo.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn('url', params)
         self.assertIn(query, params['url'])
         self.assertIn(query, params['url'])
@@ -39,6 +40,16 @@ class TestYahooEngine(SearxTestCase):
         self.assertIn('sB', params['cookies'])
         self.assertIn('sB', params['cookies'])
         self.assertIn('fr', params['cookies']['sB'])
         self.assertIn('fr', params['cookies']['sB'])
 
 
+        dicto['language'] = 'zh'
+        params = yahoo.request(query, dicto)
+        self.assertIn('zh_chs', params['url'])
+        self.assertIn('zh_chs', params['cookies']['sB'])
+
+        dicto['language'] = 'zh-TW'
+        params = yahoo.request(query, dicto)
+        self.assertIn('zh_cht', params['url'])
+        self.assertIn('zh_cht', params['cookies']['sB'])
+
     def test_no_url_in_request_year_time_range(self):
     def test_no_url_in_request_year_time_range(self):
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         query = 'test_query'
         query = 'test_query'
@@ -168,5 +179,5 @@ class TestYahooEngine(SearxTestCase):
         self.assertEqual(type(languages), list)
         self.assertEqual(type(languages), list)
         self.assertEqual(len(languages), 3)
         self.assertEqual(len(languages), 3)
         self.assertIn('ar', languages)
         self.assertIn('ar', languages)
-        self.assertIn('zh-chs', languages)
-        self.assertIn('zh-cht', languages)
+        self.assertIn('zh-CHS', languages)
+        self.assertIn('zh-CHT', languages)

+ 2 - 1
tests/unit/engines/test_yahoo_news.py

@@ -9,10 +9,11 @@ from searx.testing import SearxTestCase
 class TestYahooNewsEngine(SearxTestCase):
 class TestYahooNewsEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        yahoo_news.supported_languages = ['en', 'fr']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         params = yahoo_news.request(query, dicto)
         params = yahoo_news.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn('url', params)
         self.assertIn(query, params['url'])
         self.assertIn(query, params['url'])

+ 25 - 0
tests/unit/test_utils.py

@@ -65,6 +65,31 @@ class TestUtils(SearxTestCase):
         for test_url, expected in data:
         for test_url, expected in data:
             self.assertEqual(utils.prettify_url(test_url, max_length=32), expected)
             self.assertEqual(utils.prettify_url(test_url, max_length=32), expected)
 
 
+    def test_match_language(self):
+        self.assertEqual(utils.match_language('es', ['es']), 'es')
+        self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback')
+        self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp')
+
+        aliases = {'en-GB': 'en-UK', 'he': 'iw'}
+
+        # guess country
+        self.assertEqual(utils.match_language('de-DE', ['de']), 'de')
+        self.assertEqual(utils.match_language('de', ['de-DE']), 'de-DE')
+        self.assertEqual(utils.match_language('es-CO', ['es-AR', 'es-ES', 'es-MX']), 'es-ES')
+        self.assertEqual(utils.match_language('es-CO', ['es-MX']), 'es-MX')
+        self.assertEqual(utils.match_language('en-UK', ['en-AU', 'en-GB', 'en-US']), 'en-GB')
+        self.assertEqual(utils.match_language('en-GB', ['en-AU', 'en-UK', 'en-US'], aliases), 'en-UK')
+
+        # language aliases
+        self.assertEqual(utils.match_language('iw', ['he']), 'he')
+        self.assertEqual(utils.match_language('he', ['iw'], aliases), 'iw')
+        self.assertEqual(utils.match_language('iw-IL', ['he']), 'he')
+        self.assertEqual(utils.match_language('he-IL', ['iw'], aliases), 'iw')
+        self.assertEqual(utils.match_language('iw', ['he-IL']), 'he-IL')
+        self.assertEqual(utils.match_language('he', ['iw-IL'], aliases), 'iw-IL')
+        self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL')
+        self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL')
+
 
 
 class TestHTMLTextExtractor(SearxTestCase):
 class TestHTMLTextExtractor(SearxTestCase):
 
 

+ 3 - 15
utils/fetch_languages.py

@@ -19,19 +19,6 @@ from searx.engines import initialize_engines, engines
 engines_languages_file = 'engines_languages.json'
 engines_languages_file = 'engines_languages.json'
 languages_file = 'languages.py'
 languages_file = 'languages.py'
 
 
-# custom fixes for non standard locale codes
-# sl-SL is technically not invalid, but still a mistake
-# TODO: move to respective engines
-locale_fixes = {
-    'sl-sl': 'sl-SI',
-    'ar-xa': 'ar-SA',
-    'es-xl': 'es-419',
-    'zh-chs': 'zh-Hans-CN',
-    'zh-cht': 'zh-Hant-TW',
-    'tzh-tw': 'zh-Hant-TW',
-    'tzh-hk': 'zh-Hant-HK'
-}
-
 
 
 # Fetches supported languages for each engine and writes json file with those.
 # Fetches supported languages for each engine and writes json file with those.
 def fetch_supported_languages():
 def fetch_supported_languages():
@@ -76,8 +63,9 @@ def join_language_lists(engines_languages):
         for lang_code in engines_languages[engine_name]:
         for lang_code in engines_languages[engine_name]:
 
 
             # apply custom fixes if necessary
             # apply custom fixes if necessary
-            if lang_code.lower() in locale_fixes:
-                lang_code = locale_fixes[lang_code.lower()]
+            if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values():
+                lang_code = next(lc for lc, alias in engines[engine_name].language_aliases.items()
+                                 if lang_code == alias)
 
 
             locale = get_locale(lang_code)
             locale = get_locale(lang_code)
 
 

Some files were not shown because too many files changed in this diff