Browse Source

Merge pull request #1252 from MarcAbonce/search-languages

[mod] Refactor engine's search language handling
Adam Tauber 7 years ago
parent
commit
283f6c9053
43 changed files with 419 additions and 311 deletions
  1. 0 0
      searx/data/engines_languages.json
  2. 20 1
      searx/engines/__init__.py
  3. 2 2
      searx/engines/archlinux.py
  4. 4 2
      searx/engines/bing.py
  5. 2 21
      searx/engines/bing_images.py
  6. 3 3
      searx/engines/bing_news.py
  7. 4 3
      searx/engines/bing_videos.py
  8. 2 1
      searx/engines/dailymotion.py
  9. 19 30
      searx/engines/duckduckgo.py
  10. 4 3
      searx/engines/duckduckgo_definitions.py
  11. 4 1
      searx/engines/duckduckgo_images.py
  12. 15 16
      searx/engines/google.py
  13. 4 2
      searx/engines/google_news.py
  14. 3 10
      searx/engines/qwant.py
  15. 3 5
      searx/engines/swisscows.py
  16. 3 2
      searx/engines/wikidata.py
  17. 2 7
      searx/engines/wikipedia.py
  18. 12 12
      searx/engines/yahoo.py
  19. 5 2
      searx/engines/yahoo_news.py
  20. 16 24
      searx/languages.py
  21. 0 4
      searx/preferences.py
  22. 7 3
      searx/query.py
  23. 2 2
      searx/templates/oscar/preferences.html
  24. 61 0
      searx/utils.py
  25. 15 7
      searx/webapp.py
  26. 6 1
      tests/unit/engines/test_archlinux.py
  27. 1 0
      tests/unit/engines/test_bing.py
  28. 0 1
      tests/unit/engines/test_bing_images.py
  29. 2 1
      tests/unit/engines/test_bing_news.py
  30. 0 1
      tests/unit/engines/test_bing_videos.py
  31. 2 1
      tests/unit/engines/test_dailymotion.py
  32. 12 6
      tests/unit/engines/test_duckduckgo.py
  33. 1 0
      tests/unit/engines/test_duckduckgo_definitions.py
  34. 0 1
      tests/unit/engines/test_duckduckgo_images.py
  35. 7 0
      tests/unit/engines/test_google.py
  36. 1 0
      tests/unit/engines/test_google_news.py
  37. 1 1
      tests/unit/engines/test_qwant.py
  38. 1 0
      tests/unit/engines/test_swisscows.py
  39. 1 0
      tests/unit/engines/test_wikidata.py
  40. 14 3
      tests/unit/engines/test_yahoo.py
  41. 2 1
      tests/unit/engines/test_yahoo_news.py
  42. 25 0
      tests/unit/test_utils.py
  43. 131 131
      utils/fetch_languages.py

File diff suppressed because it is too large
+ 0 - 0
searx/data/engines_languages.json


+ 20 - 1
searx/engines/__init__.py

@@ -20,13 +20,14 @@ import sys
 import threading
 import threading
 from os.path import realpath, dirname
 from os.path import realpath, dirname
 from io import open
 from io import open
+from babel.localedata import locale_identifiers
 from flask_babel import gettext
 from flask_babel import gettext
 from operator import itemgetter
 from operator import itemgetter
 from json import loads
 from json import loads
 from requests import get
 from requests import get
 from searx import settings
 from searx import settings
 from searx import logger
 from searx import logger
-from searx.utils import load_module
+from searx.utils import load_module, match_language
 
 
 
 
 logger = logger.getChild('engines')
 logger = logger.getChild('engines')
@@ -38,6 +39,8 @@ engines = {}
 categories = {'general': []}
 categories = {'general': []}
 
 
 languages = loads(open(engine_dir + '/../data/engines_languages.json', 'r', encoding='utf-8').read())
 languages = loads(open(engine_dir + '/../data/engines_languages.json', 'r', encoding='utf-8').read())
+babel_langs = [lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0]
+               for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())]
 
 
 engine_shortcuts = {}
 engine_shortcuts = {}
 engine_default_args = {'paging': False,
 engine_default_args = {'paging': False,
@@ -97,6 +100,22 @@ def load_engine(engine_data):
     if engine_data['name'] in languages:
     if engine_data['name'] in languages:
         setattr(engine, 'supported_languages', languages[engine_data['name']])
         setattr(engine, 'supported_languages', languages[engine_data['name']])
 
 
+    # find custom aliases for non standard language codes
+    if hasattr(engine, 'supported_languages'):
+        if hasattr(engine, 'language_aliases'):
+            language_aliases = getattr(engine, 'language_aliases')
+        else:
+            language_aliases = {}
+
+        for engine_lang in getattr(engine, 'supported_languages'):
+            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
+            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
+               iso_lang not in getattr(engine, 'supported_languages'):
+                language_aliases[iso_lang] = engine_lang
+
+        if language_aliases:
+            setattr(engine, 'language_aliases', language_aliases)
+
     # assign language fetching method if auxiliary method exists
     # assign language fetching method if auxiliary method exists
     if hasattr(engine, '_fetch_supported_languages'):
     if hasattr(engine, '_fetch_supported_languages'):
         setattr(engine, 'fetch_supported_languages',
         setattr(engine, 'fetch_supported_languages',

+ 2 - 2
searx/engines/archlinux.py

@@ -99,13 +99,13 @@ supported_languages = dict(lang_urls, **main_langs)
 
 
 # do search-request
 # do search-request
 def request(query, params):
 def request(query, params):
-    # translate the locale (e.g. 'en_US') to language code ('en')
+    # translate the locale (e.g. 'en-US') to language code ('en')
     language = locale_to_lang_code(params['language'])
     language = locale_to_lang_code(params['language'])
 
 
     # if our language is hosted on the main site, we need to add its name
     # if our language is hosted on the main site, we need to add its name
     # to the query in order to narrow the results to that language
     # to the query in order to narrow the results to that language
     if language in main_langs:
     if language in main_langs:
-        query += '(' + main_langs[language] + ')'
+        query += b' (' + main_langs[language] + b')'
 
 
     # prepare the request parameters
     # prepare the request parameters
     query = urlencode({'search': query})
     query = urlencode({'search': query})

+ 4 - 2
searx/engines/bing.py

@@ -16,12 +16,14 @@
 from lxml import html
 from lxml import html
 from searx.engines.xpath import extract_text
 from searx.engines.xpath import extract_text
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['general']
 categories = ['general']
 paging = True
 paging = True
 language_support = True
 language_support = True
 supported_languages_url = 'https://www.bing.com/account/general'
 supported_languages_url = 'https://www.bing.com/account/general'
+language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
 
 
 # search-url
 # search-url
 base_url = 'https://www.bing.com/'
 base_url = 'https://www.bing.com/'
@@ -32,9 +34,9 @@ search_string = 'search?{query}&first={offset}'
 def request(query, params):
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
     offset = (params['pageno'] - 1) * 10 + 1
 
 
-    lang = params['language'].split('-')[0].upper()
+    lang = match_language(params['language'], supported_languages, language_aliases)
 
 
-    query = u'language:{} {}'.format(lang, query.decode('utf-8')).encode('utf-8')
+    query = u'language:{} {}'.format(lang.split('-')[0].upper(), query.decode('utf-8')).encode('utf-8')
 
 
     search_path = search_string.format(
     search_path = search_string.format(
         query=urlencode({'q': query}),
         query=urlencode({'q': query}),

+ 2 - 21
searx/engines/bing_images.py

@@ -19,6 +19,7 @@ from lxml import html
 from json import loads
 from json import loads
 import re
 import re
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['images']
 categories = ['images']
@@ -46,26 +47,6 @@ safesearch_types = {2: 'STRICT',
 _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
 _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
 
 
 
 
-# get supported region code
-def get_region_code(lang, lang_list=None):
-    region = None
-    if lang in (lang_list or supported_languages):
-        region = lang
-    elif lang.startswith('no'):
-        region = 'nb-NO'
-    else:
-        # try to get a supported country code with language
-        lang = lang.split('-')[0]
-        for lc in (lang_list or supported_languages):
-            if lang == lc.split('-')[0]:
-                region = lc
-                break
-    if region:
-        return region.lower()
-    else:
-        return 'en-us'
-
-
 # do search-request
 # do search-request
 def request(query, params):
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
     offset = (params['pageno'] - 1) * 10 + 1
@@ -74,7 +55,7 @@ def request(query, params):
         query=urlencode({'q': query}),
         query=urlencode({'q': query}),
         offset=offset)
         offset=offset)
 
 
-    language = get_region_code(params['language'])
+    language = match_language(params['language'], supported_languages).lower()
 
 
     params['cookies']['SRCHHPGUSR'] = \
     params['cookies']['SRCHHPGUSR'] = \
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')

+ 3 - 3
searx/engines/bing_news.py

@@ -14,8 +14,8 @@
 from datetime import datetime
 from datetime import datetime
 from dateutil import parser
 from dateutil import parser
 from lxml import etree
 from lxml import etree
-from searx.utils import list_get
-from searx.engines.bing import _fetch_supported_languages, supported_languages_url
+from searx.utils import list_get, match_language
+from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
 from searx.url_utils import urlencode, urlparse, parse_qsl
 from searx.url_utils import urlencode, urlparse, parse_qsl
 
 
 # engine dependent config
 # engine dependent config
@@ -71,7 +71,7 @@ def request(query, params):
 
 
     offset = (params['pageno'] - 1) * 10 + 1
     offset = (params['pageno'] - 1) * 10 + 1
 
 
-    language = params['language']
+    language = match_language(params['language'], supported_languages, language_aliases)
 
 
     params['url'] = _get_url(query, language, offset, params['time_range'])
     params['url'] = _get_url(query, language, offset, params['time_range'])
 
 

+ 4 - 3
searx/engines/bing_videos.py

@@ -12,9 +12,10 @@
 
 
 from json import loads
 from json import loads
 from lxml import html
 from lxml import html
-from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code
+from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url
 from searx.engines.xpath import extract_text
 from searx.engines.xpath import extract_text
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 
 
 categories = ['videos']
 categories = ['videos']
@@ -47,8 +48,8 @@ def request(query, params):
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
 
 
     # language cookie
     # language cookie
-    region = get_region_code(params['language'], lang_list=supported_languages)
-    params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1'
+    language = match_language(params['language'], supported_languages).lower()
+    params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1'
 
 
     # query and paging
     # query and paging
     params['url'] = search_url.format(query=urlencode({'q': query}),
     params['url'] = search_url.format(query=urlencode({'q': query}),

+ 2 - 1
searx/engines/dailymotion.py

@@ -15,6 +15,7 @@
 from json import loads
 from json import loads
 from datetime import datetime
 from datetime import datetime
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['videos']
 categories = ['videos']
@@ -32,7 +33,7 @@ supported_languages_url = 'https://api.dailymotion.com/languages'
 
 
 # do search-request
 # do search-request
 def request(query, params):
 def request(query, params):
-    locale = params['language']
+    locale = match_language(params['language'], supported_languages)
 
 
     params['url'] = search_url.format(
     params['url'] = search_url.format(
         query=urlencode({'search': query, 'localization': locale}),
         query=urlencode({'search': query, 'localization': locale}),

+ 19 - 30
searx/engines/duckduckgo.py

@@ -18,14 +18,25 @@ from json import loads
 from searx.engines.xpath import extract_text
 from searx.engines.xpath import extract_text
 from searx.poolrequests import get
 from searx.poolrequests import get
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['general']
 categories = ['general']
 paging = True
 paging = True
 language_support = True
 language_support = True
-supported_languages_url = 'https://duckduckgo.com/d2030.js'
+supported_languages_url = 'https://duckduckgo.com/util/u172.js'
 time_range_support = True
 time_range_support = True
 
 
+language_aliases = {
+    'ar-SA': 'ar-XA',
+    'es-419': 'es-XL',
+    'ja': 'jp-JP',
+    'ko': 'kr-KR',
+    'sl-SI': 'sl-SL',
+    'zh-TW': 'tzh-TW',
+    'zh-HK': 'tzh-HK'
+}
+
 # search-url
 # search-url
 url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}'
 url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}'
 time_range_url = '&df={range}'
 time_range_url = '&df={range}'
@@ -42,34 +53,12 @@ content_xpath = './/a[@class="result__snippet"]'
 
 
 
 
 # match query's language to a region code that duckduckgo will accept
 # match query's language to a region code that duckduckgo will accept
-def get_region_code(lang, lang_list=None):
-    # custom fixes for languages
-    if lang[:2] == 'ja':
-        region_code = 'jp-jp'
-    elif lang[:2] == 'sl':
-        region_code = 'sl-sl'
-    elif lang == 'zh-TW':
-        region_code = 'tw-tzh'
-    elif lang == 'zh-HK':
-        region_code = 'hk-tzh'
-    elif lang[-2:] == 'SA':
-        region_code = 'xa-' + lang.split('-')[0]
-    elif lang[-2:] == 'GB':
-        region_code = 'uk-' + lang.split('-')[0]
-    else:
-        region_code = lang.split('-')
-        if len(region_code) == 2:
-            # country code goes first
-            region_code = region_code[1].lower() + '-' + region_code[0].lower()
-        else:
-            # tries to get a country code from language
-            region_code = region_code[0].lower()
-            for lc in (lang_list or supported_languages):
-                lc = lc.split('-')
-                if region_code == lc[0]:
-                    region_code = lc[1].lower() + '-' + lc[0].lower()
-                    break
-    return region_code
+def get_region_code(lang, lang_list=[]):
+    lang_code = match_language(lang, lang_list, language_aliases, 'wt-WT')
+    lang_parts = lang_code.split('-')
+
+    # country code goes first
+    return lang_parts[1].lower() + '-' + lang_parts[0].lower()
 
 
 
 
 # do search-request
 # do search-request
@@ -79,7 +68,7 @@ def request(query, params):
 
 
     offset = (params['pageno'] - 1) * 30
     offset = (params['pageno'] - 1) * 30
 
 
-    region_code = get_region_code(params['language'])
+    region_code = get_region_code(params['language'], supported_languages)
     params['url'] = url.format(
     params['url'] = url.format(
         query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
         query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
 
 

+ 4 - 3
searx/engines/duckduckgo_definitions.py

@@ -2,9 +2,9 @@ import json
 from lxml import html
 from lxml import html
 from re import compile
 from re import compile
 from searx.engines.xpath import extract_text
 from searx.engines.xpath import extract_text
-from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url
+from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
-from searx.utils import html_to_text
+from searx.utils import html_to_text, match_language
 
 
 url = 'https://api.duckduckgo.com/'\
 url = 'https://api.duckduckgo.com/'\
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
@@ -24,7 +24,8 @@ def result_to_text(url, text, htmlResult):
 
 
 def request(query, params):
 def request(query, params):
     params['url'] = url.format(query=urlencode({'q': query}))
     params['url'] = url.format(query=urlencode({'q': query}))
-    params['headers']['Accept-Language'] = params['language'].split('-')[0]
+    language = match_language(params['language'], supported_languages, language_aliases)
+    params['headers']['Accept-Language'] = language.split('-')[0]
     return params
     return params
 
 
 
 

+ 4 - 1
searx/engines/duckduckgo_images.py

@@ -15,7 +15,10 @@
 
 
 from json import loads
 from json import loads
 from searx.engines.xpath import extract_text
 from searx.engines.xpath import extract_text
-from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, get_region_code
+from searx.engines.duckduckgo import (
+    _fetch_supported_languages, supported_languages_url,
+    get_region_code, language_aliases
+)
 from searx.poolrequests import get
 from searx.poolrequests import get
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
 
 

+ 15 - 16
searx/engines/google.py

@@ -14,6 +14,7 @@ from lxml import html, etree
 from searx.engines.xpath import extract_text, extract_url
 from searx.engines.xpath import extract_text, extract_url
 from searx import logger
 from searx import logger
 from searx.url_utils import urlencode, urlparse, parse_qsl
 from searx.url_utils import urlencode, urlparse, parse_qsl
+from searx.utils import match_language
 
 
 logger = logger.getChild('google engine')
 logger = logger.getChild('google engine')
 
 
@@ -72,7 +73,7 @@ country_to_hostname = {
     'RO': 'www.google.ro',  # Romania
     'RO': 'www.google.ro',  # Romania
     'RU': 'www.google.ru',  # Russia
     'RU': 'www.google.ru',  # Russia
     'SK': 'www.google.sk',  # Slovakia
     'SK': 'www.google.sk',  # Slovakia
-    'SL': 'www.google.si',  # Slovenia (SL -> si)
+    'SI': 'www.google.si',  # Slovenia
     'SE': 'www.google.se',  # Sweden
     'SE': 'www.google.se',  # Sweden
     'TH': 'www.google.co.th',  # Thailand
     'TH': 'www.google.co.th',  # Thailand
     'TR': 'www.google.com.tr',  # Turkey
     'TR': 'www.google.com.tr',  # Turkey
@@ -165,22 +166,20 @@ def extract_text_from_dom(result, xpath):
 def request(query, params):
 def request(query, params):
     offset = (params['pageno'] - 1) * 10
     offset = (params['pageno'] - 1) * 10
 
 
+    language = match_language(params['language'], supported_languages)
+    language_array = language.split('-')
+    if params['language'].find('-') > 0:
+        country = params['language'].split('-')[1]
+    elif len(language_array) == 2:
+        country = language_array[1]
+    else:
+        country = 'US'
+
     # temporary fix until a way of supporting en-US is found
     # temporary fix until a way of supporting en-US is found
-    if params['language'] == 'en-US':
-        params['language'] = 'en-GB'
+    if language == 'en-US':
+        country = 'GB'
 
 
-    if params['language'][:2] == 'jv':
-        language = 'jw'
-        country = 'ID'
-        url_lang = 'lang_jw'
-    else:
-        language_array = params['language'].lower().split('-')
-        if len(language_array) == 2:
-            country = language_array[1]
-        else:
-            country = 'US'
-        language = language_array[0] + ',' + language_array[0] + '-' + country
-        url_lang = 'lang_' + language_array[0]
+    url_lang = 'lang_' + language
 
 
     if use_locale_domain:
     if use_locale_domain:
         google_hostname = country_to_hostname.get(country.upper(), default_hostname)
         google_hostname = country_to_hostname.get(country.upper(), default_hostname)
@@ -196,7 +195,7 @@ def request(query, params):
     if params['time_range'] in time_range_dict:
     if params['time_range'] in time_range_dict:
         params['url'] += time_range_search.format(range=time_range_dict[params['time_range']])
         params['url'] += time_range_search.format(range=time_range_dict[params['time_range']])
 
 
-    params['headers']['Accept-Language'] = language
+    params['headers']['Accept-Language'] = language + ',' + language + '-' + country
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
 
 
     params['google_hostname'] = google_hostname
     params['google_hostname'] = google_hostname

+ 4 - 2
searx/engines/google_news.py

@@ -13,6 +13,7 @@
 from lxml import html
 from lxml import html
 from searx.engines.google import _fetch_supported_languages, supported_languages_url
 from searx.engines.google import _fetch_supported_languages, supported_languages_url
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # search-url
 # search-url
 categories = ['news']
 categories = ['news']
@@ -50,8 +51,9 @@ def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       search_options=urlencode(search_options))
                                       search_options=urlencode(search_options))
 
 
-    language_array = params['language'].lower().split('-')
-    params['url'] += '&lr=lang_' + language_array[0]
+    language = match_language(params['language'], supported_languages).split('-')[0]
+    if language:
+        params['url'] += '&lr=lang_' + language
 
 
     return params
     return params
 
 

+ 3 - 10
searx/engines/qwant.py

@@ -14,6 +14,7 @@ from datetime import datetime
 from json import loads
 from json import loads
 from searx.utils import html_to_text
 from searx.utils import html_to_text
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = None
 categories = None
@@ -45,16 +46,8 @@ def request(query, params):
                                    offset=offset)
                                    offset=offset)
 
 
     # add language tag
     # add language tag
-    if params['language'] == 'no' or params['language'].startswith('no-'):
-        params['language'] = params['language'].replace('no', 'nb', 1)
-    if params['language'].find('-') < 0:
-        # tries to get a country code from language
-        for lang in supported_languages:
-            lc = lang.split('-')
-            if params['language'] == lc[0]:
-                params['language'] = lang
-                break
-    params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
+    language = match_language(params['language'], supported_languages)
+    params['url'] += '&locale=' + language.replace('-', '_').lower()
 
 
     return params
     return params
 
 

+ 3 - 5
searx/engines/swisscows.py

@@ -14,6 +14,7 @@ from json import loads
 import re
 import re
 from lxml.html import fromstring
 from lxml.html import fromstring
 from searx.url_utils import unquote, urlencode
 from searx.url_utils import unquote, urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['general', 'images']
 categories = ['general', 'images']
@@ -35,11 +36,8 @@ regex_img_url_remove_start = re.compile(b'^https?://i\.swisscows\.ch/\?link=')
 
 
 # do search-request
 # do search-request
 def request(query, params):
 def request(query, params):
-    if params['language'].split('-')[0] == 'no':
-        region = 'nb-NO'
-    else:
-        region = params['language']
-        ui_language = params['language'].split('-')[0]
+    region = match_language(params['language'], supported_languages)
+    ui_language = region.split('-')[0]
 
 
     search_path = search_string.format(
     search_path = search_string.format(
         query=urlencode({'query': query, 'uiLanguage': ui_language, 'region': region}),
         query=urlencode({'query': query, 'uiLanguage': ui_language, 'region': region}),

+ 3 - 2
searx/engines/wikidata.py

@@ -16,6 +16,7 @@ from searx.poolrequests import get
 from searx.engines.xpath import extract_text
 from searx.engines.xpath import extract_text
 from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
 from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 from json import loads
 from json import loads
 from lxml.html import fromstring
 from lxml.html import fromstring
@@ -56,7 +57,7 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
 
 
 
 
 def request(query, params):
 def request(query, params):
-    language = params['language'].split('-')[0]
+    language = match_language(params['language'], supported_languages).split('-')[0]
 
 
     params['url'] = url_search.format(
     params['url'] = url_search.format(
         query=urlencode({'label': query, 'language': language}))
         query=urlencode({'label': query, 'language': language}))
@@ -68,7 +69,7 @@ def response(resp):
     html = fromstring(resp.text)
     html = fromstring(resp.text)
     wikidata_ids = html.xpath(wikidata_ids_xpath)
     wikidata_ids = html.xpath(wikidata_ids_xpath)
 
 
-    language = resp.search_params['language'].split('-')[0]
+    language = match_language(resp.search_params['language'], supported_languages).split('-')[0]
 
 
     # TODO: make requests asynchronous to avoid timeout when result_count > 1
     # TODO: make requests asynchronous to avoid timeout when result_count > 1
     for wikidata_id in wikidata_ids[:result_count]:
     for wikidata_id in wikidata_ids[:result_count]:

+ 2 - 7
searx/engines/wikipedia.py

@@ -13,6 +13,7 @@
 from json import loads
 from json import loads
 from lxml.html import fromstring
 from lxml.html import fromstring
 from searx.url_utils import quote, urlencode
 from searx.url_utils import quote, urlencode
+from searx.utils import match_language
 
 
 # search-url
 # search-url
 base_url = u'https://{language}.wikipedia.org/'
 base_url = u'https://{language}.wikipedia.org/'
@@ -30,13 +31,7 @@ supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
 
 
 # set language in base_url
 # set language in base_url
 def url_lang(lang):
 def url_lang(lang):
-    lang = lang.split('-')[0]
-    if lang not in supported_languages:
-        language = 'en'
-    else:
-        language = lang
-
-    return language
+    return match_language(lang, supported_languages).split('-')[0]
 
 
 
 
 # do search-request
 # do search-request

+ 12 - 12
searx/engines/yahoo.py

@@ -14,6 +14,7 @@
 from lxml import html
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url
 from searx.engines.xpath import extract_text, extract_url
 from searx.url_utils import unquote, urlencode
 from searx.url_utils import unquote, urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['general']
 categories = ['general']
@@ -39,6 +40,8 @@ time_range_dict = {'day': ['1d', 'd'],
                    'week': ['1w', 'w'],
                    'week': ['1w', 'w'],
                    'month': ['1m', 'm']}
                    'month': ['1m', 'm']}
 
 
+language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
+
 
 
 # remove yahoo-specific tracking-url
 # remove yahoo-specific tracking-url
 def parse_url(url_string):
 def parse_url(url_string):
@@ -70,23 +73,16 @@ def _get_url(query, offset, language, time_range):
                                         lang=language)
                                         lang=language)
 
 
 
 
-def _get_language(params):
-    if params['language'][:2] == 'zh':
-        if params['language'] == 'zh' or params['language'] == 'zh-CH':
-            return 'szh'
-        else:
-            return 'tzh'
-    else:
-        return params['language'].split('-')[0]
-
-
 # do search-request
 # do search-request
 def request(query, params):
 def request(query, params):
     if params['time_range'] and params['time_range'] not in time_range_dict:
     if params['time_range'] and params['time_range'] not in time_range_dict:
         return params
         return params
 
 
     offset = (params['pageno'] - 1) * 10 + 1
     offset = (params['pageno'] - 1) * 10 + 1
-    language = _get_language(params)
+    language = match_language(params['language'], supported_languages, language_aliases)
+    if language not in language_aliases.values():
+        language = language.split('-')[0]
+    language = language.replace('-', '_').lower()
 
 
     params['url'] = _get_url(query, offset, language, params['time_range'])
     params['url'] = _get_url(query, offset, language, params['time_range'])
 
 
@@ -145,7 +141,11 @@ def _fetch_supported_languages(resp):
     dom = html.fromstring(resp.text)
     dom = html.fromstring(resp.text)
     options = dom.xpath('//div[@id="yschlang"]/span/label/input')
     options = dom.xpath('//div[@id="yschlang"]/span/label/input')
     for option in options:
     for option in options:
-        code = option.xpath('./@value')[0][5:].replace('_', '-')
+        code_parts = option.xpath('./@value')[0][5:].split('_')
+        if len(code_parts) == 2:
+            code = code_parts[0] + '-' + code_parts[1].upper()
+        else:
+            code = code_parts[0]
         supported_languages.append(code)
         supported_languages.append(code)
 
 
     return supported_languages
     return supported_languages

+ 5 - 2
searx/engines/yahoo_news.py

@@ -13,9 +13,12 @@ import re
 from datetime import datetime, timedelta
 from datetime import datetime, timedelta
 from lxml import html
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url
 from searx.engines.xpath import extract_text, extract_url
-from searx.engines.yahoo import parse_url, _fetch_supported_languages, supported_languages_url
+from searx.engines.yahoo import (
+    parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
+)
 from dateutil import parser
 from dateutil import parser
 from searx.url_utils import urlencode
 from searx.url_utils import urlencode
+from searx.utils import match_language
 
 
 # engine dependent config
 # engine dependent config
 categories = ['news']
 categories = ['news']
@@ -38,7 +41,7 @@ suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
 def request(query, params):
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
     offset = (params['pageno'] - 1) * 10 + 1
 
 
-    language = params['language'].split('-')[0]
+    language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
 
 
     params['url'] = search_url.format(offset=offset,
     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'p': query}),
                                       query=urlencode({'p': query}),

+ 16 - 24
searx/languages.py

@@ -5,11 +5,7 @@
 language_codes = (
 language_codes = (
     (u"ar-SA", u"العربية", u"", u"Arabic"),
     (u"ar-SA", u"العربية", u"", u"Arabic"),
     (u"bg-BG", u"Български", u"", u"Bulgarian"),
     (u"bg-BG", u"Български", u"", u"Bulgarian"),
-    (u"ca", u"Català", u"", u"Catalan"),
-    (u"ca-AD", u"Català", u"Andorra", u"Catalan"),
-    (u"ca-CT", u"Català", u"", u"Catalan"),
-    (u"ca-ES", u"Català", u"Espanya", u"Catalan"),
-    (u"ca-FR", u"Català", u"França", u"Catalan"),
+    (u"ca-ES", u"Català", u"", u"Catalan"),
     (u"cs-CZ", u"Čeština", u"", u"Czech"),
     (u"cs-CZ", u"Čeština", u"", u"Czech"),
     (u"da-DK", u"Dansk", u"", u"Danish"),
     (u"da-DK", u"Dansk", u"", u"Danish"),
     (u"de", u"Deutsch", u"", u"German"),
     (u"de", u"Deutsch", u"", u"German"),
@@ -21,55 +17,51 @@ language_codes = (
     (u"en-AU", u"English", u"Australia", u"English"),
     (u"en-AU", u"English", u"Australia", u"English"),
     (u"en-CA", u"English", u"Canada", u"English"),
     (u"en-CA", u"English", u"Canada", u"English"),
     (u"en-GB", u"English", u"United Kingdom", u"English"),
     (u"en-GB", u"English", u"United Kingdom", u"English"),
-    (u"en-ID", u"English", u"Indonesia", u"English"),
-    (u"en-IE", u"English", u"Ireland", u"English"),
     (u"en-IN", u"English", u"India", u"English"),
     (u"en-IN", u"English", u"India", u"English"),
     (u"en-MY", u"English", u"Malaysia", u"English"),
     (u"en-MY", u"English", u"Malaysia", u"English"),
-    (u"en-NZ", u"English", u"New Zealand", u"English"),
-    (u"en-PH", u"English", u"Philippines", u"English"),
-    (u"en-SG", u"English", u"Singapore", u"English"),
     (u"en-US", u"English", u"United States", u"English"),
     (u"en-US", u"English", u"United States", u"English"),
-    (u"en-ZA", u"English", u"South Africa", u"English"),
     (u"es", u"Español", u"", u"Spanish"),
     (u"es", u"Español", u"", u"Spanish"),
-    (u"es-AD", u"Español", u"Andorra", u"Spanish"),
     (u"es-AR", u"Español", u"Argentina", u"Spanish"),
     (u"es-AR", u"Español", u"Argentina", u"Spanish"),
-    (u"es-CL", u"Español", u"Chile", u"Spanish"),
-    (u"es-CO", u"Español", u"Colombia", u"Spanish"),
     (u"es-ES", u"Español", u"España", u"Spanish"),
     (u"es-ES", u"Español", u"España", u"Spanish"),
     (u"es-MX", u"Español", u"México", u"Spanish"),
     (u"es-MX", u"Español", u"México", u"Spanish"),
-    (u"es-PE", u"Español", u"Perú", u"Spanish"),
-    (u"es-US", u"Español", u"Estados Unidos", u"Spanish"),
     (u"et-EE", u"Eesti", u"", u"Estonian"),
     (u"et-EE", u"Eesti", u"", u"Estonian"),
+    (u"fa-IR", u"فارسی", u"", u"Persian"),
     (u"fi-FI", u"Suomi", u"", u"Finnish"),
     (u"fi-FI", u"Suomi", u"", u"Finnish"),
     (u"fr", u"Français", u"", u"French"),
     (u"fr", u"Français", u"", u"French"),
-    (u"fr-AD", u"Français", u"Andorre", u"French"),
     (u"fr-BE", u"Français", u"Belgique", u"French"),
     (u"fr-BE", u"Français", u"Belgique", u"French"),
     (u"fr-CA", u"Français", u"Canada", u"French"),
     (u"fr-CA", u"Français", u"Canada", u"French"),
     (u"fr-CH", u"Français", u"Suisse", u"French"),
     (u"fr-CH", u"Français", u"Suisse", u"French"),
     (u"fr-FR", u"Français", u"France", u"French"),
     (u"fr-FR", u"Français", u"France", u"French"),
     (u"he-IL", u"עברית", u"", u"Hebrew"),
     (u"he-IL", u"עברית", u"", u"Hebrew"),
+    (u"hr-HR", u"Hrvatski", u"", u"Croatian"),
     (u"hu-HU", u"Magyar", u"", u"Hungarian"),
     (u"hu-HU", u"Magyar", u"", u"Hungarian"),
-    (u"it", u"Italiano", u"", u"Italian"),
-    (u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
-    (u"it-IT", u"Italiano", u"Italia", u"Italian"),
+    (u"id-ID", u"Indonesia", u"", u"Indonesian"),
+    (u"is-IS", u"Íslenska", u"", u"Icelandic"),
+    (u"it-IT", u"Italiano", u"", u"Italian"),
     (u"ja-JP", u"日本語", u"", u"Japanese"),
     (u"ja-JP", u"日本語", u"", u"Japanese"),
     (u"ko-KR", u"한국어", u"", u"Korean"),
     (u"ko-KR", u"한국어", u"", u"Korean"),
+    (u"lt-LT", u"Lietuvių", u"", u"Lithuanian"),
+    (u"lv-LV", u"Latviešu", u"", u"Latvian"),
+    (u"ms-MY", u"Bahasa Melayu", u"", u"Malay"),
+    (u"nb-NO", u"Norsk Bokmål", u"", u"Norwegian Bokmål"),
     (u"nl", u"Nederlands", u"", u"Dutch"),
     (u"nl", u"Nederlands", u"", u"Dutch"),
     (u"nl-BE", u"Nederlands", u"België", u"Dutch"),
     (u"nl-BE", u"Nederlands", u"België", u"Dutch"),
     (u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
     (u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
-    (u"no-NO", u"Norsk", u"", u"Norwegian"),
     (u"pl-PL", u"Polski", u"", u"Polish"),
     (u"pl-PL", u"Polski", u"", u"Polish"),
     (u"pt", u"Português", u"", u"Portuguese"),
     (u"pt", u"Português", u"", u"Portuguese"),
-    (u"pt-AD", u"Português", u"Andorra", u"Portuguese"),
     (u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
     (u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
     (u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
     (u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
     (u"ro-RO", u"Română", u"", u"Romanian"),
     (u"ro-RO", u"Română", u"", u"Romanian"),
     (u"ru-RU", u"Русский", u"", u"Russian"),
     (u"ru-RU", u"Русский", u"", u"Russian"),
+    (u"sk-SK", u"Slovenčina", u"", u"Slovak"),
+    (u"sl-SI", u"Slovenščina", u"", u"Slovenian"),
+    (u"sr-RS", u"Српски", u"", u"Serbian"),
     (u"sv-SE", u"Svenska", u"", u"Swedish"),
     (u"sv-SE", u"Svenska", u"", u"Swedish"),
     (u"th-TH", u"ไทย", u"", u"Thai"),
     (u"th-TH", u"ไทย", u"", u"Thai"),
     (u"tr-TR", u"Türkçe", u"", u"Turkish"),
     (u"tr-TR", u"Türkçe", u"", u"Turkish"),
+    (u"uk-UA", u"Українська", u"", u"Ukrainian"),
+    (u"vi-VN", u"Tiếng Việt", u"", u"Vietnamese"),
     (u"zh", u"中文", u"", u"Chinese"),
     (u"zh", u"中文", u"", u"Chinese"),
     (u"zh-CN", u"中文", u"中国", u"Chinese"),
     (u"zh-CN", u"中文", u"中国", u"Chinese"),
-    (u"zh-HK", u"中文", u"香港", u"Chinese"),
-    (u"zh-TW", u"中文", u"台湾", u"Chinese")
+    (u"zh-TW", u"中文", u"台灣", u"Chinese")
 )
 )

+ 0 - 4
searx/preferences.py

@@ -115,10 +115,6 @@ class SearchLanguageSetting(EnumStringSetting):
                 pass
                 pass
             elif lang in self.choices:
             elif lang in self.choices:
                 data = lang
                 data = lang
-            elif data == 'nb-NO':
-                data = 'no-NO'
-            elif data == 'ar-XA':
-                data = 'ar-SA'
             else:
             else:
                 data = self.value
                 data = self.value
         self.value = data
         self.value = data

+ 7 - 3
searx/query.py

@@ -96,9 +96,13 @@ class RawTextQuery(object):
                                 break
                                 break
 
 
                 # user may set a valid, yet not selectable language
                 # user may set a valid, yet not selectable language
-                if not self.languages and VALID_LANGUAGE_CODE.match(lang):
-                    self.languages.append(lang)
-                    parse_next = True
+                if VALID_LANGUAGE_CODE.match(lang):
+                    lang_parts = lang.split('-')
+                    if len(lang_parts) > 1:
+                        lang = lang_parts[0].lower() + '-' + lang_parts[1].upper()
+                    if lang not in self.languages:
+                        self.languages.append(lang)
+                        parse_next = True
 
 
             # this force a engine or category
             # this force a engine or category
             if query_part[0] == '!' or query_part[0] == '?':
             if query_part[0] == '!' or query_part[0] == '?':

+ 2 - 2
searx/templates/oscar/preferences.html

@@ -187,7 +187,7 @@
                                     </td>
                                     </td>
                                     <th>{{ search_engine.name }}</th>
                                     <th>{{ search_engine.name }}</th>
 				    <td class="name">{{ shortcuts[search_engine.name] }}</td>
 				    <td class="name">{{ shortcuts[search_engine.name] }}</td>
-					<td>{{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }}</td>
+					<td>{{ support_toggle(stats[search_engine.name].supports_selected_language) }}</td>
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
@@ -197,7 +197,7 @@
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
-					<td>{{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }}</td>
+					<td>{{ support_toggle(stats[search_engine.name].supports_selected_language) }}</td>
 					<td>{{ shortcuts[search_engine.name] }}</td>
 					<td>{{ shortcuts[search_engine.name] }}</td>
                                     <th>{{ search_engine.name }}</th>
                                     <th>{{ search_engine.name }}</th>
                                     <td class="onoff-checkbox">
                                     <td class="onoff-checkbox">

+ 61 - 0
searx/utils.py

@@ -4,6 +4,7 @@ import hmac
 import os
 import os
 import re
 import re
 
 
+from babel.core import get_global
 from babel.dates import format_date
 from babel.dates import format_date
 from codecs import getincrementalencoder
 from codecs import getincrementalencoder
 from imp import load_source
 from imp import load_source
@@ -12,6 +13,7 @@ from os.path import splitext, join
 from random import choice
 from random import choice
 import sys
 import sys
 
 
+from searx import settings
 from searx.version import VERSION_STRING
 from searx.version import VERSION_STRING
 from searx.languages import language_codes
 from searx.languages import language_codes
 from searx import settings
 from searx import settings
@@ -322,6 +324,65 @@ def is_valid_lang(lang):
         return False
         return False
 
 
 
 
+# auxiliary function to match lang_code in lang_list
+def _match_language(lang_code, lang_list=[], custom_aliases={}):
+    # replace language code with a custom alias if necessary
+    if lang_code in custom_aliases:
+        lang_code = custom_aliases[lang_code]
+
+    if lang_code in lang_list:
+        return lang_code
+
+    # try to get the most likely country for this language
+    subtags = get_global('likely_subtags').get(lang_code)
+    if subtags:
+        subtag_parts = subtags.split('_')
+        new_code = subtag_parts[0] + '-' + subtag_parts[-1]
+        if new_code in custom_aliases:
+            new_code = custom_aliases[new_code]
+        if new_code in lang_list:
+            return new_code
+
+    # try to get the any supported country for this language
+    for lc in lang_list:
+        if lang_code == lc.split('-')[0]:
+            return lc
+
+    return None
+
+
+# get the language code from lang_list that best matches locale_code
+def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):
+    # try to get language from given locale_code
+    language = _match_language(locale_code, lang_list, custom_aliases)
+    if language:
+        return language
+
+    locale_parts = locale_code.split('-')
+    lang_code = locale_parts[0]
+
+    # try to get language using an equivalent country code
+    if len(locale_parts) > 1:
+        country_alias = get_global('territory_aliases').get(locale_parts[-1])
+        if country_alias:
+            language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
+            if language:
+                return language
+
+    # try to get language using an equivalent language code
+    alias = get_global('language_aliases').get(lang_code)
+    if alias:
+        language = _match_language(alias, lang_list, custom_aliases)
+        if language:
+            return language
+
+    if lang_code != locale_code:
+        # try to get language from given language without giving the country
+        language = _match_language(lang_code, lang_list, custom_aliases)
+
+    return language or fallback
+
+
 def load_module(filename, module_dir):
 def load_module(filename, module_dir):
     modname = splitext(filename)[0]
     modname = splitext(filename)[0]
     if modname in sys.modules:
     if modname in sys.modules:

+ 15 - 7
searx/webapp.py

@@ -58,16 +58,16 @@ from searx.engines import (
 from searx.utils import (
 from searx.utils import (
     UnicodeWriter, highlight_content, html_to_text, get_resources_directory,
     UnicodeWriter, highlight_content, html_to_text, get_resources_directory,
     get_static_files, get_result_templates, get_themes, gen_useragent,
     get_static_files, get_result_templates, get_themes, gen_useragent,
-    dict_subset, prettify_url
+    dict_subset, prettify_url, match_language
 )
 )
 from searx.version import VERSION_STRING
 from searx.version import VERSION_STRING
-from searx.languages import language_codes
+from searx.languages import language_codes as languages
 from searx.search import SearchWithPlugins, get_search_query_from_webapp
 from searx.search import SearchWithPlugins, get_search_query_from_webapp
 from searx.query import RawTextQuery
 from searx.query import RawTextQuery
 from searx.autocomplete import searx_bang, backends as autocomplete_backends
 from searx.autocomplete import searx_bang, backends as autocomplete_backends
 from searx.plugins import plugins
 from searx.plugins import plugins
 from searx.plugins.oa_doi_rewrite import get_doi_resolver
 from searx.plugins.oa_doi_rewrite import get_doi_resolver
-from searx.preferences import Preferences, ValidationException
+from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
 from searx.answerers import answerers
 from searx.answerers import answerers
 from searx.url_utils import urlencode, urlparse, urljoin
 from searx.url_utils import urlencode, urlparse, urljoin
 from searx.utils import new_hmac
 from searx.utils import new_hmac
@@ -133,7 +133,7 @@ if not searx_debug \
 babel = Babel(app)
 babel = Babel(app)
 
 
 rtl_locales = ['ar', 'arc', 'bcc', 'bqi', 'ckb', 'dv', 'fa', 'glk', 'he',
 rtl_locales = ['ar', 'arc', 'bcc', 'bqi', 'ckb', 'dv', 'fa', 'glk', 'he',
-               'ku', 'mzn', 'pnb'', ''ps', 'sd', 'ug', 'ur', 'yi']
+               'ku', 'mzn', 'pnb', 'ps', 'sd', 'ug', 'ur', 'yi']
 
 
 # used when translating category names
 # used when translating category names
 _category_names = (gettext('files'),
 _category_names = (gettext('files'),
@@ -352,9 +352,11 @@ def render(template_name, override_theme=None, **kwargs):
 
 
     kwargs['safesearch'] = str(request.preferences.get_value('safesearch'))
     kwargs['safesearch'] = str(request.preferences.get_value('safesearch'))
 
 
-    kwargs['language_codes'] = language_codes
+    kwargs['language_codes'] = languages
     if 'current_language' not in kwargs:
     if 'current_language' not in kwargs:
-        kwargs['current_language'] = request.preferences.get_value('language')
+        kwargs['current_language'] = match_language(request.preferences.get_value('language'),
+                                                    LANGUAGE_CODES,
+                                                    fallback=settings['search']['language'])
 
 
     # override url_for function in templates
     # override url_for function in templates
     kwargs['url_for'] = url_for_theme
     kwargs['url_for'] = url_for_theme
@@ -590,7 +592,9 @@ def index():
         infoboxes=result_container.infoboxes,
         infoboxes=result_container.infoboxes,
         paging=result_container.paging,
         paging=result_container.paging,
         unresponsive_engines=result_container.unresponsive_engines,
         unresponsive_engines=result_container.unresponsive_engines,
-        current_language=search_query.lang,
+        current_language=match_language(search_query.lang,
+                                        LANGUAGE_CODES,
+                                        fallback=settings['search']['language']),
         base_url=get_base_url(),
         base_url=get_base_url(),
         theme=get_current_theme_name(),
         theme=get_current_theme_name(),
         favicons=global_favicons[themes.index(get_current_theme_name())]
         favicons=global_favicons[themes.index(get_current_theme_name())]
@@ -687,6 +691,10 @@ def preferences():
                              'warn_time': False}
                              'warn_time': False}
             if e.timeout > settings['outgoing']['request_timeout']:
             if e.timeout > settings['outgoing']['request_timeout']:
                 stats[e.name]['warn_timeout'] = True
                 stats[e.name]['warn_timeout'] = True
+            if match_language(request.preferences.get_value('language'),
+                              getattr(e, 'supported_languages', []),
+                              getattr(e, 'language_aliases', {}), None):
+                stats[e.name]['supports_selected_language'] = True
 
 
     # get first element [0], the engine time,
     # get first element [0], the engine time,
     # and then the second element [1] : the time (the first one is the label)
     # and then the second element [1] : the time (the first one is the label)

+ 6 - 1
tests/unit/engines/test_archlinux.py

@@ -19,12 +19,17 @@ class TestArchLinuxEngine(SearxTestCase):
         query = 'test_query'
         query = 'test_query'
         dic = defaultdict(dict)
         dic = defaultdict(dict)
         dic['pageno'] = 1
         dic['pageno'] = 1
-        dic['language'] = 'en_US'
+        dic['language'] = 'en-US'
         params = archlinux.request(query, dic)
         params = archlinux.request(query, dic)
         self.assertTrue('url' in params)
         self.assertTrue('url' in params)
         self.assertTrue(query in params['url'])
         self.assertTrue(query in params['url'])
         self.assertTrue('wiki.archlinux.org' in params['url'])
         self.assertTrue('wiki.archlinux.org' in params['url'])
 
 
+        for lang, name in archlinux.main_langs:
+            dic['language'] = lang
+            params = archlinux.request(query, dic)
+            self.assertTrue(name in params['url'])
+
         for lang, domain in domains.items():
         for lang, domain in domains.items():
             dic['language'] = lang
             dic['language'] = lang
             params = archlinux.request(query, dic)
             params = archlinux.request(query, dic)

+ 1 - 0
tests/unit/engines/test_bing.py

@@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
 class TestBingEngine(SearxTestCase):
 class TestBingEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        bing.supported_languages = ['en', 'fr', 'zh-CHS', 'zh-CHT', 'pt-PT', 'pt-BR']
         query = u'test_query'
         query = u'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 0
         dicto['pageno'] = 0

+ 0 - 1
tests/unit/engines/test_bing_images.py

@@ -9,7 +9,6 @@ class TestBingImagesEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
         bing_images.supported_languages = ['fr-FR', 'en-US']
         bing_images.supported_languages = ['fr-FR', 'en-US']
-
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1

+ 2 - 1
tests/unit/engines/test_bing_news.py

@@ -8,10 +8,11 @@ import lxml
 class TestBingNewsEngine(SearxTestCase):
 class TestBingNewsEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        bing_news.supported_languages = ['en', 'fr']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         dicto['time_range'] = ''
         dicto['time_range'] = ''
         params = bing_news.request(query, dicto)
         params = bing_news.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn('url', params)

+ 0 - 1
tests/unit/engines/test_bing_videos.py

@@ -9,7 +9,6 @@ class TestBingVideosEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
         bing_videos.supported_languages = ['fr-FR', 'en-US']
         bing_videos.supported_languages = ['fr-FR', 'en-US']
-
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1

+ 2 - 1
tests/unit/engines/test_dailymotion.py

@@ -8,10 +8,11 @@ from searx.testing import SearxTestCase
 class TestDailymotionEngine(SearxTestCase):
 class TestDailymotionEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        dailymotion.supported_languages = ['en', 'fr']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 0
         dicto['pageno'] = 0
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         params = dailymotion.request(query, dicto)
         params = dailymotion.request(query, dicto)
         self.assertTrue('url' in params)
         self.assertTrue('url' in params)
         self.assertTrue(query in params['url'])
         self.assertTrue(query in params['url'])

+ 12 - 6
tests/unit/engines/test_duckduckgo.py

@@ -1,18 +1,21 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 from collections import defaultdict
 from collections import defaultdict
 import mock
 import mock
-from searx.engines import duckduckgo
+from searx.engines import load_engine, duckduckgo
 from searx.testing import SearxTestCase
 from searx.testing import SearxTestCase
 
 
 
 
 class TestDuckduckgoEngine(SearxTestCase):
 class TestDuckduckgoEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        duckduckgo = load_engine({'engine': 'duckduckgo', 'name': 'duckduckgo'})
+
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
-        dicto['language'] = 'de-CH'
         dicto['time_range'] = ''
         dicto['time_range'] = ''
+
+        dicto['language'] = 'de-CH'
         params = duckduckgo.request(query, dicto)
         params = duckduckgo.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn('url', params)
         self.assertIn(query, params['url'])
         self.assertIn(query, params['url'])
@@ -20,16 +23,19 @@ class TestDuckduckgoEngine(SearxTestCase):
         self.assertIn('ch-de', params['url'])
         self.assertIn('ch-de', params['url'])
         self.assertIn('s=0', params['url'])
         self.assertIn('s=0', params['url'])
 
 
-        # when ddg uses non standard code
+        # when ddg uses non standard codes
+        dicto['language'] = 'zh-HK'
+        params = duckduckgo.request(query, dicto)
+        self.assertIn('hk-tzh', params['url'])
+
         dicto['language'] = 'en-GB'
         dicto['language'] = 'en-GB'
         params = duckduckgo.request(query, dicto)
         params = duckduckgo.request(query, dicto)
         self.assertIn('uk-en', params['url'])
         self.assertIn('uk-en', params['url'])
 
 
         # no country given
         # no country given
-        duckduckgo.supported_languages = ['de-CH', 'en-US']
-        dicto['language'] = 'de'
+        dicto['language'] = 'en'
         params = duckduckgo.request(query, dicto)
         params = duckduckgo.request(query, dicto)
-        self.assertIn('ch-de', params['url'])
+        self.assertIn('us-en', params['url'])
 
 
     def test_no_url_in_request_year_time_range(self):
     def test_no_url_in_request_year_time_range(self):
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)

+ 1 - 0
tests/unit/engines/test_duckduckgo_definitions.py

@@ -18,6 +18,7 @@ class TestDDGDefinitionsEngine(SearxTestCase):
         self.assertEqual(result, 'Text in link')
         self.assertEqual(result, 'Text in link')
 
 
     def test_request(self):
     def test_request(self):
+        duckduckgo_definitions.supported_languages = ['en-US', 'es-ES']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1

+ 0 - 1
tests/unit/engines/test_duckduckgo_images.py

@@ -9,7 +9,6 @@ class TestDuckduckgoImagesEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
         duckduckgo_images.supported_languages = ['de-CH', 'en-US']
         duckduckgo_images.supported_languages = ['de-CH', 'en-US']
-
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['is_test'] = True
         dicto['is_test'] = True

+ 7 - 0
tests/unit/engines/test_google.py

@@ -15,6 +15,8 @@ class TestGoogleEngine(SearxTestCase):
         return response
         return response
 
 
     def test_request(self):
     def test_request(self):
+        google.supported_languages = ['en', 'fr', 'zh-CN']
+
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
@@ -31,6 +33,11 @@ class TestGoogleEngine(SearxTestCase):
         self.assertIn('google.co', params['url'])
         self.assertIn('google.co', params['url'])
         self.assertIn('en', params['headers']['Accept-Language'])
         self.assertIn('en', params['headers']['Accept-Language'])
 
 
+        dicto['language'] = 'zh'
+        params = google.request(query, dicto)
+        self.assertIn('google.com', params['url'])
+        self.assertIn('zh-CN', params['headers']['Accept-Language'])
+
     def test_response(self):
     def test_response(self):
         self.assertRaises(AttributeError, google.response, None)
         self.assertRaises(AttributeError, google.response, None)
         self.assertRaises(AttributeError, google.response, [])
         self.assertRaises(AttributeError, google.response, [])

+ 1 - 0
tests/unit/engines/test_google_news.py

@@ -9,6 +9,7 @@ from searx.testing import SearxTestCase
 class TestGoogleNewsEngine(SearxTestCase):
 class TestGoogleNewsEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        google_news.supported_languages = ['en-US', 'fr-FR']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1

+ 1 - 1
tests/unit/engines/test_qwant.py

@@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
 class TestQwantEngine(SearxTestCase):
 class TestQwantEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        qwant.supported_languages = ['en-US', 'fr-CA', 'fr-FR']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 0
         dicto['pageno'] = 0
@@ -26,7 +27,6 @@ class TestQwantEngine(SearxTestCase):
         self.assertIn('en_us', params['url'])
         self.assertIn('en_us', params['url'])
         self.assertIn('news', params['url'])
         self.assertIn('news', params['url'])
 
 
-        qwant.supported_languages = ['en', 'fr-FR', 'fr-CA']
         dicto['language'] = 'fr'
         dicto['language'] = 'fr'
         params = qwant.request(query, dicto)
         params = qwant.request(query, dicto)
         self.assertIn('fr_fr', params['url'])
         self.assertIn('fr_fr', params['url'])

+ 1 - 0
tests/unit/engines/test_swisscows.py

@@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
 class TestSwisscowsEngine(SearxTestCase):
 class TestSwisscowsEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        swisscows.supported_languages = ['de-AT', 'de-DE']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1

+ 1 - 0
tests/unit/engines/test_wikidata.py

@@ -9,6 +9,7 @@ from searx.testing import SearxTestCase
 class TestWikidataEngine(SearxTestCase):
 class TestWikidataEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        wikidata.supported_languages = ['en', 'es']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['language'] = 'en-US'
         dicto['language'] = 'en-US'

+ 14 - 3
tests/unit/engines/test_yahoo.py

@@ -25,11 +25,12 @@ class TestYahooEngine(SearxTestCase):
         self.assertEqual('https://this.is.the.url/', url)
         self.assertEqual('https://this.is.the.url/', url)
 
 
     def test_request(self):
     def test_request(self):
+        yahoo.supported_languages = ['en', 'fr', 'zh-CHT', 'zh-CHS']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
         dicto['time_range'] = ''
         dicto['time_range'] = ''
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         params = yahoo.request(query, dicto)
         params = yahoo.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn('url', params)
         self.assertIn(query, params['url'])
         self.assertIn(query, params['url'])
@@ -39,6 +40,16 @@ class TestYahooEngine(SearxTestCase):
         self.assertIn('sB', params['cookies'])
         self.assertIn('sB', params['cookies'])
         self.assertIn('fr', params['cookies']['sB'])
         self.assertIn('fr', params['cookies']['sB'])
 
 
+        dicto['language'] = 'zh'
+        params = yahoo.request(query, dicto)
+        self.assertIn('zh_chs', params['url'])
+        self.assertIn('zh_chs', params['cookies']['sB'])
+
+        dicto['language'] = 'zh-TW'
+        params = yahoo.request(query, dicto)
+        self.assertIn('zh_cht', params['url'])
+        self.assertIn('zh_cht', params['cookies']['sB'])
+
     def test_no_url_in_request_year_time_range(self):
     def test_no_url_in_request_year_time_range(self):
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         query = 'test_query'
         query = 'test_query'
@@ -168,5 +179,5 @@ class TestYahooEngine(SearxTestCase):
         self.assertEqual(type(languages), list)
         self.assertEqual(type(languages), list)
         self.assertEqual(len(languages), 3)
         self.assertEqual(len(languages), 3)
         self.assertIn('ar', languages)
         self.assertIn('ar', languages)
-        self.assertIn('zh-chs', languages)
-        self.assertIn('zh-cht', languages)
+        self.assertIn('zh-CHS', languages)
+        self.assertIn('zh-CHT', languages)

+ 2 - 1
tests/unit/engines/test_yahoo_news.py

@@ -9,10 +9,11 @@ from searx.testing import SearxTestCase
 class TestYahooNewsEngine(SearxTestCase):
 class TestYahooNewsEngine(SearxTestCase):
 
 
     def test_request(self):
     def test_request(self):
+        yahoo_news.supported_languages = ['en', 'fr']
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         params = yahoo_news.request(query, dicto)
         params = yahoo_news.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn('url', params)
         self.assertIn(query, params['url'])
         self.assertIn(query, params['url'])

+ 25 - 0
tests/unit/test_utils.py

@@ -65,6 +65,31 @@ class TestUtils(SearxTestCase):
         for test_url, expected in data:
         for test_url, expected in data:
             self.assertEqual(utils.prettify_url(test_url, max_length=32), expected)
             self.assertEqual(utils.prettify_url(test_url, max_length=32), expected)
 
 
+    def test_match_language(self):
+        self.assertEqual(utils.match_language('es', ['es']), 'es')
+        self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback')
+        self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp')
+
+        aliases = {'en-GB': 'en-UK', 'he': 'iw'}
+
+        # guess country
+        self.assertEqual(utils.match_language('de-DE', ['de']), 'de')
+        self.assertEqual(utils.match_language('de', ['de-DE']), 'de-DE')
+        self.assertEqual(utils.match_language('es-CO', ['es-AR', 'es-ES', 'es-MX']), 'es-ES')
+        self.assertEqual(utils.match_language('es-CO', ['es-MX']), 'es-MX')
+        self.assertEqual(utils.match_language('en-UK', ['en-AU', 'en-GB', 'en-US']), 'en-GB')
+        self.assertEqual(utils.match_language('en-GB', ['en-AU', 'en-UK', 'en-US'], aliases), 'en-UK')
+
+        # language aliases
+        self.assertEqual(utils.match_language('iw', ['he']), 'he')
+        self.assertEqual(utils.match_language('he', ['iw'], aliases), 'iw')
+        self.assertEqual(utils.match_language('iw-IL', ['he']), 'he')
+        self.assertEqual(utils.match_language('he-IL', ['iw'], aliases), 'iw')
+        self.assertEqual(utils.match_language('iw', ['he-IL']), 'he-IL')
+        self.assertEqual(utils.match_language('he', ['iw-IL'], aliases), 'iw-IL')
+        self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL')
+        self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL')
+
 
 
 class TestHTMLTextExtractor(SearxTestCase):
 class TestHTMLTextExtractor(SearxTestCase):
 
 

+ 131 - 131
utils/fetch_languages.py

@@ -2,83 +2,27 @@
 
 
 # This script generates languages.py from intersecting each engine's supported languages.
 # This script generates languages.py from intersecting each engine's supported languages.
 #
 #
-# The country names are obtained from http://api.geonames.org which requires registering as a user.
-#
 # Output files (engines_languages.json and languages.py)
 # Output files (engines_languages.json and languages.py)
 # are written in current directory to avoid overwriting in case something goes wrong.
 # are written in current directory to avoid overwriting in case something goes wrong.
 
 
-from requests import get
-from lxml.html import fromstring
-from json import loads, dump
+from json import dump
 import io
 import io
 from sys import path
 from sys import path
+from babel import Locale, UnknownLocaleError
+from babel.languages import get_global
+
 path.append('../searx')  # noqa
 path.append('../searx')  # noqa
 from searx import settings
 from searx import settings
-from searx.url_utils import urlencode
 from searx.engines import initialize_engines, engines
 from searx.engines import initialize_engines, engines
 
 
-# Geonames API for country names.
-geonames_user = ''  # ADD USER NAME HERE
-country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
-
 # Output files.
 # Output files.
 engines_languages_file = 'engines_languages.json'
 engines_languages_file = 'engines_languages.json'
 languages_file = 'languages.py'
 languages_file = 'languages.py'
 
 
-engines_languages = {}
-
-
-# To filter out invalid codes and dialects.
-def valid_code(lang_code):
-    # filter invalid codes
-    # sl-SL is technically not invalid, but still a mistake
-    invalid_codes = ['sl-SL', 'wt-WT', 'jw']
-    invalid_countries = ['UK', 'XA', 'XL']
-    if lang_code[:2] == 'xx'\
-       or lang_code in invalid_codes\
-       or lang_code[-2:] in invalid_countries\
-       or is_dialect(lang_code):
-        return False
-
-    return True
-
-
-# Language codes with any additional tags other than language and country.
-def is_dialect(lang_code):
-    lang_code = lang_code.split('-')
-    if len(lang_code) > 2 or len(lang_code[0]) > 3:
-        return True
-    if len(lang_code) == 2 and len(lang_code[1]) > 2:
-        return True
-
-    return False
-
-
-# Get country name in specified language.
-def get_country_name(locale):
-    if geonames_user is '':
-        return ''
-
-    locale = locale.split('-')
-    if len(locale) != 2:
-        return ''
-
-    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
-                                                         'country': locale[1],
-                                                         'username': geonames_user}))
-    response = get(url)
-    json = loads(response.text)
-    content = json.get('geonames', None)
-    if content is None or len(content) != 1:
-        print("No country name found for " + locale[0] + "-" + locale[1])
-        return ''
-
-    return content[0].get('countryName', '')
-
 
 
 # Fetches supported languages for each engine and writes json file with those.
 # Fetches supported languages for each engine and writes json file with those.
 def fetch_supported_languages():
 def fetch_supported_languages():
-    initialize_engines(settings['engines'])
+    engines_languages = {}
     for engine_name in engines:
     for engine_name in engines:
         if hasattr(engines[engine_name], 'fetch_supported_languages'):
         if hasattr(engines[engine_name], 'fetch_supported_languages'):
             try:
             try:
@@ -90,81 +34,135 @@ def fetch_supported_languages():
     with io.open(engines_languages_file, "w", encoding="utf-8") as f:
     with io.open(engines_languages_file, "w", encoding="utf-8") as f:
         dump(engines_languages, f, ensure_ascii=False)
         dump(engines_languages, f, ensure_ascii=False)
 
 
+    return engines_languages
+
+
+# Get babel Locale object from lang_code if possible.
+def get_locale(lang_code):
+    try:
+        locale = Locale.parse(lang_code, sep='-')
+        return locale
+    except (UnknownLocaleError, ValueError):
+        return None
+
+
+# Append engine_name to list of engines that support locale.
+def add_engine_counter(lang_code, engine_name, languages):
+    if lang_code in languages:
+        if 'counter' not in languages[lang_code]:
+            languages[lang_code]['counter'] = [engine_name]
+        elif engine_name not in languages[lang_code]['counter']:
+            languages[lang_code]['counter'].append(engine_name)
 
 
-# Join all language lists.
-# Iterate all languages supported by each engine.
-def join_language_lists():
-    global languages
-    # include wikipedia first for more accurate language names
-    languages = {code: lang for code, lang
-                 in engines_languages['wikipedia'].items()
-                 if valid_code(code)}
 
 
+# Join all language lists.
+# TODO: Add language names from engine's language list if name not known by babel.
+def join_language_lists(engines_languages):
+    language_list = {}
     for engine_name in engines_languages:
     for engine_name in engines_languages:
-        for locale in engines_languages[engine_name]:
-            if valid_code(locale):
-                # if language is not on list or if it has no name yet
-                if locale not in languages or not languages[locale].get('name'):
-                    if isinstance(engines_languages[engine_name], dict):
-                        languages[locale] = engines_languages[engine_name][locale]
-                    else:
-                        languages[locale] = {}
-
-            # add to counter of engines that support given language
-            lang = locale.split('-')[0]
-            if lang in languages:
-                if 'counter' not in languages[lang]:
-                    languages[lang]['counter'] = [engine_name]
-                elif engine_name not in languages[lang]['counter']:
-                    languages[lang]['counter'].append(engine_name)
-
-    # filter list to include only languages supported by most engines
-    min_supported_engines = int(0.70 * len(engines_languages))
-    languages = {code: lang for code, lang
-                 in languages.items()
-                 if len(lang.get('counter', [])) >= min_supported_engines or
-                 len(languages.get(code.split('-')[0], {}).get('counter', [])) >= min_supported_engines}
-
-    # get locales that have no name or country yet
-    for locale in languages.keys():
-        # try to get language names
-        if not languages[locale].get('name'):
-            name = languages.get(locale.split('-')[0], {}).get('name', None)
-            if name:
-                languages[locale]['name'] = name
-            else:
-                # filter out locales with no name
-                del languages[locale]
-                continue
-
-        # try to get language name in english
-        if not languages[locale].get('english_name'):
-            languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
-
-        # try to get country name
-        if locale.find('-') > 0 and not languages[locale].get('country'):
-            languages[locale]['country'] = get_country_name(locale) or ''
-
-
-# Remove countryless language if language is featured in only one country.
-def filter_single_country_languages():
-    prev_lang = None
-    prev_code = None
-    for code in sorted(languages):
-        lang = code.split('-')[0]
-        if lang == prev_lang:
+        for lang_code in engines_languages[engine_name]:
+
+            # apply custom fixes if necessary
+            if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values():
+                lang_code = next(lc for lc, alias in engines[engine_name].language_aliases.items()
+                                 if lang_code == alias)
+
+            locale = get_locale(lang_code)
+
+            # ensure that lang_code uses standard language and country codes
+            if locale and locale.territory:
+                lang_code = locale.language + '-' + locale.territory
+
+            # add locale if it's not in list
+            if lang_code not in language_list:
+                if locale:
+                    language_list[lang_code] = {'name': locale.get_language_name().title(),
+                                                'english_name': locale.english_name,
+                                                'country': locale.get_territory_name() or ''}
+
+                    # also add language without country
+                    if locale.language not in language_list:
+                        language_list[locale.language] = {'name': locale.get_language_name().title(),
+                                                          'english_name': locale.english_name}
+                else:
+                    language_list[lang_code] = {}
+
+            # count engine for both language_country combination and language alone
+            add_engine_counter(lang_code, engine_name, language_list)
+            add_engine_counter(lang_code.split('-')[0], engine_name, language_list)
+
+    return language_list
+
+
+# Filter language list so it only includes the most supported languages and countries.
+def filter_language_list(all_languages):
+    min_supported_engines = 10
+    main_engines = [engine_name for engine_name in engines.keys()
+                    if 'general' in engines[engine_name].categories and
+                       engines[engine_name].supported_languages and
+                       not engines[engine_name].disabled]
+
+    # filter list to include only languages supported by most engines or all default general engines
+    filtered_languages = {code: lang for code, lang
+                          in all_languages.items()
+                          if (len(lang.get('counter', [])) >= min_supported_engines or
+                              all(main_engine in lang.get('counter', [])
+                                  for main_engine in main_engines))}
+
+    return filtered_languages
+
+
+# Add country codes to languages without one and remove the countryless code when a language has only one country.
+def assign_country_codes(filtered_languages, all_languages):
+    sorted_languages = sorted(all_languages,
+                              key=lambda lang: len(all_languages[lang].get('counter', [])),
+                              reverse=True)
+    previous_lang = None
+    previous_code = None
+    countries = 0
+    for current_code in sorted(filtered_languages):
+        current_lang = current_code.split('-')[0]
+
+        # count country codes per language
+        if current_lang == previous_lang:
             countries += 1
             countries += 1
+
         else:
         else:
-            if prev_lang is not None and countries == 1:
-                del languages[prev_lang]
-                languages[prev_code]['country'] = ''
+            if previous_lang is not None:
+                # if language has no country-specific code at all
+                if countries == 0:
+                    # try to get country code with most supported engines
+                    for l in sorted_languages:
+                        l_parts = l.split('-')
+                        if len(l_parts) == 2 and l_parts[0] == previous_lang:
+                            filtered_languages[l] = all_languages[l]
+                            filtered_languages[l]['country'] = ''
+                            countries = 1
+                            break
+
+                    if countries == 0:
+                        # get most likely country code from babel
+                        subtags = get_global('likely_subtags').get(previous_lang)
+                        if subtags:
+                            subtag_parts = subtags.split('_')
+                            new_code = subtag_parts[0] + '-' + subtag_parts[-1]
+                            filtered_languages[new_code] = all_languages[previous_lang]
+                            countries = 1
+
+                if countries == 1:
+                    # remove countryless version of language if there's only one country
+                    del filtered_languages[previous_lang]
+                    if previous_code in filtered_languages:
+                        filtered_languages[previous_code]['country'] = ''
+
             countries = 0
             countries = 0
-            prev_lang = lang
-        prev_code = code
+            previous_lang = current_lang
+
+        previous_code = current_code
 
 
 
 
 # Write languages.py.
 # Write languages.py.
-def write_languages_file():
+def write_languages_file(languages):
     new_file = open(languages_file, 'wb')
     new_file = open(languages_file, 'wb')
     file_content = '# -*- coding: utf-8 -*-\n'\
     file_content = '# -*- coding: utf-8 -*-\n'\
                    + '# list of language codes\n'\
                    + '# list of language codes\n'\
@@ -183,7 +181,9 @@ def write_languages_file():
 
 
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    fetch_supported_languages()
-    join_language_lists()
-    filter_single_country_languages()
-    write_languages_file()
+    initialize_engines(settings['engines'])
+    engines_languages = fetch_supported_languages()
+    all_languages = join_language_lists(engines_languages)
+    filtered_languages = filter_language_list(all_languages)
+    assign_country_codes(filtered_languages, all_languages)
+    write_languages_file(filtered_languages)

Some files were not shown because too many files changed in this diff