Browse Source

Add language support for more engines.

marc 8 years ago
parent
commit
a11948c71b

+ 18 - 0
searx/engines/dailymotion.py

@@ -20,6 +20,24 @@ from datetime import datetime
 categories = ['videos']
 categories = ['videos']
 paging = True
 paging = True
 language_support = True
 language_support = True
+supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az",
+                       "ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca",
+                       "cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da",
+                       "de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo",
+                       "fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv",
+                       "gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr",
+                       "hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik",
+                       "is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk",
+                       "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo",
+                       "la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml",
+                       "mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv",
+                       "nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc",
+                       "oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu",
+                       "rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se",
+                       "sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss",
+                       "su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th",
+                       "ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur",
+                       "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"]
 
 
 # search-url
 # search-url
 # see http://www.dailymotion.com/doc/api/obj-video.html
 # see http://www.dailymotion.com/doc/api/obj-video.html

+ 1 - 21
searx/engines/duckduckgo.py

@@ -16,7 +16,6 @@
 from urllib import urlencode
 from urllib import urlencode
 from lxml.html import fromstring
 from lxml.html import fromstring
 from searx.engines.xpath import extract_text
 from searx.engines.xpath import extract_text
-from searx.languages import language_codes
 
 
 # engine dependent config
 # engine dependent config
 categories = ['general']
 categories = ['general']
@@ -76,26 +75,7 @@ def request(query, params):
         else:
         else:
             # tries to get a country code from language
             # tries to get a country code from language
             locale = locale[0].lower()
             locale = locale[0].lower()
-            lang_codes = [x[0] for x in language_codes]
-            for lc in lang_codes:
-                lc = lc.split('-')
-                if locale == lc[0] and len(lc) == 2:
-                    locale = lc[1].lower() + '-' + lc[0].lower()
-                    break
-
-    if locale:
-        params['url'] = url.format(
-            query=urlencode({'q': query, 'kl': locale}), offset=offset)
-    else:
-        locale = params['language'].split('-')
-        if len(locale) == 2:
-            # country code goes first
-            locale = locale[1].lower() + '-' + locale[0].lower()
-        else:
-            # tries to get a country code from language
-            locale = locale[0].lower()
-            lang_codes = [x[0] for x in language_codes]
-            for lc in lang_codes:
+            for lc in supported_languages:
                 lc = lc.split('-')
                 lc = lc.split('-')
                 if locale == lc[0]:
                 if locale == lc[0]:
                     locale = lc[1].lower() + '-' + lc[0].lower()
                     locale = lc[1].lower() + '-' + lc[0].lower()

+ 1 - 1
searx/engines/gigablast.py

@@ -44,7 +44,7 @@ supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko
                        "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el",
                        "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el",
                        "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr",
                        "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr",
                        "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv",
                        "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv",
-                       "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"] 
+                       "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"]
 
 
 
 
 # do search-request
 # do search-request

+ 14 - 1
searx/engines/qwant.py

@@ -20,6 +20,11 @@ from searx.utils import html_to_text
 categories = None
 categories = None
 paging = True
 paging = True
 language_support = True
 language_support = True
+supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT",
+                       "fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU",
+                       "el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR",
+                       "es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH",
+                       "th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"]
 
 
 category_to_keyword = {'general': 'web',
 category_to_keyword = {'general': 'web',
                        'images': 'images',
                        'images': 'images',
@@ -46,7 +51,15 @@ def request(query, params):
 
 
     # add language tag if specified
     # add language tag if specified
     if params['language'] != 'all':
     if params['language'] != 'all':
-        params['url'] += '&locale=' + params['language'].lower()
+        locale = params['language'].split('-')
+        if len(locale) == 2 and params['language'] in supported_languages:
+            params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
+        else:
+            # try to get a country code for language
+            for lang in supported_languages:
+                if locale[0] == lang.split('-')[0]:
+                    params['url'] += '&locale=' + lang.replace('-', '_').lower()
+                    break
 
 
     return params
     return params
 
 

+ 5 - 0
searx/engines/startpage.py

@@ -24,6 +24,11 @@ categories = ['general']
 
 
 # paging = False
 # paging = False
 language_support = True
 language_support = True
+supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW",
+                       "ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr",
+                       "el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja",
+                       "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw",
+                       "sv", "tl", "th", "tr", "uk", "vi"]
 
 
 # search-url
 # search-url
 base_url = 'https://startpage.com/'
 base_url = 'https://startpage.com/'

+ 8 - 0
searx/engines/swisscows.py

@@ -18,6 +18,12 @@ import re
 categories = ['general', 'images']
 categories = ['general', 'images']
 paging = True
 paging = True
 language_support = True
 language_support = True
+supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA",
+                       "es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR",
+                       "zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT",
+                       "en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU",
+                       "en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH",
+                       "tr-TR", "uk-UA", "en-GB", "en-US", "es-US"]
 
 
 # search-url
 # search-url
 base_url = 'https://swisscows.ch/'
 base_url = 'https://swisscows.ch/'
@@ -35,6 +41,8 @@ def request(query, params):
     if params['language'] == 'all':
     if params['language'] == 'all':
         ui_language = 'browser'
         ui_language = 'browser'
         region = 'browser'
         region = 'browser'
+    elif params['language'].split('-')[0] == 'no':
+        region = 'nb-NO'
     else:
     else:
         region = params['language']
         region = params['language']
         ui_language = params['language'].split('-')[0]
         ui_language = params['language'].split('-')[0]

+ 3 - 1
searx/engines/yandex.py

@@ -22,7 +22,9 @@ language_support = True  # TODO
 
 
 default_tld = 'com'
 default_tld = 'com'
 language_map = {'ru': 'ru',
 language_map = {'ru': 'ru',
-                'ua': 'uk',
+                'ua': 'ua',
+                'be': 'by',
+                'kk': 'kz',
                 'tr': 'com.tr'}
                 'tr': 'com.tr'}
 
 
 # search-url
 # search-url

+ 3 - 1
searx/languages.py

@@ -100,7 +100,7 @@ language_codes = (
     (u"sa", u"संस्कृतम्", u"", u"Sanskrit"),
     (u"sa", u"संस्कृतम्", u"", u"Sanskrit"),
     (u"he-IL", u"עברית", u"", u"Hebrew"),
     (u"he-IL", u"עברית", u"", u"Hebrew"),
     (u"se", u"Sámegiella", u"", u"Northern Sami"),
     (u"se", u"Sámegiella", u"", u"Northern Sami"),
-    (u"sd", u"سنڌي، سندھی ، सिन्ध", u"", u"Sindhi"),
+    (u"sd", u"سنڌي ،सिन्ध", u"", u"Sindhi"),
     (u"fr-CH", u"Français", u"", u"French"),
     (u"fr-CH", u"Français", u"", u"French"),
     (u"zea", u"Zeêuws", u"", u"Zeelandic"),
     (u"zea", u"Zeêuws", u"", u"Zeelandic"),
     (u"it-CH", u"Italiano", u"", u"Italian"),
     (u"it-CH", u"Italiano", u"", u"Italian"),
@@ -191,6 +191,7 @@ language_codes = (
     (u"jam", u"Jamaican Creole English", u"", u"Patois"),
     (u"jam", u"Jamaican Creole English", u"", u"Patois"),
     (u"udm", u"Удмурт кыл", u"", u"Udmurt"),
     (u"udm", u"Удмурт кыл", u"", u"Udmurt"),
     (u"ksh", u"Ripoarisch", u"", u"Ripuarian"),
     (u"ksh", u"Ripoarisch", u"", u"Ripuarian"),
+    (u"sl-SI", u"Slovenščina", u"", u"Slovenian"),
     (u"ms-MY", u"Bahasa Melayu", u"", u"Malay"),
     (u"ms-MY", u"Bahasa Melayu", u"", u"Malay"),
     (u"de", u"Deutsch", u"", u"German"),
     (u"de", u"Deutsch", u"", u"German"),
     (u"da", u"Dansk", u"", u"Danish"),
     (u"da", u"Dansk", u"", u"Danish"),
@@ -284,6 +285,7 @@ language_codes = (
     (u"mhr", u"Олык Марий (Olyk Marij)", u"", u"Meadow Mari"),
     (u"mhr", u"Олык Марий (Olyk Marij)", u"", u"Meadow Mari"),
     (u"ca-CT", u"Català", u"", u"Catalan"),
     (u"ca-CT", u"Català", u"", u"Catalan"),
     (u"en-MY", u"English", u"", u"English"),
     (u"en-MY", u"English", u"", u"English"),
+    (u"olo", u"Livvi-Karelian", u"", u"Livvinkarjala"),
     (u"sv-SE", u"Svenska", u"", u"Swedish"),
     (u"sv-SE", u"Svenska", u"", u"Swedish"),
     (u"de-AT", u"Deutsch", u"", u"German"),
     (u"de-AT", u"Deutsch", u"", u"German"),
     (u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"),
     (u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"),

+ 1 - 1
tests/unit/engines/test_duckduckgo.py

@@ -11,7 +11,7 @@ class TestDuckduckgoEngine(SearxTestCase):
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
-        dicto['language'] = 'de_CH'
+        dicto['language'] = 'de-CH'
         dicto['time_range'] = ''
         dicto['time_range'] = ''
         params = duckduckgo.request(query, dicto)
         params = duckduckgo.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn('url', params)

+ 4 - 0
tests/unit/engines/test_duckduckgo_definitions.py

@@ -21,10 +21,14 @@ class TestDDGDefinitionsEngine(SearxTestCase):
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
+        dicto['language'] = 'es'
         params = duckduckgo_definitions.request(query, dicto)
         params = duckduckgo_definitions.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn('url', params)
         self.assertIn(query, params['url'])
         self.assertIn(query, params['url'])
         self.assertIn('duckduckgo.com', params['url'])
         self.assertIn('duckduckgo.com', params['url'])
+        self.assertIn('headers', params)
+        self.assertIn('Accept-Language', params['headers'])
+        self.assertIn('es', params['headers']['Accept-Language'])
 
 
     def test_response(self):
     def test_response(self):
         self.assertRaises(AttributeError, duckduckgo_definitions.response, None)
         self.assertRaises(AttributeError, duckduckgo_definitions.response, None)

+ 1 - 1
tests/unit/engines/test_google.py

@@ -18,7 +18,7 @@ class TestGoogleEngine(SearxTestCase):
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         dicto['time_range'] = ''
         dicto['time_range'] = ''
         params = google.request(query, dicto)
         params = google.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn('url', params)

+ 1 - 1
tests/unit/engines/test_qwant.py

@@ -10,7 +10,7 @@ class TestQwantEngine(SearxTestCase):
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 0
         dicto['pageno'] = 0
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         qwant.categories = ['']
         qwant.categories = ['']
         params = qwant.request(query, dicto)
         params = qwant.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn('url', params)

+ 1 - 1
tests/unit/engines/test_swisscows.py

@@ -10,7 +10,7 @@ class TestSwisscowsEngine(SearxTestCase):
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
         dicto['pageno'] = 1
-        dicto['language'] = 'de_DE'
+        dicto['language'] = 'de-DE'
         params = swisscows.request(query, dicto)
         params = swisscows.request(query, dicto)
         self.assertTrue('url' in params)
         self.assertTrue('url' in params)
         self.assertTrue(query in params['url'])
         self.assertTrue(query in params['url'])

+ 1 - 1
tests/unit/engines/test_wikipedia.py

@@ -10,7 +10,7 @@ class TestWikipediaEngine(SearxTestCase):
     def test_request(self):
     def test_request(self):
         query = 'test_query'
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto = defaultdict(dict)
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         params = wikipedia.request(query, dicto)
         params = wikipedia.request(query, dicto)
         self.assertIn('url', params)
         self.assertIn('url', params)
         self.assertIn(query, params['url'])
         self.assertIn(query, params['url'])

+ 4 - 9
utils/update_languages.py

@@ -41,7 +41,6 @@ def valid_code(lang_code):
     if len(lang_code) > 2 or len(lang_code[0]) > 3:
     if len(lang_code) > 2 or len(lang_code[0]) > 3:
         return False
         return False
     if len(lang_code) == 2 and len(lang_code[1]) > 2:
     if len(lang_code) == 2 and len(lang_code[1]) > 2:
-        print lang_code
         return False
         return False
         
         
     return True
     return True
@@ -62,8 +61,8 @@ def get_wikipedia_languages():
             english_name = td[1].xpath('./a')[0].text
             english_name = td[1].xpath('./a')[0].text
             articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
             articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
             
             
-            # exclude languages with few articles and language variants
-            if code not in languages and articles >= 100 and valid_code(code):
+            # exclude language variants and languages with few articles
+            if code not in languages and articles >= 1000 and valid_code(code):
                 languages[code] = (name, '', english_name)
                 languages[code] = (name, '', english_name)
 
 
 
 
@@ -90,7 +89,7 @@ def join_language_lists():
                 # try to get language name
                 # try to get language name
                 language = languages.get(locale.split('-')[0], None)
                 language = languages.get(locale.split('-')[0], None)
                 if language == None:
                 if language == None:
-                    # print engine_name + ": " + locale
+                    print engine_name + ": " + locale
                     continue
                     continue
 
 
                 (name, country, english) = language
                 (name, country, english) = language
@@ -117,12 +116,8 @@ def write_languages_file():
     new_file.close()
     new_file.close()
 
 
 
 
-def main():
+if __name__ == "__main__":
     get_wikipedia_languages()
     get_wikipedia_languages()
     get_google_languages()
     get_google_languages()
     join_language_lists()
     join_language_lists()
     write_languages_file()
     write_languages_file()
-
-
-if __name__ == "__main__":
-    main()