Browse Source

[fix] language support for bing images and videos

marc 7 years ago
parent
commit
a524dbb823

+ 47 - 9
searx/engines/bing_images.py

@@ -18,7 +18,6 @@
 from lxml import html
 from json import loads
 import re
-from searx.engines.bing import _fetch_supported_languages, supported_languages_url
 from searx.url_utils import urlencode
 
 # engine dependent config
@@ -26,6 +25,8 @@ categories = ['images']
 paging = True
 safesearch = True
 time_range_support = True
+language_support = True
+supported_languages_url = 'https://www.bing.com/account/general'
 
 # search-url
 base_url = 'https://www.bing.com/'
@@ -45,23 +46,41 @@ safesearch_types = {2: 'STRICT',
 _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
 
 
+# get supported region code
+def get_region_code(lang, lang_list=None):
+    """Map a language code to a lower-cased region code.
+
+    ``lang`` is looked up in ``lang_list``; when ``lang_list`` is empty or
+    None the module-level ``supported_languages`` is used instead.
+
+    Resolution order:
+      1. ``lang`` itself, when it is one of the candidate codes;
+      2. ``'nb-NO'`` when ``lang`` starts with ``'no'``;
+      3. the first candidate whose part before ``'-'`` equals the part
+         of ``lang`` before ``'-'``.
+
+    Returns the chosen code lower-cased, or ``'en-us'`` when nothing
+    (or only an empty string) was selected.
+    """
+    candidates = lang_list or supported_languages
+
+    if lang in candidates:
+        match = lang
+    elif lang.startswith('no'):
+        match = 'nb-NO'
+    else:
+        # fall back to any candidate that shares the bare language part
+        prefix = lang.split('-')[0]
+        match = next(
+            (code for code in candidates if code.split('-')[0] == prefix),
+            None)
+
+    return match.lower() if match else 'en-us'
+
+
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
 
-    # required for cookie
-    if params['language'] == 'all':
-        language = 'en-US'
-    else:
-        language = params['language']
-
     search_path = search_string.format(
         query=urlencode({'q': query}),
         offset=offset)
 
+    language = get_region_code(params['language'])
+
     params['cookies']['SRCHHPGUSR'] = \
-        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\
-        '&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
+        'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
+
+    params['cookies']['_EDGE_S'] = 'mkt=' + language +\
+        '&ui=' + language + '&F=1'
 
     params['url'] = base_url + search_path
     if params['time_range'] in time_range_dict:
@@ -106,3 +125,22 @@ def response(resp):
 
     # return results
     return results
+
+
+# get supported languages from their site
+def _fetch_supported_languages(resp):
+    """Extract the region codes offered on the fetched settings page.
+
+    ``resp.text`` is parsed as HTML and every link found under the
+    ``region-section-content`` block is scanned for a ``setmkt=<code>``
+    query parameter.
+
+    Returns a list of codes in page order; ``'nb-NO'`` is reported as
+    ``'no-NO'``.  Links that carry no ``setmkt`` parameter are skipped
+    instead of raising ``AttributeError``.
+    """
+    dom = html.fromstring(resp.text)
+
+    regions_xpath = '//div[@id="region-section-content"]' \
+                    + '//ul[@class="b_vList"]/li/a/@href'
+
+    # raw string: '&' needs no escaping inside a character class, and
+    # '\&' in a plain string literal is an invalid escape sequence
+    market_regex = re.compile(r'setmkt=([^&]+)')
+
+    # named `languages` so the module-level `supported_languages`
+    # is not shadowed
+    languages = []
+    for href in dom.xpath(regions_xpath):
+        found = market_regex.search(href)
+        if found is None:
+            # not a region link; nothing to extract
+            continue
+
+        code = found.group(1)
+        if code == 'nb-NO':
+            code = 'no-NO'
+
+        languages.append(code)
+
+    return languages

+ 4 - 1
searx/engines/bing_videos.py

@@ -12,6 +12,7 @@
 
 from json import loads
 from lxml import html
+from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code
 from searx.engines.xpath import extract_text
 from searx.url_utils import urlencode
 
@@ -21,6 +22,7 @@ paging = True
 safesearch = True
 time_range_support = True
 number_of_results = 10
+language_support = True
 
 search_url = 'https://www.bing.com/videos/asyncv2?{query}&async=content&'\
              'first={offset}&count={number_of_results}&CW=1366&CH=25&FORM=R5VR5'
@@ -45,7 +47,8 @@ def request(query, params):
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
 
     # language cookie
-    params['cookies']['_EDGE_S'] = 'mkt=' + params['language'].lower() + '&F=1'
+    region = get_region_code(params['language'], lang_list=supported_languages)
+    params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1'
 
     # query and paging
     params['url'] = search_url.format(query=urlencode({'q': query}),

+ 8 - 4
tests/unit/engines/test_bing_images.py

@@ -8,10 +8,12 @@ from searx.testing import SearxTestCase
 class TestBingImagesEngine(SearxTestCase):
 
     def test_request(self):
+        bing_images.supported_languages = ['fr-FR', 'en-US']
+
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         dicto['safesearch'] = 1
         dicto['time_range'] = ''
         params = bing_images.request(query, dicto)
@@ -19,12 +21,14 @@ class TestBingImagesEngine(SearxTestCase):
         self.assertTrue(query in params['url'])
         self.assertTrue('bing.com' in params['url'])
         self.assertTrue('SRCHHPGUSR' in params['cookies'])
-        self.assertTrue('fr' in params['cookies']['SRCHHPGUSR'])
+        self.assertTrue('DEMOTE' in params['cookies']['SRCHHPGUSR'])
+        self.assertTrue('_EDGE_S' in params['cookies'])
+        self.assertTrue('fr-fr' in params['cookies']['_EDGE_S'])
 
         dicto['language'] = 'all'
         params = bing_images.request(query, dicto)
-        self.assertIn('SRCHHPGUSR', params['cookies'])
-        self.assertIn('en', params['cookies']['SRCHHPGUSR'])
+        self.assertTrue('_EDGE_S' in params['cookies'])
+        self.assertTrue('en' in params['cookies']['_EDGE_S'])
 
     def test_response(self):
         self.assertRaises(AttributeError, bing_images.response, None)

+ 2 - 0
tests/unit/engines/test_bing_videos.py

@@ -8,6 +8,8 @@ from searx.testing import SearxTestCase
 class TestBingVideosEngine(SearxTestCase):
 
     def test_request(self):
+        bing_videos.supported_languages = ['fr-FR', 'en-US']
+
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto['pageno'] = 1