Browse Source

[mod] add 'Accept-Language' HTTP header to online processors

Most engines that support languages (and regions) use the Accept-Language header
sent by the web browser to build a response that fits the language (and region).

- add new engine option: send_accept_language_header

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 2 years ago
parent
commit
8df1f0c47e

+ 8 - 0
docs/admin/engines/settings.rst

@@ -440,6 +440,7 @@ engine is shown.  Most of the options have a default value or even are optional.
      engine: example
      engine: example
      shortcut: demo
      shortcut: demo
      base_url: 'https://{language}.example.com/'
      base_url: 'https://{language}.example.com/'
+     send_accept_language_header: false
      categories: general
      categories: general
      timeout: 3.0
      timeout: 3.0
      api_key: 'apikey'
      api_key: 'apikey'
@@ -488,6 +489,13 @@ engine is shown.  Most of the options have a default value or even are optional.
   use multiple sites using only one engine, or updating the site URL without
   use multiple sites using only one engine, or updating the site URL without
   touching the code.
   touching the code.
 
 
+``send_accept_language_header`` :
+  Several engines that support languages (or regions) use the HTTP header
+  ``Accept-Language`` to build a response that fits the locale.  When this
+  option is activated, the language (locale) that is selected by the user is used
+  to build and send an ``Accept-Language`` header in the request to the origin
+  search engine.
+
 ``categories`` : optional
 ``categories`` : optional
   Define in which categories this engine will be active.  Most of the time, it is
   Define in which categories this engine will be active.  Most of the time, it is
   defined in the code of the engine, but in a few cases it is useful, like when
   defined in the code of the engine, but in a few cases it is useful, like when

+ 1 - 0
searx/engines/__init__.py

@@ -44,6 +44,7 @@ ENGINE_DEFAULT_ARGS = {
     "enable_http": False,
     "enable_http": False,
     "using_tor_proxy": False,
     "using_tor_proxy": False,
     "display_error_messages": True,
     "display_error_messages": True,
+    "send_accept_language_header": False,
     "tokens": [],
     "tokens": [],
     "about": {},
     "about": {},
 }
 }

+ 1 - 1
searx/engines/bing.py

@@ -25,6 +25,7 @@ categories = ['general', 'web']
 paging = True
 paging = True
 time_range_support = False
 time_range_support = False
 safesearch = False
 safesearch = False
+send_accept_language_header = True
 supported_languages_url = 'https://www.bing.com/account/general'
 supported_languages_url = 'https://www.bing.com/account/general'
 language_aliases = {}
 language_aliases = {}
 
 
@@ -68,7 +69,6 @@ def request(query, params):
         logger.debug("headers.Referer --> %s", referer)
         logger.debug("headers.Referer --> %s", referer)
 
 
     params['url'] = base_url + search_path
     params['url'] = base_url + search_path
-    params['headers']['Accept-Language'] = "en-US,en;q=0.5"
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     return params
     return params
 
 

+ 1 - 0
searx/engines/bing_images.py

@@ -31,6 +31,7 @@ categories = ['images', 'web']
 paging = True
 paging = True
 safesearch = True
 safesearch = True
 time_range_support = True
 time_range_support = True
+send_accept_language_header = True
 supported_languages_url = 'https://www.bing.com/account/general'
 supported_languages_url = 'https://www.bing.com/account/general'
 number_of_results = 28
 number_of_results = 28
 
 

+ 1 - 0
searx/engines/bing_news.py

@@ -34,6 +34,7 @@ about = {
 categories = ['news']
 categories = ['news']
 paging = True
 paging = True
 time_range_support = True
 time_range_support = True
+send_accept_language_header = True
 
 
 # search-url
 # search-url
 base_url = 'https://www.bing.com/'
 base_url = 'https://www.bing.com/'

+ 1 - 4
searx/engines/bing_videos.py

@@ -30,6 +30,7 @@ categories = ['videos', 'web']
 paging = True
 paging = True
 safesearch = True
 safesearch = True
 time_range_support = True
 time_range_support = True
+send_accept_language_header = True
 number_of_results = 28
 number_of_results = 28
 
 
 base_url = 'https://www.bing.com/'
 base_url = 'https://www.bing.com/'
@@ -70,10 +71,6 @@ def request(query, params):
     if params['time_range'] in time_range_dict:
     if params['time_range'] in time_range_dict:
         params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
         params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
 
 
-    # bing videos did not like "older" versions < 70.0.1 when selectin other
-    # languages then 'en' .. very strange ?!?!
-    params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0.1) Gecko/20100101 Firefox/73.0.1'
-
     return params
     return params
 
 
 
 

+ 1 - 0
searx/engines/demo_online.py

@@ -20,6 +20,7 @@ from json import loads
 from urllib.parse import urlencode
 from urllib.parse import urlencode
 
 
 engine_type = 'online'
 engine_type = 'online'
+send_accept_language_header = True
 categories = ['general']
 categories = ['general']
 disabled = True
 disabled = True
 timeout = 2.0
 timeout = 2.0

+ 1 - 0
searx/engines/duckduckgo.py

@@ -31,6 +31,7 @@ categories = ['general', 'web']
 paging = True
 paging = True
 supported_languages_url = 'https://duckduckgo.com/util/u588.js'
 supported_languages_url = 'https://duckduckgo.com/util/u588.js'
 time_range_support = True
 time_range_support = True
+send_accept_language_header = True
 
 
 language_aliases = {
 language_aliases = {
     'ar-SA': 'ar-XA',
     'ar-SA': 'ar-XA',

+ 2 - 1
searx/engines/duckduckgo_definitions.py

@@ -27,6 +27,8 @@ about = {
     "results": 'JSON',
     "results": 'JSON',
 }
 }
 
 
+send_accept_language_header = True
+
 URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
 URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
 
 
 WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
 WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
@@ -62,7 +64,6 @@ def request(query, params):
     params['url'] = URL.format(query=urlencode({'q': query}))
     params['url'] = URL.format(query=urlencode({'q': query}))
     language = match_language(params['language'], supported_languages, language_aliases)
     language = match_language(params['language'], supported_languages, language_aliases)
     language = language.split('-')[0]
     language = language.split('-')[0]
-    params['headers']['Accept-Language'] = language
     return params
     return params
 
 
 
 

+ 1 - 0
searx/engines/duckduckgo_images.py

@@ -30,6 +30,7 @@ about = {
 categories = ['images', 'web']
 categories = ['images', 'web']
 paging = True
 paging = True
 safesearch = True
 safesearch = True
+send_accept_language_header = True
 
 
 # search-url
 # search-url
 images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'
 images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'

+ 1 - 10
searx/engines/google.py

@@ -45,6 +45,7 @@ categories = ['general', 'web']
 paging = True
 paging = True
 time_range_support = True
 time_range_support = True
 safesearch = True
 safesearch = True
+send_accept_language_header = True
 use_mobile_ui = False
 use_mobile_ui = False
 supported_languages_url = 'https://www.google.com/preferences?#languages'
 supported_languages_url = 'https://www.google.com/preferences?#languages'
 
 
@@ -241,16 +242,6 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
         # language.
         # language.
         ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
         ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
 
 
-        # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
-        ret_val['headers']['Accept-Language'] = ','.join(
-            [
-                lang_country,
-                language + ';q=0.8,',
-                'en;q=0.6',
-                '*;q=0.5',
-            ]
-        )
-
     return ret_val
     return ret_val
 
 
 
 

+ 1 - 1
searx/engines/google_images.py

@@ -51,6 +51,7 @@ paging = False
 use_locale_domain = True
 use_locale_domain = True
 time_range_support = True
 time_range_support = True
 safesearch = True
 safesearch = True
+send_accept_language_header = True
 
 
 filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
 filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
 
 
@@ -125,7 +126,6 @@ def request(query, params):
     """Google-Video search request"""
     """Google-Video search request"""
 
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
 
     query_url = (
     query_url = (
         'https://'
         'https://'

+ 1 - 1
searx/engines/google_news.py

@@ -70,13 +70,13 @@ time_range_support = True
 #
 #
 #  safesearch : results are identical for safesearch=0 and safesearch=2
 #  safesearch : results are identical for safesearch=0 and safesearch=2
 safesearch = False
 safesearch = False
+send_accept_language_header = True
 
 
 
 
 def request(query, params):
 def request(query, params):
     """Google-News search request"""
     """Google-News search request"""
 
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
 
     # google news has only one domain
     # google news has only one domain
     lang_info['subdomain'] = 'news.google.com'
     lang_info['subdomain'] = 'news.google.com'

+ 2 - 0
searx/engines/google_play_apps.py

@@ -22,6 +22,8 @@ about = {
 }
 }
 
 
 categories = ["files", "apps"]
 categories = ["files", "apps"]
+send_accept_language_header = True
+
 search_url = "https://play.google.com/store/search?{query}&c=apps"
 search_url = "https://play.google.com/store/search?{query}&c=apps"
 
 
 
 

+ 1 - 1
searx/engines/google_scholar.py

@@ -52,6 +52,7 @@ language_support = True
 use_locale_domain = True
 use_locale_domain = True
 time_range_support = True
 time_range_support = True
 safesearch = False
 safesearch = False
+send_accept_language_header = True
 
 
 
 
 def time_range_url(params):
 def time_range_url(params):
@@ -75,7 +76,6 @@ def request(query, params):
 
 
     offset = (params['pageno'] - 1) * 10
     offset = (params['pageno'] - 1) * 10
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
 
     # subdomain is: scholar.google.xy
     # subdomain is: scholar.google.xy
     lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")
     lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")

+ 1 - 1
searx/engines/google_videos.py

@@ -60,6 +60,7 @@ language_support = True
 use_locale_domain = True
 use_locale_domain = True
 time_range_support = True
 time_range_support = True
 safesearch = True
 safesearch = True
+send_accept_language_header = True
 
 
 RE_CACHE = {}
 RE_CACHE = {}
 
 
@@ -111,7 +112,6 @@ def request(query, params):
     """Google-Video search request"""
     """Google-Video search request"""
 
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
 
     query_url = (
     query_url = (
         'https://'
         'https://'

+ 3 - 3
searx/engines/openstreetmap.py

@@ -30,6 +30,7 @@ about = {
 categories = ['map']
 categories = ['map']
 paging = False
 paging = False
 language_support = True
 language_support = True
+send_accept_language_header = True
 
 
 # search-url
 # search-url
 base_url = 'https://nominatim.openstreetmap.org/'
 base_url = 'https://nominatim.openstreetmap.org/'
@@ -142,9 +143,8 @@ def request(query, params):
     params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
     params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
     params['route'] = route_re.match(query)
     params['route'] = route_re.match(query)
     params['headers']['User-Agent'] = searx_useragent()
     params['headers']['User-Agent'] = searx_useragent()
-
-    accept_language = 'en' if params['language'] == 'all' else params['language']
-    params['headers']['Accept-Language'] = accept_language
+    if 'Accept-Language' not in params['headers']:
+        params['headers']['Accept-Language'] = 'en'
     return params
     return params
 
 
 
 

+ 3 - 3
searx/engines/wikipedia.py

@@ -19,6 +19,9 @@ about = {
     "results": 'JSON',
     "results": 'JSON',
 }
 }
 
 
+
+send_accept_language_header = True
+
 # search-url
 # search-url
 search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
 search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
 supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
 supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
@@ -41,9 +44,6 @@ def request(query, params):
     language = url_lang(params['language'])
     language = url_lang(params['language'])
     params['url'] = search_url.format(title=quote(query), language=language)
     params['url'] = search_url.format(title=quote(query), language=language)
 
 
-    if params['language'].lower() in language_variants.get(language, []):
-        params['headers']['Accept-Language'] = params['language'].lower()
-
     params['headers']['User-Agent'] = searx_useragent()
     params['headers']['User-Agent'] = searx_useragent()
     params['raise_for_httperror'] = False
     params['raise_for_httperror'] = False
     params['soft_max_redirects'] = 2
     params['soft_max_redirects'] = 2

+ 9 - 0
searx/search/models.py

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # SPDX-License-Identifier: AGPL-3.0-or-later
 
 
 import typing
 import typing
+import babel
 
 
 
 
 class EngineRef:
 class EngineRef:
@@ -29,6 +30,7 @@ class SearchQuery:
         'query',
         'query',
         'engineref_list',
         'engineref_list',
         'lang',
         'lang',
+        'locale',
         'safesearch',
         'safesearch',
         'pageno',
         'pageno',
         'time_range',
         'time_range',
@@ -59,6 +61,13 @@ class SearchQuery:
         self.external_bang = external_bang
         self.external_bang = external_bang
         self.engine_data = engine_data or {}
         self.engine_data = engine_data or {}
 
 
+        self.locale = None
+        if self.lang:
+            try:
+                self.locale = babel.Locale.parse(self.lang, sep='-')
+            except babel.core.UnknownLocaleError:
+                pass
+
     @property
     @property
     def categories(self):
     def categories(self):
         return list(set(map(lambda engineref: engineref.category, self.engineref_list)))
         return list(set(map(lambda engineref: engineref.category, self.engineref_list)))

+ 11 - 0
searx/search/processors/online.py

@@ -60,6 +60,17 @@ class OnlineProcessor(EngineProcessor):
         # add an user agent
         # add an user agent
         params['headers']['User-Agent'] = gen_useragent()
         params['headers']['User-Agent'] = gen_useragent()
 
 
+        # add Accept-Language header
+        if self.engine.send_accept_language_header and search_query.locale:
+            ac_lang = search_query.locale.language
+            if search_query.locale.territory:
+                ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
+                    search_query.locale.language,
+                    search_query.locale.territory,
+                    search_query.locale.language,
+                )
+            params['headers']['Accept-Language'] = ac_lang
+
         return params
         return params
 
 
     def _send_http_request(self, params):
     def _send_http_request(self, params):

+ 1 - 0
searx/settings.yml

@@ -748,6 +748,7 @@ engines:
 
 
   - name: google play movies
   - name: google play movies
     engine: xpath
     engine: xpath
+    send_accept_language_header: true
     search_url: https://play.google.com/store/search?q={query}&c=movies
     search_url: https://play.google.com/store/search?q={query}&c=movies
     results_xpath: '//div[@class="ImZGtf mpg5gc"]'
     results_xpath: '//div[@class="ImZGtf mpg5gc"]'
     title_xpath: './/div[@class="RZEgze"]//div[@class="kCSSQe"]//a'
     title_xpath: './/div[@class="RZEgze"]//div[@class="kCSSQe"]//a'