[mod] add 'Accept-Language' HTTP header to online processors

Most engines that support languages (and regions) use the Accept-Language header
sent by the web browser to build a response that fits the language (and region).

- add new engine option: send_accept_language_header

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser · 2 years ago · commit 8df1f0c47e
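
To make the mechanism concrete: an engine opts in by setting the new flag at module
level (or, for engines configured only in settings.yml such as the xpath ones, with
the YAML option shown in the last hunk), and the online processor then adds the
Accept-Language header to every request of that engine. A minimal, hypothetical
engine module sketching the opt-in side; the module name and URL are illustrative only:

    from urllib.parse import urlencode

    # hypothetical engine module, not part of this commit
    send_accept_language_header = True   # let the online processor add Accept-Language

    def request(query, params):
        # by the time this hook runs, the processor has already filled
        # params['headers']['Accept-Language'] from the user's selected locale
        params['url'] = 'https://example.org/search?' + urlencode({'q': query})
        return params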

+ 8 - 0
docs/admin/engines/settings.rst

@@ -440,6 +440,7 @@ engine is shown.  Most of the options have a default value or even are optional.
      engine: example
      shortcut: demo
      base_url: 'https://{language}.example.com/'
+     send_accept_language_header: false
      categories: general
      timeout: 3.0
      api_key: 'apikey'
@@ -488,6 +489,13 @@ engine is shown.  Most of the options have a default value or even are optional.
   use multiple sites using only one engine, or updating the site URL without
   touching at the code.
 
+``send_accept_language_header`` :
+  Several engines that support languages (or regions) use the HTTP header
+  ``Accept-Language`` to build a response that fits the locale.  When this
+  option is activated, the language (locale) selected by the user is used to
+  build and send an ``Accept-Language`` header in the request to the origin
+  search engine.
+
 ``categories`` : optional
   Define in which categories this engine will be active.  Most of the time, it is
   defined in the code of the engine, but in a few cases it is useful, like when

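For example, with fr-CH selected in the preferences the request carries
"Accept-Language: fr-CH,fr;q=0.9,*;q=0.5", while a plain language such as de yields
"Accept-Language: de". A small sketch of that format, mirroring the processor change
in searx/search/processors/online.py further down (sketch_accept_language is a
hypothetical name):

    def sketch_accept_language(language, territory=None):
        # hypothetical helper; same format string as searx/search/processors/online.py
        if territory:
            # e.g. ('fr', 'CH') -> 'fr-CH,fr;q=0.9,*;q=0.5'
            return "%s-%s,%s;q=0.9,*;q=0.5" % (language, territory, language)
        # e.g. ('de', None) -> 'de'
        return language
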
+ 1 - 0
searx/engines/__init__.py

@@ -44,6 +44,7 @@ ENGINE_DEFAULT_ARGS = {
     "enable_http": False,
     "using_tor_proxy": False,
     "display_error_messages": True,
+    "send_accept_language_header": False,
     "tokens": [],
     "about": {},
 }

+ 1 - 1
searx/engines/bing.py

@@ -25,6 +25,7 @@ categories = ['general', 'web']
 paging = True
 time_range_support = False
 safesearch = False
+send_accept_language_header = True
 supported_languages_url = 'https://www.bing.com/account/general'
 language_aliases = {}
 
@@ -68,7 +69,6 @@ def request(query, params):
         logger.debug("headers.Referer --> %s", referer)
 
     params['url'] = base_url + search_path
-    params['headers']['Accept-Language'] = "en-US,en;q=0.5"
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     return params
 

+ 1 - 0
searx/engines/bing_images.py

@@ -31,6 +31,7 @@ categories = ['images', 'web']
 paging = True
 safesearch = True
 time_range_support = True
+send_accept_language_header = True
 supported_languages_url = 'https://www.bing.com/account/general'
 number_of_results = 28
 

+ 1 - 0
searx/engines/bing_news.py

@@ -34,6 +34,7 @@ about = {
 categories = ['news']
 paging = True
 time_range_support = True
+send_accept_language_header = True
 
 # search-url
 base_url = 'https://www.bing.com/'

+ 1 - 4
searx/engines/bing_videos.py

@@ -30,6 +30,7 @@ categories = ['videos', 'web']
 paging = True
 safesearch = True
 time_range_support = True
+send_accept_language_header = True
 number_of_results = 28
 
 base_url = 'https://www.bing.com/'
@@ -70,10 +71,6 @@ def request(query, params):
     if params['time_range'] in time_range_dict:
         params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
 
-    # bing videos did not like "older" versions < 70.0.1 when selectin other
-    # languages then 'en' .. very strange ?!?!
-    params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0.1) Gecko/20100101 Firefox/73.0.1'
-
     return params
 
 

+ 1 - 0
searx/engines/demo_online.py

@@ -20,6 +20,7 @@ from json import loads
 from urllib.parse import urlencode
 
 engine_type = 'online'
+send_accept_language_header = True
 categories = ['general']
 disabled = True
 timeout = 2.0

+ 1 - 0
searx/engines/duckduckgo.py

@@ -31,6 +31,7 @@ categories = ['general', 'web']
 paging = True
 supported_languages_url = 'https://duckduckgo.com/util/u588.js'
 time_range_support = True
+send_accept_language_header = True
 
 language_aliases = {
     'ar-SA': 'ar-XA',

+ 2 - 1
searx/engines/duckduckgo_definitions.py

@@ -27,6 +27,8 @@ about = {
     "results": 'JSON',
 }
 
+send_accept_language_header = True
+
 URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
 
 WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
@@ -62,7 +64,6 @@ def request(query, params):
     params['url'] = URL.format(query=urlencode({'q': query}))
     language = match_language(params['language'], supported_languages, language_aliases)
     language = language.split('-')[0]
-    params['headers']['Accept-Language'] = language
     return params
 
 

+ 1 - 0
searx/engines/duckduckgo_images.py

@@ -30,6 +30,7 @@ about = {
 categories = ['images', 'web']
 paging = True
 safesearch = True
+send_accept_language_header = True
 
 # search-url
 images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'

+ 1 - 10
searx/engines/google.py

@@ -45,6 +45,7 @@ categories = ['general', 'web']
 paging = True
 time_range_support = True
 safesearch = True
+send_accept_language_header = True
 use_mobile_ui = False
 supported_languages_url = 'https://www.google.com/preferences?#languages'
 
@@ -241,16 +242,6 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
         # language.
         ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
 
-        # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
-        ret_val['headers']['Accept-Language'] = ','.join(
-            [
-                lang_country,
-                language + ';q=0.8,',
-                'en;q=0.6',
-                '*;q=0.5',
-            ]
-        )
-
     return ret_val
 
 

+ 1 - 1
searx/engines/google_images.py

@@ -51,6 +51,7 @@ paging = False
 use_locale_domain = True
 time_range_support = True
 safesearch = True
+send_accept_language_header = True
 
 filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
 
@@ -125,7 +126,6 @@ def request(query, params):
     """Google-Video search request"""
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
     query_url = (
         'https://'

+ 1 - 1
searx/engines/google_news.py

@@ -70,13 +70,13 @@ time_range_support = True
 #
 #  safesearch : results are identitical for safesearch=0 and safesearch=2
 safesearch = False
+send_accept_language_header = True
 
 
 def request(query, params):
     """Google-News search request"""
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
     # google news has only one domain
     lang_info['subdomain'] = 'news.google.com'

+ 2 - 0
searx/engines/google_play_apps.py

@@ -22,6 +22,8 @@ about = {
 }
 
 categories = ["files", "apps"]
+send_accept_language_header = True
+
 search_url = "https://play.google.com/store/search?{query}&c=apps"
 
 

+ 1 - 1
searx/engines/google_scholar.py

@@ -52,6 +52,7 @@ language_support = True
 use_locale_domain = True
 time_range_support = True
 safesearch = False
+send_accept_language_header = True
 
 
 def time_range_url(params):
@@ -75,7 +76,6 @@ def request(query, params):
 
     offset = (params['pageno'] - 1) * 10
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
     # subdomain is: scholar.google.xy
     lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")

+ 1 - 1
searx/engines/google_videos.py

@@ -60,6 +60,7 @@ language_support = True
 use_locale_domain = True
 time_range_support = True
 safesearch = True
+send_accept_language_header = True
 
 RE_CACHE = {}
 
@@ -111,7 +112,6 @@ def request(query, params):
     """Google-Video search request"""
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
     query_url = (
         'https://'

+ 3 - 3
searx/engines/openstreetmap.py

@@ -30,6 +30,7 @@ about = {
 categories = ['map']
 paging = False
 language_support = True
+send_accept_language_header = True
 
 # search-url
 base_url = 'https://nominatim.openstreetmap.org/'
@@ -142,9 +143,8 @@ def request(query, params):
     params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
     params['route'] = route_re.match(query)
     params['headers']['User-Agent'] = searx_useragent()
-
-    accept_language = 'en' if params['language'] == 'all' else params['language']
-    params['headers']['Accept-Language'] = accept_language
+    if 'Accept-Language' not in params['headers']:
+        params['headers']['Accept-Language'] = 'en'
     return params
 
 

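The openstreetmap.py change above turns the old hard-coded header into a fallback:
when send_accept_language_header has already put an Accept-Language header into the
request (any selected locale that babel can parse), it is kept, and only the 'all'
or unparseable case falls back to 'en'. A tiny hypothetical illustration, with
headers standing in for params['headers']:

    headers_with_locale = {'Accept-Language': 'fr-CH,fr;q=0.9,*;q=0.5'}  # set by the processor
    headers_for_all = {}                                                 # locale 'all' does not parse

    for headers in (headers_with_locale, headers_for_all):
        if 'Accept-Language' not in headers:
            headers['Accept-Language'] = 'en'   # the fallback only triggers here
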
+ 3 - 3
searx/engines/wikipedia.py

@@ -19,6 +19,9 @@ about = {
     "results": 'JSON',
 }
 
+
+send_accept_language_header = True
+
 # search-url
 search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
 supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
@@ -41,9 +44,6 @@ def request(query, params):
     language = url_lang(params['language'])
     params['url'] = search_url.format(title=quote(query), language=language)
 
-    if params['language'].lower() in language_variants.get(language, []):
-        params['headers']['Accept-Language'] = params['language'].lower()
-
     params['headers']['User-Agent'] = searx_useragent()
     params['raise_for_httperror'] = False
     params['soft_max_redirects'] = 2

+ 9 - 0
searx/search/models.py

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 
 import typing
+import babel
 
 
 class EngineRef:
@@ -29,6 +30,7 @@ class SearchQuery:
         'query',
         'engineref_list',
         'lang',
+        'locale',
         'safesearch',
         'pageno',
         'time_range',
@@ -59,6 +61,13 @@ class SearchQuery:
         self.external_bang = external_bang
         self.engine_data = engine_data or {}
 
+        self.locale = None
+        if self.lang:
+            try:
+                self.locale = babel.Locale.parse(self.lang, sep='-')
+            except babel.core.UnknownLocaleError:
+                pass
+
     @property
     def categories(self):
         return list(set(map(lambda engineref: engineref.category, self.engineref_list)))

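The new locale attribute in SearchQuery is just the selected language parsed with
babel; a quick hypothetical snippet showing what babel.Locale.parse yields for
typical selections, and why the try/except is needed:

    import babel

    for lang in ('fr-CH', 'de', 'all'):
        try:
            locale = babel.Locale.parse(lang, sep='-')
            print(lang, '->', locale.language, locale.territory)   # 'fr' 'CH' / 'de' None
        except babel.core.UnknownLocaleError:
            # 'all' (no specific language) is not a locale; SearchQuery.locale stays None
            print(lang, '-> None')
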
+ 11 - 0
searx/search/processors/online.py

@@ -60,6 +60,17 @@ class OnlineProcessor(EngineProcessor):
         # add an user agent
         params['headers']['User-Agent'] = gen_useragent()
 
+        # add Accept-Language header
+        if self.engine.send_accept_language_header and search_query.locale:
+            ac_lang = search_query.locale.language
+            if search_query.locale.territory:
+                ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
+                    search_query.locale.language,
+                    search_query.locale.territory,
+                    search_query.locale.language,
+                )
+            params['headers']['Accept-Language'] = ac_lang
+
         return params
 
     def _send_http_request(self, params):

+ 1 - 0
searx/settings.yml

@@ -748,6 +748,7 @@ engines:
 
   - name: google play movies
     engine: xpath
+    send_accept_language_header: true
     search_url: https://play.google.com/store/search?q={query}&c=movies
     results_xpath: '//div[@class="ImZGtf mpg5gc"]'
     title_xpath: './/div[@class="RZEgze"]//div[@class="kCSSQe"]//a'