Browse Source

Merge pull request #1560 from return42/http-accept-language

[mod] add 'Accept-Language' HTTP header to online processors
Markus Heiser 2 years ago
parent
commit
7c9c112484

+ 8 - 0
docs/admin/engines/settings.rst

@@ -440,6 +440,7 @@ engine is shown.  Most of the options have a default value or even are optional.
      engine: example
      engine: example
      shortcut: demo
      shortcut: demo
      base_url: 'https://{language}.example.com/'
      base_url: 'https://{language}.example.com/'
+     send_accept_language_header: false
      categories: general
      categories: general
      timeout: 3.0
      timeout: 3.0
      api_key: 'apikey'
      api_key: 'apikey'
@@ -488,6 +489,13 @@ engine is shown.  Most of the options have a default value or even are optional.
   use multiple sites using only one engine, or updating the site URL without
   use multiple sites using only one engine, or updating the site URL without
   touching at the code.
   touching at the code.
 
 
+``send_accept_language_header`` :
+  Several engines that support languages (or regions) deal with the HTTP header
+  ``Accept-Language`` to build a response that fits the locale.  When this
+  option is activated, the language (locale) that is selected by the user is used
+  to build and send an ``Accept-Language`` header in the request to the origin
+  search engine.
+
 ``categories`` : optional
 ``categories`` : optional
   Define in which categories this engine will be active.  Most of the time, it is
   Define in which categories this engine will be active.  Most of the time, it is
   defined in the code of the engine, but in a few cases it is useful, like when
   defined in the code of the engine, but in a few cases it is useful, like when

+ 1 - 0
searx/engines/__init__.py

@@ -44,6 +44,7 @@ ENGINE_DEFAULT_ARGS = {
     "enable_http": False,
     "enable_http": False,
     "using_tor_proxy": False,
     "using_tor_proxy": False,
     "display_error_messages": True,
     "display_error_messages": True,
+    "send_accept_language_header": False,
     "tokens": [],
     "tokens": [],
     "about": {},
     "about": {},
 }
 }

+ 1 - 1
searx/engines/bing.py

@@ -25,6 +25,7 @@ categories = ['general', 'web']
 paging = True
 paging = True
 time_range_support = False
 time_range_support = False
 safesearch = False
 safesearch = False
+send_accept_language_header = True
 supported_languages_url = 'https://www.bing.com/account/general'
 supported_languages_url = 'https://www.bing.com/account/general'
 language_aliases = {}
 language_aliases = {}
 
 
@@ -68,7 +69,6 @@ def request(query, params):
         logger.debug("headers.Referer --> %s", referer)
         logger.debug("headers.Referer --> %s", referer)
 
 
     params['url'] = base_url + search_path
     params['url'] = base_url + search_path
-    params['headers']['Accept-Language'] = "en-US,en;q=0.5"
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     return params
     return params
 
 

+ 1 - 0
searx/engines/bing_images.py

@@ -31,6 +31,7 @@ categories = ['images', 'web']
 paging = True
 paging = True
 safesearch = True
 safesearch = True
 time_range_support = True
 time_range_support = True
+send_accept_language_header = True
 supported_languages_url = 'https://www.bing.com/account/general'
 supported_languages_url = 'https://www.bing.com/account/general'
 number_of_results = 28
 number_of_results = 28
 
 

+ 1 - 0
searx/engines/bing_news.py

@@ -34,6 +34,7 @@ about = {
 categories = ['news']
 categories = ['news']
 paging = True
 paging = True
 time_range_support = True
 time_range_support = True
+send_accept_language_header = True
 
 
 # search-url
 # search-url
 base_url = 'https://www.bing.com/'
 base_url = 'https://www.bing.com/'

+ 1 - 4
searx/engines/bing_videos.py

@@ -30,6 +30,7 @@ categories = ['videos', 'web']
 paging = True
 paging = True
 safesearch = True
 safesearch = True
 time_range_support = True
 time_range_support = True
+send_accept_language_header = True
 number_of_results = 28
 number_of_results = 28
 
 
 base_url = 'https://www.bing.com/'
 base_url = 'https://www.bing.com/'
@@ -70,10 +71,6 @@ def request(query, params):
     if params['time_range'] in time_range_dict:
     if params['time_range'] in time_range_dict:
         params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
         params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
 
 
-    # bing videos did not like "older" versions < 70.0.1 when selectin other
-    # languages then 'en' .. very strange ?!?!
-    params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0.1) Gecko/20100101 Firefox/73.0.1'
-
     return params
     return params
 
 
 
 

+ 1 - 0
searx/engines/demo_online.py

@@ -20,6 +20,7 @@ from json import loads
 from urllib.parse import urlencode
 from urllib.parse import urlencode
 
 
 engine_type = 'online'
 engine_type = 'online'
+send_accept_language_header = True
 categories = ['general']
 categories = ['general']
 disabled = True
 disabled = True
 timeout = 2.0
 timeout = 2.0

+ 1 - 0
searx/engines/duckduckgo.py

@@ -31,6 +31,7 @@ categories = ['general', 'web']
 paging = True
 paging = True
 supported_languages_url = 'https://duckduckgo.com/util/u588.js'
 supported_languages_url = 'https://duckduckgo.com/util/u588.js'
 time_range_support = True
 time_range_support = True
+send_accept_language_header = True
 
 
 language_aliases = {
 language_aliases = {
     'ar-SA': 'ar-XA',
     'ar-SA': 'ar-XA',

+ 2 - 1
searx/engines/duckduckgo_definitions.py

@@ -27,6 +27,8 @@ about = {
     "results": 'JSON',
     "results": 'JSON',
 }
 }
 
 
+send_accept_language_header = True
+
 URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
 URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
 
 
 WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
 WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
@@ -62,7 +64,6 @@ def request(query, params):
     params['url'] = URL.format(query=urlencode({'q': query}))
     params['url'] = URL.format(query=urlencode({'q': query}))
     language = match_language(params['language'], supported_languages, language_aliases)
     language = match_language(params['language'], supported_languages, language_aliases)
     language = language.split('-')[0]
     language = language.split('-')[0]
-    params['headers']['Accept-Language'] = language
     return params
     return params
 
 
 
 

+ 1 - 0
searx/engines/duckduckgo_images.py

@@ -30,6 +30,7 @@ about = {
 categories = ['images', 'web']
 categories = ['images', 'web']
 paging = True
 paging = True
 safesearch = True
 safesearch = True
+send_accept_language_header = True
 
 
 # search-url
 # search-url
 images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'
 images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'

+ 1 - 10
searx/engines/google.py

@@ -45,6 +45,7 @@ categories = ['general', 'web']
 paging = True
 paging = True
 time_range_support = True
 time_range_support = True
 safesearch = True
 safesearch = True
+send_accept_language_header = True
 use_mobile_ui = False
 use_mobile_ui = False
 supported_languages_url = 'https://www.google.com/preferences?#languages'
 supported_languages_url = 'https://www.google.com/preferences?#languages'
 
 
@@ -241,16 +242,6 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
         # language.
         # language.
         ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
         ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
 
 
-        # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
-        ret_val['headers']['Accept-Language'] = ','.join(
-            [
-                lang_country,
-                language + ';q=0.8,',
-                'en;q=0.6',
-                '*;q=0.5',
-            ]
-        )
-
     return ret_val
     return ret_val
 
 
 
 

+ 1 - 1
searx/engines/google_images.py

@@ -51,6 +51,7 @@ paging = False
 use_locale_domain = True
 use_locale_domain = True
 time_range_support = True
 time_range_support = True
 safesearch = True
 safesearch = True
+send_accept_language_header = True
 
 
 filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
 filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
 
 
@@ -125,7 +126,6 @@ def request(query, params):
     """Google-Video search request"""
     """Google-Video search request"""
 
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
 
     query_url = (
     query_url = (
         'https://'
         'https://'

+ 1 - 1
searx/engines/google_news.py

@@ -70,13 +70,13 @@ time_range_support = True
 #
 #
 #  safesearch : results are identitical for safesearch=0 and safesearch=2
 #  safesearch : results are identitical for safesearch=0 and safesearch=2
 safesearch = False
 safesearch = False
+send_accept_language_header = True
 
 
 
 
 def request(query, params):
 def request(query, params):
     """Google-News search request"""
     """Google-News search request"""
 
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
 
     # google news has only one domain
     # google news has only one domain
     lang_info['subdomain'] = 'news.google.com'
     lang_info['subdomain'] = 'news.google.com'

+ 2 - 0
searx/engines/google_play_apps.py

@@ -22,6 +22,8 @@ about = {
 }
 }
 
 
 categories = ["files", "apps"]
 categories = ["files", "apps"]
+send_accept_language_header = True
+
 search_url = "https://play.google.com/store/search?{query}&c=apps"
 search_url = "https://play.google.com/store/search?{query}&c=apps"
 
 
 
 

+ 1 - 1
searx/engines/google_scholar.py

@@ -52,6 +52,7 @@ language_support = True
 use_locale_domain = True
 use_locale_domain = True
 time_range_support = True
 time_range_support = True
 safesearch = False
 safesearch = False
+send_accept_language_header = True
 
 
 
 
 def time_range_url(params):
 def time_range_url(params):
@@ -75,7 +76,6 @@ def request(query, params):
 
 
     offset = (params['pageno'] - 1) * 10
     offset = (params['pageno'] - 1) * 10
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
 
     # subdomain is: scholar.google.xy
     # subdomain is: scholar.google.xy
     lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")
     lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")

+ 1 - 1
searx/engines/google_videos.py

@@ -60,6 +60,7 @@ language_support = True
 use_locale_domain = True
 use_locale_domain = True
 time_range_support = True
 time_range_support = True
 safesearch = True
 safesearch = True
+send_accept_language_header = True
 
 
 RE_CACHE = {}
 RE_CACHE = {}
 
 
@@ -111,7 +112,6 @@ def request(query, params):
     """Google-Video search request"""
     """Google-Video search request"""
 
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
 
     query_url = (
     query_url = (
         'https://'
         'https://'

+ 3 - 3
searx/engines/openstreetmap.py

@@ -30,6 +30,7 @@ about = {
 categories = ['map']
 categories = ['map']
 paging = False
 paging = False
 language_support = True
 language_support = True
+send_accept_language_header = True
 
 
 # search-url
 # search-url
 base_url = 'https://nominatim.openstreetmap.org/'
 base_url = 'https://nominatim.openstreetmap.org/'
@@ -142,9 +143,8 @@ def request(query, params):
     params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
     params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
     params['route'] = route_re.match(query)
     params['route'] = route_re.match(query)
     params['headers']['User-Agent'] = searx_useragent()
     params['headers']['User-Agent'] = searx_useragent()
-
-    accept_language = 'en' if params['language'] == 'all' else params['language']
-    params['headers']['Accept-Language'] = accept_language
+    if 'Accept-Language' not in params['headers']:
+        params['headers']['Accept-Language'] = 'en'
     return params
     return params
 
 
 
 

+ 3 - 3
searx/engines/wikipedia.py

@@ -19,6 +19,9 @@ about = {
     "results": 'JSON',
     "results": 'JSON',
 }
 }
 
 
+
+send_accept_language_header = True
+
 # search-url
 # search-url
 search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
 search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
 supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
 supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
@@ -41,9 +44,6 @@ def request(query, params):
     language = url_lang(params['language'])
     language = url_lang(params['language'])
     params['url'] = search_url.format(title=quote(query), language=language)
     params['url'] = search_url.format(title=quote(query), language=language)
 
 
-    if params['language'].lower() in language_variants.get(language, []):
-        params['headers']['Accept-Language'] = params['language'].lower()
-
     params['headers']['User-Agent'] = searx_useragent()
     params['headers']['User-Agent'] = searx_useragent()
     params['raise_for_httperror'] = False
     params['raise_for_httperror'] = False
     params['soft_max_redirects'] = 2
     params['soft_max_redirects'] = 2

+ 9 - 0
searx/search/models.py

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # SPDX-License-Identifier: AGPL-3.0-or-later
 
 
 import typing
 import typing
+import babel
 
 
 
 
 class EngineRef:
 class EngineRef:
@@ -29,6 +30,7 @@ class SearchQuery:
         'query',
         'query',
         'engineref_list',
         'engineref_list',
         'lang',
         'lang',
+        'locale',
         'safesearch',
         'safesearch',
         'pageno',
         'pageno',
         'time_range',
         'time_range',
@@ -59,6 +61,13 @@ class SearchQuery:
         self.external_bang = external_bang
         self.external_bang = external_bang
         self.engine_data = engine_data or {}
         self.engine_data = engine_data or {}
 
 
+        self.locale = None
+        if self.lang:
+            try:
+                self.locale = babel.Locale.parse(self.lang, sep='-')
+            except babel.core.UnknownLocaleError:
+                pass
+
     @property
     @property
     def categories(self):
     def categories(self):
         return list(set(map(lambda engineref: engineref.category, self.engineref_list)))
         return list(set(map(lambda engineref: engineref.category, self.engineref_list)))

+ 7 - 0
searx/search/processors/abstract.py

@@ -138,6 +138,13 @@ class EngineProcessor(ABC):
         return False
         return False
 
 
     def get_params(self, search_query, engine_category):
     def get_params(self, search_query, engine_category):
+        """Returns a set of *request params* or ``None`` if request is not supported.
+
+        Not supported conditions (``None`` is returned):
+
+        - A page-number > 1 when engine does not support paging.
+        - A time range when the engine does not support time range.
+        """
         # if paging is not supported, skip
         # if paging is not supported, skip
         if search_query.pageno > 1 and not self.engine.paging:
         if search_query.pageno > 1 and not self.engine.paging:
             return None
             return None

+ 11 - 0
searx/search/processors/online.py

@@ -60,6 +60,17 @@ class OnlineProcessor(EngineProcessor):
         # add an user agent
         # add an user agent
         params['headers']['User-Agent'] = gen_useragent()
         params['headers']['User-Agent'] = gen_useragent()
 
 
+        # add Accept-Language header
+        if self.engine.send_accept_language_header and search_query.locale:
+            ac_lang = search_query.locale.language
+            if search_query.locale.territory:
+                ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
+                    search_query.locale.language,
+                    search_query.locale.territory,
+                    search_query.locale.language,
+                )
+            params['headers']['Accept-Language'] = ac_lang
+
         return params
         return params
 
 
     def _send_http_request(self, params):
     def _send_http_request(self, params):

+ 3 - 0
searx/search/processors/online_currency.py

@@ -38,6 +38,9 @@ class OnlineCurrencyProcessor(OnlineProcessor):
     engine_type = 'online_currency'
     engine_type = 'online_currency'
 
 
     def get_params(self, search_query, engine_category):
     def get_params(self, search_query, engine_category):
+        """Returns a set of *request params* or ``None`` if the search query does
+        not match :py:obj:`parser_re`."""
+
         params = super().get_params(search_query, engine_category)
         params = super().get_params(search_query, engine_category)
         if params is None:
         if params is None:
             return None
             return None

+ 2 - 0
searx/search/processors/online_dictionary.py

@@ -18,6 +18,8 @@ class OnlineDictionaryProcessor(OnlineProcessor):
     engine_type = 'online_dictionary'
     engine_type = 'online_dictionary'
 
 
     def get_params(self, search_query, engine_category):
     def get_params(self, search_query, engine_category):
+        """Returns a set of *request params* or ``None`` if the search query does
+        not match :py:obj:`parser_re`."""
         params = super().get_params(search_query, engine_category)
         params = super().get_params(search_query, engine_category)
         if params is None:
         if params is None:
             return None
             return None

+ 3 - 0
searx/search/processors/online_url_search.py

@@ -20,6 +20,9 @@ class OnlineUrlSearchProcessor(OnlineProcessor):
     engine_type = 'online_url_search'
     engine_type = 'online_url_search'
 
 
     def get_params(self, search_query, engine_category):
     def get_params(self, search_query, engine_category):
+        """Returns a set of *request params* or ``None`` if the search query does
+        not match at least one of :py:obj:`re_search_urls`.
+        """
         params = super().get_params(search_query, engine_category)
         params = super().get_params(search_query, engine_category)
         if params is None:
         if params is None:
             return None
             return None

+ 1 - 0
searx/settings.yml

@@ -748,6 +748,7 @@ engines:
 
 
   - name: google play movies
   - name: google play movies
     engine: xpath
     engine: xpath
+    send_accept_language_header: true
     search_url: https://play.google.com/store/search?q={query}&c=movies
     search_url: https://play.google.com/store/search?q={query}&c=movies
     results_xpath: '//div[@class="ImZGtf mpg5gc"]'
     results_xpath: '//div[@class="ImZGtf mpg5gc"]'
     title_xpath: './/div[@class="RZEgze"]//div[@class="kCSSQe"]//a'
     title_xpath: './/div[@class="RZEgze"]//div[@class="kCSSQe"]//a'