Browse Source

Merge pull request #2347 from return42/mod-lang-detection

If language recognition fails, use the Accept-Language header
Markus Heiser 2 years ago
parent
commit
45529f51a1
5 changed files with 106 additions and 71 deletions
  1. 59 2
      searx/preferences.py
  2. 2 41
      searx/search/__init__.py
  3. 1 1
      searx/settings.yml
  4. 30 2
      searx/webadapter.py
  5. 14 25
      searx/webapp.py

+ 59 - 2
searx/preferences.py

@@ -8,9 +8,10 @@
 from base64 import urlsafe_b64encode, urlsafe_b64decode
 from zlib import compress, decompress
 from urllib.parse import parse_qs, urlencode
-from typing import Iterable, Dict, List
+from typing import Iterable, Dict, List, Optional
 
 import flask
+import babel
 
 from searx import settings, autocomplete
 from searx.enginelib import Engine
@@ -287,10 +288,65 @@ class PluginsSetting(BooleanChoices):
         return [item[len('plugin_') :] for item in items]
 
 
+class ClientPref:
+    """Container to assemble client preferences and settings."""
+
+    # hint: searx.webapp.get_client_settings should be moved into this class
+
+    locale: Optional[babel.Locale]
+    """Locale preferred by the client, ``None`` if the client states none."""
+
+    def __init__(self, locale: Optional[babel.Locale] = None):
+        self.locale = locale
+
+    @property
+    def locale_tag(self) -> Optional[str]:
+        """Tag of :py:obj:`ClientPref.locale` in the form ``language`` or
+        ``language-TERRITORY``; ``None`` if no locale is set."""
+        if self.locale is None:
+            return None
+        tag = self.locale.language
+        if self.locale.territory:
+            tag += '-' + self.locale.territory
+        return tag
+
+    @classmethod
+    def from_http_request(cls, http_request: flask.Request):
+        """Build ClientPref object from HTTP request.
+
+        - `Accept-Language used for locale setting
+          <https://www.w3.org/International/questions/qa-accept-lang-locales.en>`__
+
+        The entries of the ``Accept-Language`` header are parsed one by one; an
+        entry without a ``q`` parameter is weighted with ``q=1``.  Entries with
+        a malformed weight or a language tag babel can't parse are skipped.
+        The locale with the highest weight is taken, on equal weights the one
+        listed first wins.  If the header is missing, empty, or none of its
+        entries can be parsed, the returned object has no locale (``None``).
+
+        """
+        al_header = http_request.headers.get("Accept-Language")
+        if not al_header:
+            return cls(locale=None)
+
+        pairs = []
+        for l in al_header.split(','):
+            # fmt: off
+            lang, qvalue = [_.strip() for _ in (l.split(';') + ['q=1',])[:2]]
+            # fmt: on
+            try:
+                qvalue = float(qvalue.split('=')[-1])
+                locale = babel.Locale.parse(lang, sep='-')
+            except (ValueError, babel.core.UnknownLocaleError):
+                continue
+            pairs.append((locale, qvalue))
+
+        if not pairs:
+            # Header was sent but nothing in it was usable (e.g. a lone ``*``);
+            # without this guard ``pairs[0]`` raises an IndexError.
+            return cls(locale=None)
+
+        # list.sort is stable, entries of equal weight keep their header order
+        pairs.sort(reverse=True, key=lambda x: x[1])
+        return cls(locale=pairs[0][0])
+
+
 class Preferences:
     """Validates and saves preferences to cookies"""
 
-    def __init__(self, themes: List[str], categories: List[str], engines: Dict[str, Engine], plugins: Iterable[Plugin]):
+    def __init__(
+        self,
+        themes: List[str],
+        categories: List[str],
+        engines: Dict[str, Engine],
+        plugins: Iterable[Plugin],
+        client: Optional[ClientPref] = None,
+    ):
+
         super().__init__()
 
         self.key_value_settings: Dict[str, Setting] = {
@@ -414,6 +470,7 @@ class Preferences:
         self.engines = EnginesSetting('engines', engines=engines.values())
         self.plugins = PluginsSetting('plugins', plugins=plugins)
         self.tokens = SetSetting('tokens')
+        self.client = client or ClientPref()
         self.unknown_params: Dict[str, str] = {}
 
     def get_as_url_params(self):

+ 2 - 41
searx/search/__init__.py

@@ -22,7 +22,6 @@ from searx.network import initialize as initialize_network, check_network_config
 from searx.metrics import initialize as initialize_metrics, counter_inc, histogram_observe_time
 from searx.search.processors import PROCESSORS, initialize as initialize_processors
 from searx.search.checker import initialize as initialize_checker
-from searx.utils import detect_language
 
 
 logger = logger.getChild('search')
@@ -40,57 +39,19 @@ def initialize(settings_engines=None, enable_checker=False, check_network=False,
         initialize_checker()
 
 
-def replace_auto_language(search_query: SearchQuery):
-    """
-    Do nothing except if `search_query.lang` is "auto".
-    In this case:
-    * the value "auto" is replaced by the detected language of the query.
-      The default value is "all" when no language is detected.
-    * `search_query.locale` is updated accordingly
-
-    Use :py:obj:`searx.utils.detect_language` with `only_search_languages=True` to keep
-    only languages supported by the engines.
-    """
-    if search_query.lang != 'auto':
-        return
-
-    detected_lang = detect_language(search_query.query, threshold=0.3, only_search_languages=True)
-    if detected_lang is None:
-        # fallback to 'all' if no language has been detected
-        search_query.lang = 'all'
-        search_query.locale = None
-        return
-    search_query.lang = detected_lang
-    try:
-        search_query.locale = babel.Locale.parse(search_query.lang)
-    except babel.core.UnknownLocaleError:
-        search_query.locale = None
-
-
 class Search:
     """Search information container"""
 
     __slots__ = "search_query", "result_container", "start_time", "actual_timeout"
 
     def __init__(self, search_query: SearchQuery):
-        """Initialize the Search
-
-        search_query is copied
-        """
+        """Initialize the Search"""
         # init vars
         super().__init__()
+        self.search_query = search_query
         self.result_container = ResultContainer()
         self.start_time = None
         self.actual_timeout = None
-        self.search_query = copy(search_query)
-        self.update_search_query(self.search_query)
-
-    def update_search_query(self, search_query: SearchQuery):
-        """Update search_query.
-
-        call replace_auto_language to replace the "auto" language
-        """
-        replace_auto_language(search_query)
 
     def search_external_bang(self):
         """

+ 1 - 1
searx/settings.yml

@@ -31,7 +31,7 @@ search:
   autocomplete_min: 4
   # Default search language - leave blank to detect from browser information or
   # use codes from 'languages.py'
-  default_lang: ""
+  default_lang: "auto"
   # Available languages
   # languages:
   #   - all

+ 30 - 2
searx/webadapter.py

@@ -6,6 +6,7 @@ from searx.query import RawTextQuery
 from searx.engines import categories, engines
 from searx.search import SearchQuery, EngineRef
 from searx.preferences import Preferences, is_locked
+from searx.utils import detect_language
 
 
 # remove duplicate queries.
@@ -214,7 +215,27 @@ def parse_engine_data(form):
 
 def get_search_query_from_webapp(
     preferences: Preferences, form: Dict[str, str]
-) -> Tuple[SearchQuery, RawTextQuery, List[EngineRef], List[EngineRef]]:
+) -> Tuple[SearchQuery, RawTextQuery, List[EngineRef], List[EngineRef], str]:
+    """Assemble data from preferences and request.form (from the HTML form) needed
+    in a search query.
+
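+    The returned tuple consists of:
+
+    1. instance of :py:obj:`searx.search.SearchQuery`
+    2. instance of :py:obj:`searx.query.RawTextQuery`
+    3. list of :py:obj:`searx.search.EngineRef` instances (``query_engineref_list_unknown``)
+    4. list of :py:obj:`searx.search.EngineRef` instances (``query_engineref_list_notoken``)
+    5. string with the *selected locale* of the query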
+
+    About language/locale: if the client selects the alias ``auto``, the
+    ``SearchQuery`` object is built from the :py:obj:`detected language
+    <searx.utils.detect_language>`.  If language recognition does not find a
+    match, the language preferred by the :py:obj:`Preferences.client` is used.
+    If the client does not have a preference, the default ``all`` is used.
+
+    The *selected locale* in the tuple always represents the selected
+    language/locale and might differ from the language recognition.
+
+    """
     # no text for the query ?
     if not form.get('q'):
         raise SearxParameterException('q', '')
@@ -229,13 +250,19 @@ def get_search_query_from_webapp(
     # set query
     query = raw_text_query.getQuery()
     query_pageno = parse_pageno(form)
-    query_lang = parse_lang(preferences, form, raw_text_query)
     query_safesearch = parse_safesearch(preferences, form)
     query_time_range = parse_time_range(form)
     query_timeout = parse_timeout(form, raw_text_query)
     external_bang = raw_text_query.external_bang
     engine_data = parse_engine_data(form)
 
+    query_lang = parse_lang(preferences, form, raw_text_query)
+    selected_locale = query_lang
+
+    if query_lang == 'auto':
+        query_lang = detect_language(query, threshold=0.8, only_search_languages=True)
+        query_lang = query_lang or preferences.client.locale_tag or 'all'
+
     if not is_locked('categories') and raw_text_query.specific:
         # if engines are calculated from query,
         # set categories by using that information
@@ -265,4 +292,5 @@ def get_search_query_from_webapp(
         raw_text_query,
         query_engineref_list_unknown,
         query_engineref_list_notoken,
+        selected_locale,
     )

+ 14 - 25
searx/webapp.py

@@ -84,6 +84,7 @@ from searx.webutils import (
 from searx.webadapter import (
     get_search_query_from_webapp,
     get_selected_categories,
+    parse_lang,
 )
 from searx.utils import (
     html_to_text,
@@ -96,6 +97,7 @@ from searx.plugins import Plugin, plugins, initialize as plugin_initialize
 from searx.plugins.oa_doi_rewrite import get_doi_resolver
 from searx.preferences import (
     Preferences,
+    ClientPref,
     ValidationException,
 )
 from searx.answerers import (
@@ -221,16 +223,9 @@ babel = Babel(app, locale_selector=get_locale)
 
 
 def _get_browser_language(req, lang_list):
-    for lang in req.headers.get("Accept-Language", "en").split(","):
-        if ';' in lang:
-            lang = lang.split(';')[0]
-        if '-' in lang:
-            lang_parts = lang.split('-')
-            lang = "{}-{}".format(lang_parts[0], lang_parts[-1].upper())
-        locale = match_locale(lang, lang_list, fallback=None)
-        if locale is not None:
-            return locale
-    return 'en'
+    client = ClientPref.from_http_request(req)
+    locale = match_locale(client.locale_tag, lang_list, fallback='en')
+    return locale
 
 
 def _get_locale_rfc5646(locale):
@@ -446,11 +441,7 @@ def render(template_name: str, **kwargs):
         kwargs['rtl'] = True
 
     if 'current_language' not in kwargs:
-        _locale = request.preferences.get_value('language')
-        if _locale in ('auto', 'all'):
-            kwargs['current_language'] = _locale
-        else:
-            kwargs['current_language'] = match_locale(_locale, settings['search']['languages'])
+        kwargs['current_language'] = parse_lang(request.preferences, {}, RawTextQuery('', []))
 
     # values from settings
     kwargs['search_formats'] = [x for x in settings['search']['formats'] if x != 'html']
@@ -512,7 +503,10 @@ def pre_request():
     request.timings = []  # pylint: disable=assigning-non-slot
     request.errors = []  # pylint: disable=assigning-non-slot
 
-    preferences = Preferences(themes, list(categories.keys()), engines, plugins)  # pylint: disable=redefined-outer-name
+    client_pref = ClientPref.from_http_request(request)
+    # pylint: disable=redefined-outer-name
+    preferences = Preferences(themes, list(categories.keys()), engines, plugins, client_pref)
+
     user_agent = request.headers.get('User-Agent', '').lower()
     if 'webkit' in user_agent and 'android' in user_agent:
         preferences.key_value_settings['method'].value = 'GET'
@@ -681,7 +675,9 @@ def search():
     raw_text_query = None
     result_container = None
     try:
-        search_query, raw_text_query, _, _ = get_search_query_from_webapp(request.preferences, request.form)
+        search_query, raw_text_query, _, _, selected_locale = get_search_query_from_webapp(
+            request.preferences, request.form
+        )
         # search = Search(search_query) #  without plugins
         search = SearchWithPlugins(search_query, request.user_plugins, request)  # pylint: disable=redefined-outer-name
 
@@ -812,13 +808,6 @@ def search():
         )
     )
 
-    if search_query.lang in ('auto', 'all'):
-        current_language = search_query.lang
-    else:
-        current_language = match_locale(
-            search_query.lang, settings['search']['languages'], fallback=request.preferences.get_value("language")
-        )
-
     # search_query.lang contains the user choice (all, auto, en, ...)
     # when the user choice is "auto", search.search_query.lang contains the detected language
     # otherwise it is equal to search_query.lang
@@ -841,7 +830,7 @@ def search():
             result_container.unresponsive_engines
         ),
         current_locale = request.preferences.get_value("locale"),
-        current_language = current_language,
+        current_language = selected_locale,
         search_language = match_locale(
             search.search_query.lang,
             settings['search']['languages'],