2 years ago · d669da81fb
--- a/docs/src/searx.plugins.autodetect_search_language.rst
+++ b/docs/src/searx.plugins.autodetect_search_language.rst
@@ -1,8 +0,0 @@
 
				-.. _autodetect search language:
			
 
				-
			
 
				-======================
			
 
				-Search language plugin
			
 
				-======================
			
 
				-
			
 
				-.. automodule:: searx.plugins.autodetect_search_language
			
 
				-  :members:
			
--- a/searx/plugins/autodetect_search_language.py
+++ b/searx/plugins/autodetect_search_language.py
@@ -1,97 +0,0 @@
 
				-# SPDX-License-Identifier: AGPL-3.0-or-later
			
 
				-# lint: pylint
			
 
				-"""Plugin to detect the search language from the search query.
			
 
				-
			
 
				-The language detection is done by using the fastText_ library (`python
			
 
				-fasttext`_). fastText_ distributes the `language identification model`_, for
			
 
				-reference:
			
 
				-
			
 
				-- `FastText.zip: Compressing text classification models`_
			
 
				-- `Bag of Tricks for Efficient Text Classification`_
			
 
				-
			
 
				-The `language identification model`_ support the language codes (ISO-639-3)::
			
 
				-
			
 
				-   af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
			
 
				-   ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
			
 
				-   fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io
			
 
				-   is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv
			
 
				-   mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn
			
 
				-   no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd
			
 
				-   sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep
			
 
				-   vi vls vo wa war wuu xal xmf yi yo yue zh
			
 
				-
			
 
				-The `language identification model`_ is harmonized with the SearXNG's language
			
 
				-(locale) model.  General conditions of SearXNG's locale model are:
			
 
				-
			
 
				-a. SearXNG's locale of a query is passed to the
			
 
				-   :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
			
 
				-   code that is used by an engine.
			
 
				-
			
 
				-b. SearXNG and most of the engines do not support all the languages from
			
 
				-   language model and there might be also a discrepancy in the ISO-639-3 and
			
 
				-   ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`).  Further
			
 
				-   more, in SearXNG the locales like ``zh-TH`` (``zh-CN``) are mapped to
			
 
				-   ``zh_Hant`` (``zh_Hans``).
			
 
				-
			
 
				-Conclusion: This plugin does only auto-detect the languages a user can select in
			
 
				-the language menu (:py:obj:`supported_langs`).
			
 
				-
			
 
				-SearXNG's locale of a query comes from (*highest wins*):
			
 
				-
			
 
				-1. The ``Accept-Language`` header from user's HTTP client.
			
 
				-2. The user select a locale in the preferences.
			
 
				-3. The user select a locale from the menu in the query form (e.g. ``:zh-TW``)
			
 
				-4. This plugin is activated in the preferences and the locale (only the language
			
 
				-   code / none region code) comes from the fastText's language detection.
			
 
				-
			
 
				-Conclusion: There is a conflict between the language selected by the user and
			
 
				-the language from language detection of this plugin.  For example, the user
			
 
				-explicitly selects the German locale via the search syntax to search for a term
			
 
				-that is identified as an English term (try ``:de-DE thermomix``, for example).
			
 
				-
			
 
				-.. hint::
			
 
				-
			
 
				-   To SearXNG maintainers; please take into account: under some circumstances
			
 
				-   the auto-detection of the language of this plugin could be detrimental to
			
 
				-   users expectations.  Its not recommended to activate this plugin by
			
 
				-   default. It should always be the user's decision whether to activate this
			
 
				-   plugin or not.
			
 
				-
			
 
				-.. _fastText: https://fasttext.cc/
			
 
				-.. _python fasttext: https://pypi.org/project/fasttext/
			
 
				-.. _language identification model: https://fasttext.cc/docs/en/language-identification.html
			
 
				-.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
			
 
				-.. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
			
 
				-
			
 
				-"""
			
 
				-
			
 
				-from flask_babel import gettext
			
 
				-import babel
			
 
				-
			
 
				-from searx.utils import detect_language
			
 
				-from searx.languages import language_codes
			
 
				-
			
 
				-name = gettext('Autodetect search language')
			
 
				-description = gettext('Automatically detect the query search language and switch to it.')
			
 
				-preference_section = 'general'
			
 
				-default_on = False
			
 
				-
			
 
				-supported_langs = set()
			
 
				-"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
			
 
				-
			
 
				-
			
 
				-def pre_search(request, search):  # pylint: disable=unused-argument
			
 
				-    lang = detect_language(search.search_query.query, min_probability=0)
			
 
				-    if lang in supported_langs:
			
 
				-        search.search_query.lang = lang
			
 
				-        try:
			
 
				-            search.search_query.locale = babel.Locale.parse(lang)
			
 
				-        except babel.core.UnknownLocaleError:
			
 
				-            pass
			
 
				-    return True
			
 
				-
			
 
				-
			
 
				-def init(app, settings):  # pylint: disable=unused-argument
			
 
				-    for searxng_locale in language_codes:
			
 
				-        supported_langs.add(searxng_locale[0].split('-')[0])
			
 
				-    return True
			
--- a/searx/preferences.py
+++ b/searx/preferences.py
@@ -154,7 +154,7 @@ class SearchLanguageSetting(EnumStringSetting):
 
				     """Available choices may change, so user's value may not be in choices anymore"""
			
 
				 
			
 
				     def _validate_selection(self, selection):
			
 
				-        if selection != '' and not VALID_LANGUAGE_CODE.match(selection):
			
 
				+        if selection != '' and selection != 'auto' and not VALID_LANGUAGE_CODE.match(selection):
			
 
				             raise ValidationException('Invalid language code: "{0}"'.format(selection))
			
 
				 
			
 
				     def parse(self, data: str):
			
--- a/searx/query.py
+++ b/searx/query.py
@@ -104,7 +104,7 @@ class LanguageParser(QueryPartParser):
 
				                     break
			
 
				 
			
 
				         # user may set a valid, yet not selectable language
			
 
				-        if VALID_LANGUAGE_CODE.match(value):
			
 
				+        if VALID_LANGUAGE_CODE.match(value) or value == 'auto':
			
 
				             lang_parts = value.split('-')
			
 
				             if len(lang_parts) > 1:
			
 
				                 value = lang_parts[0].lower() + '-' + lang_parts[1].upper()
			
--- a/searx/search/__init__.py
+++ b/searx/search/__init__.py
@@ -3,10 +3,12 @@
 
				 # pylint: disable=missing-module-docstring, too-few-public-methods
			
 
				 
			
 
				 import threading
			
 
				+from copy import copy
			
 
				 from timeit import default_timer
			
 
				 from uuid import uuid4
			
 
				 
			
 
				 import flask
			
 
				+import babel
			
 
				 
			
 
				 from searx import settings
			
 
				 from searx.answerers import ask
			
@@ -20,6 +22,7 @@ from searx.network import initialize as initialize_network, check_network_config
 
				 from searx.metrics import initialize as initialize_metrics, counter_inc, histogram_observe_time
			
 
				 from searx.search.processors import PROCESSORS, initialize as initialize_processors
			
 
				 from searx.search.checker import initialize as initialize_checker
			
 
				+from searx.utils import detect_language
			
 
				 
			
 
				 
			
 
				 logger = logger.getChild('search')
			
@@ -37,18 +40,57 @@ def initialize(settings_engines=None, enable_checker=False, check_network=False,
 
				         initialize_checker()
			
 
				 
			
 
				 
			
 
				+def replace_auto_language(search_query: SearchQuery):
			
 
				+    """
			
 
				+    Do nothing except if `search_query.lang` is "auto".
			
 
				+    In this case:
			
 
				+    * the value "auto" is replaced by the detected language of the query.
			
 
				+      The default value is "all" when no language is detected.
			
 
				+    * `search_query.locale` is updated accordingly
			
 
				+
			
 
				+    Use :py:obj:`searx.utils.detect_language` with `only_search_languages=True` to keep
			
 
				+    only languages supported by the engines.
			
 
				+    """
			
 
				+    if search_query.lang != 'auto':
			
 
				+        return
			
 
				+
			
 
				+    detected_lang = detect_language(search_query.query, threshold=0.0, only_search_languages=True)
			
 
				+    if detected_lang is None:
			
 
				+        # fallback to 'all' if no language has been detected
			
 
				+        search_query.lang = 'all'
			
 
				+        search_query.locale = None
			
 
				+        return
			
 
				+    search_query.lang = detected_lang
			
 
				+    try:
			
 
				+        search_query.locale = babel.Locale.parse(search_query.lang)
			
 
				+    except babel.core.UnknownLocaleError:
			
 
				+        search_query.locale = None
			
 
				+
			
 
				+
			
 
				 class Search:
			
 
				     """Search information container"""
			
 
				 
			
 
				     __slots__ = "search_query", "result_container", "start_time", "actual_timeout"
			
 
				 
			
 
				     def __init__(self, search_query: SearchQuery):
			
 
				+        """Initialize the Search
			
 
				+
			
 
				+        search_query is copied
			
 
				+        """
			
 
				         # init vars
			
 
				         super().__init__()
			
 
				-        self.search_query = search_query
			
 
				         self.result_container = ResultContainer()
			
 
				         self.start_time = None
			
 
				         self.actual_timeout = None
			
 
				+        self.search_query = copy(search_query)
			
 
				+        self.update_search_query(self.search_query)
			
 
				+
			
 
				+    def update_search_query(self, search_query: SearchQuery):
			
 
				+        """Update search_query.
			
 
				+
			
 
				+        call replace_auto_language to replace the "auto" language
			
 
				+        """
			
 
				+        replace_auto_language(search_query)
			
 
				 
			
 
				     def search_external_bang(self):
			
 
				         """
			
--- a/searx/search/models.py
+++ b/searx/search/models.py
@@ -109,3 +109,16 @@ class SearchQuery:
 
				                 self.external_bang,
			
 
				             )
			
 
				         )
			
 
				+
			
 
				+    def __copy__(self):
			
 
				+        return SearchQuery(
			
 
				+            self.query,
			
 
				+            self.engineref_list,
			
 
				+            self.lang,
			
 
				+            self.safesearch,
			
 
				+            self.pageno,
			
 
				+            self.time_range,
			
 
				+            self.timeout_limit,
			
 
				+            self.external_bang,
			
 
				+            self.engine_data,
			
 
				+        )
			
--- a/searx/settings_defaults.py
+++ b/searx/settings_defaults.py
@@ -18,7 +18,7 @@ searx_dir = abspath(dirname(__file__))
 
				 
			
 
				 logger = logging.getLogger('searx')
			
 
				 OUTPUT_FORMATS = ['html', 'csv', 'json', 'rss']
			
 
				-LANGUAGE_CODES = ['all'] + list(l[0] for l in languages)
			
 
				+LANGUAGE_CODES = ['all', 'auto'] + list(l[0] for l in languages)
			
 
				 SIMPLE_STYLE = ('auto', 'light', 'dark')
			
 
				 CATEGORIES_AS_TABS = {
			
 
				     'general': {},
			
--- a/searx/templates/simple/filters/languages.html
+++ b/searx/templates/simple/filters/languages.html
@@ -1,5 +1,9 @@
 
				 <select class="language" id="language" name="language" aria-label="{{ _('Search language') }}">{{- '' -}}
			
 
				 	<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option>
			
 
				+	<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>
			
 
				+		{{- _('Auto-detect') -}}
			
 
				+		{%- if current_language == 'auto' %} ({{ search_language }}){%- endif -%}
			
 
				+	</option>
			
 
				 	{%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%}
			
 
				 	<option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>
			
 
				 		{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}
			
--- a/searx/templates/simple/preferences.html
+++ b/searx/templates/simple/preferences.html
@@ -116,12 +116,15 @@
 
				       <p class="value">{{- '' -}}
			
 
				         <select name='language' aria-labelledby="pref_language" aria-describedby="desc_language">{{- '' -}}
			
 
				           <option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option>
			
 
				+          <option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>{{ _('Auto-detect') }}</option>
			
 
				           {%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%}
			
 
				           <option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}</option>
			
 
				           {%- endfor -%}
			
 
				         </select>{{- '' -}}
			
 
				       </p>
			
 
				-      <div class="description" id="desc_language">{{ _('What language do you prefer for search?') }}</div>
			
 
				+      <div class="description" id="desc_language">
			
 
				+        {{- _('What language do you prefer for search?') }} {{ _('Choose Auto-detect to let SearXNG detect the language of your query.') -}}
			
 
				+      </div>
			
 
				     </fieldset>
			
 
				     {% endif %}
			
 
				     {% if 'autocomplete' not in locked_preferences %}
			
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -53,6 +53,9 @@ _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
 
				 _FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None
			
 
				 """fasttext model to predict laguage of a search term"""
			
 
				 
			
 
				+SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in language_codes])
			
 
				+"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
			
 
				+
			
 
				 
			
 
				 class _NotSetClass:  # pylint: disable=too-few-public-methods
			
 
				     """Internal class for this module, do not create instance of this class.
			
@@ -637,11 +640,72 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText":
 
				     return _FASTTEXT_MODEL
			
 
				 
			
 
				 
			
 
				-def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]:
			
 
				-    """https://fasttext.cc/docs/en/language-identification.html"""
			
 
				+def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
			
 
				+    """Detect the language of the ``text`` parameter.
			
 
				+
			
 
				+    :param str text: The string whose language is to be detected.
			
 
				+
			
 
				+    :param float threshold: Threshold filters the returned labels by a threshold
			
 
				+        on probability.  A choice of 0.3 will return labels with at least 0.3
			
 
				+        probability.
			
 
				+
			
 
				+    :param bool only_search_languages: If ``True``, returns only supported
			
 
				+        SearXNG search languages.  See :py:obj:`searx.languages`
			
 
				+
			
 
				+    :rtype: str, None
			
 
				+    :returns:
			
 
				+        The detected language code or ``None``. See below.
			
 
				+
			
 
				+    :raises ValueError: If ``text`` is not a string.
			
 
				+
			
 
				+    The language detection is done by using `a fork`_ of the fastText_ library
			
 
				+    (`python fasttext`_). fastText_ distributes the `language identification
			
 
				+    model`_, for reference:
			
 
				+
			
 
				+    - `FastText.zip: Compressing text classification models`_
			
 
				+    - `Bag of Tricks for Efficient Text Classification`_
			
 
				+
			
 
				+    The `language identification model`_ supports the language codes
			
 
				+    (ISO-639-3)::
			
 
				+
			
 
				+        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
			
 
				+        bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
			
 
				+        et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
			
 
				+        id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
			
 
				+        lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
			
 
				+        nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
			
 
				+        rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
			
 
				+        tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
			
 
				+
			
 
				+    By using ``only_search_languages=True`` the `language identification model`_
			
 
				+    is harmonized with the SearXNG's language (locale) model.  General
			
 
				+    conditions of SearXNG's locale model are:
			
 
				+
			
 
				+    a. SearXNG's locale of a query is passed to the
			
 
				+       :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
			
 
				+       code that is used by an engine.
			
 
				+
			
 
				+    b. Most of SearXNG's engines do not support all the languages from `language
			
 
				+       identification model`_ and there is also a discrepancy in the ISO-639-3
			
 
				+       (fastText) and ISO-639-2 (SearXNG) handling.  Furthermore, in SearXNG the
			
 
				+       locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant``
			
 
				+       (``zh_Hans``) while the `language identification model`_ reduces both to
			
 
				+       ``zh``.
			
 
				+
			
 
				+    .. _a fork: https://github.com/searxng/fasttext-predict
			
 
				+    .. _fastText: https://fasttext.cc/
			
 
				+    .. _python fasttext: https://pypi.org/project/fasttext/
			
 
				+    .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
			
 
				+    .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
			
 
				+    .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
			
 
				+
			
 
				+    """
			
 
				     if not isinstance(text, str):
			
 
				         raise ValueError('text must a str')
			
 
				     r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
			
 
				-    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability:
			
 
				-        return r[0][0].split('__label__')[1]
			
 
				+    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:
			
 
				+        language = r[0][0].split('__label__')[1]
			
 
				+        if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
			
 
				+            return None
			
 
				+        return language
			
 
				     return None
			
--- a/searx/webadapter.py
+++ b/searx/webadapter.py
@@ -63,7 +63,7 @@ def parse_lang(preferences: Preferences, form: Dict[str, str], raw_text_query: R
 
				         query_lang = preferences.get_value('language')
			
 
				 
			
 
				     # check language
			
 
				-    if not VALID_LANGUAGE_CODE.match(query_lang):
			
 
				+    if not VALID_LANGUAGE_CODE.match(query_lang) and query_lang != 'auto':
			
 
				         raise SearxParameterException('language', query_lang)
			
 
				 
			
 
				     return query_lang
			
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -810,6 +810,9 @@ def search():
 
				         )
			
 
				     )
			
 
				 
			
 
				+    # search_query.lang contains the user choice (all, auto, en, ...)
			
 
				+    # when the user choice is "auto", search.search_query.lang contains the detected language
			
 
				+    # otherwise it is equal to search_query.lang
			
 
				     return render(
			
 
				         # fmt: off
			
 
				         'results.html',
			
@@ -834,6 +837,11 @@ def search():
 
				             settings['search']['languages'],
			
 
				             fallback=request.preferences.get_value("language")
			
 
				         ),
			
 
				+        search_language = match_language(
			
 
				+            search.search_query.lang,
			
 
				+            settings['search']['languages'],
			
 
				+            fallback=request.preferences.get_value("language")
			
 
				+        ),
			
 
				         timeout_limit = request.form.get('timeout_limit', None)
			
 
				         # fmt: on
			
 
				     )
			
--- a/tests/unit/test_query.py
+++ b/tests/unit/test_query.py
@@ -91,6 +91,17 @@ class TestLanguageParser(SearxTestCase):
 
				         self.assertIn('all', query.languages)
			
 
				         self.assertFalse(query.specific)
			
 
				 
			
 
				+    def test_auto_language_code(self):
			
 
				+        language = 'auto'
			
 
				+        query_text = 'una consulta'
			
 
				+        full_query = ':' + language + ' ' + query_text
			
 
				+        query = RawTextQuery(full_query, [])
			
 
				+
			
 
				+        self.assertEqual(query.getFullQuery(), full_query)
			
 
				+        self.assertEqual(len(query.query_parts), 1)
			
 
				+        self.assertIn('auto', query.languages)
			
 
				+        self.assertFalse(query.specific)
			
 
				+
			
 
				     def test_invalid_language_code(self):
			
 
				         language = 'not_a_language'
			
 
				         query_text = 'the query'
			
--- a/tests/unit/test_search.py
+++ b/tests/unit/test_search.py
@@ -1,5 +1,7 @@
 
				 # -*- coding: utf-8 -*-
			
 
				 
			
 
				+from copy import copy
			
 
				+
			
 
				 import searx.search
			
 
				 from searx.search import SearchQuery, EngineRef
			
 
				 from searx import settings
			
@@ -34,6 +36,11 @@ class SearchQueryTestCase(SearxTestCase):
 
				         self.assertEqual(s, s)
			
 
				         self.assertNotEqual(s, t)
			
 
				 
			
 
				+    def test_copy(self):
			
 
				+        s = SearchQuery('test', [EngineRef('bing', 'general')], 'all', 0, 1, None, None, None)
			
 
				+        t = copy(s)
			
 
				+        self.assertEqual(s, t)
			
 
				+
			
 
				 
			
 
				 class SearchTestCase(SearxTestCase):
			
 
				     @classmethod