Browse Source

Replace langdetect with fasttext

ArtikusHG 2 years ago
parent
commit
1f8f8c1e91

+ 0 - 1
requirements.txt

@@ -11,7 +11,6 @@ httpx[http2]==0.21.2
 Brotli==1.0.9
 uvloop==0.17.0
 httpx-socks[asyncio]==0.7.2
-langdetect==1.0.9
 setproctitle==1.3.2
 redis==4.3.5
 markdown-it-py==2.1.0

+ 98 - 0
searx/plugins/autodetect_search_language.py

@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Plugin to detect the search language from the search query.
+
+The language detection is done by using the fastText_ library (`python
+fasttext`_). fastText_ distributes the `language identification model`_, for
+reference:
+
+- `FastText.zip: Compressing text classification models`_
+- `Bag of Tricks for Efficient Text Classification`_
+
+The `language identification model`_ supports the language codes (ISO-639-3)::
+
+   af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
+   ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
+   fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io
+   is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv
+   mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn
+   no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd
+   sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep
+   vi vls vo wa war wuu xal xmf yi yo yue zh
+
+The `language identification model`_ is harmonized with SearXNG's language
+(locale) model.  General conditions of SearXNG's locale model are:
+
+a. SearXNG's locale of a query is passed to the
+   :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
+   code that is used by an engine.
+
+b. SearXNG and most of the engines do not support all the languages from
+   language model and there might be also a discrepancy in the ISO-639-3 and
+   ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`).  Furthermore,
+   in SearXNG the locales like ``zh-TW`` (``zh-CN``) are mapped to
+   ``zh_Hant`` (``zh_Hans``).
+
+Conclusion: This plugin only auto-detects the languages a user can select in
+the language menu (:py:obj:`supported_langs`).
+
+SearXNG's locale of a query comes from (*highest wins*):
+
+1. The ``Accept-Language`` header from user's HTTP client.
+2. The user select a locale in the preferences.
+3. The user select a locale from the menu in the query form (e.g. ``:zh-TW``)
+4. This plugin is activated in the preferences and the locale (only the language
+   code / none region code) comes from the fastText's language detection.
+
+Conclusion: There is a conflict between the language selected by the user and
+the language from language detection of this plugin.  For example, the user
+explicitly selects the German locale via the search syntax to search for a term
+that is identified as an English term (try ``:de-DE thermomix``, for example).
+
+.. hint::
+
+   To SearXNG maintainers; please take into account: under some circumstances
+   the auto-detection of the language of this plugin could be detrimental to
+   users' expectations.  It's not recommended to activate this plugin by
+   default. It should always be the user's decision whether to activate this
+   plugin or not.
+
+.. _fastText: https://fasttext.cc/
+.. _python fasttext: https://pypi.org/project/fasttext/
+.. _language identification model: https://fasttext.cc/docs/en/language-identification.html
+.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
+.. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
+
+"""
+
+from flask_babel import gettext
+import babel
+
+from searx.utils import detect_language
+from searx.languages import language_codes
+
+
# Plugin metadata shown in the preferences page.
name = gettext('Autodetect search language')
description = gettext('Automatically detect the query search language and switch to it.')
preference_section = 'general'
# Off by default: auto-detection can override an explicit user language choice
# (see the module docstring's hint to maintainers).
default_on = False

supported_langs = set()
"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
+
+
def pre_search(request, search):  # pylint: disable=unused-argument
    """Switch the query's language to the auto-detected one.

    Only languages selectable in the language menu (:py:obj:`supported_langs`)
    are applied; any other detection result leaves the query untouched.
    """
    detected = detect_language(search.search_query.query, min_probability=0)
    if detected not in supported_langs:
        return True
    search.search_query.lang = detected
    try:
        search.search_query.locale = babel.Locale.parse(detected)
    except babel.core.UnknownLocaleError:
        # keep the plain language code even when babel has no locale for it
        pass
    return True
+
+
def init(app, settings):  # pylint: disable=unused-argument
    """Populate :py:obj:`supported_langs` from SearXNG's language codes.

    Only the language part of a locale is kept (``'zh-TW'`` --> ``'zh'``).
    """
    supported_langs.update(locale[0].split('-')[0] for locale in language_codes)
    return True

+ 5 - 11
searx/search/checker/impl.py

@@ -10,12 +10,10 @@ from timeit import default_timer
 from urllib.parse import urlparse
 
 import re
-from langdetect import detect_langs
-from langdetect.lang_detect_exception import LangDetectException
 import httpx
 
 from searx import network, logger
-from searx.utils import gen_useragent
+from searx.utils import gen_useragent, detect_language
 from searx.results import ResultContainer
 from searx.search.models import SearchQuery, EngineRef
 from searx.search.processors import EngineProcessor
@@ -208,14 +206,10 @@ class ResultContainerTests:
         self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')
 
     def _add_language(self, text: str) -> typing.Optional[str]:
-        try:
-            r = detect_langs(str(text))  # pylint: disable=E1101
-        except LangDetectException:
-            return None
-
-        if len(r) > 0 and r[0].prob > 0.95:
-            self.languages.add(r[0].lang)
-            self.test_results.add_language(r[0].lang)
+        langStr = detect_language(text)
+        if langStr:
+            self.languages.add(langStr)
+            self.test_results.add_language(langStr)
         return None
 
     def _check_result(self, result):

+ 25 - 1
searx/utils.py

@@ -15,6 +15,7 @@ from os.path import splitext, join
 from random import choice
 from html.parser import HTMLParser
 from urllib.parse import urljoin, urlparse
+import fasttext
 
 from lxml import html
 from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@@ -22,7 +23,7 @@ from babel.core import get_global
 
 
 from searx import settings
-from searx.data import USER_AGENTS
+from searx.data import USER_AGENTS, data_dir
 from searx.version import VERSION_TAG
 from searx.languages import language_codes
 from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
@@ -50,6 +51,12 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = {
 _XPATH_CACHE: Dict[str, XPath] = {}
 _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
 
# Lazily loaded by _get_fasttext_model(); stays None until first use.
_FASTTEXT_MODEL: Optional[fasttext.FastText._FastText] = None
"""fasttext model to predict language of a search term"""

# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
fasttext.FastText.eprint = lambda x: None
+
 
 class _NotSetClass:  # pylint: disable=too-few-public-methods
     """Internal class for this module, do not create instance of this class.
@@ -621,3 +628,20 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index:
         # to record xpath_spec
         raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
     return default
+
+
def _get_fasttext_model() -> fasttext.FastText._FastText:
    """Return the fasttext language-identification model, loading it lazily.

    The model is read once from SearXNG's data directory and cached in the
    module-level :py:obj:`_FASTTEXT_MODEL`.
    """
    global _FASTTEXT_MODEL  # pylint: disable=global-statement
    if _FASTTEXT_MODEL is None:
        model_path = str(data_dir / 'lid.176.ftz')
        _FASTTEXT_MODEL = fasttext.load_model(model_path)
    return _FASTTEXT_MODEL
+
+
def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]:
    """Detect the language of ``text`` with fastText's `language identification
    model <https://fasttext.cc/docs/en/language-identification.html>`_.

    :param text: string whose language is to be detected.
    :param threshold: passed through to ``predict()``; fasttext only returns
        predictions with at least this probability.
    :param min_probability: predictions at or below this probability are
        discarded and ``None`` is returned.
    :return: detected language code (e.g. ``'en'``) or ``None``.
    :raises ValueError: when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        # fix: message previously read 'text must a str'
        raise ValueError('text must be a str')
    # predict() does not accept newline characters in its input
    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
    # predict() returns a (labels, probabilities) tuple; labels look like '__label__en'
    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability:
        return r[0][0].split('__label__')[1]
    return None

+ 1 - 15
searxng_extra/update/update_engine_descriptions.py

@@ -17,14 +17,11 @@ from os.path import join
 
 from lxml.html import fromstring
 
-from langdetect import detect_langs
-from langdetect.lang_detect_exception import LangDetectException
-
 from searx.engines import wikidata, set_loggers
 from searx.utils import extract_text, match_language
 from searx.locales import LOCALE_NAMES, locales_initialize
 from searx import searx_dir
-from searx.utils import gen_useragent
+from searx.utils import gen_useragent, detect_language
 import searx.search
 import searx.network
 
@@ -117,17 +114,6 @@ def get_wikipedia_summary(lang, pageid):
         return None
 
 
-def detect_language(text):
-    try:
-        r = detect_langs(str(text))  # pylint: disable=E1101
-    except LangDetectException:
-        return None
-
-    if len(r) > 0 and r[0].prob > 0.95:
-        return r[0].lang
-    return None
-
-
 def get_website_description(url, lang1, lang2=None):
     headers = {
         'User-Agent': gen_useragent(),

+ 22 - 0
tests/unit/test_utils.py

@@ -232,3 +232,25 @@ class TestXPathUtils(SearxTestCase):
         with self.assertRaises(SearxEngineXPathException) as context:
             utils.eval_xpath_getindex(doc, 'count(//i)', 1)
         self.assertEqual(context.exception.message, 'the result is not a list')
+
+    def test_detect_language(self):
+        # make sure new line are not an issue
+        # fasttext.predict('') does not accept new line.
+        l = utils.detect_language('The quick brown fox jumps over\nthe lazy dog')
+        self.assertEqual(l, 'en')
+
+        l = utils.detect_language('いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす')
+        self.assertEqual(l, 'ja')
+
+        l = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')
+        self.assertEqual(l, 'tr')
+
+        l = utils.detect_language('')
+        self.assertIsNone(l)
+
+        # mix languages --> None
+        l = utils.detect_language('The いろはにほへと Pijamalı')
+        self.assertIsNone(l)
+
+        with self.assertRaises(ValueError):
+            utils.detect_language(None)