Browse Source

Replace langdetect with fasttext

ArtikusHG 2 years ago
parent
commit
1f8f8c1e91

+ 0 - 1
requirements.txt

@@ -11,7 +11,6 @@ httpx[http2]==0.21.2
 Brotli==1.0.9
 Brotli==1.0.9
 uvloop==0.17.0
 uvloop==0.17.0
 httpx-socks[asyncio]==0.7.2
 httpx-socks[asyncio]==0.7.2
-langdetect==1.0.9
 setproctitle==1.3.2
 setproctitle==1.3.2
 redis==4.3.5
 redis==4.3.5
 markdown-it-py==2.1.0
 markdown-it-py==2.1.0

+ 98 - 0
searx/plugins/autodetect_search_language.py

@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Plugin to detect the search language from the search query.
+
+The language detection is done by using the fastText_ library (`python
+fasttext`_). fastText_ distributes the `language identification model`_, for
+reference:
+
+- `FastText.zip: Compressing text classification models`_
+- `Bag of Tricks for Efficient Text Classification`_
+
+The `language identification model`_ supports the language codes (ISO-639-3)::
+
+   af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
+   ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
+   fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io
+   is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv
+   mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn
+   no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd
+   sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep
+   vi vls vo wa war wuu xal xmf yi yo yue zh
+
+The `language identification model`_ is harmonized with the SearXNG's language
+(locale) model.  General conditions of SearXNG's locale model are:
+
+a. SearXNG's locale of a query is passed to the
+   :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
+   code that is used by an engine.
+
+b. SearXNG and most of the engines do not support all the languages from
+   language model and there might be also a discrepancy in the ISO-639-3 and
+   ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`).  Further
+   more, in SearXNG the locales like ``zh-TW`` (``zh-CN``) are mapped to
+   ``zh_Hant`` (``zh_Hans``).
+
+Conclusion: This plugin does only auto-detect the languages a user can select in
+the language menu (:py:obj:`supported_langs`).
+
+SearXNG's locale of a query comes from (*highest wins*):
+
+1. The ``Accept-Language`` header from user's HTTP client.
+2. The user selects a locale in the preferences.
+3. The user selects a locale from the menu in the query form (e.g. ``:zh-TW``)
+4. This plugin is activated in the preferences and the locale (only the language
+   code / none region code) comes from the fastText's language detection.
+
+Conclusion: There is a conflict between the language selected by the user and
+the language from language detection of this plugin.  For example, the user
+explicitly selects the German locale via the search syntax to search for a term
+that is identified as an English term (try ``:de-DE thermomix``, for example).
+
+.. hint::
+
+   To SearXNG maintainers; please take into account: under some circumstances
+   the auto-detection of the language of this plugin could be detrimental to
+   users' expectations.  It's not recommended to activate this plugin by
+   default. It should always be the user's decision whether to activate this
+   plugin or not.
+
+.. _fastText: https://fasttext.cc/
+.. _python fasttext: https://pypi.org/project/fasttext/
+.. _language identification model: https://fasttext.cc/docs/en/language-identification.html
+.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
+.. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
+
+"""
+
+from flask_babel import gettext
+import babel
+
+from searx.utils import detect_language
+from searx.languages import language_codes
+
+
+# Plugin metadata shown in the preferences UI (translated at render time).
+name = gettext('Autodetect search language')
+description = gettext('Automatically detect the query search language and switch to it.')
+preference_section = 'general'
+# Deliberately off by default: auto-detection can override an explicit user
+# language choice (see the hint to maintainers in the module docstring).
+default_on = False
+
+# Filled by init() with the language part (no region code) of every SearXNG
+# locale from searx.languages.language_codes.
+supported_langs = set()
+"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
+
+
+def pre_search(request, search):  # pylint: disable=unused-argument
+    """Set the query's language to the one detected from the query text.
+
+    Always returns ``True`` so the search continues (plugin hook contract).
+    """
+    # min_probability=0: accept fastText's top prediction regardless of its
+    # reported probability (the prediction threshold still applies inside
+    # detect_language).
+    lang = detect_language(search.search_query.query, min_probability=0)
+    # Only switch to languages a user could also select in the language menu.
+    if lang in supported_langs:
+        search.search_query.lang = lang
+        try:
+            search.search_query.locale = babel.Locale.parse(lang)
+        except babel.core.UnknownLocaleError:
+            # Keep the previous locale object; the plain lang code above is
+            # still applied.
+            pass
+    return True
+
+
+def init(app, settings):  # pylint: disable=unused-argument
+    """Build :py:obj:`supported_langs` from SearXNG's locales.
+
+    Only the language part of each locale is kept: ``'zh-TW'`` --> ``'zh'``.
+    """
+    for searxng_locale in language_codes:
+        supported_langs.add(searxng_locale[0].split('-')[0])
+    return True

+ 5 - 11
searx/search/checker/impl.py

@@ -10,12 +10,10 @@ from timeit import default_timer
 from urllib.parse import urlparse
 from urllib.parse import urlparse
 
 
 import re
 import re
-from langdetect import detect_langs
-from langdetect.lang_detect_exception import LangDetectException
 import httpx
 import httpx
 
 
 from searx import network, logger
 from searx import network, logger
-from searx.utils import gen_useragent
+from searx.utils import gen_useragent, detect_language
 from searx.results import ResultContainer
 from searx.results import ResultContainer
 from searx.search.models import SearchQuery, EngineRef
 from searx.search.models import SearchQuery, EngineRef
 from searx.search.processors import EngineProcessor
 from searx.search.processors import EngineProcessor
@@ -208,14 +206,10 @@ class ResultContainerTests:
         self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')
         self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')
 
 
     def _add_language(self, text: str) -> typing.Optional[str]:
     def _add_language(self, text: str) -> typing.Optional[str]:
-        try:
-            r = detect_langs(str(text))  # pylint: disable=E1101
-        except LangDetectException:
-            return None
-
-        if len(r) > 0 and r[0].prob > 0.95:
-            self.languages.add(r[0].lang)
-            self.test_results.add_language(r[0].lang)
+        # detect_language returns None below its default min_probability (0.5),
+        # replacing langdetect's explicit 0.95 probability cut-off.
+        # NOTE(review): 'langStr' is camelCase; project convention is
+        # snake_case — consider renaming to 'lang'.
+        langStr = detect_language(text)
+        if langStr:
+            self.languages.add(langStr)
+            self.test_results.add_language(langStr)
         return None
         return None
 
 
     def _check_result(self, result):
     def _check_result(self, result):

+ 25 - 1
searx/utils.py

@@ -15,6 +15,7 @@ from os.path import splitext, join
 from random import choice
 from random import choice
 from html.parser import HTMLParser
 from html.parser import HTMLParser
 from urllib.parse import urljoin, urlparse
 from urllib.parse import urljoin, urlparse
+import fasttext
 
 
 from lxml import html
 from lxml import html
 from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
 from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@@ -22,7 +23,7 @@ from babel.core import get_global
 
 
 
 
 from searx import settings
 from searx import settings
-from searx.data import USER_AGENTS
+from searx.data import USER_AGENTS, data_dir
 from searx.version import VERSION_TAG
 from searx.version import VERSION_TAG
 from searx.languages import language_codes
 from searx.languages import language_codes
 from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
 from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
@@ -50,6 +51,12 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = {
 _XPATH_CACHE: Dict[str, XPath] = {}
 _XPATH_CACHE: Dict[str, XPath] = {}
 _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
 _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
 
 
+_FASTTEXT_MODEL: Optional[fasttext.FastText._FastText] = None
+"""fasttext model to predict laguage of a search term"""
+
+# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
+fasttext.FastText.eprint = lambda x: None
+
 
 
 class _NotSetClass:  # pylint: disable=too-few-public-methods
 class _NotSetClass:  # pylint: disable=too-few-public-methods
     """Internal class for this module, do not create instance of this class.
     """Internal class for this module, do not create instance of this class.
@@ -621,3 +628,20 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index:
         # to record xpath_spec
         # to record xpath_spec
         raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
         raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
     return default
     return default
+
+
+def _get_fasttext_model() -> fasttext.FastText._FastText:
+    """Lazily load and cache the fastText language-identification model."""
+    global _FASTTEXT_MODEL  # pylint: disable=global-statement
+    if _FASTTEXT_MODEL is None:
+        # lid.176.ftz: the compressed 176-language identification model,
+        # shipped in searx/data (see module docstring of the plugin).
+        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
+    return _FASTTEXT_MODEL
+
+
+def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]:
+    """https://fasttext.cc/docs/en/language-identification.html"""
+    if not isinstance(text, str):
+        raise ValueError('text must a str')
+    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
+    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability:
+        return r[0][0].split('__label__')[1]
+    return None

+ 1 - 15
searxng_extra/update/update_engine_descriptions.py

@@ -17,14 +17,11 @@ from os.path import join
 
 
 from lxml.html import fromstring
 from lxml.html import fromstring
 
 
-from langdetect import detect_langs
-from langdetect.lang_detect_exception import LangDetectException
-
 from searx.engines import wikidata, set_loggers
 from searx.engines import wikidata, set_loggers
 from searx.utils import extract_text, match_language
 from searx.utils import extract_text, match_language
 from searx.locales import LOCALE_NAMES, locales_initialize
 from searx.locales import LOCALE_NAMES, locales_initialize
 from searx import searx_dir
 from searx import searx_dir
-from searx.utils import gen_useragent
+from searx.utils import gen_useragent, detect_language
 import searx.search
 import searx.search
 import searx.network
 import searx.network
 
 
@@ -117,17 +114,6 @@ def get_wikipedia_summary(lang, pageid):
         return None
         return None
 
 
 
 
-def detect_language(text):
-    try:
-        r = detect_langs(str(text))  # pylint: disable=E1101
-    except LangDetectException:
-        return None
-
-    if len(r) > 0 and r[0].prob > 0.95:
-        return r[0].lang
-    return None
-
-
 def get_website_description(url, lang1, lang2=None):
 def get_website_description(url, lang1, lang2=None):
     headers = {
     headers = {
         'User-Agent': gen_useragent(),
         'User-Agent': gen_useragent(),

+ 22 - 0
tests/unit/test_utils.py

@@ -232,3 +232,25 @@ class TestXPathUtils(SearxTestCase):
         with self.assertRaises(SearxEngineXPathException) as context:
         with self.assertRaises(SearxEngineXPathException) as context:
             utils.eval_xpath_getindex(doc, 'count(//i)', 1)
             utils.eval_xpath_getindex(doc, 'count(//i)', 1)
         self.assertEqual(context.exception.message, 'the result is not a list')
         self.assertEqual(context.exception.message, 'the result is not a list')
+
+    def test_detect_language(self):
+        # make sure new line are not an issue
+        # fasttext.predict('') does not accept new line.
+        l = utils.detect_language('The quick brown fox jumps over\nthe lazy dog')
+        self.assertEqual(l, 'en')
+
+        l = utils.detect_language('いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす')
+        self.assertEqual(l, 'ja')
+
+        l = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')
+        self.assertEqual(l, 'tr')
+
+        l = utils.detect_language('')
+        self.assertIsNone(l)
+
+        # mix languages --> None
+        l = utils.detect_language('The いろはにほへと Pijamalı')
+        self.assertIsNone(l)
+
+        with self.assertRaises(ValueError):
+            utils.detect_language(None)