Browse Source

Merge pull request #2019 from ArtikusHG/fasttext

Replace langdetect with fasttext (follow-up of #1969)
Alexandre Flament 2 years ago
parent
commit
b927482195

+ 0 - 1
requirements.txt

@@ -11,7 +11,6 @@ httpx[http2]==0.21.2
 Brotli==1.0.9
 uvloop==0.17.0
 httpx-socks[asyncio]==0.7.2
-langdetect==1.0.9
 setproctitle==1.3.2
 redis==4.4.0
 markdown-it-py==2.1.0

+ 8 - 26
searx/plugins/autodetect_search_language.py

@@ -66,46 +66,28 @@ that is identified as an English term (try ``:de-DE thermomix``, for example).
 """
 
 from flask_babel import gettext
-import fasttext
 import babel
 
-from searx.data import data_dir
+from searx.utils import detect_language
 from searx.languages import language_codes
 
-# Monkey patch: prevent fasttext from showing a (useless) warning when loading a
-# model.
-fasttext.FastText.eprint = lambda x: None
-
 name = gettext('Autodetect search language')
 description = gettext('Automatically detect the query search language and switch to it.')
 preference_section = 'general'
 default_on = False
 
-lang_model: fasttext.FastText._FastText = None
-"""fasttext model to predict laguage of a search term"""
-
 supported_langs = set()
 """Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
 
 
-def get_model():
-    # lazy load, in order to to save memory
-    global lang_model  # pylint: disable=global-statement
-    if lang_model is None:
-        lang_model = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
-    return lang_model
-
-
 def pre_search(request, search):  # pylint: disable=unused-argument
-    prediction = get_model().predict(search.search_query.query, k=1, threshold=0.3)
-    if prediction:
-        lang = prediction[0][0].split('__label__')[1]
-        if lang in supported_langs:
-            search.search_query.lang = lang
-            try:
-                search.search_query.locale = babel.Locale.parse(lang)
-            except babel.core.UnknownLocaleError:
-                pass
+    lang = detect_language(search.search_query.query, min_probability=0)
+    if lang in supported_langs:
+        search.search_query.lang = lang
+        try:
+            search.search_query.locale = babel.Locale.parse(lang)
+        except babel.core.UnknownLocaleError:
+            pass
     return True
 
 

+ 5 - 11
searx/search/checker/impl.py

@@ -10,12 +10,10 @@ from timeit import default_timer
 from urllib.parse import urlparse
 
 import re
-from langdetect import detect_langs
-from langdetect.lang_detect_exception import LangDetectException
 import httpx
 
 from searx import network, logger
-from searx.utils import gen_useragent
+from searx.utils import gen_useragent, detect_language
 from searx.results import ResultContainer
 from searx.search.models import SearchQuery, EngineRef
 from searx.search.processors import EngineProcessor
@@ -208,14 +206,10 @@ class ResultContainerTests:
         self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')
 
     def _add_language(self, text: str) -> typing.Optional[str]:
-        try:
-            r = detect_langs(str(text))  # pylint: disable=E1101
-        except LangDetectException:
-            return None
-
-        if len(r) > 0 and r[0].prob > 0.95:
-            self.languages.add(r[0].lang)
-            self.test_results.add_language(r[0].lang)
+        langStr = detect_language(text)
+        if langStr:
+            self.languages.add(langStr)
+            self.test_results.add_language(langStr)
         return None
 
     def _check_result(self, result):

+ 25 - 1
searx/utils.py

@@ -15,6 +15,7 @@ from os.path import splitext, join
 from random import choice
 from html.parser import HTMLParser
 from urllib.parse import urljoin, urlparse
+import fasttext
 
 from lxml import html
 from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@@ -22,7 +23,7 @@ from babel.core import get_global
 
 
 from searx import settings
-from searx.data import USER_AGENTS
+from searx.data import USER_AGENTS, data_dir
 from searx.version import VERSION_TAG
 from searx.languages import language_codes
 from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
@@ -50,6 +51,12 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = {
 _XPATH_CACHE: Dict[str, XPath] = {}
 _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
 
+_FASTTEXT_MODEL: Optional[fasttext.FastText._FastText] = None
+"""fasttext model to predict language of a search term"""
+
+# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
+fasttext.FastText.eprint = lambda x: None
+
 
 class _NotSetClass:  # pylint: disable=too-few-public-methods
     """Internal class for this module, do not create instance of this class.
@@ -621,3 +628,20 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index:
         # to record xpath_spec
         raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
     return default
+
+
+def _get_fasttext_model() -> fasttext.FastText._FastText:
+    global _FASTTEXT_MODEL  # pylint: disable=global-statement
+    if _FASTTEXT_MODEL is None:
+        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
+    return _FASTTEXT_MODEL
+
+
+def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]:
+    """https://fasttext.cc/docs/en/language-identification.html"""
+    if not isinstance(text, str):
+        raise ValueError('text must be a str')
+    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
+    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability:
+        return r[0][0].split('__label__')[1]
+    return None

+ 1 - 15
searxng_extra/update/update_engine_descriptions.py

@@ -17,14 +17,11 @@ from os.path import join
 
 from lxml.html import fromstring
 
-from langdetect import detect_langs
-from langdetect.lang_detect_exception import LangDetectException
-
 from searx.engines import wikidata, set_loggers
 from searx.utils import extract_text, match_language
 from searx.locales import LOCALE_NAMES, locales_initialize
 from searx import searx_dir
-from searx.utils import gen_useragent
+from searx.utils import gen_useragent, detect_language
 import searx.search
 import searx.network
 
@@ -117,17 +114,6 @@ def get_wikipedia_summary(lang, pageid):
         return None
 
 
-def detect_language(text):
-    try:
-        r = detect_langs(str(text))  # pylint: disable=E1101
-    except LangDetectException:
-        return None
-
-    if len(r) > 0 and r[0].prob > 0.95:
-        return r[0].lang
-    return None
-
-
 def get_website_description(url, lang1, lang2=None):
     headers = {
         'User-Agent': gen_useragent(),

+ 22 - 0
tests/unit/test_utils.py

@@ -232,3 +232,25 @@ class TestXPathUtils(SearxTestCase):
         with self.assertRaises(SearxEngineXPathException) as context:
             utils.eval_xpath_getindex(doc, 'count(//i)', 1)
         self.assertEqual(context.exception.message, 'the result is not a list')
+
+    def test_detect_language(self):
+        # make sure new lines are not an issue
+        # fasttext.predict('') does not accept a new line.
+        l = utils.detect_language('The quick brown fox jumps over\nthe lazy dog')
+        self.assertEqual(l, 'en')
+
+        l = utils.detect_language('いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす')
+        self.assertEqual(l, 'ja')
+
+        l = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')
+        self.assertEqual(l, 'tr')
+
+        l = utils.detect_language('')
+        self.assertIsNone(l)
+
+        # mix languages --> None
+        l = utils.detect_language('The いろはにほへと Pijamalı')
+        self.assertIsNone(l)
+
+        with self.assertRaises(ValueError):
+            utils.detect_language(None)