Browse Source

[enh] engine: mojeek - add language support

Improve region and language detection, including handling of the "all" locale

Testing has shown the following behaviour for the different
default and empty values of Mojeek's parameters:

| param    | idx | value  | behaviour                 |
| -------- | --- | ------ | ------------------------- |
| region   |  0  | ''     | detect region based on IP |
| region   |  1  | 'none' | all regions               |
| language |  0  | ''     | all languages             |
0xhtml 7 months ago
parent
commit
8b6a3f3e11

+ 48 - 2
searx/engines/mojeek.py

@@ -1,12 +1,15 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """Mojeek (general, images, news)"""
 """Mojeek (general, images, news)"""
 
 
+from typing import TYPE_CHECKING
+
 from datetime import datetime
 from datetime import datetime
 from urllib.parse import urlencode
 from urllib.parse import urlencode
 from lxml import html
 from lxml import html
 
 
 from dateutil.relativedelta import relativedelta
 from dateutil.relativedelta import relativedelta
 from searx.utils import eval_xpath, eval_xpath_list, extract_text
 from searx.utils import eval_xpath, eval_xpath_list, extract_text
+from searx.enginelib.traits import EngineTraits
 
 
 about = {
 about = {
     'website': 'https://mojeek.com',
     'website': 'https://mojeek.com',
@@ -42,6 +45,18 @@ news_url_xpath = './/h2/a/@href'
 news_title_xpath = './/h2/a'
 news_title_xpath = './/h2/a'
 news_content_xpath = './/p[@class="s"]'
 news_content_xpath = './/p[@class="s"]'
 
 
+language_param = 'lb'
+region_param = 'arc'
+
+_delta_kwargs = {'day': 'days', 'week': 'weeks', 'month': 'months', 'year': 'years'}
+
+if TYPE_CHECKING:
+    import logging
+
+    logger = logging.getLogger()
+
+traits: EngineTraits
+
 
 
 def init(_):
 def init(_):
     if search_type not in ('', 'images', 'news'):
     if search_type not in ('', 'images', 'news'):
@@ -53,13 +68,16 @@ def request(query, params):
         'q': query,
         'q': query,
         'safe': min(params['safesearch'], 1),
         'safe': min(params['safesearch'], 1),
         'fmt': search_type,
         'fmt': search_type,
+        language_param: traits.get_language(params['searxng_locale'], traits.custom['language_all']),
+        region_param: traits.get_region(params['searxng_locale'], traits.custom['region_all']),
     }
     }
 
 
     if search_type == '':
     if search_type == '':
         args['s'] = 10 * (params['pageno'] - 1)
         args['s'] = 10 * (params['pageno'] - 1)
 
 
     if params['time_range'] and search_type != 'images':
     if params['time_range'] and search_type != 'images':
-        args["since"] = (datetime.now() - relativedelta(**{f"{params['time_range']}s": 1})).strftime("%Y%m%d")
+        kwargs = {_delta_kwargs[params['time_range']]: 1}
+        args["since"] = (datetime.now() - relativedelta(**kwargs)).strftime("%Y%m%d")  # type: ignore
         logger.debug(args["since"])
         logger.debug(args["since"])
 
 
     params['url'] = f"{base_url}/search?{urlencode(args)}"
     params['url'] = f"{base_url}/search?{urlencode(args)}"
@@ -94,7 +112,7 @@ def _image_results(dom):
                 'template': 'images.html',
                 'template': 'images.html',
                 'url': extract_text(eval_xpath(result, image_url_xpath)),
                 'url': extract_text(eval_xpath(result, image_url_xpath)),
                 'title': extract_text(eval_xpath(result, image_title_xpath)),
                 'title': extract_text(eval_xpath(result, image_title_xpath)),
-                'img_src': base_url + extract_text(eval_xpath(result, image_img_src_xpath)),
+                'img_src': base_url + extract_text(eval_xpath(result, image_img_src_xpath)),  # type: ignore
                 'content': '',
                 'content': '',
             }
             }
         )
         )
@@ -130,3 +148,31 @@ def response(resp):
         return _news_results(dom)
         return _news_results(dom)
 
 
     raise ValueError(f"Invalid search type {search_type}")
     raise ValueError(f"Invalid search type {search_type}")
+
+
+def fetch_traits(engine_traits: EngineTraits):
+    # pylint: disable=import-outside-toplevel
+    from searx import network
+    from searx.locales import get_official_locales, region_tag
+    from babel import Locale, UnknownLocaleError
+    import contextlib
+
+    resp = network.get(base_url + "/preferences", headers={'Accept-Language': 'en-US,en;q=0.5'})
+    dom = html.fromstring(resp.text)  # type: ignore
+
+    languages = eval_xpath_list(dom, f'//select[@name="{language_param}"]/option/@value')
+
+    engine_traits.custom['language_all'] = languages[0]
+
+    for code in languages[1:]:
+        with contextlib.suppress(UnknownLocaleError):
+            locale = Locale(code)
+            engine_traits.languages[locale.language] = code
+
+    regions = eval_xpath_list(dom, f'//select[@name="{region_param}"]/option/@value')
+
+    engine_traits.custom['region_all'] = regions[1]
+
+    for code in regions[2:]:
+        for locale in get_official_locales(code, engine_traits.languages):
+            engine_traits.regions[region_tag(locale)] = code

+ 1 - 1
searxng_extra/update/update_engine_traits.py

@@ -101,7 +101,7 @@ def fetch_traits_map():
 def filter_locales(traits_map: EngineTraitsMap):
 def filter_locales(traits_map: EngineTraitsMap):
     """Filter language & region tags by a threshold."""
     """Filter language & region tags by a threshold."""
 
 
-    min_eng_per_region = 15
+    min_eng_per_region = 18
     min_eng_per_lang = 20
     min_eng_per_lang = 20
 
 
     _ = {}
     _ = {}

+ 0 - 2
tests/unit/test_locales.py

@@ -32,12 +32,10 @@ class TestLocales(SearxTestCase):
 
 
     @parameterized.expand(
     @parameterized.expand(
         [
         [
-            ('ca-es', 'ca-ES'),
             ('de-at', 'de-AT'),
             ('de-at', 'de-AT'),
             ('de-de', 'de-DE'),
             ('de-de', 'de-DE'),
             ('en-UK', 'en-GB'),
             ('en-UK', 'en-GB'),
             ('fr-be', 'fr-BE'),
             ('fr-be', 'fr-BE'),
-            ('fr-be', 'fr-BE'),
             ('fr-ca', 'fr-CA'),
             ('fr-ca', 'fr-CA'),
             ('fr-ch', 'fr-CH'),
             ('fr-ch', 'fr-CH'),
             ('zh-cn', 'zh-CN'),
             ('zh-cn', 'zh-CN'),