Browse Source

[mod] engines: Yahoo in different languages (#4826)

BTW fix issue reported in [1]

[1] https://github.com/searxng/searxng/pull/4814#issuecomment-2896948787

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Co-authored-by: Bnyro <bnyro@tutanota.com>
Markus Heiser 1 week ago
parent
commit
064eb50473
1 changed files with 70 additions and 37 deletions
  1. 70 37
      searx/engines/yahoo.py

+ 70 - 37
searx/engines/yahoo.py

@@ -44,6 +44,29 @@ time_range_dict = {
     'month': ('1m', 'm'),
 }
 
+region2domain = {
+    "CO": "co.search.yahoo.com",  # Colombia
+    "TH": "th.search.yahoo.com",  # Thailand
+    "VE": "ve.search.yahoo.com",  # Venezuela
+    "CL": "cl.search.yahoo.com",  # Chile
+    "HK": "hk.search.yahoo.com",  # Hong Kong
+    "PE": "pe.search.yahoo.com",  # Peru
+    "CA": "ca.search.yahoo.com",  # Canada
+    "DE": "de.search.yahoo.com",  # Germany
+    "FR": "fr.search.yahoo.com",  # France
+    "TW": "tw.search.yahoo.com",  # Taiwan
+    "GB": "uk.search.yahoo.com",  # United Kingdom
+    "UK": "uk.search.yahoo.com",
+    "BR": "br.search.yahoo.com",  # Brazil
+    "IN": "in.search.yahoo.com",  # India
+    "ES": "espanol.search.yahoo.com",  # Espanol
+    "PH": "ph.search.yahoo.com",  # Philippines
+    "AR": "ar.search.yahoo.com",  # Argentina
+    "MX": "mx.search.yahoo.com",  # Mexico
+    "SG": "sg.search.yahoo.com",  # Singapore
+}
+"""Map regions to domain"""
+
 lang2domain = {
     'zh_chs': 'hk.search.yahoo.com',
     'zh_cht': 'tw.search.yahoo.com',
@@ -65,40 +88,40 @@ lang2domain = {
 
 yahoo_languages = {
     "all": "any",
-    "ar": "ar",
-    "bg": "bg",
-    "cs": "cs",
-    "da": "da",
-    "de": "de",
-    "el": "el",
-    "en": "en",
-    "es": "es",
-    "et": "et",
-    "fi": "fi",
-    "fr": "fr",
-    "he": "he",
-    "hr": "hr",
-    "hu": "hu",
-    "it": "it",
-    "ja": "ja",
-    "ko": "ko",
-    "lt": "lt",
-    "lv": "lv",
-    "nl": "nl",
-    "no": "no",
-    "pl": "pl",
-    "pt": "pt",
-    "ro": "ro",
-    "ru": "ru",
-    "sk": "sk",
-    "sl": "sl",
-    "sv": "sv",
-    "th": "th",
-    "tr": "tr",
-    "zh": "zh_chs",
+    "ar": "ar",  # Arabic
+    "bg": "bg",  # Bulgarian
+    "cs": "cs",  # Czech
+    "da": "da",  # Danish
+    "de": "de",  # German
+    "el": "el",  # Greek
+    "en": "en",  # English
+    "es": "es",  # Spanish
+    "et": "et",  # Estonian
+    "fi": "fi",  # Finnish
+    "fr": "fr",  # French
+    "he": "he",  # Hebrew
+    "hr": "hr",  # Croatian
+    "hu": "hu",  # Hungarian
+    "it": "it",  # Italian
+    "ja": "ja",  # Japanese
+    "ko": "ko",  # Korean
+    "lt": "lt",  # Lithuanian
+    "lv": "lv",  # Latvian
+    "nl": "nl",  # Dutch
+    "no": "no",  # Norwegian
+    "pl": "pl",  # Polish
+    "pt": "pt",  # Portuguese
+    "ro": "ro",  # Romanian
+    "ru": "ru",  # Russian
+    "sk": "sk",  # Slovak
+    "sl": "sl",  # Slovenian
+    "sv": "sv",  # Swedish
+    "th": "th",  # Thai
+    "tr": "tr",  # Turkish
+    "zh": "zh_chs",  # Chinese (Simplified)
     "zh_Hans": "zh_chs",
     'zh-CN': "zh_chs",
-    "zh_Hant": "zh_cht",
+    "zh_Hant": "zh_cht",  # Chinese (Traditional)
     "zh-HK": "zh_cht",
     'zh-TW': "zh_cht",
 }
@@ -107,7 +130,7 @@ yahoo_languages = {
 def request(query, params):
     """build request"""
 
-    lang = params["language"].split("-")[0]
+    lang, region = (params["language"].split("-") + [None])[:2]
     lang = yahoo_languages.get(lang, "any")
 
     offset = (params['pageno'] - 1) * 7 + 1
@@ -127,9 +150,11 @@ def request(query, params):
         }
     )
 
-    domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang)
+    domain = region2domain.get(region)
+    if not domain:
+        domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang)
     params['url'] = 'https://%s/search?%s' % (domain, args)
-    return params
+    params['domain'] = domain
 
 
 def parse_url(url_string):
@@ -157,14 +182,22 @@ def response(resp):
     results = []
     dom = html.fromstring(resp.text)
 
+    url_xpath = './/div[contains(@class,"compTitle")]/h3/a/@href'
+    title_xpath = './/h3//a/@aria-label'
+
+    domain = resp.search_params['domain']
+    if domain == "search.yahoo.com":
+        url_xpath = './/div[contains(@class,"compTitle")]/a/@href'
+        title_xpath = './/div[contains(@class,"compTitle")]/a/h3/span'
+
     # parse results
     for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'):
-        url = eval_xpath_getindex(result, './/div[contains(@class,"compTitle")]/a/@href', 0, default=None)
+        url = eval_xpath_getindex(result, url_xpath, 0, default=None)
         if url is None:
             continue
         url = parse_url(url)
 
-        title = eval_xpath_getindex(result, './/div[contains(@class,"compTitle")]/a/h3/span', 0, default='')
+        title = eval_xpath_getindex(result, title_xpath, 0, default='')
         title: str = extract_text(title)
         content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='')
         content: str = extract_text(content, allow_none=True)