yahoo.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """Yahoo Search (Web)
Languages are supported by mapping the language to a domain. If the language is
not found in :py:obj:`lang2domain`, the URL ``<lang>.search.yahoo.com`` is used.
  5. """
  6. from typing import TYPE_CHECKING
  7. from urllib.parse import (
  8. unquote,
  9. urlencode,
  10. )
  11. from lxml import html
  12. from searx.utils import (
  13. eval_xpath_getindex,
  14. eval_xpath_list,
  15. extract_text,
  16. html_to_text,
  17. )
  18. from searx.enginelib.traits import EngineTraits
  19. traits: EngineTraits
  20. if TYPE_CHECKING:
  21. import logging
  22. logger: logging.Logger
  23. # about
  24. about = {
  25. "website": 'https://search.yahoo.com/',
  26. "wikidata_id": None,
  27. "official_api_documentation": 'https://developer.yahoo.com/api/',
  28. "use_official_api": False,
  29. "require_api_key": False,
  30. "results": 'HTML',
  31. }
  32. # engine dependent config
  33. categories = ['general', 'web']
  34. paging = True
  35. time_range_support = True
  36. # send_accept_language_header = True
  37. time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm'}
  38. safesearch_dict = {0: 'p', 1: 'i', 2: 'r'}
  39. region2domain = {
  40. "CO": "co.search.yahoo.com", # Colombia
  41. "TH": "th.search.yahoo.com", # Thailand
  42. "VE": "ve.search.yahoo.com", # Venezuela
  43. "CL": "cl.search.yahoo.com", # Chile
  44. "HK": "hk.search.yahoo.com", # Hong Kong
  45. "PE": "pe.search.yahoo.com", # Peru
  46. "CA": "ca.search.yahoo.com", # Canada
  47. "DE": "de.search.yahoo.com", # Germany
  48. "FR": "fr.search.yahoo.com", # France
  49. "TW": "tw.search.yahoo.com", # Taiwan
  50. "GB": "uk.search.yahoo.com", # United Kingdom
  51. "UK": "uk.search.yahoo.com",
  52. "BR": "br.search.yahoo.com", # Brazil
  53. "IN": "in.search.yahoo.com", # India
  54. "ES": "espanol.search.yahoo.com", # Espanol
  55. "PH": "ph.search.yahoo.com", # Philippines
  56. "AR": "ar.search.yahoo.com", # Argentina
  57. "MX": "mx.search.yahoo.com", # Mexico
  58. "SG": "sg.search.yahoo.com", # Singapore
  59. }
  60. """Map regions to domain"""
  61. lang2domain = {
  62. 'zh_chs': 'hk.search.yahoo.com',
  63. 'zh_cht': 'tw.search.yahoo.com',
  64. 'any': 'search.yahoo.com',
  65. 'en': 'search.yahoo.com',
  66. 'bg': 'search.yahoo.com',
  67. 'cs': 'search.yahoo.com',
  68. 'da': 'search.yahoo.com',
  69. 'el': 'search.yahoo.com',
  70. 'et': 'search.yahoo.com',
  71. 'he': 'search.yahoo.com',
  72. 'hr': 'search.yahoo.com',
  73. 'ja': 'search.yahoo.com',
  74. 'ko': 'search.yahoo.com',
  75. 'sk': 'search.yahoo.com',
  76. 'sl': 'search.yahoo.com',
  77. }
  78. """Map language to domain"""
  79. yahoo_languages = {
  80. "all": "any",
  81. "ar": "ar", # Arabic
  82. "bg": "bg", # Bulgarian
  83. "cs": "cs", # Czech
  84. "da": "da", # Danish
  85. "de": "de", # German
  86. "el": "el", # Greek
  87. "en": "en", # English
  88. "es": "es", # Spanish
  89. "et": "et", # Estonian
  90. "fi": "fi", # Finnish
  91. "fr": "fr", # French
  92. "he": "he", # Hebrew
  93. "hr": "hr", # Croatian
  94. "hu": "hu", # Hungarian
  95. "it": "it", # Italian
  96. "ja": "ja", # Japanese
  97. "ko": "ko", # Korean
  98. "lt": "lt", # Lithuanian
  99. "lv": "lv", # Latvian
  100. "nl": "nl", # Dutch
  101. "no": "no", # Norwegian
  102. "pl": "pl", # Polish
  103. "pt": "pt", # Portuguese
  104. "ro": "ro", # Romanian
  105. "ru": "ru", # Russian
  106. "sk": "sk", # Slovak
  107. "sl": "sl", # Slovenian
  108. "sv": "sv", # Swedish
  109. "th": "th", # Thai
  110. "tr": "tr", # Turkish
  111. "zh": "zh_chs", # Chinese (Simplified)
  112. "zh_Hans": "zh_chs",
  113. 'zh-CN': "zh_chs",
  114. "zh_Hant": "zh_cht", # Chinese (Traditional)
  115. "zh-HK": "zh_cht",
  116. 'zh-TW': "zh_cht",
  117. }
  118. def build_sb_cookie(cookie_params):
  119. """Build sB cookie parameter from provided parameters.
  120. :param cookie_params: Dictionary of cookie parameters
  121. :type cookie_params: dict
  122. :returns: Formatted cookie string
  123. :rtype: str
  124. Example:
  125. >>> cookie_params = {'v': '1', 'vm': 'p', 'fl': '1', 'vl': 'lang_fr'}
  126. >>> build_sb_cookie(cookie_params)
  127. 'v=1&vm=p&fl=1&vl=lang_fr'
  128. """
  129. cookie_parts = []
  130. for key, value in cookie_params.items():
  131. cookie_parts.append(f"{key}={value}")
  132. return "&".join(cookie_parts)
  133. def request(query, params):
  134. """Build Yahoo search request."""
  135. lang, region = (params["language"].split("-") + [None])[:2]
  136. lang = yahoo_languages.get(lang, "any")
  137. # Build URL parameters
  138. # - p (str): Search query string
  139. # - btf (str): Time filter, maps to values like 'd' (day), 'w' (week), 'm' (month)
  140. # - iscqry (str): Empty string, necessary for results to appear properly on first page
  141. # - b (int): Search offset for pagination
  142. # - pz (str): Amount of results expected for the page
  143. url_params = {'p': query}
  144. btf = time_range_dict.get(params['time_range'])
  145. if btf:
  146. url_params['btf'] = btf
  147. if params['pageno'] == 1:
  148. url_params['iscqry'] = ''
  149. elif params['pageno'] >= 2:
  150. url_params['b'] = params['pageno'] * 7 + 1 # 8, 15, 21, etc.
  151. url_params['pz'] = 7
  152. url_params['bct'] = 0
  153. url_params['xargs'] = 0
  154. # Build sB cookie (for filters)
  155. # - vm (str): SafeSearch filter, maps to values like 'p' (None), 'i' (Moderate), 'r' (Strict)
  156. # - fl (bool): Indicates if a search language is used or not
  157. # - vl (str): The search language to use (e.g. lang_fr)
  158. sbcookie_params = {
  159. 'v': 1,
  160. 'vm': safesearch_dict[params['safesearch']],
  161. 'fl': 1,
  162. 'vl': f'lang_{lang}',
  163. 'pn': 10,
  164. 'rw': 'new',
  165. 'userset': 1,
  166. }
  167. params['cookies']['sB'] = build_sb_cookie(sbcookie_params)
  168. # Search region/language
  169. domain = region2domain.get(region)
  170. if not domain:
  171. domain = lang2domain.get(lang, f'{lang}.search.yahoo.com')
  172. logger.debug(f'domain selected: {domain}')
  173. logger.debug(f'cookies: {params["cookies"]}')
  174. params['url'] = f'https://{domain}/search?{urlencode(url_params)}'
  175. params['domain'] = domain
  176. def parse_url(url_string):
  177. """remove yahoo-specific tracking-url"""
  178. endings = ['/RS', '/RK']
  179. endpositions = []
  180. start = url_string.find('http', url_string.find('/RU=') + 1)
  181. for ending in endings:
  182. endpos = url_string.rfind(ending)
  183. if endpos > -1:
  184. endpositions.append(endpos)
  185. if start == 0 or len(endpositions) == 0:
  186. return url_string
  187. end = min(endpositions)
  188. return unquote(url_string[start:end])
  189. def response(resp):
  190. """parse response"""
  191. results = []
  192. dom = html.fromstring(resp.text)
  193. url_xpath = './/div[contains(@class,"compTitle")]/h3/a/@href'
  194. title_xpath = './/h3//a/@aria-label'
  195. domain = resp.search_params['domain']
  196. if domain == "search.yahoo.com":
  197. url_xpath = './/div[contains(@class,"compTitle")]/a/@href'
  198. title_xpath = './/div[contains(@class,"compTitle")]/a/h3/span'
  199. # parse results
  200. for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'):
  201. url = eval_xpath_getindex(result, url_xpath, 0, default=None)
  202. if url is None:
  203. continue
  204. url = parse_url(url)
  205. title = eval_xpath_getindex(result, title_xpath, 0, default='')
  206. title: str = extract_text(title)
  207. content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='')
  208. content: str = extract_text(content, allow_none=True)
  209. # append result
  210. results.append(
  211. {
  212. 'url': url,
  213. # title sometimes contains HTML tags / see
  214. # https://github.com/searxng/searxng/issues/3790
  215. 'title': " ".join(html_to_text(title).strip().split()),
  216. 'content': " ".join(html_to_text(content).strip().split()),
  217. }
  218. )
  219. for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]//table//a'):
  220. # append suggestion
  221. results.append({'suggestion': extract_text(suggestion)})
  222. return results