yahoo.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Yahoo Search (Web)

Languages are supported by mapping the language to a domain. If the domain is
not found in :py:obj:`lang2domain`, the URL ``<lang>.search.yahoo.com`` is
used.
"""

from urllib.parse import (
    unquote,
    urlencode,
)

from lxml import html

from searx.utils import (
    eval_xpath_getindex,
    eval_xpath_list,
    extract_text,
    html_to_text,
)
from searx.enginelib.traits import EngineTraits

traits: EngineTraits

# about
about = {
    "website": 'https://search.yahoo.com/',
    "wikidata_id": None,
    "official_api_documentation": 'https://developer.yahoo.com/api/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
# send_accept_language_header = True

time_range_dict = {
    'day': ('1d', 'd'),
    'week': ('1w', 'w'),
    'month': ('1m', 'm'),
}
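# Each ``time_range_dict`` value above is an ``(age, btf)`` pair; request()
# below sends the first element as the ``age`` query argument and the second
# as ``btf``, e.g. 'day' becomes ``age=1d&btf=d``.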

lang2domain = {
    'zh_chs': 'hk.search.yahoo.com',
    'zh_cht': 'tw.search.yahoo.com',
    'any': 'search.yahoo.com',
    'en': 'search.yahoo.com',
    'bg': 'search.yahoo.com',
    'cs': 'search.yahoo.com',
    'da': 'search.yahoo.com',
    'el': 'search.yahoo.com',
    'et': 'search.yahoo.com',
    'he': 'search.yahoo.com',
    'hr': 'search.yahoo.com',
    'ja': 'search.yahoo.com',
    'ko': 'search.yahoo.com',
    'sk': 'search.yahoo.com',
    'sl': 'search.yahoo.com',
}
"""Map language to domain"""

yahoo_languages = {
    "all": "any",
    "ar": "ar",
    "bg": "bg",
    "cs": "cs",
    "da": "da",
    "de": "de",
    "el": "el",
    "en": "en",
    "es": "es",
    "et": "et",
    "fi": "fi",
    "fr": "fr",
    "he": "he",
    "hr": "hr",
    "hu": "hu",
    "it": "it",
    "ja": "ja",
    "ko": "ko",
    "lt": "lt",
    "lv": "lv",
    "nl": "nl",
    "no": "no",
    "pl": "pl",
    "pt": "pt",
    "ro": "ro",
    "ru": "ru",
    "sk": "sk",
    "sl": "sl",
    "sv": "sv",
    "th": "th",
    "tr": "tr",
    "zh": "zh_chs",
    "zh_Hans": "zh_chs",
    "zh-CN": "zh_chs",
    "zh_Hant": "zh_cht",
    "zh-HK": "zh_cht",
    "zh-TW": "zh_cht",
}
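
# Illustrative resolution chain (a sketch, not engine code): a SearXNG locale
# such as "fr-BE" is reduced to its language part, mapped through
# ``yahoo_languages``, then through ``lang2domain``:
#
#   lang = yahoo_languages.get("fr-BE".split("-")[0], "any")        # -> "fr"
#   domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang)
#   # -> "fr.search.yahoo.com" (no "fr" entry in lang2domain)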


def request(query, params):
    """build request"""
    lang = params["language"].split("-")[0]
    lang = yahoo_languages.get(lang, "any")
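    # Yahoo paginates in steps of 7 results; ``b`` below is the 1-based index
    # of the first result on the requested page.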
    offset = (params['pageno'] - 1) * 7 + 1
    age, btf = time_range_dict.get(params['time_range'], ('', ''))

    args = urlencode(
        {
            'p': query,
            'ei': 'UTF-8',
            'fl': 1,
            'vl': 'lang_' + lang,
            'btf': btf,
            'fr2': 'time',
            'age': age,
            'b': offset,
            'xargs': 0,
        }
    )

    domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang)
    params['url'] = 'https://%s/search?%s' % (domain, args)
    return params
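
# For example (illustrative, not a captured request), query "time zone" with
# language "fr-BE", page 1 and time_range "day" yields roughly:
#
#   https://fr.search.yahoo.com/search?p=time+zone&ei=UTF-8&fl=1&vl=lang_fr
#       &btf=d&fr2=time&age=1d&b=1&xargs=0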


def parse_url(url_string):
    """remove Yahoo-specific tracking URL"""
    endings = ['/RS', '/RK']
    endpositions = []
    start = url_string.find('http', url_string.find('/RU=') + 1)

    for ending in endings:
        endpos = url_string.rfind(ending)
        if endpos > -1:
            endpositions.append(endpos)

    if start == 0 or len(endpositions) == 0:
        return url_string

    end = min(endpositions)
    return unquote(url_string[start:end])
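
# A worked example of the shape parse_url() handles (the URL below is made up
# for illustration; real tracking URLs carry opaque _ylt/_ylu values):
#
#   parse_url('https://r.search.yahoo.com/_ylt=A0/RU=https%3a%2f%2fexample.org%2f/RK=2/RS=abc')
#   # start -> position of 'https%3a...', end -> min(positions of '/RK', '/RS')
#   # returns 'https://example.org/'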


def response(resp):
    """parse response"""
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'):
        url = eval_xpath_getindex(result, './/h3/a/@href', 0, default=None)
        if url is None:
            continue
        url = parse_url(url)

        title = eval_xpath_getindex(result, './/h3//a/@aria-label', 0, default='')
        title: str = extract_text(title)
        content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='')
        content: str = extract_text(content, allow_none=True)

        # append result
        results.append(
            {
                'url': url,
                # title sometimes contains HTML tags / see
                # https://github.com/searxng/searxng/issues/3790
                'title': " ".join(html_to_text(title).strip().split()),
                'content': " ".join(html_to_text(content).strip().split()),
            }
        )

    for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]//table//a'):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    return results
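

if __name__ == '__main__':
    # Ad-hoc smoke test (a sketch, not part of the engine): build a request
    # and feed the fetched HTML back into response(). In SearXNG the engine
    # is driven by the framework's networking layer, not by this code; the
    # stdlib fetch here is only for illustration and assumes network access
    # and that Yahoo's markup still matches the XPaths above.
    import types
    import urllib.request

    params = request('test', {'language': 'en-US', 'pageno': 1, 'time_range': None})
    with urllib.request.urlopen(params['url']) as page:
        fake_resp = types.SimpleNamespace(text=page.read().decode('utf-8'))
    for item in response(fake_resp):
        print(item)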