yahoo_news.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. """
  3. Yahoo (News)
  4. """
  5. import re
  6. from datetime import datetime, timedelta
  7. from urllib.parse import urlencode
  8. from lxml import html
  9. from searx.engines.yahoo import parse_url, language_aliases
  10. from searx.engines.yahoo import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import
  11. from dateutil import parser
  12. from searx.utils import extract_text, extract_url, match_language
  13. # about
  14. about = {
  15. "website": 'https://news.yahoo.com',
  16. "wikidata_id": 'Q3044717',
  17. "official_api_documentation": 'https://developer.yahoo.com/api/',
  18. "use_official_api": False,
  19. "require_api_key": False,
  20. "results": 'HTML',
  21. }
  22. # engine dependent config
  23. categories = ['news']
  24. paging = True
  25. language_support = True
  26. # search-url
  27. search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}' # noqa
  28. # specific xpath variables
  29. results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li'
  30. url_xpath = './/h3/a/@href'
  31. title_xpath = './/h3/a'
  32. content_xpath = './/div[@class="compText"]'
  33. publishedDate_xpath = './/span[contains(@class,"tri")]'
  34. suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
  35. # do search-request
  36. def request(query, params):
  37. offset = (params['pageno'] - 1) * 10 + 1
  38. if params['language'] == 'all':
  39. language = 'en'
  40. else:
  41. language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
  42. params['url'] = search_url.format(offset=offset,
  43. query=urlencode({'p': query}),
  44. lang=language)
  45. # TODO required?
  46. params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\
  47. .format(lang=language)
  48. return params
  49. def sanitize_url(url):
  50. if ".yahoo.com/" in url:
  51. return re.sub("\\;\\_ylt\\=.+$", "", url)
  52. else:
  53. return url
  54. # get response from search-request
  55. def response(resp):
  56. results = []
  57. dom = html.fromstring(resp.text)
  58. # parse results
  59. for result in dom.xpath(results_xpath):
  60. urls = result.xpath(url_xpath)
  61. if len(urls) != 1:
  62. continue
  63. url = sanitize_url(parse_url(extract_url(urls, search_url)))
  64. title = extract_text(result.xpath(title_xpath)[0])
  65. content = extract_text(result.xpath(content_xpath)[0])
  66. # parse publishedDate
  67. publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
  68. # still useful ?
  69. if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
  70. publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))
  71. elif re.match("^[0-9]+ days? ago$", publishedDate):
  72. publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group()))
  73. elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
  74. timeNumbers = re.findall(r'\d+', publishedDate)
  75. publishedDate = datetime.now()\
  76. - timedelta(hours=int(timeNumbers[0]))\
  77. - timedelta(minutes=int(timeNumbers[1]))
  78. else:
  79. try:
  80. publishedDate = parser.parse(publishedDate)
  81. except:
  82. publishedDate = datetime.now()
  83. if publishedDate.year == 1900:
  84. publishedDate = publishedDate.replace(year=datetime.now().year)
  85. # append result
  86. results.append({'url': url,
  87. 'title': title,
  88. 'content': content,
  89. 'publishedDate': publishedDate})
  90. # return results
  91. return results