# yahoo_news.py
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. """Yahoo (News)
  4. Yahoo News is "English only" and do not offer localized nor language queries.
  5. """
  6. # pylint: disable=invalid-name
  7. import re
  8. from urllib.parse import urlencode
  9. from datetime import datetime, timedelta
  10. from dateutil import parser
  11. from lxml import html
  12. from searx.utils import (
  13. eval_xpath_list,
  14. eval_xpath_getindex,
  15. extract_text,
  16. )
  17. from searx.engines.yahoo import parse_url
# about — engine metadata (displayed by SearXNG in its engine listings)
about = {
    "website": 'https://news.yahoo.com',
    "wikidata_id": 'Q3044717',
    "official_api_documentation": 'https://developer.yahoo.com/api/',
    # results are scraped from the HTML search page, not fetched via an API
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}
# engine capabilities: Yahoo News is English-only (see module docstring),
# offers no time-range filter and no safe-search; results are paginated
language_support = False
time_range_support = False
safesearch = False
paging = True
categories = ['news']
# search-url template:
#   {query}  — urlencoded query string ('p=<terms>')
#   {offset} — 1-based index of the first result on the requested page
search_url = (
    'https://news.search.yahoo.com/search'
    '?{query}&b={offset}'
)
# matches relative publication ages such as "3 hours ago" / "2 days ago"
# in Yahoo's result markup: group(1) is the count, group(2) the unit
AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')

# one unit of each supported age as a timedelta; multiplied by the matched
# count to approximate a publication date (month/year use the rough
# 30-day / 365-day approximations)
AGO_TIMEDELTA = {
    'minute': timedelta(minutes=1),
    'hour': timedelta(hours=1),
    'day': timedelta(days=1),
    'week': timedelta(days=7),
    'month': timedelta(days=30),
    'year': timedelta(days=365),
}
  46. def request(query, params):
  47. offset = (params['pageno'] - 1) * 10 + 1
  48. params['url'] = search_url.format(
  49. offset = offset,
  50. query = urlencode({'p': query})
  51. )
  52. logger.debug("query_url --> %s", params['url'])
  53. return params
  54. def response(resp):
  55. results = []
  56. dom = html.fromstring(resp.text)
  57. # parse results
  58. for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):
  59. url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
  60. if url is None:
  61. continue
  62. url = parse_url(url)
  63. title = extract_text(result.xpath('.//h4/a'))
  64. content = extract_text(result.xpath('.//p'))
  65. img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)
  66. item = {
  67. 'url': url,
  68. 'title': title,
  69. 'content': content,
  70. 'img_src' : img_src
  71. }
  72. pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]'))
  73. ago = AGO_RE.search(pub_date)
  74. if ago:
  75. number = int(ago.group(1))
  76. delta = AGO_TIMEDELTA[ago.group(2)]
  77. pub_date = datetime.now() - delta * number
  78. else:
  79. try:
  80. pub_date = parser.parse(pub_date)
  81. except parser.ParserError:
  82. pub_date = None
  83. if pub_date is not None:
  84. item['publishedDate'] = pub_date
  85. results.append(item)
  86. for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'):
  87. results.append({'suggestion': extract_text(suggestion)})
  88. return results