yahoo_news.py

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Yahoo (News)

Yahoo News is "English only" and does not offer localized or language-specific
queries.

"""

# pylint: disable=invalid-name, missing-function-docstring

import re
from urllib.parse import urlencode
from datetime import datetime, timedelta

from dateutil import parser
from lxml import html

from searx import logger
from searx.utils import (
    eval_xpath_list,
    eval_xpath_getindex,
    extract_text,
)
from searx.engines.yahoo import parse_url

logger = logger.getChild('yahoo_news engine')

# about
about = {
    "website": 'https://news.yahoo.com',
    "wikidata_id": 'Q3044717',
    "official_api_documentation": 'https://developer.yahoo.com/api/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

language_support = False
time_range_support = False
safesearch = False
paging = True
categories = ['news']

# search-url
search_url = (
    'https://news.search.yahoo.com/search'
    '?{query}&b={offset}'
)

# regex and unit table for relative publication dates such as "3 hours ago"
AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')
AGO_TIMEDELTA = {
    'minute': timedelta(minutes=1),
    'hour': timedelta(hours=1),
    'day': timedelta(days=1),
    'week': timedelta(days=7),
    'month': timedelta(days=30),
    'year': timedelta(days=365),
}
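
# Example (illustrative values, not from a real response): a scraped string
# like "3 hours ago" matches AGO_RE with groups ('3', 'hour'), so response()
# below reconstructs the publication date as
#
#     datetime.now() - 3 * AGO_TIMEDELTA['hour']   # i.e. three hours ago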

def request(query, params):
    # Yahoo's result offset is 1-based: page 1 -> b=1, page 2 -> b=11, ...
    offset = (params['pageno'] - 1) * 10 + 1

    params['url'] = search_url.format(
        offset=offset,
        query=urlencode({'p': query}),
    )
    logger.debug("query_url --> %s", params['url'])
    return params
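
# For a hypothetical second-page query, request('big bang', {'pageno': 2})
# yields offset 11 and sets params['url'] to:
#
#     https://news.search.yahoo.com/search?p=big+bang&b=11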

def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):
        url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
        if url is None:
            continue
        # unwrap Yahoo's redirect / tracking URL
        url = parse_url(url)

        title = extract_text(result.xpath('.//h4/a'))
        content = extract_text(result.xpath('.//p'))
        img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)

        item = {
            'url': url,
            'title': title,
            'content': content,
            'img_src': img_src,
        }

        # the publication date is either relative ("5 hours ago" -> AGO_RE)
        # or an absolute date handled by dateutil
        pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]'))
        ago = AGO_RE.search(pub_date)
        if ago:
            number = int(ago.group(1))
            delta = AGO_TIMEDELTA[ago.group(2)]
            pub_date = datetime.now() - delta * number
        else:
            try:
                pub_date = parser.parse(pub_date)
            except parser.ParserError:
                pub_date = None

        if pub_date is not None:
            item['publishedDate'] = pub_date
        results.append(item)

    for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'):
        results.append({'suggestion': extract_text(suggestion)})

    return results
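
# A minimal, standalone smoke test; not part of the engine. It assumes the
# module runs inside a SearXNG checkout (so the searx imports resolve) and
# that parse_url() passes plain, non-tracking URLs through unchanged.
# FakeResponse is a hypothetical stand-in: response() only reads ``.text``.
if __name__ == '__main__':
    class FakeResponse:  # pylint: disable=too-few-public-methods
        text = (
            '<ol class="searchCenterMiddle"><li>'
            '<h4><a href="https://news.example.org/a.html">Example title</a></h4>'
            '<p>Example content</p>'
            '<span class="s-time">2 hours ago</span>'
            '</li></ol>'
        )

    print(request('big bang', {'pageno': 2})['url'])
    for entry in response(FakeResponse()):
        print(entry)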