yahoo_news.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Yahoo (News)

Yahoo News is "English only" and does not offer localized or language-specific
queries.

"""

# pylint: disable=invalid-name, missing-function-docstring
import re
from urllib.parse import urlencode
from datetime import datetime, timedelta
from dateutil import parser
from lxml import html

from searx import logger
from searx.utils import (
    eval_xpath_list,
    eval_xpath_getindex,
    extract_text,
)

from searx.engines.yahoo import parse_url

# pylint: disable=unused-import
from searx.engines.yahoo import (
    _fetch_supported_languages,
    supported_languages_url,
)
# pylint: enable=unused-import

logger = logger.getChild('yahoo_news engine')
# about
about = {
    "website": 'https://news.yahoo.com',
    "wikidata_id": 'Q3044717',
    "official_api_documentation": 'https://developer.yahoo.com/api/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

language_support = False
time_range_support = False
safesearch = False
paging = True
categories = ['news']
# search-url
search_url = (
    'https://news.search.yahoo.com/search'
    '?{query}&b={offset}'
)
AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')
AGO_TIMEDELTA = {
    'minute': timedelta(minutes=1),
    'hour': timedelta(hours=1),
    'day': timedelta(days=1),
    'week': timedelta(days=7),
    'month': timedelta(days=30),
    'year': timedelta(days=365),
}

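# Paging: Yahoo News selects the page via the b= parameter, the 1-based index
# of the first result on the page.  The engine assumes 10 results per page, so
# pageno=1 --> b=1 and pageno=2 --> b=11; e.g. page 2 of the query "foo" is
# requested as https://news.search.yahoo.com/search?p=foo&b=11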
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    params['url'] = search_url.format(
        offset=offset,
        query=urlencode({'p': query}),
    )

    logger.debug("query_url --> %s", params['url'])
    return params

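# Each hit is an <li> below the ol.searchCenterMiddle list: <h4><a> carries the
# title and link, <p> the snippet, an optional img/@data-src the thumbnail, and
# span.s-time the relative timestamp.  parse_url() (imported from the yahoo
# engine) normalizes the extracted link before it is handed to the result item.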
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):

        url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
        if url is None:
            continue
        url = parse_url(url)

        title = extract_text(result.xpath('.//h4/a'))
        content = extract_text(result.xpath('.//p'))
        img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)

        item = {
            'url': url,
            'title': title,
            'content': content,
            'img_src': img_src,
        }

        pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]'))
        ago = AGO_RE.search(pub_date)
        if ago:
            number = int(ago.group(1))
            delta = AGO_TIMEDELTA[ago.group(2)]
            pub_date = datetime.now() - delta * number
        else:
            try:
                pub_date = parser.parse(pub_date)
            except parser.ParserError:
                pub_date = None

        if pub_date is not None:
            item['publishedDate'] = pub_date
        results.append(item)

    for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'):
        results.append({'suggestion': extract_text(suggestion)})

    return results