# yahoo_news.py
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. """Yahoo (News)
  4. Yahoo News is "English only" and do not offer localized nor language queries.
  5. """
  6. # pylint: disable=invalid-name
  7. import re
  8. from urllib.parse import urlencode
  9. from datetime import datetime, timedelta
  10. from dateutil import parser
  11. from lxml import html
  12. from searx.utils import (
  13. eval_xpath_list,
  14. eval_xpath_getindex,
  15. extract_text,
  16. )
  17. from searx.engines.yahoo import parse_url
  18. # about
  19. about = {
  20. "website": 'https://news.yahoo.com',
  21. "wikidata_id": 'Q3044717',
  22. "official_api_documentation": 'https://developer.yahoo.com/api/',
  23. "use_official_api": False,
  24. "require_api_key": False,
  25. "results": 'HTML',
  26. }
  27. language_support = False
  28. time_range_support = False
  29. safesearch = False
  30. paging = True
  31. categories = ['news']
  32. # search-url
  33. search_url = (
  34. # fmt: off
  35. 'https://news.search.yahoo.com/search'
  36. '?{query}&b={offset}'
  37. # fmt: on
  38. )
  39. AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')
  40. AGO_TIMEDELTA = {
  41. 'minute': timedelta(minutes=1),
  42. 'hour': timedelta(hours=1),
  43. 'day': timedelta(days=1),
  44. 'week': timedelta(days=7),
  45. 'month': timedelta(days=30),
  46. 'year': timedelta(days=365),
  47. }
  48. def request(query, params):
  49. offset = (params['pageno'] - 1) * 10 + 1
  50. params['url'] = search_url.format(offset=offset, query=urlencode({'p': query}))
  51. logger.debug("query_url --> %s", params['url'])
  52. return params
  53. def response(resp):
  54. results = []
  55. dom = html.fromstring(resp.text)
  56. # parse results
  57. for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):
  58. url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
  59. if url is None:
  60. continue
  61. url = parse_url(url)
  62. title = extract_text(result.xpath('.//h4/a'))
  63. content = extract_text(result.xpath('.//p'))
  64. img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)
  65. item = {'url': url, 'title': title, 'content': content, 'img_src': img_src}
  66. pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]'))
  67. ago = AGO_RE.search(pub_date)
  68. if ago:
  69. number = int(ago.group(1))
  70. delta = AGO_TIMEDELTA[ago.group(2)]
  71. pub_date = datetime.now() - delta * number
  72. else:
  73. try:
  74. pub_date = parser.parse(pub_date)
  75. except parser.ParserError:
  76. pub_date = None
  77. if pub_date is not None:
  78. item['publishedDate'] = pub_date
  79. results.append(item)
  80. for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'):
  81. results.append({'suggestion': extract_text(suggestion)})
  82. return results