yahoo_news.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. """Yahoo (News)
  4. Yahoo News is "English only" and do not offer localized nor language queries.
  5. """
  6. # pylint: disable=invalid-name
  7. import re
  8. from urllib.parse import urlencode
  9. from datetime import datetime, timedelta
  10. from dateutil import parser
  11. from lxml import html
  12. from searx.utils import (
  13. eval_xpath_list,
  14. eval_xpath_getindex,
  15. extract_text,
  16. )
  17. from searx.engines.yahoo import parse_url
# about
about = {
    "website": 'https://news.yahoo.com',
    "wikidata_id": 'Q3044717',
    "official_api_documentation": 'https://developer.yahoo.com/api/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine capabilities: Yahoo News is English-only (see module docstring),
# so language / time-range / safesearch filtering is not offered.
language_support = False
time_range_support = False
safesearch = False
paging = True
categories = ['news']

# search-url
# 'b' is the 1-based offset of the first result on the requested page.
search_url = 'https://news.search.yahoo.com/search' '?{query}&b={offset}'

# matches relative age strings such as "3 hours ago" / "2 days ago";
# group 1 is the count, group 2 the unit.
AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')

# unit -> timedelta of one such unit; month/year are approximations,
# which is acceptable for a coarse "published N months ago" date.
AGO_TIMEDELTA = {
    'minute': timedelta(minutes=1),
    'hour': timedelta(hours=1),
    'day': timedelta(days=1),
    'week': timedelta(days=7),
    'month': timedelta(days=30),
    'year': timedelta(days=365),
}
  43. def request(query, params):
  44. offset = (params['pageno'] - 1) * 10 + 1
  45. params['url'] = search_url.format(offset=offset, query=urlencode({'p': query}))
  46. logger.debug("query_url --> %s", params['url'])
  47. return params
  48. def response(resp):
  49. results = []
  50. dom = html.fromstring(resp.text)
  51. # parse results
  52. for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):
  53. url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
  54. if url is None:
  55. continue
  56. url = parse_url(url)
  57. title = extract_text(result.xpath('.//h4/a'))
  58. content = extract_text(result.xpath('.//p'))
  59. img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)
  60. item = {'url': url, 'title': title, 'content': content, 'img_src': img_src}
  61. pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]'))
  62. ago = AGO_RE.search(pub_date)
  63. if ago:
  64. number = int(ago.group(1))
  65. delta = AGO_TIMEDELTA[ago.group(2)]
  66. pub_date = datetime.now() - delta * number
  67. else:
  68. try:
  69. pub_date = parser.parse(pub_date)
  70. except parser.ParserError:
  71. pub_date = None
  72. if pub_date is not None:
  73. item['publishedDate'] = pub_date
  74. results.append(item)
  75. for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'):
  76. results.append({'suggestion': extract_text(suggestion)})
  77. return results