google_news.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""
 Google (News)
"""

from urllib.parse import urlencode
from lxml import html
from searx.utils import match_language
from searx.engines.google import _fetch_supported_languages, supported_languages_url  # NOQA  # pylint: disable=unused-import
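
# Note: supported_languages and language_aliases are not defined in this file;
# the searx engine loader injects them at runtime, fetching the language list
# via the two names imported above from searx.engines.google.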

# about
about = {
    "website": 'https://news.google.com',
    "wikidata_id": 'Q12020',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['news']
paging = True
language_support = True
safesearch = True
time_range_support = True
number_of_results = 10
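
# Each page advances Google's 'start' offset by number_of_results
# (page 1 -> start=0, page 2 -> start=10, ...); see request() below.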

# search-url
search_url = 'https://www.google.com/search'\
    '?{query}'\
    '&tbm=nws'\
    '&gws_rd=cr'\
    '&{search_options}'
time_range_attr = "qdr:{range}"
time_range_dict = {'day': 'd',
                   'week': 'w',
                   'month': 'm',
                   'year': 'y'}
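
# Illustrative example (not part of the original file): a page-2, past-week,
# safe-search query for "openstreetmap" yields a URL like
#   https://www.google.com/search?q=openstreetmap&tbm=nws&gws_rd=cr&start=10&tbs=qdr%3Aw&safe=on&hl=en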


# do search-request
def request(query, params):
    search_options = {
        # Google expects a zero-based result offset, not a page number
        'start': (params['pageno'] - 1) * number_of_results
    }

    if params['time_range'] in time_range_dict:
        search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])

    if safesearch and params['safesearch']:
        search_options['safe'] = 'on'

    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      search_options=urlencode(search_options))

    if params['language'] != 'all':
        # reduce the matched language tag (e.g. 'en-US') to a bare code for &hl=
        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
        if language:
            params['url'] += '&hl=' + language

    return params
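
# A minimal sketch of how the searx core invokes the hook above (illustrative
# values; the real params dict carries more keys, but only these are read here):
#
#   params = {'pageno': 2, 'time_range': 'week', 'safesearch': 1, 'language': 'en-US'}
#   request('openstreetmap', params)
#   params['url']  # -> the search URL assembled above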


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
        try:
            r = {
                'url': result.xpath('.//a[@class="l lLrAF"]')[0].attrib.get("href"),
                'title': ''.join(result.xpath('.//a[@class="l lLrAF"]//text()')),
                'content': ''.join(result.xpath('.//div[@class="st"]//text()')),
            }
        except IndexError:
            # result block does not match the expected markup -> skip it
            continue

        # attach a thumbnail if present, skipping inline 'data:' URIs
        imgs = result.xpath('.//img/@src')
        if len(imgs) and not imgs[0].startswith('data'):
            r['img_src'] = imgs[0]

        results.append(r)

    return results
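
# The dicts collected above use searx's standard result keys ('url', 'title',
# 'content', and optional 'img_src'), which the core merges and renders.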