# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the google news engine.  The google news API
ignores some parameters from the common :ref:`google API`:

- num_ : the number of search results is ignored
- save_ : is ignored / Google-News results are always *SafeSearch*

.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp

"""

# pylint: disable=invalid-name, missing-function-docstring

import binascii
from datetime import datetime
import re
from urllib.parse import urlencode
from base64 import b64decode

from lxml import html

from searx.utils import (
    eval_xpath,
    eval_xpath_list,
    eval_xpath_getindex,
    extract_text,
)

# pylint: disable=unused-import
from searx.engines.google import (
    supported_languages_url,
    _fetch_supported_languages,
)
# pylint: enable=unused-import

from searx.engines.google import (
    get_lang_info,
    detect_google_sorry,
)

# about
about = {
    "website": 'https://news.google.com',
    "wikidata_id": 'Q12020',
    "official_api_documentation": 'https://developers.google.com/custom-search',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# Compared to other google engines, google-news has different time range
# support: the time range is appended to the search term.
time_range_dict = {
    'day': 'when:1d',
    'week': 'when:7d',
    'month': 'when:1m',
    'year': 'when:1y',
}
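# For example, with time_range 'week' the query 'searx' is sent as the search
# term 'searx when:7d' (see request() below).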

# engine dependent config
categories = ['news']
paging = False
use_locale_domain = True
time_range_support = True

# Google-News results are always *SafeSearch*.  Option 'safesearch' is set to
# False here, otherwise the checker will report safesearch errors::
#
#     safesearch : results are identical for safesearch=0 and safesearch=2
safesearch = False


def request(query, params):
    """Google-News search request"""

    lang_info = get_lang_info(
        params, supported_languages, language_aliases, False
    )
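    # Note: ``logger``, ``supported_languages`` and ``language_aliases`` are
    # not defined in this module; the searx engine loader injects them into
    # the engine module at runtime.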
    logger.debug(
        "HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])

    # google news has only one domain
    lang_info['subdomain'] = 'news.google.com'

    ceid = "%s:%s" % (lang_info['country'], lang_info['language'])
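    # the ceid argument has the form <country>:<language>, e.g. 'US:en'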

    # google news redirects en to en-US
    if lang_info['params']['hl'] == 'en':
        lang_info['params']['hl'] = 'en-US'

    # Very special to google-news compared to other google engines: the time
    # range is included in the search term.
    if params['time_range']:
        query += ' ' + time_range_dict[params['time_range']]

    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
        'q': query,
        **lang_info['params'],
        'ie': "utf8",
        'oe': "utf8",
        'gl': lang_info['country'],
    }) + ('&ceid=%s' % ceid)  # ceid includes a ':' character which must not be urlencoded
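    # The resulting URL looks like (parameter values depend on the locale):
    # https://news.google.com/search?q=searx+when%3A7d&...&gl=US&ceid=US:en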

    params['url'] = query_url

    params['headers'].update(lang_info['headers'])
    params['headers']['Accept'] = (
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    )
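    # The CONSENT cookie marks Google's GDPR consent dialog as already
    # answered; without it the request may be redirected to a consent page.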
    params['headers']['Cookie'] = "CONSENT=YES+cb.%s-14-p0.en+F+941;" % datetime.now().strftime("%Y%m%d")

    return params


def response(resp):
    """Get response from google's search request"""
    results = []
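
    # raises an exception when Google answers with its "sorry" (CAPTCHA) page
    # instead of a result page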
    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):

        # The first <a> tag in the <article> contains the link to the article.
        # The href attribute of that <a> is a google-internal link which we
        # can't use.  The real link is hidden in the jslog attribute:
        #
        #   <a ...
        #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
        #      href="./articles/CAIiENu3nGS...?hl=en-US&amp;gl=US&amp;ceid=US%3Aen"
        #      ... />

        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
        url = re.findall('http[^;]*', jslog)
        if url:
            url = url[0]
        else:
            # The real URL is base64 encoded inside the jslog attribute:
            # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
            jslog = jslog.split(";")[1].split(':')[1].strip()
            try:
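                # a base64 string's length must be a multiple of four, so the
                # stripped value is padded with '=' before decoding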
                padding = (4 - (len(jslog) % 4)) * "="
                jslog = b64decode(jslog + padding)
            except binascii.Error:
                # URL can't be read, skip this result
                continue

            # now we have: b'[null, ... null,"https://www.cnn.com/.../index.html"]'
            url = re.findall('http[^;"]*', str(jslog))[0]

        # the first <h3> tag in the <article> contains the title of the link
        title = extract_text(eval_xpath(result, './article/h3[1]'))

        # the first <div> tag in the <article> contains the content of the link
        content = extract_text(eval_xpath(result, './article/div[1]'))

        # the second <div> tag contains the origin publisher and the publishing date
        pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
        pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))

        pub_info = []
        if pub_origin:
            pub_info.append(pub_origin)
        if pub_date:
            # The pub_date is mostly a string like 'yesterday', not a real
            # timezone date or time.  Therefore we can't use publishedDate.
            pub_info.append(pub_date)
        pub_info = ', '.join(pub_info)
        if pub_info:
            content = pub_info + ': ' + content

        # The image URL is located in a preceding sibling <img> tag, e.g.:
        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
        # These URLs are long but not personalized (double checked via Tor).
        img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'img_src': img_src,
        })

    return results