
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Google (News)

For detailed description of the *REST-full* API see: `Query Parameter
Definitions`_.  Not all parameters can be applied:

- num_ : the number of search results is ignored
- save_ : is ignored / Google-News results are always *SafeSearch*

.. _Query Parameter Definitions:
   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
"""

# pylint: disable=invalid-name, missing-function-docstring

import binascii
import re
from datetime import datetime
from urllib.parse import urlencode
from base64 import b64decode

from lxml import html

from searx import logger
from searx.utils import (
    eval_xpath,
    eval_xpath_list,
    eval_xpath_getindex,
    extract_text,
)

# pylint: disable=unused-import
from searx.engines.google import (
    supported_languages_url,
    _fetch_supported_languages,
)
# pylint: enable=unused-import

from searx.engines.google import (
    get_lang_info,
    detect_google_sorry,
)

# about
about = {
    "website": 'https://news.google.com',
    "wikidata_id": 'Q12020',
    "official_api_documentation": 'https://developers.google.com/custom-search',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

logger = logger.getChild('google news')

# Compared to other Google engines, Google News handles the time range
# differently: the range is appended to the search term itself.
time_range_dict = {
    'day': 'when:1d',
    'week': 'when:7d',
    'month': 'when:1m',
    'year': 'when:1y',
}
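# For example (illustrative only): with time_range='week' a query 'climate'
# is sent to Google News as 'climate when:7d'.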

# engine dependent config
categories = ['news']
paging = False
use_locale_domain = True
time_range_support = True

# Google-News results are always *SafeSearch*.  Option 'safesearch' is set to
# False here, otherwise the checker will report safesearch errors::
#
#   safesearch : results are identical for safesearch=0 and safesearch=2
safesearch = False


def request(query, params):
    """Google-News search request"""

    lang_info = get_lang_info(
        # pylint: disable=undefined-variable
        params, supported_languages, language_aliases, False
    )
    logger.debug(
        "HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])

    # Google News has only one domain
    lang_info['subdomain'] = 'news.google.com'

    ceid = "%s:%s" % (lang_info['country'], lang_info['language'])
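    # ceid is a '<country>:<language>' pair, e.g. 'US:en' (illustrative value),
    # which Google News uses to select the edition of the result set.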

    # google news redirects en to en-US
    if lang_info['params']['hl'] == 'en':
        lang_info['params']['hl'] = 'en-US'

    # Specific to Google News compared to the other Google engines: the time
    # range is part of the search term (see time_range_dict above).
    if params['time_range']:
        query += ' ' + time_range_dict[params['time_range']]

    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
        'q': query,
        **lang_info['params'],
        'ie': "utf8",
        'oe': "utf8",
        'gl': lang_info['country'],
    }) + ('&ceid=%s' % ceid)  # ceid includes a ':' character which must not be urlencoded
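    # The resulting URL looks like this (illustrative values; the exact
    # parameter set in lang_info['params'] depends on get_lang_info()):
    #
    #   https://news.google.com/search?q=foo+when%3A7d&hl=en-US&ie=utf8&oe=utf8&gl=US&ceid=US:en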
    params['url'] = query_url

    params['headers'].update(lang_info['headers'])
    params['headers']['Accept'] = (
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    )
    params['headers']['Cookie'] = "CONSENT=YES+cb.%s-14-p0.en+F+941;" % datetime.now().strftime("%Y%m%d")
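    # The CONSENT cookie pre-accepts Google's cookie-consent dialog, otherwise
    # the request is redirected to consent.google.com.  The cookie value is
    # reverse engineered and may break whenever Google changes its format.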

    return params


def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):

        # The first <a> tag in the <article> contains the link to the article.
        # The href attribute of that <a> is a Google-internal link we can't
        # use.  The real link is hidden in the jslog attribute:
        #
        #   <a ...
        #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
        #      href="./articles/CAIiENu3nGS...?hl=en-US&amp;gl=US&amp;ceid=US%3Aen"
        #      ... />

        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
        url = re.findall('http[^;]*', jslog)
        if url:
            url = url[0]
        else:
            # The real URL is base64 encoded in the jslog attribute:
            # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
            jslog = jslog.split(";")[1].split(':')[1].strip()
            try:
                # restore the '=' padding Google strips from the base64 value
                padding = (4 - (len(jslog) % 4)) * "="
                jslog = b64decode(jslog + padding)
            except binascii.Error:
                # URL can't be read, skip this result
                continue

            # now we have: b'[null, ... null,"https://www.cnn.com/.../index.html"]'
            url = re.findall('http[^;"]*', str(jslog))[0]
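
        # Worked example of the decode branch above (value is illustrative,
        # not taken from a live response):
        #
        #   >>> s = 'W251bGwsbnVsbCxudWxsLCJodHRwczovL2V4YW1wbGUuY29tL2EiXQ'
        #   >>> b64decode(s + (4 - len(s) % 4) * '=')
        #   b'[null,null,null,"https://example.com/a"]'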

        # the first <h3> tag in the <article> contains the title of the link
        title = extract_text(eval_xpath(result, './article/h3[1]'))

        # the first <div> tag in the <article> contains the content of the link
        content = extract_text(eval_xpath(result, './article/div[1]'))

        # the second <div> tag contains the origin publisher and the publishing date
        pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
        pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))

        pub_info = []
        if pub_origin:
            pub_info.append(pub_origin)
        if pub_date:
            # The pub_date is mostly a string like 'Yesterday', not a real
            # timezone date or time, therefore we can't use publishedDate.
            pub_info.append(pub_date)
        pub_info = ', '.join(pub_info)
        if pub_info:
            content = pub_info + ': ' + content
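        # e.g. (illustrative): content = 'CNN, Yesterday: <first lines of the article>'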

        # The image URL is located in a preceding sibling <img> tag, e.g.:
        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
        # These URLs are long but not personalized (double checked via Tor).
        img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'img_src': img_src,
        })

    return results
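

# Minimal usage sketch (hypothetical; in production the searx core builds the
# params dict and injects 'supported_languages' / 'language_aliases' into this
# module before request() is called):
#
#   params = {'language': 'en-US', 'time_range': 'week', 'headers': {}}
#   request('climate change', params)
#   print(params['url'])  # --> https://news.google.com/search?q=climate+change+when%3A7d&...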