# SPDX-License-Identifier: AGPL-3.0-or-later
"""Google (News)

For a detailed description of the *RESTful* API see: `Query Parameter
Definitions`_.  Not all parameters can be applied, e.g. num_ (the number of
search results to return) is ignored.

.. _Query Parameter Definitions:
   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp

"""

# pylint: disable=invalid-name, missing-function-docstring

import binascii
import re
from urllib.parse import urlencode
from base64 import b64decode

from lxml import html

from searx import logger
from searx.utils import (
    eval_xpath,
    eval_xpath_list,
    eval_xpath_getindex,
    extract_text,
)

# pylint: disable=unused-import
from searx.engines.google import (
    supported_languages_url,
    _fetch_supported_languages,
    detect_google_sorry,
)
# pylint: enable=unused-import

from searx.engines.google import (
    get_lang_country,
    filter_mapping,
)

# about
about = {
    "website": 'https://news.google.com',
    "wikidata_id": 'Q12020',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

logger = logger.getChild('google news')

# Compared to other google engines, google-news supports time ranges
# differently: the time range is appended to the search term itself.
time_range_dict = {
    'day': 'when:1d',
    'week': 'when:7d',
    'month': 'when:1m',
    'year': 'when:1y',
}
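# For example, with time_range 'week' the query "foo" is sent to Google News
# as "foo when:7d" (see request() below).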

# engine dependent config
categories = ['news']
paging = False
language_support = True
use_locale_domain = True
time_range_support = True
safesearch = True  # not really, but it is not generated by google


def request(query, params):
    """Google-News search request"""

    language, country, lang_country = get_lang_country(
        # pylint: disable=undefined-variable
        params, supported_languages, language_aliases
    )
    subdomain = 'news.google.com'

    if params['time_range']:  # in time_range_dict:
        query += ' ' + time_range_dict[params['time_range']]

    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
        'q': query,
        'hl': lang_country,
        'lr': "lang_" + language,
        'ie': "utf8",
        'oe': "utf8",
        'ceid': "%s:%s" % (country, language),
        'gl': country,
    })
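    # Illustrative shape of the generated URL (hypothetical values: query
    # "foo", language "en", country "US"):
    #   https://news.google.com/search?q=foo&hl=en-US&lr=lang_en&ie=utf8&oe=utf8&ceid=US%3Aen&gl=US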

    if params['safesearch']:
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

    params['url'] = query_url
    logger.debug("query_url --> %s", query_url)

    # en-US,en;q=0.8,en;q=0.5
    params['headers']['Accept-Language'] = (
        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
    )
    logger.debug("HTTP header Accept-Language --> %s",
                 params['headers']['Accept-Language'])
    params['headers']['Accept'] = (
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    )
    # params['google_subdomain'] = subdomain

    return params
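
# A minimal sketch of how searx drives request() (hypothetical values;
# `supported_languages` and `language_aliases` are injected into this module
# by the engine loader before the first call):
#
#   params = {'time_range': 'week', 'safesearch': 0,
#             'language': 'en-US', 'headers': {}}
#   request('foo', params)
#   params['url']  # -> 'https://news.google.com/search?q=foo+when%3A7d&...'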


def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):

        # The first <a> tag in the <article> contains the link to the
        # article.  The href attribute of the <a> is a google-internal link
        # we can't use.  The real link is hidden in the jslog attribute:
        #
        #   <a ...
        #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
        #      href="./articles/CAIiENu3nGS...?hl=en-US&amp;gl=US&amp;ceid=US%3Aen"
        #      ... />

        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
        url = re.findall('http[^;]*', jslog)
        if url:
            url = url[0]
        else:
            # The real URL is base64 encoded in the json attribute:
            # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
            jslog = jslog.split(";")[1].split(':')[1].strip()
            try:
                padding = (4 - (len(jslog) % 4)) * "="
                jslog = b64decode(jslog + padding)
            except binascii.Error:
                # URL can't be read, skip this result
                continue

            # now we have: b'[null, ... null, "https://www.cnn.com/.../index.html"]'
            url = re.findall('http[^;"]*', str(jslog))[0]
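
            # Worked example of the decoding above (hypothetical jslog value,
            # for illustration only):
            #   "95014; 5:WyJodHRwczovL2V4YW1wbGUuY29tL2EiXQ; track:click"
            #   -> payload "WyJodHRwczovL2V4YW1wbGUuY29tL2EiXQ" (len 34, 34 % 4 == 2)
            #   -> padding "==" appended before b64decode()
            #   -> b'["https://example.com/a"]' -> url "https://example.com/a"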

        # the first <h3> tag in the <article> contains the title of the link
        title = extract_text(eval_xpath(result, './article/h3[1]'))

        # the first <div> tag in the <article> contains the content of the link
        content = extract_text(eval_xpath(result, './article/div[1]'))

        # the second <div> tag contains the origin (publisher) and the publishing date
        pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
        pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))

        pub_info = []
        if pub_origin:
            pub_info.append(pub_origin)
        if pub_date:
            # The pub_date is mostly a string like 'yesterday', not a real
            # timezone date or time.  Therefore we can't use publishedDate.
            pub_info.append(pub_date)
        pub_info = ', '.join(pub_info)
        if pub_info:
            content = pub_info + ': ' + content
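
        # e.g. pub_origin 'CNN' and pub_date 'Yesterday' (hypothetical values)
        # turn a content "Some snippet ..." into "CNN, Yesterday: Some snippet ..."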

        # The image URL is located in a preceding sibling <img> tag, e.g.:
        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
        # These URLs are long but not personalized (double-checked via Tor).
        img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'img_src': img_src,
        })

    return results