google_news.py

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Google News engine.

Google News has a different region handling compared to Google WEB.

- the ``ceid`` argument has to be set (:py:obj:`ceid_list`)
- the hl_ argument has to be set correctly (and differently from Google WEB)
- the gl_ argument is mandatory

If one of these arguments is not set correctly, the request is redirected to
the CONSENT dialog::

    https://consent.google.com/m?continue=

The Google News API ignores some parameters from the common :ref:`google API`:

- num_ : the number of search results is ignored; there is no paging, all
  results for a query term are in the first response.
- save_ : is ignored; Google News results are always *SafeSearch*

.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp
.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp
.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
"""
from typing import TYPE_CHECKING

import binascii
import re
from urllib.parse import urlencode
from base64 import b64decode

from lxml import html
import babel

from searx import locales
from searx.utils import (
    eval_xpath,
    eval_xpath_list,
    eval_xpath_getindex,
    extract_text,
)
from searx.engines.google import fetch_traits as _fetch_traits  # pylint: disable=unused-import
from searx.engines.google import (
    get_google_info,
    detect_google_sorry,
)
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits
# about
about = {
    "website": 'https://news.google.com',
    "wikidata_id": 'Q12020',
    "official_api_documentation": 'https://developers.google.com/custom-search',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}
# engine dependent config
categories = ['news']
paging = False
time_range_support = False
# Google News results are always *SafeSearch*; the checker would otherwise
# report safesearch-errors, because the results are identical for
# safesearch=0 and safesearch=2.
safesearch = True
# send_accept_language_header = True


def request(query, params):
    """Google-News search request"""

    sxng_locale = params.get('searxng_locale', 'en-US')
    ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en')

    google_info = get_google_info(params, traits)
    google_info['subdomain'] = 'news.google.com'  # google news has only one domain

    ceid_region, ceid_lang = ceid.split(':')
    ceid_lang, ceid_suffix = (
        ceid_lang.split('-')
        + [
            None,
        ]
    )[:2]
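    # The padded split above yields a (language, suffix) pair, e.g.:
    #
    #   'en'      -> ('en', None)
    #   'es-419'  -> ('es', '419')
    #   'zh-Hant' -> ('zh', 'Hant')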

    google_info['params']['hl'] = ceid_lang

    if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']:

        if ceid_region.lower() == ceid_lang:
            google_info['params']['hl'] = ceid_lang + '-' + ceid_region
        else:
            google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix

    elif ceid_region.lower() != ceid_lang:

        if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']:
            google_info['params']['hl'] = ceid_lang
        else:
            google_info['params']['hl'] = ceid_lang + '-' + ceid_region
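    # Resulting hl values for some example ceid values:
    #
    #   'US:en' -> 'en-US'    'DE:de'     -> 'de'
    #   'BE:fr' -> 'fr'       'US:es-419' -> 'es-419'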

    google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0]
    google_info['params']['gl'] = ceid_region

    query_url = (
        'https://'
        + google_info['subdomain']
        + "/search?"
        + urlencode(
            {
                'q': query,
                **google_info['params'],
            }
        )
        # ceid includes a ':' character which must not be urlencoded
        + ('&ceid=%s' % ceid)
    )
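    # For illustration, a request for the query 'foo' with the ceid 'US:en'
    # ends up at a URL of roughly this shape (the elided parameters come from
    # get_google_info()):
    #
    #   https://news.google.com/search?q=foo&...&hl=en-US&lr=lang_en&gl=US&ceid=US:en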

    params['url'] = query_url
    params['cookies'] = google_info['cookies']
    params['headers'].update(google_info['headers'])
    return params


def response(resp):
    """Get response from google's search request"""
    results = []
    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):

        # The first <a> tag in the <article> contains the link to the
        # article.  The href attribute of the <a> is a Google-internal link
        # we can't use.  The real link is hidden in the jslog attribute:
        #
        #   <a ...
        #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
        #      href="./articles/CAIiENu3nGS...?hl=en-US&amp;gl=US&amp;ceid=US%3Aen"
        #      ... />

        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
        url = re.findall('http[^;]*', jslog)
        if url:
            url = url[0]
        else:
            # The real URL is base64 encoded in the jslog attribute:
            # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
            jslog = jslog.split(";")[1].split(':')[1].strip()
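            # A base64 string must be a multiple of 4 characters long; Google
            # strips the trailing '=' padding, so restore it before decoding.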
            try:
                padding = (4 - (len(jslog) % 4)) * "="
                jslog = b64decode(jslog + padding)
            except binascii.Error:
                # URL can't be read, skip this result
                continue

            # now we have: b'[null, ... null,"https://www.cnn.com/.../index.html"]'
            url = re.findall('http[^;"]*', str(jslog))[0]

        # the first <h3> tag in the <article> contains the title of the link
        title = extract_text(eval_xpath(result, './article/h3[1]'))

        # The pub_date is mostly a string like 'Yesterday', not a real
        # timezone date or time.  Therefore we can't use publishedDate.
        pub_date = extract_text(eval_xpath(result, './article//time'))
        pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]'))

        content = ' / '.join([x for x in [pub_origin, pub_date] if x])

        # The image URL is located in a preceding sibling <img> tag, e.g.:
        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
        # These URLs are long but not personalized (double checked via Tor).
        img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))

        results.append(
            {
                'url': url,
                'title': title,
                'content': content,
                'img_src': img_src,
            }
        )

    return results
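# For illustration, a single result built in response() has this shape (the
# values here are made up, not actual output):
#
#   {
#       'url': 'https://www.cnn.com/.../index.html',
#       'title': '...',
#       'content': 'CNN / Yesterday',
#       'img_src': 'https://lh3.googleusercontent.com/DjhQh7DMszk...',
#   }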


ceid_list = [
    'AE:ar',
    'AR:es-419',
    'AT:de',
    'AU:en',
    'BD:bn',
    'BE:fr',
    'BE:nl',
    'BG:bg',
    'BR:pt-419',
    'BW:en',
    'CA:en',
    'CA:fr',
    'CH:de',
    'CH:fr',
    'CL:es-419',
    'CN:zh-Hans',
    'CO:es-419',
    'CU:es-419',
    'CZ:cs',
    'DE:de',
    'EG:ar',
    'ES:es',
    'ET:en',
    'FR:fr',
    'GB:en',
    'GH:en',
    'GR:el',
    'HK:zh-Hant',
    'HU:hu',
    'ID:en',
    'ID:id',
    'IE:en',
    'IL:en',
    'IL:he',
    'IN:bn',
    'IN:en',
    'IN:hi',
    'IN:ml',
    'IN:mr',
    'IN:ta',
    'IN:te',
    'IT:it',
    'JP:ja',
    'KE:en',
    'KR:ko',
    'LB:ar',
    'LT:lt',
    'LV:en',
    'LV:lv',
    'MA:fr',
    'MX:es-419',
    'MY:en',
    'NA:en',
    'NG:en',
    'NL:nl',
    'NO:no',
    'NZ:en',
    'PE:es-419',
    'PH:en',
    'PK:en',
    'PL:pl',
    'PT:pt-150',
    'RO:ro',
    'RS:sr',
    'RU:ru',
    'SA:ar',
    'SE:sv',
    'SG:en',
    'SI:sl',
    'SK:sk',
    'SN:fr',
    'TH:th',
    'TR:tr',
    'TW:zh-Hant',
    'TZ:en',
    'UA:ru',
    'UA:uk',
    'UG:en',
    'US:en',
    'US:es-419',
    'VE:es-419',
    'VN:vi',
    'ZA:en',
    'ZW:en',
]
"""List of region/language combinations supported by Google News.  Values of
the ``ceid`` argument of the Google News REST API."""

_skip_values = [
    'ET:en',  # english (ethiopia)
    'ID:en',  # english (indonesia)
    'LV:en',  # english (latvia)
]

_ceid_locale_map = {'NO:no': 'nb-NO'}
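# Babel/CLDR names Norwegian Bokmål ``nb`` rather than ``no``; presumably for
# that reason Google's ``NO:no`` is mapped onto the ``nb-NO`` region tag above.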


def fetch_traits(engine_traits: EngineTraits):
    """Fetch the languages & regions from Google and build the map from
    SearXNG's region tags to Google News ``ceid`` values (:py:obj:`ceid_list`)."""

    _fetch_traits(engine_traits, add_domains=False)

    engine_traits.custom['ceid'] = {}

    for ceid in ceid_list:
        if ceid in _skip_values:
            continue

        region, lang = ceid.split(':')
        x = lang.split('-')
        if len(x) > 1:
            # drop a territory suffix like '419' or '150', but keep the
            # script subtags 'Hant' / 'Hans'
            if x[1] not in ['Hant', 'Hans']:
                lang = x[0]

        sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region)
        try:
            locale = babel.Locale.parse(sxng_locale, sep='-')
        except babel.UnknownLocaleError:
            print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
            continue

        engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid
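
# A minimal sketch of how the map built above is consulted at query time
# (this mirrors the lookup at the top of request()):
#
#   ceid = locales.get_engine_locale('fr-CA', traits.custom['ceid'], default='US:en')
#   # -> 'CA:fr'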