google.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
"""This is the implementation of the Google WEB engine.  Some of these
implementations (mainly the :py:obj:`get_google_info`) are shared by other
engines:

- :ref:`google images engine`
- :ref:`google news engine`
- :ref:`google videos engine`
- :ref:`google scholar engine`
- :ref:`google autocomplete`

"""
  12. from typing import TYPE_CHECKING
  13. import re
  14. from urllib.parse import urlencode
  15. from lxml import html
  16. import babel
  17. import babel.core
  18. import babel.languages
  19. from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
  20. from searx.locales import language_tag, region_tag, get_official_locales
  21. from searx.network import get # see https://github.com/searxng/searxng/issues/762
  22. from searx.exceptions import SearxEngineCaptchaException
  23. from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
    # This branch is only seen by static type checkers and never runs.  It
    # declares the module-level names ``logger`` and ``traits`` that the
    # functions in this file use without defining them here.
    # NOTE(review): presumably both names are injected into the module by the
    # engine loader at runtime -- confirm against the loader code.
    import logging

    logger: logging.Logger
    traits: EngineTraits
  28. # about
  29. about = {
  30. "website": 'https://www.google.com',
  31. "wikidata_id": 'Q9366',
  32. "official_api_documentation": 'https://developers.google.com/custom-search/',
  33. "use_official_api": False,
  34. "require_api_key": False,
  35. "results": 'HTML',
  36. }
  37. # engine dependent config
  38. categories = ['general', 'web']
  39. paging = True
  40. max_page = 50
  41. time_range_support = True
  42. safesearch = True
  43. time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
  44. # Filter results. 0: None, 1: Moderate, 2: Strict
  45. filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
# specific xpath variables
# ------------------------

# Selects the container element of each result; ``response`` iterates over
# the matches of this expression.
results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'

# The next three expressions are evaluated relative to one result container:
# the title node, the link target of the title and the content (snippet) nodes.
title_xpath = './/a/h3[1]'
href_xpath = './/a[h3]/@href'
content_xpath = './/div[@data-sncf]'

# Suggestions are links placed in a *card-section*; only the text of the
# links is extracted, not the links themselves.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'

# UI_ASYNC = 'use_ac:true,_fmt:html'  # returns a HTTP 500 when the user searches for
#                                     # celebrities like '!google natasha allegri'
#                                     # or '!google chris evans'
UI_ASYNC = 'use_ac:true,_fmt:prog'
"""Format of the response from UI's async request."""
  60. def get_google_info(params, eng_traits):
  61. """Composing various (language) properties for the google engines (:ref:`google
  62. API`).
  63. This function is called by the various google engines (:ref:`google web
  64. engine`, :ref:`google images engine`, :ref:`google news engine` and
  65. :ref:`google videos engine`).
  66. :param dict param: Request parameters of the engine. At least
  67. a ``searxng_locale`` key should be in the dictionary.
  68. :param eng_traits: Engine's traits fetched from google preferences
  69. (:py:obj:`searx.enginelib.traits.EngineTraits`)
  70. :rtype: dict
  71. :returns:
  72. Py-Dictionary with the key/value pairs:
  73. language:
  74. The language code that is used by google (e.g. ``lang_en`` or
  75. ``lang_zh-TW``)
  76. country:
  77. The country code that is used by google (e.g. ``US`` or ``TW``)
  78. locale:
  79. A instance of :py:obj:`babel.core.Locale` build from the
  80. ``searxng_locale`` value.
  81. subdomain:
  82. Google subdomain :py:obj:`google_domains` that fits to the country
  83. code.
  84. params:
  85. Py-Dictionary with additional request arguments (can be passed to
  86. :py:func:`urllib.parse.urlencode`).
  87. - ``hl`` parameter: specifies the interface language of user interface.
  88. - ``lr`` parameter: restricts search results to documents written in
  89. a particular language.
  90. - ``cr`` parameter: restricts search results to documents
  91. originating in a particular country.
  92. - ``ie`` parameter: sets the character encoding scheme that should
  93. be used to interpret the query string ('utf8').
  94. - ``oe`` parameter: sets the character encoding scheme that should
  95. be used to decode the XML result ('utf8').
  96. headers:
  97. Py-Dictionary with additional HTTP headers (can be passed to
  98. request's headers)
  99. - ``Accept: '*/*``
  100. """
  101. ret_val = {
  102. 'language': None,
  103. 'country': None,
  104. 'subdomain': None,
  105. 'params': {},
  106. 'headers': {},
  107. 'cookies': {},
  108. 'locale': None,
  109. }
  110. sxng_locale = params.get('searxng_locale', 'all')
  111. try:
  112. locale = babel.Locale.parse(sxng_locale, sep='-')
  113. except babel.core.UnknownLocaleError:
  114. locale = None
  115. eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
  116. lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en
  117. country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
  118. # Test zh_hans & zh_hant --> in the topmost links in the result list of list
  119. # TW and HK you should a find wiktionary.org zh_hant link. In the result
  120. # list of zh-CN should not be no hant link instead you should find
  121. # zh.m.wikipedia.org/zh somewhere in the top.
  122. # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
  123. # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
  124. ret_val['language'] = eng_lang
  125. ret_val['country'] = country
  126. ret_val['locale'] = locale
  127. ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
  128. # hl parameter:
  129. # The hl parameter specifies the interface language (host language) of
  130. # your user interface. To improve the performance and the quality of your
  131. # search results, you are strongly encouraged to set this parameter
  132. # explicitly.
  133. # https://developers.google.com/custom-search/docs/xml_results#hlsp
  134. # The Interface Language:
  135. # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
  136. # https://github.com/searxng/searxng/issues/2515#issuecomment-1607150817
  137. ret_val['params']['hl'] = f'{lang_code}-{country}'
  138. # lr parameter:
  139. # The lr (language restrict) parameter restricts search results to
  140. # documents written in a particular language.
  141. # https://developers.google.com/custom-search/docs/xml_results#lrsp
  142. # Language Collection Values:
  143. # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
  144. #
  145. # To select 'all' languages an empty 'lr' value is used.
  146. #
  147. # Different to other google services, Google Scholar supports to select more
  148. # than one language. The languages are separated by a pipe '|' (logical OR).
  149. # By example: &lr=lang_zh-TW%7Clang_de selects articles written in
  150. # traditional chinese OR german language.
  151. ret_val['params']['lr'] = eng_lang
  152. if sxng_locale == 'all':
  153. ret_val['params']['lr'] = ''
  154. # cr parameter:
  155. # The cr parameter restricts search results to documents originating in a
  156. # particular country.
  157. # https://developers.google.com/custom-search/docs/xml_results#crsp
  158. # specify a region (country) only if a region is given in the selected
  159. # locale --> https://github.com/searxng/searxng/issues/2672
  160. ret_val['params']['cr'] = ''
  161. if len(sxng_locale.split('-')) > 1:
  162. ret_val['params']['cr'] = 'country' + country
  163. # gl parameter: (mandatory by Google News)
  164. # The gl parameter value is a two-letter country code. For WebSearch
  165. # results, the gl parameter boosts search results whose country of origin
  166. # matches the parameter value. See the Country Codes section for a list of
  167. # valid values.
  168. # Specifying a gl parameter value in WebSearch requests should improve the
  169. # relevance of results. This is particularly true for international
  170. # customers and, even more specifically, for customers in English-speaking
  171. # countries other than the United States.
  172. # https://developers.google.com/custom-search/docs/xml_results#glsp
  173. # https://github.com/searxng/searxng/issues/2515#issuecomment-1606294635
  174. # ret_val['params']['gl'] = country
  175. # ie parameter:
  176. # The ie parameter sets the character encoding scheme that should be used
  177. # to interpret the query string. The default ie value is latin1.
  178. # https://developers.google.com/custom-search/docs/xml_results#iesp
  179. ret_val['params']['ie'] = 'utf8'
  180. # oe parameter:
  181. # The oe parameter sets the character encoding scheme that should be used
  182. # to decode the XML result. The default oe value is latin1.
  183. # https://developers.google.com/custom-search/docs/xml_results#oesp
  184. ret_val['params']['oe'] = 'utf8'
  185. # num parameter:
  186. # The num parameter identifies the number of search results to return.
  187. # The default num value is 10, and the maximum value is 20. If you request
  188. # more than 20 results, only 20 results will be returned.
  189. # https://developers.google.com/custom-search/docs/xml_results#numsp
  190. # HINT: seems to have no effect (tested in google WEB & Images)
  191. # ret_val['params']['num'] = 20
  192. # HTTP headers
  193. ret_val['headers']['Accept'] = '*/*'
  194. # Cookies
  195. # - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
  196. # - https://github.com/searxng/searxng/issues/1555
  197. ret_val['cookies']['CONSENT'] = "YES+"
  198. return ret_val
  199. def detect_google_sorry(resp):
  200. if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
  201. raise SearxEngineCaptchaException()
  202. def request(query, params):
  203. """Google search request"""
  204. # pylint: disable=line-too-long
  205. offset = (params['pageno'] - 1) * 10
  206. google_info = get_google_info(params, traits)
  207. # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
  208. query_url = (
  209. 'https://'
  210. + google_info['subdomain']
  211. + '/search'
  212. + "?"
  213. + urlencode(
  214. {
  215. 'q': query,
  216. **google_info['params'],
  217. 'filter': '0',
  218. 'start': offset,
  219. # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
  220. # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
  221. # 'cs' : 1,
  222. # 'sa': 'N',
  223. # 'yv': 3,
  224. # 'prmd': 'vin',
  225. # 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
  226. # 'sa': 'N',
  227. # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
  228. # formally known as use_mobile_ui
  229. 'asearch': 'arc',
  230. 'async': UI_ASYNC,
  231. }
  232. )
  233. )
  234. if params['time_range'] in time_range_dict:
  235. query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
  236. if params['safesearch']:
  237. query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
  238. params['url'] = query_url
  239. params['cookies'] = google_info['cookies']
  240. params['headers'].update(google_info['headers'])
  241. return params
# Sample of the text this pattern is applied to:
#
#   =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;
#   ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
#
# Group 1 captures the quoted ``dimg_...`` identifier.  Group 2 captures the
# ``data:image...;...`` payload up to (not including) the next ``;``, so it
# can still carry trailing characters after the last ``=`` (``26`` in the
# sample above).
RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
  245. def _parse_data_images(dom):
  246. data_image_map = {}
  247. for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()):
  248. end_pos = data_image.rfind('=')
  249. if end_pos > 0:
  250. data_image = data_image[: end_pos + 1]
  251. data_image_map[img_id] = data_image
  252. logger.debug('data:image objects --> %s', list(data_image_map.keys()))
  253. return data_image_map
  254. def response(resp):
  255. """Get response from google's search request"""
  256. # pylint: disable=too-many-branches, too-many-statements
  257. detect_google_sorry(resp)
  258. results = []
  259. # convert the text to dom
  260. dom = html.fromstring(resp.text)
  261. data_image_map = _parse_data_images(dom)
  262. # results --> answer
  263. answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
  264. for item in answer_list:
  265. results.append(
  266. {
  267. 'answer': item.xpath("normalize-space()"),
  268. 'url': (eval_xpath(item, '../..//a/@href') + [None])[0],
  269. }
  270. )
  271. # parse results
  272. for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
  273. try:
  274. title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
  275. if title_tag is None:
  276. # this not one of the common google results *section*
  277. logger.debug('ignoring item from the result_xpath list: missing title')
  278. continue
  279. title = extract_text(title_tag)
  280. url = eval_xpath_getindex(result, href_xpath, 0, None)
  281. if url is None:
  282. logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
  283. continue
  284. content_nodes = eval_xpath(result, content_xpath)
  285. content = extract_text(content_nodes)
  286. if not content:
  287. logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
  288. continue
  289. img_src = content_nodes[0].xpath('.//img/@src')
  290. if img_src:
  291. img_src = img_src[0]
  292. if img_src.startswith('data:image'):
  293. img_id = content_nodes[0].xpath('.//img/@id')
  294. if img_id:
  295. img_src = data_image_map.get(img_id[0])
  296. else:
  297. img_src = None
  298. results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
  299. except Exception as e: # pylint: disable=broad-except
  300. logger.error(e, exc_info=True)
  301. continue
  302. # parse suggestion
  303. for suggestion in eval_xpath_list(dom, suggestion_xpath):
  304. # append suggestion
  305. results.append({'suggestion': extract_text(suggestion)})
  306. # return results
  307. return results
# get supported languages from their site

# Google country codes that ``fetch_traits`` skips when it builds the region
# table.
skip_countries = [
    # official language of google-country not in google-languages
    'AL',  # Albania (sq)
    'AZ',  # Azerbaijan (az)
    'BD',  # Bangladesh (bn)
    'BN',  # Brunei Darussalam (ms)
    'BT',  # Bhutan (dz)
    'ET',  # Ethiopia (am)
    'GE',  # Georgia (ka, os)
    'GL',  # Greenland (kl)
    'KH',  # Cambodia (km)
    'LA',  # Laos (lo)
    'LK',  # Sri Lanka (si, ta)
    'ME',  # Montenegro (sr)
    'MK',  # North Macedonia (mk, sq)
    'MM',  # Myanmar (my)
    'MN',  # Mongolia (mn)
    'MV',  # Maldives (dv) // dv_MV is unknown by babel
    'MY',  # Malaysia (ms)
    'NP',  # Nepal (ne)
    'TJ',  # Tajikistan (tg)
    'TM',  # Turkmenistan (tk)
    'UZ',  # Uzbekistan (uz)
]
  333. def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
  334. """Fetch languages from Google."""
  335. # pylint: disable=import-outside-toplevel, too-many-branches
  336. engine_traits.custom['supported_domains'] = {}
  337. resp = get('https://www.google.com/preferences')
  338. if not resp.ok: # type: ignore
  339. raise RuntimeError("Response from Google's preferences is not OK.")
  340. dom = html.fromstring(resp.text.replace('<?xml version="1.0" encoding="UTF-8"?>', ''))
  341. # supported language codes
  342. lang_map = {'no': 'nb'}
  343. for x in eval_xpath_list(dom, "//select[@name='hl']/option"):
  344. eng_lang = x.get("value")
  345. try:
  346. locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
  347. except babel.UnknownLocaleError:
  348. print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
  349. continue
  350. sxng_lang = language_tag(locale)
  351. conflict = engine_traits.languages.get(sxng_lang)
  352. if conflict:
  353. if conflict != eng_lang:
  354. print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
  355. continue
  356. engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
  357. # alias languages
  358. engine_traits.languages['zh'] = 'lang_zh-CN'
  359. # supported region codes
  360. for x in eval_xpath_list(dom, "//select[@name='gl']/option"):
  361. eng_country = x.get("value")
  362. if eng_country in skip_countries:
  363. continue
  364. if eng_country == 'ZZ':
  365. engine_traits.all_locale = 'ZZ'
  366. continue
  367. sxng_locales = get_official_locales(eng_country, engine_traits.languages.keys(), regional=True)
  368. if not sxng_locales:
  369. print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
  370. continue
  371. for sxng_locale in sxng_locales:
  372. engine_traits.regions[region_tag(sxng_locale)] = eng_country
  373. # alias regions
  374. engine_traits.regions['zh-CN'] = 'HK'
  375. # supported domains
  376. if add_domains:
  377. resp = get('https://www.google.com/supported_domains')
  378. if not resp.ok: # type: ignore
  379. raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
  380. for domain in resp.text.split(): # type: ignore
  381. domain = domain.strip()
  382. if not domain or domain in [
  383. '.google.com',
  384. ]:
  385. continue
  386. region = domain.split('.')[-1].upper()
  387. engine_traits.custom['supported_domains'][region] = 'www' + domain # type: ignore
  388. if region == 'HK':
  389. # There is no google.cn, we use .com.hk for zh-CN
  390. engine_traits.custom['supported_domains']['CN'] = 'www' + domain # type: ignore