google.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # lint: pylint
  3. """This is the implementation of the Google WEB engine. Some of these
  4. implementations (mainly the :py:obj:`get_google_info`) are shared by other
  5. engines:
  6. - :ref:`google images engine`
  7. - :ref:`google news engine`
  8. - :ref:`google videos engine`
  9. - :ref:`google scholar engine`
  10. - :ref:`google autocomplete`
  11. """
  12. from typing import TYPE_CHECKING
  13. import re
  14. from urllib.parse import urlencode
  15. from lxml import html
  16. import babel
  17. import babel.core
  18. import babel.languages
  19. from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
  20. from searx.locales import language_tag, region_tag, get_offical_locales
  21. from searx.network import get # see https://github.com/searxng/searxng/issues/762
  22. from searx.exceptions import SearxEngineCaptchaException
  23. from searx.enginelib.traits import EngineTraits
  24. if TYPE_CHECKING:
  25. import logging
  26. logger: logging.Logger
  27. traits: EngineTraits
# about: descriptive metadata of the upstream service
about = {
    "website": 'https://www.google.com',
    "wikidata_id": 'Q9366',
    "official_api_documentation": 'https://developers.google.com/custom-search/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True

# Maps the time range name from the request params to the one-letter code
# that request() appends to ``qdr:`` in the ``tbs`` URL argument.
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}

# Filter results. 0: None, 1: Moderate, 2: Strict
# The mapped value is sent by request() in the ``safe`` URL argument.
filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}

# specific xpath variables
# ------------------------

# Selects the result items in the document; the title, href and content
# expressions below are evaluated relative to one of these items.
results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
title_xpath = './/a/h3[1]'
href_xpath = './/a[h3]/@href'
content_xpath = './/div[@data-sncf]'

# Suggestions are links placed in a *card-section*; only the text of the
# links is extracted, not the links themselves.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'

# The alternative value 'use_ac:true,_fmt:html' is not used: it returned a
# HTTP 500 when the user searched for celebrities like
# '!google natasha allegri' or '!google chris evans'.
UI_ASYNC = 'use_ac:true,_fmt:prog'
"""Format of the response from UI's async request."""
  59. def get_google_info(params, eng_traits):
  60. """Composing various (language) properties for the google engines (:ref:`google
  61. API`).
  62. This function is called by the various google engines (:ref:`google web
  63. engine`, :ref:`google images engine`, :ref:`google news engine` and
  64. :ref:`google videos engine`).
  65. :param dict param: Request parameters of the engine. At least
  66. a ``searxng_locale`` key should be in the dictionary.
  67. :param eng_traits: Engine's traits fetched from google preferences
  68. (:py:obj:`searx.enginelib.traits.EngineTraits`)
  69. :rtype: dict
  70. :returns:
  71. Py-Dictionary with the key/value pairs:
  72. language:
  73. The language code that is used by google (e.g. ``lang_en`` or
  74. ``lang_zh-TW``)
  75. country:
  76. The country code that is used by google (e.g. ``US`` or ``TW``)
  77. locale:
  78. A instance of :py:obj:`babel.core.Locale` build from the
  79. ``searxng_locale`` value.
  80. subdomain:
  81. Google subdomain :py:obj:`google_domains` that fits to the country
  82. code.
  83. params:
  84. Py-Dictionary with additional request arguments (can be passed to
  85. :py:func:`urllib.parse.urlencode`).
  86. - ``hl`` parameter: specifies the interface language of user interface.
  87. - ``lr`` parameter: restricts search results to documents written in
  88. a particular language.
  89. - ``cr`` parameter: restricts search results to documents
  90. originating in a particular country.
  91. - ``ie`` parameter: sets the character encoding scheme that should
  92. be used to interpret the query string ('utf8').
  93. - ``oe`` parameter: sets the character encoding scheme that should
  94. be used to decode the XML result ('utf8').
  95. headers:
  96. Py-Dictionary with additional HTTP headers (can be passed to
  97. request's headers)
  98. - ``Accept: '*/*``
  99. """
  100. ret_val = {
  101. 'language': None,
  102. 'country': None,
  103. 'subdomain': None,
  104. 'params': {},
  105. 'headers': {},
  106. 'cookies': {},
  107. 'locale': None,
  108. }
  109. sxng_locale = params.get('searxng_locale', 'all')
  110. try:
  111. locale = babel.Locale.parse(sxng_locale, sep='-')
  112. except babel.core.UnknownLocaleError:
  113. locale = None
  114. eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
  115. lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en
  116. country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
  117. # Test zh_hans & zh_hant --> in the topmost links in the result list of list
  118. # TW and HK you should a find wiktionary.org zh_hant link. In the result
  119. # list of zh-CN should not be no hant link instead you should find
  120. # zh.m.wikipedia.org/zh somewhere in the top.
  121. # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
  122. # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
  123. ret_val['language'] = eng_lang
  124. ret_val['country'] = country
  125. ret_val['locale'] = locale
  126. ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
  127. # hl parameter:
  128. # The hl parameter specifies the interface language (host language) of
  129. # your user interface. To improve the performance and the quality of your
  130. # search results, you are strongly encouraged to set this parameter
  131. # explicitly.
  132. # https://developers.google.com/custom-search/docs/xml_results#hlsp
  133. # The Interface Language:
  134. # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
  135. # https://github.com/searxng/searxng/issues/2515#issuecomment-1607150817
  136. ret_val['params']['hl'] = f'{lang_code}-{country}'
  137. # lr parameter:
  138. # The lr (language restrict) parameter restricts search results to
  139. # documents written in a particular language.
  140. # https://developers.google.com/custom-search/docs/xml_results#lrsp
  141. # Language Collection Values:
  142. # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
  143. #
  144. # To select 'all' languages an empty 'lr' value is used.
  145. #
  146. # Different to other google services, Google Schloar supports to select more
  147. # than one language. The languages are seperated by a pipe '|' (logical OR).
  148. # By example: &lr=lang_zh-TW%7Clang_de selects articles written in
  149. # traditional chinese OR german language.
  150. ret_val['params']['lr'] = eng_lang
  151. if sxng_locale == 'all':
  152. ret_val['params']['lr'] = ''
  153. # cr parameter:
  154. # The cr parameter restricts search results to documents originating in a
  155. # particular country.
  156. # https://developers.google.com/custom-search/docs/xml_results#crsp
  157. ret_val['params']['cr'] = 'country' + country
  158. if sxng_locale == 'all':
  159. ret_val['params']['cr'] = ''
  160. # gl parameter: (mandatory by Geeogle News)
  161. # The gl parameter value is a two-letter country code. For WebSearch
  162. # results, the gl parameter boosts search results whose country of origin
  163. # matches the parameter value. See the Country Codes section for a list of
  164. # valid values.
  165. # Specifying a gl parameter value in WebSearch requests should improve the
  166. # relevance of results. This is particularly true for international
  167. # customers and, even more specifically, for customers in English-speaking
  168. # countries other than the United States.
  169. # https://developers.google.com/custom-search/docs/xml_results#glsp
  170. # https://github.com/searxng/searxng/issues/2515#issuecomment-1606294635
  171. # ret_val['params']['gl'] = country
  172. # ie parameter:
  173. # The ie parameter sets the character encoding scheme that should be used
  174. # to interpret the query string. The default ie value is latin1.
  175. # https://developers.google.com/custom-search/docs/xml_results#iesp
  176. ret_val['params']['ie'] = 'utf8'
  177. # oe parameter:
  178. # The oe parameter sets the character encoding scheme that should be used
  179. # to decode the XML result. The default oe value is latin1.
  180. # https://developers.google.com/custom-search/docs/xml_results#oesp
  181. ret_val['params']['oe'] = 'utf8'
  182. # num parameter:
  183. # The num parameter identifies the number of search results to return.
  184. # The default num value is 10, and the maximum value is 20. If you request
  185. # more than 20 results, only 20 results will be returned.
  186. # https://developers.google.com/custom-search/docs/xml_results#numsp
  187. # HINT: seems to have no effect (tested in google WEB & Images)
  188. # ret_val['params']['num'] = 20
  189. # HTTP headers
  190. ret_val['headers']['Accept'] = '*/*'
  191. # Cookies
  192. # - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
  193. # - https://github.com/searxng/searxng/issues/1555
  194. ret_val['cookies']['CONSENT'] = "YES+"
  195. return ret_val
  196. def detect_google_sorry(resp):
  197. if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
  198. raise SearxEngineCaptchaException()
  199. def request(query, params):
  200. """Google search request"""
  201. # pylint: disable=line-too-long
  202. offset = (params['pageno'] - 1) * 10
  203. google_info = get_google_info(params, traits)
  204. # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
  205. query_url = (
  206. 'https://'
  207. + google_info['subdomain']
  208. + '/search'
  209. + "?"
  210. + urlencode(
  211. {
  212. 'q': query,
  213. **google_info['params'],
  214. 'filter': '0',
  215. 'start': offset,
  216. # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
  217. # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
  218. # 'cs' : 1,
  219. # 'sa': 'N',
  220. # 'yv': 3,
  221. # 'prmd': 'vin',
  222. # 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
  223. # 'sa': 'N',
  224. # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
  225. # formally known as use_mobile_ui
  226. 'asearch': 'arc',
  227. 'async': UI_ASYNC,
  228. }
  229. )
  230. )
  231. if params['time_range'] in time_range_dict:
  232. query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
  233. if params['safesearch']:
  234. query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
  235. params['url'] = query_url
  236. params['cookies'] = google_info['cookies']
  237. params['headers'].update(google_info['headers'])
  238. return params
# Pattern for the inlined images in the text of the response, for example:
#
#   =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;
#   ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
#
# group 1: the quoted id of the image (``dimg_...``)
# group 2: the text starting with ``data:image``, spanning exactly one
#          semicolon and ending before the next one
RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
  242. def _parse_data_images(dom):
  243. data_image_map = {}
  244. for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()):
  245. end_pos = data_image.rfind('=')
  246. if end_pos > 0:
  247. data_image = data_image[: end_pos + 1]
  248. data_image_map[img_id] = data_image
  249. logger.debug('data:image objects --> %s', list(data_image_map.keys()))
  250. return data_image_map
  251. def response(resp):
  252. """Get response from google's search request"""
  253. # pylint: disable=too-many-branches, too-many-statements
  254. detect_google_sorry(resp)
  255. results = []
  256. # convert the text to dom
  257. dom = html.fromstring(resp.text)
  258. data_image_map = _parse_data_images(dom)
  259. # results --> answer
  260. answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
  261. if answer_list:
  262. answer_list = [_.xpath("normalize-space()") for _ in answer_list]
  263. results.append({'answer': ' '.join(answer_list)})
  264. else:
  265. logger.debug("did not find 'answer'")
  266. # parse results
  267. for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
  268. try:
  269. title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
  270. if title_tag is None:
  271. # this not one of the common google results *section*
  272. logger.debug('ignoring item from the result_xpath list: missing title')
  273. continue
  274. title = extract_text(title_tag)
  275. url = eval_xpath_getindex(result, href_xpath, 0, None)
  276. if url is None:
  277. logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
  278. continue
  279. content_nodes = eval_xpath(result, content_xpath)
  280. content = extract_text(content_nodes)
  281. if not content:
  282. logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
  283. continue
  284. img_src = content_nodes[0].xpath('.//img/@src')
  285. if img_src:
  286. img_src = img_src[0]
  287. if img_src.startswith('data:image'):
  288. img_id = content_nodes[0].xpath('.//img/@id')
  289. if img_id:
  290. img_src = data_image_map.get(img_id[0])
  291. else:
  292. img_src = None
  293. results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
  294. except Exception as e: # pylint: disable=broad-except
  295. logger.error(e, exc_info=True)
  296. continue
  297. # parse suggestion
  298. for suggestion in eval_xpath_list(dom, suggestion_xpath):
  299. # append suggestion
  300. results.append({'suggestion': extract_text(suggestion)})
  301. # return results
  302. return results
# get supported languages from their site

# Region codes from google's preferences that fetch_traits() does not map to
# a region.
skip_countries = [
    # official language of google-country not in google-languages
    'AL',  # Albania (sq)
    'AZ',  # Azerbaijan (az)
    'BD',  # Bangladesh (bn)
    'BN',  # Brunei Darussalam (ms)
    'BT',  # Bhutan (dz)
    'ET',  # Ethiopia (am)
    'GE',  # Georgia (ka, os)
    'GL',  # Greenland (kl)
    'KH',  # Cambodia (km)
    'LA',  # Laos (lo)
    'LK',  # Sri Lanka (si, ta)
    'ME',  # Montenegro (sr)
    'MK',  # North Macedonia (mk, sq)
    'MM',  # Myanmar (my)
    'MN',  # Mongolia (mn)
    'MV',  # Maldives (dv) // dv_MV is unknown by babel
    'MY',  # Malaysia (ms)
    'NP',  # Nepal (ne)
    'TJ',  # Tajikistan (tg)
    'TM',  # Turkmenistan (tk)
    'UZ',  # Uzbekistan (uz)
]
  328. def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
  329. """Fetch languages from Google."""
  330. # pylint: disable=import-outside-toplevel, too-many-branches
  331. engine_traits.custom['supported_domains'] = {}
  332. resp = get('https://www.google.com/preferences')
  333. if not resp.ok: # type: ignore
  334. raise RuntimeError("Response from Google's preferences is not OK.")
  335. dom = html.fromstring(resp.text) # type: ignore
  336. # supported language codes
  337. lang_map = {'no': 'nb'}
  338. for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'):
  339. eng_lang = x.get("value").split('_')[-1]
  340. try:
  341. locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
  342. except babel.UnknownLocaleError:
  343. print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
  344. continue
  345. sxng_lang = language_tag(locale)
  346. conflict = engine_traits.languages.get(sxng_lang)
  347. if conflict:
  348. if conflict != eng_lang:
  349. print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
  350. continue
  351. engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
  352. # alias languages
  353. engine_traits.languages['zh'] = 'lang_zh-CN'
  354. # supported region codes
  355. for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'):
  356. eng_country = x.get("value")
  357. if eng_country in skip_countries:
  358. continue
  359. if eng_country == 'ZZ':
  360. engine_traits.all_locale = 'ZZ'
  361. continue
  362. sxng_locales = get_offical_locales(eng_country, engine_traits.languages.keys(), regional=True)
  363. if not sxng_locales:
  364. print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
  365. continue
  366. for sxng_locale in sxng_locales:
  367. engine_traits.regions[region_tag(sxng_locale)] = eng_country
  368. # alias regions
  369. engine_traits.regions['zh-CN'] = 'HK'
  370. # supported domains
  371. if add_domains:
  372. resp = get('https://www.google.com/supported_domains')
  373. if not resp.ok: # type: ignore
  374. raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
  375. for domain in resp.text.split(): # type: ignore
  376. domain = domain.strip()
  377. if not domain or domain in [
  378. '.google.com',
  379. ]:
  380. continue
  381. region = domain.split('.')[-1].upper()
  382. engine_traits.custom['supported_domains'][region] = 'www' + domain # type: ignore
  383. if region == 'HK':
  384. # There is no google.cn, we use .com.hk for zh-CN
  385. engine_traits.custom['supported_domains']['CN'] = 'www' + domain # type: ignore