# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the Google WEB engine.  Some of these
implementations (mainly :py:obj:`get_google_info`) are shared by other
engines:

- :ref:`google images engine`
- :ref:`google news engine`
- :ref:`google videos engine`
- :ref:`google scholar engine`
- :ref:`google autocomplete`

"""
from typing import TYPE_CHECKING

import re
from urllib.parse import urlencode

from lxml import html
import babel
import babel.core
import babel.languages

from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.locales import language_tag, region_tag, get_offical_locales
from searx import network
from searx.exceptions import SearxEngineCaptchaException
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits
# about
about = {
    "website": 'https://www.google.com',
    "wikidata_id": 'Q9366',
    "official_api_documentation": 'https://developers.google.com/custom-search/',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}
# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True

time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}

# Filter results. 0: None, 1: Moderate, 2: Strict
filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
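
# For illustration, how these two tables end up in the request URL (a sketch,
# see request() below; values are hypothetical):
#
#   urlencode({'tbs': 'qdr:' + time_range_dict['week']})  # --> 'tbs=qdr%3Aw'
#   urlencode({'safe': filter_mapping[2]})                # --> 'safe=high'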
# specific xpath variables
# ------------------------

results_xpath = './/div[@data-sokoban-container]'
title_xpath = './/a/h3[1]'
href_xpath = './/a[h3]/@href'
content_xpath = './/div[@data-content-feature]'

# google *sections* are not usual *results*, we ignore them
g_section_with_header = './g-section-with-header'

# Suggestions are links placed in a *card-section*; we extract only the text
# from the links, not the links themselves.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
# UI_ASYNC = 'use_ac:true,_fmt:html'  # returns a HTTP 500 when users search
#                                     # for celebrities like '!google natasha
#                                     # allegri' or '!google chris evans'
UI_ASYNC = 'use_ac:true,_fmt:prog'
"""Format of the response from UI's async request."""

def get_google_info(params, eng_traits):
    """Composing various (language) properties for the google engines (:ref:`google
    API`).

    This function is called by the various google engines (:ref:`google web
    engine`, :ref:`google images engine`, :ref:`google news engine` and
    :ref:`google videos engine`).

    :param dict params: Request parameters of the engine.  At least
        a ``searxng_locale`` key should be in the dictionary.

    :param eng_traits: Engine's traits fetched from google preferences
        (:py:obj:`searx.enginelib.traits.EngineTraits`)

    :rtype: dict
    :returns:
        Py-Dictionary with the key/value pairs:

        language:
            The language code that is used by google (e.g. ``lang_en`` or
            ``lang_zh-TW``)

        country:
            The country code that is used by google (e.g. ``US`` or ``TW``)

        locale:
            An instance of :py:obj:`babel.core.Locale` built from the
            ``searxng_locale`` value.

        subdomain:
            Google subdomain :py:obj:`google_domains` that fits the country
            code.

        params:
            Py-Dictionary with additional request arguments (can be passed to
            :py:func:`urllib.parse.urlencode`).

            - ``hl`` parameter: specifies the interface language of the user
              interface.
            - ``lr`` parameter: restricts search results to documents written
              in a particular language.
            - ``cr`` parameter: restricts search results to documents
              originating in a particular country.
            - ``ie`` parameter: sets the character encoding scheme that should
              be used to interpret the query string ('utf8').
            - ``oe`` parameter: sets the character encoding scheme that should
              be used to decode the XML result ('utf8').

        headers:
            Py-Dictionary with additional HTTP headers (can be passed to
            request's headers)

            - ``Accept: '*/*``

    """
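    # A sketch of a typical return value for ``searxng_locale='de-CH'`` --
    # illustrative only, the actual values depend on the traits fetched from
    # google's preferences:
    #
    #   {
    #       'language': 'lang_de', 'country': 'CH',
    #       'locale': babel.Locale('de', territory='CH'),
    #       'subdomain': 'www.google.ch',
    #       'params': {'hl': 'de', 'lr': 'lang_de', 'cr': 'countryCH',
    #                  'gl': 'CH', 'ie': 'utf8', 'oe': 'utf8'},
    #       'headers': {'Accept': '*/*'},
    #       'cookies': {'CONSENT': 'YES+'},
    #   }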
    ret_val = {
        'language': None,
        'country': None,
        'subdomain': None,
        'params': {},
        'headers': {},
        'cookies': {},
        'locale': None,
    }
    sxng_locale = params.get('searxng_locale', 'all')
    try:
        locale = babel.Locale.parse(sxng_locale, sep='-')
    except babel.core.UnknownLocaleError:
        locale = None

    eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
    lang_code = eng_lang.split('_')[-1]  # lang_zh-TW --> zh-TW / lang_en --> en
    country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)

    # Test zh_hans & zh_hant --> in the topmost links of the result lists for
    # TW and HK you should find a wiktionary.org zh_hant link.  In the result
    # list for zh-CN there should be no hant link; instead you should find
    # zh.m.wikipedia.org/zh somewhere near the top.

    # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
    # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5

    ret_val['language'] = eng_lang
    ret_val['country'] = country
    ret_val['locale'] = locale
    ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
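    # e.g. country 'CH' --> 'www.google.ch'; countries without an entry in
    # supported_domains fall back to 'www.google.com'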
    # hl parameter:
    #   The hl parameter specifies the interface language (host language) of
    #   your user interface. To improve the performance and the quality of your
    #   search results, you are strongly encouraged to set this parameter
    #   explicitly.
    #   https://developers.google.com/custom-search/docs/xml_results#hlsp
    # The Interface Language:
    # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages

    ret_val['params']['hl'] = lang_code

    # lr parameter:
    #   The lr (language restrict) parameter restricts search results to
    #   documents written in a particular language.
    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
    # Language Collection Values:
    # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
    #
    # To select 'all' languages an empty 'lr' value is used.
    #
    # Different to other google services, Google Scholar supports selecting
    # more than one language.  The languages are separated by a pipe '|'
    # (logical OR).  For example: &lr=lang_zh-TW%7Clang_de selects articles
    # written in traditional Chinese OR German.

    ret_val['params']['lr'] = eng_lang
    if sxng_locale == 'all':
        ret_val['params']['lr'] = ''
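    # For Google Scholar such a multi-language restriction could be encoded
    # like this (a sketch, not used by this engine):
    #
    #   urlencode({'lr': 'lang_zh-TW|lang_de'})  # --> 'lr=lang_zh-TW%7Clang_de'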
    # cr parameter:
    #   The cr parameter restricts search results to documents originating in a
    #   particular country.
    #   https://developers.google.com/custom-search/docs/xml_results#crsp

    ret_val['params']['cr'] = 'country' + country
    if sxng_locale == 'all':
        ret_val['params']['cr'] = ''

    # gl parameter: (mandatory by Google News)
    #   The gl parameter value is a two-letter country code. For WebSearch
    #   results, the gl parameter boosts search results whose country of origin
    #   matches the parameter value. See the Country Codes section for a list
    #   of valid values.
    #   Specifying a gl parameter value in WebSearch requests should improve
    #   the relevance of results. This is particularly true for international
    #   customers and, even more specifically, for customers in
    #   English-speaking countries other than the United States.
    #   https://developers.google.com/custom-search/docs/xml_results#glsp

    ret_val['params']['gl'] = country

    # ie parameter:
    #   The ie parameter sets the character encoding scheme that should be used
    #   to interpret the query string. The default ie value is latin1.
    #   https://developers.google.com/custom-search/docs/xml_results#iesp

    ret_val['params']['ie'] = 'utf8'

    # oe parameter:
    #   The oe parameter sets the character encoding scheme that should be used
    #   to decode the XML result. The default oe value is latin1.
    #   https://developers.google.com/custom-search/docs/xml_results#oesp

    ret_val['params']['oe'] = 'utf8'

    # num parameter:
    #   The num parameter identifies the number of search results to return.
    #   The default num value is 10, and the maximum value is 20. If you
    #   request more than 20 results, only 20 results will be returned.
    #   https://developers.google.com/custom-search/docs/xml_results#numsp
    # HINT: seems to have no effect (tested in google WEB & Images)
    # ret_val['params']['num'] = 20

    # HTTP headers
    ret_val['headers']['Accept'] = '*/*'

    # Cookies
    # - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
    # - https://github.com/searxng/searxng/issues/1555
    ret_val['cookies']['CONSENT'] = "YES+"

    return ret_val
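
# A minimal usage sketch (hypothetical caller; the module-level ``traits`` is
# populated by fetch_traits() at build time):
#
#   google_info = get_google_info({'searxng_locale': 'fr-CA'}, traits)
#   url = 'https://' + google_info['subdomain'] + '/search?' + urlencode(
#       {'q': 'hello', **google_info['params']})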

def detect_google_sorry(resp):
    if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
        raise SearxEngineCaptchaException()

def request(query, params):
    """Google search request"""
    # pylint: disable=line-too-long
    offset = (params['pageno'] - 1) * 10
    google_info = get_google_info(params, traits)

    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
    query_url = (
        'https://'
        + google_info['subdomain']
        + '/search'
        + "?"
        + urlencode(
            {
                'q': query,
                **google_info['params'],
                'filter': '0',
                'start': offset,
                # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
                # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
                # 'cs' : 1,
                # 'sa': 'N',
                # 'yv': 3,
                # 'prmd': 'vin',
                # 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
                # 'sa': 'N',
                # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
                # formerly known as use_mobile_ui
                'asearch': 'arc',
                'async': UI_ASYNC,
            }
        )
    )

    if params['time_range'] in time_range_dict:
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
    if params['safesearch']:
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
    params['url'] = query_url

    params['cookies'] = google_info['cookies']
    params['headers'].update(google_info['headers'])
    return params
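
# By way of illustration: for query 'corona' on page 2 with time range 'day'
# and safesearch 1 in a German locale, the composed URL looks roughly like
# (parameter order depends on urlencode):
#
#   https://www.google.de/search?q=corona&hl=de&lr=lang_de&cr=countryDE&gl=DE
#     &ie=utf8&oe=utf8&filter=0&start=10&asearch=arc
#     &async=use_ac%3Atrue%2C_fmt%3Aprog&tbs=qdr%3Ad&safe=medium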

# (function(){var s='data:image/jpeg;base64,/9j/4AAQ ...
# ... DX/Ff5XSpSgdU32xSlKDJ//9k\x3d';var ii=['dimg_21'];_setImagesSrc(ii,s);})();
RE_DATA_IMAGE = re.compile(r"'(data:image[^']*)'[^']*ii=\['([^']*)'\];_setImagesSrc")
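
# A quick sanity check of what the pattern captures (input shortened from the
# comment above):
#
#   >>> s = r"var s='data:image/jpeg;base64,/9j\x3d';var ii=['dimg_21'];_setImagesSrc(ii,s);"
#   >>> m = RE_DATA_IMAGE.search(s)
#   >>> m.group(2), m.group(1).replace(r'\x3d', '=')
#   ('dimg_21', 'data:image/jpeg;base64,/9j=')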

def _parse_data_images(dom):
    data_image_map = {}
    for _script in eval_xpath_list(dom, "//script[@nonce]"):
        script = _script.text
        if not script:
            continue
        script = RE_DATA_IMAGE.search(script)
        if not script:
            continue
        data_image_map[script.group(2)] = script.group(1).replace(r'\x3d', '=')
    logger.debug('data:image objects --> %s', list(data_image_map.keys()))
    return data_image_map
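
# The returned map relates the HTML ids of <img> elements to their data-URLs,
# e.g. (shortened): {'dimg_21': 'data:image/jpeg;base64,/9j/4AAQ...'}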

def response(resp):
    """Get response from google's search request"""
    # pylint: disable=too-many-branches, too-many-statements
    detect_google_sorry(resp)

    results = []

    # convert the text to dom
    dom = html.fromstring(resp.text)

    data_image_map = {}
    if '_fmt:html' in UI_ASYNC:
        # in this format images are embedded as a base64 encoded 'data:image'
        data_image_map = _parse_data_images(dom)

    # results --> answer
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    if answer_list:
        answer_list = [_.xpath("normalize-space()") for _ in answer_list]
        results.append({'answer': ' '.join(answer_list)})
    else:
        logger.debug("did not find 'answer'")

    # parse results
    for result in eval_xpath_list(dom, results_xpath):  # pylint: disable=too-many-nested-blocks

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ignoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
            if title_tag is None:
                # this is not one of the common google result *sections*
                logger.debug('ignoring item from the result_xpath list: missing title')
                continue
            title = extract_text(title_tag)

            url = eval_xpath_getindex(result, href_xpath, 0, None)
            if url is None:
                continue

            content = []
            img_list = []
            for content_feature in eval_xpath(result, content_xpath):
                val = content_feature.attrib['data-content-feature']
                if val in ['1', '2']:
                    txt = extract_text(content_feature, allow_none=True)
                    if txt:
                        content.append(txt)
                elif '0' in val:
                    img = content_feature.xpath('.//img/@src')
                    if img:
                        img = img[0]
                        if img.startswith('data:image'):
                            img_id = content_feature.xpath('.//img/@id')
                            if img_id:
                                img = data_image_map.get(img_id[0])
                        img_list.append(img)

            if not content:
                logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
                continue
            content = ' / '.join(content)
            img_src = img_list[0] if img_list else None

            results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})

        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            continue

    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
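
# The items appended above have one of the following shapes (illustrative):
#
#   {'url': ..., 'title': ..., 'content': ..., 'img_src': ...}  # web result
#   {'answer': ...}                                             # answer text
#   {'suggestion': ...}                                         # related search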

# get supported languages from their site
skip_countries = [
    # official language of google-country not in google-languages
    'AL',  # Albania (sq)
    'AZ',  # Azerbaijan (az)
    'BD',  # Bangladesh (bn)
    'BN',  # Brunei Darussalam (ms)
    'BT',  # Bhutan (dz)
    'ET',  # Ethiopia (am)
    'GE',  # Georgia (ka, os)
    'GL',  # Greenland (kl)
    'KH',  # Cambodia (km)
    'LA',  # Laos (lo)
    'LK',  # Sri Lanka (si, ta)
    'ME',  # Montenegro (sr)
    'MK',  # North Macedonia (mk, sq)
    'MM',  # Myanmar (my)
    'MN',  # Mongolia (mn)
    'MV',  # Maldives (dv) // dv_MV is unknown by babel
    'MY',  # Malaysia (ms)
    'NP',  # Nepal (ne)
    'TJ',  # Tajikistan (tg)
    'TM',  # Turkmenistan (tk)
    'UZ',  # Uzbekistan (uz)
]

def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
    """Fetch languages from Google."""
    # pylint: disable=import-outside-toplevel, too-many-branches

    engine_traits.custom['supported_domains'] = {}

    resp = network.get('https://www.google.com/preferences')
    if not resp.ok:
        raise RuntimeError("Response from Google's preferences is not OK.")

    dom = html.fromstring(resp.text)

    # supported language codes

    lang_map = {'no': 'nb'}
    for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'):

        eng_lang = x.get("value").split('_')[-1]
        try:
            locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
        except babel.UnknownLocaleError:
            print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
            continue
        sxng_lang = language_tag(locale)

        conflict = engine_traits.languages.get(sxng_lang)
        if conflict:
            if conflict != eng_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
            continue
        engine_traits.languages[sxng_lang] = 'lang_' + eng_lang

    # alias languages
    engine_traits.languages['zh'] = 'lang_zh-CN'

    # supported region codes

    for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'):
        eng_country = x.get("value")

        if eng_country in skip_countries:
            continue
        if eng_country == 'ZZ':
            engine_traits.all_locale = 'ZZ'
            continue

        sxng_locales = get_offical_locales(eng_country, engine_traits.languages.keys(), regional=True)

        if not sxng_locales:
            print(
                "ERROR: can't map from google country %s (%s) to a babel region."
                % (x.get('data-name'), eng_country)
            )
            continue

        for sxng_locale in sxng_locales:
            engine_traits.regions[region_tag(sxng_locale)] = eng_country

    # alias regions
    engine_traits.regions['zh-CN'] = 'HK'

    # supported domains

    if add_domains:
        resp = network.get('https://www.google.com/supported_domains')
        if not resp.ok:
            raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")

        for domain in resp.text.split():
            domain = domain.strip()
            if not domain or domain in [
                '.google.com',
            ]:
                continue
            region = domain.split('.')[-1].upper()
            engine_traits.custom['supported_domains'][region] = 'www' + domain
            if region == 'HK':
                # There is no google.cn, we use .com.hk for zh-CN
                engine_traits.custom['supported_domains']['CN'] = 'www' + domain
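
# After a fetch_traits() run the collected traits look roughly like this
# (illustrative excerpt):
#
#   engine_traits.languages --> {'de': 'lang_de', 'zh': 'lang_zh-CN', ...}
#   engine_traits.regions   --> {'de-DE': 'DE', 'zh-CN': 'HK', ...}
#   engine_traits.custom['supported_domains']
#                           --> {'DE': 'www.google.de', 'CN': 'www.google.com.hk', ...}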