|
@@ -1,38 +1,40 @@
|
|
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
|
# lint: pylint
|
|
|
-"""This is the implementation of the google WEB engine. Some of this
|
|
|
-implementations are shared by other engines:
|
|
|
+"""This is the implementation of the Google WEB engine. Some of this
|
|
|
+implementations (mainly the :py:obj:`get_google_info`) are shared by other
|
|
|
+engines:
|
|
|
|
|
|
- :ref:`google images engine`
|
|
|
- :ref:`google news engine`
|
|
|
- :ref:`google videos engine`
|
|
|
-
|
|
|
-The google WEB engine itself has a special setup option:
|
|
|
-
|
|
|
-.. code:: yaml
|
|
|
-
|
|
|
- - name: google
|
|
|
- ...
|
|
|
- use_mobile_ui: false
|
|
|
-
|
|
|
-``use_mobile_ui``: (default: ``false``)
|
|
|
- Enables to use *mobile endpoint* to bypass the google blocking (see
|
|
|
- :issue:`159`). On the mobile UI of Google Search, the button :guilabel:`More
|
|
|
- results` is not affected by Google rate limiting and we can still do requests
|
|
|
- while actively blocked by the original Google search. By activate
|
|
|
- ``use_mobile_ui`` this behavior is simulated by adding the parameter
|
|
|
- ``async=use_ac:true,_fmt:pc`` to the :py:func:`request`.
|
|
|
+- :ref:`google scholar engine`
|
|
|
+- :ref:`google autocomplete`
|
|
|
|
|
|
"""
|
|
|
|
|
|
+from typing import TYPE_CHECKING
|
|
|
+
|
|
|
+import re
|
|
|
from urllib.parse import urlencode
|
|
|
from lxml import html
|
|
|
-from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
|
|
|
+import babel
|
|
|
+import babel.core
|
|
|
+import babel.languages
|
|
|
+
|
|
|
+from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
|
|
|
+from searx.locales import language_tag, region_tag, get_offical_locales
|
|
|
+from searx import network
|
|
|
from searx.exceptions import SearxEngineCaptchaException
|
|
|
from searx.enginelib.traits import EngineTraits
|
|
|
|
|
|
+if TYPE_CHECKING:
|
|
|
+ import logging
|
|
|
+
|
|
|
+ logger: logging.Logger
|
|
|
+
|
|
|
traits: EngineTraits
|
|
|
|
|
|
+
|
|
|
# about
|
|
|
about = {
|
|
|
"website": 'https://www.google.com',
|
|
@@ -48,64 +50,6 @@ categories = ['general', 'web']
|
|
|
paging = True
|
|
|
time_range_support = True
|
|
|
safesearch = True
|
|
|
-send_accept_language_header = True
|
|
|
-use_mobile_ui = False
|
|
|
-supported_languages_url = 'https://www.google.com/preferences?#languages'
|
|
|
-
|
|
|
-# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
|
|
|
-google_domains = {
|
|
|
- 'BG': 'google.bg', # Bulgaria
|
|
|
- 'CZ': 'google.cz', # Czech Republic
|
|
|
- 'DE': 'google.de', # Germany
|
|
|
- 'DK': 'google.dk', # Denmark
|
|
|
- 'AT': 'google.at', # Austria
|
|
|
- 'CH': 'google.ch', # Switzerland
|
|
|
- 'GR': 'google.gr', # Greece
|
|
|
- 'AU': 'google.com.au', # Australia
|
|
|
- 'CA': 'google.ca', # Canada
|
|
|
- 'GB': 'google.co.uk', # United Kingdom
|
|
|
- 'ID': 'google.co.id', # Indonesia
|
|
|
- 'IE': 'google.ie', # Ireland
|
|
|
- 'IN': 'google.co.in', # India
|
|
|
- 'MY': 'google.com.my', # Malaysia
|
|
|
- 'NZ': 'google.co.nz', # New Zealand
|
|
|
- 'PH': 'google.com.ph', # Philippines
|
|
|
- 'SG': 'google.com.sg', # Singapore
|
|
|
- 'US': 'google.com', # United States (google.us) redirects to .com
|
|
|
- 'ZA': 'google.co.za', # South Africa
|
|
|
- 'AR': 'google.com.ar', # Argentina
|
|
|
- 'CL': 'google.cl', # Chile
|
|
|
- 'ES': 'google.es', # Spain
|
|
|
- 'MX': 'google.com.mx', # Mexico
|
|
|
- 'EE': 'google.ee', # Estonia
|
|
|
- 'FI': 'google.fi', # Finland
|
|
|
- 'BE': 'google.be', # Belgium
|
|
|
- 'FR': 'google.fr', # France
|
|
|
- 'IL': 'google.co.il', # Israel
|
|
|
- 'HR': 'google.hr', # Croatia
|
|
|
- 'HU': 'google.hu', # Hungary
|
|
|
- 'IT': 'google.it', # Italy
|
|
|
- 'JP': 'google.co.jp', # Japan
|
|
|
- 'KR': 'google.co.kr', # South Korea
|
|
|
- 'LT': 'google.lt', # Lithuania
|
|
|
- 'LV': 'google.lv', # Latvia
|
|
|
- 'NO': 'google.no', # Norway
|
|
|
- 'NL': 'google.nl', # Netherlands
|
|
|
- 'PL': 'google.pl', # Poland
|
|
|
- 'BR': 'google.com.br', # Brazil
|
|
|
- 'PT': 'google.pt', # Portugal
|
|
|
- 'RO': 'google.ro', # Romania
|
|
|
- 'RU': 'google.ru', # Russia
|
|
|
- 'SK': 'google.sk', # Slovakia
|
|
|
- 'SI': 'google.si', # Slovenia
|
|
|
- 'SE': 'google.se', # Sweden
|
|
|
- 'TH': 'google.co.th', # Thailand
|
|
|
- 'TR': 'google.com.tr', # Turkey
|
|
|
- 'UA': 'google.com.ua', # Ukraine
|
|
|
- 'CN': 'google.com.hk', # There is no google.cn, we use .com.hk for zh-CN
|
|
|
- 'HK': 'google.com.hk', # Hong Kong
|
|
|
- 'TW': 'google.com.tw', # Taiwan
|
|
|
-}
|
|
|
|
|
|
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
|
|
|
|
|
@@ -118,47 +62,50 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
|
|
|
results_xpath = './/div[@data-sokoban-container]'
|
|
|
title_xpath = './/a/h3[1]'
|
|
|
href_xpath = './/a[h3]/@href'
|
|
|
-content_xpath = './/div[@data-content-feature=1]'
|
|
|
+content_xpath = './/div[@data-content-feature]'
|
|
|
|
|
|
# google *sections* are no usual *results*, we ignore them
|
|
|
g_section_with_header = './g-section-with-header'
|
|
|
|
|
|
-
|
|
|
# Suggestions are links placed in a *card-section*, we extract only the text
|
|
|
# from the links not the links itself.
|
|
|
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
|
|
|
|
|
|
+# UI_ASYNC = 'use_ac:true,_fmt:html' # returns an HTTP 500 when a user searches for
|
|
|
+# # celebrities like '!google natasha allegri'
|
|
|
+# # or '!google chris evans'
|
|
|
+UI_ASYNC = 'use_ac:true,_fmt:prog'
|
|
|
+"""Format of the response from UI's async request."""
|
|
|
+
|
|
|
|
|
|
-def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
|
|
|
- """Composing various language properties for the google engines.
|
|
|
+def get_google_info(params, eng_traits):
|
|
|
+ """Composing various (language) properties for the google engines (:ref:`google
|
|
|
+ API`).
|
|
|
|
|
|
This function is called by the various google engines (:ref:`google web
|
|
|
engine`, :ref:`google images engine`, :ref:`google news engine` and
|
|
|
:ref:`google videos engine`).
|
|
|
|
|
|
- :param dict param: request parameters of the engine
|
|
|
-
|
|
|
- :param list lang_list: list of supported languages of the engine
|
|
|
- :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>`
|
|
|
-
|
|
|
- :param dict lang_list: custom aliases for non standard language codes
|
|
|
- (used when calling :py:func:`searx.utils.match_language`)
|
|
|
+ :param dict params: Request parameters of the engine. At least
|
|
|
+ a ``searxng_locale`` key should be in the dictionary.
|
|
|
|
|
|
- :param bool supported_any_language: When a language is not specified, the
|
|
|
- language interpretation is left up to Google to decide how the search
|
|
|
- results should be delivered. This argument is ``True`` for the google
|
|
|
- engine and ``False`` for the other engines (google-images, -news,
|
|
|
- -scholar, -videos).
|
|
|
+ :param eng_traits: Engine's traits fetched from google preferences
|
|
|
+ (:py:obj:`searx.enginelib.traits.EngineTraits`)
|
|
|
|
|
|
:rtype: dict
|
|
|
:returns:
|
|
|
Py-Dictionary with the key/value pairs:
|
|
|
|
|
|
language:
|
|
|
- Return value from :py:func:`searx.utils.match_language`
|
|
|
+ The language code that is used by google (e.g. ``lang_en`` or
|
|
|
+ ``lang_zh-TW``)
|
|
|
|
|
|
country:
|
|
|
- The country code (e.g. US, AT, CA, FR, DE ..)
|
|
|
+ The country code that is used by google (e.g. ``US`` or ``TW``)
|
|
|
+
|
|
|
+ locale:
|
|
|
+ An instance of :py:obj:`babel.core.Locale` built from the
|
|
|
+ ``searxng_locale`` value.
|
|
|
|
|
|
subdomain:
|
|
|
Google subdomain :py:obj:`google_domains` that fits to the country
|
|
@@ -168,52 +115,67 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
|
|
|
Py-Dictionary with additional request arguments (can be passed to
|
|
|
:py:func:`urllib.parse.urlencode`).
|
|
|
|
|
|
+ - ``hl`` parameter: specifies the interface language of the user interface.
|
|
|
+ - ``lr`` parameter: restricts search results to documents written in
|
|
|
+ a particular language.
|
|
|
+ - ``cr`` parameter: restricts search results to documents
|
|
|
+ originating in a particular country.
|
|
|
+ - ``ie`` parameter: sets the character encoding scheme that should
|
|
|
+ be used to interpret the query string ('utf8').
|
|
|
+ - ``oe`` parameter: sets the character encoding scheme that should
|
|
|
+ be used to decode the XML result ('utf8').
|
|
|
+
|
|
|
headers:
|
|
|
Py-Dictionary with additional HTTP headers (can be passed to
|
|
|
request's headers)
|
|
|
+
|
|
|
+ - ``Accept: '*/*'``
|
|
|
+
|
|
|
"""
|
|
|
+
|
|
|
ret_val = {
|
|
|
'language': None,
|
|
|
'country': None,
|
|
|
'subdomain': None,
|
|
|
'params': {},
|
|
|
'headers': {},
|
|
|
+ 'cookies': {},
|
|
|
+ 'locale': None,
|
|
|
}
|
|
|
|
|
|
- # language ...
|
|
|
-
|
|
|
- _lang = params['language']
|
|
|
- _any_language = _lang.lower() == 'all'
|
|
|
- if _any_language:
|
|
|
- _lang = 'en-US'
|
|
|
- language = match_language(_lang, lang_list, custom_aliases)
|
|
|
- ret_val['language'] = language
|
|
|
-
|
|
|
- # country ...
|
|
|
+ sxng_locale = params.get('searxng_locale', 'all')
|
|
|
+ try:
|
|
|
+ locale = babel.Locale.parse(sxng_locale, sep='-')
|
|
|
+ except babel.core.UnknownLocaleError:
|
|
|
+ locale = None
|
|
|
|
|
|
- _l = _lang.split('-')
|
|
|
- if len(_l) == 2:
|
|
|
- country = _l[1]
|
|
|
- else:
|
|
|
- country = _l[0].upper()
|
|
|
- if country == 'EN':
|
|
|
- country = 'US'
|
|
|
- ret_val['country'] = country
|
|
|
+ eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
|
|
|
+ lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en
|
|
|
+ country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
|
|
|
|
|
|
- # subdomain ...
|
|
|
+ # Test zh_hans & zh_hant --> among the topmost links in the result lists of
|
|
|
+ # TW and HK you should find a wiktionary.org zh_hant link. In the result
|
|
|
+ # list of zh-CN there should be no hant link; instead you should find
|
|
|
+ # zh.m.wikipedia.org/zh somewhere in the top.
|
|
|
|
|
|
- ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')
|
|
|
+ # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
|
|
|
+ # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
|
|
|
|
|
|
- # params & headers
|
|
|
-
|
|
|
- lang_country = '%s-%s' % (language, country) # (en-US, en-EN, de-DE, de-AU, fr-FR ..)
|
|
|
+ ret_val['language'] = eng_lang
|
|
|
+ ret_val['country'] = country
|
|
|
+ ret_val['locale'] = locale
|
|
|
+ ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
|
|
|
|
|
|
# hl parameter:
|
|
|
- # https://developers.google.com/custom-search/docs/xml_results#hlsp The
|
|
|
- # Interface Language:
|
|
|
+ # The hl parameter specifies the interface language (host language) of
|
|
|
+ # your user interface. To improve the performance and the quality of your
|
|
|
+ # search results, you are strongly encouraged to set this parameter
|
|
|
+ # explicitly.
|
|
|
+ # https://developers.google.com/custom-search/docs/xml_results#hlsp
|
|
|
+ # The Interface Language:
|
|
|
# https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
|
|
|
|
|
|
- ret_val['params']['hl'] = lang_list.get(lang_country, language)
|
|
|
+ ret_val['params']['hl'] = lang_code
|
|
|
|
|
|
# lr parameter:
|
|
|
# The lr (language restrict) parameter restricts search results to
|
|
@@ -221,22 +183,72 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
|
|
|
# https://developers.google.com/custom-search/docs/xml_results#lrsp
|
|
|
# Language Collection Values:
|
|
|
# https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
|
|
|
+ #
|
|
|
+ # To select 'all' languages an empty 'lr' value is used.
|
|
|
+ #
|
|
|
+ # Unlike other google services, Google Scholar supports selecting more
|
|
|
+ # than one language. The languages are separated by a pipe '|' (logical OR).
|
|
|
+ # For example: &lr=lang_zh-TW%7Clang_de selects articles written in
|
|
|
+ # Traditional Chinese OR German.
|
|
|
|
|
|
- if _any_language and supported_any_language:
|
|
|
+ ret_val['params']['lr'] = eng_lang
|
|
|
+ if sxng_locale == 'all':
|
|
|
+ ret_val['params']['lr'] = ''
|
|
|
|
|
|
- # interpretation is left up to Google (based on whoogle)
|
|
|
- #
|
|
|
- # - add parameter ``source=lnt``
|
|
|
- # - don't use parameter ``lr``
|
|
|
- # - don't add a ``Accept-Language`` HTTP header.
|
|
|
+ # cr parameter:
|
|
|
+ # The cr parameter restricts search results to documents originating in a
|
|
|
+ # particular country.
|
|
|
+ # https://developers.google.com/custom-search/docs/xml_results#crsp
|
|
|
|
|
|
- ret_val['params']['source'] = 'lnt'
|
|
|
+ ret_val['params']['cr'] = 'country' + country
|
|
|
+ if sxng_locale == 'all':
|
|
|
+ ret_val['params']['cr'] = ''
|
|
|
|
|
|
- else:
|
|
|
+ # gl parameter: (mandatory for Google News)
|
|
|
+ # The gl parameter value is a two-letter country code. For WebSearch
|
|
|
+ # results, the gl parameter boosts search results whose country of origin
|
|
|
+ # matches the parameter value. See the Country Codes section for a list of
|
|
|
+ # valid values.
|
|
|
+ # Specifying a gl parameter value in WebSearch requests should improve the
|
|
|
+ # relevance of results. This is particularly true for international
|
|
|
+ # customers and, even more specifically, for customers in English-speaking
|
|
|
+ # countries other than the United States.
|
|
|
+ # https://developers.google.com/custom-search/docs/xml_results#glsp
|
|
|
+
|
|
|
+ ret_val['params']['gl'] = country
|
|
|
+
|
|
|
+ # ie parameter:
|
|
|
+ # The ie parameter sets the character encoding scheme that should be used
|
|
|
+ # to interpret the query string. The default ie value is latin1.
|
|
|
+ # https://developers.google.com/custom-search/docs/xml_results#iesp
|
|
|
+
|
|
|
+ ret_val['params']['ie'] = 'utf8'
|
|
|
+
|
|
|
+ # oe parameter:
|
|
|
+ # The oe parameter sets the character encoding scheme that should be used
|
|
|
+ # to decode the XML result. The default oe value is latin1.
|
|
|
+ # https://developers.google.com/custom-search/docs/xml_results#oesp
|
|
|
+
|
|
|
+ ret_val['params']['oe'] = 'utf8'
|
|
|
+
|
|
|
+ # num parameter:
|
|
|
+ # The num parameter identifies the number of search results to return.
|
|
|
+ # The default num value is 10, and the maximum value is 20. If you request
|
|
|
+ # more than 20 results, only 20 results will be returned.
|
|
|
+ # https://developers.google.com/custom-search/docs/xml_results#numsp
|
|
|
+
|
|
|
+ # HINT: seems to have no effect (tested in google WEB & Images)
|
|
|
+ # ret_val['params']['num'] = 20
|
|
|
+
|
|
|
+ # HTTP headers
|
|
|
+
|
|
|
+ ret_val['headers']['Accept'] = '*/*'
|
|
|
|
|
|
- # restricts search results to documents written in a particular
|
|
|
- # language.
|
|
|
- ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
|
|
|
+ # Cookies
|
|
|
+
|
|
|
+ # - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
|
|
|
+ # - https://github.com/searxng/searxng/issues/1555
|
|
|
+ ret_val['cookies']['CONSENT'] = "YES+"
|
|
|
|
|
|
return ret_val
|
|
|
|
|
@@ -248,33 +260,34 @@ def detect_google_sorry(resp):
|
|
|
|
|
|
def request(query, params):
|
|
|
"""Google search request"""
|
|
|
-
|
|
|
+ # pylint: disable=line-too-long
|
|
|
offset = (params['pageno'] - 1) * 10
|
|
|
-
|
|
|
- lang_info = get_lang_info(params, supported_languages, language_aliases, True)
|
|
|
-
|
|
|
- additional_parameters = {}
|
|
|
- if use_mobile_ui:
|
|
|
- additional_parameters = {
|
|
|
- 'asearch': 'arc',
|
|
|
- 'async': 'use_ac:true,_fmt:prog',
|
|
|
- }
|
|
|
+ google_info = get_google_info(params, traits)
|
|
|
|
|
|
# https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
|
|
|
query_url = (
|
|
|
'https://'
|
|
|
- + lang_info['subdomain']
|
|
|
+ + google_info['subdomain']
|
|
|
+ '/search'
|
|
|
+ "?"
|
|
|
+ urlencode(
|
|
|
{
|
|
|
'q': query,
|
|
|
- **lang_info['params'],
|
|
|
- 'ie': "utf8",
|
|
|
- 'oe': "utf8",
|
|
|
- 'start': offset,
|
|
|
+ **google_info['params'],
|
|
|
'filter': '0',
|
|
|
- **additional_parameters,
|
|
|
+ 'start': offset,
|
|
|
+ # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
|
|
|
+ # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
|
|
|
+ # 'cs' : 1,
|
|
|
+ # 'sa': 'N',
|
|
|
+ # 'yv': 3,
|
|
|
+ # 'prmd': 'vin',
|
|
|
+ # 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
|
|
|
+ # 'sa': 'N',
|
|
|
+ # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
|
|
|
+ # formerly known as use_mobile_ui
|
|
|
+ 'asearch': 'arc',
|
|
|
+ 'async': UI_ASYNC,
|
|
|
}
|
|
|
)
|
|
|
)
|
|
@@ -285,25 +298,45 @@ def request(query, params):
|
|
|
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
|
|
|
params['url'] = query_url
|
|
|
|
|
|
- params['cookies']['CONSENT'] = "YES+"
|
|
|
- params['headers'].update(lang_info['headers'])
|
|
|
- if use_mobile_ui:
|
|
|
- params['headers']['Accept'] = '*/*'
|
|
|
- else:
|
|
|
- params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
|
|
-
|
|
|
+ params['cookies'] = google_info['cookies']
|
|
|
+ params['headers'].update(google_info['headers'])
|
|
|
return params
|
|
|
|
|
|
|
|
|
+# (function(){var s=' ...
|
|
|
+# ... DX/Ff5XSpSgdU32xSlKDJ//9k\x3d';var ii=['dimg_21'];_setImagesSrc(ii,s);})();
|
|
|
+RE_DATA_IMAGE = re.compile(r"'(data:image[^']*)'[^']*ii=\['([^']*)'\];_setImagesSrc")
|
|
|
+
|
|
|
+
|
|
|
+def _parse_data_images(dom):
|
|
|
+ data_image_map = {}
|
|
|
+ for _script in eval_xpath_list(dom, "//script[@nonce]"):
|
|
|
+ script = _script.text
|
|
|
+ if not script:
|
|
|
+ continue
|
|
|
+ script = RE_DATA_IMAGE.search(script)
|
|
|
+ if not script:
|
|
|
+ continue
|
|
|
+ data_image_map[script.group(2)] = script.group(1).replace(r'\x3d', '=')
|
|
|
+ logger.debug('data:image objects --> %s', list(data_image_map.keys()))
|
|
|
+ return data_image_map
|
|
|
+
|
|
|
+
|
|
|
def response(resp):
|
|
|
"""Get response from google's search request"""
|
|
|
-
|
|
|
+ # pylint: disable=too-many-branches, too-many-statements
|
|
|
detect_google_sorry(resp)
|
|
|
|
|
|
results = []
|
|
|
|
|
|
# convert the text to dom
|
|
|
dom = html.fromstring(resp.text)
|
|
|
+
|
|
|
+ data_image_map = {}
|
|
|
+ if '_fmt:html' in UI_ASYNC:
|
|
|
+ # in this format images are embedded as a base64 encoded 'data:image'
|
|
|
+ data_image_map = _parse_data_images(dom)
|
|
|
+
|
|
|
# results --> answer
|
|
|
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
|
|
|
if answer_list:
|
|
@@ -312,20 +345,9 @@ def response(resp):
|
|
|
else:
|
|
|
logger.debug("did not find 'answer'")
|
|
|
|
|
|
- # results --> number_of_results
|
|
|
- if not use_mobile_ui:
|
|
|
- try:
|
|
|
- _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0)
|
|
|
- _digit = ''.join([n for n in _txt if n.isdigit()])
|
|
|
- number_of_results = int(_digit)
|
|
|
- results.append({'number_of_results': number_of_results})
|
|
|
- except Exception as e: # pylint: disable=broad-except
|
|
|
- logger.debug("did not 'number_of_results'")
|
|
|
- logger.error(e, exc_info=True)
|
|
|
-
|
|
|
# parse results
|
|
|
|
|
|
- for result in eval_xpath_list(dom, results_xpath):
|
|
|
+ for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
|
|
|
|
|
|
# google *sections*
|
|
|
if extract_text(eval_xpath(result, g_section_with_header)):
|
|
@@ -342,13 +364,31 @@ def response(resp):
|
|
|
url = eval_xpath_getindex(result, href_xpath, 0, None)
|
|
|
if url is None:
|
|
|
continue
|
|
|
- content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
|
|
|
- if content is None:
|
|
|
+
|
|
|
+ content = []
|
|
|
+ img_list = []
|
|
|
+ for content_feature in eval_xpath(result, content_xpath):
|
|
|
+ val = content_feature.attrib['data-content-feature']
|
|
|
+ if val in ['1', '2']:
|
|
|
+ txt = extract_text(content_feature, allow_none=True)
|
|
|
+ if txt:
|
|
|
+ content.append(txt)
|
|
|
+ elif '0' in val:
|
|
|
+ img = content_feature.xpath('.//img/@src')
|
|
|
+ if img:
|
|
|
+ img = img[0]
|
|
|
+ if img.startswith('data:image'):
|
|
|
+ img_id = content_feature.xpath('.//img/@id')
|
|
|
+ if img_id:
|
|
|
+ img = data_image_map.get(img_id[0])
|
|
|
+ img_list.append(img)
|
|
|
+
|
|
|
+ if not content:
|
|
|
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
|
|
|
continue
|
|
|
-
|
|
|
- logger.debug('add link to results: %s', title)
|
|
|
- results.append({'url': url, 'title': title, 'content': content})
|
|
|
+ content = ' / '.join(content)
|
|
|
+ img_src = img_list[0] if img_list else None
|
|
|
+ results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
|
|
|
|
|
|
except Exception as e: # pylint: disable=broad-except
|
|
|
logger.error(e, exc_info=True)
|
|
@@ -364,18 +404,6 @@ def response(resp):
|
|
|
|
|
|
|
|
|
# get supported languages from their site
|
|
|
-def _fetch_supported_languages(resp):
|
|
|
- ret_val = {}
|
|
|
- dom = html.fromstring(resp.text)
|
|
|
-
|
|
|
- radio_buttons = eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]')
|
|
|
-
|
|
|
- for x in radio_buttons:
|
|
|
- name = x.get("data-name")
|
|
|
- code = x.get("value").split('_')[-1]
|
|
|
- ret_val[code] = {"name": name}
|
|
|
-
|
|
|
- return ret_val
|
|
|
|
|
|
|
|
|
skip_countries = [
|
|
@@ -404,25 +432,21 @@ skip_countries = [
|
|
|
]
|
|
|
|
|
|
|
|
|
-def fetch_traits(engine_traits: EngineTraits):
|
|
|
+def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
|
|
|
"""Fetch languages from Google."""
|
|
|
- # pylint: disable=import-outside-toplevel
|
|
|
-
|
|
|
- engine_traits.data_type = 'supported_languages' # deprecated
|
|
|
+ # pylint: disable=import-outside-toplevel, too-many-branches
|
|
|
|
|
|
- import babel
|
|
|
- import babel.languages
|
|
|
- from searx import network
|
|
|
- from searx.locales import language_tag, region_tag, get_offical_locales
|
|
|
+ engine_traits.custom['supported_domains'] = {}
|
|
|
|
|
|
resp = network.get('https://www.google.com/preferences')
|
|
|
if not resp.ok:
|
|
|
- print("ERROR: response from Google is not OK.")
|
|
|
+ raise RuntimeError("Response from Google's preferences is not OK.")
|
|
|
|
|
|
dom = html.fromstring(resp.text)
|
|
|
|
|
|
- lang_map = {'no': 'nb'}
|
|
|
+ # supported language codes
|
|
|
|
|
|
+ lang_map = {'no': 'nb'}
|
|
|
for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'):
|
|
|
|
|
|
eng_lang = x.get("value").split('_')[-1]
|
|
@@ -443,6 +467,8 @@ def fetch_traits(engine_traits: EngineTraits):
|
|
|
# alias languages
|
|
|
engine_traits.languages['zh'] = 'lang_zh-CN'
|
|
|
|
|
|
+ # supported region codes
|
|
|
+
|
|
|
for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'):
|
|
|
eng_country = x.get("value")
|
|
|
|
|
@@ -459,4 +485,26 @@ def fetch_traits(engine_traits: EngineTraits):
|
|
|
continue
|
|
|
|
|
|
for sxng_locale in sxng_locales:
|
|
|
- engine_traits.regions[region_tag(sxng_locale)] = 'country' + eng_country
|
|
|
+ engine_traits.regions[region_tag(sxng_locale)] = eng_country
|
|
|
+
|
|
|
+ # alias regions
|
|
|
+ engine_traits.regions['zh-CN'] = 'HK'
|
|
|
+
|
|
|
+ # supported domains
|
|
|
+
|
|
|
+ if add_domains:
|
|
|
+ resp = network.get('https://www.google.com/supported_domains')
|
|
|
+ if not resp.ok:
|
|
|
+ raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
|
|
|
+
|
|
|
+ for domain in resp.text.split():
|
|
|
+ domain = domain.strip()
|
|
|
+ if not domain or domain in [
|
|
|
+ '.google.com',
|
|
|
+ ]:
|
|
|
+ continue
|
|
|
+ region = domain.split('.')[-1].upper()
|
|
|
+ engine_traits.custom['supported_domains'][region] = 'www' + domain
|
|
|
+ if region == 'HK':
|
|
|
+ # There is no google.cn, we use .com.hk for zh-CN
|
|
|
+ engine_traits.custom['supported_domains']['CN'] = 'www' + domain
|