
Merge pull request #2482 from return42/fix-google-video

[fix] revise of the google-Video engine
Alexandre Flament 4 years ago
parent commit 71d66979c2

+ 15 - 1
Makefile

@@ -166,6 +166,18 @@ PHONY += gecko.driver
 gecko.driver:
 	$(PY_ENV_ACT); ./manage.sh install_geckodriver
 
+# search.checker
+# --------------
+
+search.checker: pyenvinstall
+	$(Q)$(PY_ENV_ACT); searx-checker -v
+
+ENGINE_TARGETS=$(patsubst searx/engines/%.py,search.checker.%,$(wildcard searx/engines/[!_]*.py))
+
+$(ENGINE_TARGETS): pyenvinstall
+	$(Q)$(PY_ENV_ACT); searx-checker -v "$(subst _, ,$(patsubst search.checker.%,%,$@))"
+
+
 # test
 # ----
 
@@ -179,7 +191,9 @@ PYLINT_FILES=\
 	searx/engines/deviantart.py \
 	searx/engines/digg.py \
 	searx/engines/google.py \
-	searx/engines/google_news.py
+	searx/engines/google_news.py \
+	searx/engines/google_videos.py \
+	searx/engines/google_images.py
 
 test.pylint: pyenvinstall
 	$(call cmd,pylint,$(PYLINT_FILES))
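
The pattern rule added above derives one search.checker.<module> target per engine module and turns the underscores back into the engine name handed to searx-checker, so a call like make search.checker.google_videos presumably runs searx-checker -v "google videos". A rough Python sketch of that name mapping (illustration only, not part of the build)::

    # illustration only: the name mapping done by the Makefile pattern rule
    from pathlib import Path

    for engine_file in sorted(Path('searx/engines').glob('[!_]*.py')):
        target = 'search.checker.' + engine_file.stem        # e.g. search.checker.google_videos
        engine_name = engine_file.stem.replace('_', ' ')     # e.g. "google videos"
        print('%s -> searx-checker -v "%s"' % (target, engine_name))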

File diff suppressed because it is too large
+ 218 - 546
searx/data/engines_languages.json


+ 64 - 38
searx/engines/google.py

@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """Google (Web)
 
- For detailed description of the *REST-full* API see: `Query Parameter
- Definitions`_.
+For detailed description of the *REST-full* API see: `Query Parameter
+Definitions`_.
 
- .. _Query Parameter Definitions:
- https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+.. _Query Parameter Definitions:
+   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 """
 
 # pylint: disable=invalid-name, missing-function-docstring
@@ -16,7 +16,6 @@ from searx import logger
 from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
 from searx.exceptions import SearxEngineCaptchaException
 
-
 logger = logger.getChild('google engine')
 
 # about
@@ -56,7 +55,7 @@ google_domains = {
     'NZ': 'google.co.nz',   # New Zealand
     'PH': 'google.com.ph',  # Philippines
     'SG': 'google.com.sg',  # Singapore
-    # 'US': 'google.us',    # United States, redirect to .com
+    'US': 'google.com',     # United States (google.us) redirects to .com
     'ZA': 'google.co.za',   # South Africa
     'AR': 'google.com.ar',  # Argentina
     'CL': 'google.cl',      # Chile
@@ -87,7 +86,7 @@ google_domains = {
     'TH': 'google.co.th',   # Thailand
     'TR': 'google.com.tr',  # Turkey
     'UA': 'google.com.ua',  # Ukraine
-    # 'CN': 'google.cn',    # China, only from China ?
+    'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN
     'HK': 'google.com.hk',  # Hong Kong
     'TW': 'google.com.tw'   # Taiwan
 }
@@ -134,26 +133,58 @@ suggestion_xpath = '//div[contains(@class, "card-section")]//a'
 spelling_suggestion_xpath = '//div[@class="med"]/p/a'
 
 
-def get_lang_country(params, lang_list, custom_aliases):
-    """Returns a tuple with *langauage* on its first and *country* on its second
-    position."""
-    language = params['language']
-    if language == 'all':
-        language = 'en-US'
+def get_lang_info(params, lang_list, custom_aliases):
+    ret_val = {}
+
+    _lang = params['language']
+    if _lang.lower() == 'all':
+        _lang = 'en-US'
 
-    language_array = language.split('-')
+    language = match_language(_lang, lang_list, custom_aliases)
+    ret_val['language'] = language
 
-    if len(language_array) == 2:
-        country = language_array[1]
+    # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...)
+    _l = _lang.split('-')
+
+    # the country code (US, AT, CA)
+    if len(_l) == 2:
+        country = _l[1]
     else:
-        country = language_array[0].upper()
+        country = _l[0].upper()
+        if country == 'EN':
+            country = 'US'
+
+    ret_val['country'] = country
 
-    language = match_language(language, lang_list, custom_aliases)
+    # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR)
     lang_country = '%s-%s' % (language, country)
-    if lang_country == 'en-EN':
-        lang_country = 'en'
 
-    return language, country, lang_country
+    # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
+    ret_val['Accept-Language'] = ','.join([
+        lang_country,
+        language + ';q=0.8,',
+        'en;q=0.6',
+        '*;q=0.5',
+    ])
+
+    # subdomain
+    ret_val['subdomain']  = 'www.' + google_domains.get(country.upper(), 'google.com')
+
+    # hl parameter:
+    #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
+    # Interface Language:
+    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
+
+    ret_val['hl'] = lang_list.get(lang_country, language)
+
+    # lr parameter:
+    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
+    # Language Collection Values:
+    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
+
+    ret_val['lr'] = "lang_" + lang_list.get(lang_country, language)
+
+    return ret_val
 
 def detect_google_sorry(resp):
     resp_url = urlparse(resp.url)
@@ -165,17 +196,17 @@ def request(query, params):
     """Google search request"""
 
     offset = (params['pageno'] - 1) * 10
-    language, country, lang_country = get_lang_country(
+
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')
 
-    # https://www.google.de/search?q=corona&hl=de-DE&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
-    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
-        'hl': lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
         'start': offset,
@@ -186,19 +217,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
 
-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
 
-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
-    )
-    logger.debug("HTTP header Accept-Language --> %s",
-                 params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
-    # params['google_subdomain'] = subdomain
 
     return params
 
@@ -209,8 +235,6 @@ def response(resp):
     detect_google_sorry(resp)
 
     results = []
-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
 
     # convert the text to dom
     dom = html.fromstring(resp.text)
@@ -247,7 +271,9 @@ def response(resp):
                 logger.debug('ingoring <div class="g" ../> section: missing title')
                 continue
             title = extract_text(title_tag)
-            url = eval_xpath_getindex(result, href_xpath, 0)
+            url = eval_xpath_getindex(result, href_xpath, 0, None)
+            if url is None:
+                continue
             content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
             results.append({
                 'url': url,
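
To make the URL construction above concrete, here is a minimal standalone sketch with hand-filled stand-ins for the values that get_lang_info() returns; the subdomain/hl/lr values below are assumptions for a German query, matching the example URL in the comment::

    from urllib.parse import urlencode

    # hand-filled stand-ins for what get_lang_info() would return
    lang_info = {'subdomain': 'www.google.de', 'hl': 'de', 'lr': 'lang_de'}

    query_url = 'https://' + lang_info['subdomain'] + '/search' + '?' + urlencode({
        'q': 'corona',
        'hl': lang_info['hl'],
        'lr': lang_info['lr'],
        'ie': 'utf8',
        'oe': 'utf8',
        'start': 0,
    })
    # -> https://www.google.de/search?q=corona&hl=de&lr=lang_de&ie=utf8&oe=utf8&start=0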

+ 72 - 71
searx/engines/google_images.py

@@ -10,35 +10,50 @@ Definitions`_.
    ``data:` scheme).::
 
      Header set Content-Security-Policy "img-src 'self' data: ;"
+
+.. _Query Parameter Definitions:
+   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+.. _data URLs:
+   https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
 """
 
 from urllib.parse import urlencode, unquote
 from lxml import html
+
 from searx import logger
-from searx.utils import extract_text, eval_xpath
-from searx.engines.google import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
+from searx.utils import (
+    eval_xpath,
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)
 
 from searx.engines.google import (
-    get_lang_country,
-    google_domains,
+    get_lang_info,
     time_range_dict,
     detect_google_sorry,
 )
 
+# pylint: disable=unused-import
+from searx.engines.google import (
+    supported_languages_url
+    ,  _fetch_supported_languages
+)
+# pylint: enable=unused-import
+
 logger = logger.getChild('google images')
 
 # about
 about = {
-    "website": 'https://images.google.com/',
+    "website": 'https://images.google.com',
     "wikidata_id": 'Q521550',
-    "official_api_documentation": 'https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions',  # NOQA
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
 }
 
 # engine dependent config
-
 categories = ['images']
 paging = False
 language_support = True
@@ -84,17 +99,16 @@ def scrap_img_by_id(script, data_id):
 def request(query, params):
     """Google-Video search request"""
 
-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')
 
-    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
         'tbm': "isch",
-        'hl': lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
         'num': 30,
@@ -105,17 +119,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
 
-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
 
-    params['headers']['Accept-Language'] = (
-        "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language))
-    logger.debug(
-        "HTTP Accept-Language --> %s", params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
-    # params['google_subdomain'] = subdomain
     return params
 
 
@@ -125,13 +136,11 @@ def response(resp):
 
     detect_google_sorry(resp)
 
-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)
     img_bas64_map = scrap_out_thumbs(dom)
-    img_src_script = eval_xpath(dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text
+    img_src_script = eval_xpath_getindex(
+        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text
 
     # parse results
     #
@@ -156,55 +165,47 @@ def response(resp):
         return results
 
     root = root[0]
-    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):
-
-        try:
-            img_alt = eval_xpath(img_node, '@alt')[0]
-
-            img_base64_id = eval_xpath(img_node, '@data-iid')
-            if img_base64_id:
-                img_base64_id = img_base64_id[0]
-                thumbnail_src = img_bas64_map[img_base64_id]
+    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):
+
+        img_alt = eval_xpath_getindex(img_node, '@alt', 0)
+
+        img_base64_id = eval_xpath(img_node, '@data-iid')
+        if img_base64_id:
+            img_base64_id = img_base64_id[0]
+            thumbnail_src = img_bas64_map[img_base64_id]
+        else:
+            thumbnail_src = eval_xpath(img_node, '@src')
+            if not thumbnail_src:
+                thumbnail_src = eval_xpath(img_node, '@data-src')
+            if thumbnail_src:
+                thumbnail_src = thumbnail_src[0]
             else:
-                thumbnail_src = eval_xpath(img_node, '@src')
-                if not thumbnail_src:
-                    thumbnail_src = eval_xpath(img_node, '@data-src')
-                if thumbnail_src:
-                    thumbnail_src = thumbnail_src[0]
-                else:
-                    thumbnail_src = ''
-
-            link_node = eval_xpath(img_node, '../../../a[2]')[0]
-            url = eval_xpath(link_node, '@href')[0]
-
-            pub_nodes = eval_xpath(link_node, './div/div')
-            pub_descr = img_alt
-            pub_source = ''
-            if pub_nodes:
-                pub_descr = extract_text(pub_nodes[0])
-                pub_source = extract_text(pub_nodes[1])
-
-            img_src_id = eval_xpath(img_node, '../../../@data-id')[0]
-            src_url = scrap_img_by_id(img_src_script, img_src_id)
-            if not src_url:
-                src_url = thumbnail_src
-
-            results.append({
-                'url': url,
-                'title': img_alt,
-                'content': pub_descr,
-                'source': pub_source,
-                'img_src': src_url,
-                # 'img_format': img_format,
-                'thumbnail_src': thumbnail_src,
-                'template': 'images.html'
-            })
-        except Exception as e:  # pylint: disable=broad-except
-            logger.error(e, exc_info=True)
-            # from lxml import etree
-            # logger.debug(etree.tostring(img_node, pretty_print=True))
-            # import pdb
-            # pdb.set_trace()
-            continue
+                thumbnail_src = ''
+
+        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
+        url = eval_xpath_getindex(link_node, '@href', 0)
+
+        pub_nodes = eval_xpath(link_node, './div/div')
+        pub_descr = img_alt
+        pub_source = ''
+        if pub_nodes:
+            pub_descr = extract_text(pub_nodes[0])
+            pub_source = extract_text(pub_nodes[1])
+
+        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
+        src_url = scrap_img_by_id(img_src_script, img_src_id)
+        if not src_url:
+            src_url = thumbnail_src
+
+        results.append({
+            'url': url,
+            'title': img_alt,
+            'content': pub_descr,
+            'source': pub_source,
+            'img_src': src_url,
+            # 'img_format': img_format,
+            'thumbnail_src': thumbnail_src,
+            'template': 'images.html'
+        })
 
     return results
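
The rewrite in this file drops the broad try/except around each image node in favour of eval_xpath_list() and eval_xpath_getindex(). A minimal sketch of the assumed eval_xpath_getindex() behaviour, written as a local stand-in helper: return the n-th XPath match, or the given default instead of raising::

    from lxml import html

    def xpath_getindex(node, expr, index, default=...):
        # evaluate the XPath and return item `index`; if it is missing,
        # return `default` when one is given, otherwise raise
        result = node.xpath(expr)
        if index < len(result):
            return result[index]
        if default is ...:
            raise IndexError('no result for %s' % expr)
        return default

    doc = html.fromstring('<div><img class="rg_i" alt="cat" data-iid="vidthumb1"/></div>')
    img = doc.xpath('//img')[0]
    print(xpath_getindex(img, '@alt', 0))             # 'cat'
    print(xpath_getindex(img, '@data-src', 0, None))  # None instead of an exception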

+ 35 - 34
searx/engines/google_news.py

@@ -2,13 +2,16 @@
 """Google (News)
 
 For detailed description of the *REST-full* API see: `Query Parameter
-Definitions`_.  Not all parameters can be appied, e.g. num_ (the number of
-search results to return) is ignored.
+Definitions`_.  Not all parameters can be applied:
+
+- num_ : the number of search results is ignored
+- save_ : is ignored / Google-News results are always *SafeSearch*
 
 .. _Query Parameter Definitions:
    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 
 .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
+.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
 
 """
 
@@ -32,20 +35,19 @@ from searx.utils import (
 from searx.engines.google import (
     supported_languages_url,
     _fetch_supported_languages,
-    detect_google_sorry,
 )
 # pylint: enable=unused-import
 
 from searx.engines.google import (
-    get_lang_country,
-    filter_mapping,
+    get_lang_info,
+    detect_google_sorry,
 )
 
 # about
 about = {
     "website": 'https://news.google.com',
     "wikidata_id": 'Q12020',
-    "official_api_documentation": None,
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
@@ -69,51 +71,53 @@ paging = False
 language_support = True
 use_locale_domain = True
 time_range_support = True
-safesearch = True # not really, but it is not generated by google
+
+# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
+# False here, otherwise checker will report safesearch-errors::
+#
+#  safesearch : results are identical for safesearch=0 and safesearch=2
+safesearch = False
 
 def request(query, params):
     """Google-News search request"""
 
-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'news.google.com'
 
-    if params['time_range']: # in time_range_dict:
+    # google news has only one domain
+    lang_info['subdomain'] = 'news.google.com'
+
+    ceid = "%s:%s" % (lang_info['country'], lang_info['language'])
+
+    # google news redirects en to en-US
+    if lang_info['hl'] == 'en':
+        lang_info['hl'] = 'en-US'
+
+    # Very special to google-news compared to other google engines, the time
+    # range is included in the search term.
+    if params['time_range']:
         query += ' ' + time_range_dict[params['time_range']]
 
-    query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
-        'hl': language,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
-        'ceid' : "%s:%s" % (country, language),
-        'gl' : country,
-    })
+        'gl': lang_info['country'],
+    }) + ('&ceid=%s' % ceid)  # ceid includes a ':' character which must not be urlencoded
 
-    if params['safesearch']:
-        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
-
-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
 
-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
-        )
-    logger.debug("HTTP header Accept-Language --> %s",
-                 params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
         )
 
-    # hl=en redirect to hl=en-US / en-CA ...
-    params['soft_max_redirects'] = 1
-
-    #params['google_subdomain'] = subdomain
-
     return params
 
 
@@ -123,9 +127,6 @@ def response(resp):
 
     detect_google_sorry(resp)
 
-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)
 

+ 166 - 63
searx/engines/google_videos.py

@@ -1,99 +1,202 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
+"""Google (Video)
+
+For detailed description of the *REST-full* API see: `Query Parameter
+Definitions`_.  Not all parameters can be applied.
+
+.. _admonition:: Content-Security-Policy (CSP)
+
+   This engine needs to allow images from the `data URLs`_ (prefixed with the
+   ``data:` scheme).::
+
+     Header set Content-Security-Policy "img-src 'self' data: ;"
+
+.. _Query Parameter Definitions:
+   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+.. _data URLs:
+   https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
+
 """
- Google (Videos)
-"""
 
-from datetime import date, timedelta
+# pylint: disable=invalid-name, missing-function-docstring
+
+import re
 from urllib.parse import urlencode
 from lxml import html
-from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
-import re
+
+from searx import logger
+from searx.utils import (
+    eval_xpath,
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)
+
+from searx.engines.google import (
+    get_lang_info,
+    time_range_dict,
+    filter_mapping,
+    results_xpath,
+    g_section_with_header,
+    title_xpath,
+    href_xpath,
+    content_xpath,
+    suggestion_xpath,
+    spelling_suggestion_xpath,
+    detect_google_sorry,
+)
+
+# pylint: disable=unused-import
+from searx.engines.google import (
+    supported_languages_url
+    ,  _fetch_supported_languages
+)
+# pylint: enable=unused-import
 
 # about
 about = {
     "website": 'https://www.google.com',
     "wikidata_id": 'Q219885',
-    "official_api_documentation": 'https://developers.google.com/custom-search/',
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
 }
 
+logger = logger.getChild('google video')
+
 # engine dependent config
+
 categories = ['videos']
-paging = True
-safesearch = True
+paging = False
+language_support = True
+use_locale_domain = True
 time_range_support = True
-number_of_results = 10
+safesearch = True
 
-search_url = 'https://www.google.com/search'\
-    '?q={query}'\
-    '&tbm=vid'\
-    '&{search_options}'
-time_range_attr = "qdr:{range}"
-time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}"
-time_range_dict = {'day': 'd',
-                   'week': 'w',
-                   'month': 'm'}
+RE_CACHE = {}
 
+def _re(regexpr):
+    """returns compiled regular expression"""
+    RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
+    return RE_CACHE[regexpr]
 
-# do search-request
-def request(query, params):
-    search_options = {
-        'ijn': params['pageno'] - 1,
-        'start': (params['pageno'] - 1) * number_of_results
-    }
+def scrap_out_thumbs(dom):
+    """Scrap out thumbnail data from <script> tags.
+    """
+    ret_val = dict()
+    thumb_name = 'vidthumb'
 
-    if params['time_range'] in time_range_dict:
-        search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])
-    elif params['time_range'] == 'year':
-        now = date.today()
-        then = now - timedelta(days=365)
-        start = then.strftime('%m/%d/%Y')
-        end = now.strftime('%m/%d/%Y')
-        search_options['tbs'] = time_range_custom_attr.format(start=start, end=end)
+    for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
+        _script = script.text
+
+        # var s='data:image/jpeg;base64, ...'
+        _imgdata = _re("s='([^']*)").findall( _script)
+        if not _imgdata:
+            continue
 
-    if safesearch and params['safesearch']:
-        search_options['safe'] = 'on'
+        # var ii=['vidthumb4','vidthumb7']
+        for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
+            # At least the equal sign in the URL needs to be decoded
+            ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
 
-    params['url'] = search_url.format(query=urlencode({'q': query}),
-                                      search_options=urlencode(search_options))
+    # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
+    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
+        _script = script.text
+        for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
+            match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
+            if match:
+                # At least the equal sign in the URL needs to be decoded
+                ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")
 
+    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
+    return ret_val
+
+
+def request(query, params):
+    """Google-Video search request"""
+
+    lang_info = get_lang_info(
+        # pylint: disable=undefined-variable
+        params, supported_languages, language_aliases
+    )
+
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
+        'q':   query,
+        'tbm': "vid",
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
+        'ie': "utf8",
+        'oe': "utf8",
+    })
+
+    if params['time_range'] in time_range_dict:
+        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
+    if params['safesearch']:
+        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
+
+    logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
+
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
+    params['headers']['Accept'] = (
+        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+        )
     return params
 
 
-# get response from search-request
 def response(resp):
+    """Get response from google's search request"""
     results = []
 
+    detect_google_sorry(resp)
+
+    # convert the text to dom
     dom = html.fromstring(resp.text)
+    vidthumb_imgdata = scrap_out_thumbs(dom)
 
     # parse results
-    for result in eval_xpath_list(dom, '//div[@class="g"]'):
-
-        title = extract_text(eval_xpath(result, './/h3'))
-        url = eval_xpath_getindex(result, './/div[@class="r"]/a/@href', 0)
-        content = extract_text(eval_xpath(result, './/span[@class="st"]'))
-
-        # get thumbnails
-        script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
-        ids = result.xpath('.//div[@class="s"]//img/@id')
-        if len(ids) > 0:
-            thumbnails_data = \
-                re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0],
-                           script)
-            tmp = []
-            if len(thumbnails_data) != 0:
-                tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
-            thumbnail = ''
-            if len(tmp) != 0:
-                thumbnail = tmp[-1]
-
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content,
-                        'thumbnail': thumbnail,
-                        'template': 'videos.html'})
+    for result in eval_xpath_list(dom, results_xpath):
+
+        # google *sections*
+        if extract_text(eval_xpath(result, g_section_with_header)):
+            logger.debug("ingoring <g-section-with-header>")
+            continue
+
+        title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
+        url = eval_xpath_getindex(result, href_xpath, 0)
+        c_node = eval_xpath_getindex(result, content_xpath, 0)
+
+        # <img id="vidthumb1" ...>
+        img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
+        if img_id is None:
+            continue
+        img_src = vidthumb_imgdata.get(img_id, None)
+        if not img_src:
+            logger.error("no vidthumb imgdata for: %s" % img_id)
+            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)
+
+        length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
+        content = extract_text(eval_xpath(c_node, './/div[2]/span'))
+        pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))
+
+        results.append({
+            'url':         url,
+            'title':       title,
+            'content':     content,
+            'length':      length,
+            'author':      pub_info,
+            'thumbnail':   img_src,
+            'template':    'videos.html',
+            })
+
+    # parse suggestion
+    for suggestion in eval_xpath_list(dom, suggestion_xpath):
+        # append suggestion
+        results.append({'suggestion': extract_text(suggestion)})
+
+    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
+        results.append({'correction': extract_text(correction)})
 
     return results
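
scrap_out_thumbs() recovers the base64-encoded thumbnails that Google inlines via _setImagesSrc(); a toy run of the same regex logic on a synthetic <script> body (illustration only)::

    import re

    # synthetic <script> content, shaped like the _setImagesSrc() blocks in the page
    _script = (
        "var s='data:image/jpeg;base64,AAAA\\x3d\\x3d';"
        "var ii=['vidthumb4','vidthumb7'];_setImagesSrc(ii,s);"
    )

    imgdata = re.findall(r"s='([^']*)", _script)
    thumbs = {}
    for vidthumb in re.findall(r"(vidthumb\d+)", _script):
        # at least the equal sign in the data URL needs to be decoded
        thumbs[vidthumb] = imgdata[0].replace(r"\x3d", "=")

    print(thumbs)
    # {'vidthumb4': 'data:image/jpeg;base64,AAAA==', 'vidthumb7': 'data:image/jpeg;base64,AAAA=='}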

+ 0 - 3
searx/languages.py

@@ -21,8 +21,6 @@ language_codes = \
     ('en-IE', 'English', 'Ireland', 'English'),
     ('en-IN', 'English', 'India', 'English'),
     ('en-NZ', 'English', 'New Zealand', 'English'),
-    ('en-PH', 'English', 'Philippines', 'English'),
-    ('en-SG', 'English', 'Singapore', 'English'),
     ('en-US', 'English', 'United States', 'English'),
     ('es', 'Español', '', 'Spanish'),
     ('es-AR', 'Español', 'Argentina', 'Spanish'),
@@ -48,7 +46,6 @@ language_codes = \
     ('ko-KR', '한국어', '', 'Korean'),
     ('lt-LT', 'Lietuvių', '', 'Lithuanian'),
     ('lv-LV', 'Latviešu', '', 'Latvian'),
-    ('ms-MY', 'Melayu', '', 'Malay'),
     ('nb-NO', 'Norsk Bokmål', '', 'Norwegian Bokmål'),
     ('nl', 'Nederlands', '', 'Dutch'),
     ('nl-BE', 'Nederlands', 'België', 'Dutch'),

+ 42 - 16
searx/settings.yml

@@ -109,7 +109,7 @@ checker:
     # scheduling: interval or int
     # use "scheduling: False" to disable scheduling
     # to activate the scheduler:
-    # * uncomment "scheduling" section 
+    # * uncomment "scheduling" section
     # * add "cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1" to your uwsgi.ini
 
     # scheduling:
@@ -117,24 +117,36 @@ checker:
     #    every: [86400, 90000]  # how often the checker runs
 
     # additional tests: only for the YAML anchors (see the engines section)
+
     additional_tests:
-        rosebud: &test_rosebud
-          matrix:
-              query: rosebud
-              lang: en
-          result_container:
-              - not_empty
-              - ['one_title_contains', 'citizen kane']
-          test:
-              - unique_results
+      rosebud: &test_rosebud
+        matrix:
+          query: rosebud
+          lang: en
+        result_container:
+          - not_empty
+          - ['one_title_contains', 'citizen kane']
+        test:
+          - unique_results
+
+      android: &test_android
+        matrix:
+          query: ['android']
+          lang: ['en', 'de', 'fr', 'zh-CN']
+        result_container:
+          - not_empty
+          - ['one_title_contains', 'google']
+        test:
+          - unique_results
+
     # tests: only for the YAML anchors (see the engines section)
     tests:
-        infobox: &tests_infobox
-          infobox:
-              matrix:
-                  query: ["linux", "new york", "bbc"]
-              result_container:
-                  - has_infobox
+      infobox: &tests_infobox
+        infobox:
+          matrix:
+            query: ["linux", "new york", "bbc"]
+          result_container:
+            - has_infobox
 
 engines:
   - name: apk mirror
@@ -480,18 +492,32 @@ engines:
   - name : google
     engine : google
     shortcut : go
+    # additional_tests:
+    #   android: *test_android
 
   - name : google images
     engine : google_images
     shortcut : goi
+    # additional_tests:
+    #   android: *test_android
+    #   dali:
+    #     matrix:
+    #       query: ['Dali Christ']
+    #       lang: ['en', 'de', 'fr', 'zh-CN']
+    #     result_container:
+    #       - ['one_title_contains', 'Salvador']
 
   - name : google news
     engine : google_news
     shortcut : gon
+    # additional_tests:
+    #   android: *test_android
 
   - name : google videos
     engine : google_videos
     shortcut : gov
+    # additional_tests:
+    #   android: *test_android
 
   - name : google scholar
     engine : xpath

Some files were not shown because too many files changed in this diff