3 years ago · 328473befd
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -138,12 +138,7 @@ content_xpath = './/div[@class="IsZvec"]'
 
															 # Suggestions are links placed in a *card-section*, we extract only the text
														
 
															 # from the links, not the links themselves.
														
 
															-suggestion_xpath = '//div[contains(@class, "card-section")]//a'
														
 
															-
														
 
															-# Since google does *auto-correction* on the first query these are not really
														
 
															-# *spelling suggestions*, we use them anyway.
														
 
															-spelling_suggestion_xpath = '//div[@class="med"]/p/a'
														
 
															-
														
 
															+suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
														
 
															 def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
														
 
															     """Composing various language properties for the google engines.
														
@@ -322,7 +317,6 @@ def response(resp):
 
															     # convert the text to dom
														
 
															     dom = html.fromstring(resp.text)
														
 
															-
														
 
															     # results --> answer
														
 
															     answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
														
 
															     if answer_list:
														
@@ -379,9 +373,6 @@ def response(resp):
 
															         # append suggestion
														
 
															         results.append({'suggestion': extract_text(suggestion)})
														
 
															-    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
														
 
															-        results.append({'correction': extract_text(correction)})
														
 
															-
														
 
															     # return results
														
 
															     return results
														
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -31,13 +31,9 @@ from searx.engines.google import (
 
															     get_lang_info,
														
 
															     time_range_dict,
														
 
															     filter_mapping,
														
 
															-    results_xpath,
														
 
															     g_section_with_header,
														
 
															     title_xpath,
														
 
															-    href_xpath,
														
 
															-    content_xpath,
														
 
															     suggestion_xpath,
														
 
															-    spelling_suggestion_xpath,
														
 
															     detect_google_sorry,
														
 
															 )
														
@@ -74,11 +70,27 @@ def _re(regexpr):
 
															     RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
														
 
															     return RE_CACHE[regexpr]
														
 
															+
														
 
															+def scrap_out_thumbs_src(dom):
														
 
															+    ret_val = {}
														
 
															+    thumb_name = 'dimg_'
														
 
															+    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
														
 
															+        _script = script.text
														
 
															+        # "dimg_35":"https://i.ytimg.c....",
														
 
															+        _dimurl = _re("s='([^']*)").findall( _script)
														
 
															+        for k,v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)' ).findall(_script):
														
 
															+            v = v.replace(r'\u003d','=')
														
 
															+            v = v.replace(r'\u0026','&')
														
 
															+            ret_val[k] = v
														
 
															+    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
														
 
															+    return ret_val
														
 
															+
														
 
															+
														
 
															 def scrap_out_thumbs(dom):
														
 
															     """Scrap out thumbnail data from <script> tags.
														
 
															     """
														
 
															     ret_val = {}
														
 
															-    thumb_name = 'vidthumb'
														
 
															+    thumb_name = 'dimg_'
														
 
															     for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
														
 
															         _script = script.text
														
@@ -88,20 +100,11 @@ def scrap_out_thumbs(dom):
 
															         if not _imgdata:
														
 
															             continue
														
 
															-        # var ii=['vidthumb4','vidthumb7']
														
 
															+        # var ii=['dimg_17']
														
 
															         for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
														
 
															             # At least the equal sign in the URL needs to be decoded
														
 
															             ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
														
 
															-    # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
														
 
															-    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
														
 
															-        _script = script.text
														
 
															-        for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
														
 
															-            match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
														
 
															-            if match:
														
 
															-                # At least the equal sign in the URL needs to be decoded
														
 
															-                ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")
														
 
															-
														
 
															     logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
														
 
															     return ret_val
														
@@ -145,9 +148,11 @@ def response(resp):
 
															     # convert the text to dom
														
 
															     dom = html.fromstring(resp.text)
														
 
															     vidthumb_imgdata = scrap_out_thumbs(dom)
														
 
															+    thumbs_src = scrap_out_thumbs_src(dom)
														
 
															+    logger.debug(str(thumbs_src))
														
 
															     # parse results
														
 
															-    for result in eval_xpath_list(dom, results_xpath):
														
 
															+    for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
														
 
															         # google *sections*
														
 
															         if extract_text(eval_xpath(result, g_section_with_header)):
														
@@ -155,21 +160,24 @@ def response(resp):
 
															             continue
														
 
															         title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
														
 
															-        url = eval_xpath_getindex(result, href_xpath, 0)
														
 
															-        c_node = eval_xpath_getindex(result, content_xpath, 0)
														
 
															+        url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
														
 
															         # <img id="vidthumb1" ...>
														
 
															-        img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
														
 
															+        img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None)
														
 
															         if img_id is None:
														
 
															+            logger.error("no img_id for: %s" % result)
														
 
															             continue
														
 
															+
														
 
															         img_src = vidthumb_imgdata.get(img_id, None)
														
 
															         if not img_src:
														
 
															             logger.error("no vidthumb imgdata for: %s" % img_id)
														
 
															-            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)
														
 
															+            img_src = thumbs_src.get(img_id, "")
														
 
															-        length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
														
 
															-        content = extract_text(eval_xpath(c_node, './/div[2]/span'))
														
 
															-        pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))
														
 
															+        length = extract_text(eval_xpath(
														
 
															+            result, './/div[contains(@class, "P7xzyf")]/span/span'))
														
 
															+        c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
														
 
															+        content = extract_text(c_node)
														
 
															+        pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]'))
														
 
															         results.append({
														
 
															             'url':         url,
														
@@ -186,7 +194,4 @@ def response(resp):
 
															         # append suggestion
														
 
															         results.append({'suggestion': extract_text(suggestion)})
														
 
															-    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
														
 
															-        results.append({'correction': extract_text(correction)})
														
 
															-
														
 
															     return results