Browse Source

[fix] google video engine - rework of the HTML parser

The Google video response has changed slightly, so a rework of the parser was
needed.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 3 years ago
parent
commit
1ce09df9aa
1 changed file with 31 additions and 22 deletions
  1. 31 22
      searx/engines/google_videos.py

+ 31 - 22
searx/engines/google_videos.py

@@ -31,11 +31,8 @@ from searx.engines.google import (
     get_lang_info,
     time_range_dict,
     filter_mapping,
-    results_xpath,
     g_section_with_header,
     title_xpath,
-    href_xpath,
-    content_xpath,
     suggestion_xpath,
     detect_google_sorry,
 )
@@ -73,11 +70,27 @@ def _re(regexpr):
     RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
     return RE_CACHE[regexpr]
 
+
+def scrap_out_thumbs_src(dom):
+    ret_val = {}
+    thumb_name = 'dimg_'
+    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
+        _script = script.text
+        # "dimg_35":"https://i.ytimg.c....",
+        _dimurl = _re("s='([^']*)").findall( _script)
+        for k,v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)' ).findall(_script):
+            v = v.replace(r'\u003d','=')
+            v = v.replace(r'\u0026','&')
+            ret_val[k] = v
+    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
+    return ret_val
+
+
 def scrap_out_thumbs(dom):
     """Scrap out thumbnail data from <script> tags.
     """
     ret_val = {}
-    thumb_name = 'vidthumb'
+    thumb_name = 'dimg_'
 
     for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
         _script = script.text
@@ -87,20 +100,11 @@ def scrap_out_thumbs(dom):
         if not _imgdata:
             continue
 
-        # var ii=['vidthumb4','vidthumb7']
+        # var ii=['dimg_17']
         for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
             # At least the equal sign in the URL needs to be decoded
             ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
 
-    # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
-    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
-        _script = script.text
-        for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
-            match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
-            if match:
-                # At least the equal sign in the URL needs to be decoded
-                ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")
-
     logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
     return ret_val
 
@@ -144,9 +148,11 @@ def response(resp):
     # convert the text to dom
     dom = html.fromstring(resp.text)
     vidthumb_imgdata = scrap_out_thumbs(dom)
+    thumbs_src = scrap_out_thumbs_src(dom)
+    logger.debug(str(thumbs_src))
 
     # parse results
-    for result in eval_xpath_list(dom, results_xpath):
+    for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
 
         # google *sections*
         if extract_text(eval_xpath(result, g_section_with_header)):
@@ -154,21 +160,24 @@ def response(resp):
             continue
 
         title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
-        url = eval_xpath_getindex(result, href_xpath, 0)
-        c_node = eval_xpath_getindex(result, content_xpath, 0)
+        url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
 
         # <img id="vidthumb1" ...>
-        img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
+        img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None)
         if img_id is None:
+            logger.error("no img_id for: %s" % result)
             continue
+
         img_src = vidthumb_imgdata.get(img_id, None)
         if not img_src:
             logger.error("no vidthumb imgdata for: %s" % img_id)
-            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)
+            img_src = thumbs_src.get(img_id, "")
 
-        length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
-        content = extract_text(eval_xpath(c_node, './/div[2]/span'))
-        pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))
+        length = extract_text(eval_xpath(
+            result, './/div[contains(@class, "P7xzyf")]/span/span'))
+        c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
+        content = extract_text(c_node)
+        pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]'))
 
         results.append({
             'url':         url,