|
@@ -27,10 +27,8 @@ The google news API ignores some parameters from the common :ref:`google API`:
|
|
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
|
|
-import binascii
|
|
|
-import re
|
|
|
from urllib.parse import urlencode
|
|
|
-from base64 import b64decode
|
|
|
+import base64
|
|
|
from lxml import html
|
|
|
import babel
|
|
|
|
|
@@ -144,34 +142,17 @@ def response(resp):
|
|
|
|
|
|
for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
|
|
|
- url = re.findall('http[^;]*', jslog)
|
|
|
- if url:
|
|
|
- url = url[0]
|
|
|
- else:
|
|
|
-
|
|
|
-
|
|
|
- jslog = jslog.split(";")[1].split(':')[1].strip()
|
|
|
- try:
|
|
|
- padding = (4 - (len(jslog) % 4)) * "="
|
|
|
- jslog = b64decode(jslog + padding)
|
|
|
- except binascii.Error:
|
|
|
-
|
|
|
- continue
|
|
|
-
|
|
|
-
|
|
|
- url = re.findall('http[^;"]*', str(jslog))[0]
|
|
|
-
|
|
|
-
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ href = eval_xpath_getindex(result, './article/a/@href', 0)
|
|
|
+ href = href.split('?')[0]
|
|
|
+ href = href.split('/')[-1]
|
|
|
+ href = base64.urlsafe_b64decode(href + '====')
|
|
|
+ href = href[4:].split(b'\xd2')[0]
|
|
|
+ href = href.decode()
|
|
|
+
|
|
|
title = extract_text(eval_xpath(result, './article/h3[1]'))
|
|
|
|
|
|
|
|
@@ -189,7 +170,7 @@ def response(resp):
|
|
|
|
|
|
results.append(
|
|
|
{
|
|
|
- 'url': url,
|
|
|
+ 'url': href,
|
|
|
'title': title,
|
|
|
'content': content,
|
|
|
'img_src': img_src,
|