Browse Source

[fix] brave.news engine: response is HTML and no longer JSON

The response from brave.com for news is no longer a JSON string.

Closes: https://github.com/searxng/searxng/issues/4352
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Markus Heiser 2 months ago
parent
commit
feb15e3878
1 changed file with 24 additions and 11 deletions
  1. 24 11
      searx/engines/brave.py

+ 24 - 11
searx/engines/brave.py

@@ -254,14 +254,14 @@ def response(resp) -> EngineResults:
     if brave_category in ('search', 'goggles'):
         return _parse_search(resp)
 
+    if brave_category in ('news'):
+        return _parse_news(resp)
+
     datastr = extr(resp.text, "const data = ", ";\n").strip()
 
     json_data = js_variable_to_python(datastr)
     json_resp = json_data[1]['data']['body']['response']
 
-    if brave_category == 'news':
-        return _parse_news(json_resp['news'])
-
     if brave_category == 'images':
         return _parse_images(json_resp)
     if brave_category == 'videos':
@@ -339,18 +339,31 @@ def _parse_search(resp) -> EngineResults:
     return result_list
 
 
-def _parse_news(json_resp) -> EngineResults:
+def _parse_news(resp) -> EngineResults:
+
     result_list = EngineResults()
+    dom = html.fromstring(resp.text)
+
+    for result in eval_xpath_list(dom, '//div[contains(@class, "results")]//div[@data-type="news"]'):
+
+        # import pdb
+        # pdb.set_trace()
+
+        url = eval_xpath_getindex(result, './/a[contains(@class, "result-header")]/@href', 0, default=None)
+        if url is None:
+            continue
+
+        title = extract_text(eval_xpath_list(result, './/span[contains(@class, "snippet-title")]'))
+        content = extract_text(eval_xpath_list(result, './/p[contains(@class, "desc")]'))
+        thumbnail = eval_xpath_getindex(result, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default='')
 
-    for result in json_resp["results"]:
         item = {
-            'url': result['url'],
-            'title': result['title'],
-            'content': result['description'],
-            'publishedDate': _extract_published_date(result['age']),
+            "url": url,
+            "title": title,
+            "content": content,
+            "thumbnail": thumbnail,
         }
-        if result['thumbnail'] is not None:
-            item['thumbnail'] = result['thumbnail']['src']
+
         result_list.append(item)
 
     return result_list