2 years ago · 460bbe5b81
--- a/docs/dev/engines/online/brave.rst
+++ b/docs/dev/engines/online/brave.rst
@@ -0,0 +1,13 @@
 
				+.. _brave engine:
			
 
				+
			
 
				+=============
			
 
				+Brave Engines
			
 
				+=============
			
 
				+
			
 
				+.. contents:: Contents
			
 
				+   :depth: 2
			
 
				+   :local:
			
 
				+   :backlinks: entry
			
 
				+
			
 
				+.. automodule:: searx.engines.brave
			
 
				+  :members:
			
--- a/searx/engines/brave.py
+++ b/searx/engines/brave.py
@@ -1,10 +1,56 @@
 
				 # SPDX-License-Identifier: AGPL-3.0-or-later
			
 
				+# lint: pylint
			
 
				+"""Brave supports the categories listed in :py:obj:`brave_category` (General,
			
 
				+news, videos, images).  The support of :py:obj:`paging` and :py:obj:`time range
			
 
				+<time_range_support>` is limited (see remarks).
			
 
				+
			
 
				+Configured ``brave`` engines:
			
 
				+
			
 
				+.. code:: yaml
			
 
				+
			
 
				+  - name: brave
			
 
				+    engine: brave
			
 
				+    ...
			
 
				+    brave_category: search
			
 
				+    time_range_support: true
			
 
				+    paging: true
			
 
				+
			
 
				+  - name: brave.images
			
 
				+    engine: brave
			
 
				+    ...
			
 
				+    brave_category: images
			
 
				+
			
 
				+  - name: brave.videos
			
 
				+    engine: brave
			
 
				+    ...
			
 
				+    brave_category: videos
			
 
				+
			
 
				+  - name: brave.news
			
 
				+    engine: brave
			
 
				+    ...
			
 
				+    brave_category: news
			
 
				+
			
 
				+
			
 
				+Implementations
			
 
				+===============
			
 
				+
			
 
				 """
			
 
				- Brave (General, news, videos, images)
			
 
				-"""
			
 
				+# pylint: disable=fixme
			
 
				+
			
 
				+from urllib.parse import (
			
 
				+    urlencode,
			
 
				+    urlparse,
			
 
				+    parse_qs,
			
 
				+)
			
 
				 
			
 
				-from urllib.parse import urlencode
			
 
				 import chompjs
			
 
				+from lxml import html
			
 
				+
			
 
				+from searx.utils import (
			
 
				+    extract_text,
			
 
				+    eval_xpath_list,
			
 
				+    eval_xpath_getindex,
			
 
				+)
			
 
				 
			
 
				 about = {
			
 
				     "website": 'https://search.brave.com/',
			
@@ -14,41 +60,87 @@ about = {
 
				     "require_api_key": False,
			
 
				     "results": 'HTML',
			
 
				 }
			
 
				+
			
 
				 base_url = "https://search.brave.com/"
			
 
				+categories = []
			
 
				+brave_category = 'search'
			
 
				+"""Brave supports common web search, image search, video search and news search.
			
 
				+
			
 
				+- ``search``: Common WEB search
			
 
				+- ``videos``: search for videos
			
 
				+- ``images``: search for images
			
 
				+- ``news``: search for news
			
 
				+"""
			
 
				+
			
 
				+brave_spellcheck = False
			
 
				+"""Brave supports some kind of spell checking.  When activated, Brave tries to
			
 
				+fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``.  In
			
 
				+the UI of Brave the user gets warned about this.  Since we cannot warn the user
			
 
				+in SearXNG, the spellchecking is disabled by default.
			
 
				+"""
			
 
				+
			
 
				+send_accept_language_header = True
			
 
				 paging = False
			
 
				-categories = ['images', 'videos', 'news']  # images, videos, news
			
 
				+"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
			
 
				+category All)."""
			
 
				+
			
 
				+safesearch = True
			
 
				+safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'}  # cookie: safesearch=off
			
 
				+
			
 
				+time_range_support = False
			
 
				+"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
			
 
				+category All)."""
			
 
				+
			
 
				+time_range_map = {
			
 
				+    'day': 'pd',
			
 
				+    'week': 'pw',
			
 
				+    'month': 'pm',
			
 
				+    'year': 'py',
			
 
				+}
			
 
				 
			
 
				 
			
 
				 def request(query, params):
			
 
				+
			
 
				+    # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
			
 
				+    params['headers']['Accept-Encoding'] = 'gzip, deflate'
			
 
				+
			
 
				     args = {
			
 
				         'q': query,
			
 
				-        'spellcheck': 1,
			
 
				     }
			
 
				-    params["url"] = f"{base_url}{categories[0]}?{urlencode(args)}"
			
 
				-
			
 
				-
			
 
				-def get_video_results(json_data):
			
 
				-    results = []
			
 
				-
			
 
				-    for result in json_data:
			
 
				-        results.append(
			
 
				-            {
			
 
				-                'template': 'videos.html',
			
 
				-                'url': result['url'],
			
 
				-                'thumbnail_src': result['thumbnail']['src'],
			
 
				-                'img_src': result['properties']['url'],
			
 
				-                'content': result['description'],
			
 
				-                'title': result['title'],
			
 
				-                'source': result['source'],
			
 
				-                'duration': result['video']['duration'],
			
 
				-            }
			
 
				-        )
			
 
				+    if brave_spellcheck:
			
 
				+        args['spellcheck'] = '1'
			
 
				+
			
 
				+    if brave_category == 'search':
			
 
				+        if params.get('pageno', 1) - 1:
			
 
				+            args['offset'] = params.get('pageno', 1) - 1
			
 
				+        if time_range_map.get(params['time_range']):
			
 
				+            args['tf'] = time_range_map.get(params['time_range'])
			
 
				+
			
 
				+    params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"
			
 
				+
			
 
				+    # set preferences in cookie
			
 
				+    params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
			
 
				+
			
 
				+    # ToDo: we need a fetch_traits(..) implementation / the ui_lang of Brave are
			
 
				+    #       limited and the country handling has it quirks
			
 
				+
			
 
				+    eng_locale = params.get('searxng_locale')
			
 
				+    params['cookies']['useLocation'] = '0'  # the useLocation is IP based, we use 'country'
			
 
				+    params['cookies']['summarizer'] = '0'
			
 
				 
			
 
				-    return results
			
 
				+    if not eng_locale or eng_locale == 'all':
			
 
				+        params['cookies']['country'] = 'all'  # country=all
			
 
				+    else:
			
 
				+        params['cookies']['country'] = eng_locale.split('-')[-1].lower()
			
 
				+        params['cookies']['ui_lang'] = eng_locale.split('-')[0].lower()
			
 
				+
			
 
				+    # logger.debug("cookies %s", params['cookies'])
			
 
				 
			
 
				 
			
 
				 def response(resp):
			
 
				-    results = []
			
 
				+
			
 
				+    if brave_category == 'search':
			
 
				+        return _parse_search(resp)
			
 
				 
			
 
				     datastr = ""
			
 
				     for line in resp.text.split("\n"):
			
@@ -57,10 +149,81 @@ def response(resp):
 
				             break
			
 
				 
			
 
				     json_data = chompjs.parse_js_object(datastr)
			
 
				-
			
 
				     json_resp = json_data[1]['data']['body']['response']
			
 
				-    if categories[0] == 'news':
			
 
				+
			
 
				+    if brave_category == 'news':
			
 
				         json_resp = json_resp['news']
			
 
				+        return _parse_news(json_resp)
			
 
				+
			
 
				+    if brave_category == 'images':
			
 
				+        return _parse_images(json_resp)
			
 
				+    if brave_category == 'videos':
			
 
				+        return _parse_videos(json_resp)
			
 
				+
			
 
				+    return []
			
 
				+
			
 
				+
			
 
				+def _parse_search(resp):
			
 
				+
			
 
				+    result_list = []
			
 
				+    dom = html.fromstring(resp.text)
			
 
				+
			
 
				+    answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
			
 
				+    if answer_tag:
			
 
				+        result_list.append({'answer': extract_text(answer_tag)})
			
 
				+
			
 
				+    # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
			
 
				+    xpath_results = '//div[contains(@class, "snippet")]'
			
 
				+
			
 
				+    for result in eval_xpath_list(dom, xpath_results):
			
 
				+
			
 
				+        url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None)
			
 
				+        title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None)
			
 
				+        if not (url and title_tag):
			
 
				+            continue
			
 
				+
			
 
				+        content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='')
			
 
				+        img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='')
			
 
				+
			
 
				+        item = {
			
 
				+            'url': url,
			
 
				+            'title': extract_text(title_tag),
			
 
				+            'content': extract_text(content_tag),
			
 
				+            'img_src': img_src,
			
 
				+        }
			
 
				+
			
 
				+        video_tag = eval_xpath_getindex(
			
 
				+            result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
			
 
				+        )
			
 
				+        if video_tag:
			
 
				+
			
 
				+            # In my tests a video tag in the WEB search was mostoften not a
			
 
				+            # video, except the ones from youtube ..
			
 
				+
			
 
				+            iframe_src = _get_iframe_src(url)
			
 
				+            if iframe_src:
			
 
				+                item['iframe_src'] = iframe_src
			
 
				+                item['template'] = 'videos.html'
			
 
				+                item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
			
 
				+            else:
			
 
				+                item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
			
 
				+
			
 
				+        result_list.append(item)
			
 
				+
			
 
				+    return result_list
			
 
				+
			
 
				+
			
 
				+def _get_iframe_src(url):
			
 
				+    parsed_url = urlparse(url)
			
 
				+    if parsed_url.path == '/watch' and parsed_url.query:
			
 
				+        video_id = parse_qs(parsed_url.query).get('v', [])  # type: ignore
			
 
				+        if video_id:
			
 
				+            return 'https://www.youtube-nocookie.com/embed/' + video_id[0]  # type: ignore
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+def _parse_news(json_resp):
			
 
				+    result_list = []
			
 
				 
			
 
				     for result in json_resp["results"]:
			
 
				         item = {
			
@@ -68,18 +231,53 @@ def response(resp):
 
				             'title': result['title'],
			
 
				             'content': result['description'],
			
 
				         }
			
 
				+        if result['thumbnail'] != "null":
			
 
				+            item['img_src'] = result['thumbnail']['src']
			
 
				+        result_list.append(item)
			
 
				+
			
 
				+    return result_list
			
 
				+
			
 
				+
			
 
				+def _parse_images(json_resp):
			
 
				+    result_list = []
			
 
				+
			
 
				+    for result in json_resp["results"]:
			
 
				+        item = {
			
 
				+            'url': result['url'],
			
 
				+            'title': result['title'],
			
 
				+            'content': result['description'],
			
 
				+            'template': 'images.html',
			
 
				+            'img_format': result['properties']['format'],
			
 
				+            'source': result['source'],
			
 
				+            'img_src': result['properties']['url'],
			
 
				+        }
			
 
				+        result_list.append(item)
			
 
				+
			
 
				+    return result_list
			
 
				+
			
 
				+
			
 
				+def _parse_videos(json_resp):
			
 
				+    result_list = []
			
 
				+
			
 
				+    for result in json_resp["results"]:
			
 
				+
			
 
				+        url = result['url']
			
 
				+        item = {
			
 
				+            'url': url,
			
 
				+            'title': result['title'],
			
 
				+            'content': result['description'],
			
 
				+            'template': 'videos.html',
			
 
				+            'length': result['video']['duration'],
			
 
				+            'duration': result['video']['duration'],
			
 
				+        }
			
 
				+
			
 
				         if result['thumbnail'] != "null":
			
 
				             item['thumbnail'] = result['thumbnail']['src']
			
 
				 
			
 
				-        if categories[0] == 'images':
			
 
				-            item['template'] = 'images.html'
			
 
				-            item['img_format'] = result['properties']['format']
			
 
				-            item['source'] = result['source']
			
 
				-            item['img_src'] = result['properties']['url']
			
 
				-        elif categories[0] == 'videos':
			
 
				-            item['template'] = 'videos.html'
			
 
				-            item['length'] = result['video']['duration']
			
 
				+        iframe_src = _get_iframe_src(url)
			
 
				+        if iframe_src:
			
 
				+            item['iframe_src'] = iframe_src
			
 
				 
			
 
				-        results.append(item)
			
 
				+        result_list.append(item)
			
 
				 
			
 
				-    return results
			
 
				+    return result_list
			
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1816,50 +1816,34 @@ engines:
 
				     timeout: 9.0
			
 
				 
			
 
				   - name: brave
			
 
				-    shortcut: brave
			
 
				-    engine: xpath
			
 
				-    paging: true
			
 
				+    engine: brave
			
 
				+    shortcut: br
			
 
				     time_range_support: true
			
 
				-    first_page_num: 0
			
 
				-    time_range_url: "&tf={time_range_val}"
			
 
				-    search_url: https://search.brave.com/search?q={query}&offset={pageno}&spellcheck=1{time_range}
			
 
				-    url_xpath: //a[@class="result-header"]/@href
			
 
				-    title_xpath: //span[@class="snippet-title"]
			
 
				-    content_xpath: //p[1][@class="snippet-description"]
			
 
				-    suggestion_xpath: //div[@class="text-gray h6"]/a
			
 
				-    time_range_map:
			
 
				-      day: 'pd'
			
 
				-      week: 'pw'
			
 
				-      month: 'pm'
			
 
				-      year: 'py'
			
 
				+    paging: true
			
 
				     categories: [general, web]
			
 
				-    disabled: true
			
 
				-    headers:
			
 
				-      Accept-Encoding: gzip, deflate
			
 
				-    about:
			
 
				-      website: https://brave.com/search/
			
 
				-      wikidata_id: Q107355971
			
 
				-      use_official_api: false
			
 
				-      require_api_key: false
			
 
				-      results: HTML
			
 
				+    brave_category: search
			
 
				+    # brave_spellcheck: true
			
 
				 
			
 
				   - name: brave.images
			
 
				-    shortcut: braveimg
			
 
				     engine: brave
			
 
				-    categories: images
			
 
				-    disabled: true
			
 
				+    network: brave
			
 
				+    shortcut: brimg
			
 
				+    categories: [images, web]
			
 
				+    brave_category: images
			
 
				 
			
 
				   - name: brave.videos
			
 
				-    shortcut: bravevid
			
 
				     engine: brave
			
 
				-    categories: videos
			
 
				-    disabled: true
			
 
				+    network: brave
			
 
				+    shortcut: brvid
			
 
				+    categories: [videos, web]
			
 
				+    brave_category: videos
			
 
				 
			
 
				   - name: brave.news
			
 
				-    shortcut: bravenews
			
 
				     engine: brave
			
 
				+    network: brave
			
 
				+    shortcut: brnews
			
 
				     categories: news
			
 
				-    disabled: true
			
 
				+    brave_category: news
			
 
				 
			
 
				   - name: petalsearch
			
 
				     shortcut: pts