1 year ago · 44392bd436
--- a/searx/engines/presearch.py
+++ b/searx/engines/presearch.py
@@ -1,6 +1,20 @@
 
				 # SPDX-License-Identifier: AGPL-3.0-or-later
			
 
				 # lint: pylint
			
 
				 """Presearch (general, images, videos, news)
			
 
				+
			
 
				+.. hint::
			
 
				+
			
 
				+   The results in the video category are most often links to pages that contain
			
 
				+   a video, for instance many links from presearch's video category link
			
 
				+   content from facebook (aka Meta) or Twitter (aka X).  Since these are not
			
 
				+   real links to video streams SearXNG can't use the video template for this and
			
 
				+   if SearXNG can't use this template, then the user doesn't want to see these
			
 
				+   hits in the videos category.
			
 
				+
			
 
				+   TL;DR; by default presearch's video category is placed into categories::
			
 
				+
			
 
				+       categories: [general, web]
			
 
				+
			
 
				 """
			
 
				 
			
 
				 from urllib.parse import urlencode
			
@@ -19,12 +33,18 @@ paging = True
 
				 time_range_support = True
			
 
				 categories = ["general", "web"]  # general, images, videos, news
			
 
				 
			
 
				-search_type = "search"  # must be any of "search", "images", "videos", "news"
			
 
				+search_type = "search"
			
 
				+"""must be any of ``search``, ``images``, ``videos``, ``news``"""
			
 
				 
			
 
				 base_url = "https://presearch.com"
			
 
				 safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
			
 
				 
			
 
				 
			
 
				+def init(_):
			
 
				+    if search_type not in ['search', 'images', 'videos', 'news']:
			
 
				+        raise ValueError(f'presearch search_type: {search_type}')
			
 
				+
			
 
				+
			
 
				 def _get_request_id(query, page, time_range, safesearch):
			
 
				     args = {
			
 
				         "q": query,
			
@@ -38,7 +58,7 @@ def _get_request_id(query, page, time_range, safesearch):
 
				         'User-Agent': gen_useragent(),
			
 
				         'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}",
			
 
				     }
			
 
				-    resp_text = get(url, headers=headers).text
			
 
				+    resp_text = get(url, headers=headers).text  # type: ignore
			
 
				 
			
 
				     for line in resp_text.split("\n"):
			
 
				         if "window.searchId = " in line:
			
@@ -47,11 +67,6 @@ def _get_request_id(query, page, time_range, safesearch):
 
				     return None
			
 
				 
			
 
				 
			
 
				-def _is_valid_img_src(url):
			
 
				-    # in some cases, the image url is a base64 encoded string, which has to be skipped
			
 
				-    return "https://" in url
			
 
				-
			
 
				-
			
 
				 def request(query, params):
			
 
				     request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
			
 
				 
			
@@ -61,42 +76,105 @@ def request(query, params):
 
				     return params
			
 
				 
			
 
				 
			
 
				-def response(resp):
			
 
				-    results = []
			
 
				+def _strip_leading_strings(text):
			
 
				+    for x in ['wikipedia', 'google']:
			
 
				+        if text.lower().endswith(x):
			
 
				+            text = text[: -len(x)]
			
 
				+    return text.strip()
			
 
				 
			
 
				-    json = resp.json()
			
 
				 
			
 
				-    json_results = []
			
 
				-    if search_type == "search":
			
 
				-        json_results = json['results'].get('standardResults', [])
			
 
				-    else:
			
 
				-        json_results = json.get(search_type, [])
			
 
				+def parse_search_query(json_results):
			
 
				+    results = []
			
 
				 
			
 
				-    for json_result in json_results:
			
 
				+    for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
			
 
				         result = {
			
 
				-            'url': json_result['link'],
			
 
				-            'title': json_result['title'],
			
 
				-            'content': html_to_text(json_result.get('description', '')),
			
 
				+            'url': item['link'],
			
 
				+            'title': item['title'],
			
 
				+            'img_src': item['image'],
			
 
				+            'content': '',
			
 
				+            'metadata': item.get('source'),
			
 
				         }
			
 
				-        if search_type == "images":
			
 
				-            result['template'] = 'images.html'
			
 
				-
			
 
				-            if not _is_valid_img_src(json_result['image']):
			
 
				-                continue
			
 
				-
			
 
				-            result['img_src'] = json_result['image']
			
 
				-            if _is_valid_img_src(json_result['thumbnail']):
			
 
				-                result['thumbnail'] = json_result['thumbnail']
			
 
				+        results.append(result)
			
 
				 
			
 
				-        elif search_type == "videos":
			
 
				-            result['template'] = 'videos.html'
			
 
				+    for item in json_results.get('standardResults', []):
			
 
				+        result = {
			
 
				+            'url': item['link'],
			
 
				+            'title': item['title'],
			
 
				+            'content': html_to_text(item['description']),
			
 
				+        }
			
 
				+        results.append(result)
			
 
				 
			
 
				-            if _is_valid_img_src(json_result['image']):
			
 
				-                result['thumbnail'] = json_result['image']
			
 
				+    info = json_results.get('infoSection', {}).get('data')
			
 
				+    if info:
			
 
				+        attributes = []
			
 
				+        for item in info.get('about', []):
			
 
				+            label, value = html_to_text(item).split(':', 1)
			
 
				+            value = _strip_leading_strings(value)
			
 
				+            attributes.append({'label': label, 'value': value})
			
 
				+        content = []
			
 
				+        for item in [info['subtitle'], info['description']]:
			
 
				+            item = _strip_leading_strings(html_to_text(item))
			
 
				+            if item:
			
 
				+                content.append(item)
			
 
				+
			
 
				+        results.append(
			
 
				+            {
			
 
				+                'infobox': info['title'],
			
 
				+                'id': info['title'],
			
 
				+                'img_src': info.get('image'),
			
 
				+                'content': ' | '.join(content),
			
 
				+                'attributes': attributes,
			
 
				+            }
			
 
				+        )
			
 
				+    return results
			
 
				 
			
 
				-            result['duration'] = json_result['duration']
			
 
				-            result['length'] = json_result['duration']
			
 
				 
			
 
				-        results.append(result)
			
 
				+def response(resp):
			
 
				+    results = []
			
 
				+    json_resp = resp.json()
			
 
				+
			
 
				+    if search_type == 'search':
			
 
				+        results = parse_search_query(json_resp['results'])
			
 
				+
			
 
				+    elif search_type == 'images':
			
 
				+        for item in json_resp['images']:
			
 
				+            results.append(
			
 
				+                {
			
 
				+                    'template': 'images.html',
			
 
				+                    'title': item['title'],
			
 
				+                    'url': item['link'],
			
 
				+                    'img_src': item['image'],
			
 
				+                    'thumbnail_src': item['thumbnail'],
			
 
				+                }
			
 
				+            )
			
 
				+
			
 
				+    elif search_type == 'videos':
			
 
				+        # The results in the video category are most often links to pages that contain
			
 
				+        # a video and not to a video stream --> SearXNG can't use the video template.
			
 
				+
			
 
				+        for item in json_resp['videos']:
			
 
				+            metadata = [x for x in [item.get('description'), item.get('duration')] if x]
			
 
				+            results.append(
			
 
				+                {
			
 
				+                    'title': item['title'],
			
 
				+                    'url': item['link'],
			
 
				+                    'content': '',
			
 
				+                    'metadata': ' / '.join(metadata),
			
 
				+                    'img_src': item.get('image'),
			
 
				+                }
			
 
				+            )
			
 
				+
			
 
				+    elif search_type == 'news':
			
 
				+        for item in json_resp['news']:
			
 
				+            metadata = [x for x in [item.get('source'), item.get('time')] if x]
			
 
				+            results.append(
			
 
				+                {
			
 
				+                    'title': item['title'],
			
 
				+                    'url': item['link'],
			
 
				+                    'content': item['description'],
			
 
				+                    'metadata': ' / '.join(metadata),
			
 
				+                    'img_src': item.get('image'),
			
 
				+                }
			
 
				+            )
			
 
				 
			
 
				     return results
			
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1295,6 +1295,7 @@ engines:
 
				     search_type: search
			
 
				     categories: [general, web]
			
 
				     shortcut: ps
			
 
				+    disabled: true
			
 
				 
			
 
				   - name: presearch images
			
 
				     engine: presearch
			
@@ -1307,7 +1308,7 @@ engines:
 
				   - name: presearch videos
			
 
				     engine: presearch
			
 
				     search_type: videos
			
 
				-    categories: [videos, web]
			
 
				+    categories: [general, web]
			
 
				     timeout: 4.0
			
 
				     shortcut: psvid
			
 
				     disabled: true