|
@@ -1,6 +1,20 @@
|
|
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
|
# lint: pylint
|
|
|
"""Presearch (general, images, videos, news)
|
|
|
+
|
|
|
+.. hint::
|
|
|
+
|
|
|
+ The results in the video category are most often links to pages that contain
|
|
|
+ a video, for instance many links from presearch's video category link
|
|
|
+ content from Facebook (aka Meta) or Twitter (aka X). Since these are not
|
|
|
+ real links to video streams, SearXNG can't use the video template for this, and
|
|
|
+ if SearXNG can't use this template, then the user doesn't want to see these
|
|
|
+ hits in the videos category.
|
|
|
+
|
|
|
+ TL;DR: by default, presearch's video category is placed into categories::
|
|
|
+
|
|
|
+ categories: [general, web]
|
|
|
+
|
|
|
"""
|
|
|
|
|
|
from urllib.parse import urlencode
|
|
@@ -19,12 +33,18 @@ paging = True
|
|
|
time_range_support = True
|
|
|
categories = ["general", "web"] # general, images, videos, news
|
|
|
|
|
|
-search_type = "search" # must be any of "search", "images", "videos", "news"
|
|
|
+search_type = "search"
|
|
|
+"""must be any of ``search``, ``images``, ``videos``, ``news``"""
|
|
|
|
|
|
base_url = "https://presearch.com"
|
|
|
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
|
|
|
|
|
|
|
|
|
+def init(_):
|
|
|
+ if search_type not in ['search', 'images', 'videos', 'news']:
|
|
|
+ raise ValueError(f'presearch search_type: {search_type}')
|
|
|
+
|
|
|
+
|
|
|
def _get_request_id(query, page, time_range, safesearch):
|
|
|
args = {
|
|
|
"q": query,
|
|
@@ -38,7 +58,7 @@ def _get_request_id(query, page, time_range, safesearch):
|
|
|
'User-Agent': gen_useragent(),
|
|
|
'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}",
|
|
|
}
|
|
|
- resp_text = get(url, headers=headers).text
|
|
|
+ resp_text = get(url, headers=headers).text # type: ignore
|
|
|
|
|
|
for line in resp_text.split("\n"):
|
|
|
if "window.searchId = " in line:
|
|
@@ -47,11 +67,6 @@ def _get_request_id(query, page, time_range, safesearch):
|
|
|
return None
|
|
|
|
|
|
|
|
|
-def _is_valid_img_src(url):
|
|
|
- # in some cases, the image url is a base64 encoded string, which has to be skipped
|
|
|
- return "https://" in url
|
|
|
-
|
|
|
-
|
|
|
def request(query, params):
|
|
|
request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
|
|
|
|
|
@@ -61,42 +76,105 @@ def request(query, params):
|
|
|
return params
|
|
|
|
|
|
|
|
|
-def response(resp):
|
|
|
- results = []
|
|
|
+def _strip_leading_strings(text):
|
|
|
+ for x in ['wikipedia', 'google']:
|
|
|
+ if text.lower().endswith(x):
|
|
|
+ text = text[: -len(x)]
|
|
|
+ return text.strip()
|
|
|
|
|
|
- json = resp.json()
|
|
|
|
|
|
- json_results = []
|
|
|
- if search_type == "search":
|
|
|
- json_results = json['results'].get('standardResults', [])
|
|
|
- else:
|
|
|
- json_results = json.get(search_type, [])
|
|
|
+def parse_search_query(json_results):
|
|
|
+ results = []
|
|
|
|
|
|
- for json_result in json_results:
|
|
|
+ for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
|
|
|
result = {
|
|
|
- 'url': json_result['link'],
|
|
|
- 'title': json_result['title'],
|
|
|
- 'content': html_to_text(json_result.get('description', '')),
|
|
|
+ 'url': item['link'],
|
|
|
+ 'title': item['title'],
|
|
|
+ 'img_src': item['image'],
|
|
|
+ 'content': '',
|
|
|
+ 'metadata': item.get('source'),
|
|
|
}
|
|
|
- if search_type == "images":
|
|
|
- result['template'] = 'images.html'
|
|
|
-
|
|
|
- if not _is_valid_img_src(json_result['image']):
|
|
|
- continue
|
|
|
-
|
|
|
- result['img_src'] = json_result['image']
|
|
|
- if _is_valid_img_src(json_result['thumbnail']):
|
|
|
- result['thumbnail'] = json_result['thumbnail']
|
|
|
+ results.append(result)
|
|
|
|
|
|
- elif search_type == "videos":
|
|
|
- result['template'] = 'videos.html'
|
|
|
+ for item in json_results.get('standardResults', []):
|
|
|
+ result = {
|
|
|
+ 'url': item['link'],
|
|
|
+ 'title': item['title'],
|
|
|
+ 'content': html_to_text(item['description']),
|
|
|
+ }
|
|
|
+ results.append(result)
|
|
|
|
|
|
- if _is_valid_img_src(json_result['image']):
|
|
|
- result['thumbnail'] = json_result['image']
|
|
|
+ info = json_results.get('infoSection', {}).get('data')
|
|
|
+ if info:
|
|
|
+ attributes = []
|
|
|
+ for item in info.get('about', []):
|
|
|
+ label, value = html_to_text(item).split(':', 1)
|
|
|
+ value = _strip_leading_strings(value)
|
|
|
+ attributes.append({'label': label, 'value': value})
|
|
|
+ content = []
|
|
|
+ for item in [info['subtitle'], info['description']]:
|
|
|
+ item = _strip_leading_strings(html_to_text(item))
|
|
|
+ if item:
|
|
|
+ content.append(item)
|
|
|
+
|
|
|
+ results.append(
|
|
|
+ {
|
|
|
+ 'infobox': info['title'],
|
|
|
+ 'id': info['title'],
|
|
|
+ 'img_src': info.get('image'),
|
|
|
+ 'content': ' | '.join(content),
|
|
|
+ 'attributes': attributes,
|
|
|
+ }
|
|
|
+ )
|
|
|
+ return results
|
|
|
|
|
|
- result['duration'] = json_result['duration']
|
|
|
- result['length'] = json_result['duration']
|
|
|
|
|
|
- results.append(result)
|
|
|
+def response(resp):
|
|
|
+ results = []
|
|
|
+ json_resp = resp.json()
|
|
|
+
|
|
|
+ if search_type == 'search':
|
|
|
+ results = parse_search_query(json_resp['results'])
|
|
|
+
|
|
|
+ elif search_type == 'images':
|
|
|
+ for item in json_resp['images']:
|
|
|
+ results.append(
|
|
|
+ {
|
|
|
+ 'template': 'images.html',
|
|
|
+ 'title': item['title'],
|
|
|
+ 'url': item['link'],
|
|
|
+ 'img_src': item['image'],
|
|
|
+ 'thumbnail_src': item['thumbnail'],
|
|
|
+ }
|
|
|
+ )
|
|
|
+
|
|
|
+ elif search_type == 'videos':
|
|
|
+ # The results in the video category are most often links to pages that contain
|
|
|
+ # a video and not to a video stream --> SearXNG can't use the video template.
|
|
|
+
|
|
|
+ for item in json_resp['videos']:
|
|
|
+ metadata = [x for x in [item.get('description'), item.get('duration')] if x]
|
|
|
+ results.append(
|
|
|
+ {
|
|
|
+ 'title': item['title'],
|
|
|
+ 'url': item['link'],
|
|
|
+ 'content': '',
|
|
|
+ 'metadata': ' / '.join(metadata),
|
|
|
+ 'img_src': item.get('image'),
|
|
|
+ }
|
|
|
+ )
|
|
|
+
|
|
|
+ elif search_type == 'news':
|
|
|
+ for item in json_resp['news']:
|
|
|
+ metadata = [x for x in [item.get('source'), item.get('time')] if x]
|
|
|
+ results.append(
|
|
|
+ {
|
|
|
+ 'title': item['title'],
|
|
|
+ 'url': item['link'],
|
|
|
+ 'content': item['description'],
|
|
|
+ 'metadata': ' / '.join(metadata),
|
|
|
+ 'img_src': item.get('image'),
|
|
|
+ }
|
|
|
+ )
|
|
|
|
|
|
return results
|