8 months ago · 0f2fc5879d
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -74,24 +74,25 @@ Startpage's category (for Web-search, News, Videos, ..) is set by
 
				 
			
 
				 .. hint::
			
 
				 
			
 
				-   The default category is ``web`` .. and other categories than ``web`` are not
			
 
				-   yet implemented.
			
 
				+  Supported categories are ``web``, ``news`` and ``images``.
			
 
				 
			
 
				 """
			
 
				 # pylint: disable=too-many-statements
			
 
				+from __future__ import annotations
			
 
				 
			
 
				-from typing import TYPE_CHECKING
			
 
				+from typing import TYPE_CHECKING, Any
			
 
				 from collections import OrderedDict
			
 
				 import re
			
 
				 from unicodedata import normalize, combining
			
 
				 from time import time
			
 
				 from datetime import datetime, timedelta
			
 
				+from json import loads
			
 
				 
			
 
				 import dateutil.parser
			
 
				 import lxml.html
			
 
				 import babel.localedata
			
 
				 
			
 
				-from searx.utils import extract_text, eval_xpath, gen_useragent
			
 
				+from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
			
 
				 from searx.network import get  # see https://github.com/searxng/searxng/issues/762
			
 
				 from searx.exceptions import SearxEngineCaptchaException
			
 
				 from searx.locales import region_tag
			
@@ -250,22 +251,13 @@ def request(query, params):
 
				     Additionally the arguments from Startpage's search form need to be set in
			
 
				     HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
			
 
				     """
			
 
				-    if startpage_categ == 'web':
			
 
				-        return _request_cat_web(query, params)
			
 
				-
			
 
				-    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
			
 
				-    return params
			
 
				-
			
 
				-
			
 
				-def _request_cat_web(query, params):
			
 
				-
			
 
				     engine_region = traits.get_region(params['searxng_locale'], 'en-US')
			
 
				     engine_language = traits.get_language(params['searxng_locale'], 'en')
			
 
				 
			
 
				     # build arguments
			
 
				     args = {
			
 
				         'query': query,
			
 
				-        'cat': 'web',
			
 
				+        'cat': startpage_categ,
			
 
				         't': 'device',
			
 
				         'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers,
			
 
				         'with_date': time_range_dict.get(params['time_range'], ''),
			
@@ -317,73 +309,118 @@ def _request_cat_web(query, params):
 
				     return params
			
 
				 
			
 
				 
			
 
				-# get response from search-request
			
 
				-def response(resp):
			
 
				-    dom = lxml.html.fromstring(resp.text)
			
 
				+def _parse_published_date(content: str) -> tuple[str, datetime | None]:
			
 
				+    published_date = None
			
 
				 
			
 
				-    if startpage_categ == 'web':
			
 
				-        return _response_cat_web(dom)
			
 
				+    # check if search result starts with something like: "2 Sep 2014 ... "
			
 
				+    if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
			
 
				+        date_pos = content.find('...') + 4
			
 
				+        date_string = content[0 : date_pos - 5]
			
 
				+        # fix content string
			
 
				+        content = content[date_pos:]
			
 
				 
			
 
				-    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
			
 
				-    return []
			
 
				+        try:
			
 
				+            published_date = dateutil.parser.parse(date_string, dayfirst=True)
			
 
				+        except ValueError:
			
 
				+            pass
			
 
				 
			
 
				+    # check if search result starts with something like: "5 days ago ... "
			
 
				+    elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
			
 
				+        date_pos = content.find('...') + 4
			
 
				+        date_string = content[0 : date_pos - 5]
			
 
				 
			
 
				-def _response_cat_web(dom):
			
 
				-    results = []
			
 
				+        # calculate datetime
			
 
				+        published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))  # type: ignore
			
 
				 
			
 
				-    # parse results
			
 
				-    for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'):
			
 
				-        links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]')
			
 
				-        if not links:
			
 
				-            continue
			
 
				-        link = links[0]
			
 
				-        url = link.attrib.get('href')
			
 
				+        # fix content string
			
 
				+        content = content[date_pos:]
			
 
				 
			
 
				-        # block google-ad url's
			
 
				-        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
			
 
				-            continue
			
 
				+    return content, published_date
			
 
				 
			
 
				-        # block startpage search url's
			
 
				-        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
			
 
				-            continue
			
 
				 
			
 
				-        title = extract_text(eval_xpath(link, 'h2'))
			
 
				-        content = eval_xpath(result, './/p[contains(@class, "description")]')
			
 
				-        content = extract_text(content, allow_none=True) or ''
			
 
				+def _get_web_result(result):
			
 
				+    content = html_to_text(result.get('description'))
			
 
				+    content, publishedDate = _parse_published_date(content)
			
 
				+
			
 
				+    return {
			
 
				+        'url': result['clickUrl'],
			
 
				+        'title': html_to_text(result['title']),
			
 
				+        'content': content,
			
 
				+        'publishedDate': publishedDate,
			
 
				+    }
			
 
				 
			
 
				-        published_date = None
			
 
				 
			
 
				-        # check if search result starts with something like: "2 Sep 2014 ... "
			
 
				-        if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
			
 
				-            date_pos = content.find('...') + 4
			
 
				-            date_string = content[0 : date_pos - 5]
			
 
				-            # fix content string
			
 
				-            content = content[date_pos:]
			
 
				+def _get_news_result(result):
			
 
				 
			
 
				-            try:
			
 
				-                published_date = dateutil.parser.parse(date_string, dayfirst=True)
			
 
				-            except ValueError:
			
 
				-                pass
			
 
				+    title = remove_pua_from_str(html_to_text(result['title']))
			
 
				+    content = remove_pua_from_str(html_to_text(result.get('description')))
			
 
				+
			
 
				+    publishedDate = None
			
 
				+    if result.get('date'):
			
 
				+        publishedDate = datetime.fromtimestamp(result['date'] / 1000)
			
 
				 
			
 
				-        # check if search result starts with something like: "5 days ago ... "
			
 
				-        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
			
 
				-            date_pos = content.find('...') + 4
			
 
				-            date_string = content[0 : date_pos - 5]
			
 
				+    thumbnailUrl = None
			
 
				+    if result.get('thumbnailUrl'):
			
 
				+        thumbnailUrl = base_url + result['thumbnailUrl']
			
 
				 
			
 
				-            # calculate datetime
			
 
				-            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))  # type: ignore
			
 
				+    return {
			
 
				+        'url': result['clickUrl'],
			
 
				+        'title': title,
			
 
				+        'content': content,
			
 
				+        'publishedDate': publishedDate,
			
 
				+        'thumbnail': thumbnailUrl,
			
 
				+    }
			
 
				 
			
 
				-            # fix content string
			
 
				-            content = content[date_pos:]
			
 
				 
			
 
				-        if published_date:
			
 
				-            # append result
			
 
				-            results.append({'url': url, 'title': title, 'content': content, 'publishedDate': published_date})
			
 
				-        else:
			
 
				-            # append result
			
 
				-            results.append({'url': url, 'title': title, 'content': content})
			
 
				+def _get_image_result(result) -> dict[str, Any] | None:
			
 
				+    url = result.get('altClickUrl')
			
 
				+    if not url:
			
 
				+        return None
			
 
				+
			
 
				+    thumbnailUrl = None
			
 
				+    if result.get('thumbnailUrl'):
			
 
				+        thumbnailUrl = base_url + result['thumbnailUrl']
			
 
				+
			
 
				+    resolution = None
			
 
				+    if result.get('width') and result.get('height'):
			
 
				+        resolution = f"{result['width']}x{result['height']}"
			
 
				+
			
 
				+    filesize = None
			
 
				+    if result.get('filesize'):
			
 
				+        size_str = ''.join(filter(str.isdigit, result['filesize']))
			
 
				+        filesize = humanize_bytes(int(size_str))
			
 
				+
			
 
				+    return {
			
 
				+        'template': 'images.html',
			
 
				+        'url': url,
			
 
				+        'title': html_to_text(result['title']),
			
 
				+        'content': '',
			
 
				+        'img_src': result.get('rawImageUrl'),
			
 
				+        'thumbnail_src': thumbnailUrl,
			
 
				+        'resolution': resolution,
			
 
				+        'img_format': result.get('format'),
			
 
				+        'filesize': filesize,
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+def response(resp):
			
 
				+    categ = startpage_categ.capitalize()
			
 
				+    results_raw = '{' + extr(resp.text, f"React.createElement(UIStartpage.AppSerp{categ}, {{", '}})') + '}}'
			
 
				+    results_json = loads(results_raw)
			
 
				+    results_obj = results_json.get('render', {}).get('presenter', {}).get('regions', {})
			
 
				+
			
 
				+    results = []
			
 
				+    for results_categ in results_obj.get('mainline', []):
			
 
				+        for item in results_categ.get('results', []):
			
 
				+            if results_categ['display_type'] == 'web-google':
			
 
				+                results.append(_get_web_result(item))
			
 
				+            elif results_categ['display_type'] == 'news-bing':
			
 
				+                results.append(_get_news_result(item))
			
 
				+            elif 'images' in results_categ['display_type']:
			
 
				+                item = _get_image_result(item)
			
 
				+                if item:
			
 
				+                    results.append(item)
			
 
				 
			
 
				-    # return results
			
 
				     return results
			
 
				 
			
 
				 
			
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1792,6 +1792,20 @@ engines:
 
				     additional_tests:
			
 
				       rosebud: *test_rosebud
			
 
				 
			
 
				+  - name: startpage news
			
 
				+    engine: startpage
			
 
				+    startpage_categ: news
			
 
				+    shortcut: spn
			
 
				+    timeout: 6.0
			
 
				+    disabled: true
			
 
				+
			
 
				+  - name: startpage images
			
 
				+    engine: startpage
			
 
				+    startpage_categ: images
			
 
				+    shortcut: spi
			
 
				+    timeout: 6.0
			
 
				+    disabled: true
			
 
				+
			
 
				   - name: tokyotoshokan
			
 
				     engine: tokyotoshokan
			
 
				     shortcut: tt
			
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -470,6 +470,21 @@ def ecma_unescape(string: str) -> str:
 
				     return string
			
 
				 
			
 
				 
			
 
				+def remove_pua_from_str(string):
			
 
				+    """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.
			
 
				+
			
 
				+    _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
			
 
				+    """
			
 
				+    pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
			
 
				+    s = []
			
 
				+    for c in string:
			
 
				+        i = ord(c)
			
 
				+        if any(a <= i <= b for (a, b) in pua_ranges):
			
 
				+            continue
			
 
				+        s.append(c)
			
 
				+    return "".join(s)
			
 
				+
			
 
				+
			
 
				 def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
			
 
				     rep = {re.escape(k): v for k, v in replaces.items()}
			
 
				     pattern = re.compile("|".join(rep.keys()))