
[feat] startpage: support for news and images

Bnyro committed 2 months ago · commit 0f2fc5879d

3 changed files with 131 additions and 65 deletions:

  1. searx/engines/startpage.py  +102 -65
  2. searx/settings.yml          +14 -0
  3. searx/utils.py              +15 -0

+ 102 - 65
searx/engines/startpage.py

@@ -74,24 +74,25 @@ Startpage's category (for Web-search, News, Videos, ..) is set by

 .. hint::

-   The default category is ``web`` .. and other categories than ``web`` are not
-   yet implemented.
+  Supported categories are ``web``, ``news`` and ``images``.

 """
 # pylint: disable=too-many-statements
+from __future__ import annotations

-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 from collections import OrderedDict
 import re
 from unicodedata import normalize, combining
 from time import time
 from datetime import datetime, timedelta
+from json import loads

 import dateutil.parser
 import lxml.html
 import babel.localedata

-from searx.utils import extract_text, eval_xpath, gen_useragent
+from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
 from searx.network import get  # see https://github.com/searxng/searxng/issues/762
 from searx.exceptions import SearxEngineCaptchaException
 from searx.locales import region_tag
@@ -250,22 +251,13 @@ def request(query, params):
     Additionally the arguments form Startpage's search form needs to be set in
     HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
     """
-    if startpage_categ == 'web':
-        return _request_cat_web(query, params)
-
-    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
-    return params
-
-
-def _request_cat_web(query, params):
-
     engine_region = traits.get_region(params['searxng_locale'], 'en-US')
     engine_language = traits.get_language(params['searxng_locale'], 'en')

     # build arguments
     args = {
         'query': query,
-        'cat': 'web',
+        'cat': startpage_categ,
         't': 'device',
         'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers,
         'with_date': time_range_dict.get(params['time_range'], ''),
@@ -317,73 +309,118 @@ def _request_cat_web(query, params):
     return params


-# get response from search-request
-def response(resp):
-    dom = lxml.html.fromstring(resp.text)
+def _parse_published_date(content: str) -> tuple[str, datetime | None]:
+    published_date = None

-    if startpage_categ == 'web':
-        return _response_cat_web(dom)
+    # check if search result starts with something like: "2 Sep 2014 ... "
+    if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+        date_pos = content.find('...') + 4
+        date_string = content[0 : date_pos - 5]
+        # fix content string
+        content = content[date_pos:]

-    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
-    return []
+        try:
+            published_date = dateutil.parser.parse(date_string, dayfirst=True)
+        except ValueError:
+            pass

+    # check if search result starts with something like: "5 days ago ... "
+    elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
+        date_pos = content.find('...') + 4
+        date_string = content[0 : date_pos - 5]

-def _response_cat_web(dom):
-    results = []
+        # calculate datetime
+        published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))  # type: ignore

-    # parse results
-    for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'):
-        links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]')
-        if not links:
-            continue
-        link = links[0]
-        url = link.attrib.get('href')
+        # fix content string
+        content = content[date_pos:]

-        # block google-ad url's
-        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
-            continue
+    return content, published_date

-        # block startpage search url's
-        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
-            continue

-        title = extract_text(eval_xpath(link, 'h2'))
-        content = eval_xpath(result, './/p[contains(@class, "description")]')
-        content = extract_text(content, allow_none=True) or ''
+def _get_web_result(result):
+    content = html_to_text(result.get('description'))
+    content, publishedDate = _parse_published_date(content)
+
+    return {
+        'url': result['clickUrl'],
+        'title': html_to_text(result['title']),
+        'content': content,
+        'publishedDate': publishedDate,
+    }

-        published_date = None

-        # check if search result starts with something like: "2 Sep 2014 ... "
-        if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
-            date_pos = content.find('...') + 4
-            date_string = content[0 : date_pos - 5]
-            # fix content string
-            content = content[date_pos:]
+def _get_news_result(result):

-            try:
-                published_date = dateutil.parser.parse(date_string, dayfirst=True)
-            except ValueError:
-                pass
+    title = remove_pua_from_str(html_to_text(result['title']))
+    content = remove_pua_from_str(html_to_text(result.get('description')))
+
+    publishedDate = None
+    if result.get('date'):
+        publishedDate = datetime.fromtimestamp(result['date'] / 1000)

-        # check if search result starts with something like: "5 days ago ... "
-        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
-            date_pos = content.find('...') + 4
-            date_string = content[0 : date_pos - 5]
+    thumbnailUrl = None
+    if result.get('thumbnailUrl'):
+        thumbnailUrl = base_url + result['thumbnailUrl']

-            # calculate datetime
-            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))  # type: ignore
+    return {
+        'url': result['clickUrl'],
+        'title': title,
+        'content': content,
+        'publishedDate': publishedDate,
+        'thumbnail': thumbnailUrl,
+    }

-            # fix content string
-            content = content[date_pos:]

-        if published_date:
-            # append result
-            results.append({'url': url, 'title': title, 'content': content, 'publishedDate': published_date})
-        else:
-            # append result
-            results.append({'url': url, 'title': title, 'content': content})
+def _get_image_result(result) -> dict[str, Any] | None:
+    url = result.get('altClickUrl')
+    if not url:
+        return None
+
+    thumbnailUrl = None
+    if result.get('thumbnailUrl'):
+        thumbnailUrl = base_url + result['thumbnailUrl']
+
+    resolution = None
+    if result.get('width') and result.get('height'):
+        resolution = f"{result['width']}x{result['height']}"
+
+    filesize = None
+    if result.get('filesize'):
+        size_str = ''.join(filter(str.isdigit, result['filesize']))
+        filesize = humanize_bytes(int(size_str))
+
+    return {
+        'template': 'images.html',
+        'url': url,
+        'title': html_to_text(result['title']),
+        'content': '',
+        'img_src': result.get('rawImageUrl'),
+        'thumbnail_src': thumbnailUrl,
+        'resolution': resolution,
+        'img_format': result.get('format'),
+        'filesize': filesize,
+    }
+
+
+def response(resp):
+    categ = startpage_categ.capitalize()
+    results_raw = '{' + extr(resp.text, f"React.createElement(UIStartpage.AppSerp{categ}, {{", '}})') + '}}'
+    results_json = loads(results_raw)
+    results_obj = results_json.get('render', {}).get('presenter', {}).get('regions', {})
+
+    results = []
+    for results_categ in results_obj.get('mainline', []):
+        for item in results_categ.get('results', []):
+            if results_categ['display_type'] == 'web-google':
+                results.append(_get_web_result(item))
+            elif results_categ['display_type'] == 'news-bing':
+                results.append(_get_news_result(item))
+            elif 'images' in results_categ['display_type']:
+                item = _get_image_result(item)
+                if item:
+                    results.append(item)

-    # return results
     return results



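A note on the parsing approach (not part of the commit): Startpage now ships its
results as a JS object literal handed to React.createElement(UIStartpage.AppSerp<Categ>, {...}),
and the new response() slices that literal out of the HTML and feeds it to
json.loads(). The sketch below replays that step on an invented, minimal payload;
the inlined extr() is a simplified stand-in for searx.utils.extr(), which the
commit imports.

    from json import loads

    def extr(txt: str, begin: str, end: str) -> str:
        # stand-in for searx.utils.extr(): the text between `begin` and `end`
        start = txt.index(begin) + len(begin)
        return txt[start : txt.index(end, start)]

    # invented stand-in for a real Startpage result page
    html = (
        'React.createElement(UIStartpage.AppSerpWeb, {"render": {"presenter": '
        '{"regions": {"mainline": [{"display_type": "web-google", "results": '
        '[{"clickUrl": "https://example.org", "title": "Example", '
        '"description": "2 Sep 2014 ... An example result"}]}]}}}})'
    )

    # same recipe as response(): cut the object literal out, re-balance the
    # braces consumed by the begin/end markers, then parse it as JSON
    raw = '{' + extr(html, 'React.createElement(UIStartpage.AppSerpWeb, {', '}})') + '}}'
    regions = loads(raw)['render']['presenter']['regions']
    for block in regions['mainline']:
        print(block['display_type'], len(block['results']))  # -> web-google 1
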
+ 14 - 0
searx/settings.yml

@@ -1792,6 +1792,20 @@ engines:
     additional_tests:
       rosebud: *test_rosebud

+  - name: startpage news
+    engine: startpage
+    startpage_categ: news
+    shortcut: spn
+    timeout: 6.0
+    disabled: true
+
+  - name: startpage images
+    engine: startpage
+    startpage_categ: images
+    shortcut: spi
+    timeout: 6.0
+    disabled: true
+
   - name: tokyotoshokan
     engine: tokyotoshokan
     shortcut: tt

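Both new engines ship disabled, as is usual for newly introduced engines in
settings.yml: an instance admin has to set "disabled: false" (or a user can
enable them in the preferences) before they appear in the news and images
categories. Once enabled they can also be addressed directly via their bang
shortcuts, e.g. "!spn query" for Startpage news and "!spi query" for Startpage
images.
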
+ 15 - 0
searx/utils.py

@@ -470,6 +470,21 @@ def ecma_unescape(string: str) -> str:
     return string


+def remove_pua_from_str(string):
+    """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.
+
+    .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
+    """
+    pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
+    s = []
+    for c in string:
+        i = ord(c)
+        if any(a <= i <= b for (a, b) in pua_ranges):
+            continue
+        s.append(c)
+    return "".join(s)
+
+
 def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
     rep = {re.escape(k): v for k, v in replaces.items()}
     pattern = re.compile("|".join(rep.keys()))
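
For reference, a quick doctest-style sketch of the new helper's behaviour
(U+E000 lies in the Basic Multilingual Plane's Private Use Area, the first of
the three ranges checked above):

    >>> from searx.utils import remove_pua_from_str
    >>> remove_pua_from_str("Breaking\ue000 news")
    'Breaking news'

The startpage engine applies this to news titles and descriptions (see
_get_news_result above), stripping private-use glyphs that Startpage apparently
embeds there and that would otherwise render as replacement boxes in most fonts.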