[fix] hardening against arguments of type None, where str or dict is expected

On a long-running server, the tracebacks below show up occasionally; they
indicate a NoneType being passed where a string or another data type is
expected.

result.img_src::

    File "/usr/local/searxng/searxng-src/searx/templates/simple/result_templates/images.html", line 13, in top-level template code
      <img src="" data-src="{{ image_proxify(result.img_src) }}" alt="{{ result.title|striptags }}">{{- "" -}}
      ^
    File "/usr/local/searxng/searxng-src/searx/webapp.py", line 284, in image_proxify
      if url.startswith('//'):
         ^^^^^^^^^^^^^^
    AttributeError: 'NoneType' object has no attribute 'startswith'

result.content::

    File "/usr/local/searxng/searxng-src/searx/result_types/_base.py", line 105, in _normalize_text_fields
      result.content = WHITESPACE_REGEX.sub(" ", result.content).strip()
                       ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
    TypeError: expected string or bytes-like object, got 'NoneType'

html_to_text, when html_str is a NoneType::

    File "/usr/local/searxng/searxng-src/searx/engines/wikipedia.py", line 190, in response
      title = utils.html_to_text(api_result.get('titles', {}).get('display') or api_result.get('title'))
    File "/usr/local/searxng/searxng-src/searx/utils.py", line 158, in html_to_text
      html_str = html_str.replace('\n', ' ').replace('\r', ' ')
                 ^^^^^^^^^^^^^^^^
    AttributeError: 'NoneType' object has no attribute 'replace'

presearch engine, when json_resp is a NoneType::

    File "/usr/local/searxng/searxng-src/searx/engines/presearch.py", line 221, in response
      results = parse_search_query(json_resp.get('results'))
    File "/usr/local/searxng/searxng-src/searx/engines/presearch.py", line 161, in parse_search_query
      for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
                  ^^^^^^^^^^^^^^^^
    AttributeError: 'NoneType' object has no attribute 'get'
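
All four tracebacks are instances of the same failure mode: a string or dict
method is called on a value that turned out to be None. A minimal, standalone
reproduction (illustrative only, not SearXNG code)::

    import re

    url = None
    try:
        url.startswith('//')        # the result.img_src case
    except AttributeError as exc:
        print(exc)                  # 'NoneType' object has no attribute 'startswith'

    try:
        re.sub(r'\s+', ' ', None)   # the result.content case
    except TypeError as exc:
        print(exc)                  # expected string or bytes-like object ...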

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
commit e6308b8167
4 changed files with 14 additions and 3 deletions
  1. searx/engines/presearch.py   (+3 -1)
  2. searx/result_types/_base.py  (+4 -2)
  3. searx/utils.py               (+2 -0)
  4. searx/webapp.py              (+5 -0)

+ 3 - 1
searx/engines/presearch.py

@@ -184,6 +184,8 @@ def _fix_title(title, url):

 def parse_search_query(json_results):
     results = []
+    if not json_results:
+        return results

     for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
         result = {
@@ -245,7 +247,7 @@ def response(resp):
     json_resp = resp.json()

     if search_type == 'search':
-        results = parse_search_query(json_resp.get('results'))
+        results = parse_search_query(json_resp.get('results', {}))

     elif search_type == 'images':
         for item in json_resp.get('images', []):
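
The guard can be sanity-checked in isolation; this is a standalone sketch that
mirrors the guarded parser rather than importing the real module (the appended
'title' field is illustrative only)::

    def parse_search_query(json_results):
        results = []
        if not json_results:
            return results
        for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
            results.append({'title': item.get('title')})
        return results

    assert parse_search_query(None) == []
    assert parse_search_query({}) == []
    assert parse_search_query({'specialSections': {}}) == []

The call site in response() now also falls back to an empty dict, so
parse_search_query() is no longer handed None in the first place.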

+ 4 - 2
searx/result_types/_base.py

@@ -103,8 +103,10 @@ def _normalize_text_fields(result: MainResult | LegacyResult):
         result.content = str(result)

     # normalize title and content
-    result.title = WHITESPACE_REGEX.sub(" ", result.title).strip()
-    result.content = WHITESPACE_REGEX.sub(" ", result.content).strip()
+    if result.title:
+        result.title = WHITESPACE_REGEX.sub(" ", result.title).strip()
+    if result.content:
+        result.content = WHITESPACE_REGEX.sub(" ", result.content).strip()
     if result.content == result.title:
         # avoid duplicate content between the content and title fields
         result.content = ""

+ 2 - 0
searx/utils.py

@@ -154,6 +154,8 @@ def html_to_text(html_str: str) -> str:
         >>> html_to_text(r'regexp: (?<![a-zA-Z]')
         'regexp: (?<![a-zA-Z]'
     """
+    if not html_str:
+        return ""
     html_str = html_str.replace('\n', ' ').replace('\r', ' ')
     html_str = ' '.join(html_str.split())
     s = _HTMLTextExtractor()
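
For html_to_text() the net effect is that falsy input maps to an empty string;
a compact standalone sketch with a minimal stand-in for _HTMLTextExtractor::

    from html.parser import HTMLParser

    class TextExtractor(HTMLParser):        # minimal stand-in for _HTMLTextExtractor
        def __init__(self):
            super().__init__()
            self.parts = []

        def handle_data(self, data):
            self.parts.append(data)

    def html_to_text(html_str):
        if not html_str:                    # the new early return: None or "" -> ""
            return ''
        html_str = html_str.replace('\n', ' ').replace('\r', ' ')
        html_str = ' '.join(html_str.split())
        s = TextExtractor()
        s.feed(html_str)
        return ''.join(s.parts)

    assert html_to_text(None) == ''
    assert html_to_text('<b>Example</b>  text') == 'Example text'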

+ 5 - 0
searx/webapp.py

@@ -265,6 +265,9 @@ def custom_url_for(endpoint: str, **values):


 def morty_proxify(url: str):
+    if not url:
+        return url
+
     if url.startswith('//'):
         url = 'https:' + url

@@ -280,6 +283,8 @@ def morty_proxify(url: str):


 def image_proxify(url: str):
+    if not url:
+        return url

     if url.startswith('//'):
         url = 'https:' + url
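
With these guards a falsy url is handed straight back instead of raising on
url.startswith(); a standalone sketch of the guarded helper (the real
image_proxify() continues with the proxy logic after this point)::

    def image_proxify(url):
        if not url:
            return url
        if url.startswith('//'):
            url = 'https:' + url
        return url

    assert image_proxify(None) is None
    assert image_proxify('') == ''
    assert image_proxify('//example.org/img.png') == 'https://example.org/img.png'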