2 years ago · ef56e1d684
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -15,6 +15,7 @@ from numbers import Number
 
				 from os.path import splitext, join
			
 
				 from random import choice
			
 
				 from html.parser import HTMLParser
			
 
				+from html import escape
			
 
				 from urllib.parse import urljoin, urlparse
			
 
				 from markdown_it import MarkdownIt
			
 
				 
			
@@ -88,7 +89,7 @@ class _HTMLTextExtractorException(Exception):
 
				     """Internal exception raised when the HTML is invalid"""
			
 
				 
			
 
				 
			
 
				-class _HTMLTextExtractor(HTMLParser):  # pylint: disable=W0223  # (see https://bugs.python.org/issue31844)
			
 
				+class _HTMLTextExtractor(HTMLParser):
			
 
				     """Internal class to extract text from HTML"""
			
 
				 
			
 
				     def __init__(self):
			
@@ -137,6 +138,11 @@ class _HTMLTextExtractor(HTMLParser):  # pylint: disable=W0223  # (see https://b
 
				     def get_text(self):
			
 
				         return ''.join(self.result).strip()
			
 
				 
			
 
				+    def error(self, message):
			
 
				+        # an error() handler is needed for Python < 3.10
			
 
				+        # https://github.com/python/cpython/pull/8562/files
			
 
				+        raise AssertionError(message)
			
 
				+
			
 
				 
			
 
				 def html_to_text(html_str: str) -> str:
			
 
				     """Extract text from a HTML string
			
@@ -153,12 +159,18 @@ def html_to_text(html_str: str) -> str:
 
				 
			
 
				         >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
			
 
				         'Example'
			
 
				+
			
 
				+        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
			
 
				+        'regexp: (?<![a-zA-Z]'
			
 
				     """
			
 
				     html_str = html_str.replace('\n', ' ').replace('\r', ' ')
			
 
				     html_str = ' '.join(html_str.split())
			
 
				     s = _HTMLTextExtractor()
			
 
				     try:
			
 
				         s.feed(html_str)
			
 
				+    except AssertionError:
			
 
				+        s = _HTMLTextExtractor()
			
 
				+        s.feed(escape(html_str, quote=True))
			
 
				     except _HTMLTextExtractorException:
			
 
				         logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
			
 
				     return s.get_text()
			
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -41,6 +41,7 @@ class TestUtils(SearxTestCase):
 
				         self.assertIsInstance(utils.html_to_text(html_str), str)
			
 
				         self.assertIsNotNone(utils.html_to_text(html_str))
			
 
				         self.assertEqual(utils.html_to_text(html_str), "Test text")
			
 
				+        self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
			
 
				 
			
 
				     def test_extract_text(self):
			
 
				         html_str = """