5 years ago · 6deb85072a
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -77,6 +77,10 @@ def highlight_content(content, query):
 
				     return content
			
 
				 
			
 
				 
			
 
				+class HTMLTextExtractorException(Exception):  # raised by HTMLTextExtractor when a tag does not match the last entry of its tag stack (e.g. <b>...</i>)
			
 
				+    pass  # no extra state or behaviour: the dedicated type lets callers catch this error specifically
			
 
				+
			
 
				+
			
 
				 class HTMLTextExtractor(HTMLParser):
			
 
				 
			
 
				     def __init__(self):
			
@@ -92,7 +96,7 @@ class HTMLTextExtractor(HTMLParser):
 
				             return
			
 
				 
			
 
				         if tag != self.tags[-1]:
			
 
				-            raise Exception("invalid html")
			
 
				+            raise HTMLTextExtractorException()
			
 
				 
			
 
				         self.tags.pop()
			
 
				 
			
@@ -128,7 +132,10 @@ def html_to_text(html):
 
				     html = html.replace('\n', ' ')
			
 
				     html = ' '.join(html.split())
			
 
				     s = HTMLTextExtractor()
			
 
				-    s.feed(html)
			
 
				+    try:
			
 
				+        s.feed(html)
			
 
				+    except HTMLTextExtractorException:
			
 
				+        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
			
 
				     return s.get_text()
			
 
				 
			
 
				 
			
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -52,6 +52,10 @@ class TestUtils(SearxTestCase):
 
				         self.assertIsNotNone(utils.html_to_text(html))
			
 
				         self.assertEqual(utils.html_to_text(html), "Test text")
			
 
				 
			
 
				+    def test_html_to_text_invalid(self):  # html_to_text must return text instead of raising on mismatched tags
			
 
				+        html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'  # </i> does not match the open <b>
			
 
				+        self.assertEqual(utils.html_to_text(html), "Lorem ipsum")  # only the text seen before the mismatched tag is expected
			
 
				+
			
 
				     def test_prettify_url(self):
			
 
				         data = (('https://searx.me/', 'https://searx.me/'),
			
 
				                 ('https://searx.me/ű', 'https://searx.me/ű'),
			
@@ -116,6 +120,11 @@ class TestHTMLTextExtractor(SearxTestCase):
 
				         self.html_text_extractor.handle_entityref(entity)
			
 
				         self.assertIn(entity, self.html_text_extractor.result)
			
 
				 
			
 
				+    def test_invalid_html(self):  # feeding the extractor directly must surface the dedicated exception
			
 
				+        text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'  # </i> does not match the open <b>
			
 
				+        with self.assertRaises(utils.HTMLTextExtractorException):  # the specific exception type is expected here
			
 
				+            self.html_text_extractor.feed(text)  # feed() is the call that is expected to raise
			
 
				+
			
 
				 
			
 
				 class TestUnicodeWriter(SearxTestCase):