Browse Source

Merge pull request #2190 from dalf/fix-htmltextextractor

[fix] searx.utils.HTMLTextExtractor: invalid HTML doesn't raise an Exception
Alexandre Flament 4 years ago
parent
commit
530fc4bda7
2 changed files with 18 additions and 2 deletions
  1. 9 2
      searx/utils.py
  2. 9 0
      tests/unit/test_utils.py

+ 9 - 2
searx/utils.py

@@ -77,6 +77,10 @@ def highlight_content(content, query):
     return content
 
 
+class HTMLTextExtractorException(Exception):
+    pass
+
+
 class HTMLTextExtractor(HTMLParser):
 
     def __init__(self):
@@ -92,7 +96,7 @@ class HTMLTextExtractor(HTMLParser):
             return
 
         if tag != self.tags[-1]:
-            raise Exception("invalid html")
+            raise HTMLTextExtractorException()
 
         self.tags.pop()
 
@@ -128,7 +132,10 @@ def html_to_text(html):
     html = html.replace('\n', ' ')
     html = ' '.join(html.split())
     s = HTMLTextExtractor()
-    s.feed(html)
+    try:
+        s.feed(html)
+    except HTMLTextExtractorException:
+        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
     return s.get_text()
 
 

+ 9 - 0
tests/unit/test_utils.py

@@ -52,6 +52,10 @@ class TestUtils(SearxTestCase):
         self.assertIsNotNone(utils.html_to_text(html))
         self.assertEqual(utils.html_to_text(html), "Test text")
 
+    def test_html_to_text_invalid(self):
+        html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
+        self.assertEqual(utils.html_to_text(html), "Lorem ipsum")
+
     def test_prettify_url(self):
         data = (('https://searx.me/', 'https://searx.me/'),
                 ('https://searx.me/ű', 'https://searx.me/ű'),
@@ -116,6 +120,11 @@ class TestHTMLTextExtractor(SearxTestCase):
         self.html_text_extractor.handle_entityref(entity)
         self.assertIn(entity, self.html_text_extractor.result)
 
+    def test_invalid_html(self):
+        text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
+        with self.assertRaises(utils.HTMLTextExtractorException):
+            self.html_text_extractor.feed(text)
+
 
 class TestUnicodeWriter(SearxTestCase):