test_utils.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. # SPDX-License-Identifier: AGPL-3.0-or-later
  2. # pylint: disable=missing-module-docstring, invalid-name
  3. import lxml.etree
  4. from lxml import html
  5. from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
  6. from searx import utils
  7. from tests import SearxTestCase
  8. class TestUtils(SearxTestCase): # pylint: disable=missing-class-docstring
  9. def test_gen_useragent(self):
  10. self.assertIsInstance(utils.gen_useragent(), str)
  11. self.assertIsNotNone(utils.gen_useragent())
  12. self.assertTrue(utils.gen_useragent().startswith('Mozilla'))
  13. def test_searx_useragent(self):
  14. self.assertIsInstance(utils.searx_useragent(), str)
  15. self.assertIsNotNone(utils.searx_useragent())
  16. self.assertTrue(utils.searx_useragent().startswith('searx'))
  17. def test_html_to_text(self):
  18. html_str = """
  19. <a href="/testlink" class="link_access_account">
  20. <style>
  21. .toto {
  22. color: red;
  23. }
  24. </style>
  25. <span class="toto">
  26. <span>
  27. <img src="test.jpg" />
  28. </span>
  29. </span>
  30. <span class="titi">
  31. Test text
  32. </span>
  33. <script>value='dummy';</script>
  34. </a>
  35. """
  36. self.assertIsInstance(utils.html_to_text(html_str), str)
  37. self.assertIsNotNone(utils.html_to_text(html_str))
  38. self.assertEqual(utils.html_to_text(html_str), "Test text")
  39. self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
  40. def test_extract_text(self):
  41. html_str = """
  42. <a href="/testlink" class="link_access_account">
  43. <span class="toto">
  44. <span>
  45. <img src="test.jpg" />
  46. </span>
  47. </span>
  48. <span class="titi">
  49. Test text
  50. </span>
  51. </a>
  52. """
  53. dom = html.fromstring(html_str)
  54. self.assertEqual(utils.extract_text(dom), 'Test text')
  55. self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text')
  56. self.assertEqual(utils.extract_text(dom.xpath('//span/text()')), 'Test text')
  57. self.assertEqual(utils.extract_text(dom.xpath('count(//span)')), '3.0')
  58. self.assertEqual(utils.extract_text(dom.xpath('boolean(//span)')), 'True')
  59. self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg')
  60. self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '')
  61. self.assertEqual(utils.extract_text(None, allow_none=True), None)
  62. with self.assertRaises(ValueError):
  63. utils.extract_text(None)
  64. with self.assertRaises(ValueError):
  65. utils.extract_text({})
  66. def test_extract_url(self):
  67. def f(html_str, search_url):
  68. return utils.extract_url(html.fromstring(html_str), search_url)
  69. self.assertEqual(f('<span id="42">https://example.com</span>', 'http://example.com/'), 'https://example.com/')
  70. self.assertEqual(f('https://example.com', 'http://example.com/'), 'https://example.com/')
  71. self.assertEqual(f('//example.com', 'http://example.com/'), 'http://example.com/')
  72. self.assertEqual(f('//example.com', 'https://example.com/'), 'https://example.com/')
  73. self.assertEqual(f('/path?a=1', 'https://example.com'), 'https://example.com/path?a=1')
  74. with self.assertRaises(lxml.etree.ParserError):
  75. f('', 'https://example.com')
  76. with self.assertRaises(Exception):
  77. utils.extract_url([], 'https://example.com')
  78. def test_html_to_text_invalid(self):
  79. _html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
  80. self.assertEqual(utils.html_to_text(_html), "Lorem ipsum")
  81. def test_ecma_unscape(self):
  82. self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
  83. self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
  84. self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界')
  85. class TestHTMLTextExtractor(SearxTestCase): # pylint: disable=missing-class-docstring
  86. def setUp(self):
  87. self.html_text_extractor = utils._HTMLTextExtractor() # pylint: disable=protected-access
  88. def test__init__(self):
  89. self.assertEqual(self.html_text_extractor.result, [])
  90. def test_handle_charref(self):
  91. self.html_text_extractor.handle_charref('xF')
  92. self.assertIn('\x0f', self.html_text_extractor.result)
  93. self.html_text_extractor.handle_charref('XF')
  94. self.assertIn('\x0f', self.html_text_extractor.result)
  95. self.html_text_extractor.handle_charref('97')
  96. self.assertIn('a', self.html_text_extractor.result)
  97. def test_handle_entityref(self):
  98. entity = 'test'
  99. self.html_text_extractor.handle_entityref(entity)
  100. self.assertIn(entity, self.html_text_extractor.result)
  101. def test_invalid_html(self):
  102. text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
  103. with self.assertRaises(utils._HTMLTextExtractorException): # pylint: disable=protected-access
  104. self.html_text_extractor.feed(text)
  105. class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring
  106. TEST_DOC = """<ul>
  107. <li>Text in <b>bold</b> and <i>italic</i> </li>
  108. <li>Another <b>text</b> <img src="data:image/gif;base64,R0lGODlhAQABAIAAAAUEBAAAACwAAAAAAQABAAACAkQBADs="></li>
  109. </ul>"""
  110. def test_get_xpath_cache(self):
  111. xp1 = utils.get_xpath('//a')
  112. xp2 = utils.get_xpath('//div')
  113. xp3 = utils.get_xpath('//a')
  114. self.assertEqual(id(xp1), id(xp3))
  115. self.assertNotEqual(id(xp1), id(xp2))
  116. def test_get_xpath_type(self):
  117. utils.get_xpath(lxml.etree.XPath('//a'))
  118. with self.assertRaises(TypeError):
  119. utils.get_xpath([])
  120. def test_get_xpath_invalid(self):
  121. invalid_xpath = '//a[0].text'
  122. with self.assertRaises(SearxXPathSyntaxException) as context:
  123. utils.get_xpath(invalid_xpath)
  124. self.assertEqual(context.exception.message, 'Invalid expression')
  125. self.assertEqual(context.exception.xpath_str, invalid_xpath)
  126. def test_eval_xpath_unregistered_function(self):
  127. doc = html.fromstring(TestXPathUtils.TEST_DOC)
  128. invalid_function_xpath = 'int(//a)'
  129. with self.assertRaises(SearxEngineXPathException) as context:
  130. utils.eval_xpath(doc, invalid_function_xpath)
  131. self.assertEqual(context.exception.message, 'Unregistered function')
  132. self.assertEqual(context.exception.xpath_str, invalid_function_xpath)
  133. def test_eval_xpath(self):
  134. doc = html.fromstring(TestXPathUtils.TEST_DOC)
  135. self.assertEqual(utils.eval_xpath(doc, '//p'), [])
  136. self.assertEqual(utils.eval_xpath(doc, '//i/text()'), ['italic'])
  137. self.assertEqual(utils.eval_xpath(doc, 'count(//i)'), 1.0)
  138. def test_eval_xpath_list(self):
  139. doc = html.fromstring(TestXPathUtils.TEST_DOC)
  140. # check a not empty list
  141. self.assertEqual(utils.eval_xpath_list(doc, '//i/text()'), ['italic'])
  142. # check min_len parameter
  143. with self.assertRaises(SearxEngineXPathException) as context:
  144. utils.eval_xpath_list(doc, '//p', min_len=1)
  145. self.assertEqual(context.exception.message, 'len(xpath_str) < 1')
  146. self.assertEqual(context.exception.xpath_str, '//p')
  147. def test_eval_xpath_getindex(self):
  148. doc = html.fromstring(TestXPathUtils.TEST_DOC)
  149. # check index 0
  150. self.assertEqual(utils.eval_xpath_getindex(doc, '//i/text()', 0), 'italic')
  151. # default is 'something'
  152. self.assertEqual(utils.eval_xpath_getindex(doc, '//i/text()', 1, default='something'), 'something')
  153. # default is None
  154. self.assertEqual(utils.eval_xpath_getindex(doc, '//i/text()', 1, default=None), None)
  155. # index not found
  156. with self.assertRaises(SearxEngineXPathException) as context:
  157. utils.eval_xpath_getindex(doc, '//i/text()', 1)
  158. self.assertEqual(context.exception.message, 'index 1 not found')
  159. # not a list
  160. with self.assertRaises(SearxEngineXPathException) as context:
  161. utils.eval_xpath_getindex(doc, 'count(//i)', 1)
  162. self.assertEqual(context.exception.message, 'the result is not a list')
  163. def test_detect_language(self):
  164. # make sure new line are not an issue
  165. # fasttext.predict('') does not accept new line.
  166. l = utils.detect_language('The quick brown fox jumps over\nthe lazy dog')
  167. self.assertEqual(l, 'en')
  168. l = utils.detect_language(
  169. 'いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす'
  170. )
  171. self.assertEqual(l, 'ja')
  172. l = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')
  173. self.assertEqual(l, 'tr')
  174. l = utils.detect_language('')
  175. self.assertIsNone(l)
  176. # mix languages --> None
  177. l = utils.detect_language('The いろはにほへと Pijamalı')
  178. self.assertIsNone(l)
  179. with self.assertRaises(ValueError):
  180. utils.detect_language(None)