Browse Source

js_variable_to_python: add tests, handle more JS syntax

The tests from chompjs are copied.
The commented-out tests do not pass.
The implementation of js_variable_to_python has been updated:
* in the main loop, try to make the four different cases clearer
* handle decimal numbers like "-.5", "5." or "- 5" (without the double quotes)
* the character ` is treated as a string delimiter, as intended in JS
* identifiers follow the JS specification ($, _, letters and digits)
Alexandre Flament 1 year ago
parent
commit
72f5e7cfb8
3 changed files with 389 additions and 43 deletions
  1. 1 0
      requirements-dev.txt
  2. 105 43
      searx/utils.py
  3. 283 0
      tests/unit/test_js_variable_to_python.py

+ 1 - 0
requirements-dev.txt

@@ -21,3 +21,4 @@ aiounittest==1.4.2
 yamllint==1.32.0
 yamllint==1.32.0
 wlc==1.13
 wlc==1.13
 coloredlogs==15.0.1
 coloredlogs==15.0.1
+parameterized==0.9.0

+ 105 - 43
searx/utils.py

@@ -38,9 +38,14 @@ _BLOCKED_TAGS = ('script', 'style')
 _ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
 _ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
 _ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
 _ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
 
 
-_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
-_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')
-_JS_DECIMAL_RE = re.compile(r":\s*\.")
+_JS_STRING_DELIMITERS = re.compile(r'(["\'`])')
+_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])([\$_\w][\$_\w0-9]*)(:)')
+_JS_VOID_OR_UNDEFINED_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)|undefined')
+_JS_DECIMAL_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]*)\.([0-9_]*)")
+_JS_DECIMAL2_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]+)")
+_JS_EXTRA_COMA_RE = re.compile(r"\s*,\s*([\]\}])")
+_JS_STRING_ESCAPE_RE = re.compile(r'\\(.)')
+_JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
 
 
 _STORAGE_UNIT_VALUE: Dict[str, int] = {
 _STORAGE_UNIT_VALUE: Dict[str, int] = {
     'TB': 1024 * 1024 * 1024 * 1024,
     'TB': 1024 * 1024 * 1024 * 1024,
@@ -652,12 +657,45 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
     return None
     return None
 
 
 
 
+def _j2p_process_escape(match):
+    # deal with ECMA escape characters
+    escape = match.group(1) or match.group(2)
+    return (
+        Rf'\{escape}'
+        if escape in _JSON_PASSTHROUGH_ESCAPES
+        else R'\u00'
+        if escape == 'x'
+        else ''
+        if escape == '\n'
+        else escape
+    )
+
+
+def _j2p_decimal(match):
+    return (
+        match.group(1)
+        + match.group(2)
+        + (match.group(3).replace("_", "") or "0")
+        + "."
+        + (match.group(4).replace("_", "") or "0")
+    )
+
+
+def _j2p_decimal2(match):
+    return match.group(1) + match.group(2) + match.group(3).replace("_", "")
+
+
 def js_variable_to_python(js_variable):
 def js_variable_to_python(js_variable):
     """Convert a javascript variable into JSON and then load the value
     """Convert a javascript variable into JSON and then load the value
 
 
     It does not deal with all cases, but it is good enough for now.
     It does not deal with all cases, but it is good enough for now.
     chompjs has a better implementation.
     chompjs has a better implementation.
     """
     """
+    if not isinstance(js_variable, str):
+        raise ValueError("js_variable must be of type str")
+    if js_variable == "":
+        raise ValueError("js_variable can't be an empty string")
+
     # when in_string is not None, it contains the character that has opened the string
     # when in_string is not None, it contains the character that has opened the string
     # either simple quote or double quote
     # either simple quote or double quote
     in_string = None
     in_string = None
@@ -665,49 +703,68 @@ def js_variable_to_python(js_variable):
     # r"""{ a:"f\"irst", c:'sec"ond'}"""
     # r"""{ a:"f\"irst", c:'sec"ond'}"""
     # becomes
     # becomes
     # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
     # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
-    parts = re.split(r'(["\'])', js_variable)
-    # previous part (to check the escape character antislash)
-    previous_p = ""
+    parts = _JS_STRING_DELIMITERS.split(js_variable)
+    # does the previous part end with a backslash?
+    blackslash_just_before = False
     for i, p in enumerate(parts):
     for i, p in enumerate(parts):
-        # parse characters inside a ECMA string
-        if in_string:
-            # we are in a JS string: replace the colon by a temporary character
-            # so quote_keys_regex doesn't have to deal with colon inside the JS strings
-            parts[i] = parts[i].replace(':', chr(1))
-            if in_string == "'":
-                # the JS string is delimited by simple quote.
-                # This is not supported by JSON.
-                # simple quote delimited string are converted to double quote delimited string
-                # here, inside a JS string, we escape the double quote
-                parts[i] = parts[i].replace('"', r'\"')
-
-        # deal with delimieters and escape character
-        if not in_string and p in ('"', "'"):
-            # we are not in string
-            # but p is double or simple quote
-            # that's the start of a new string
-            # replace simple quote by double quote
-            # (JSON doesn't support simple quote)
+        if p == in_string and not blackslash_just_before:
+            # * the current part matches the character which has opened the string
+            # * there is no backslash just before
+            # --> the current part closes the current string
+            in_string = None
+            # replace simple quote and ` by double quote
+            # since JSON supports only double quote for string
             parts[i] = '"'
             parts[i] = '"'
+
+        elif in_string:
+            # --> we are in a JS string
+            # replace the colon by a temporary character
+            # so _JS_QUOTE_KEYS_RE doesn't have to deal with colon inside the JS strings
+            p = p.replace(':', chr(1))
+            # replace JS escape sequences by JSON escape sequences
+            p = _JS_STRING_ESCAPE_RE.sub(_j2p_process_escape, p)
+            # the JS string is delimited by simple quote.
+            # This is not supported by JSON.
+            # simple quote delimited string are converted to double quote delimited string
+            # here, inside a JS string, we escape the double quote
+            if in_string == "'":
+                p = p.replace('"', r'\"')
+            parts[i] = p
+            # deal with the sequence backslash then quote
+            # since js_variable is split on quotes, we detect this case:
+            # * the previous part ends with a backslash
+            # * the current part is a single quote
+            # when detected, the backslash is removed from the previous part
+            if blackslash_just_before and p[:1] == "'":
+                parts[i - 1] = parts[i - 1][:-1]
+
+        elif in_string is None and p in ('"', "'", "`"):
+            # we are not in string but p is string delimiter
+            # --> that's the start of a new string
             in_string = p
             in_string = p
-            continue
-        if p == in_string:
-            # we are in a string and the current part MAY close the string
-            if len(previous_p) > 0 and previous_p[-1] == '\\':
-                # there is an antislash just before: the ECMA string continue
-                continue
-            # the current p close the string
             # replace simple quote by double quote
             # replace simple quote by double quote
+            # since JSON supports only double quote for string
             parts[i] = '"'
             parts[i] = '"'
-            in_string = None
 
 
-        if not in_string:
-            # replace void 0 by null
+        elif in_string is None:
+            # we are not in a string
+            # replace by null these values:
+            # * void 0
+            # * void(0)
+            # * undefined
             # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
             # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
-            # we are sure there is no string in p
-            parts[i] = _JS_VOID_RE.sub("null", p)
-        # update previous_p
-        previous_p = p
+            p = _JS_VOID_OR_UNDEFINED_RE.sub("null", p)
+            # make sure there is a leading zero in front of float
+            p = _JS_DECIMAL_RE.sub(_j2p_decimal, p)
+            p = _JS_DECIMAL2_RE.sub(_j2p_decimal2, p)
+            # remove the extra (trailing) comma in a list or an object
+            # for example [1,2,3,] becomes [1,2,3]
+            p = _JS_EXTRA_COMA_RE.sub(lambda match: match.group(1), p)
+            parts[i] = p
+
+        # update for the next iteration
+        blackslash_just_before = len(p) > 0 and p[-1] == '\\'
+
     # join the string
     # join the string
     s = ''.join(parts)
     s = ''.join(parts)
     # add quote arround the key
     # add quote arround the key
@@ -715,8 +772,13 @@ def js_variable_to_python(js_variable):
     # becomes
     # becomes
     # { "a": 12 }
     # { "a": 12 }
     s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
     s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
-    s = _JS_DECIMAL_RE.sub(":0.", s)
-    # replace the surogate character by colon
-    s = s.replace(chr(1), ':')
+    # replace the surrogate character by a colon and strip whitespace
+    s = s.replace(chr(1), ':').strip()
     # load the JSON and return the result
     # load the JSON and return the result
-    return json.loads(s)
+    if s == "":
+        raise ValueError("js_variable can't be an empty string")
+    try:
+        return json.loads(s)
+    except json.JSONDecodeError as e:
+        logger.debug("Internal error: js_variable_to_python creates invalid JSON:\n%s", s)
+        raise ValueError("js_variable_to_python creates invalid JSON") from e

+ 283 - 0
tests/unit/test_js_variable_to_python.py

@@ -0,0 +1,283 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for the function searx.utils.js_variable_to_python
+
+The tests are copied from https://github.com/Nykakin/chompjs/blob/c1501b5cd82c0044539875331745b820e7bfd067/chompjs/test_parser.py
+
+Commented-out tests do not pass
+"""
+import math
+
+from parameterized import parameterized
+
+from searx.utils import js_variable_to_python
+
+from tests import SearxTestCase
+
+
+class TestParser(SearxTestCase):
+    @parameterized.expand(
+        [
+            ("{'hello': 'world'}", {'hello': 'world'}),
+            ("{'hello': 'world', 'my': 'master'}", {'hello': 'world', 'my': 'master'}),
+            (
+                "{'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'}",
+                {'hello': 'world', 'my': {'master': 'of Orion'}, 'test': 'xx'},
+            ),
+            ("{}", {}),
+        ]
+    )
+    def test_parse_object(self, js, expected_py):
+        py = js_variable_to_python(js)
+        self.assertEqual(py, expected_py)
+
+    @parameterized.expand(
+        [
+            ("[]", []),
+            ("[[[]]]", [[[]]]),
+            ("[[[1]]]", [[[1]]]),
+            ("[1]", [1]),
+            ("[1, 2, 3, 4]", [1, 2, 3, 4]),
+            ("['h', 'e', 'l', 'l', 'o']", ['h', 'e', 'l', 'l', 'o']),
+            ("[[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]", [[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]),
+        ]
+    )
+    def test_parse_list(self, js, expected_py):
+        py = js_variable_to_python(js)
+        self.assertEqual(py, expected_py)
+
+    @parameterized.expand(
+        [
+            ("{'hello': [], 'world': [0]}", {'hello': [], 'world': [0]}),
+            ("{'hello': [1, 2, 3, 4]}", {'hello': [1, 2, 3, 4]}),
+            ("[{'a':12}, {'b':33}]", [{'a': 12}, {'b': 33}]),
+            (
+                "[false, {'true': true, `pies`: \"kot\"}, false,]",
+                [False, {"true": True, 'pies': 'kot'}, False],
+            ),
+            (
+                "{a:1,b:1,c:1,d:1,e:1,f:1,g:1,h:1,i:1,j:1}",
+                {k: 1 for k in 'abcdefghij'},
+            ),
+            (
+                "{'a':[{'b':1},{'c':[{'d':{'f':{'g':[1,2]}}},{'e':1}]}]}",
+                {'a': [{'b': 1}, {'c': [{'d': {'f': {'g': [1, 2]}}}, {'e': 1}]}]},
+            ),
+        ]
+    )
+    def test_parse_mixed(self, js, expected_py):
+        py = js_variable_to_python(js)
+        self.assertEqual(py, expected_py)
+
+    @parameterized.expand(
+        [
+            ("{'hello': 12, 'world': 10002.21}", {'hello': 12, 'world': 10002.21}),
+            ("[12, -323, 0.32, -32.22, .2, - 4]", [12, -323, 0.32, -32.22, 0.2, -4]),
+            ('{"a": -12, "b": - 5}', {'a': -12, 'b': -5}),
+            ("{'a': true, 'b': false, 'c': null}", {'a': True, 'b': False, 'c': None}),
+            ("[\"\\uD834\\uDD1E\"]", ['𝄞']),
+            ("{'a': '123\\'456\\n'}", {'a': "123'456\n"}),
+            ("['\u00E9']", ['é']),
+            ('{"cache":{"\u002Ftest\u002F": 0}}', {'cache': {'/test/': 0}}),
+            ('{"a": 3.125e7}', {'a': 3.125e7}),
+            ('''{"a": "b\\'"}''', {'a': "b'"}),
+            ('{"a": .99, "b": -.1}', {"a": 0.99, "b": -0.1}),
+            ('["/* ... */", "// ..."]', ["/* ... */", "// ..."]),
+            ('{"inclusions":["/*","/"]}', {'inclusions': ['/*', '/']}),
+        ]
+    )
+    def test_parse_standard_values(self, js, expected_py):
+        py = js_variable_to_python(js)
+        self.assertEqual(py, expected_py)
+
+    def test_parse_nan(self):
+        js = '{"A": NaN}'
+        py = js_variable_to_python(js)
+        self.assertTrue(math.isnan(py["A"]))
+
+    @parameterized.expand(
+        [
+            ("{abc: 100, dev: 200}", {'abc': 100, 'dev': 200}),
+            ("{abcdefghijklmnopqrstuvwxyz: 12}", {"abcdefghijklmnopqrstuvwxyz": 12}),
+            # (
+            #     "{age: function(yearBorn,thisYear) {return thisYear - yearBorn;}}",
+            #     {"age": "function(yearBorn,thisYear) {return thisYear - yearBorn;}"}
+            # ),
+            # (
+            #     "{\"abc\": function() {return '])))))))))))))))';}}",
+            #     {"abc": "function() {return '])))))))))))))))';}"},
+            # ),
+            ('{"a": undefined}', {"a": None}),  # chompjs returns {"a": "undefined"}
+            ('[undefined, undefined]', [None, None]),  # chompjs returns ["undefined", "undefined"]
+            ("{_a: 1, $b: 2}", {"_a": 1, "$b": 2}),
+            # ("{regex: /a[^d]{1,12}/i}", {'regex': '/a[^d]{1,12}/i'}),
+            # ("{'a': function(){return '\"'}}", {'a': 'function(){return \'"\'}'}),
+            ("{1: 1, 2: 2, 3: 3, 4: 4}", {'1': 1, '2': 2, '3': 3, '4': 4}),
+            ("{'a': 121.}", {'a': 121.0}),
+        ]
+    )
+    def test_parse_strange_values(self, js, expected_py):
+        py = js_variable_to_python(js)
+        self.assertEqual(py, expected_py)
+
+    @parameterized.expand(
+        [
+            # ('{"a": {"b": [12, 13, 14]}}text text', {"a": {"b": [12, 13, 14]}}),
+            # ('var test = {"a": {"b": [12, 13, 14]}}', {"a": {"b": [12, 13, 14]}}),
+            ('{"a":\r\n10}', {'a': 10}),
+            ("{'foo': 0,\r\n}", {'foo': 0}),
+            ("{truefalse: 0, falsefalse: 1, nullnull: 2}", {'truefalse': 0, 'falsefalse': 1, 'nullnull': 2}),
+        ]
+    )
+    def test_strange_input(self, js, expected_py):
+        py = js_variable_to_python(js)
+        self.assertEqual(py, expected_py)
+
+    @parameterized.expand(
+        [
+            ("[0]", [0]),
+            ("[1]", [1]),
+            ("[12]", [12]),
+            ("[12_12]", [1212]),
+            # ("[0x12]", [18]),
+            # ("[0xab]", [171]),
+            # ("[0xAB]", [171]),
+            # ("[0X12]", [18]),
+            # ("[0Xab]", [171]),
+            # ("[0XAB]", [171]),
+            # ("[01234]", [668]),
+            # ("[0o1234]", [668]),
+            # ("[0O1234]", [668]),
+            # ("[0b1111]", [15]),
+            # ("[0B1111]", [15]),
+            ("[-0]", [-0]),
+            ("[-1]", [-1]),
+            ("[-12]", [-12]),
+            ("[-12_12]", [-1212]),
+            # ("[-0x12]", [-18]),
+            # ("[-0xab]", [-171]),
+            # ("[-0xAB]", [-171]),
+            # ("[-0X12]", [-18]),
+            # ("[-0Xab]", [-171]),
+            # ("[-0XAB]", [-171]),
+            # ("[-01234]", [-668]),
+            # ("[-0o1234]", [-668]),
+            # ("[-0O1234]", [-668]),
+            # ("[-0b1111]", [-15]),
+            # ("[-0B1111]", [-15]),
+        ]
+    )
+    def test_integer_numeric_values(self, js, expected_py):
+        py = js_variable_to_python(js)
+        self.assertEqual(py, expected_py)
+
+    @parameterized.expand(
+        [
+            ("[0.32]", [0.32]),
+            ("[-0.32]", [-0.32]),
+            ("[.32]", [0.32]),
+            ("[-.32]", [-0.32]),
+            ("[12.]", [12.0]),
+            ("[-12.]", [-12.0]),
+            ("[12.32]", [12.32]),
+            ("[-12.12]", [-12.12]),
+            ("[3.1415926]", [3.1415926]),
+            ("[.123456789]", [0.123456789]),
+            ("[.0123]", [0.0123]),
+            ("[0.0123]", [0.0123]),
+            ("[-.0123]", [-0.0123]),
+            ("[-0.0123]", [-0.0123]),
+            ("[3.1E+12]", [3.1e12]),
+            ("[3.1e+12]", [3.1e12]),
+            ("[.1e-23]", [0.1e-23]),
+            ("[.1e-23]", [0.1e-23]),
+        ]
+    )
+    def test_float_numeric_values(self, js, expected_py):
+        py = js_variable_to_python(js)
+        self.assertEqual(py, expected_py)
+
+    # @parameterized.expand([
+    #     ('["Test\\nDrive"]\n{"Test": "Drive"}', [['Test\nDrive'], {'Test': 'Drive'}]),
+    # ])
+    # def test_jsonlines(self, js, expected_py):
+    #     py = js_variable_to_python(js)
+    #     self.assertEqual(py, expected_py)
+
+
+class TestParserExceptions(SearxTestCase):
+    @parameterized.expand(
+        [
+            ('}{', ValueError),
+            ('', ValueError),
+            (None, ValueError),
+        ]
+    )
+    def test_exceptions(self, js, expected_exception):
+        with self.assertRaises(expected_exception):
+            js_variable_to_python(js)
+
+    @parameterized.expand(
+        [
+            ("{whose: 's's', category_name: '>'}", ValueError),
+        ]
+    )
+    def test_malformed_input(self, in_data, expected_exception):
+        with self.assertRaises(expected_exception):
+            js_variable_to_python(in_data)
+
+    @parameterized.expand(
+        [
+            (
+                '{"test": """}',
+                ValueError,
+                'js_variable_to_python creates invalid JSON',
+            ),
+        ]
+    )
+    def test_error_messages(self, js, expected_exception, expected_exception_text):
+        with self.assertRaisesRegex(expected_exception, expected_exception_text):
+            js_variable_to_python(js)
+
+
+# class TestOptions(SearxTestCase):
+#     @parameterized.expand(
+#         [
+#             ('{\\\"a\\\": 12}', {'a': 12}),
+#         ]
+#     )
+#     def test_unicode_escape(self, js, expected_py):
+#         py = js_variable_to_python(js)
+#         self.assertEqual(py, expected_py)
+
+
+class TestParseJsonObjects(SearxTestCase):
+    @parameterized.expand(
+        [
+            # ("", []),
+            # ("aaaaaaaaaaaaaaaa", []),
+            # ("         ", []),
+            ("      {'a': 12}", [{'a': 12}]),
+            # ("[1, 2, 3, 4]xxxxxxxxxxxxxxxxxxxxxxxx", [[1, 2, 3, 4]]),
+            # ("[12] [13] [14]", [[12], [13], [14]]),
+            # ("[10] {'a': [1, 1, 1,]}", [[10], {'a': [1, 1, 1]}]),
+            # ("[1][1][1]", [[1], [1], [1]]),
+            # ("[1] [2] {'a': ", [[1], [2]]),
+            # ("[]", [[]]),
+            # ("[][][][]", [[], [], [], []]),
+            ("{}", [{}]),
+            # ("{}{}{}{}", [{}, {}, {}, {}]),
+            # ("{{}}{{}}", []),
+            # ("[[]][[]]", [[[]], [[]]]),
+            # ("{am: 'ab'}\n{'ab': 'xx'}", [{'am': 'ab'}, {'ab': 'xx'}]),
+            # (
+            #     'function(a, b, c){ /* ... */ }({"a": 12}, Null, [1, 2, 3])',
+            #     [{}, {'a': 12}, [1, 2, 3]],
+            # ),
+            # ('{"a": 12, broken}{"c": 100}', [{'c': 100}]),
+            # ('[12,,,,21][211,,,][12,12][12,,,21]', [[12, 12]]),
+        ]
+    )
+    def test_parse_json_objects(self, js, expected_py):
+        py_in_list = [js_variable_to_python(js)]
+        self.assertEqual(py_in_list, expected_py)