2 years ago · 6d72ef3cbe
--- a/searx/webutils.py
+++ b/searx/webutils.py
@@ -113,31 +113,68 @@ def prettify_url(url, max_length=74):
 
															         return url
														
 
															+def contains_cjko(s: str) -> bool:
														
 
															+    """This function check whether or not a string contains Chinese, Japanese,
														
 
															+    or Korean characters. It employs regex and uses the u escape sequence to
														
 
															+    match any character in a set of Unicode ranges.
														
 
															+
														
 
															+    Args:
														
 
															+        s (str): string to be checked.
														
 
															+
														
 
															+    Returns:
														
 
															+        bool: True if the input s contains the characters and False otherwise.
														
 
															+    """
														
 
															+    unicode_ranges = (
														
 
															+        '\u4e00-\u9fff'  # Chinese characters
														
 
															+        '\u3040-\u309f'  # Japanese hiragana
														
 
															+        '\u30a0-\u30ff'  # Japanese katakana
														
 
															+        '\u4e00-\u9faf'  # Japanese kanji
														
 
															+        '\uac00-\ud7af'  # Korean hangul syllables
														
 
															+        '\u1100-\u11ff'  # Korean hangul jamo
														
 
															+    )
														
 
															+    return bool(re.search(fr'[{unicode_ranges}]', s))
														
 
															+
														
 
															+
														
 
															+def regex_highlight_cjk(word: str) -> str:
														
 
															+    """Generate the regex pattern to match for a given word according
														
 
															+    to whether or not the word contains CJK characters or not.
														
 
															+    If the word is and/or contains CJK character, the regex pattern
														
 
															+    will match standalone word by taking into account the presence
														
 
															+    of whitespace before and after it; if not, it will match any presence
														
 
															+    of the word throughout the text, ignoring the whitespace.
														
 
															+
														
 
															+    Args:
														
 
															+        word (str): the word to be matched with regex pattern.
														
 
															+
														
 
															+    Returns:
														
 
															+        str: the regex pattern for the word.
														
 
															+    """
														
 
															+    rword = re.escape(word)
														
 
															+    if contains_cjko(rword):
														
 
															+        return fr'({rword})'
														
 
															+    else:
														
 
															+        return fr'\b({rword})(?!\w)'
														
 
															+
														
 
															+
														
 
															 def highlight_content(content, query):
														
 
															     if not content:
														
 
															         return None
														
 
															+
														
 
															     # ignoring html contents
														
 
															     # TODO better html content detection
														
 
															     if content.find('<') != -1:
														
 
															         return content
														
 
															-    if content.lower().find(query.lower()) > -1:
														
 
															-        query_regex = '({0})'.format(re.escape(query))
														
 
															-        content = re.sub(query_regex, '<span class="highlight">\\1</span>', content, flags=re.I | re.U)
														
 
															-    else:
														
 
															-        regex_parts = []
														
 
															-        for chunk in query.split():
														
 
															-            chunk = chunk.replace('"', '')
														
 
															-            if len(chunk) == 0:
														
 
															-                continue
														
 
															-            elif len(chunk) == 1:
														
 
															-                regex_parts.append('\\W+{0}\\W+'.format(re.escape(chunk)))
														
 
															-            else:
														
 
															-                regex_parts.append('{0}'.format(re.escape(chunk)))
														
 
															-        query_regex = '({0})'.format('|'.join(regex_parts))
														
 
															-        content = re.sub(query_regex, '<span class="highlight">\\1</span>', content, flags=re.I | re.U)
														
 
															-
														
 
															+    querysplit = query.split()
														
 
															+    queries = []
														
 
															+    for qs in querysplit:
														
 
															+        qs = qs.replace("'", "").replace('"', '').replace(" ", "")
														
 
															+        if len(qs) > 0:
														
 
															+            queries.extend(re.findall(regex_highlight_cjk(qs), content, flags=re.I | re.U))
														
 
															+    if len(queries) > 0:
														
 
															+        for q in set(queries):
														
 
															+            content = re.sub(regex_highlight_cjk(q), f'<span class="highlight">{q}</span>', content)
														
 
															     return content
														
--- a/tests/unit/test_webutils.py
+++ b/tests/unit/test_webutils.py
@@ -28,32 +28,33 @@ class TestWebUtils(SearxTestCase):
 
															         content = 'a'
														
 
															         query = 'test'
														
 
															-        self.assertEqual(webutils.highlight_content(content, query), content)
														
 
															+        self.assertEqual(webutils.highlight_content(content, query), 'a')
														
 
															         query = 'a test'
														
 
															-        self.assertEqual(webutils.highlight_content(content, query), content)
														
 
															+        self.assertEqual(webutils.highlight_content(content, query), '<span class="highlight">a</span>')
														
 
															         data = (
														
 
															             ('" test "', 'a test string', 'a <span class="highlight">test</span> string'),
														
 
															-            ('"a"', 'this is a test string', 'this is<span class="highlight"> a </span>test string'),
														
 
															+            ('"a"', 'this is a test string', 'this is <span class="highlight">a</span> test string'),
														
 
															             (
														
 
															                 'a test',
														
 
															                 'this is a test string that matches entire query',
														
 
															-                'this is <span class="highlight">a test</span> string that matches entire query',
														
 
															+                'this is <span class="highlight">a</span> <span class="highlight">test</span> string that matches entire query',
														
 
															             ),
														
 
															             (
														
 
															                 'this a test',
														
 
															                 'this is a string to test.',
														
 
															                 (
														
 
															-                    '<span class="highlight">this</span> is<span class="highlight"> a </span>'
														
 
															-                    'string to <span class="highlight">test</span>.'
														
 
															+                    '<span class="highlight">this</span> is <span class="highlight">a</span> string to <span class="highlight">test</span>.'
														
 
															                 ),
														
 
															             ),
														
 
															             (
														
 
															                 'match this "exact phrase"',
														
 
															                 'this string contains the exact phrase we want to match',
														
 
															-                (
														
 
															-                    '<span class="highlight">this</span> string contains the <span class="highlight">exact</span>'
														
 
															-                    ' <span class="highlight">phrase</span> we want to <span class="highlight">match</span>'
														
 
															+                ''.join(
														
 
															+                    [
														
 
															+                        '<span class="highlight">this</span> string contains the <span class="highlight">exact</span> ',
														
 
															+                        '<span class="highlight">phrase</span> we want to <span class="highlight">match</span>',
														
 
															+                    ]
														
 
															                 ),
														
 
															             ),
														
 
															         )