Browse Source

[fix] fix flickr_noapi decoding (#1655)

Characters that were not ASCII were incorrectly decoded.
Add a helper function, searx.utils.ecma_unescape (a Python implementation of the JavaScript unescape function).
Alexandre Flament 5 years ago
parent
commit
2179079a91
3 changed files with 32 additions and 6 deletions
  1. 6 6
      searx/engines/flickr_noapi.py
  2. 19 0
      searx/utils.py
  3. 7 0
      tests/unit/test_utils.py

+ 6 - 6
searx/engines/flickr_noapi.py

@@ -16,7 +16,8 @@ from json import loads
 from time import time
 import re
 from searx.engines import logger
-from searx.url_utils import urlencode, unquote
+from searx.url_utils import urlencode
+from searx.utils import ecma_unescape, html_to_text
 
 logger = logger.getChild('flickr-noapi')
 
@@ -75,11 +76,10 @@ def response(resp):
 
     for index in legend:
         photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])]
-        author = unquote(photo.get('realname', ''))
-        source = unquote(photo.get('username', '')) + ' @ Flickr'
-        title = unquote(photo.get('title', ''))
-        content = unquote(photo.get('description', ''))
-
+        author = ecma_unescape(photo.get('realname', ''))
+        source = ecma_unescape(photo.get('username', '')) + ' @ Flickr'
+        title = ecma_unescape(photo.get('title', ''))
+        content = html_to_text(ecma_unescape(photo.get('description', '')))
         img_src = None
         # From the biggest to the lowest format
         for image_size in image_sizes:

+ 19 - 0
searx/utils.py

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import csv
 import hashlib
 import hmac
@@ -44,6 +45,9 @@ logger = logger.getChild('utils')
 blocked_tags = ('script',
                 'style')
 
+ecma_unescape4_re = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
+ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
+
 useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__))
                              + "/data/useragents.json", 'r', encoding='utf-8').read())
 
@@ -415,3 +419,18 @@ def to_string(obj):
         return obj.__str__()
     if hasattr(obj, '__repr__'):
         return obj.__repr__()
+
+
+def ecma_unescape(s):
+    """
+    python implementation of the unescape javascript function
+
+    https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
+    https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
+    """
+    # s = unicode(s)
+    # "%u5409" becomes "吉"
+    s = ecma_unescape4_re.sub(lambda e: unichr(int(e.group(1), 16)), s)
+    # "%20" becomes " ", "%F3" becomes "ó"
+    s = ecma_unescape2_re.sub(lambda e: unichr(int(e.group(1), 16)), s)
+    return s

+ 7 - 0
tests/unit/test_utils.py

@@ -90,6 +90,13 @@ class TestUtils(SearxTestCase):
         self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL')
         self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL')
 
+    def test_ecma_unscape(self):
+        self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
+        self.assertEqual(utils.ecma_unescape('text using %xx: %F3'),
+                         u'text using %xx: ó')
+        self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'),
+                         u'text using %u: 吉, 世界')
+
 
 class TestHTMLTextExtractor(SearxTestCase):