Browse Source

[enh] flickr_noapi: use complete JSON data block, add 'content', 'img_format', 'source', etc. (#1571)

Fetch complete JSON data block, use legend to extract images. 
Unquote urlencoded strings.
Add image description as 'content'. 
Add 'img_format' and 'source' data (needs PR #1567 to enable this data to be displayed). 
Show images which lack ownerid instead of discarding them.
Frank de Lange 5 years ago
parent
commit
cbc5e13275
2 changed files with 277 additions and 244 deletions
  1. 28 23
      searx/engines/flickr_noapi.py
  2. 249 221
      tests/unit/engines/test_flickr_noapi.py

+ 28 - 23
searx/engines/flickr_noapi.py

@@ -16,8 +16,7 @@ from json import loads
 from time import time
 import re
 from searx.engines import logger
-from searx.url_utils import urlencode
-
+from searx.url_utils import urlencode, unquote
 
 logger = logger.getChild('flickr-noapi')
 
@@ -27,7 +26,7 @@ url = 'https://www.flickr.com/'
 search_url = url + 'search?{query}&page={page}'
 time_range_url = '&min_upload_date={start}&max_upload_date={end}'
 photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
-regex = re.compile(r"\"search-photos-lite-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL)
+modelexport_re = re.compile(r"^\s*modelExport:\s*({.*}),$", re.M)
 image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's')
 
 paging = True
@@ -57,40 +56,45 @@ def request(query, params):
 def response(resp):
     results = []
 
-    matches = regex.search(resp.text)
+    matches = modelexport_re.search(resp.text)
 
     if matches is None:
         return results
 
     match = matches.group(1)
-    search_results = loads(match)
+    model_export = loads(match)
 
-    if '_data' not in search_results:
-        return []
+    if 'legend' not in model_export:
+        return results
 
-    photos = search_results['_data']
+    legend = model_export['legend']
 
-    for photo in photos:
+    # handle empty page
+    if not legend or not legend[0]:
+        return results
 
-        # In paged configuration, the first pages' photos
-        # are represented by a None object
-        if photo is None:
-            continue
+    for index in legend:
+        photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])]
+        author = unquote(photo.get('realname', ''))
+        source = unquote(photo.get('username', '')) + ' @ Flickr'
+        title = unquote(photo.get('title', ''))
+        content = unquote(photo.get('description', ''))
 
         img_src = None
         # From the biggest to the lowest format
         for image_size in image_sizes:
             if image_size in photo['sizes']:
                 img_src = photo['sizes'][image_size]['url']
+                img_format = 'jpg ' \
+                    + str(photo['sizes'][image_size]['width']) \
+                    + 'x' \
+                    + str(photo['sizes'][image_size]['height'])
                 break
 
         if not img_src:
             logger.debug('cannot find valid image size: {0}'.format(repr(photo)))
             continue
 
-        if 'ownerNsid' not in photo:
-            continue
-
         # For a bigger thumbnail, keep only the url_z, not the url_n
         if 'n' in photo['sizes']:
             thumbnail_src = photo['sizes']['n']['url']
@@ -99,19 +103,20 @@ def response(resp):
         else:
             thumbnail_src = img_src
 
-        url = build_flickr_url(photo['ownerNsid'], photo['id'])
-
-        title = photo.get('title', '')
-
-        author = photo['username']
+        if 'ownerNsid' not in photo:
+            # should not happen, disowned photo? Show it anyway
+            url = img_src
+        else:
+            url = build_flickr_url(photo['ownerNsid'], photo['id'])
 
-        # append result
         results.append({'url': url,
                         'title': title,
                         'img_src': img_src,
                         'thumbnail_src': thumbnail_src,
-                        'content': '',
+                        'content': content,
                         'author': author,
+                        'source': source,
+                        'img_format': img_format,
                         'template': 'images.html'})
 
     return results

+ 249 - 221
tests/unit/engines/test_flickr_noapi.py

@@ -27,116 +27,132 @@ class TestFlickrNoapiEngine(SearxTestCase):
         self.assertRaises(AttributeError, flickr_noapi.response, '')
         self.assertRaises(AttributeError, flickr_noapi.response, '[]')
 
-        response = mock.Mock(text='"search-photos-lite-models","photos":{},"totalItems":')
+        response = mock.Mock(text='"modelExport:{"legend":[],"main":{"search-photos-lite-models":[{"photos":{}}]}}')
         self.assertEqual(flickr_noapi.response(response), [])
 
-        response = mock.Mock(text='search-photos-lite-models","photos":{"data": []},"totalItems":')
+        response = \
+            mock.Mock(text='"modelExport:{"legend":[],"main":{"search-photos-lite-models":[{"photos":{"_data":[]}}]}}')
         self.assertEqual(flickr_noapi.response(response), [])
 
         # everything is ok test
         json = """
-        "search-photos-lite-models","photos":
-        {
-          "_data": [
-            {
-              "_flickrModelRegistry": "photo-lite-models",
-              "title": "This is the title",
-              "username": "Owner",
-              "pathAlias": "klink692",
-              "realname": "Owner",
-              "license": 0,
-              "ownerNsid": "59729010@N00",
-              "canComment": false,
-              "commentCount": 14,
-              "faveCount": 21,
-              "id": "14001294434",
-              "sizes": {
-                "c": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_c.jpg",
-                  "width": 541,
-                  "height": 800,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_c.jpg",
-                  "key": "c"
-                },
-                "h": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_761d32237a_h.jpg",
-                  "width": 1081,
-                  "height": 1600,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_761d32237a_h.jpg",
-                  "key": "h"
-                },
-                "k": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_f145a2c11a_k.jpg",
-                  "width": 1383,
-                  "height": 2048,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_f145a2c11a_k.jpg",
-                  "key": "k"
-                },
-                "l": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_b.jpg",
-                  "width": 692,
-                  "height": 1024,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_b.jpg",
-                  "key": "l"
-                },
-                "m": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777.jpg",
-                  "width": 338,
-                  "height": 500,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777.jpg",
-                  "key": "m"
-                },
-                "n": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_n.jpg",
-                  "width": 216,
-                  "height": 320,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_n.jpg",
-                  "key": "n"
-                },
-                "q": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_q.jpg",
-                  "width": 150,
-                  "height": 150,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_q.jpg",
-                  "key": "q"
-                },
-                "s": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_m.jpg",
-                  "width": 162,
-                  "height": 240,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_m.jpg",
-                  "key": "s"
-                },
-                "sq": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_s.jpg",
-                  "width": 75,
-                  "height": 75,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_s.jpg",
-                  "key": "sq"
-                },
-                "t": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_t.jpg",
-                  "width": 68,
-                  "height": 100,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_t.jpg",
-                  "key": "t"
-                },
-                "z": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_z.jpg",
-                  "width": 433,
-                  "height": 640,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_z.jpg",
-                  "key": "z"
+        modelExport: {
+          "legend": [
+            [
+              "search-photos-lite-models",
+              "0",
+              "photos",
+              "_data",
+              "0"
+            ]
+          ],
+          "main": {
+            "search-photos-lite-models": [
+              {
+                "photos": {
+                  "_data": [
+                    {
+                      "_flickrModelRegistry": "photo-lite-models",
+                      "title": "This%20is%20the%20title",
+                      "username": "Owner",
+                      "pathAlias": "klink692",
+                      "realname": "Owner",
+                      "license": 0,
+                      "ownerNsid": "59729010@N00",
+                      "canComment": false,
+                      "commentCount": 14,
+                      "faveCount": 21,
+                      "id": "14001294434",
+                      "sizes": {
+                        "c": {
+                          "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_c.jpg",
+                          "width": 541,
+                          "height": 800,
+                          "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_c.jpg",
+                          "key": "c"
+                        },
+                        "h": {
+                          "displayUrl": "//farm8.staticflickr.com/7246/14001294434_761d32237a_h.jpg",
+                          "width": 1081,
+                          "height": 1600,
+                          "url": "//c4.staticflickr.com/8/7246/14001294434_761d32237a_h.jpg",
+                          "key": "h"
+                        },
+                        "k": {
+                          "displayUrl": "//farm8.staticflickr.com/7246/14001294434_f145a2c11a_k.jpg",
+                          "width": 1383,
+                          "height": 2048,
+                          "url": "//c4.staticflickr.com/8/7246/14001294434_f145a2c11a_k.jpg",
+                          "key": "k"
+                        },
+                        "l": {
+                          "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_b.jpg",
+                          "width": 692,
+                          "height": 1024,
+                          "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_b.jpg",
+                          "key": "l"
+                        },
+                        "m": {
+                          "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777.jpg",
+                          "width": 338,
+                          "height": 500,
+                          "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777.jpg",
+                          "key": "m"
+                        },
+                        "n": {
+                          "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_n.jpg",
+                          "width": 216,
+                          "height": 320,
+                          "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_n.jpg",
+                          "key": "n"
+                        },
+                        "q": {
+                          "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_q.jpg",
+                          "width": 150,
+                          "height": 150,
+                          "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_q.jpg",
+                          "key": "q"
+                        },
+                        "s": {
+                          "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_m.jpg",
+                          "width": 162,
+                          "height": 240,
+                          "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_m.jpg",
+                          "key": "s"
+                        },
+                        "sq": {
+                          "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_s.jpg",
+                          "width": 75,
+                          "height": 75,
+                          "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_s.jpg",
+                          "key": "sq"
+                        },
+                        "t": {
+                          "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_t.jpg",
+                          "width": 68,
+                          "height": 100,
+                          "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_t.jpg",
+                          "key": "t"
+                        },
+                        "z": {
+                          "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_z.jpg",
+                          "width": 433,
+                          "height": 640,
+                          "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_z.jpg",
+                          "key": "z"
+                        }
+                      }
+                    }
+                  ]
                 }
               }
-            }
-          ],
-          "fetchedStart": true,
-          "fetchedEnd": false,
-          "totalItems": "4386039"
-        },"totalItems":
+            ]
+          }
+        }
         """
-        json = json.replace('\r\n', '').replace('\n', '').replace('\r', '')
+        # Flickr serves search results in a JSON block named 'modelExport' buried inside a script tag;
+        # this JSON is served as a single line terminated by a comma.
+        json = ''.join(json.split()) + ',\n'
         response = mock.Mock(text=json)
         results = flickr_noapi.response(response)
         self.assertEqual(type(results), list)
@@ -149,37 +165,51 @@ class TestFlickrNoapiEngine(SearxTestCase):
 
         # no n size, only the z size
         json = """
-        "search-photos-lite-models","photos":
-        {
-          "_data": [
-            {
-              "_flickrModelRegistry": "photo-lite-models",
-              "title": "This is the title",
-              "username": "Owner",
-              "pathAlias": "klink692",
-              "realname": "Owner",
-              "license": 0,
-              "ownerNsid": "59729010@N00",
-              "canComment": false,
-              "commentCount": 14,
-              "faveCount": 21,
-              "id": "14001294434",
-              "sizes": {
-                "z": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_z.jpg",
-                  "width": 433,
-                  "height": 640,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_z.jpg",
-                  "key": "z"
+        modelExport: {
+          "legend": [
+            [
+              "search-photos-lite-models",
+              "0",
+              "photos",
+              "_data",
+              "0"
+            ]
+          ],
+          "main": {
+            "search-photos-lite-models": [
+              {
+                "photos": {
+                  "_data": [
+                    {
+                      "_flickrModelRegistry": "photo-lite-models",
+                      "title": "This%20is%20the%20title",
+                      "username": "Owner",
+                      "pathAlias": "klink692",
+                      "realname": "Owner",
+                      "license": 0,
+                      "ownerNsid": "59729010@N00",
+                      "canComment": false,
+                      "commentCount": 14,
+                      "faveCount": 21,
+                      "id": "14001294434",
+                      "sizes": {
+                        "z": {
+                          "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_z.jpg",
+                          "width": 433,
+                          "height": 640,
+                          "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_z.jpg",
+                          "key": "z"
+                        }
+                      }
+                    }
+                  ]
                 }
               }
-            }
-          ],
-          "fetchedStart": true,
-          "fetchedEnd": false,
-          "totalItems": "4386039"
-        },"totalItems":
+            ]
+          }
+        }
         """
+        json = ''.join(json.split()) + ',\n'
         response = mock.Mock(text=json)
         results = flickr_noapi.response(response)
         self.assertEqual(type(results), list)
@@ -192,37 +222,51 @@ class TestFlickrNoapiEngine(SearxTestCase):
 
         # no z or n size
         json = """
-        "search-photos-lite-models","photos":
-        {
-          "_data": [
-            {
-              "_flickrModelRegistry": "photo-lite-models",
-              "title": "This is the title",
-              "username": "Owner",
-              "pathAlias": "klink692",
-              "realname": "Owner",
-              "license": 0,
-              "ownerNsid": "59729010@N00",
-              "canComment": false,
-              "commentCount": 14,
-              "faveCount": 21,
-              "id": "14001294434",
-              "sizes": {
-                "o": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_o.jpg",
-                  "width": 433,
-                  "height": 640,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_o.jpg",
-                  "key": "o"
+        modelExport: {
+          "legend": [
+            [
+              "search-photos-lite-models",
+              "0",
+              "photos",
+              "_data",
+              "0"
+            ]
+          ],
+          "main": {
+            "search-photos-lite-models": [
+              {
+                "photos": {
+                  "_data": [
+                    {
+                      "_flickrModelRegistry": "photo-lite-models",
+                      "title": "This%20is%20the%20title",
+                      "username": "Owner",
+                      "pathAlias": "klink692",
+                      "realname": "Owner",
+                      "license": 0,
+                      "ownerNsid": "59729010@N00",
+                      "canComment": false,
+                      "commentCount": 14,
+                      "faveCount": 21,
+                      "id": "14001294434",
+                      "sizes": {
+                        "o": {
+                          "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_o.jpg",
+                          "width": 433,
+                          "height": 640,
+                          "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_o.jpg",
+                          "key": "o"
+                        }
+                      }
+                    }
+                  ]
                 }
               }
-            }
-          ],
-          "fetchedStart": true,
-          "fetchedEnd": false,
-          "totalItems": "4386039"
-        },"totalItems":
+            ]
+          }
+        }
         """
+        json = ''.join(json.split()) + ',\n'
         response = mock.Mock(text=json)
         results = flickr_noapi.response(response)
         self.assertEqual(type(results), list)
@@ -235,30 +279,44 @@ class TestFlickrNoapiEngine(SearxTestCase):
 
         # no image test
         json = """
-        "search-photos-lite-models","photos":
-        {
-          "_data": [
-            {
-              "_flickrModelRegistry": "photo-lite-models",
-              "title": "This is the title",
-              "username": "Owner",
-              "pathAlias": "klink692",
-              "realname": "Owner",
-              "license": 0,
-              "ownerNsid": "59729010@N00",
-              "canComment": false,
-              "commentCount": 14,
-              "faveCount": 21,
-              "id": "14001294434",
-              "sizes": {
-              }
-            }
+        modelExport: {
+          "legend": [
+            [
+              "search-photos-lite-models",
+              "0",
+              "photos",
+              "_data",
+              "0"
+            ]
           ],
-          "fetchedStart": true,
-          "fetchedEnd": false,
-          "totalItems": "4386039"
-        },"totalItems":
+          "main": {
+            "search-photos-lite-models": [
+              {
+                "photos": {
+                  "_data": [
+                    {
+                      "_flickrModelRegistry": "photo-lite-models",
+                      "title": "This is the title",
+                      "username": "Owner",
+                      "pathAlias": "klink692",
+                      "realname": "Owner",
+                      "license": 0,
+                      "ownerNsid": "59729010@N00",
+                      "canComment": false,
+                      "commentCount": 14,
+                      "faveCount": 21,
+                      "id": "14001294434",
+                      "sizes": {
+                      }
+                    }
+                  ]
+                }
+              }
+            ]
+          }
+        }
         """
+        json = ''.join(json.split()) + ',\n'
         response = mock.Mock(text=json)
         results = flickr_noapi.response(response)
         self.assertEqual(type(results), list)
@@ -266,51 +324,20 @@ class TestFlickrNoapiEngine(SearxTestCase):
 
         # null test
         json = """
-        "search-photos-models","photos":
-        {
-          "_data": [null],
-          "fetchedStart": true,
-          "fetchedEnd": false,
-          "totalItems": "4386039"
-        },"totalItems":
-        """
-        response = mock.Mock(text=json)
-        results = flickr_noapi.response(response)
-        self.assertEqual(type(results), list)
-        self.assertEqual(len(results), 0)
-
-        # no ownerNsid test
-        json = """
-        "search-photos-lite-models","photos":
-        {
-          "_data": [
-            {
-              "_flickrModelRegistry": "photo-lite-models",
-              "title": "This is the title",
-              "username": "Owner",
-              "pathAlias": "klink692",
-              "realname": "Owner",
-              "license": 0,
-              "canComment": false,
-              "commentCount": 14,
-              "faveCount": 21,
-              "id": "14001294434",
-              "sizes": {
-                "o": {
-                  "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_o.jpg",
-                  "width": 433,
-                  "height": 640,
-                  "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_o.jpg",
-                  "key": "o"
+        modelExport: {
+          "legend": [null],
+          "main": {
+            "search-photos-lite-models": [
+              {
+                "photos": {
+                  "_data": [null]
                 }
               }
-            }
-          ],
-          "fetchedStart": true,
-          "fetchedEnd": false,
-          "totalItems": "4386039"
-        },"totalItems":
+            ]
+          }
+        }
         """
+        json = ''.join(json.split()) + ',\n'
         response = mock.Mock(text=json)
         results = flickr_noapi.response(response)
         self.assertEqual(type(results), list)
@@ -323,6 +350,7 @@ class TestFlickrNoapiEngine(SearxTestCase):
             "link":"http:\/\/www.flickr.com\/artist\/1217","type":"artist"}
         ]}
         """
+        json = ''.join(json.split()) + ',\n'
         response = mock.Mock(text=json)
         results = flickr_noapi.response(response)
         self.assertEqual(type(results), list)