Browse Source

add duckduckgo images engine

marc 8 years ago
parent
commit
c65a409f0d

+ 35 - 30
searx/engines/duckduckgo.py

@@ -41,46 +41,51 @@ title_xpath = './/a[@class="result__a"]'
 content_xpath = './/a[@class="result__snippet"]'
 
 
-# do search-request
-def request(query, params):
-    if params['time_range'] and params['time_range'] not in time_range_dict:
-        return params
-
-    offset = 30 + (params['pageno'] - 1) * 50
-    dc_param = offset + 1
-
+# map a searx language code (e.g. 'en-US') to a duckduckgo region code (e.g. 'us-en'); 'all' gives None
+def get_region_code(lang):
     # custom fixes for languages
-    if params['language'] == 'all':
-        locale = None
-    elif params['language'][:2] == 'ja':
-        locale = 'jp-jp'
-    elif params['language'][:2] == 'sl':
-        locale = 'sl-sl'
-    elif params['language'] == 'zh-TW':
-        locale = 'tw-tzh'
-    elif params['language'] == 'zh-HK':
-        locale = 'hk-tzh'
-    elif params['language'][-2:] == 'SA':
-        locale = 'xa-' + params['language'].split('-')[0]
-    elif params['language'][-2:] == 'GB':
-        locale = 'uk-' + params['language'].split('-')[0]
+    if lang == 'all':
+        region_code = None
+    elif lang[:2] == 'ja':
+        region_code = 'jp-jp'
+    elif lang[:2] == 'sl':
+        region_code = 'sl-sl'
+    elif lang == 'zh-TW':
+        region_code = 'tw-tzh'
+    elif lang == 'zh-HK':
+        region_code = 'hk-tzh'
+    elif lang[-2:] == 'SA':
+        region_code = 'xa-' + lang.split('-')[0]
+    elif lang[-2:] == 'GB':
+        region_code = 'uk-' + lang.split('-')[0]
     else:
-        locale = params['language'].split('-')
-        if len(locale) == 2:
+        region_code = lang.split('-')
+        if len(region_code) == 2:
             # country code goes first
-            locale = locale[1].lower() + '-' + locale[0].lower()
+            region_code = region_code[1].lower() + '-' + region_code[0].lower()
         else:
             # tries to get a country code from language
-            locale = locale[0].lower()
+            region_code = region_code[0].lower()
             for lc in supported_languages:
                 lc = lc.split('-')
-                if locale == lc[0]:
-                    locale = lc[1].lower() + '-' + lc[0].lower()
+                if region_code == lc[0]:
+                    region_code = lc[1].lower() + '-' + lc[0].lower()
                     break
+    return region_code
+
+
+# do search-request
+def request(query, params):
+    if params['time_range'] and params['time_range'] not in time_range_dict:
+        return params
+
+    offset = 30 + (params['pageno'] - 1) * 50
+    dc_param = offset + 1
 
-    if locale:
+    region_code = get_region_code(params['language'])
+    if region_code:
         params['url'] = url.format(
-            query=urlencode({'q': query, 'kl': locale}), offset=offset, dc_param=dc_param)
+            query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=dc_param)
     else:
         params['url'] = url.format(
             query=urlencode({'q': query}), offset=offset, dc_param=dc_param)

+ 91 - 0
searx/engines/duckduckgo_images.py

@@ -0,0 +1,91 @@
+"""
+ DuckDuckGo (Images)
+
+ @website     https://duckduckgo.com/
+ @provide-api yes (https://duckduckgo.com/api),
+              but images are not supported
+
+ @using-api   no
+ @results     JSON (site requires js to get images)
+ @stable      no (JSON can change)
+ @parse       url, title, img_src
+
+ @todo        avoid extra request
+"""
+
+from requests import get
+from json import loads
+from searx.engines.xpath import extract_text
+from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, get_region_code
+from searx.url_utils import urlencode
+
+# engine dependent config
+categories = ['images']
+paging = True
+language_support = True
+safesearch = True
+
+# search-url: images_url is the JSON endpoint, site_url is the page get_vqd() reads the vqd token from
+images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'
+site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images'
+
+
+# run query in site to get the vqd token needed for requesting images; returns '' if the page has no token
+# TODO: find a way to get this number without an extra request (is it a hash of the query?)
+def get_vqd(query):
+    res = get(site_url.format(query=urlencode({'q': query})))
+    # partition() reports a missing marker explicitly; find() returned -1 and sliced garbage out of the page
+    _, marker, tail = res.text.partition('vqd=\'')
+    # no marker -> no token; otherwise the token is everything up to the closing quote
+    return tail.partition('\'')[0] if marker else ''
+
+
+# build the image-search url (i.js) for this query and store it in params['url']
+def request(query, params):
+    # tests set 'is_test' so that no external request is made for the vqd token
+    vqd = '12345' if 'is_test' in params else get_vqd(query)
+
+    format_args = {
+        # the s parameter advances by 50 per page
+        'offset': (params['pageno'] - 1) * 50,
+        # the p parameter is the safesearch setting minus one
+        'safesearch': params['safesearch'] - 1,
+        'vqd': vqd,
+    }
+
+    # the region goes into the l parameter; it is left out when there is no region code
+    query_args = {'q': query}
+    region_code = get_region_code(params['language'])
+    if region_code:
+        query_args['l'] = region_code
+
+    params['url'] = images_url.format(query=urlencode(query_args), **format_args)
+    return params
+
+
+# get response from search-request
+# returns a list of image results; a body that is not valid JSON (e.g. an error page) yields []
+def response(resp):
+    results = []
+
+    content = resp.text
+    try:
+        res_json = loads(content)
+    except ValueError:
+        # loads() reports a malformed body as ValueError (JSONDecodeError is a subclass);
+        # a bare except would also swallow KeyboardInterrupt and SystemExit
+        return []
+
+    # parse results
+    for result in res_json['results']:
+        # append result; every key read below is required, a missing one raises KeyError
+        results.append({
+            'template': 'images.html',
+            'title': result['title'],
+            'content': '',
+            'thumbnail_src': result['thumbnail'],
+            'img_src': result['image'],
+            'url': result['url'],
+        })
+
+    return results

+ 6 - 0
searx/settings.yml

@@ -167,6 +167,12 @@ engines:
     shortcut : ddg
     disabled : True
 
+  - name : duckduckgo images
+    engine : duckduckgo_images
+    shortcut : ddi
+    timeout: 3.0
+    disabled : True
+
   - name : etymonline
     engine : xpath
     paging : True

+ 72 - 0
tests/unit/engines/test_duckduckgo_images.py

@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+from collections import defaultdict
+import mock
+from searx.engines import duckduckgo_images
+from searx.testing import SearxTestCase
+
+
+class TestDuckduckgoImagesEngine(SearxTestCase):
+    """Unit tests for the duckduckgo_images engine: url building and response parsing."""
+    def test_request(self):
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        dicto['is_test'] = True
+        dicto['pageno'] = 1
+        dicto['safesearch'] = 0
+        dicto['language'] = 'all'
+        params = duckduckgo_images.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertIn(query, params['url'])
+        self.assertIn('duckduckgo.com', params['url'])
+        self.assertIn('s=0', params['url'])
+        self.assertIn('p=-1', params['url'])
+        self.assertIn('vqd=12345', params['url'])
+
+        # test paging and safe search: page 2 must give s=50, safesearch 2 must give p=1
+        dicto['pageno'] = 2
+        dicto['safesearch'] = 2
+        params = duckduckgo_images.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertIn(query, params['url'])
+        self.assertIn('s=50', params['url'])
+        self.assertIn('p=1', params['url'])
+
+    def test_response(self):
+        self.assertRaises(AttributeError, duckduckgo_images.response, None)
+        self.assertRaises(AttributeError, duckduckgo_images.response, [])
+        self.assertRaises(AttributeError, duckduckgo_images.response, '')
+        self.assertRaises(AttributeError, duckduckgo_images.response, '[]')
+        # a body that is not JSON (here an error message) must give an empty result list
+        response = mock.Mock(text='If this error persists, please let us know: ops@duckduckgo.com')
+        self.assertEqual(duckduckgo_images.response(response), [])
+        # a well-formed JSON body must give one result per entry, with the fields mapped over
+        json = u"""
+        {
+            "query": "test_query",
+            "results": [
+                {
+                    "title": "Result 1",
+                    "url": "https://site1.url",
+                    "thumbnail": "https://thumb1.nail",
+                    "image": "https://image1"
+                },
+                {
+                    "title": "Result 2",
+                    "url": "https://site2.url",
+                    "thumbnail": "https://thumb2.nail",
+                    "image": "https://image2"
+                }
+            ]
+        }
+        """
+        response = mock.Mock(text=json)
+        results = duckduckgo_images.response(response)
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['title'], 'Result 1')
+        self.assertEqual(results[0]['url'], 'https://site1.url')
+        self.assertEqual(results[0]['thumbnail_src'], 'https://thumb1.nail')
+        self.assertEqual(results[0]['img_src'], 'https://image1')
+        self.assertEqual(results[1]['title'], 'Result 2')
+        self.assertEqual(results[1]['url'], 'https://site2.url')
+        self.assertEqual(results[1]['thumbnail_src'], 'https://thumb2.nail')
+        self.assertEqual(results[1]['img_src'], 'https://image2')