Browse Source

[mod] ddg engine mods

Adam Tauber 11 years ago
parent
commit
3854703d95
1 changed file with 45 additions and 13 deletions
  1. 45 13
      searx/engines/duckduckgo.py

+ 45 - 13
searx/engines/duckduckgo.py

@@ -1,29 +1,61 @@
-from json import loads
 from urllib import urlencode
+from lxml.html import fromstring
 from searx.utils import html_to_text
 
-url = 'https://duckduckgo.com/'
-search_url = url + 'd.js?{query}&p=1&s={offset}'
+url = 'https://duckduckgo.com/html?{query}&s={offset}'
 locale = 'us-en'
 
-paging = True
-
-
 def request(query, params):
     offset = (params['pageno'] - 1) * 30
     q = urlencode({'q': query,
                    'l': locale})
-    params['url'] = search_url.format(query=q, offset=offset)
+    params['url'] = url.format(query=q, offset=offset)
     return params
 
 
 def response(resp):
+    result_xpath = '//div[@class="results_links results_links_deep web-result"]'
+    url_xpath = './/a[@class="large"]/@href'
+    title_xpath = './/a[@class="large"]//text()'
+    content_xpath = './/div[@class="snippet"]//text()'
     results = []
-    search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1]
-    for r in search_res:
-        if not r.get('t'):
+
+    doc = fromstring(resp.text)
+
+    for r in doc.xpath(result_xpath):
+        res_url = r.xpath(url_xpath)[-1]
+        if not res_url:
             continue
-        results.append({'title': r['t'],
-                       'content': html_to_text(r['a']),
-                       'url': r['u']})
+        title = html_to_text(''.join(r.xpath(title_xpath)))
+        content = html_to_text(''.join(r.xpath(content_xpath)))
+        results.append({'title': title,
+                        'content': content,
+                        'url': res_url})
+
     return results
+
+
+#from json import loads
+#search_url = url + 'd.js?{query}&p=1&s={offset}'
+#
+#paging = True
+#
+#
+#def request(query, params):
+#    offset = (params['pageno'] - 1) * 30
+#    q = urlencode({'q': query,
+#                   'l': locale})
+#    params['url'] = search_url.format(query=q, offset=offset)
+#    return params
+#
+#
+#def response(resp):
+#    results = []
+#    search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1]
+#    for r in search_res:
+#        if not r.get('t'):
+#            continue
+#        results.append({'title': r['t'],
+#                       'content': html_to_text(r['a']),
+#                       'url': r['u']})
+#    return results