Browse Source

[enh] infobox for wolframalpha

TODO:
    - infobox styles
    - unit tests

ISSUES:
    - The no_api version needs to re-call the server for additional pods, such
      as plots; therefore, it's even slower than before. If requests reach the
      timeout, either comment out the part that calls get_async_pod or
      increase the timeout in settings.yml.
a01200356 9 years ago
parent
commit
78d3f3d6b1
3 changed files with 150 additions and 46 deletions
  1. 61 23
      searx/engines/wolframalpha_api.py
  2. 86 20
      searx/engines/wolframalpha_noapi.py
  3. 3 3
      searx/settings.yml

+ 61 - 23
searx/engines/wolframalpha_api.py

@@ -1,40 +1,56 @@
-# Wolfram Alpha (Maths)
+# Wolfram Alpha (Science)
 #
 #
-# @website     http://www.wolframalpha.com
+# @website     https://www.wolframalpha.com
-# @provide-api yes (http://api.wolframalpha.com/v2/)
+# @provide-api yes (https://api.wolframalpha.com/v2/)
 #
 #
 # @using-api   yes
 # @using-api   yes
 # @results     XML
 # @results     XML
 # @stable      yes
 # @stable      yes
-# @parse       result
+# @parse       url, infobox
 
 
 from urllib import urlencode
 from urllib import urlencode
 from lxml import etree
 from lxml import etree
-from re import search
 
 
 # search-url
 # search-url
-base_url = 'http://api.wolframalpha.com/v2/query'
+search_url = 'https://api.wolframalpha.com/v2/query?appid={api_key}&{query}'
-search_url = base_url + '?appid={api_key}&{query}&format=plaintext'
+site_url = 'https://www.wolframalpha.com/input/?{query}'
-site_url = 'http://www.wolframalpha.com/input/?{query}'
 api_key = ''  # defined in settings.yml
 api_key = ''  # defined in settings.yml
 
 
 # xpath variables
 # xpath variables
 failure_xpath = '/queryresult[attribute::success="false"]'
 failure_xpath = '/queryresult[attribute::success="false"]'
 answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext'
 answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext'
 input_xpath = '//pod[starts-with(attribute::title, "Input")]/subpod/plaintext'
 input_xpath = '//pod[starts-with(attribute::title, "Input")]/subpod/plaintext'
+pods_xpath = '//pod'
+subpods_xpath = './subpod'
+pod_title_xpath = './@title'
+plaintext_xpath = './plaintext'
+image_xpath = './img'
+img_src_xpath = './@src'
+img_alt_xpath = './@alt'
+
+# pods to display as image in infobox
+# these pods do return plaintext, but they look better and are more useful as images
+image_pods = {'Visual representation',
+              'Manipulatives illustration'}
 
 
 
 
 # do search-request
 # do search-request
 def request(query, params):
 def request(query, params):
     params['url'] = search_url.format(query=urlencode({'input': query}),
     params['url'] = search_url.format(query=urlencode({'input': query}),
                                       api_key=api_key)
                                       api_key=api_key)
+    params['headers']['Referer'] = site_url.format(query=urlencode({'i': query}))
 
 
     return params
     return params
 
 
 
 
 # replace private use area characters to make text legible
 # replace private use area characters to make text legible
 def replace_pua_chars(text):
 def replace_pua_chars(text):
-    pua_chars = {u'\uf74c': 'd',
+    pua_chars = {u'\uf522': u'\u2192',
+                 u'\uf7b1': u'\u2115',
+                 u'\uf7b4': u'\u211a',
+                 u'\uf7b5': u'\u211d',
+                 u'\uf7bd': u'\u2124',
+                 u'\uf74c': 'd',
                  u'\uf74d': u'\u212f',
                  u'\uf74d': u'\u212f',
                  u'\uf74e': 'i',
                  u'\uf74e': 'i',
                  u'\uf7d9': '='}
                  u'\uf7d9': '='}
@@ -55,23 +71,45 @@ def response(resp):
     if search_results.xpath(failure_xpath):
     if search_results.xpath(failure_xpath):
         return []
         return []
 
 
-    # parse answers
+    infobox_title = search_results.xpath(input_xpath)
-    answers = search_results.xpath(answer_xpath)
+    if infobox_title:
-    if answers:
+        infobox_title = replace_pua_chars(infobox_title[0].text)
-        for answer in answers:
+
-            answer = replace_pua_chars(answer.text)
+    pods = search_results.xpath(pods_xpath)
+    result_chunks = []
+    for pod in pods:
+        pod_title = replace_pua_chars(pod.xpath(pod_title_xpath)[0])
+
+        subpods = pod.xpath(subpods_xpath)
+        if not subpods:
+            continue
+
+        for subpod in subpods:
+            content = subpod.xpath(plaintext_xpath)[0].text
+            image = subpod.xpath(image_xpath)
+            if content and pod_title not in image_pods:
+                content = replace_pua_chars(content)
+                result_chunks.append({'label': pod_title, 'value': content})
 
 
-            results.append({'answer': answer})
+                # if there's no input pod, infobox_title is content of first pod
+                if not infobox_title:
+                    infobox_title = content
+
+            elif image:
+                result_chunks.append({'label': pod_title,
+                                      'image': {'src': image[0].xpath(img_src_xpath)[0],
+                                                'alt': image[0].xpath(img_alt_xpath)[0]}})
+
+    if not result_chunks:
+        return []
 
 
-    # if there's no input section in search_results, check if answer has the input embedded (before their "=" sign)
+    results.append({'infobox': infobox_title,
-    try:
+                    'attributes': result_chunks,
-        query_input = search_results.xpath(input_xpath)[0].text
+                    'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]})
-    except IndexError:
-        query_input = search(u'([^\uf7d9]+)', answers[0].text).group(1)
 
 
     # append link to site
     # append link to site
-    result_url = site_url.format(query=urlencode({'i': query_input.encode('utf-8')}))
+    results.append({'url': resp.request.headers['Referer'],
-    results.append({'url': result_url,
+                    'title': 'Wolfram|Alpha',
-                    'title': query_input + " - Wolfram|Alpha"})
+                    'content': infobox_title})
 
 
     return results
     return results

+ 86 - 20
searx/engines/wolframalpha_noapi.py

@@ -1,23 +1,23 @@
-# WolframAlpha (Maths)
+# Wolfram|Alpha (Science)
 #
 #
-# @website     http://www.wolframalpha.com/
+# @website     https://www.wolframalpha.com/
-# @provide-api yes (http://api.wolframalpha.com/v2/)
+# @provide-api yes (https://api.wolframalpha.com/v2/)
 #
 #
 # @using-api   no
 # @using-api   no
-# @results     HTML
+# @results     JSON
 # @stable      no
 # @stable      no
-# @parse       answer
+# @parse       url, infobox
 
 
 from cgi import escape
 from cgi import escape
 from json import loads
 from json import loads
 from time import time
 from time import time
 from urllib import urlencode
 from urllib import urlencode
+from lxml.etree import XML
 
 
 from searx.poolrequests import get as http_get
 from searx.poolrequests import get as http_get
 
 
 # search-url
 # search-url
 url = 'https://www.wolframalpha.com/'
 url = 'https://www.wolframalpha.com/'
-search_url = url + 'input/?{query}'
 
 
 search_url = url + 'input/json.jsp'\
 search_url = url + 'input/json.jsp'\
     '?async=true'\
     '?async=true'\
@@ -33,13 +33,25 @@ search_url = url + 'input/json.jsp'\
     '&sponsorcategories=true'\
     '&sponsorcategories=true'\
     '&statemethod=deploybutton'
     '&statemethod=deploybutton'
 
 
-# xpath variables
+referer_url = url + 'input/?{query}'
-scripts_xpath = '//script'
+
-title_xpath = '//title'
-failure_xpath = '//p[attribute::class="pfail"]'
 token = {'value': '',
 token = {'value': '',
          'last_updated': None}
          'last_updated': None}
 
 
+# xpath variables
+success_xpath = '/pod[attribute::error="false"]'
+plaintext_xpath = './plaintext'
+title_xpath = './@title'
+image_xpath = './img'
+img_src_xpath = './img/@src'
+img_alt_xpath = './img/@alt'
+
+# pods to display as image in infobox
+# these pods do return plaintext, but they look better and are more useful as images
+image_pods = {'Visual representation',
+              'Manipulatives illustration',
+              'Symbol'}
+
 
 
 # it seems wolframalpha resets its token every hour
 # it seems wolframalpha resets its token every hour
 def obtain_token():
 def obtain_token():
@@ -62,13 +74,42 @@ def request(query, params):
     if time() - token['last_updated'] > 3600:
     if time() - token['last_updated'] > 3600:
         obtain_token()
         obtain_token()
     params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
     params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
-    params['headers']['Referer'] = 'https://www.wolframalpha.com/input/?i=' + query
+    params['headers']['Referer'] = referer_url.format(query=urlencode({'i': query}))
 
 
     return params
     return params
 
 
 
 
+# get additional pod
+# NOTE: this makes an additional request to the server, so the response will take longer and might reach the timeout
+def get_async_pod(url):
+    pod = {'subpods': []}
+
+    try:
+        resp = http_get(url, timeout=2.0)
+
+        resp_pod = XML(resp.content)
+        if resp_pod.xpath(success_xpath):
+
+            for subpod in resp_pod:
+                plaintext = subpod.xpath(plaintext_xpath)[0].text
+                if plaintext:
+                    pod['subpods'].append({'title': subpod.xpath(title_xpath)[0],
+                                           'plaintext': plaintext})
+                elif subpod.xpath(image_xpath):
+                    pod['subpods'].append({'title': subpod.xpath(title_xpath)[0],
+                                           'plaintext': '',
+                                           'img': {'src': subpod.xpath(img_src_xpath)[0],
+                                                   'alt': subpod.xpath(img_alt_xpath)[0]}})
+    except:
+        pass
+
+    return pod
+
+
 # get response from search-request
 # get response from search-request
 def response(resp):
 def response(resp):
+    results = []
+
     resp_json = loads(resp.text)
     resp_json = loads(resp.text)
 
 
     if not resp_json['queryresult']['success']:
     if not resp_json['queryresult']['success']:
@@ -76,20 +117,45 @@ def response(resp):
 
 
     # TODO handle resp_json['queryresult']['assumptions']
     # TODO handle resp_json['queryresult']['assumptions']
     result_chunks = []
     result_chunks = []
+    infobox_title = None
     for pod in resp_json['queryresult']['pods']:
     for pod in resp_json['queryresult']['pods']:
         pod_title = pod.get('title', '')
         pod_title = pod.get('title', '')
+
         if 'subpods' not in pod:
         if 'subpods' not in pod:
-            continue
+            # comment out this section if your requests always reach the timeout
+            if pod['async']:
+                result = get_async_pod(pod['async'])
+                if result:
+                    pod = result
+            else:
+                continue
+
+        # infobox title is input or text content on first pod
+        if pod_title.startswith('Input') or not infobox_title:
+            try:
+                infobox_title = pod['subpods'][0]['plaintext']
+            except:
+                infobox_title = ''
+                pass
+
         for subpod in pod['subpods']:
         for subpod in pod['subpods']:
-            if 'img' in subpod:
+            if subpod['plaintext'] != '' and pod_title not in image_pods:
-                result_chunks.append(u'<p>{0}<br /><img src="{1}" alt="{2}" /></p>'
+                # append unless it's not an actual answer
-                                     .format(escape(pod_title or subpod['img']['alt']),
+                if subpod['plaintext'] != '(requires interactivity)':
-                                             escape(subpod['img']['src']),
+                    result_chunks.append({'label': pod_title, 'value': subpod['plaintext']})
-                                             escape(subpod['img']['alt'])))
+
+            elif 'img' in subpod:
+                result_chunks.append({'label': pod_title, 'image': subpod['img']})
 
 
     if not result_chunks:
     if not result_chunks:
         return []
         return []
 
 
-    return [{'url': resp.request.headers['Referer'].decode('utf-8'),
+    results.append({'infobox': infobox_title,
-             'title': 'Wolframalpha',
+                    'attributes': result_chunks,
-             'content': ''.join(result_chunks)}]
+                    'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]})
+
+    results.append({'url': resp.request.headers['Referer'],
+                    'title': 'Wolfram|Alpha',
+                    'content': infobox_title})
+
+    return results

+ 3 - 3
searx/settings.yml

@@ -310,10 +310,10 @@ engines:
     shortcut : wa
     shortcut : wa
     # You can use the engine using the official stable API, but you need an API key
     # You can use the engine using the official stable API, but you need an API key
     # See : http://products.wolframalpha.com/api/
     # See : http://products.wolframalpha.com/api/
-    #    engine : wolframalpha_api
+    # engine : wolframalpha_api
-    #    api_key: 'apikey' # required!
+    # api_key: 'apikey' # required! (placeholder -- never commit a real key)
     engine : wolframalpha_noapi
     engine : wolframalpha_noapi
-    timeout: 6.0
+    timeout: 10.0
     categories : science
     categories : science
 
 
 #The blekko technology and team have joined IBM Watson! -> https://blekko.com/
 #The blekko technology and team have joined IBM Watson! -> https://blekko.com/