Merge pull request #88 from pointhi/engines

update and fix search engines
Adam Tauber 10 years ago
parent commit f36d1e28fa

+ 39 - 6
searx/engines/bing.py

@@ -1,48 +1,81 @@
+## Bing (Web)
+#
+# @website     https://www.bing.com
+# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
+#
+# @using-api   no (because of query limit)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content
+#
+# @todo        publishedDate
+
 from urllib import urlencode
 from cgi import escape
 from lxml import html
 
-base_url = 'http://www.bing.com/'
-search_string = 'search?{query}&first={offset}'
+# engine dependent config
+categories = ['general']
 paging = True
 language_support = True
 
+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'search?{query}&first={offset}'
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en-US'
     else:
         language = params['language'].replace('_', '-')
+
     search_path = search_string.format(
         query=urlencode({'q': query, 'setmkt': language}),
         offset=offset)
 
     params['cookies']['SRCHHPGUSR'] = \
         'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
-    #if params['category'] == 'images':
-    #    params['url'] = base_url + 'images/' + search_path
+
     params['url'] = base_url + search_path
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.content)
+
+    # parse results
     for result in dom.xpath('//div[@class="sa_cc"]'):
         link = result.xpath('.//h3/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
         content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
 
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # return results if something is found
     if results:
         return results
 
+    # parse results again if nothing is found yet
     for result in dom.xpath('//li[@class="b_algo"]'):
         link = result.xpath('.//h2/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
         content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # return results
     return results
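
Note on the paging arithmetic above: Bing's first parameter is 1-based, so (pageno - 1) * 10 + 1 gives first=1 for page 1 and first=11 for page 2. A minimal sketch of the URL request() builds, with illustrative values (Python 2 import as in the engine; parameter order inside the query string may vary):

    # illustrative sketch of the request() logic above
    from urllib import urlencode

    def bing_url(query, pageno, language='en-US'):
        offset = (pageno - 1) * 10 + 1
        search_path = 'search?{query}&first={offset}'.format(
            query=urlencode({'q': query, 'setmkt': language}),
            offset=offset)
        return 'https://www.bing.com/' + search_path

    # bing_url('searx', 2) -> roughly 'https://www.bing.com/search?q=searx&setmkt=en-US&first=11'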

+ 81 - 0
searx/engines/bing_images.py

@@ -0,0 +1,81 @@
+## Bing (Images)
+# 
+# @website     https://www.bing.com/images
+# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
+# 
+# @using-api   no (because of query limit)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, img_src
+#
+# @todo        currently there are up to 35 images received per page, because bing does not parse count=10; response is limited to 10 images
+
+from urllib import urlencode
+from cgi import escape
+from lxml import html
+from yaml import load
+import re
+
+# engine dependent config
+categories = ['images']
+paging = True
+
+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'images/search?{query}&count=10&first={offset}'
+
+# do search-request
+def request(query, params):
+    offset = (params['pageno'] - 1) * 10 + 1
+
+    # required for cookie
+    language = 'en-US'
+
+    search_path = search_string.format(
+        query=urlencode({'q': query}),
+        offset=offset)
+
+    params['cookies']['SRCHHPGUSR'] = \
+        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
+
+    params['url'] = base_url + search_path
+
+    print(params['url'])
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.content)
+
+    # init regex for yaml-parsing
+    p = re.compile( '({|,)([a-z]+):(")')
+
+    # parse results
+    for result in dom.xpath('//div[@class="dg_u"]'):
+        link = result.xpath('./a')[0]
+
+        # parse yaml-data (it is required to add a space, to make it parsable)
+        yaml_data = load(p.sub( r'\1\2: \3', link.attrib.get('m')))
+ 
+        title = link.attrib.get('t1')
+        #url = 'http://' + link.attrib.get('t3')
+        url = yaml_data.get('surl')
+        img_src = yaml_data.get('imgurl')
+
+        # append result
+        results.append({'template': 'images.html',
+                        'url': url,
+                        'title': title,
+                        'content': '',  
+                        'img_src': img_src})
+
+        # TODO stop parsing if 10 images are found
+        if len(results) >= 10:
+            break
+
+    # return results
+    return results
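
The m attribute on each result link holds JSON-like metadata whose keys are unquoted and have no space after the colon; the regex above inserts that space so the string parses as a YAML flow mapping. A minimal sketch with a made-up attribute value (the real markup may differ):

    # illustrative only - the attribute value is invented, but has the shape the regex targets
    import re
    from yaml import load

    p = re.compile('({|,)([a-z]+):(")')
    m_attr = '{ns:"images",surl:"http://example.org/page",imgurl:"http://example.org/img.jpg"}'
    data = load(p.sub(r'\1\2: \3', m_attr))
    # data.get('surl')   -> 'http://example.org/page'
    # data.get('imgurl') -> 'http://example.org/img.jpg'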

+ 53 - 17
searx/engines/bing_news.py

@@ -1,50 +1,86 @@
+## Bing (News)
+#
+# @website     https://www.bing.com/news
+# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
+#
+# @using-api   no (because of query limit)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, publishedDate
+
 from urllib import urlencode
 from cgi import escape
 from lxml import html
+from datetime import datetime, timedelta
+from dateutil import parser
+import re
 
+# engine dependent config
 categories = ['news']
-
-base_url = 'http://www.bing.com/'
-search_string = 'news/search?{query}&first={offset}'
 paging = True
 language_support = True
 
+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'news/search?{query}&first={offset}'
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en-US'
     else:
         language = params['language'].replace('_', '-')
+
     search_path = search_string.format(
         query=urlencode({'q': query, 'setmkt': language}),
         offset=offset)
 
     params['cookies']['SRCHHPGUSR'] = \
         'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
-    #if params['category'] == 'images':
-    # params['url'] = base_url + 'images/' + search_path
+
     params['url'] = base_url + search_path
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.content)
-    for result in dom.xpath('//div[@class="sa_cc"]'):
-        link = result.xpath('.//h3/a')[0]
+
+    # parse results
+    for result in dom.xpath('//div[@class="sn_r"]'):
+        link = result.xpath('.//div[@class="newstitle"]/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
-        content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
+        content = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]//text()')))
+
+        # parse publishedDate
+        publishedDate = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_ST"]//span[@class="sn_tm"]//text()')))
 
-    if results:
-        return results
+        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
+            timeNumbers = re.findall(r'\d+', publishedDate)
+            publishedDate = datetime.now()\
+                - timedelta(minutes=int(timeNumbers[0]))
+        elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
+            timeNumbers = re.findall(r'\d+', publishedDate)
+            publishedDate = datetime.now()\
+                - timedelta(hours=int(timeNumbers[0]))
+        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
+            timeNumbers = re.findall(r'\d+', publishedDate)
+            publishedDate = datetime.now()\
+                - timedelta(hours=int(timeNumbers[0]))\
+                - timedelta(minutes=int(timeNumbers[1]))
+        else:
+            publishedDate = parser.parse(publishedDate)
 
-    for result in dom.xpath('//li[@class="b_algo"]'):
-        link = result.xpath('.//h2/a')[0]
-        url = link.attrib.get('href')
-        title = ' '.join(link.xpath('.//text()'))
-        content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'publishedDate': publishedDate,
+                        'content': content})
+
+    # return results
     return results
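
Bing News renders timestamps as relative strings ("12 minutes ago", "3 hours ago", "1 hour, 25 minutes ago"); the branches above convert them into absolute datetimes and fall back to dateutil for anything else. One branch, reduced to a minimal sketch:

    # illustrative only - same idea as the minutes branch above
    import re
    from datetime import datetime, timedelta

    published = '32 minutes ago'
    if re.match("^[0-9]+ minute(s|) ago$", published):
        minutes = int(re.findall(r'\d+', published)[0])
        published = datetime.now() - timedelta(minutes=minutes)   # roughly 32 minutes before now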

+ 35 - 19
searx/engines/dailymotion.py

@@ -1,45 +1,61 @@
+## Dailymotion (Videos)
+#
+# @website     https://www.dailymotion.com
+# @provide-api yes (http://www.dailymotion.com/developer)
+#
+# @using-api   yes
+# @results     JSON
+# @stable      yes
+# @parse       url, title, thumbnail
+#
+# @todo        set content-parameter with correct data
+
 from urllib import urlencode
 from json import loads
 from lxml import html
 
+# engine dependent config
 categories = ['videos']
 locale = 'en_US'
+paging = True
 
+# search-url
 # see http://www.dailymotion.com/doc/api/obj-video.html
-search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=25&page={pageno}&{query}'  # noqa
-
-# TODO use video result template
-content_tpl = '<a href="{0}" title="{0}" ><img src="{1}" /></a><br />'
-
-paging = True
+search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=5&page={pageno}&{query}'  # noqa
 
 
+# do search-request
 def request(query, params):
     params['url'] = search_url.format(
         query=urlencode({'search': query, 'localization': locale}),
         pageno=params['pageno'])
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
+
+    # return empty array if there are no results
     if not 'list' in search_res:
-        return results
+        return []
+
+    # parse results
     for res in search_res['list']:
         title = res['title']
         url = res['url']
-        if res['thumbnail_360_url']:
-            content = content_tpl.format(url, res['thumbnail_360_url'])
-        else:
-            content = ''
-        if res['description']:
-            description = text_content_from_html(res['description'])
-            content += description[:500]
-        results.append({'url': url, 'title': title, 'content': content})
-    return results
+        #content = res['description']
+        content = ''
+        thumbnail = res['thumbnail_360_url']
 
+        results.append({'template': 'videos.html',
+                        'url': url,
+                        'title': title,
+                        'content': content,
+                        'thumbnail': thumbnail})
 
-def text_content_from_html(html_string):
-    desc_html = html.fragment_fromstring(html_string, create_parent=True)
-    return desc_html.text_content()
+    # return results
+    return results
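
Results are now handed to the videos.html template instead of building inline HTML. A minimal sketch of how one API item is mapped; the field values below are made up, only the field names come from the code above:

    # illustrative only - res mimics one entry of search_res['list']
    res = {'title': 'Some video',
           'url': 'http://www.dailymotion.com/video/x123',
           'thumbnail_360_url': 'http://s1.example.org/x123_360.jpg'}

    result = {'template': 'videos.html',
              'url': res['url'],
              'title': res['title'],
              'content': '',
              'thumbnail': res['thumbnail_360_url']}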

+ 24 - 4
searx/engines/google.py

@@ -1,37 +1,57 @@
-#!/usr/bin/env python
+## Google (Web)
+#
+# @website     https://www.google.com
+# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
+#
+# @using-api   yes
+# @results     JSON
+# @stable      yes (but deprecated)
+# @parse       url, title, content
 
 from urllib import urlencode
 from json import loads
 
+# engine dependent config
 categories = ['general']
+paging = True
+language_support = True
 
+# search-url
 url = 'https://ajax.googleapis.com/'
 search_url = url + 'ajax/services/search/web?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa
 
-paging = True
-language_support = True
-
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 8
+
     language = 'en-US'
     if params['language'] != 'all':
         language = params['language'].replace('_', '-')
+
     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'q': query}),
                                       language=language)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
 
+    # return empty array if there are no results
     if not search_res.get('responseData', {}).get('results'):
         return []
 
+    # parse results
     for result in search_res['responseData']['results']:
+        # append result
         results.append({'url': result['unescapedUrl'],
                         'title': result['titleNoFormatting'],
                         'content': result['content']})
+
+    # return results
     return results
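
The AJAX web-search API pages in steps of 8, so request() maps pageno onto the start parameter as (pageno - 1) * 8. A minimal sketch of the offset and language handling (Python 2 import, matching the engine; illustrative values):

    # illustrative only - mirrors the offset/language handling in request() above
    from urllib import urlencode

    pageno, lang = 2, 'de_DE'
    offset = (pageno - 1) * 8              # 8 -> second page of 8 results
    language = lang.replace('_', '-')      # 'de-DE'
    query = urlencode({'q': 'searx'})      # 'q=searx'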

+ 26 - 5
searx/engines/google_images.py

@@ -1,37 +1,58 @@
-#!/usr/bin/env python
+## Google (Images)
+#
+# @website     https://www.google.com
+# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
+#
+# @using-api   yes
+# @results     JSON
+# @stable      yes (but deprecated)
+# @parse       url, title, img_src
 
 from urllib import urlencode
 from json import loads
 
+# engine dependent config
 categories = ['images']
+paging = True
 
+# search-url
 url = 'https://ajax.googleapis.com/'
 search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe=off&filter=off&{query}'  # noqa
 
-paging = True
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 8
+
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       offset=offset)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
-    if not search_res.get('responseData'):
-        return []
-    if not search_res['responseData'].get('results'):
+
+    # return empty array if there are no results
+    if not search_res.get('responseData', {}).get('results'):
         return []
+
+    # parse results
     for result in search_res['responseData']['results']:
         href = result['originalContextUrl']
         title = result['title']
         if not result['url']:
             continue
+
+        # append result
         results.append({'url': href,
                         'title': title,
                         'content': '',
                         'img_src': result['url'],
                         'template': 'images.html'})
+
+    # return results
     return results
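
The old pair of checks is folded into one chained .get() with a default, so a missing responseData no longer raises and an empty result list is still caught. A minimal sketch:

    # illustrative only - why the chained .get() covers both former checks
    search_res = {}                                    # no responseData at all
    assert not search_res.get('responseData', {}).get('results')

    search_res = {'responseData': {'results': []}}     # responseData present, no results
    assert not search_res.get('responseData', {}).get('results')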

+ 25 - 6
searx/engines/google_news.py

@@ -1,43 +1,62 @@
-#!/usr/bin/env python
+## Google (News)
+#
+# @website     https://www.google.com
+# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
+#
+# @using-api   yes
+# @results     JSON
+# @stable      yes (but deprecated)
+# @parse       url, title, content, publishedDate
 
 from urllib import urlencode
 from json import loads
 from dateutil import parser
 
+# engine dependent config
 categories = ['news']
+paging = True
+language_support = True
 
+# search-url
 url = 'https://ajax.googleapis.com/'
 search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa
 
-paging = True
-language_support = True
-
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 8
+
     language = 'en-US'
     if params['language'] != 'all':
         language = params['language'].replace('_', '-')
+
     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'q': query}),
                                       language=language)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
 
+    # return empty array if there are no results
     if not search_res.get('responseData', {}).get('results'):
         return []
 
+    # parse results
     for result in search_res['responseData']['results']:
-
-# Mon, 10 Mar 2014 16:26:15 -0700
+        # parse publishedDate
         publishedDate = parser.parse(result['publishedDate'])
 
+        # append result
         results.append({'url': result['unescapedUrl'],
                         'title': result['titleNoFormatting'],
                         'publishedDate': publishedDate,
                         'content': result['content']})
+
+    # return results
     return results
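
The API returns publishedDate as an RFC 2822 style string (the removed inline comment showed "Mon, 10 Mar 2014 16:26:15 -0700"), which dateutil parses directly:

    # illustrative only - the date format noted in the old inline comment
    from dateutil import parser
    publishedDate = parser.parse('Mon, 10 Mar 2014 16:26:15 -0700')
    # -> datetime(2014, 3, 10, 16, 26, 15, tzinfo=tzoffset(None, -25200))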

+ 40 - 22
searx/engines/vimeo.py

@@ -1,43 +1,58 @@
+## Vimeo (Videos)
+#
+# @website     https://vimeo.com/
+# @provide-api yes (http://developer.vimeo.com/api), they have a maximum count of queries/hour
+#
+# @using-api   no (TODO, rewrite to api)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, publishedDate, thumbnail
+#
+# @todo        rewrite to api
+# @todo        set content-parameter with correct data
+
 from urllib import urlencode
 from HTMLParser import HTMLParser
 from lxml import html
 from searx.engines.xpath import extract_text
 from dateutil import parser
 
-base_url = 'http://vimeo.com'
-search_url = base_url + '/search?{query}'
-url_xpath = None
-content_xpath = None
-title_xpath = None
-results_xpath = ''
-content_tpl = '<a href="{0}">  <img src="{2}"/> </a>'
-publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
+# engine dependent config
+categories = ['videos']
+paging = True
 
-# the cookie set by vimeo contains all the following values,
-# but only __utma seems to be requiered
-cookie = {
-    #'vuid':'918282893.1027205400'
-    # 'ab_bs':'%7B%223%22%3A279%7D'
-     '__utma': '00000000.000#0000000.0000000000.0000000000.0000000000.0'
-    # '__utmb':'18302654.1.10.1388942090'
-    #, '__utmc':'18302654'
-    #, '__utmz':'18#302654.1388942090.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)'  # noqa
-    #, '__utml':'search'
-}
+# search-url
+base_url = 'https://vimeo.com'
+search_url = base_url + '/search/page:{pageno}?{query}'
+
+# specific xpath variables
+url_xpath = './a/@href'
+content_xpath = './a/img/@src'
+title_xpath = './a/div[@class="data"]/p[@class="title"]/text()'
+results_xpath = '//div[@id="browse_content"]/ol/li'
+publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
 
 
+# do search-request
 def request(query, params):
-    params['url'] = search_url.format(query=urlencode({'q': query}))
-    params['cookies'] = cookie
+    params['url'] = search_url.format(pageno=params['pageno'],
+                                      query=urlencode({'q': query}))
+
+    # TODO required?
+    params['cookies']['__utma'] = '00000000.000#0000000.0000000000.0000000000.0000000000.0'
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
     p = HTMLParser()
 
+    # parse results
     for result in dom.xpath(results_xpath):
         url = base_url + result.xpath(url_xpath)[0]
         title = p.unescape(extract_text(result.xpath(title_xpath)))
@@ -45,10 +60,13 @@ def response(resp):
         publishedDate = parser.parse(extract_text(
             result.xpath(publishedDate_xpath)[0]))
 
+        # append result
         results.append({'url': url,
                         'title': title,
-                        'content': content_tpl.format(url, title, thumbnail),
+                        'content': '',
                         'template': 'videos.html',
                         'publishedDate': publishedDate,
                         'thumbnail': thumbnail})
+
+    # return results
     return results
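
The xpath selectors that used to live in settings.yml are now part of the engine itself, and paging goes through the search URL. A minimal sketch of the URL request() builds (Python 2 import, matching the engine):

    # illustrative only - mirrors the URL construction in request() above
    from urllib import urlencode

    pageno = 3
    url = 'https://vimeo.com' + '/search/page:{pageno}?{query}'.format(
        pageno=pageno, query=urlencode({'q': 'searx'}))
    # -> 'https://vimeo.com/search/page:3?q=searx'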

+ 41 - 6
searx/engines/yahoo.py

@@ -1,64 +1,99 @@
-#!/usr/bin/env python
+## Yahoo (Web)
+#
+# @website     https://search.yahoo.com/web
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+#
+# @using-api   no (because pricing)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, suggestion
 
 from urllib import urlencode
 from urlparse import unquote
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url
 
+# engine dependent config
 categories = ['general']
-search_url = 'http://search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
 content_xpath = './/div[@class="abstr"]'
 suggestion_xpath = '//div[@id="satat"]//a'
 
-paging = True
-
 
+# remove yahoo-specific tracking-url
 def parse_url(url_string):
     endings = ['/RS', '/RK']
     endpositions = []
     start = url_string.find('http', url_string.find('/RU=')+1)
+
     for ending in endings:
         endpos = url_string.rfind(ending)
         if endpos > -1:
             endpositions.append(endpos)
 
     end = min(endpositions)
+
     return unquote(url_string[start:end])
 
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
+    # parse results
     for result in dom.xpath(results_xpath):
         try:
             url = parse_url(extract_url(result.xpath(url_xpath), search_url))
             title = extract_text(result.xpath(title_xpath)[0])
         except:
             continue
+
         content = extract_text(result.xpath(content_xpath)[0])
-        results.append({'url': url, 'title': title, 'content': content})
 
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # if no suggestion found, return results
     if not suggestion_xpath:
         return results
 
+    # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
+        # append suggestion
         results.append({'suggestion': extract_text(suggestion)})
 
+    # return results
     return results
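
parse_url() strips Yahoo's redirect wrapper: it looks for the 'http' that follows '/RU=', cuts the string at the earliest '/RK' or '/RS' marker and URL-unquotes what is left. A minimal sketch using the parse_url() defined above, with a made-up wrapped URL (real redirect links are longer):

    # illustrative only - the wrapped URL is invented, but has the '/RU=.../RK=.../RS=...' shape
    wrapped = 'http://r.search.yahoo.com/_ylt=A0abc/RU=http%3a%2f%2fexample.org%2fpage/RK=0/RS=abcdef-'
    parse_url(wrapped)
    # -> 'http://example.org/page'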

+ 31 - 11
searx/engines/yahoo_news.py

@@ -1,4 +1,12 @@
-#!/usr/bin/env python
+## Yahoo (News)
+#
+# @website     https://news.yahoo.com
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+#
+# @using-api   no (because pricing)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, publishedDate
 
 from urllib import urlencode
 from lxml import html
@@ -8,8 +16,15 @@ from datetime import datetime, timedelta
 import re
 from dateutil import parser
 
+# engine dependent config
 categories = ['news']
-search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
@@ -17,30 +32,39 @@ content_xpath = './/div[@class="abstr"]'
 publishedDate_xpath = './/span[@class="timestamp"]'
 suggestion_xpath = '//div[@id="satat"]//a'
 
-paging = True
-
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
+    # parse results
     for result in dom.xpath(results_xpath):
         url = parse_url(extract_url(result.xpath(url_xpath), search_url))
         title = extract_text(result.xpath(title_xpath)[0])
         content = extract_text(result.xpath(content_xpath)[0])
+
+        # parse publishedDate
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
 
         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
@@ -58,15 +82,11 @@ def response(resp):
         if publishedDate.year == 1900:
             publishedDate = publishedDate.replace(year=datetime.now().year)
 
+        # append result
         results.append({'url': url,
                         'title': title,
                         'content': content,
                         'publishedDate': publishedDate})
 
-    if not suggestion_xpath:
-        return results
-
-    for suggestion in dom.xpath(suggestion_xpath):
-        results.append({'suggestion': extract_text(suggestion)})
-
+    # return results
     return results
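
The timestamps Yahoo shows sometimes carry no year, and a parsed date can come back with the placeholder year 1900; the guard shown in the last hunk's context patches in the current year. A minimal sketch of that guard:

    # illustrative only - the intent of the year-1900 check shown in the hunk above
    from datetime import datetime
    publishedDate = datetime(1900, 5, 2, 7, 15)      # a parsed date that lost its year
    if publishedDate.year == 1900:
        publishedDate = publishedDate.replace(year=datetime.now().year)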

+ 6 - 5
searx/settings.yml

@@ -20,6 +20,11 @@ engines:
     locale : en-US
     shortcut : bi
 
+  - name : bing images
+    engine : bing_images
+    locale : en-US
+    shortcut : bii
+
   - name : bing news
     engine : bing_news
     locale : en-US
@@ -148,11 +153,7 @@ engines:
 
   - name : vimeo
     engine : vimeo
-    categories : videos
-    results_xpath : //div[@id="browse_content"]/ol/li
-    url_xpath : ./a/@href
-    title_xpath : ./a/div[@class="data"]/p[@class="title"]/text()
-    content_xpath : ./a/img/@src
+    locale : en-US
     shortcut : vm
 
 locales: