
Merge pull request #89 from pointhi/engines

update search engines and add comments to it
Adam Tauber, 10 years ago
commit f825752145
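
The comment blocks added in this commit all document the same engine-module contract: each engine exposes module-level config (categories, paging, language_support, the search URL and any xpath variables), a request(query, params) function that fills in params['url'] (plus headers, cookies or POST data where needed) and returns params, and a response(resp) function that parses the reply into a list of result dicts. A minimal sketch of that contract, assuming only what the diffs below show — the engine name and endpoint here are made up for illustration:

    ## Example (General)
    #
    # @results     HTML
    # @parse       url, title, content

    from urllib import urlencode

    # engine dependent config
    categories = ['general']
    paging = True

    # search-url (hypothetical endpoint, for illustration only)
    search_url = 'https://searchengine.example/search?{query}&page={pageno}'


    # do search-request
    def request(query, params):
        params['url'] = search_url.format(query=urlencode({'q': query}),
                                          pageno=params['pageno'])
        return params


    # get response from search-request
    def response(resp):
        results = []

        # a real engine parses resp.text (HTML or JSON) here and appends
        # dicts like {'url': ..., 'title': ..., 'content': ...}

        # return results
        return results

The per-file diffs below mostly add these comment blocks, move configuration such as paging and categories out of settings.yml into the engine modules, and tidy the request/response bodies.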

+ 1 - 0
searx/engines/bing.py

@@ -23,6 +23,7 @@ language_support = True
 base_url = 'https://www.bing.com/'
 search_string = 'search?{query}&first={offset}'
 
+
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1

+ 1 - 0
searx/engines/bing_images.py

@@ -24,6 +24,7 @@ paging = True
 base_url = 'https://www.bing.com/'
 search_string = 'images/search?{query}&count=10&first={offset}'
 
+
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1

+ 1 - 0
searx/engines/bing_news.py

@@ -24,6 +24,7 @@ language_support = True
 base_url = 'https://www.bing.com/'
 search_string = 'news/search?{query}&first={offset}'
 
+
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1

+ 29 - 3
searx/engines/deviantart.py

@@ -1,35 +1,61 @@
+## Deviantart (Images)
+# 
+# @website     https://www.deviantart.com/
+# @provide-api yes (https://www.deviantart.com/developers/) (RSS)
+# 
+# @using-api   no (TODO, rewrite to api)
+# @results     HTML
+# @stable      no (HTML can change)
+# @parse       url, title, thumbnail, img_src
+#
+# @todo        rewrite to api
+
 from urllib import urlencode
 from urlparse import urljoin
 from lxml import html
 
+# engine dependent config
 categories = ['images']
+paging = True
 
+# search-url
 base_url = 'https://www.deviantart.com/'
 search_url = base_url+'search?offset={offset}&{query}'
 
-paging = True
-
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 24
+
     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'q': query}))
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
+    # return empty array if a redirection code is returned
     if resp.status_code == 302:
-        return results
+        return []
+
     dom = html.fromstring(resp.text)
+
+    # parse results
     for result in dom.xpath('//div[contains(@class, "tt-a tt-fh")]'):
         link = result.xpath('.//a[contains(@class, "thumb")]')[0]
         url = urljoin(base_url, link.attrib.get('href'))
         title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]')  # noqa
         title = ''.join(title_links[0].xpath('.//text()'))
         img_src = link.xpath('.//img')[0].attrib['src']
+
+        # append result
         results.append({'url': url,
                         'title': title,
                         'img_src': img_src,
                         'template': 'images.html'})
+
+    # return results
     return results

+ 38 - 34
searx/engines/duckduckgo.py

@@ -1,65 +1,69 @@
+## DuckDuckGo (Web)
+# 
+# @website     https://duckduckgo.com/
+# @provide-api yes (https://duckduckgo.com/api), but not all results from search-site
+# 
+# @using-api   no
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content
+#
+# @todo        rewrite to api
+# @todo        language support (the current used site does not support language-change)
+
 from urllib import urlencode
 from lxml.html import fromstring
 from searx.utils import html_to_text
 
-url = 'https://duckduckgo.com/html?{query}&s={offset}'
+# engine dependent config
+categories = ['general']
+paging = True
 locale = 'us-en'
 
+# search-url
+url = 'https://duckduckgo.com/html?{query}&s={offset}'
+
+# specific xpath variables
+result_xpath = '//div[@class="results_links results_links_deep web-result"]'  # noqa
+url_xpath = './/a[@class="large"]/@href'
+title_xpath = './/a[@class="large"]//text()'
+content_xpath = './/div[@class="snippet"]//text()'
 
+
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 30
-    q = urlencode({'q': query,
-                   'l': locale})
-    params['url'] = url.format(query=q, offset=offset)
+
+    params['url'] = url.format(
+        query=urlencode({'q': query, 'l': locale}),
+        offset=offset)
+
     return params
 
 
+# get response from search-request
 def response(resp):
-    result_xpath = '//div[@class="results_links results_links_deep web-result"]'  # noqa
-    url_xpath = './/a[@class="large"]/@href'
-    title_xpath = './/a[@class="large"]//text()'
-    content_xpath = './/div[@class="snippet"]//text()'
     results = []
 
     doc = fromstring(resp.text)
 
+    # parse results
     for r in doc.xpath(result_xpath):
         try:
             res_url = r.xpath(url_xpath)[-1]
         except:
             continue
+
         if not res_url:
             continue
+
         title = html_to_text(''.join(r.xpath(title_xpath)))
         content = html_to_text(''.join(r.xpath(content_xpath)))
+
+        # append result
         results.append({'title': title,
                         'content': content,
                         'url': res_url})
 
+    # return results
     return results
-
-
-#from json import loads
-#search_url = url + 'd.js?{query}&p=1&s={offset}'
-#
-#paging = True
-#
-#
-#def request(query, params):
-#    offset = (params['pageno'] - 1) * 30
-#    q = urlencode({'q': query,
-#                   'l': locale})
-#    params['url'] = search_url.format(query=q, offset=offset)
-#    return params
-#
-#
-#def response(resp):
-#    results = []
-#    search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1]
-#    for r in search_res:
-#        if not r.get('t'):
-#            continue
-#        results.append({'title': r['t'],
-#                       'content': html_to_text(r['a']),
-#                       'url': r['u']})
-#    return results
+ 8 - 0
searx/engines/dummy.py

@@ -1,6 +1,14 @@
+## Dummy
+# 
+# @results     empty array
+# @stable      yes
+
+
+# do search-request
 def request(query, params):
     return params
 
 
+# get response from search-request
 def response(resp):
     return []

+ 28 - 3
searx/engines/generalfile.py

@@ -1,35 +1,60 @@
+## General Files (Files)
+# 
+# @website     http://www.general-files.org
+# @provide-api no (nothing found)
+# 
+# @using-api   no (because nothing found)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content
+#
+# @todo        detect torrents?
+
 from lxml import html
 
+# engine dependent config
+categories = ['files']
+paging = True
 
+# search-url
 base_url = 'http://www.general-file.com'
 search_url = base_url + '/files-{letter}/{query}/{pageno}'
 
+# specific xpath variables
 result_xpath = '//table[@class="block-file"]'
 title_xpath = './/h2/a//text()'
 url_xpath = './/h2/a/@href'
 content_xpath = './/p//text()'
 
-paging = True
-
 
+# do search-request
 def request(query, params):
+
     params['url'] = search_url.format(query=query,
                                       letter=query[0],
                                       pageno=params['pageno'])
+
     return params
 
 
+# get response from search-request
 def response(resp):
-
     results = []
+
     dom = html.fromstring(resp.text)
+
+    # parse results
     for result in dom.xpath(result_xpath):
         url = result.xpath(url_xpath)[0]
+
         # skip fast download links
         if not url.startswith('/'):
             continue
+
+        # append result
         results.append({'url': base_url + url,
                         'title': ''.join(result.xpath(title_xpath)),
                         'content': ''.join(result.xpath(content_xpath))})
 
+    # return results
     return results

+ 30 - 2
searx/engines/github.py

@@ -1,31 +1,59 @@
+## Github (It)
+# 
+# @website     https://github.com/
+# @provide-api yes (https://developer.github.com/v3/)
+# 
+# @using-api   yes
+# @results     JSON
+# @stable      yes (using api)
+# @parse       url, title, content
+
 from urllib import urlencode
 from json import loads
 from cgi import escape
 
+# engine dependent config
 categories = ['it']
 
+# search-url
 search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}'  # noqa
 
 accept_header = 'application/vnd.github.preview.text-match+json'
 
 
+# do search-request
 def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}))
+
     params['headers']['Accept'] = accept_header
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
+
+    # check if items are received
     if not 'items' in search_res:
-        return results
+        return []
+
+    # parse results
     for res in search_res['items']:
         title = res['name']
         url = res['html_url']
+
         if res['description']:
             content = escape(res['description'][:500])
         else:
             content = ''
-        results.append({'url': url, 'title': title, 'content': content})
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # return results
     return results

+ 34 - 7
searx/engines/piratebay.py

@@ -1,39 +1,61 @@
+## Piratebay (Videos, Music, Files)
+# 
+# @website     https://thepiratebay.se
+# @provide-api no (nothing found)
+# 
+# @using-api   no
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, seed, leech, magnetlink
+
 from urlparse import urljoin
 from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
 
-categories = ['videos', 'music']
+# engine dependent config
+categories = ['videos', 'music', 'files']
+paging = True
 
+# search-url
 url = 'https://thepiratebay.se/'
 search_url = url + 'search/{search_term}/{pageno}/99/{search_type}'
-search_types = {'videos': '200',
+
+# piratebay specific type-definitions
+search_types = {'files': '0',
                 'music': '100',
-                'files': '0'}
+                'videos': '200'}
 
+# specific xpath variables
 magnet_xpath = './/a[@title="Download this torrent using magnet"]'
 content_xpath = './/font[@class="detDesc"]//text()'
 
-paging = True
-
 
+# do search-request
 def request(query, params):
-    search_type = search_types.get(params['category'], '200')
+    search_type = search_types.get(params['category'], '0')
+
     params['url'] = search_url.format(search_term=quote(query),
                                       search_type=search_type,
                                       pageno=params['pageno'] - 1)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
+
     search_res = dom.xpath('//table[@id="searchResult"]//tr')
 
+    # return empty array if nothing is found
     if not search_res:
-        return results
+        return []
 
+    # parse results
     for result in search_res[1:]:
         link = result.xpath('.//div[@class="detName"]//a')[0]
         href = urljoin(url, link.attrib.get('href'))
@@ -41,17 +63,21 @@ def response(resp):
         content = escape(' '.join(result.xpath(content_xpath)))
         seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]
 
+        # convert seed to int if possible
         if seed.isdigit():
             seed = int(seed)
         else:
             seed = 0
 
+        # convert leech to int if possible
         if leech.isdigit():
             leech = int(leech)
         else:
             leech = 0
 
         magnetlink = result.xpath(magnet_xpath)[0]
+
+        # append result
         results.append({'url': href,
                         'title': title,
                         'content': content,
@@ -60,4 +86,5 @@ def response(resp):
                         'magnetlink': magnetlink.attrib['href'],
                         'template': 'torrent.html'})
 
+    # return results sorted by seeder
     return sorted(results, key=itemgetter('seed'), reverse=True)

+ 29 - 4
searx/engines/soundcloud.py

@@ -1,30 +1,55 @@
+## Soundcloud (Music)
+# 
+# @website     https://soundcloud.com
+# @provide-api yes (https://developers.soundcloud.com/)
+# 
+# @using-api   yes
+# @results     JSON
+# @stable      yes
+# @parse       url, title, content
+
 from json import loads
 from urllib import urlencode
 
+# engine dependent config
 categories = ['music']
+paging = True
 
+# api-key
 guest_client_id = 'b45b1aa10f1ac2941910a7f0d10f8e28'
-url = 'https://api.soundcloud.com/'
-search_url = url + 'search?{query}&facet=model&limit=20&offset={offset}&linked_partitioning=1&client_id='+guest_client_id  # noqa
 
-paging = True
+# search-url
+url = 'https://api.soundcloud.com/'
+search_url = url + 'search?{query}&facet=model&limit=20&offset={offset}&linked_partitioning=1&client_id={client_id}'
 
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 20
+
     params['url'] = search_url.format(query=urlencode({'q': query}),
-                                      offset=offset)
+                                      offset=offset,
+                                      client_id=guest_client_id)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
+
+    # parse results
     for result in search_res.get('collection', []):
         if result['kind'] in ('track', 'playlist'):
             title = result['title']
             content = result['description']
+
+            # append result
             results.append({'url': result['permalink_url'],
                             'title': title,
                             'content': content})
+
+    # return results
     return results

+ 35 - 7
searx/engines/stackoverflow.py

@@ -1,30 +1,58 @@
+## Stackoverflow (It)
+# 
+# @website     https://stackoverflow.com/
+# @provide-api not clear (https://api.stackexchange.com/docs/advanced-search)
+# 
+# @using-api   no
+# @results     HTML
+# @stable      no (HTML can change)
+# @parse       url, title, content
+
 from urlparse import urljoin
 from cgi import escape
 from urllib import urlencode
 from lxml import html
 
+# engine dependent config
 categories = ['it']
+paging = True
 
+# search-url
 url = 'http://stackoverflow.com/'
 search_url = url+'search?{query}&page={pageno}'
-result_xpath = './/div[@class="excerpt"]//text()'
 
-paging = True
+# specific xpath variables
+results_xpath = '//div[contains(@class,"question-summary")]'
+link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a'
+title_xpath = './/text()'
+content_xpath = './/div[@class="excerpt"]//text()'
 
 
+# do search-request
 def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       pageno=params['pageno'])
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
-    for result in dom.xpath('//div[@class="question-summary search-result"]'):
-        link = result.xpath('.//div[@class="result-link"]//a')[0]
+
+    # parse results
+    for result in dom.xpath(results_xpath):
+        link = result.xpath(link_xpath)[0]
         href = urljoin(url, link.attrib.get('href'))
-        title = escape(' '.join(link.xpath('.//text()')))
-        content = escape(' '.join(result.xpath(result_xpath)))
-        results.append({'url': href, 'title': title, 'content': content})
+        title = escape(' '.join(link.xpath(title_xpath)))
+        content = escape(' '.join(result.xpath(content_xpath)))
+
+        # append result
+        results.append({'url': href,
+                        'title': title,
+                        'content': content})
+
+    # return results
     return results

+ 53 - 21
searx/engines/startpage.py

@@ -1,47 +1,79 @@
+## Startpage (Web)
+# 
+# @website     https://startpage.com
+# @provide-api no (nothing found)
+# 
+# @using-api   no
+# @results     HTML
+# @stable      no (HTML can change)
+# @parse       url, title, content
+#
+# @todo        paging
+
 from urllib import urlencode
 from lxml import html
 from cgi import escape
+import re
+
+# engine dependent config
+categories = ['general']
+# there is a mechanism to block "bot" searches (probably the qid parameter), which requires storing qid's between multiple search-calls
+#paging = False
+language_support = True
 
-base_url = None
-search_url = None
+# search-url
+base_url = 'https://startpage.com/'
+search_url = base_url + 'do/search'
 
-# TODO paging
-paging = False
-# TODO complete list of country mapping
-country_map = {'en_US': 'eng',
-               'en_UK': 'uk',
-               'nl_NL': 'ned'}
+# specific xpath variables
+# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
+# not ads: div[@class="result"] are the direct children of div[@id="results"]
+results_xpath = '//div[@class="result"]'
+link_xpath = './/h3/a'
 
 
+# do search-request
 def request(query, params):
+    offset = (params['pageno'] - 1) * 10
     query = urlencode({'q': query})[2:]
+
     params['url'] = search_url
     params['method'] = 'POST'
     params['data'] = {'query': query,
-                      'startat': (params['pageno'] - 1) * 10}  # offset
-    country = country_map.get(params['language'], 'eng')
-    params['cookies']['preferences'] = \
-        'lang_homepageEEEs/air/{country}/N1NsslEEE1N1Nfont_sizeEEEmediumN1Nrecent_results_filterEEE1N1Nlanguage_uiEEEenglishN1Ndisable_open_in_new_windowEEE0N1Ncolor_schemeEEEnewN1Nnum_of_resultsEEE10N1N'.format(country=country)  # noqa
+                      'startat': offset}
+
+    # set language if specified
+    if params['language'] != 'all':
+        params['data']['with_language'] = 'lang_' + params['language'].split('_')[0]
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.content)
-    # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
-    # not ads: div[@class="result"] are the direct childs of div[@id="results"]
-    for result in dom.xpath('//div[@class="result"]'):
-        link = result.xpath('.//h3/a')[0]
+
+    # parse results
+    for result in dom.xpath(results_xpath):
+        link = result.xpath(link_xpath)[0]
         url = link.attrib.get('href')
-        if url.startswith('http://www.google.')\
-           or url.startswith('https://www.google.'):
-            continue
         title = escape(link.text_content())
 
-        content = ''
+        # block google-ad url's
+        if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
+            continue
+
         if result.xpath('./p[@class="desc"]'):
             content = escape(result.xpath('./p[@class="desc"]')[0].text_content())
+        else:
+            content = ''
 
-        results.append({'url': url, 'title': title, 'content': content})
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
 
+    # return results
     return results

+ 35 - 2
searx/engines/twitter.py

@@ -1,30 +1,63 @@
+## Twitter (Social media)
+# 
+# @website     https://twitter.com/
+# @provide-api yes (https://dev.twitter.com/docs/using-search)
+# 
+# @using-api   no
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content
+#
+# @todo        publishedDate
+
 from urlparse import urljoin
 from urllib import urlencode
 from lxml import html
 from cgi import escape
 
+# engine dependent config
 categories = ['social media']
+language_support = True
 
+# search-url
 base_url = 'https://twitter.com/'
 search_url = base_url+'search?'
+
+# specific xpath variables
+results_xpath = '//li[@data-item-type="tweet"]'
+link_xpath = './/small[@class="time"]//a'
 title_xpath = './/span[@class="username js-action-profile-name"]//text()'
 content_xpath = './/p[@class="js-tweet-text tweet-text"]//text()'
 
 
+# do search-request
 def request(query, params):
     params['url'] = search_url + urlencode({'q': query})
+
+    # set language if specified
+    if params['language'] != 'all':
+        params['cookies']['lang'] = params['language'].split('_')[0]
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
-    for tweet in dom.xpath('//li[@data-item-type="tweet"]'):
-        link = tweet.xpath('.//small[@class="time"]//a')[0]
+
+    # parse results
+    for tweet in dom.xpath(results_xpath):
+        link = tweet.xpath(link_xpath)[0]
         url = urljoin(base_url, link.attrib.get('href'))
         title = ''.join(tweet.xpath(title_xpath))
         content = escape(''.join(tweet.xpath(content_xpath)))
+
+        # append result
         results.append({'url': url,
                         'title': title,
                         'content': content})
+
+    # return results
     return results

+ 47 - 10
searx/engines/wikipedia.py

@@ -1,30 +1,67 @@
+## Wikipedia (Web)
+# 
+# @website     http://www.wikipedia.org
+# @provide-api yes (http://www.mediawiki.org/wiki/API:Search)
+# 
+# @using-api   yes
+# @results     JSON
+# @stable      yes
+# @parse       url, title 
+#
+# @todo        content
+
 from json import loads
 from urllib import urlencode, quote
 
-url = 'https://{language}.wikipedia.org/'
-
-search_url = url + 'w/api.php?action=query&list=search&{query}&srprop=timestamp&format=json&sroffset={offset}'  # noqa
-
-number_of_results = 10
-
+# engine dependent config
+categories = ['general']
 language_support = True
+paging = True
+number_of_results = 1
+
+# search-url
+url = 'https://{language}.wikipedia.org/'
+search_url = url + 'w/api.php?action=query&list=search&{query}&srprop=timestamp&format=json&sroffset={offset}&srlimit={limit}'  # noqa
 
 
+# do search-request
 def request(query, params):
-    offset = (params['pageno'] - 1) * 10
+    offset = (params['pageno'] - 1) * number_of_results
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
+    # write search-language back to params, required in response
     params['language'] = language
+
     params['url'] = search_url.format(query=urlencode({'srsearch': query}),
                                       offset=offset,
+                                      limit=number_of_results,
                                       language=language)
+
     return params
 
 
+# get response from search-request
 def response(resp):
+    results = []
+
     search_results = loads(resp.text)
-    res = search_results.get('query', {}).get('search', [])
-    return [{'url': url.format(language=resp.search_params['language']) + 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8')),  # noqa
-        'title': result['title']} for result in res[:int(number_of_results)]]
+
+    # return empty array if there are no results
+    if not search_results.get('query', {}).get('search'):
+        return []
+
+    # parse results
+    for result in search_results['query']['search']:
+        res_url = url.format(language=resp.search_params['language']) + 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8'))
+
+        # append result
+        results.append({'url': res_url,
+                        'title': result['title'],
+                        'content': ''})
+
+    # return results
+    return results
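
As a quick illustration of the new wikipedia paging (values here are illustrative, not part of the commit): with the default number_of_results = 1, searx page n translates into sroffset = n - 1 and srlimit = 1, so each page fetches exactly one article from the MediaWiki API.

    # illustrative check of the paging arithmetic introduced above
    number_of_results = 1
    pageno = 3
    offset = (pageno - 1) * number_of_results
    assert offset == 2   # the request carries sroffset=2&srlimit=1 -> only the third result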

+ 36 - 7
searx/engines/youtube.py

@@ -1,42 +1,69 @@
+## Youtube (Videos)
+# 
+# @website     https://www.youtube.com/
+# @provide-api yes (http://gdata-samples-youtube-search-py.appspot.com/)
+# 
+# @using-api   yes
+# @results     JSON
+# @stable      yes
+# @parse       url, title, content, publishedDate, thumbnail
+
 from json import loads
 from urllib import urlencode
 from dateutil import parser
 
+# engine dependent config
 categories = ['videos']
-
-search_url = ('https://gdata.youtube.com/feeds/api/videos'
-              '?alt=json&{query}&start-index={index}&max-results=25')  # noqa
-
 paging = True
+language_support = True
+
+# search-url
+base_url = 'https://gdata.youtube.com/feeds/api/videos'
+search_url = base_url + '?alt=json&{query}&start-index={index}&max-results=5'  # noqa
 
 
+# do search-request
 def request(query, params):
-    index = (params['pageno'] - 1) * 25 + 1
+    index = (params['pageno'] - 1) * 5 + 1
+
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       index=index)
+
+    # add language tag if specified
+    if params['language'] != 'all':
+        params['url'] += '&lr=' + params['language'].split('_')[0]
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     search_results = loads(resp.text)
+
+    # return empty array if there are no results
     if not 'feed' in search_results:
-        return results
+        return []
+
     feed = search_results['feed']
 
+    # parse results
     for result in feed['entry']:
         url = [x['href'] for x in result['link'] if x['type'] == 'text/html']
+
         if not url:
             return
+
         # remove tracking
         url = url[0].replace('feature=youtube_gdata', '')
         if url.endswith('&'):
             url = url[:-1]
+
         title = result['title']['$t']
         content = ''
         thumbnail = ''
 
-#"2013-12-31T15:22:51.000Z"
         pubdate = result['published']['$t']
         publishedDate = parser.parse(pubdate)
 
@@ -49,6 +76,7 @@ def response(resp):
         else:
             content = result['content']['$t']
 
+        # append result
         results.append({'url': url,
                         'title': title,
                         'content': content,
@@ -56,4 +84,5 @@ def response(resp):
                         'publishedDate': publishedDate,
                         'thumbnail': thumbnail})
 
+    # return results
     return results

+ 1 - 15
searx/settings.yml

@@ -11,9 +11,8 @@ server:
 engines:
   - name : wikipedia
     engine : wikipedia
-    number_of_results : 1
-    paging : False
     shortcut : wp
+#    number_of_results : 1 # default is 1
 
   - name : bing
     engine : bing
@@ -37,7 +36,6 @@ engines:
 
   - name : deviantart
     engine : deviantart
-    categories : images
     shortcut : da
     timeout: 3.0
 
@@ -47,7 +45,6 @@ engines:
 
   - name : duckduckgo
     engine : duckduckgo
-    locale : en-us
     shortcut : ddg
 
 # down - website is under criminal investigation by the UK
@@ -64,12 +61,10 @@ engines:
 
   - name : general-file
     engine : generalfile
-    categories : files
     shortcut : gf
 
   - name : github
     engine : github
-    categories : it
     shortcut : gh
 
   - name : google
@@ -86,23 +81,18 @@ engines:
 
   - name : piratebay
     engine : piratebay
-    categories : videos, music, files
     shortcut : tpb
 
   - name : soundcloud
     engine : soundcloud
-    categories : music
     shortcut : sc
 
   - name : stackoverflow
     engine : stackoverflow
-    categories : it
     shortcut : st
 
   - name : startpage
     engine : startpage
-    base_url : 'https://startpage.com/'
-    search_url : 'https://startpage.com/do/search'
     shortcut : sp
 
 # +30% page load time
@@ -113,7 +103,6 @@ engines:
 
   - name : twitter
     engine : twitter
-    categories : social media
     shortcut : tw
 
 # maybe in a fun category
@@ -142,13 +131,10 @@ engines:
 
   - name : youtube
     engine : youtube
-    categories : videos
     shortcut : yt
 
   - name : dailymotion
     engine : dailymotion
-    locale : en_US
-    categories : videos
     shortcut : dm
 
   - name : vimeo