Browse Source

fix yahoo engines and add comments

Thomas Pointhuber 10 years ago
parent
commit
03db970e6a
2 changed files with 72 additions and 17 deletions
  1. 41 6
      searx/engines/yahoo.py
  2. 31 11
      searx/engines/yahoo_news.py

+ 41 - 6
searx/engines/yahoo.py

@@ -1,64 +1,99 @@
-#!/usr/bin/env python
+## Yahoo (Web)
+# 
+# @website     https://search.yahoo.com/web
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+# 
+# @using-api   no (because pricing)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, suggestion
 
 from urllib import urlencode
 from urlparse import unquote
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url
 
+# engine dependent config
 categories = ['general']
-search_url = 'http://search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
 content_xpath = './/div[@class="abstr"]'
 suggestion_xpath = '//div[@id="satat"]//a'
 
-paging = True
-
 
+# remove yahoo-specific tracking-url
 def parse_url(url_string):
     endings = ['/RS', '/RK']
     endpositions = []
     start = url_string.find('http', url_string.find('/RU=')+1)
+
     for ending in endings:
         endpos = url_string.rfind(ending)
         if endpos > -1:
             endpositions.append(endpos)
 
     end = min(endpositions)
+
     return unquote(url_string[start:end])
 
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
+    # parse results
     for result in dom.xpath(results_xpath):
         try:
             url = parse_url(extract_url(result.xpath(url_xpath), search_url))
             title = extract_text(result.xpath(title_xpath)[0])
         except:
             continue
+
         content = extract_text(result.xpath(content_xpath)[0])
-        results.append({'url': url, 'title': title, 'content': content})
 
+        # append result
+        results.append({'url': url, 
+                        'title': title, 
+                        'content': content})
+
+    # if no suggestion found, return results
     if not suggestion_xpath:
         return results
 
+    # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
+        # append suggestion
         results.append({'suggestion': extract_text(suggestion)})
 
+    # return results
     return results

+ 31 - 11
searx/engines/yahoo_news.py

@@ -1,4 +1,12 @@
-#!/usr/bin/env python
+## Yahoo (News)
+# 
+# @website     https://news.yahoo.com
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+# 
+# @using-api   no (because pricing)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, publishedDate
 
 from urllib import urlencode
 from lxml import html
@@ -8,8 +16,15 @@ from datetime import datetime, timedelta
 import re
 from dateutil import parser
 
+# engine dependent config
 categories = ['news']
-search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
@@ -17,30 +32,39 @@ content_xpath = './/div[@class="abstr"]'
 publishedDate_xpath = './/span[@class="timestamp"]'
 suggestion_xpath = '//div[@id="satat"]//a'
 
-paging = True
-
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+    
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
+    # parse results
     for result in dom.xpath(results_xpath):
         url = parse_url(extract_url(result.xpath(url_xpath), search_url))
         title = extract_text(result.xpath(title_xpath)[0])
         content = extract_text(result.xpath(content_xpath)[0])
+
+        # parse publishedDate
         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
 
         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
@@ -58,15 +82,11 @@ def response(resp):
         if publishedDate.year == 1900:
             publishedDate = publishedDate.replace(year=datetime.now().year)
 
+        # append result
         results.append({'url': url,
                         'title': title,
                         'content': content,
                         'publishedDate': publishedDate})
 
-    if not suggestion_xpath:
-        return results
-
-    for suggestion in dom.xpath(suggestion_xpath):
-        results.append({'suggestion': extract_text(suggestion)})
-
+    # return results
     return results