| 
					
				 | 
			
			
				@@ -1,4 +1,12 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-#!/usr/bin/env python 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+## Yahoo (News) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# @website     https://news.yahoo.com 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# @using-api   no (because pricing) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# @results     HTML (using search portal) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# @stable      no (HTML can change) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# @parse       url, title, content, publishedDate 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from urllib import urlencode 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from lxml import html 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -8,8 +16,15 @@ from datetime import datetime, timedelta 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import re 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from dateutil import parser 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# engine dependent config 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 categories = ['news'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+paging = True 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+language_support = True 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# search-url 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# specific xpath variables 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 results_xpath = '//div[@class="res"]' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 url_xpath = './/h3/a/@href' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 title_xpath = './/h3/a' 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -17,30 +32,39 @@ content_xpath = './/div[@class="abstr"]' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 publishedDate_xpath = './/span[@class="timestamp"]' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 suggestion_xpath = '//div[@id="satat"]//a' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-paging = True 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# do search-request 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 def request(query, params): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     offset = (params['pageno'] - 1) * 10 + 1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     if params['language'] == 'all': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         language = 'en' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         language = params['language'].split('_')[0] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+     
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     params['url'] = search_url.format(offset=offset, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                                      query=urlencode({'p': query})) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                                      query=urlencode({'p': query}), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                                      lang=language) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # TODO required? 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         .format(lang=language) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     return params 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# get response from search-request 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 def response(resp): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     results = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     dom = html.fromstring(resp.text) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # parse results 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     for result in dom.xpath(results_xpath): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         url = parse_url(extract_url(result.xpath(url_xpath), search_url)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         title = extract_text(result.xpath(title_xpath)[0]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         content = extract_text(result.xpath(content_xpath)[0]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # parse publishedDate 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         if re.match("^[0-9]+ minute(s|) ago$", publishedDate): 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -58,15 +82,11 @@ def response(resp): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         if publishedDate.year == 1900: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             publishedDate = publishedDate.replace(year=datetime.now().year) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # append result 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         results.append({'url': url, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                         'title': title, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                         'content': content, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                         'publishedDate': publishedDate}) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    if not suggestion_xpath: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        return results 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    for suggestion in dom.xpath(suggestion_xpath): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        results.append({'suggestion': extract_text(suggestion)}) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # return results 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     return results 
			 |