|
@@ -12,6 +12,8 @@
|
|
|
|
|
|
from lxml import html
|
|
|
from cgi import escape
|
|
|
+from dateutil import parser
|
|
|
+from datetime import datetime, timedelta
|
|
|
import re
|
|
|
from searx.engines.xpath import extract_text
|
|
|
|
|
@@ -79,15 +81,44 @@ def response(resp):
|
|
|
|
|
|
title = escape(extract_text(link))
|
|
|
|
|
|
- if result.xpath('./p[@class="desc"]'):
|
|
|
- content = escape(extract_text(result.xpath('./p[@class="desc"]')))
|
|
|
+ if result.xpath('./p[@class="desc clk"]'):
|
|
|
+ content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
|
|
|
else:
|
|
|
content = ''
|
|
|
|
|
|
- # append result
|
|
|
- results.append({'url': url,
|
|
|
- 'title': title,
|
|
|
- 'content': content})
|
|
|
+ published_date = None
|
|
|
+
|
|
|
+ # check if search result starts with something like: "2 Sep 2014 ... "
|
|
|
+ if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
|
|
|
+ date_pos = content.find('...')+4
|
|
|
+ date_string = content[0:date_pos-5]
|
|
|
+ published_date = parser.parse(date_string, dayfirst=True)
|
|
|
+
|
|
|
+ # strip the leading date prefix ("<date> ... ") from the content string
|
|
|
+ content = content[date_pos:]
|
|
|
+
|
|
|
+ # check if search result starts with something like: "5 days ago ... "
|
|
|
+ elif re.match("^[0-9]+ days? ago \.\.\. ", content):
|
|
|
+ date_pos = content.find('...')+4
|
|
|
+ date_string = content[0:date_pos-5]
|
|
|
+
|
|
|
+ # calculate published date: current local time minus the parsed number of days
|
|
|
+ published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
|
|
|
+
|
|
|
+ # strip the leading "N days ago ... " prefix from the content string
|
|
|
+ content = content[date_pos:]
|
|
|
+
|
|
|
+ if published_date:
|
|
|
+ # append result
|
|
|
+ results.append({'url': url,
|
|
|
+ 'title': title,
|
|
|
+ 'content': content,
|
|
|
+ 'publishedDate': published_date})
|
|
|
+ else:
|
|
|
+ # append result
|
|
|
+ results.append({'url': url,
|
|
|
+ 'title': title,
|
|
|
+ 'content': content})
|
|
|
|
|
|
# return results
|
|
|
return results
|