# digg.py (2.0 KB)
  1. """
  2. Digg (News, Social media)
  3. @website https://digg.com/
  4. @provide-api no
  5. @using-api no
  6. @results HTML (using search portal)
  7. @stable no (HTML can change)
  8. @parse url, title, content, publishedDate, thumbnail
  9. """
  10. from dateutil import parser
  11. from json import loads
  12. from lxml import html
  13. from searx.url_utils import quote_plus
  14. # engine dependent config
  15. categories = ['news', 'social media']
  16. paging = True
  17. # search-url
  18. base_url = 'https://digg.com/'
  19. search_url = base_url + 'api/search/{query}.json?position={position}&format=html'
  20. # specific xpath variables
  21. results_xpath = '//article'
  22. link_xpath = './/small[@class="time"]//a'
  23. title_xpath = './/h2//a//text()'
  24. content_xpath = './/p//text()'
  25. pubdate_xpath = './/time'
  26. # do search-request
  27. def request(query, params):
  28. offset = (params['pageno'] - 1) * 10
  29. params['url'] = search_url.format(position=offset,
  30. query=quote_plus(query))
  31. return params
  32. # get response from search-request
  33. def response(resp):
  34. results = []
  35. search_result = loads(resp.text)
  36. if 'html' not in search_result or search_result['html'] == '':
  37. return results
  38. dom = html.fromstring(search_result['html'])
  39. # parse results
  40. for result in dom.xpath(results_xpath):
  41. url = result.attrib.get('data-contenturl')
  42. thumbnail = result.xpath('.//img')[0].attrib.get('src')
  43. title = ''.join(result.xpath(title_xpath))
  44. content = ''.join(result.xpath(content_xpath))
  45. pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
  46. publishedDate = parser.parse(pubdate)
  47. # http to https
  48. thumbnail = thumbnail.replace("http://static.digg.com", "https://static.digg.com")
  49. # append result
  50. results.append({'url': url,
  51. 'title': title,
  52. 'content': content,
  53. 'template': 'videos.html',
  54. 'publishedDate': publishedDate,
  55. 'thumbnail': thumbnail})
  56. # return results
  57. return results