"""
 Digg (News, Social media)

 @website     https://digg.com/
 @provide-api no

 @using-api   no
 @results     HTML (using search portal)
 @stable      no (HTML can change)
 @parse       url, title, content, publishedDate, thumbnail
"""

from json import loads

from dateutil import parser
from lxml import html

try:
    # Python 3 location
    from urllib.parse import quote_plus
except ImportError:
    # Python 2 location (what this module originally imported)
    from urllib import quote_plus

# engine dependent config
categories = ['news', 'social media']
paging = True

# search-url
base_url = 'https://digg.com/'
search_url = base_url + 'api/search/{query}.json?position={position}&format=html'

# specific xpath variables
results_xpath = '//article'
# NOTE(review): link_xpath is not referenced anywhere in this module; it is
# kept because other code may read it as a module attribute.
link_xpath = './/small[@class="time"]//a'
title_xpath = './/h2//a//text()'
content_xpath = './/p//text()'
pubdate_xpath = './/time'


# do search-request
def request(query, params):
    """Fill ``params['url']`` with the Digg search URL for ``query``.

    ``params['pageno']`` is 1-based; it is converted to the ``position``
    offset of the search endpoint in steps of 10 results per page.

    Returns the same ``params`` dict, mutated in place.
    """
    offset = (params['pageno'] - 1) * 10
    params['url'] = search_url.format(position=offset,
                                      query=quote_plus(query))

    return params


def _extract_thumbnail(result):
    """Return the https thumbnail URL of an article element, or None."""
    images = result.xpath('.//img')
    if not images:
        return None

    thumbnail = images[0].attrib.get('src')
    if not thumbnail:
        return None

    # http to https
    return thumbnail.replace("http://static.digg.com", "https://static.digg.com")


def _parse_pubdate(result):
    """Return the article's publication datetime, or None if unavailable."""
    time_nodes = result.xpath(pubdate_xpath)
    if not time_nodes:
        return None

    pubdate = time_nodes[0].attrib.get('datetime')
    if not pubdate:
        return None

    try:
        return parser.parse(pubdate)
    except (ValueError, OverflowError):
        # unparseable timestamp: drop the date, keep the result
        return None


# get response from search-request
def response(resp):
    """Parse the JSON search response into a list of result dicts.

    The response body is JSON whose ``html`` member holds an HTML fragment
    with one ``<article>`` element per hit.  Each returned dict has the keys
    ``url``, ``title``, ``content`` and ``template``; ``publishedDate`` and
    ``thumbnail`` are added only when the article provides them, so a single
    incomplete article no longer aborts parsing of the whole page.

    Returns an empty list when the response carries no HTML.
    """
    results = []

    search_result = loads(resp.text)

    # missing, empty or null 'html' means there is nothing to parse
    if 'html' not in search_result or not search_result['html']:
        return results

    dom = html.fromstring(search_result['html'])

    # parse results
    for result in dom.xpath(results_xpath):
        url = result.attrib.get('data-contenturl')
        if not url:
            # an article without a target URL cannot be shown as a result
            continue

        # NOTE(review): 'videos.html' is the template the original code used
        # for these news results; kept unchanged.
        item = {'url': url,
                'title': ''.join(result.xpath(title_xpath)),
                'content': ''.join(result.xpath(content_xpath)),
                'template': 'videos.html'}

        published_date = _parse_pubdate(result)
        if published_date is not None:
            item['publishedDate'] = published_date

        thumbnail = _extract_thumbnail(result)
        if thumbnail:
            item['thumbnail'] = thumbnail

        # append result
        results.append(item)

    # return results
    return results