twitter.py 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. ## Twitter (Social media)
  2. #
  3. # @website https://twitter.com/
  4. # @provide-api yes (https://dev.twitter.com/docs/using-search)
  5. #
  6. # @using-api no
  7. # @results HTML (using search portal)
  8. # @stable no (HTML can change)
  9. # @parse url, title, content
  10. #
  11. # @todo publishedDate
  12. from urlparse import urljoin
  13. from urllib import urlencode
  14. from lxml import html
  15. from cgi import escape
  16. from datetime import datetime
  17. # engine dependent config
  18. categories = ['social media']
  19. language_support = True
  20. # search-url
  21. base_url = 'https://twitter.com/'
  22. search_url = base_url+'search?'
  23. # specific xpath variables
  24. results_xpath = '//li[@data-item-type="tweet"]'
  25. link_xpath = './/small[@class="time"]//a'
  26. title_xpath = './/span[@class="username js-action-profile-name"]//text()'
  27. content_xpath = './/p[@class="js-tweet-text tweet-text"]'
  28. timestamp_xpath = './/span[contains(@class,"_timestamp")]'
  29. # do search-request
  30. def request(query, params):
  31. params['url'] = search_url + urlencode({'q': query})
  32. # set language if specified
  33. if params['language'] != 'all':
  34. params['cookies']['lang'] = params['language'].split('_')[0]
  35. return params
  36. # get response from search-request
  37. def response(resp):
  38. results = []
  39. dom = html.fromstring(resp.text)
  40. # parse results
  41. for tweet in dom.xpath(results_xpath):
  42. link = tweet.xpath(link_xpath)[0]
  43. url = urljoin(base_url, link.attrib.get('href'))
  44. title = ''.join(tweet.xpath(title_xpath))
  45. content = escape(html.tostring(tweet.xpath(content_xpath)[0], method='text', encoding='UTF-8').decode("utf-8"))
  46. pubdate = tweet.xpath(timestamp_xpath)
  47. if len(pubdate) > 0:
  48. timestamp = float(pubdate[0].attrib.get('data-time'))
  49. publishedDate = datetime.fromtimestamp(timestamp, None)
  50. # append result
  51. results.append({'url': url,
  52. 'title': title,
  53. 'content': content,
  54. 'publishedDate': publishedDate})
  55. else:
  56. # append result
  57. results.append({'url': url,
  58. 'title': title,
  59. 'content': content})
  60. # return results
  61. return results