twitter.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. ## Twitter (Social media)
  2. #
  3. # @website https://twitter.com/
  4. # @provide-api yes (https://dev.twitter.com/docs/using-search)
  5. #
  6. # @using-api no
  7. # @results HTML (using search portal)
  8. # @stable no (HTML can change)
# @parse url, title, content, publishedDate
#
# @todo none (publishedDate is parsed when a tweet carries a timestamp element)
  12. from urlparse import urljoin
  13. from urllib import urlencode
  14. from lxml import html
  15. from cgi import escape
  16. from datetime import datetime
  17. # engine dependent config
  18. categories = ['social media']
  19. language_support = True
  20. # search-url
  21. base_url = 'https://twitter.com/'
  22. search_url = base_url+'search?'
  23. # specific xpath variables
  24. results_xpath = '//li[@data-item-type="tweet"]'
  25. link_xpath = './/small[@class="time"]//a'
  26. title_xpath = './/span[@class="username js-action-profile-name"]//text()'
  27. content_xpath = './/p[@class="js-tweet-text tweet-text"]//text()'
  28. timestamp_xpath = './/span[contains(@class,"_timestamp")]'
  29. # do search-request
  30. def request(query, params):
  31. params['url'] = search_url + urlencode({'q': query})
  32. # set language if specified
  33. if params['language'] != 'all':
  34. params['cookies']['lang'] = params['language'].split('_')[0]
  35. return params
  36. # get response from search-request
  37. def response(resp):
  38. results = []
  39. dom = html.fromstring(resp.text)
  40. # parse results
  41. for tweet in dom.xpath(results_xpath):
  42. link = tweet.xpath(link_xpath)[0]
  43. url = urljoin(base_url, link.attrib.get('href'))
  44. title = ''.join(tweet.xpath(title_xpath))
  45. content = escape(''.join(tweet.xpath(content_xpath)))
  46. pubdate = tweet.xpath(timestamp_xpath)
  47. if len(pubdate) > 0:
  48. publishedDate = datetime.fromtimestamp(float(pubdate[0].attrib.get('data-time')), None)
  49. # append result
  50. results.append({'url': url,
  51. 'title': title,
  52. 'content': content,
  53. 'publishedDate': publishedDate})
  54. else:
  55. # append result
  56. results.append({'url': url,
  57. 'title': title,
  58. 'content': content})
  59. # return results
  60. return results