twitter.py 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
## Twitter (Social media)
#
# @website https://twitter.com/
# @provide-api yes (https://dev.twitter.com/docs/using-search)
#
# @using-api no
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, publishedDate
#
# @todo publishedDate is missing for tweets without a timestamp element
  12. from urlparse import urljoin
  13. from urllib import urlencode
  14. from lxml import html
  15. from datetime import datetime
  16. from searx.engines.xpath import extract_text
  17. # engine dependent config
  18. categories = ['social media']
  19. language_support = True
  20. # search-url
  21. base_url = 'https://twitter.com/'
  22. search_url = base_url + 'search?'
  23. # specific xpath variables
  24. results_xpath = '//li[@data-item-type="tweet"]'
  25. link_xpath = './/small[@class="time"]//a'
  26. title_xpath = './/span[@class="username js-action-profile-name"]'
  27. content_xpath = './/p[@class="js-tweet-text tweet-text"]'
  28. timestamp_xpath = './/span[contains(@class,"_timestamp")]'
  29. # do search-request
  30. def request(query, params):
  31. params['url'] = search_url + urlencode({'q': query})
  32. # set language if specified
  33. if params['language'] != 'all':
  34. params['cookies']['lang'] = params['language'].split('_')[0]
  35. else:
  36. params['cookies']['lang'] = 'en'
  37. return params
  38. # get response from search-request
  39. def response(resp):
  40. results = []
  41. dom = html.fromstring(resp.text)
  42. # parse results
  43. for tweet in dom.xpath(results_xpath):
  44. link = tweet.xpath(link_xpath)[0]
  45. url = urljoin(base_url, link.attrib.get('href'))
  46. title = extract_text(tweet.xpath(title_xpath))
  47. content = extract_text(tweet.xpath(content_xpath)[0])
  48. pubdate = tweet.xpath(timestamp_xpath)
  49. if len(pubdate) > 0:
  50. timestamp = float(pubdate[0].attrib.get('data-time'))
  51. publishedDate = datetime.fromtimestamp(timestamp, None)
  52. # append result
  53. results.append({'url': url,
  54. 'title': title,
  55. 'content': content,
  56. 'publishedDate': publishedDate})
  57. else:
  58. # append result
  59. results.append({'url': url,
  60. 'title': title,
  61. 'content': content})
  62. # return results
  63. return results