twitter.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. """
  2. Twitter (Social media)
  3. @website https://twitter.com/
  4. @provide-api yes (https://dev.twitter.com/docs/using-search)
  5. @using-api no
  6. @results HTML (using search portal)
  7. @stable no (HTML can change)
  8. @parse url, title, content, publishedDate
  9. @todo none (publishedDate is parsed when the tweet exposes a timestamp)
  10. """
  11. from urlparse import urljoin
  12. from urllib import urlencode
  13. from lxml import html
  14. from datetime import datetime
  15. from searx.engines.xpath import extract_text
  16. # engine dependent config
  17. categories = ['social media']
  18. language_support = True
  19. # search-url
  20. base_url = 'https://twitter.com/'
  21. search_url = base_url + 'search?'
  22. # specific xpath variables
  23. results_xpath = '//li[@data-item-type="tweet"]'
  24. link_xpath = './/small[@class="time"]//a'
  25. title_xpath = './/span[@class="username js-action-profile-name"]'
  26. content_xpath = './/p[@class="js-tweet-text tweet-text"]'
  27. timestamp_xpath = './/span[contains(@class,"_timestamp")]'
  28. # do search-request
  29. def request(query, params):
  30. params['url'] = search_url + urlencode({'q': query})
  31. # set language if specified
  32. if params['language'] != 'all':
  33. params['cookies']['lang'] = params['language'].split('_')[0]
  34. else:
  35. params['cookies']['lang'] = 'en'
  36. return params
  37. # get response from search-request
  38. def response(resp):
  39. results = []
  40. dom = html.fromstring(resp.text)
  41. # parse results
  42. for tweet in dom.xpath(results_xpath):
  43. link = tweet.xpath(link_xpath)[0]
  44. url = urljoin(base_url, link.attrib.get('href'))
  45. title = extract_text(tweet.xpath(title_xpath))
  46. content = extract_text(tweet.xpath(content_xpath)[0])
  47. pubdate = tweet.xpath(timestamp_xpath)
  48. if len(pubdate) > 0:
  49. timestamp = float(pubdate[0].attrib.get('data-time'))
  50. publishedDate = datetime.fromtimestamp(timestamp, None)
  51. # append result
  52. results.append({'url': url,
  53. 'title': title,
  54. 'content': content,
  55. 'publishedDate': publishedDate})
  56. else:
  57. # append result
  58. results.append({'url': url,
  59. 'title': title,
  60. 'content': content})
  61. # return results
  62. return results