twitter.py 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. """
  2. Twitter (Social media)
  3. @website https://twitter.com/
  4. @provide-api yes (https://dev.twitter.com/docs/using-search)
  5. @using-api no
  6. @results HTML (using search portal)
  7. @stable no (HTML can change)
  8. @parse url, title, content
  9. @todo publishedDate
  10. """
  11. from lxml import html
  12. from datetime import datetime
  13. from searx.engines.xpath import extract_text
  14. from searx.url_utils import urlencode, urljoin
# engine dependent config
categories = ['social media']  # searx category this engine belongs to
language_support = True        # engine honours the user's language setting (via a cookie)

# search-url
base_url = 'https://twitter.com/'
search_url = base_url + 'search?'

# specific xpath variables (scraped from Twitter's HTML search portal,
# so these are fragile and may break when the markup changes)
results_xpath = '//li[@data-item-type="tweet"]'          # one <li> per tweet
link_xpath = './/small[@class="time"]//a'                # permalink anchor of the tweet
title_xpath = './/span[contains(@class, "username")]'    # author @username
content_xpath = './/p[contains(@class, "tweet-text")]'   # tweet body text
timestamp_xpath = './/span[contains(@class,"_timestamp")]'  # carries data-time (unix epoch)
  27. # do search-request
  28. def request(query, params):
  29. params['url'] = search_url + urlencode({'q': query})
  30. # set language if specified
  31. if params['language'] != 'all':
  32. params['cookies']['lang'] = params['language'].split('-')[0]
  33. else:
  34. params['cookies']['lang'] = 'en'
  35. return params
  36. # get response from search-request
  37. def response(resp):
  38. results = []
  39. dom = html.fromstring(resp.text)
  40. # parse results
  41. for tweet in dom.xpath(results_xpath):
  42. try:
  43. link = tweet.xpath(link_xpath)[0]
  44. content = extract_text(tweet.xpath(content_xpath)[0])
  45. except Exception:
  46. continue
  47. url = urljoin(base_url, link.attrib.get('href'))
  48. title = extract_text(tweet.xpath(title_xpath))
  49. pubdate = tweet.xpath(timestamp_xpath)
  50. if len(pubdate) > 0:
  51. timestamp = float(pubdate[0].attrib.get('data-time'))
  52. publishedDate = datetime.fromtimestamp(timestamp, None)
  53. # append result
  54. results.append({'url': url,
  55. 'title': title,
  56. 'content': content,
  57. 'publishedDate': publishedDate})
  58. else:
  59. # append result
  60. results.append({'url': url,
  61. 'title': title,
  62. 'content': content})
  63. # return results
  64. return results