utils.py 707 B

1234567891011121314151617181920212223242526
try:  # Python 2 module names
    from HTMLParser import HTMLParser
    import htmlentitydefs
except ImportError:  # Python 3 moved both modules under the html package
    from html.parser import HTMLParser
    import html.entities as htmlentitydefs
  3. class HTMLTextExtractor(HTMLParser):
  4. def __init__(self):
  5. HTMLParser.__init__(self)
  6. self.result = [ ]
  7. def handle_data(self, d):
  8. self.result.append(d)
  9. def handle_charref(self, number):
  10. codepoint = int(number[1:], 16) if number[0] in (u'x', u'X') else int(number)
  11. self.result.append(unichr(codepoint))
  12. def handle_entityref(self, name):
  13. codepoint = htmlentitydefs.name2codepoint[name]
  14. self.result.append(unichr(codepoint))
  15. def get_text(self):
  16. return u''.join(self.result)
  17. def html_to_text(html):
  18. s = HTMLTextExtractor()
  19. s.feed(html)
  20. return s.get_text()