/source/Python/FileDownload/HTMLParsing/HTMLTableParser.py

https://github.com/alexdgarland/Weather_Analysis
Python | 65 lines | 38 code | 15 blank | 12 comment | 6 complexity | 8b0ab25c4b1b4d1ea0141267a3a3add4 MD5 | raw file
  1. #!/usr/bin/python
  2. import sys
  3. if sys.version_info[0] >= 3:
  4. from html.parser import HTMLParser
  5. else:
  6. from HTMLParser import HTMLParser
  7. from . import TagHandlingState as ths
  8. from .TableBuilder import TableBuilder
  9. class HTMLTableParser(HTMLParser, object):
  10. """
  11. Parses HTML text into a nested list -
  12. each inner list represents a row in the table.
  13. The methods used to build the in-memory table structure
  14. are taken from a state machine, which transitions
  15. as the parser progresses through the document.
  16. """
  17. def __init__(self):
  18. self._state_instances = { }
  19. if sys.version_info[0] >= 3:
  20. super(type(self), self).__init__(convert_charrefs=True)
  21. else:
  22. super(type(self), self).__init__()
  23. def _transition(self, new_state_class):
  24. # Only act if handler function has returned a new state type
  25. if new_state_class:
  26. # Memoise reusable instances of state classes
  27. if new_state_class not in self._state_instances:
  28. self._state_instances[new_state_class] = new_state_class()
  29. # Assign instance to current state
  30. self._state = self._state_instances[new_state_class]
  31. def handle_starttag(self, tag, attrs):
  32. new_state = self._state.handle_starttag_state(tag, attrs, self._builder)
  33. self._transition(new_state)
  34. def handle_endtag(self, tag):
  35. new_state = self._state.handle_endtag_state(tag, self._builder)
  36. self._transition(new_state)
  37. def handle_data(self, data):
  38. new_state = self._state.handle_data_state(data, self._builder)
  39. self._transition(new_state)
  40. def handle_entityref(self, name):
  41. new_state = self._state.handle_entityref_state(name, self._builder)
  42. self._transition(new_state)
  43. def GetTable(self, html):
  44. self.feed(html)
  45. return self._builder.table
  46. def feed(self, html):
  47. self._builder = TableBuilder()
  48. self._transition(ths.DefaultTagHandlingState)
  49. super(type(self), self).feed(html)