/desktop/core/ext-py/lxml/src/lxml/html/soupparser.py

https://github.com/jcrobak/hue · Python · 122 lines · 74 code · 17 blank · 31 comment · 23 complexity · 16311760675f4807b636aef597b17478 MD5 · raw file

__doc__ = """External interface to the BeautifulSoup HTML parser.
"""
# Names exported by ``from <this module> import *``.
__all__ = ["fromstring", "parse", "convert_tree"]
  4. from lxml import etree, html
  5. from BeautifulSoup import \
  6. BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
  7. def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
  8. """Parse a string of HTML data into an Element tree using the
  9. BeautifulSoup parser.
  10. Returns the root ``<html>`` Element of the tree.
  11. You can pass a different BeautifulSoup parser through the
  12. `beautifulsoup` keyword, and a diffent Element factory function
  13. through the `makeelement` keyword. By default, the standard
  14. ``BeautifulSoup`` class and the default factory of `lxml.html` are
  15. used.
  16. """
  17. return _parse(data, beautifulsoup, makeelement, **bsargs)
  18. def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
  19. """Parse a file into an ElemenTree using the BeautifulSoup parser.
  20. You can pass a different BeautifulSoup parser through the
  21. `beautifulsoup` keyword, and a diffent Element factory function
  22. through the `makeelement` keyword. By default, the standard
  23. ``BeautifulSoup`` class and the default factory of `lxml.html` are
  24. used.
  25. """
  26. if not hasattr(file, 'read'):
  27. file = open(file)
  28. root = _parse(file, beautifulsoup, makeelement, **bsargs)
  29. return etree.ElementTree(root)
  30. def convert_tree(beautiful_soup_tree, makeelement=None):
  31. """Convert a BeautifulSoup tree to a list of Element trees.
  32. Returns a list instead of a single root Element to support
  33. HTML-like soup with more than one root element.
  34. You can pass a different Element factory through the `makeelement`
  35. keyword.
  36. """
  37. if makeelement is None:
  38. makeelement = html.html_parser.makeelement
  39. root = _convert_tree(beautiful_soup_tree, makeelement)
  40. children = root.getchildren()
  41. for child in children:
  42. root.remove(child)
  43. return children
  44. # helpers
  45. def _parse(source, beautifulsoup, makeelement, **bsargs):
  46. if beautifulsoup is None:
  47. beautifulsoup = BeautifulSoup
  48. if makeelement is None:
  49. makeelement = html.html_parser.makeelement
  50. if 'convertEntities' not in bsargs:
  51. bsargs['convertEntities'] = 'html'
  52. tree = beautifulsoup(source, **bsargs)
  53. root = _convert_tree(tree, makeelement)
  54. # from ET: wrap the document in a html root element, if necessary
  55. if len(root) == 1 and root[0].tag == "html":
  56. return root[0]
  57. root.tag = "html"
  58. return root
  59. def _convert_tree(beautiful_soup_tree, makeelement):
  60. root = makeelement(beautiful_soup_tree.name,
  61. attrib=dict(beautiful_soup_tree.attrs))
  62. _convert_children(root, beautiful_soup_tree, makeelement)
  63. return root
  64. def _convert_children(parent, beautiful_soup_tree, makeelement):
  65. SubElement = etree.SubElement
  66. et_child = None
  67. for child in beautiful_soup_tree:
  68. if isinstance(child, Tag):
  69. et_child = SubElement(parent, child.name, attrib=dict(
  70. [(k, unescape(v)) for (k,v) in child.attrs]))
  71. _convert_children(et_child, child, makeelement)
  72. elif type(child) is NavigableString:
  73. _append_text(parent, et_child, unescape(child))
  74. else:
  75. if isinstance(child, Comment):
  76. parent.append(etree.Comment(child))
  77. elif isinstance(child, ProcessingInstruction):
  78. parent.append(etree.ProcessingInstruction(
  79. *child.split(' ', 1)))
  80. else: # CData
  81. _append_text(parent, et_child, unescape(child))
  82. def _append_text(parent, element, text):
  83. if element is None:
  84. parent.text = (parent.text or '') + text
  85. else:
  86. element.tail = (element.tail or '') + text
  87. # copied from ET's ElementSoup
  88. from htmlentitydefs import name2codepoint
  89. import re
  90. handle_entities = re.compile("&(\w+);").sub
  91. def unescape(string):
  92. if not string:
  93. return ''
  94. # work around oddities in BeautifulSoup's entity handling
  95. def unescape_entity(m):
  96. try:
  97. return unichr(name2codepoint[m.group(1)])
  98. except KeyError:
  99. return m.group(0) # use as is
  100. return handle_entities(unescape_entity, string)