PageRenderTime 119ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/python/html5lib/treebuilders/soup.py

https://github.com/mozilla/affiliates-lib
Python | 228 lines | 193 code | 28 blank | 7 comment | 30 complexity | 91eeede1afb58a46c3ecf78c4c77cca7 MD5 | raw file
  1. import warnings
  2. warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
  3. from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
  4. import _base
  5. from html5lib.constants import namespaces, DataLossWarning
  class AttrList(object):
      """Dict-like view over the attributes of a wrapped element.

      The constructor copies ``element.attrs`` into a plain dict that serves
      all reads.  Writes go to the wrapped element *and* to that dict, so a
      value assigned through this view can be read back through it.
      """

      def __init__(self, element):
          """Wrap *element*, snapshotting its ``attrs`` into a dict."""
          self.element = element
          self.attrs = dict(self.element.attrs)

      def __iter__(self):
          """Iterate over ``(name, value)`` pairs (not over bare names)."""
          return iter(self.attrs.items())

      def __setitem__(self, name, value):
          """Set attribute *name* to *value* on the element and in the view."""
          self.element[name] = value
          # Keep the snapshot in sync; otherwise a later read of this name
          # through the same view raises KeyError or returns a stale value.
          self.attrs[name] = value

      def items(self):
          """Return the ``(name, value)`` pairs of the snapshot."""
          return self.attrs.items()

      def keys(self):
          """Return the attribute names of the snapshot."""
          return self.attrs.keys()

      def __getitem__(self, name):
          """Return the value of attribute *name*; raises KeyError if absent."""
          return self.attrs[name]

      def __contains__(self, name):
          """Return True if attribute *name* is present."""
          return name in self.attrs
  class Element(_base.Node):
      """Tree-builder node that wraps a BeautifulSoup element.

      ``element`` is the wrapped BeautifulSoup object, ``soup`` the owning
      document object and ``namespace`` the namespace recorded for the node.
      """

      def __init__(self, element, soup, namespace):
          _base.Node.__init__(self, element.name)
          self.element = element
          self.soup = soup
          self.namespace = namespace

      @staticmethod
      def _isPlainText(obj):
          """Return True if *obj* is exactly a NavigableString.

          The exact-class comparison is kept from the original code;
          presumably it is deliberate so that NavigableString subclasses
          (e.g. comments) are never merged into text -- TODO confirm.
          """
          return obj.__class__ == NavigableString

      def _nodeIndex(self, node, refNode):
          """Return the index of ``refNode.element`` in this element's contents.

          The search is by identity rather than equality.  Returns None when
          the element is not found.  *node* is unused and kept only so the
          signature stays unchanged.
          """
          target = refNode.element
          for index, child in enumerate(self.element.contents):
              if child is target:
                  return index
          return None

      def appendChild(self, node):
          """Append *node* as the last child, merging adjacent text nodes."""
          if (self._isPlainText(node.element) and self.element.contents
                  and self._isPlainText(self.element.contents[-1])):
              # Concatenate new text onto old text node
              # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
              newStr = NavigableString(self.element.contents[-1] + node.element)

              # Remove the old text node
              # (Can't simply use .extract() by itself, because it fails if
              # an equal text node exists within the parent node)
              oldElement = self.element.contents[-1]
              del self.element.contents[-1]
              oldElement.parent = None
              oldElement.extract()

              self.element.insert(len(self.element.contents), newStr)
          else:
              self.element.insert(len(self.element.contents), node.element)
              node.parent = self

      def getAttributes(self):
          """Return an AttrList view over the wrapped element's attributes."""
          return AttrList(self.element)

      def setAttributes(self, attributes):
          """Copy every ``name: value`` pair of *attributes* onto the element.

          A false value (None, empty mapping) is ignored.
          """
          if attributes:
              for name, value in attributes.items():
                  self.element[name] = value

      attributes = property(getAttributes, setAttributes)

      def insertText(self, data, insertBefore=None):
          """Insert *data* as text, before *insertBefore* if given, else last."""
          text = TextNode(NavigableString(data), self.soup)
          if insertBefore:
              self.insertBefore(text, insertBefore)
          else:
              self.appendChild(text)

      def insertBefore(self, node, refNode):
          """Insert *node* immediately before the child *refNode*.

          Text is merged into a plain text node that directly precedes
          *refNode* instead of creating a second adjacent text node.

          Raises ValueError if *refNode* is not a child of this node.
          """
          index = self._nodeIndex(node, refNode)
          if index is None:
              # Fail explicitly rather than with a TypeError from "None - 1".
              raise ValueError("refNode is not a child of this node")
          # index > 0 is required: at index 0 there is no preceding sibling,
          # and contents[index - 1] would be contents[-1], i.e. the LAST child.
          if (index > 0 and self._isPlainText(node.element)
                  and self._isPlainText(self.element.contents[index - 1])):
              # Same remove-and-reinsert dance as in appendChild: build the
              # merged string, detach the old text node by position (not via
              # a bare .extract(), which fails when an equal text node exists
              # in the parent), then insert the merged node in its place.
              newStr = NavigableString(self.element.contents[index - 1] + node.element)
              oldNode = self.element.contents[index - 1]
              del self.element.contents[index - 1]
              oldNode.parent = None
              oldNode.extract()

              self.element.insert(index - 1, newStr)
          else:
              self.element.insert(index, node.element)
              node.parent = self

      def removeChild(self, node):
          """Detach *node* from its parent's contents and clear its parent."""
          # NOTE(review): the index is looked up in self.element.contents but
          # the deletion happens on node.parent.element.contents; this assumes
          # node.parent is this node -- confirm against callers.
          index = self._nodeIndex(node.parent, node)
          del node.parent.element.contents[index]
          node.element.parent = None
          node.element.extract()
          node.parent = None

      def reparentChildren(self, newParent):
          """Move every child of this element, in order, to *newParent*."""
          while self.element.contents:
              child = self.element.contents[0]
              child.extract()
              if isinstance(child, Tag):
                  # Re-wrapped tags are always given the HTML namespace.
                  newParent.appendChild(Element(child, self.soup, namespaces["html"]))
              else:
                  newParent.appendChild(TextNode(child, self.soup))

      def cloneNode(self):
          """Return a new Element with the same name, namespace and attributes.

          Children are not copied.
          """
          node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
          # Iterating the attributes view yields (name, value) pairs.
          for key, value in self.attributes:
              node.attributes[key] = value
          return node

      def hasContent(self):
          """Return the element's contents list (truthy when it has children)."""
          return self.element.contents

      def getNameTuple(self):
          """Return ``(namespace, name)``, defaulting to the HTML namespace."""
          if self.namespace is None:
              return namespaces["html"], self.name
          else:
              return self.namespace, self.name

      nameTuple = property(getNameTuple)
  class TextNode(Element):
      """Node wrapper for non-tag content; it is constructed with no name."""

      def __init__(self, element, soup):
          # Element.__init__ is deliberately bypassed: the base node is
          # initialised with a name of None instead of element.name.
          _base.Node.__init__(self, None)
          self.soup = soup
          self.element = element

      def cloneNode(self):
          """Cloning is not supported for this node type."""
          raise NotImplementedError
  class TreeBuilder(_base.TreeBuilder):
      """Tree builder whose document and nodes are BeautifulSoup objects."""

      def __init__(self, namespaceHTMLElements):
          """Warn about namespace loss if requested, then init the base class."""
          if namespaceHTMLElements:
              warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
          _base.TreeBuilder.__init__(self, namespaceHTMLElements)

      def documentClass(self):
          """Create a fresh empty soup and return it wrapped as the root node."""
          soup = BeautifulSoup("")
          self.soup = soup
          return Element(soup, soup, None)

      def insertDoctype(self, token):
          """Insert a Declaration built from *token* at the front of the soup.

          *token* must provide the keys "name", "publicId" and "systemId".
          """
          name = token["name"]
          publicId = token["publicId"]
          systemId = token["systemId"]

          # Build the declaration text first, then insert once.
          if publicId:
              text = "%s PUBLIC \"%s\" \"%s\"" % (name, publicId, systemId or "")
          elif systemId:
              text = "%s SYSTEM \"%s\"" % (name, systemId)
          else:
              text = name
          self.soup.insert(0, Declaration(text))

      def elementClass(self, name, namespace):
          """Return a new Element wrapping a Tag called *name*.

          A DataLossWarning is issued whenever *namespace* is not None.
          """
          if namespace is not None:
              warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
          tag = Tag(self.soup, name)
          return Element(tag, self.soup, namespace)

      def commentClass(self, data):
          """Return a TextNode wrapping a Comment holding *data*."""
          comment = Comment(data)
          return TextNode(comment, self.soup)

      def fragmentClass(self):
          """Create a fresh soup named "[document_fragment]" and wrap it."""
          soup = BeautifulSoup("")
          soup.name = "[document_fragment]"
          self.soup = soup
          return Element(soup, soup, None)

      def appendChild(self, node):
          """Append *node*'s element at the end of the soup's contents."""
          position = len(self.soup.contents)
          self.soup.insert(position, node.element)

      def testSerializer(self, element):
          """Delegate to the module-level ``testSerializer`` function."""
          return testSerializer(element)

      def getDocument(self):
          """Return the soup built so far."""
          return self.soup

      def getFragment(self):
          """Return the wrapped element of the base class's fragment node."""
          fragment = _base.TreeBuilder.getFragment(self)
          return fragment.element
  def testSerializer(element):
      """Return a text dump of the tree rooted at *element*.

      One line is produced per node; every line except the document marker
      starts with "|" followed by an indent that grows by two spaces per
      nesting level.  Attributes are listed one per line under their tag.
      """
      import re

      doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
      doctype_pattern = re.compile(doctype_regexp)
      lines = []

      def serializeElement(node, indent=0):
          pad = ' ' * indent
          # Order matters: the specific classes are tested before the
          # generic unicode check and the final catch-all tag branch.
          if isinstance(node, Declaration):
              m = doctype_pattern.match(node.string)
              assert m is not None, "DOCTYPE did not match expected format"
              name = m.group('name')
              publicId = m.group('publicId')
              if publicId is None:
                  systemId = m.group('systemId2')
              else:
                  systemId = m.group('systemId1') or ""

              if publicId is None and systemId is None:
                  lines.append("|%s<!DOCTYPE %s>" % (pad, name))
              else:
                  lines.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                               (pad, name, publicId or "", systemId or ""))
          elif isinstance(node, BeautifulSoup):
              if node.name == "[document_fragment]":
                  lines.append("#document-fragment")
              else:
                  lines.append("#document")
          elif isinstance(node, Comment):
              lines.append("|%s<!-- %s -->" % (pad, node.string))
          elif isinstance(node, unicode):
              lines.append("|%s\"%s\"" % (pad, node))
          else:
              lines.append("|%s<%s>" % (pad, node.name))
              if node.attrs:
                  attr_pad = ' ' * (indent + 2)
                  for attr_name, attr_value in node.attrs:
                      lines.append('|%s%s="%s"' % (attr_pad, attr_name, attr_value))

          # Children, when the node has any, are rendered one level deeper.
          if hasattr(node, "contents"):
              child_indent = indent + 2
              for child in node.contents:
                  serializeElement(child, child_indent)

      serializeElement(element, 0)
      return "\n".join(lines)