soup.py - Finds a node by identity rather than equality Con…

/gaeapp/html5lib/treebuilders/soup.py

https://github.com/scottjasta/Places · Python · 228 lines · 181 code · 40 blank · 7 comment · 45 complexity · 91eeede1afb58a46c3ecf78c4c77cca7 MD5 · raw file


import warnings

warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)

from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration

import _base
from html5lib.constants import namespaces, DataLossWarning

class AttrList(object):
    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)
    def __iter__(self):
        return self.attrs.items().__iter__()
    def __setitem__(self, name, value):
        "set attr", name, value
        self.element[name] = value
    def items(self):
        return self.attrs.items()
    def keys(self):
        return self.attrs.keys()
    def __getitem__(self, name):
        return self.attrs[name]
    def __contains__(self, name):
        return name in self.attrs.keys()


class Element(_base.Node):
    def __init__(self, element, soup, namespace):
        _base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def _nodeIndex(self, node, refNode):
        # Finds a node by identity rather than equality
        for index in range(len(self.element.contents)):
            if id(self.element.contents[index]) == id(refNode.element):
                return index
        return None

    def appendChild(self, node):
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
            newStr = NavigableString(self.element.contents[-1]+node.element)

            # Remove the old text node
            # (Can't simply use .extract() by itself, because it fails if
            # an equal text node exists within the parent node)
            oldElement = self.element.contents[-1]
            del self.element.contents[-1]
            oldElement.parent = None
            oldElement.extract()

            self.element.insert(len(self.element.contents), newStr)
        else:
            self.element.insert(len(self.element.contents), node.element)
            node.parent = self

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes:
            for name, value in attributes.items():
                self.element[name] =  value

    attributes = property(getAttributes, setAttributes)
    
    def insertText(self, data, insertBefore=None):
        text = TextNode(NavigableString(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self._nodeIndex(node, refNode)
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            newStr = NavigableString(self.element.contents[index-1]+node.element)
            oldNode = self.element.contents[index-1]
            del self.element.contents[index-1]
            oldNode.parent = None
            oldNode.extract()

            self.element.insert(index-1, newStr)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        index = self._nodeIndex(node.parent, node)
        del node.parent.element.contents[index]
        node.element.parent = None
        node.element.extract()
        node.parent = None

    def reparentChildren(self, newParent):
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(TextNode(child, self.soup))

    def cloneNode(self):
        node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
        for key,value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        return self.element.contents

    def getNameTuple(self):
        if self.namespace == None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)

class TextNode(Element):
    def __init__(self, element, soup):
        _base.Node.__init__(self, None)
        self.element = element
        self.soup = soup
    
    def cloneNode(self):
        raise NotImplementedError

class TreeBuilder(_base.TreeBuilder):
    def __init__(self, namespaceHTMLElements):
        if namespaceHTMLElements:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)
        
    def documentClass(self):
        self.soup = BeautifulSoup("")
        return Element(self.soup, self.soup, None)
    
    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if publicId:
            self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
        elif systemId:
            self.soup.insert(0, Declaration("%s SYSTEM \"%s\""%
                                            (name, systemId)))
        else:
            self.soup.insert(0, Declaration(name))
    
    def elementClass(self, name, namespace):
        if namespace is not None:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        return Element(Tag(self.soup, name), self.soup, namespace)
        
    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)
    
    def fragmentClass(self):
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None) 

    def appendChild(self, node):
        self.soup.insert(len(self.soup.contents), node.element)

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        return self.soup
    
    def getFragment(self):
        return _base.TreeBuilder.getFragment(self).element
    
def testSerializer(element):
    import re
    rv = []
    def serializeElement(element, indent=0):
        if isinstance(element, Declaration):
            doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
            m = re.compile(doctype_regexp).match(element.string)
            assert m is not None, "DOCTYPE did not match expected format"
            name = m.group('name')
            publicId = m.group('publicId')
            if publicId is not None:
                systemId = m.group('systemId1') or ""
            else:
                systemId = m.group('systemId2')

            if publicId is not None or systemId is not None:
                rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
                          (' '*indent, name, publicId or "", systemId or ""))
            else:
                rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))
            
        elif isinstance(element, BeautifulSoup):
            if element.name == "[document_fragment]":
                rv.append("#document-fragment")                
            else:
                rv.append("#document")

        elif isinstance(element, Comment):
            rv.append("|%s<!-- %s -->"%(' '*indent, element.string))
        elif isinstance(element, unicode):
            rv.append("|%s\"%s\"" %(' '*indent, element))
        else:
            rv.append("|%s<%s>"%(' '*indent, element.name))
            if element.attrs:
                for name, value in element.attrs:
                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
        indent += 2
        if hasattr(element, "contents"):
            for child in element.contents:
                serializeElement(child, indent)
    serializeElement(element, 0)

    return "\n".join(rv)

Tech Fingerprint

Alerts (45)

'def' Ensure functions have docstrings for documentation
19 21 43 63 66 73 80 96 103 112 118 121 135 144 148 161 166 169 174 177 180 183 186 189
Complexity hotspot; lines 44 to 45 (total complexity: 3)
44 45
'del' Avoid unless necessary; Python's garbage collector typically handles object deletion
54 87 98
Complexity hotspot; lines 82 to 83 (total complexity: 3)
82 83
'isinstance(' Overuse may indicate design issues; consider polymorphism
107 190 207 213 215
'== None' Use 'is' for None comparisons (e.g., x is None)
122
Complexity hotspot; lines 153 to 155 (total complexity: 3)
153 154 155
Complexity hotspot; lines 196 to 198 (total complexity: 3)
196 197 198
Complexity hotspot; lines 203 to 204 (total complexity: 3)
203 204