/lib/html5lib/treebuilders/__init__.py
Python | 96 lines | 59 code | 3 blank | 34 comment | 0 complexity | 06fe3442576d2f07c3d677146dc2dc4d MD5 | raw file
- """A collection of modules for building different kinds of tree from
- HTML documents.
- To create a treebuilder for a new type of tree, you need to
- implement several things:
- 1) A set of classes for various types of elements: Document, Doctype,
- Comment, Element. These must implement the interface of
- treebuilders._base.Node (although comment nodes have a different
- signature for their constructor, see treebuilders.simpletree.Comment).
- Textual content may also be implemented as another node type, or not, as
- your tree implementation requires.
- 2) A treebuilder object (called TreeBuilder by convention) that
- inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
- documentClass - the class to use for the bottommost node of a document
- elementClass - the class to use for HTML Elements
- commentClass - the class to use for comments
- doctypeClass - the class to use for doctypes
- It also has one required method:
- getDocument - Returns the root node of the complete document tree
- 3) If you wish to run the unit tests, you must also create a
- testSerializer method on your treebuilder which accepts a node and
- returns a string containing Node and its children serialized according
- to the format used in the unittests
- The supplied simpletree module provides a python-only implementation
- of a full treebuilder and is a useful reference for the semantics of
- the various methods.
- """
import sys

# Cache of TreeBuilder classes keyed by lower-cased tree type name. Only the
# "simpletree", "beautifulsoup" and "lxml" builders are stored here; the
# "dom" and "etree" builders are returned straight from their submodules.
treeBuilderCache = {}


def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive).
               Supported values are "simpletree", "dom", "etree",
               "beautifulsoup" and "lxml"

               "simpletree" - a built-in DOM-ish tree type with support for
                              some more pythonic idioms.
               "dom" - A generic builder for DOM implementations, defaulting
                       to a xml.dom.minidom based implementation for the
                       sake of backwards compatibility (as releases up until
                       0.10 had a builder called "dom" that was a minidom
                       implementation).
               "etree" - A generic builder for tree implementations exposing
                         an elementtree-like interface (known to work with
                         ElementTree, cElementTree and lxml.etree).
               "beautifulsoup" - Beautiful soup (if installed)
               "lxml" - The builder provided by the etree_lxml submodule

    implementation - (Currently applies to the "etree" and "dom" tree
                     types). A module implementing the tree type e.g.
                     xml.etree.ElementTree or lxml.etree.

    kwargs - extra keyword arguments, forwarded unchanged to
             dom.getDomModule / etree.getETreeModule for the "dom" and
             "etree" tree types; ignored for the other tree types.

    Returns the TreeBuilder class for the requested tree type.

    Raises ValueError if treeType is not one of the supported values.
    """
    # NOTE(review): the submodule imports below are implicit relative imports
    # (Python 2 semantics); they appear to target sibling modules of this
    # package. Confirm the supported interpreter versions before converting
    # them to "from . import ..." form.
    treeType = treeType.lower()
    if treeType not in treeBuilderCache:
        if treeType == "dom":
            import dom
            # XXX: Keep backwards compatibility by using minidom if no
            # implementation is given. Identity test, so that a supplied
            # implementation object is never asked to compare itself to None.
            if implementation is None:
                from xml.dom import minidom
                implementation = minidom
            # XXX: NEVER cache here, caching is done in the dom submodule
            return dom.getDomModule(implementation, **kwargs).TreeBuilder
        elif treeType == "simpletree":
            import simpletree
            treeBuilderCache[treeType] = simpletree.TreeBuilder
        elif treeType == "beautifulsoup":
            import soup
            treeBuilderCache[treeType] = soup.TreeBuilder
        elif treeType == "lxml":
            import etree_lxml
            treeBuilderCache[treeType] = etree_lxml.TreeBuilder
        elif treeType == "etree":
            # Come up with a sane default: try each known ElementTree
            # flavour in turn, letting the final ImportError propagate if
            # none of them is available.
            if implementation is None:
                try:
                    import xml.etree.cElementTree as ET
                except ImportError:
                    try:
                        import xml.etree.ElementTree as ET
                    except ImportError:
                        try:
                            import cElementTree as ET
                        except ImportError:
                            import elementtree.ElementTree as ET
                implementation = ET
            import etree
            # NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeBuilder
        else:
            raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
    return treeBuilderCache.get(treeType)