/utils/newsfetcher/planet/vendor/html5lib/treebuilders/__init__.py

https://bitbucket.org/fenics-project/fenics-web · Python · 79 lines · 39 code · 3 blank · 37 comment · 7 complexity · eb04262f48d882f8c9f3ac0971db9591 MD5 · raw file

  1. """A collection of modules for building different kinds of tree from
  2. HTML documents.
  3. To create a treebuilder for a new type of tree, you need to
  4. implement several things:
  5. 1) A set of classes for various types of elements: Document, Doctype,
  6. Comment, Element. These must implement the interface of
  7. treebuilders._base.Node (although comment nodes have a different
  8. signature for their constructor, see treebuilders.simpletree.Comment)
  9. Textual content may also be implemented as another node type, or not, as
  10. your tree implementation requires.
  11. 2) A treebuilder object (called TreeBuilder by convention) that
  12. inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
  13. documentClass - the class to use for the bottommost node of a document
  14. elementClass - the class to use for HTML Elements
  15. commentClass - the class to use for comments
  16. doctypeClass - the class to use for doctypes
  17. It also has one required method:
  18. getDocument - Returns the root node of the complete document tree
  19. 3) If you wish to run the unit tests, you must also create a
  20. testSerializer method on your treebuilder which accepts a node and
  21. returns a string containing Node and its children serialized according
  22. to the format used in the unittests
  23. The supplied simpletree module provides a python-only implementation
  24. of a full treebuilder and is a useful reference for the semantics of
  25. the various methods.
  26. """
  27. treeBuilderCache = {}
  28. def getTreeBuilder(treeType, implementation=None, **kwargs):
  29. """Get a TreeBuilder class for various types of tree with built-in support
  30. treeType - the name of the tree type required (case-insensitive). Supported
  31. values are "simpletree", "dom", "etree" and "beautifulsoup"
  32. "simpletree" - a built-in DOM-ish tree type with support for some
  33. more pythonic idioms.
  34. "dom" - A generic builder for DOM implementations, defaulting to
  35. a xml.dom.minidom based implementation for the sake of
  36. backwards compatibility (as releases up until 0.10 had a
  37. builder called "dom" that was a minidom implemenation).
  38. "etree" - A generic builder for tree implementations exposing an
  39. elementtree-like interface (known to work with
  40. ElementTree, cElementTree and lxml.etree).
  41. "beautifulsoup" - Beautiful soup (if installed)
  42. implementation - (Currently applies to the "etree" and "dom" tree types). A
  43. module implementing the tree type e.g.
  44. xml.etree.ElementTree or lxml.etree."""
  45. treeType = treeType.lower()
  46. if treeType not in treeBuilderCache:
  47. if treeType == "dom":
  48. import dom
  49. # XXX: Keep backwards compatibility by using minidom if no implementation is given
  50. if implementation == None:
  51. from xml.dom import minidom
  52. implementation = minidom
  53. # XXX: NEVER cache here, caching is done in the dom submodule
  54. return dom.getDomModule(implementation, **kwargs).TreeBuilder
  55. elif treeType == "simpletree":
  56. import simpletree
  57. treeBuilderCache[treeType] = simpletree.TreeBuilder
  58. elif treeType == "beautifulsoup":
  59. import soup
  60. treeBuilderCache[treeType] = soup.TreeBuilder
  61. elif treeType == "lxml":
  62. import etree_lxml
  63. treeBuilderCache[treeType] = etree_lxml.TreeBuilder
  64. elif treeType == "etree":
  65. import etree
  66. # XXX: NEVER cache here, caching is done in the etree submodule
  67. return etree.getETreeModule(implementation, **kwargs).TreeBuilder
  68. return treeBuilderCache.get(treeType)