PageRenderTime 169ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/html5lib/treebuilders/__init__.py

https://github.com/junalmeida/Sick-Beard
Python | 96 lines | 59 code | 3 blank | 34 comment | 0 complexity | 06fe3442576d2f07c3d677146dc2dc4d MD5 | raw file
  1. """A collection of modules for building different kinds of tree from
  2. HTML documents.
  3. To create a treebuilder for a new type of tree, you need to
  4. implement several things:
  5. 1) A set of classes for various types of elements: Document, Doctype,
  6. Comment, Element. These must implement the interface of
  7. treebuilders._base.Node (although comment nodes have a different
  8. signature for their constructor, see treebuilders.simpletree.Comment)
  9. Textual content may also be implemented as another node type, or not, as
  10. your tree implementation requires.
  11. 2) A treebuilder object (called TreeBuilder by convention) that
  12. inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
  13. documentClass - the class to use for the bottommost node of a document
  14. elementClass - the class to use for HTML Elements
  15. commentClass - the class to use for comments
  16. doctypeClass - the class to use for doctypes
  17. It also has one required method:
  18. getDocument - Returns the root node of the complete document tree
  19. 3) If you wish to run the unit tests, you must also create a
  20. testSerializer method on your treebuilder which accepts a node and
  21. returns a string containing Node and its children serialized according
  22. to the format used in the unittests
  23. The supplied simpletree module provides a python-only implementation
  24. of a full treebuilder and is a useful reference for the semantics of
  25. the various methods.
  26. """
  27. treeBuilderCache = {}
  28. import sys
  29. def getTreeBuilder(treeType, implementation=None, **kwargs):
  30. """Get a TreeBuilder class for various types of tree with built-in support
  31. treeType - the name of the tree type required (case-insensitive). Supported
  32. values are "simpletree", "dom", "etree" and "beautifulsoup"
  33. "simpletree" - a built-in DOM-ish tree type with support for some
  34. more pythonic idioms.
  35. "dom" - A generic builder for DOM implementations, defaulting to
  36. a xml.dom.minidom based implementation for the sake of
  37. backwards compatibility (as releases up until 0.10 had a
  38. builder called "dom" that was a minidom implemenation).
  39. "etree" - A generic builder for tree implementations exposing an
  40. elementtree-like interface (known to work with
  41. ElementTree, cElementTree and lxml.etree).
  42. "beautifulsoup" - Beautiful soup (if installed)
  43. implementation - (Currently applies to the "etree" and "dom" tree types). A
  44. module implementing the tree type e.g.
  45. xml.etree.ElementTree or lxml.etree."""
  46. treeType = treeType.lower()
  47. if treeType not in treeBuilderCache:
  48. if treeType == "dom":
  49. import dom
  50. # XXX: Keep backwards compatibility by using minidom if no implementation is given
  51. if implementation == None:
  52. from xml.dom import minidom
  53. implementation = minidom
  54. # XXX: NEVER cache here, caching is done in the dom submodule
  55. return dom.getDomModule(implementation, **kwargs).TreeBuilder
  56. elif treeType == "simpletree":
  57. import simpletree
  58. treeBuilderCache[treeType] = simpletree.TreeBuilder
  59. elif treeType == "beautifulsoup":
  60. import soup
  61. treeBuilderCache[treeType] = soup.TreeBuilder
  62. elif treeType == "lxml":
  63. import etree_lxml
  64. treeBuilderCache[treeType] = etree_lxml.TreeBuilder
  65. elif treeType == "etree":
  66. # Come up with a sane default
  67. if implementation == None:
  68. try:
  69. import xml.etree.cElementTree as ET
  70. except ImportError:
  71. try:
  72. import xml.etree.ElementTree as ET
  73. except ImportError:
  74. try:
  75. import cElementTree as ET
  76. except ImportError:
  77. import elementtree.ElementTree as ET
  78. implementation = ET
  79. import etree
  80. # NEVER cache here, caching is done in the etree submodule
  81. return etree.getETreeModule(implementation, **kwargs).TreeBuilder
  82. else:
  83. raise ValueError("""Unrecognised treebuilder "%s" """%treeType)
  84. return treeBuilderCache.get(treeType)