PageRenderTime 506ms CodeModel.GetById 201ms app.highlight 187ms RepoModel.GetById 104ms app.codeStats 0ms

/Lib/xml/etree/ElementTree.py

http://unladen-swallow.googlecode.com/
Python | 1260 lines | 1162 code | 5 blank | 93 comment | 7 complexity | 6556d86428b61f94ae07fd61c75146f6 MD5 | raw file
   1#
   2# ElementTree
   3# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
   4#
   5# light-weight XML support for Python 1.5.2 and later.
   6#
   7# history:
   8# 2001-10-20 fl   created (from various sources)
   9# 2001-11-01 fl   return root from parse method
  10# 2002-02-16 fl   sort attributes in lexical order
  11# 2002-04-06 fl   TreeBuilder refactoring, added PythonDoc markup
  12# 2002-05-01 fl   finished TreeBuilder refactoring
  13# 2002-07-14 fl   added basic namespace support to ElementTree.write
  14# 2002-07-25 fl   added QName attribute support
  15# 2002-10-20 fl   fixed encoding in write
  16# 2002-11-24 fl   changed default encoding to ascii; fixed attribute encoding
  17# 2002-11-27 fl   accept file objects or file names for parse/write
  18# 2002-12-04 fl   moved XMLTreeBuilder back to this module
  19# 2003-01-11 fl   fixed entity encoding glitch for us-ascii
  20# 2003-02-13 fl   added XML literal factory
  21# 2003-02-21 fl   added ProcessingInstruction/PI factory
  22# 2003-05-11 fl   added tostring/fromstring helpers
  23# 2003-05-26 fl   added ElementPath support
  24# 2003-07-05 fl   added makeelement factory method
  25# 2003-07-28 fl   added more well-known namespace prefixes
  26# 2003-08-15 fl   fixed typo in ElementTree.findtext (Thomas Dartsch)
  27# 2003-09-04 fl   fall back on emulator if ElementPath is not installed
  28# 2003-10-31 fl   markup updates
  29# 2003-11-15 fl   fixed nested namespace bug
  30# 2004-03-28 fl   added XMLID helper
  31# 2004-06-02 fl   added default support to findtext
  32# 2004-06-08 fl   fixed encoding of non-ascii element/attribute names
  33# 2004-08-23 fl   take advantage of post-2.1 expat features
  34# 2005-02-01 fl   added iterparse implementation
  35# 2005-03-02 fl   fixed iterparse support for pre-2.2 versions
  36#
  37# Copyright (c) 1999-2005 by Fredrik Lundh.  All rights reserved.
  38#
  39# fredrik@pythonware.com
  40# http://www.pythonware.com
  41#
  42# --------------------------------------------------------------------
  43# The ElementTree toolkit is
  44#
  45# Copyright (c) 1999-2005 by Fredrik Lundh
  46#
  47# By obtaining, using, and/or copying this software and/or its
  48# associated documentation, you agree that you have read, understood,
  49# and will comply with the following terms and conditions:
  50#
  51# Permission to use, copy, modify, and distribute this software and
  52# its associated documentation for any purpose and without fee is
  53# hereby granted, provided that the above copyright notice appears in
  54# all copies, and that both that copyright notice and this permission
  55# notice appear in supporting documentation, and that the name of
  56# Secret Labs AB or the author not be used in advertising or publicity
  57# pertaining to distribution of the software without specific, written
  58# prior permission.
  59#
  60# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
  61# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
  62# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
  63# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
  64# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  65# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  66# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  67# OF THIS SOFTWARE.
  68# --------------------------------------------------------------------
  69
  70# Licensed to PSF under a Contributor Agreement.
  71# See http://www.python.org/2.4/license for licensing details.
  72
  73__all__ = [
  74    # public symbols
  75    "Comment",
  76    "dump",
  77    "Element", "ElementTree",
  78    "fromstring",
  79    "iselement", "iterparse",
  80    "parse",
  81    "PI", "ProcessingInstruction",
  82    "QName",
  83    "SubElement",
  84    "tostring",
  85    "TreeBuilder",
  86    "VERSION", "XML",
  87    "XMLParser", "XMLTreeBuilder",
  88    ]
  89
  90##
  91# The <b>Element</b> type is a flexible container object, designed to
  92# store hierarchical data structures in memory. The type can be
  93# described as a cross between a list and a dictionary.
  94# <p>
  95# Each element has a number of properties associated with it:
  96# <ul>
  97# <li>a <i>tag</i>. This is a string identifying what kind of data
  98# this element represents (the element type, in other words).</li>
  99# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
 100# <li>a <i>text</i> string.</li>
 101# <li>an optional <i>tail</i> string.</li>
 102# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
 103# </ul>
 104#
 105# To create an element instance, use the {@link #Element} or {@link
 106# #SubElement} factory functions.
 107# <p>
 108# The {@link #ElementTree} class can be used to wrap an element
 109# structure, and convert it from and to XML.
 110##
 111
 112import string, sys, re
 113
 114class _SimpleElementPath:
 115    # emulate pre-1.2 find/findtext/findall behaviour
 116    def find(self, element, tag):
 117        for elem in element:
 118            if elem.tag == tag:
 119                return elem
 120        return None
 121    def findtext(self, element, tag, default=None):
 122        for elem in element:
 123            if elem.tag == tag:
 124                return elem.text or ""
 125        return default
 126    def findall(self, element, tag):
 127        if tag[:3] == ".//":
 128            return element.getiterator(tag[3:])
 129        result = []
 130        for elem in element:
 131            if elem.tag == tag:
 132                result.append(elem)
 133        return result
 134
 135try:
 136    import ElementPath
 137except ImportError:
 138    # FIXME: issue warning in this case?
 139    ElementPath = _SimpleElementPath()
 140
 141# TODO: add support for custom namespace resolvers/default namespaces
 142# TODO: add improved support for incremental parsing
 143
 144VERSION = "1.2.6"
 145
 146##
 147# Internal element class.  This class defines the Element interface,
 148# and provides a reference implementation of this interface.
 149# <p>
 150# You should not create instances of this class directly.  Use the
 151# appropriate factory functions instead, such as {@link #Element}
 152# and {@link #SubElement}.
 153#
 154# @see Element
 155# @see SubElement
 156# @see Comment
 157# @see ProcessingInstruction
 158
 159class _ElementInterface:
 160    # <tag attrib>text<child/>...</tag>tail
 161
 162    ##
 163    # (Attribute) Element tag.
 164
 165    tag = None
 166
 167    ##
 168    # (Attribute) Element attribute dictionary.  Where possible, use
 169    # {@link #_ElementInterface.get},
 170    # {@link #_ElementInterface.set},
 171    # {@link #_ElementInterface.keys}, and
 172    # {@link #_ElementInterface.items} to access
 173    # element attributes.
 174
 175    attrib = None
 176
 177    ##
 178    # (Attribute) Text before first subelement.  This is either a
 179    # string or the value None, if there was no text.
 180
 181    text = None
 182
 183    ##
 184    # (Attribute) Text after this element's end tag, but before the
 185    # next sibling element's start tag.  This is either a string or
 186    # the value None, if there was no text.
 187
 188    tail = None # text after end tag, if any
 189
 190    def __init__(self, tag, attrib):
 191        self.tag = tag
 192        self.attrib = attrib
 193        self._children = []
 194
 195    def __repr__(self):
 196        return "<Element %s at %x>" % (self.tag, id(self))
 197
 198    ##
 199    # Creates a new element object of the same type as this element.
 200    #
 201    # @param tag Element tag.
 202    # @param attrib Element attributes, given as a dictionary.
 203    # @return A new element instance.
 204
 205    def makeelement(self, tag, attrib):
 206        return Element(tag, attrib)
 207
 208    ##
 209    # Returns the number of subelements.
 210    #
 211    # @return The number of subelements.
 212
 213    def __len__(self):
 214        return len(self._children)
 215
 216    ##
 217    # Returns the given subelement.
 218    #
 219    # @param index What subelement to return.
 220    # @return The given subelement.
 221    # @exception IndexError If the given element does not exist.
 222
 223    def __getitem__(self, index):
 224        return self._children[index]
 225
 226    ##
 227    # Replaces the given subelement.
 228    #
 229    # @param index What subelement to replace.
 230    # @param element The new element value.
 231    # @exception IndexError If the given element does not exist.
 232    # @exception AssertionError If element is not a valid object.
 233
 234    def __setitem__(self, index, element):
 235        assert iselement(element)
 236        self._children[index] = element
 237
 238    ##
 239    # Deletes the given subelement.
 240    #
 241    # @param index What subelement to delete.
 242    # @exception IndexError If the given element does not exist.
 243
 244    def __delitem__(self, index):
 245        del self._children[index]
 246
 247    ##
 248    # Returns a list containing subelements in the given range.
 249    #
 250    # @param start The first subelement to return.
 251    # @param stop The first subelement that shouldn't be returned.
 252    # @return A sequence object containing subelements.
 253
 254    def __getslice__(self, start, stop):
 255        return self._children[start:stop]
 256
 257    ##
 258    # Replaces a number of subelements with elements from a sequence.
 259    #
 260    # @param start The first subelement to replace.
 261    # @param stop The first subelement that shouldn't be replaced.
 262    # @param elements A sequence object with zero or more elements.
 263    # @exception AssertionError If a sequence member is not a valid object.
 264
 265    def __setslice__(self, start, stop, elements):
 266        for element in elements:
 267            assert iselement(element)
 268        self._children[start:stop] = list(elements)
 269
 270    ##
 271    # Deletes a number of subelements.
 272    #
 273    # @param start The first subelement to delete.
 274    # @param stop The first subelement to leave in there.
 275
 276    def __delslice__(self, start, stop):
 277        del self._children[start:stop]
 278
 279    ##
 280    # Adds a subelement to the end of this element.
 281    #
 282    # @param element The element to add.
 283    # @exception AssertionError If a sequence member is not a valid object.
 284
 285    def append(self, element):
 286        assert iselement(element)
 287        self._children.append(element)
 288
 289    ##
 290    # Inserts a subelement at the given position in this element.
 291    #
 292    # @param index Where to insert the new subelement.
 293    # @exception AssertionError If the element is not a valid object.
 294
 295    def insert(self, index, element):
 296        assert iselement(element)
 297        self._children.insert(index, element)
 298
 299    ##
 300    # Removes a matching subelement.  Unlike the <b>find</b> methods,
 301    # this method compares elements based on identity, not on tag
 302    # value or contents.
 303    #
 304    # @param element What element to remove.
 305    # @exception ValueError If a matching element could not be found.
 306    # @exception AssertionError If the element is not a valid object.
 307
 308    def remove(self, element):
 309        assert iselement(element)
 310        self._children.remove(element)
 311
 312    ##
 313    # Returns all subelements.  The elements are returned in document
 314    # order.
 315    #
 316    # @return A list of subelements.
 317    # @defreturn list of Element instances
 318
 319    def getchildren(self):
 320        return self._children
 321
 322    ##
 323    # Finds the first matching subelement, by tag name or path.
 324    #
 325    # @param path What element to look for.
 326    # @return The first matching element, or None if no element was found.
 327    # @defreturn Element or None
 328
 329    def find(self, path):
 330        return ElementPath.find(self, path)
 331
 332    ##
 333    # Finds text for the first matching subelement, by tag name or path.
 334    #
 335    # @param path What element to look for.
 336    # @param default What to return if the element was not found.
 337    # @return The text content of the first matching element, or the
 338    #     default value no element was found.  Note that if the element
 339    #     has is found, but has no text content, this method returns an
 340    #     empty string.
 341    # @defreturn string
 342
 343    def findtext(self, path, default=None):
 344        return ElementPath.findtext(self, path, default)
 345
 346    ##
 347    # Finds all matching subelements, by tag name or path.
 348    #
 349    # @param path What element to look for.
 350    # @return A list or iterator containing all matching elements,
 351    #    in document order.
 352    # @defreturn list of Element instances
 353
 354    def findall(self, path):
 355        return ElementPath.findall(self, path)
 356
 357    ##
 358    # Resets an element.  This function removes all subelements, clears
 359    # all attributes, and sets the text and tail attributes to None.
 360
 361    def clear(self):
 362        self.attrib.clear()
 363        self._children = []
 364        self.text = self.tail = None
 365
 366    ##
 367    # Gets an element attribute.
 368    #
 369    # @param key What attribute to look for.
 370    # @param default What to return if the attribute was not found.
 371    # @return The attribute value, or the default value, if the
 372    #     attribute was not found.
 373    # @defreturn string or None
 374
 375    def get(self, key, default=None):
 376        return self.attrib.get(key, default)
 377
 378    ##
 379    # Sets an element attribute.
 380    #
 381    # @param key What attribute to set.
 382    # @param value The attribute value.
 383
 384    def set(self, key, value):
 385        self.attrib[key] = value
 386
 387    ##
 388    # Gets a list of attribute names.  The names are returned in an
 389    # arbitrary order (just like for an ordinary Python dictionary).
 390    #
 391    # @return A list of element attribute names.
 392    # @defreturn list of strings
 393
 394    def keys(self):
 395        return self.attrib.keys()
 396
 397    ##
 398    # Gets element attributes, as a sequence.  The attributes are
 399    # returned in an arbitrary order.
 400    #
 401    # @return A list of (name, value) tuples for all attributes.
 402    # @defreturn list of (string, string) tuples
 403
 404    def items(self):
 405        return self.attrib.items()
 406
 407    ##
 408    # Creates a tree iterator.  The iterator loops over this element
 409    # and all subelements, in document order, and returns all elements
 410    # with a matching tag.
 411    # <p>
 412    # If the tree structure is modified during iteration, the result
 413    # is undefined.
 414    #
 415    # @param tag What tags to look for (default is to return all elements).
 416    # @return A list or iterator containing all the matching elements.
 417    # @defreturn list or iterator
 418
 419    def getiterator(self, tag=None):
 420        nodes = []
 421        if tag == "*":
 422            tag = None
 423        if tag is None or self.tag == tag:
 424            nodes.append(self)
 425        for node in self._children:
 426            nodes.extend(node.getiterator(tag))
 427        return nodes
 428
 429# compatibility
 430_Element = _ElementInterface
 431
 432##
 433# Element factory.  This function returns an object implementing the
 434# standard Element interface.  The exact class or type of that object
 435# is implementation dependent, but it will always be compatible with
 436# the {@link #_ElementInterface} class in this module.
 437# <p>
 438# The element name, attribute names, and attribute values can be
 439# either 8-bit ASCII strings or Unicode strings.
 440#
 441# @param tag The element name.
 442# @param attrib An optional dictionary, containing element attributes.
 443# @param **extra Additional attributes, given as keyword arguments.
 444# @return An element instance.
 445# @defreturn Element
 446
 447def Element(tag, attrib={}, **extra):
 448    attrib = attrib.copy()
 449    attrib.update(extra)
 450    return _ElementInterface(tag, attrib)
 451
 452##
 453# Subelement factory.  This function creates an element instance, and
 454# appends it to an existing element.
 455# <p>
 456# The element name, attribute names, and attribute values can be
 457# either 8-bit ASCII strings or Unicode strings.
 458#
 459# @param parent The parent element.
 460# @param tag The subelement name.
 461# @param attrib An optional dictionary, containing element attributes.
 462# @param **extra Additional attributes, given as keyword arguments.
 463# @return An element instance.
 464# @defreturn Element
 465
 466def SubElement(parent, tag, attrib={}, **extra):
 467    attrib = attrib.copy()
 468    attrib.update(extra)
 469    element = parent.makeelement(tag, attrib)
 470    parent.append(element)
 471    return element
 472
 473##
 474# Comment element factory.  This factory function creates a special
 475# element that will be serialized as an XML comment.
 476# <p>
 477# The comment string can be either an 8-bit ASCII string or a Unicode
 478# string.
 479#
 480# @param text A string containing the comment string.
 481# @return An element instance, representing a comment.
 482# @defreturn Element
 483
 484def Comment(text=None):
 485    element = Element(Comment)
 486    element.text = text
 487    return element
 488
 489##
 490# PI element factory.  This factory function creates a special element
 491# that will be serialized as an XML processing instruction.
 492#
 493# @param target A string containing the PI target.
 494# @param text A string containing the PI contents, if any.
 495# @return An element instance, representing a PI.
 496# @defreturn Element
 497
 498def ProcessingInstruction(target, text=None):
 499    element = Element(ProcessingInstruction)
 500    element.text = target
 501    if text:
 502        element.text = element.text + " " + text
 503    return element
 504
 505PI = ProcessingInstruction
 506
 507##
 508# QName wrapper.  This can be used to wrap a QName attribute value, in
 509# order to get proper namespace handling on output.
 510#
 511# @param text A string containing the QName value, in the form {uri}local,
 512#     or, if the tag argument is given, the URI part of a QName.
  513# @param tag Optional tag.  If given, the first argument is interpreted as
  514#     a URI, and this argument is interpreted as a local name.
 515# @return An opaque object, representing the QName.
 516
 517class QName:
 518    def __init__(self, text_or_uri, tag=None):
 519        if tag:
 520            text_or_uri = "{%s}%s" % (text_or_uri, tag)
 521        self.text = text_or_uri
 522    def __str__(self):
 523        return self.text
 524    def __hash__(self):
 525        return hash(self.text)
 526    def __cmp__(self, other):
 527        if isinstance(other, QName):
 528            return cmp(self.text, other.text)
 529        return cmp(self.text, other)
 530
 531##
 532# ElementTree wrapper class.  This class represents an entire element
 533# hierarchy, and adds some extra support for serialization to and from
 534# standard XML.
 535#
 536# @param element Optional root element.
 537# @keyparam file Optional file handle or name.  If given, the
 538#     tree is initialized with the contents of this XML file.
 539
 540class ElementTree:
 541
 542    def __init__(self, element=None, file=None):
 543        assert element is None or iselement(element)
 544        self._root = element # first node
 545        if file:
 546            self.parse(file)
 547
 548    ##
 549    # Gets the root element for this tree.
 550    #
 551    # @return An element instance.
 552    # @defreturn Element
 553
 554    def getroot(self):
 555        return self._root
 556
 557    ##
 558    # Replaces the root element for this tree.  This discards the
 559    # current contents of the tree, and replaces it with the given
 560    # element.  Use with care.
 561    #
 562    # @param element An element instance.
 563
 564    def _setroot(self, element):
 565        assert iselement(element)
 566        self._root = element
 567
 568    ##
 569    # Loads an external XML document into this element tree.
 570    #
 571    # @param source A file name or file object.
 572    # @param parser An optional parser instance.  If not given, the
 573    #     standard {@link XMLTreeBuilder} parser is used.
 574    # @return The document root element.
 575    # @defreturn Element
 576
 577    def parse(self, source, parser=None):
 578        if not hasattr(source, "read"):
 579            source = open(source, "rb")
 580        if not parser:
 581            parser = XMLTreeBuilder()
 582        while 1:
 583            data = source.read(32768)
 584            if not data:
 585                break
 586            parser.feed(data)
 587        self._root = parser.close()
 588        return self._root
 589
 590    ##
 591    # Creates a tree iterator for the root element.  The iterator loops
 592    # over all elements in this tree, in document order.
 593    #
 594    # @param tag What tags to look for (default is to return all elements)
 595    # @return An iterator.
 596    # @defreturn iterator
 597
 598    def getiterator(self, tag=None):
 599        assert self._root is not None
 600        return self._root.getiterator(tag)
 601
 602    ##
 603    # Finds the first toplevel element with given tag.
 604    # Same as getroot().find(path).
 605    #
 606    # @param path What element to look for.
 607    # @return The first matching element, or None if no element was found.
 608    # @defreturn Element or None
 609
 610    def find(self, path):
 611        assert self._root is not None
 612        if path[:1] == "/":
 613            path = "." + path
 614        return self._root.find(path)
 615
 616    ##
 617    # Finds the element text for the first toplevel element with given
 618    # tag.  Same as getroot().findtext(path).
 619    #
 620    # @param path What toplevel element to look for.
 621    # @param default What to return if the element was not found.
 622    # @return The text content of the first matching element, or the
 623    #     default value no element was found.  Note that if the element
 624    #     has is found, but has no text content, this method returns an
 625    #     empty string.
 626    # @defreturn string
 627
 628    def findtext(self, path, default=None):
 629        assert self._root is not None
 630        if path[:1] == "/":
 631            path = "." + path
 632        return self._root.findtext(path, default)
 633
 634    ##
 635    # Finds all toplevel elements with the given tag.
 636    # Same as getroot().findall(path).
 637    #
 638    # @param path What element to look for.
 639    # @return A list or iterator containing all matching elements,
 640    #    in document order.
 641    # @defreturn list of Element instances
 642
 643    def findall(self, path):
 644        assert self._root is not None
 645        if path[:1] == "/":
 646            path = "." + path
 647        return self._root.findall(path)
 648
 649    ##
 650    # Writes the element tree to a file, as XML.
 651    #
 652    # @param file A file name, or a file object opened for writing.
 653    # @param encoding Optional output encoding (default is US-ASCII).
 654
 655    def write(self, file, encoding="us-ascii"):
 656        assert self._root is not None
 657        if not hasattr(file, "write"):
 658            file = open(file, "wb")
 659        if not encoding:
 660            encoding = "us-ascii"
 661        elif encoding != "utf-8" and encoding != "us-ascii":
 662            file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
 663        self._write(file, self._root, encoding, {})
 664
 665    def _write(self, file, node, encoding, namespaces):
 666        # write XML to file
 667        tag = node.tag
 668        if tag is Comment:
 669            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
 670        elif tag is ProcessingInstruction:
 671            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
 672        else:
 673            items = node.items()
 674            xmlns_items = [] # new namespaces in this scope
 675            try:
 676                if isinstance(tag, QName) or tag[:1] == "{":
 677                    tag, xmlns = fixtag(tag, namespaces)
 678                    if xmlns: xmlns_items.append(xmlns)
 679            except TypeError:
 680                _raise_serialization_error(tag)
 681            file.write("<" + _encode(tag, encoding))
 682            if items or xmlns_items:
 683                items.sort() # lexical order
 684                for k, v in items:
 685                    try:
 686                        if isinstance(k, QName) or k[:1] == "{":
 687                            k, xmlns = fixtag(k, namespaces)
 688                            if xmlns: xmlns_items.append(xmlns)
 689                    except TypeError:
 690                        _raise_serialization_error(k)
 691                    try:
 692                        if isinstance(v, QName):
 693                            v, xmlns = fixtag(v, namespaces)
 694                            if xmlns: xmlns_items.append(xmlns)
 695                    except TypeError:
 696                        _raise_serialization_error(v)
 697                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
 698                                               _escape_attrib(v, encoding)))
 699                for k, v in xmlns_items:
 700                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
 701                                               _escape_attrib(v, encoding)))
 702            if node.text or len(node):
 703                file.write(">")
 704                if node.text:
 705                    file.write(_escape_cdata(node.text, encoding))
 706                for n in node:
 707                    self._write(file, n, encoding, namespaces)
 708                file.write("</" + _encode(tag, encoding) + ">")
 709            else:
 710                file.write(" />")
 711            for k, v in xmlns_items:
 712                del namespaces[v]
 713        if node.tail:
 714            file.write(_escape_cdata(node.tail, encoding))
 715
 716# --------------------------------------------------------------------
 717# helpers
 718
 719##
 720# Checks if an object appears to be a valid element object.
 721#
  722# @param element An element instance.
 723# @return A true value if this is an element object.
 724# @defreturn flag
 725
 726def iselement(element):
 727    # FIXME: not sure about this; might be a better idea to look
 728    # for tag/attrib/text attributes
 729    return isinstance(element, _ElementInterface) or hasattr(element, "tag")
 730
 731##
 732# Writes an element tree or element structure to sys.stdout.  This
 733# function should be used for debugging only.
 734# <p>
 735# The exact output format is implementation dependent.  In this
 736# version, it's written as an ordinary XML file.
 737#
 738# @param elem An element tree or an individual element.
 739
 740def dump(elem):
 741    # debugging
 742    if not isinstance(elem, ElementTree):
 743        elem = ElementTree(elem)
 744    elem.write(sys.stdout)
 745    tail = elem.getroot().tail
 746    if not tail or tail[-1] != "\n":
 747        sys.stdout.write("\n")
 748
 749def _encode(s, encoding):
 750    try:
 751        return s.encode(encoding)
 752    except AttributeError:
 753        return s # 1.5.2: assume the string uses the right encoding
 754
# Pattern matching runs of characters that _encode_entity rewrites:
# the reserved characters &, <, > and the double quote, plus everything
# above the 7-bit ASCII range.
if sys.version[:3] == "1.5":
    _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
else:
    # NOTE(review): the unicode literal is presumably built through
    # eval() so that this line still compiles on interpreters without
    # u"" literals -- confirm before simplifying.  The evaluated string
    # is a constant, not external input.
    _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))

# Named entities used by _encode_entity; characters matched by _escape
# that are not listed here are written as numeric character references.
_escape_map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
}

# Maps namespace URIs to the prefixes that fixtag prefers over the
# generated "ns%d" prefixes.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}
 774
 775def _raise_serialization_error(text):
 776    raise TypeError(
 777        "cannot serialize %r (type %s)" % (text, type(text).__name__)
 778        )
 779
 780def _encode_entity(text, pattern=_escape):
 781    # map reserved and non-ascii characters to numerical entities
 782    def escape_entities(m, map=_escape_map):
 783        out = []
 784        append = out.append
 785        for char in m.group():
 786            text = map.get(char)
 787            if text is None:
 788                text = "&#%d;" % ord(char)
 789            append(text)
 790        return string.join(out, "")
 791    try:
 792        return _encode(pattern.sub(escape_entities, text), "ascii")
 793    except TypeError:
 794        _raise_serialization_error(text)
 795
 796#
 797# the following functions assume an ascii-compatible encoding
 798# (or "utf-16")
 799
 800def _escape_cdata(text, encoding=None, replace=string.replace):
 801    # escape character data
 802    try:
 803        if encoding:
 804            try:
 805                text = _encode(text, encoding)
 806            except UnicodeError:
 807                return _encode_entity(text)
 808        text = replace(text, "&", "&amp;")
 809        text = replace(text, "<", "&lt;")
 810        text = replace(text, ">", "&gt;")
 811        return text
 812    except (TypeError, AttributeError):
 813        _raise_serialization_error(text)
 814
 815def _escape_attrib(text, encoding=None, replace=string.replace):
 816    # escape attribute value
 817    try:
 818        if encoding:
 819            try:
 820                text = _encode(text, encoding)
 821            except UnicodeError:
 822                return _encode_entity(text)
 823        text = replace(text, "&", "&amp;")
 824        text = replace(text, "'", "&apos;") # FIXME: overkill
 825        text = replace(text, "\"", "&quot;")
 826        text = replace(text, "<", "&lt;")
 827        text = replace(text, ">", "&gt;")
 828        return text
 829    except (TypeError, AttributeError):
 830        _raise_serialization_error(text)
 831
 832def fixtag(tag, namespaces):
 833    # given a decorated tag (of the form {uri}tag), return prefixed
 834    # tag and namespace declaration, if any
 835    if isinstance(tag, QName):
 836        tag = tag.text
 837    namespace_uri, tag = string.split(tag[1:], "}", 1)
 838    prefix = namespaces.get(namespace_uri)
 839    if prefix is None:
 840        prefix = _namespace_map.get(namespace_uri)
 841        if prefix is None:
 842            prefix = "ns%d" % len(namespaces)
 843        namespaces[namespace_uri] = prefix
 844        if prefix == "xml":
 845            xmlns = None
 846        else:
 847            xmlns = ("xmlns:%s" % prefix, namespace_uri)
 848    else:
 849        xmlns = None
 850    return "%s:%s" % (prefix, tag), xmlns
 851
 852##
 853# Parses an XML document into an element tree.
 854#
 855# @param source A filename or file object containing XML data.
 856# @param parser An optional parser instance.  If not given, the
 857#     standard {@link XMLTreeBuilder} parser is used.
 858# @return An ElementTree instance
 859
 860def parse(source, parser=None):
 861    tree = ElementTree()
 862    tree.parse(source, parser)
 863    return tree
 864
 865##
 866# Parses an XML document into an element tree incrementally, and reports
 867# what's going on to the user.
 868#
 869# @param source A filename or file object containing XML data.
 870# @param events A list of events to report back.  If omitted, only "end"
 871#     events are reported.
 872# @return A (event, elem) iterator.
 873
 874class iterparse:
 875
 876    def __init__(self, source, events=None):
 877        if not hasattr(source, "read"):
 878            source = open(source, "rb")
 879        self._file = source
 880        self._events = []
 881        self._index = 0
 882        self.root = self._root = None
 883        self._parser = XMLTreeBuilder()
 884        # wire up the parser for event reporting
 885        parser = self._parser._parser
 886        append = self._events.append
 887        if events is None:
 888            events = ["end"]
 889        for event in events:
 890            if event == "start":
 891                try:
 892                    parser.ordered_attributes = 1
 893                    parser.specified_attributes = 1
 894                    def handler(tag, attrib_in, event=event, append=append,
 895                                start=self._parser._start_list):
 896                        append((event, start(tag, attrib_in)))
 897                    parser.StartElementHandler = handler
 898                except AttributeError:
 899                    def handler(tag, attrib_in, event=event, append=append,
 900                                start=self._parser._start):
 901                        append((event, start(tag, attrib_in)))
 902                    parser.StartElementHandler = handler
 903            elif event == "end":
 904                def handler(tag, event=event, append=append,
 905                            end=self._parser._end):
 906                    append((event, end(tag)))
 907                parser.EndElementHandler = handler
 908            elif event == "start-ns":
 909                def handler(prefix, uri, event=event, append=append):
 910                    try:
 911                        uri = _encode(uri, "ascii")
 912                    except UnicodeError:
 913                        pass
 914                    append((event, (prefix or "", uri)))
 915                parser.StartNamespaceDeclHandler = handler
 916            elif event == "end-ns":
 917                def handler(prefix, event=event, append=append):
 918                    append((event, None))
 919                parser.EndNamespaceDeclHandler = handler
 920
 921    def next(self):
 922        while 1:
 923            try:
 924                item = self._events[self._index]
 925            except IndexError:
 926                if self._parser is None:
 927                    self.root = self._root
 928                    try:
 929                        raise StopIteration
 930                    except NameError:
 931                        raise IndexError
 932                # load event buffer
 933                del self._events[:]
 934                self._index = 0
 935                data = self._file.read(16384)
 936                if data:
 937                    self._parser.feed(data)
 938                else:
 939                    self._root = self._parser.close()
 940                    self._parser = None
 941            else:
 942                self._index = self._index + 1
 943                return item
 944
 945    try:
 946        iter
 947        def __iter__(self):
 948            return self
 949    except NameError:
 950        def __getitem__(self, index):
 951            return self.next()
 952
 953##
 954# Parses an XML document from a string constant.  This function can
 955# be used to embed "XML literals" in Python code.
 956#
# @param text A string containing XML data.
 958# @return An Element instance.
 959# @defreturn Element
 960
 961def XML(text):
 962    parser = XMLTreeBuilder()
 963    parser.feed(text)
 964    return parser.close()
 965
 966##
 967# Parses an XML document from a string constant, and also returns
 968# a dictionary which maps from element id:s to elements.
 969#
# @param text A string containing XML data.
 971# @return A tuple containing an Element instance and a dictionary.
 972# @defreturn (Element, dictionary)
 973
 974def XMLID(text):
 975    parser = XMLTreeBuilder()
 976    parser.feed(text)
 977    tree = parser.close()
 978    ids = {}
 979    for elem in tree.getiterator():
 980        id = elem.get("id")
 981        if id:
 982            ids[id] = elem
 983    return tree, ids
 984
 985##
 986# Parses an XML document from a string constant.  Same as {@link #XML}.
 987#
 988# @def fromstring(text)
# @param text A string containing XML data.
 990# @return An Element instance.
 991# @defreturn Element
 992
# alias: fromstring is bound to the same function object as XML
fromstring = XML
 994
 995##
 996# Generates a string representation of an XML element, including all
 997# subelements.
 998#
# @param element An Element instance.
# @param encoding Optional output encoding, passed on to the
#     ElementTree write method.
1000# @return An encoded string containing the XML data.
1001# @defreturn string
1002
def tostring(element, encoding=None):
    # Serialize via ElementTree.write, using a throwaway object whose
    # write attribute appends each chunk to a list; the collected
    # chunks are joined into the result string at the end.
    class dummy:
        pass
    chunks = []
    sink = dummy()
    sink.write = chunks.append
    ElementTree(element).write(sink, encoding)
    return string.join(chunks, "")
1011
1012##
1013# Generic element structure builder.  This builder converts a sequence
1014# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1015# #TreeBuilder.end} method calls to a well-formed element structure.
1016# <p>
1017# You can use this class to build an element structure using a custom XML
1018# parser, or a parser for some other XML-like format.
1019#
1020# @param element_factory Optional element factory.  This factory
1021#    is called to create new Element instances, as necessary.
1022
class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    ##
    # Checks that all elements have been closed, and returns the
    # toplevel document element.  Character data that is still pending
    # is not flushed by this method.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert not self._elem, "missing end tags"
        # identity test; "!= None" would dispatch to whatever comparison
        # methods the element class happens to define
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # attach the collected character data to the last element: as
        # its tail if an end tag was seen last, and as its text
        # otherwise.  data collected before any element exists is
        # dropped
        if self._data:
            if self._last is not None:
                text = string.join(self._data, "")
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        # just collect the data; _flush assigns it when the next start
        # or end tag arrives
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # not the toplevel element; add it to the innermost open
            # element
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last
1099
1100##
1101# Element structure builder for XML source data, based on the
1102# <b>expat</b> parser.
1103#
1104# @keyparam target Target object.  If omitted, the builder uses an
1105#     instance of the standard {@link #TreeBuilder} class.
1106# @keyparam html Predefine HTML entities.  This flag is not supported
1107#     by the current implementation.
1108# @see #ElementTree
1109# @see #TreeBuilder
1110
class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        # note: the html argument is accepted, but nothing in this
        # class reads it
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # the second argument is expat's namespace separator; names
        # that contain it are turned into {uri}tag form by _fixname
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # list of doctype tokens while inside one
        self.entity = {} # entity name -> replacement text

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start tag handler; attributes arrive as a dictionary
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start tag handler; attributes arrive as a flat list of
        # alternating names and values
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        # handler for everything that has no dedicated callback; used
        # to resolve entities via the entity dictionary, and to pick
        # up the doctype declaration
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities.  only the dictionary lookup
            # is guarded, so that a KeyError raised by the target's data
            # method is not mistaken for a missing entity
            try:
                replacement = self.entity[text[1:-1]]
            except KeyError:
                from xml.parsers import expat
                lineno = self._parser.ErrorLineNumber
                offset = self._parser.ErrorColumnNumber
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, lineno, offset)
                    )
                # provide the position in structured form as well
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = lineno
                err.offset = offset
                raise err
            self._target.data(replacement)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = string.strip(text)
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # strip the first and last character (the quotes) from
                # the identifiers
                if pubid:
                    pubid = pubid[1:-1]
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree
1258
# compatibility: XMLParser is another name for the XMLTreeBuilder class
XMLParser = XMLTreeBuilder