PageRenderTime 61ms CodeModel.GetById 31ms app.highlight 24ms RepoModel.GetById 2ms app.codeStats 0ms

/Lib/xml/sax/expatreader.py

http://unladen-swallow.googlecode.com/
Python | 414 lines | 368 code | 25 blank | 21 comment | 25 complexity | eaa99dc3b0fcd846a44b18b6c9c4fc94 MD5 | raw file
  1"""
  2SAX driver for the pyexpat C module.  This driver works with
  3pyexpat.__version__ == '2.22'.
  4"""
  5
  6version = "0.20"
  7
  8from xml.sax._exceptions import *
  9from xml.sax.handler import feature_validation, feature_namespaces
 10from xml.sax.handler import feature_namespace_prefixes
 11from xml.sax.handler import feature_external_ges, feature_external_pes
 12from xml.sax.handler import feature_string_interning
 13from xml.sax.handler import property_xml_string, property_interning_dict
 14
 15# xml.parsers.expat does not raise ImportError in Jython
 16import sys
 17if sys.platform[:4] == "java":
 18    raise SAXReaderNotAvailable("expat not available in Java", None)
 19del sys
 20
 21try:
 22    from xml.parsers import expat
 23except ImportError:
 24    raise SAXReaderNotAvailable("expat not supported", None)
 25else:
 26    if not hasattr(expat, "ParserCreate"):
 27        raise SAXReaderNotAvailable("expat not supported", None)
 28from xml.sax import xmlreader, saxutils, handler
 29
# Module-level shortcuts for the attribute container classes that the
# element handlers below instantiate for every start tag.
AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl

# If we're using a sufficiently recent version of Python, we can use
# weak references to avoid cycles between the parser and content
# handler, otherwise we'll just have to pretend.
try:
    # Probe for the C-level weak reference support.
    import _weakref
except ImportError:
    # No weak reference support: "proxy" is simply the object itself,
    # i.e. a strong reference.
    def _mkproxy(o):
        return o
else:
    import weakref
    _mkproxy = weakref.proxy
    # Only _mkproxy is needed from here on; remove the two module names
    # from this module's namespace again.
    del weakref, _weakref
 45
 46# --- ExpatLocator
 47
 48class ExpatLocator(xmlreader.Locator):
 49    """Locator for use with the ExpatParser class.
 50
 51    This uses a weak reference to the parser object to avoid creating
 52    a circular reference between the parser and the content handler.
 53    """
 54    def __init__(self, parser):
 55        self._ref = _mkproxy(parser)
 56
 57    def getColumnNumber(self):
 58        parser = self._ref
 59        if parser._parser is None:
 60            return None
 61        return parser._parser.ErrorColumnNumber
 62
 63    def getLineNumber(self):
 64        parser = self._ref
 65        if parser._parser is None:
 66            return 1
 67        return parser._parser.ErrorLineNumber
 68
 69    def getPublicId(self):
 70        parser = self._ref
 71        if parser is None:
 72            return None
 73        return parser._source.getPublicId()
 74
 75    def getSystemId(self):
 76        parser = self._ref
 77        if parser is None:
 78            return None
 79        return parser._source.getSystemId()
 80
 81
 82# --- ExpatParser
 83
 84class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
 85    """SAX driver for the pyexpat C module."""
 86
 87    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
 88        xmlreader.IncrementalParser.__init__(self, bufsize)
 89        self._source = xmlreader.InputSource()
 90        self._parser = None
 91        self._namespaces = namespaceHandling
 92        self._lex_handler_prop = None
 93        self._parsing = 0
 94        self._entity_stack = []
 95        self._external_ges = 1
 96        self._interning = None
 97
 98    # XMLReader methods
 99
100    def parse(self, source):
101        "Parse an XML document from a URL or an InputSource."
102        source = saxutils.prepare_input_source(source)
103
104        self._source = source
105        self.reset()
106        self._cont_handler.setDocumentLocator(ExpatLocator(self))
107        xmlreader.IncrementalParser.parse(self, source)
108
109    def prepareParser(self, source):
110        if source.getSystemId() is not None:
111            self._parser.SetBase(source.getSystemId())
112
113    # Redefined setContentHandler to allow changing handlers during parsing
114
115    def setContentHandler(self, handler):
116        xmlreader.IncrementalParser.setContentHandler(self, handler)
117        if self._parsing:
118            self._reset_cont_handler()
119
120    def getFeature(self, name):
121        if name == feature_namespaces:
122            return self._namespaces
123        elif name == feature_string_interning:
124            return self._interning is not None
125        elif name in (feature_validation, feature_external_pes,
126                      feature_namespace_prefixes):
127            return 0
128        elif name == feature_external_ges:
129            return self._external_ges
130        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)
131
132    def setFeature(self, name, state):
133        if self._parsing:
134            raise SAXNotSupportedException("Cannot set features while parsing")
135
136        if name == feature_namespaces:
137            self._namespaces = state
138        elif name == feature_external_ges:
139            self._external_ges = state
140        elif name == feature_string_interning:
141            if state:
142                if self._interning is None:
143                    self._interning = {}
144            else:
145                self._interning = None
146        elif name == feature_validation:
147            if state:
148                raise SAXNotSupportedException(
149                    "expat does not support validation")
150        elif name == feature_external_pes:
151            if state:
152                raise SAXNotSupportedException(
153                    "expat does not read external parameter entities")
154        elif name == feature_namespace_prefixes:
155            if state:
156                raise SAXNotSupportedException(
157                    "expat does not report namespace prefixes")
158        else:
159            raise SAXNotRecognizedException(
160                "Feature '%s' not recognized" % name)
161
162    def getProperty(self, name):
163        if name == handler.property_lexical_handler:
164            return self._lex_handler_prop
165        elif name == property_interning_dict:
166            return self._interning
167        elif name == property_xml_string:
168            if self._parser:
169                if hasattr(self._parser, "GetInputContext"):
170                    return self._parser.GetInputContext()
171                else:
172                    raise SAXNotRecognizedException(
173                        "This version of expat does not support getting"
174                        " the XML string")
175            else:
176                raise SAXNotSupportedException(
177                    "XML string cannot be returned when not parsing")
178        raise SAXNotRecognizedException("Property '%s' not recognized" % name)
179
180    def setProperty(self, name, value):
181        if name == handler.property_lexical_handler:
182            self._lex_handler_prop = value
183            if self._parsing:
184                self._reset_lex_handler_prop()
185        elif name == property_interning_dict:
186            self._interning = value
187        elif name == property_xml_string:
188            raise SAXNotSupportedException("Property '%s' cannot be set" %
189                                           name)
190        else:
191            raise SAXNotRecognizedException("Property '%s' not recognized" %
192                                            name)
193
194    # IncrementalParser methods
195
196    def feed(self, data, isFinal = 0):
197        if not self._parsing:
198            self.reset()
199            self._parsing = 1
200            self._cont_handler.startDocument()
201
202        try:
203            # The isFinal parameter is internal to the expat reader.
204            # If it is set to true, expat will check validity of the entire
205            # document. When feeding chunks, they are not normally final -
206            # except when invoked from close.
207            self._parser.Parse(data, isFinal)
208        except expat.error, e:
209            exc = SAXParseException(expat.ErrorString(e.code), e, self)
210            # FIXME: when to invoke error()?
211            self._err_handler.fatalError(exc)
212
213    def close(self):
214        if self._entity_stack:
215            # If we are completing an external entity, do nothing here
216            return
217        self.feed("", isFinal = 1)
218        self._cont_handler.endDocument()
219        self._parsing = 0
220        # break cycle created by expat handlers pointing to our methods
221        self._parser = None
222
223    def _reset_cont_handler(self):
224        self._parser.ProcessingInstructionHandler = \
225                                    self._cont_handler.processingInstruction
226        self._parser.CharacterDataHandler = self._cont_handler.characters
227
228    def _reset_lex_handler_prop(self):
229        lex = self._lex_handler_prop
230        parser = self._parser
231        if lex is None:
232            parser.CommentHandler = None
233            parser.StartCdataSectionHandler = None
234            parser.EndCdataSectionHandler = None
235            parser.StartDoctypeDeclHandler = None
236            parser.EndDoctypeDeclHandler = None
237        else:
238            parser.CommentHandler = lex.comment
239            parser.StartCdataSectionHandler = lex.startCDATA
240            parser.EndCdataSectionHandler = lex.endCDATA
241            parser.StartDoctypeDeclHandler = self.start_doctype_decl
242            parser.EndDoctypeDeclHandler = lex.endDTD
243
244    def reset(self):
245        if self._namespaces:
246            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
247                                              intern=self._interning)
248            self._parser.namespace_prefixes = 1
249            self._parser.StartElementHandler = self.start_element_ns
250            self._parser.EndElementHandler = self.end_element_ns
251        else:
252            self._parser = expat.ParserCreate(self._source.getEncoding(),
253                                              intern = self._interning)
254            self._parser.StartElementHandler = self.start_element
255            self._parser.EndElementHandler = self.end_element
256
257        self._reset_cont_handler()
258        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
259        self._parser.NotationDeclHandler = self.notation_decl
260        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
261        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl
262
263        self._decl_handler_prop = None
264        if self._lex_handler_prop:
265            self._reset_lex_handler_prop()
266#         self._parser.DefaultHandler =
267#         self._parser.DefaultHandlerExpand =
268#         self._parser.NotStandaloneHandler =
269        self._parser.ExternalEntityRefHandler = self.external_entity_ref
270        try:
271            self._parser.SkippedEntityHandler = self.skipped_entity_handler
272        except AttributeError:
273            # This pyexpat does not support SkippedEntity
274            pass
275        self._parser.SetParamEntityParsing(
276            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
277
278        self._parsing = 0
279        self._entity_stack = []
280
281    # Locator methods
282
283    def getColumnNumber(self):
284        if self._parser is None:
285            return None
286        return self._parser.ErrorColumnNumber
287
288    def getLineNumber(self):
289        if self._parser is None:
290            return 1
291        return self._parser.ErrorLineNumber
292
293    def getPublicId(self):
294        return self._source.getPublicId()
295
296    def getSystemId(self):
297        return self._source.getSystemId()
298
299    # event handlers
300    def start_element(self, name, attrs):
301        self._cont_handler.startElement(name, AttributesImpl(attrs))
302
303    def end_element(self, name):
304        self._cont_handler.endElement(name)
305
306    def start_element_ns(self, name, attrs):
307        pair = name.split()
308        if len(pair) == 1:
309            # no namespace
310            pair = (None, name)
311        elif len(pair) == 3:
312            pair = pair[0], pair[1]
313        else:
314            # default namespace
315            pair = tuple(pair)
316
317        newattrs = {}
318        qnames = {}
319        for (aname, value) in attrs.items():
320            parts = aname.split()
321            length = len(parts)
322            if length == 1:
323                # no namespace
324                qname = aname
325                apair = (None, aname)
326            elif length == 3:
327                qname = "%s:%s" % (parts[2], parts[1])
328                apair = parts[0], parts[1]
329            else:
330                # default namespace
331                qname = parts[1]
332                apair = tuple(parts)
333
334            newattrs[apair] = value
335            qnames[apair] = qname
336
337        self._cont_handler.startElementNS(pair, None,
338                                          AttributesNSImpl(newattrs, qnames))
339
340    def end_element_ns(self, name):
341        pair = name.split()
342        if len(pair) == 1:
343            pair = (None, name)
344        elif len(pair) == 3:
345            pair = pair[0], pair[1]
346        else:
347            pair = tuple(pair)
348
349        self._cont_handler.endElementNS(pair, None)
350
351    # this is not used (call directly to ContentHandler)
352    def processing_instruction(self, target, data):
353        self._cont_handler.processingInstruction(target, data)
354
355    # this is not used (call directly to ContentHandler)
356    def character_data(self, data):
357        self._cont_handler.characters(data)
358
359    def start_namespace_decl(self, prefix, uri):
360        self._cont_handler.startPrefixMapping(prefix, uri)
361
362    def end_namespace_decl(self, prefix):
363        self._cont_handler.endPrefixMapping(prefix)
364
365    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
366        self._lex_handler_prop.startDTD(name, pubid, sysid)
367
368    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
369        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)
370
371    def notation_decl(self, name, base, sysid, pubid):
372        self._dtd_handler.notationDecl(name, pubid, sysid)
373
374    def external_entity_ref(self, context, base, sysid, pubid):
375        if not self._external_ges:
376            return 1
377
378        source = self._ent_handler.resolveEntity(pubid, sysid)
379        source = saxutils.prepare_input_source(source,
380                                               self._source.getSystemId() or
381                                               "")
382
383        self._entity_stack.append((self._parser, self._source))
384        self._parser = self._parser.ExternalEntityParserCreate(context)
385        self._source = source
386
387        try:
388            xmlreader.IncrementalParser.parse(self, source)
389        except:
390            return 0  # FIXME: save error info here?
391
392        (self._parser, self._source) = self._entity_stack[-1]
393        del self._entity_stack[-1]
394        return 1
395
396    def skipped_entity_handler(self, name, is_pe):
397        if is_pe:
398            # The SAX spec requires to report skipped PEs with a '%'
399            name = '%'+name
400        self._cont_handler.skippedEntity(name)
401
402# ---
403
def create_parser(*args, **kwargs):
    "Build a new ExpatParser, forwarding every argument to its constructor."
    parser = ExpatParser(*args, **kwargs)
    return parser
406
407# ---
408
if __name__ == "__main__":
    # Small manual demo: parse a sample document and echo it as XML.
    import xml.sax
    # XMLGenerator lives in xml.sax.saxutils; it is not an attribute of
    # the xml.sax package itself, so it has to be imported from there.
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("../../../hamlet.xml")