/Lib/xml/sax/expatreader.py

http://unladen-swallow.googlecode.com/ · Python · 414 lines · 316 code · 61 blank · 37 comment · 66 complexity · eaa99dc3b0fcd846a44b18b6c9c4fc94 MD5 · raw file

  1. """
  2. SAX driver for the pyexpat C module. This driver works with
  3. pyexpat.__version__ == '2.22'.
  4. """
  5. version = "0.20"
  6. from xml.sax._exceptions import *
  7. from xml.sax.handler import feature_validation, feature_namespaces
  8. from xml.sax.handler import feature_namespace_prefixes
  9. from xml.sax.handler import feature_external_ges, feature_external_pes
  10. from xml.sax.handler import feature_string_interning
  11. from xml.sax.handler import property_xml_string, property_interning_dict
  # Jython ships an xml.parsers.expat that imports without raising
  # ImportError, so the platform has to be rejected explicitly first.
  import sys
  if sys.platform.startswith("java"):
      raise SAXReaderNotAvailable("expat not available in Java", None)
  del sys

  try:
      from xml.parsers import expat
  except ImportError:
      raise SAXReaderNotAvailable("expat not supported", None)

  # An importable module that lacks ParserCreate is of no use to this driver.
  if not hasattr(expat, "ParserCreate"):
      raise SAXReaderNotAvailable("expat not supported", None)
  24. from xml.sax import xmlreader, saxutils, handler
  # Short module-level aliases for the attribute containers handed to
  # the content handler.
  AttributesImpl = xmlreader.AttributesImpl
  AttributesNSImpl = xmlreader.AttributesNSImpl

  # When weak references are available, _mkproxy wraps an object in a weak
  # proxy so the locator does not keep the parser alive; otherwise it is
  # the identity function and we just have to pretend.
  try:
      import _weakref
  except ImportError:
      def _mkproxy(o):
          return o
  else:
      from weakref import proxy as _mkproxy
      del _weakref
  39. # --- ExpatLocator
  class ExpatLocator(xmlreader.Locator):
      """Locator for use with the ExpatParser class.

      The parser is held through _mkproxy() (a weak proxy when weak
      references are available) so that the locator does not create a
      circular reference between the parser and the content handler.
      """

      def __init__(self, parser):
          """Remember *parser*, the ExpatParser whose position is reported."""
          self._ref = _mkproxy(parser)

      def getColumnNumber(self):
          """Return expat's ErrorColumnNumber, or None with no expat parser."""
          owner = self._ref
          if owner._parser is not None:
              return owner._parser.ErrorColumnNumber
          return None

      def getLineNumber(self):
          """Return expat's ErrorLineNumber, or 1 with no expat parser."""
          owner = self._ref
          if owner._parser is not None:
              return owner._parser.ErrorLineNumber
          return 1

      def getPublicId(self):
          """Return the public id of the parser's current input source."""
          owner = self._ref
          if owner is not None:
              return owner._source.getPublicId()
          return None

      def getSystemId(self):
          """Return the system id of the parser's current input source."""
          owner = self._ref
          if owner is not None:
              return owner._source.getSystemId()
          return None
  67. # --- ExpatParser
  class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
      """SAX driver for the pyexpat C module.

      The underlying expat parser object is created by reset() and its
      callbacks are wired to the SAX handlers registered on this reader.
      The reader is also a Locator: the position it reports is taken
      from the current expat parser object.
      """

      def __init__(self, namespaceHandling=0, bufsize=2**16-20):
          """Set up reader state; the expat parser itself is made by reset().

          namespaceHandling -- initial state of the namespaces feature
          bufsize -- forwarded to IncrementalParser.__init__
          """
          xmlreader.IncrementalParser.__init__(self, bufsize)
          self._source = xmlreader.InputSource()
          # Current expat parser object; None until reset() and again after
          # a successful close().
          self._parser = None
          self._namespaces = namespaceHandling
          self._lex_handler_prop = None
          # True from the first feed() of a document until close().
          self._parsing = 0
          # (parser, source) pairs of the entities enclosing the one that
          # is currently being read by external_entity_ref().
          self._entity_stack = []
          self._external_ges = 1
          # Dictionary passed to expat as ``intern``; None when the string
          # interning feature is off.
          self._interning = None

      # XMLReader methods

      def parse(self, source):
          "Parse an XML document from a URL or an InputSource."
          source = saxutils.prepare_input_source(source)

          self._source = source
          self.reset()
          self._cont_handler.setDocumentLocator(ExpatLocator(self))
          xmlreader.IncrementalParser.parse(self, source)

      def prepareParser(self, source):
          """Give expat the system id of *source* as its base, when known."""
          if source.getSystemId() is not None:
              self._parser.SetBase(source.getSystemId())

      # Redefined setContentHandler to allow changing handlers during parsing

      def setContentHandler(self, handler):
          """Register *handler*; re-wire the expat callbacks if parsing."""
          xmlreader.IncrementalParser.setContentHandler(self, handler)
          if self._parsing:
              self._reset_cont_handler()

      def getFeature(self, name):
          """Return the state of feature *name*.

          Raises SAXNotRecognizedException for a feature this driver does
          not know about.
          """
          if name == feature_namespaces:
              return self._namespaces
          elif name == feature_string_interning:
              return self._interning is not None
          elif name in (feature_validation, feature_external_pes,
                        feature_namespace_prefixes):
              # Never available with this driver (see setFeature()).
              return 0
          elif name == feature_external_ges:
              return self._external_ges
          raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

      def setFeature(self, name, state):
          """Set feature *name* to *state*.

          Raises SAXNotSupportedException while parsing, or when asked to
          turn on a feature this driver cannot provide, and
          SAXNotRecognizedException for an unknown feature.
          """
          if self._parsing:
              raise SAXNotSupportedException("Cannot set features while parsing")

          if name == feature_namespaces:
              self._namespaces = state
          elif name == feature_external_ges:
              self._external_ges = state
          elif name == feature_string_interning:
              if state:
                  # Keep an existing dictionary so its contents survive.
                  if self._interning is None:
                      self._interning = {}
              else:
                  self._interning = None
          elif name == feature_validation:
              if state:
                  raise SAXNotSupportedException(
                      "expat does not support validation")
          elif name == feature_external_pes:
              if state:
                  raise SAXNotSupportedException(
                      "expat does not read external parameter entities")
          elif name == feature_namespace_prefixes:
              if state:
                  raise SAXNotSupportedException(
                      "expat does not report namespace prefixes")
          else:
              raise SAXNotRecognizedException(
                  "Feature '%s' not recognized" % name)

      def getProperty(self, name):
          """Return the value of property *name*.

          The XML string property is only readable while an expat parser
          object exists and offers GetInputContext(); otherwise
          SAXNotSupportedException or SAXNotRecognizedException is raised.
          An unknown property raises SAXNotRecognizedException.
          """
          if name == handler.property_lexical_handler:
              return self._lex_handler_prop
          elif name == property_interning_dict:
              return self._interning
          elif name == property_xml_string:
              if self._parser:
                  if hasattr(self._parser, "GetInputContext"):
                      return self._parser.GetInputContext()
                  else:
                      raise SAXNotRecognizedException(
                          "This version of expat does not support getting"
                          " the XML string")
              else:
                  raise SAXNotSupportedException(
                      "XML string cannot be returned when not parsing")
          raise SAXNotRecognizedException("Property '%s' not recognized" % name)

      def setProperty(self, name, value):
          """Set property *name* to *value*.

          Raises SAXNotSupportedException for the read-only XML string
          property and SAXNotRecognizedException for an unknown property.
          """
          if name == handler.property_lexical_handler:
              self._lex_handler_prop = value
              if self._parsing:
                  self._reset_lex_handler_prop()
          elif name == property_interning_dict:
              self._interning = value
          elif name == property_xml_string:
              raise SAXNotSupportedException("Property '%s' cannot be set" %
                                             name)
          else:
              raise SAXNotRecognizedException("Property '%s' not recognized" %
                                              name)

      # IncrementalParser methods

      def feed(self, data, isFinal = 0):
          """Hand a chunk of *data* to expat.

          The first chunk of a document resets the reader and reports
          startDocument().  An expat error is wrapped in a
          SAXParseException and passed to the error handler's fatalError().
          """
          if not self._parsing:
              self.reset()
              self._parsing = 1
              self._cont_handler.startDocument()

          try:
              # The isFinal parameter is internal to the expat reader.
              # If it is set to true, expat will check validity of the entire
              # document. When feeding chunks, they are not normally final -
              # except when invoked from close.
              self._parser.Parse(data, isFinal)
          except expat.error as e:
              exc = SAXParseException(expat.ErrorString(e.code), e, self)
              # FIXME: when to invoke error()?
              self._err_handler.fatalError(exc)

      def close(self):
          """Finish the current document and report endDocument().

          Does nothing while an external entity is being completed, or
          when there is no expat parser object (nothing was fed, or the
          document has already been closed).
          """
          if self._entity_stack or self._parser is None:
              # Without the second test a repeated close() would go through
              # feed(), start a brand new document and fail on it.
              return
          try:
              self.feed("", isFinal = 1)
              self._cont_handler.endDocument()
              # break cycle created by expat handlers pointing to our methods
              self._parser = None
          finally:
              # Cleared even when the final feed raised, so the reader does
              # not stay stuck in the "parsing" state.  On failure the expat
              # parser object is kept, so the Locator methods still report
              # the position of the error.
              self._parsing = 0

      def _reset_cont_handler(self):
          """Point expat straight at the content handler's PI/text methods."""
          self._parser.ProcessingInstructionHandler = \
                                      self._cont_handler.processingInstruction
          self._parser.CharacterDataHandler = self._cont_handler.characters

      def _reset_lex_handler_prop(self):
          """Install, or clear, the expat callbacks for the lexical handler."""
          lex = self._lex_handler_prop
          parser = self._parser
          if lex is None:
              parser.CommentHandler = None
              parser.StartCdataSectionHandler = None
              parser.EndCdataSectionHandler = None
              parser.StartDoctypeDeclHandler = None
              parser.EndDoctypeDeclHandler = None
          else:
              parser.CommentHandler = lex.comment
              parser.StartCdataSectionHandler = lex.startCDATA
              parser.EndCdataSectionHandler = lex.endCDATA
              # Routed through start_doctype_decl, which adapts the arguments.
              parser.StartDoctypeDeclHandler = self.start_doctype_decl
              parser.EndDoctypeDeclHandler = lex.endDTD

      def reset(self):
          """Create a fresh expat parser object and wire up its callbacks."""
          if self._namespaces:
              # " " is the separator expat puts between the parts of a
              # namespace-qualified name; the *_ns handlers split on it.
              self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                                intern=self._interning)
              self._parser.namespace_prefixes = 1
              self._parser.StartElementHandler = self.start_element_ns
              self._parser.EndElementHandler = self.end_element_ns
          else:
              self._parser = expat.ParserCreate(self._source.getEncoding(),
                                                intern = self._interning)
              self._parser.StartElementHandler = self.start_element
              self._parser.EndElementHandler = self.end_element

          self._reset_cont_handler()
          self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
          self._parser.NotationDeclHandler = self.notation_decl
          self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
          self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

          self._decl_handler_prop = None
          if self._lex_handler_prop:
              self._reset_lex_handler_prop()
  #         self._parser.DefaultHandler =
  #         self._parser.DefaultHandlerExpand =
  #         self._parser.NotStandaloneHandler =
          self._parser.ExternalEntityRefHandler = self.external_entity_ref
          try:
              self._parser.SkippedEntityHandler = self.skipped_entity_handler
          except AttributeError:
              # This pyexpat does not support SkippedEntity
              pass
          self._parser.SetParamEntityParsing(
              expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

          self._parsing = 0
          self._entity_stack = []

      # Locator methods

      def getColumnNumber(self):
          """Return expat's ErrorColumnNumber, or None with no expat parser."""
          if self._parser is None:
              return None
          return self._parser.ErrorColumnNumber

      def getLineNumber(self):
          """Return expat's ErrorLineNumber, or 1 with no expat parser."""
          if self._parser is None:
              return 1
          return self._parser.ErrorLineNumber

      def getPublicId(self):
          """Return the public id of the current input source."""
          return self._source.getPublicId()

      def getSystemId(self):
          """Return the system id of the current input source."""
          return self._source.getSystemId()

      # event handlers

      def start_element(self, name, attrs):
          """Report a start tag when namespace handling is off."""
          self._cont_handler.startElement(name, AttributesImpl(attrs))

      def end_element(self, name):
          """Report an end tag when namespace handling is off."""
          self._cont_handler.endElement(name)

      def start_element_ns(self, name, attrs):
          """Report a start tag when namespace handling is on.

          Names arrive from expat as one, two or three space-separated
          parts.  They are turned into the (uri, localname) pairs that
          startElementNS() receives; the third part, when present, is used
          as the prefix of the attribute's qualified name.
          """
          pair = name.split()
          if len(pair) == 1:
              # no namespace
              pair = (None, name)
          elif len(pair) == 3:
              pair = pair[0], pair[1]
          else:
              # default namespace
              pair = tuple(pair)

          newattrs = {}
          qnames = {}
          for (aname, value) in attrs.items():
              parts = aname.split()
              length = len(parts)
              if length == 1:
                  # no namespace
                  qname = aname
                  apair = (None, aname)
              elif length == 3:
                  qname = "%s:%s" % (parts[2], parts[1])
                  apair = parts[0], parts[1]
              else:
                  # default namespace
                  qname = parts[1]
                  apair = tuple(parts)

              newattrs[apair] = value
              qnames[apair] = qname

          # The element's own qualified name is not reported (None).
          self._cont_handler.startElementNS(pair, None,
                                            AttributesNSImpl(newattrs, qnames))

      def end_element_ns(self, name):
          """Report an end tag when namespace handling is on."""
          pair = name.split()
          if len(pair) == 1:
              pair = (None, name)
          elif len(pair) == 3:
              pair = pair[0], pair[1]
          else:
              pair = tuple(pair)

          self._cont_handler.endElementNS(pair, None)

      # this is not used (call directly to ContentHandler)
      def processing_instruction(self, target, data):
          self._cont_handler.processingInstruction(target, data)

      # this is not used (call directly to ContentHandler)
      def character_data(self, data):
          self._cont_handler.characters(data)

      def start_namespace_decl(self, prefix, uri):
          """Forward the start of a prefix mapping to the content handler."""
          self._cont_handler.startPrefixMapping(prefix, uri)

      def end_namespace_decl(self, prefix):
          """Forward the end of a prefix mapping to the content handler."""
          self._cont_handler.endPrefixMapping(prefix)

      def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
          """Forward a DOCTYPE start to the lexical handler's startDTD().

          The callback receives sysid before pubid, startDTD() is called
          with pubid first; has_internal_subset is not passed on.
          """
          self._lex_handler_prop.startDTD(name, pubid, sysid)

      def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
          """Forward an unparsed entity declaration to the DTD handler."""
          self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

      def notation_decl(self, name, base, sysid, pubid):
          """Forward a notation declaration to the DTD handler."""
          self._dtd_handler.notationDecl(name, pubid, sysid)

      def external_entity_ref(self, context, base, sysid, pubid):
          """Resolve and parse an external entity with a sub-parser.

          Returns 1 when the entity was skipped (external general entities
          are off) or parsed successfully, and 0 when parsing it failed.
          """
          if not self._external_ges:
              return 1

          source = self._ent_handler.resolveEntity(pubid, sysid)
          source = saxutils.prepare_input_source(source,
                                                 self._source.getSystemId() or
                                                 "")

          # Remember the enclosing parser and source; close() is a no-op
          # while this stack is non-empty.
          self._entity_stack.append((self._parser, self._source))
          self._parser = self._parser.ExternalEntityParserCreate(context)
          self._source = source

          try:
              xmlreader.IncrementalParser.parse(self, source)
          except Exception:
              # Only ordinary errors are turned into a failure result;
              # KeyboardInterrupt and SystemExit are left to propagate.
              # NOTE: the entity stack is not unwound on this path.
              return 0  # FIXME: save error info here?

          (self._parser, self._source) = self._entity_stack[-1]
          del self._entity_stack[-1]
          return 1

      def skipped_entity_handler(self, name, is_pe):
          """Report a skipped entity to the content handler."""
          if is_pe:
              # The SAX spec requires to report skipped PEs with a '%'
              name = '%'+name
          self._cont_handler.skippedEntity(name)
  337. # ---
  def create_parser(*args, **kwargs):
      """Return a new ExpatParser built from the given arguments."""
      parser = ExpatParser(*args, **kwargs)
      return parser
  340. # ---
  if __name__ == "__main__":
      # Manual smoke test: parse a sample document and echo it back out
      # as XML.  XMLGenerator lives in xml.sax.saxutils; the xml.sax
      # package itself only re-exports ContentHandler and ErrorHandler.
      import xml.sax
      import xml.sax.saxutils
      p = create_parser()
      p.setContentHandler(xml.sax.saxutils.XMLGenerator())
      p.setErrorHandler(xml.sax.ErrorHandler())
      p.parse("../../../hamlet.xml")