/xmltodict.py

http://github.com/martinblech/xmltodict · Python · 540 lines · 487 code · 28 blank · 25 comment · 126 complexity · dbae247fa0bf3a0c29b6e830d7e7a79c MD5 · raw file

  1. #!/usr/bin/env python
  2. "Makes working with XML feel like you are working with JSON"
  3. try:
  4. from defusedexpat import pyexpat as expat
  5. except ImportError:
  6. from xml.parsers import expat
  7. from xml.sax.saxutils import XMLGenerator
  8. from xml.sax.xmlreader import AttributesImpl
  9. try: # pragma no cover
  10. from cStringIO import StringIO
  11. except ImportError: # pragma no cover
  12. try:
  13. from StringIO import StringIO
  14. except ImportError:
  15. from io import StringIO
  16. from collections import OrderedDict
  17. from inspect import isgenerator
# Python 2/3 compatibility aliases. Each lookup is attempted on its own:
# when the Python 2 builtin name is missing a NameError is raised and the
# alias falls back to `str`.
try:  # pragma no cover
    _basestring = basestring
except NameError:  # pragma no cover
    _basestring = str
try:  # pragma no cover
    _unicode = unicode
except NameError:  # pragma no cover
    _unicode = str

# Module metadata.
__author__ = 'Martin Blech'
__version__ = '0.12.0'
__license__ = 'MIT'
  29. class ParsingInterrupted(Exception):
  30. pass
  31. class _DictSAXHandler(object):
  32. def __init__(self,
  33. item_depth=0,
  34. item_callback=lambda *args: True,
  35. xml_attribs=True,
  36. attr_prefix='@',
  37. cdata_key='#text',
  38. force_cdata=False,
  39. cdata_separator='',
  40. postprocessor=None,
  41. dict_constructor=OrderedDict,
  42. strip_whitespace=True,
  43. namespace_separator=':',
  44. namespaces=None,
  45. force_list=None,
  46. comment_key='#comment'):
  47. self.path = []
  48. self.stack = []
  49. self.data = []
  50. self.item = None
  51. self.item_depth = item_depth
  52. self.xml_attribs = xml_attribs
  53. self.item_callback = item_callback
  54. self.attr_prefix = attr_prefix
  55. self.cdata_key = cdata_key
  56. self.force_cdata = force_cdata
  57. self.cdata_separator = cdata_separator
  58. self.postprocessor = postprocessor
  59. self.dict_constructor = dict_constructor
  60. self.strip_whitespace = strip_whitespace
  61. self.namespace_separator = namespace_separator
  62. self.namespaces = namespaces
  63. self.namespace_declarations = OrderedDict()
  64. self.force_list = force_list
  65. self.comment_key = comment_key
  66. def _build_name(self, full_name):
  67. if self.namespaces is None:
  68. return full_name
  69. i = full_name.rfind(self.namespace_separator)
  70. if i == -1:
  71. return full_name
  72. namespace, name = full_name[:i], full_name[i+1:]
  73. try:
  74. short_namespace = self.namespaces[namespace]
  75. except KeyError:
  76. short_namespace = namespace
  77. if not short_namespace:
  78. return name
  79. else:
  80. return self.namespace_separator.join((short_namespace, name))
  81. def _attrs_to_dict(self, attrs):
  82. if isinstance(attrs, dict):
  83. return attrs
  84. return self.dict_constructor(zip(attrs[0::2], attrs[1::2]))
  85. def startNamespaceDecl(self, prefix, uri):
  86. self.namespace_declarations[prefix or ''] = uri
  87. def startElement(self, full_name, attrs):
  88. name = self._build_name(full_name)
  89. attrs = self._attrs_to_dict(attrs)
  90. if attrs and self.namespace_declarations:
  91. attrs['xmlns'] = self.namespace_declarations
  92. self.namespace_declarations = OrderedDict()
  93. self.path.append((name, attrs or None))
  94. if len(self.path) > self.item_depth:
  95. self.stack.append((self.item, self.data))
  96. if self.xml_attribs:
  97. attr_entries = []
  98. for key, value in attrs.items():
  99. key = self.attr_prefix+self._build_name(key)
  100. if self.postprocessor:
  101. entry = self.postprocessor(self.path, key, value)
  102. else:
  103. entry = (key, value)
  104. if entry:
  105. attr_entries.append(entry)
  106. attrs = self.dict_constructor(attr_entries)
  107. else:
  108. attrs = None
  109. self.item = attrs or None
  110. self.data = []
  111. def endElement(self, full_name):
  112. name = self._build_name(full_name)
  113. if len(self.path) == self.item_depth:
  114. item = self.item
  115. if item is None:
  116. item = (None if not self.data
  117. else self.cdata_separator.join(self.data))
  118. should_continue = self.item_callback(self.path, item)
  119. if not should_continue:
  120. raise ParsingInterrupted()
  121. if len(self.stack):
  122. data = (None if not self.data
  123. else self.cdata_separator.join(self.data))
  124. item = self.item
  125. self.item, self.data = self.stack.pop()
  126. if self.strip_whitespace and data:
  127. data = data.strip() or None
  128. if data and self.force_cdata and item is None:
  129. item = self.dict_constructor()
  130. if item is not None:
  131. if data:
  132. self.push_data(item, self.cdata_key, data)
  133. self.item = self.push_data(self.item, name, item)
  134. else:
  135. self.item = self.push_data(self.item, name, data)
  136. else:
  137. self.item = None
  138. self.data = []
  139. self.path.pop()
  140. def characters(self, data):
  141. if not self.data:
  142. self.data = [data]
  143. else:
  144. self.data.append(data)
  145. def comments(self, data):
  146. if self.strip_whitespace:
  147. data = data.strip()
  148. self.item = self.push_data(self.item, self.comment_key, data)
  149. def push_data(self, item, key, data):
  150. if self.postprocessor is not None:
  151. result = self.postprocessor(self.path, key, data)
  152. if result is None:
  153. return item
  154. key, data = result
  155. if item is None:
  156. item = self.dict_constructor()
  157. try:
  158. value = item[key]
  159. if isinstance(value, list):
  160. value.append(data)
  161. else:
  162. item[key] = [value, data]
  163. except KeyError:
  164. if self._should_force_list(key, data):
  165. item[key] = [data]
  166. else:
  167. item[key] = data
  168. return item
  169. def _should_force_list(self, key, value):
  170. if not self.force_list:
  171. return False
  172. if isinstance(self.force_list, bool):
  173. return self.force_list
  174. try:
  175. return key in self.force_list
  176. except TypeError:
  177. return self.force_list(self.path[:-1], key, value)
  178. def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
  179. namespace_separator=':', disable_entities=True, process_comments=False, **kwargs):
  180. """Parse the given XML input and convert it into a dictionary.
  181. `xml_input` can either be a `string`, a file-like object, or a generator of strings.
  182. If `xml_attribs` is `True`, element attributes are put in the dictionary
  183. among regular child elements, using `@` as a prefix to avoid collisions. If
  184. set to `False`, they are just ignored.
  185. Simple example::
  186. >>> import xmltodict
  187. >>> doc = xmltodict.parse(\"\"\"
  188. ... <a prop="x">
  189. ... <b>1</b>
  190. ... <b>2</b>
  191. ... </a>
  192. ... \"\"\")
  193. >>> doc['a']['@prop']
  194. u'x'
  195. >>> doc['a']['b']
  196. [u'1', u'2']
  197. If `item_depth` is `0`, the function returns a dictionary for the root
  198. element (default behavior). Otherwise, it calls `item_callback` every time
  199. an item at the specified depth is found and returns `None` in the end
  200. (streaming mode).
  201. The callback function receives two parameters: the `path` from the document
  202. root to the item (name-attribs pairs), and the `item` (dict). If the
  203. callback's return value is false-ish, parsing will be stopped with the
  204. :class:`ParsingInterrupted` exception.
  205. Streaming example::
  206. >>> def handle(path, item):
  207. ... print('path:%s item:%s' % (path, item))
  208. ... return True
  209. ...
  210. >>> xmltodict.parse(\"\"\"
  211. ... <a prop="x">
  212. ... <b>1</b>
  213. ... <b>2</b>
  214. ... </a>\"\"\", item_depth=2, item_callback=handle)
  215. path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
  216. path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2
  217. The optional argument `postprocessor` is a function that takes `path`,
  218. `key` and `value` as positional arguments and returns a new `(key, value)`
  219. pair where both `key` and `value` may have changed. Usage example::
  220. >>> def postprocessor(path, key, value):
  221. ... try:
  222. ... return key + ':int', int(value)
  223. ... except (ValueError, TypeError):
  224. ... return key, value
  225. >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
  226. ... postprocessor=postprocessor)
  227. OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))])
  228. You can pass an alternate version of `expat` (such as `defusedexpat`) by
  229. using the `expat` parameter. E.g:
  230. >>> import defusedexpat
  231. >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
  232. OrderedDict([(u'a', u'hello')])
  233. You can use the force_list argument to force lists to be created even
  234. when there is only a single child of a given level of hierarchy. The
  235. force_list argument is a tuple of keys. If the key for a given level
  236. of hierarchy is in the force_list argument, that level of hierarchy
  237. will have a list as a child (even if there is only one sub-element).
  238. The index_keys operation takes precedence over this. This is applied
  239. after any user-supplied postprocessor has already run.
  240. For example, given this input:
  241. <servers>
  242. <server>
  243. <name>host1</name>
  244. <os>Linux</os>
  245. <interfaces>
  246. <interface>
  247. <name>em0</name>
  248. <ip_address>10.0.0.1</ip_address>
  249. </interface>
  250. </interfaces>
  251. </server>
  252. </servers>
  253. If called with force_list=('interface',), it will produce
  254. this dictionary:
  255. {'servers':
  256. {'server':
  257. {'name': 'host1',
  258. 'os': 'Linux'},
  259. 'interfaces':
  260. {'interface':
  261. [ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } }
  262. `force_list` can also be a callable that receives `path`, `key` and
  263. `value`. This is helpful in cases where the logic that decides whether
  264. a list should be forced is more complex.
  265. If `process_comment` is `True` then comment will be added with comment_key
  266. (default=`'#comment'`) to then tag which contains comment
  267. For example, given this input:
  268. <a>
  269. <b>
  270. <!-- b comment -->
  271. <c>
  272. <!-- c comment -->
  273. 1
  274. </c>
  275. <d>2</d>
  276. </b>
  277. </a>
  278. If called with process_comment=True, it will produce
  279. this dictionary:
  280. 'a': {
  281. 'b': {
  282. '#comment': 'b comment',
  283. 'c': {
  284. '#comment': 'c comment',
  285. '#text': '1',
  286. },
  287. 'd': '2',
  288. },
  289. }
  290. """
  291. handler = _DictSAXHandler(namespace_separator=namespace_separator,
  292. **kwargs)
  293. if isinstance(xml_input, _unicode):
  294. if not encoding:
  295. encoding = 'utf-8'
  296. xml_input = xml_input.encode(encoding)
  297. if not process_namespaces:
  298. namespace_separator = None
  299. parser = expat.ParserCreate(
  300. encoding,
  301. namespace_separator
  302. )
  303. try:
  304. parser.ordered_attributes = True
  305. except AttributeError:
  306. # Jython's expat does not support ordered_attributes
  307. pass
  308. parser.StartNamespaceDeclHandler = handler.startNamespaceDecl
  309. parser.StartElementHandler = handler.startElement
  310. parser.EndElementHandler = handler.endElement
  311. parser.CharacterDataHandler = handler.characters
  312. if process_comments:
  313. parser.CommentHandler = handler.comments
  314. parser.buffer_text = True
  315. if disable_entities:
  316. try:
  317. # Attempt to disable DTD in Jython's expat parser (Xerces-J).
  318. feature = "http://apache.org/xml/features/disallow-doctype-decl"
  319. parser._reader.setFeature(feature, True)
  320. except AttributeError:
  321. # For CPython / expat parser.
  322. # Anything not handled ends up here and entities aren't expanded.
  323. parser.DefaultHandler = lambda x: None
  324. # Expects an integer return; zero means failure -> expat.ExpatError.
  325. parser.ExternalEntityRefHandler = lambda *x: 1
  326. if hasattr(xml_input, 'read'):
  327. parser.ParseFile(xml_input)
  328. elif isgenerator(xml_input):
  329. for chunk in xml_input:
  330. parser.Parse(chunk,False)
  331. parser.Parse(b'',True)
  332. else:
  333. parser.Parse(xml_input, True)
  334. return handler.item
  335. def _process_namespace(name, namespaces, ns_sep=':', attr_prefix='@'):
  336. if not namespaces:
  337. return name
  338. try:
  339. ns, name = name.rsplit(ns_sep, 1)
  340. except ValueError:
  341. pass
  342. else:
  343. ns_res = namespaces.get(ns.strip(attr_prefix))
  344. name = '{}{}{}{}'.format(
  345. attr_prefix if ns.startswith(attr_prefix) else '',
  346. ns_res, ns_sep, name) if ns_res else name
  347. return name
  348. def _emit(key, value, content_handler,
  349. attr_prefix='@',
  350. cdata_key='#text',
  351. depth=0,
  352. preprocessor=None,
  353. pretty=False,
  354. newl='\n',
  355. indent='\t',
  356. namespace_separator=':',
  357. namespaces=None,
  358. full_document=True,
  359. expand_iter=None):
  360. key = _process_namespace(key, namespaces, namespace_separator, attr_prefix)
  361. if preprocessor is not None:
  362. result = preprocessor(key, value)
  363. if result is None:
  364. return
  365. key, value = result
  366. if (not hasattr(value, '__iter__')
  367. or isinstance(value, _basestring)
  368. or isinstance(value, dict)):
  369. value = [value]
  370. for index, v in enumerate(value):
  371. if full_document and depth == 0 and index > 0:
  372. raise ValueError('document with multiple roots')
  373. if v is None:
  374. v = OrderedDict()
  375. elif isinstance(v, bool):
  376. if v:
  377. v = _unicode('true')
  378. else:
  379. v = _unicode('false')
  380. elif not isinstance(v, dict):
  381. if expand_iter and hasattr(v, '__iter__') and not isinstance(v, _basestring):
  382. v = OrderedDict(((expand_iter, v),))
  383. else:
  384. v = _unicode(v)
  385. if isinstance(v, _basestring):
  386. v = OrderedDict(((cdata_key, v),))
  387. cdata = None
  388. attrs = OrderedDict()
  389. children = []
  390. for ik, iv in v.items():
  391. if ik == cdata_key:
  392. cdata = iv
  393. continue
  394. if ik.startswith(attr_prefix):
  395. ik = _process_namespace(ik, namespaces, namespace_separator,
  396. attr_prefix)
  397. if ik == '@xmlns' and isinstance(iv, dict):
  398. for k, v in iv.items():
  399. attr = 'xmlns{}'.format(':{}'.format(k) if k else '')
  400. attrs[attr] = _unicode(v)
  401. continue
  402. if not isinstance(iv, _unicode):
  403. iv = _unicode(iv)
  404. attrs[ik[len(attr_prefix):]] = iv
  405. continue
  406. children.append((ik, iv))
  407. if pretty:
  408. content_handler.ignorableWhitespace(depth * indent)
  409. content_handler.startElement(key, AttributesImpl(attrs))
  410. if pretty and children:
  411. content_handler.ignorableWhitespace(newl)
  412. for child_key, child_value in children:
  413. _emit(child_key, child_value, content_handler,
  414. attr_prefix, cdata_key, depth+1, preprocessor,
  415. pretty, newl, indent, namespaces=namespaces,
  416. namespace_separator=namespace_separator,
  417. expand_iter=expand_iter)
  418. if cdata is not None:
  419. content_handler.characters(cdata)
  420. if pretty and children:
  421. content_handler.ignorableWhitespace(depth * indent)
  422. content_handler.endElement(key)
  423. if pretty and depth:
  424. content_handler.ignorableWhitespace(newl)
  425. def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
  426. short_empty_elements=False,
  427. **kwargs):
  428. """Emit an XML document for the given `input_dict` (reverse of `parse`).
  429. The resulting XML document is returned as a string, but if `output` (a
  430. file-like object) is specified, it is written there instead.
  431. Dictionary keys prefixed with `attr_prefix` (default=`'@'`) are interpreted
  432. as XML node attributes, whereas keys equal to `cdata_key`
  433. (default=`'#text'`) are treated as character data.
  434. The `pretty` parameter (default=`False`) enables pretty-printing. In this
  435. mode, lines are terminated with `'\n'` and indented with `'\t'`, but this
  436. can be customized with the `newl` and `indent` parameters.
  437. """
  438. if full_document and len(input_dict) != 1:
  439. raise ValueError('Document must have exactly one root.')
  440. must_return = False
  441. if output is None:
  442. output = StringIO()
  443. must_return = True
  444. if short_empty_elements:
  445. content_handler = XMLGenerator(output, encoding, True)
  446. else:
  447. content_handler = XMLGenerator(output, encoding)
  448. if full_document:
  449. content_handler.startDocument()
  450. for key, value in input_dict.items():
  451. _emit(key, value, content_handler, full_document=full_document,
  452. **kwargs)
  453. if full_document:
  454. content_handler.endDocument()
  455. if must_return:
  456. value = output.getvalue()
  457. try: # pragma no cover
  458. value = value.decode(encoding)
  459. except AttributeError: # pragma no cover
  460. pass
  461. return value
  462. if __name__ == '__main__': # pragma: no cover
  463. import sys
  464. import marshal
  465. try:
  466. stdin = sys.stdin.buffer
  467. stdout = sys.stdout.buffer
  468. except AttributeError:
  469. stdin = sys.stdin
  470. stdout = sys.stdout
  471. (item_depth,) = sys.argv[1:]
  472. item_depth = int(item_depth)
  473. def handle_item(path, item):
  474. marshal.dump((path, item), stdout)
  475. return True
  476. try:
  477. root = parse(stdin,
  478. item_depth=item_depth,
  479. item_callback=handle_item,
  480. dict_constructor=dict)
  481. if item_depth == 0:
  482. handle_item([], root)
  483. except KeyboardInterrupt:
  484. pass