PageRenderTime 48ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/feedvalidator/xmlEncoding.py

https://github.com/dh-benamor/restful-openerp
Python | 288 lines | 229 code | 25 blank | 34 comment | 40 complexity | d45b6980190f1d0f1eadbc94419ae10a MD5 | raw file
  1. #!/usr/bin/python
  2. """
  3. $Id$
  4. This module deals with detecting XML encodings, using both BOMs and
  5. explicit declarations.
  6. """
  7. __author__ = "Joseph Walton <http://www.kafsemo.org/>"
  8. __version__ = "$Revision$"
  9. __copyright__ = "Copyright (c) 2004 Joseph Walton"
  10. import codecs
  11. import re
  12. from logging import ObscureEncoding, NonstdEncoding
  13. import logging
  14. class FailingCodec:
  15. def __init__(self, name):
  16. self.name = name
  17. def fail(self, txt, errors='strict'):
  18. raise UnicodeError('No codec available for ' + self.name + ' in this installation of FeedValidator')
  19. # Don't die if the codec can't be found, but return
  20. # a decoder that will fail on use
  21. def getdecoder(codec):
  22. try:
  23. return codecs.getdecoder(codec)
  24. except:
  25. return FailingCodec(codec).fail
# These are generic decoders that are only used
# to decode the XML declaration, from which we can read
# the real encoding
_decUTF32BE = getdecoder('UTF-32BE')
_decUTF32LE = getdecoder('UTF-32LE')
_decUTF16BE = getdecoder('UTF-16BE')
_decUTF16LE = getdecoder('UTF-16LE')
# If a codec is unavailable, getdecoder() returns a stub that only raises
# when the decoder is actually used, so these assignments never fail.
_decEBCDIC = getdecoder('IBM037') # EBCDIC
_decACE = getdecoder('ISO-8859-1') # An ASCII-compatible encoding
  35. # Given a character index into a string, calculate its 1-based row and column
  36. def _position(txt, idx):
  37. row = txt.count('\n', 0, idx) + 1
  38. ln = txt.rfind('\n', 0, idx) + 1
  39. column = 0
  40. for c in txt[ln:idx]:
  41. if c == '\t':
  42. column = (column // 8 + 1) * 8
  43. else:
  44. column += 1
  45. column += 1
  46. return (row, column)
  47. def _normaliseNewlines(txt):
  48. return txt.replace('\r\n', '\n').replace('\r', '\n')
  49. def _logEvent(loggedEvents, e, pos=None):
  50. if pos:
  51. e.params['line'], e.params['column'] = pos
  52. loggedEvents.append(e)
  53. # Return the encoding from the declaration, or 'None'
  54. # Return None if the 'permitted' list is passed in and the encoding
  55. # isn't found in it. This is so that, e.g., a 4-byte-character XML file
  56. # that claims to be US-ASCII will fail now.
  57. def _decodeDeclaration(sig, dec, permitted, loggedEvents):
  58. sig = _normaliseNewlines(dec(sig)[0])
  59. eo = _encodingFromDecl(sig)
  60. if not(eo):
  61. _logEvent(loggedEvents,
  62. logging.UnicodeError({'exception': 'This XML file (apparently ' + permitted[0] + ') requires an encoding declaration'}), (1, 1))
  63. elif permitted and not(eo[0].upper() in permitted):
  64. if _hasCodec(eo[0]):
  65. # see if the codec is an alias of one of the permitted encodings
  66. codec=codecs.lookup(eo[0])
  67. for encoding in permitted:
  68. if _hasCodec(encoding) and codecs.lookup(encoding)[-1]==codec[-1]: break
  69. else:
  70. _logEvent(loggedEvents,
  71. logging.UnicodeError({'exception': 'This XML file claims an encoding of ' + eo[0] + ', but looks more like ' + permitted[0]}), eo[1])
  72. return eo
  73. # Return the encoding from the declaration, or 'fallback' if none is
  74. # present. Return None if the 'permitted' list is passed in and
  75. # the encoding isn't found in it
  76. def _decodePostBOMDeclaration(sig, dec, permitted, loggedEvents, fallback=None):
  77. sig = _normaliseNewlines(dec(sig)[0])
  78. eo = _encodingFromDecl(sig)
  79. if eo and not(eo[0].upper() in permitted):
  80. _logEvent(loggedEvents,
  81. logging.UnicodeError({'exception': 'Document starts with ' + permitted[0] + ' BOM marker but has incompatible declaration of ' + eo[0]}), eo[1])
  82. return None
  83. else:
  84. return eo or (fallback, None)
  85. def isStandard(x):
  86. """ Is this encoding required by the XML 1.0 Specification, 4.3.3? """
  87. return x.upper() in ['UTF-8', 'UTF-16']
  88. def isCommon(x):
  89. """Is this encoding commonly used, according to
  90. <http://www.syndic8.com/stats.php?Section=feeds#XMLEncodings>
  91. (as of 2004-03-27)?"""
  92. return isStandard(x) or x.upper() in ['US-ASCII', 'ISO-8859-1',
  93. 'EUC-JP', 'ISO-8859-2', 'ISO-8859-15', 'ISO-8859-7',
  94. 'KOI8-R', 'SHIFT_JIS', 'WINDOWS-1250', 'WINDOWS-1251',
  95. 'WINDOWS-1252', 'WINDOWS-1254', 'WINDOWS-1255', 'WINDOWS-1256',
  96. # This doesn't seem to be popular, but is the Chinese
  97. # government's mandatory standard
  98. 'GB18030'
  99. ]
  100. # Inspired by xmlproc's autodetect_encoding, but rewritten
  101. def _detect(doc_start, loggedEvents=[], fallback='UTF-8'):
  102. """This is the logic from appendix F.1 of the XML 1.0 specification.
  103. Pass in the start of a document (>= 256 octets), and receive the encoding to
  104. use, or None if there is a problem with the document."""
  105. sig = doc_start[:4]
  106. # With a BOM. We also check for a declaration, and make sure
  107. # it doesn't contradict (for 4-byte encodings, it's required)
  108. if sig == '\x00\x00\xFE\xFF': # UTF-32 BE
  109. eo = _decodeDeclaration(doc_start[4:], _decUTF32BE, ['UTF-32', 'ISO-10646-UCS-4', 'CSUCS4', 'UCS-4'], loggedEvents)
  110. elif sig == '\xFF\xFE\x00\x00': # UTF-32 LE
  111. eo = _decodeDeclaration(doc_start[4:], _decUTF32LE, ['UTF-32', 'ISO-10646-UCS-4', 'CSUCS4', 'UCS-4'], loggedEvents)
  112. elif sig == '\x00\x00\xFF\xFE' or sig == '\xFE\xFF\x00\x00':
  113. raise UnicodeError('Unable to process UCS-4 with unusual octet ordering')
  114. elif sig[:2] == '\xFE\xFF': # UTF-16 BE
  115. eo = _decodePostBOMDeclaration(doc_start[2:], _decUTF16BE, ['UTF-16', 'ISO-10646-UCS-2', 'CSUNICODE', 'UCS-2'], loggedEvents, fallback='UTF-16')
  116. elif sig[:2] == '\xFF\xFE': # UTF-16 LE
  117. eo = _decodePostBOMDeclaration(doc_start[2:], _decUTF16LE, ['UTF-16', 'ISO-10646-UCS-2', 'CSUNICODE', 'UCS-2'], loggedEvents, fallback='UTF-16')
  118. elif sig[:3] == '\xEF\xBB\xBF':
  119. eo = _decodePostBOMDeclaration(doc_start[3:], _decACE, ['UTF-8'], loggedEvents, fallback='UTF-8')
  120. # Without a BOM; we must read the declaration
  121. elif sig == '\x00\x00\x00\x3C':
  122. eo = _decodeDeclaration(doc_start, _decUTF32BE, ['UTF-32BE', 'UTF-32', 'ISO-10646-UCS-4', 'CSUCS4', 'UCS-4'], loggedEvents)
  123. elif sig == '\x3C\x00\x00\x00':
  124. eo = _decodeDeclaration(doc_start, _decUTF32LE, ['UTF-32LE', 'UTF-32', 'ISO-10646-UCS-4', 'CSUCS4', 'UCS-4'], loggedEvents)
  125. elif sig == '\x00\x3C\x00\x3F':
  126. eo = _decodeDeclaration(doc_start, _decUTF16BE, ['UTF-16BE', 'UTF-16', 'ISO-10646-UCS-2', 'CSUNICODE', 'UCS-2'], loggedEvents)
  127. elif sig == '\x3C\x00\x3F\x00':
  128. eo = _decodeDeclaration(doc_start, _decUTF16LE, ['UTF-16LE', 'UTF-16', 'ISO-10646-UCS-2', 'CSUNICODE', 'UCS-2'], loggedEvents)
  129. elif sig == '\x3C\x3F\x78\x6D':
  130. eo = _encodingFromDecl(_normaliseNewlines(_decACE(doc_start)[0])) or ('UTF-8', None)
  131. elif sig == '\x4C\x6F\xA7\x94':
  132. eo = _decodeDeclaration(doc_start, _decEBCDIC, ['IBM037', 'CP037', 'IBM038', 'EBCDIC-INT'], loggedEvents)
  133. # There's no BOM, and no declaration. It's UTF-8, or mislabelled.
  134. else:
  135. eo = (fallback, None)
  136. return eo
  137. def detect(doc_start, loggedEvents=[], fallback='UTF-8'):
  138. eo = _detect(doc_start, loggedEvents, fallback)
  139. if eo:
  140. return eo[0]
  141. else:
  142. return None
  143. _encRe = re.compile(r'<\?xml\s+version\s*=\s*(?:"[-a-zA-Z0-9_.:]+"|\'[-a-zA-Z0-9_.:]+\')\s+(encoding\s*=\s*(?:"([-A-Za-z0-9._]+)"|\'([-A-Za-z0-9._]+)\'))')
  144. def _encodingFromDecl(x):
  145. m = _encRe.match(x)
  146. if m:
  147. if m.group(2):
  148. return m.group(2), _position(x, m.start(2))
  149. else:
  150. return m.group(3), _position(x, m.start(3))
  151. else:
  152. return None
  153. def removeDeclaration(x):
  154. """Replace an XML document string's encoding declaration with the
  155. same number of spaces. Some XML parsers don't allow the
  156. encoding to be overridden, and this is a workaround."""
  157. m = _encRe.match(x)
  158. if m:
  159. s = m.start(1)
  160. e = m.end(1)
  161. res = x[:s] + ' ' * (e - s) + x[e:]
  162. else:
  163. res = x
  164. return res
  165. def _hasCodec(enc):
  166. try:
  167. return codecs.lookup(enc) is not None
  168. except:
  169. return False
def decode(mediaType, charset, bs, loggedEvents, fallback=None):
    """Decode byte string 'bs' to text.

    'charset' is the transport-level encoding (e.g. from an HTTP
    Content-Type); the document's own BOM/declaration is detected too, and
    RFC 3023 charset/declaration conflicts are appended to 'loggedEvents'.

    Returns (encoding, text): text is None if no usable encoding could be
    determined, or a 'replace'-salvaged string if decoding raised.
    """
    # Detect from BOM/declaration only; fallback=None so a missing
    # declaration is not papered over at this stage.
    eo = _detect(bs, loggedEvents, fallback=None)
    # Check declared encodings
    if eo and eo[1] and _hasCodec(eo[0]):
        # eo[1] is the declaration's (line, column): only an explicit
        # declaration earns an obscure/non-standard warning.
        if not(isCommon(eo[0])):
            _logEvent(loggedEvents, ObscureEncoding({"encoding": eo[0]}), eo[1])
        elif not(isStandard(eo[0])):
            _logEvent(loggedEvents, NonstdEncoding({"encoding": eo[0]}), eo[1])
    if eo:
        encoding = eo[0]
    else:
        encoding = None
    if charset and encoding and charset.lower() != encoding.lower():
        # RFC 3023 requires us to use 'charset', but a number of aggregators
        # ignore this recommendation, so we should warn.
        loggedEvents.append(logging.EncodingMismatch({"charset": charset, "encoding": encoding}))
    if mediaType and mediaType.startswith("text/") and charset is None:
        loggedEvents.append(logging.TextXml({}))
        # RFC 3023 requires text/* to default to US-ASCII. Issue a warning
        # if this occurs, but continue validation using the detected encoding
        try:
            bs.decode("US-ASCII")
        except:
            if not encoding:
                # Nothing detected either; see if the caller's fallback can
                # decode the document and adopt it if so (best-effort).
                try:
                    bs.decode(fallback)
                    encoding=fallback
                except:
                    pass
            if encoding and encoding.lower() != 'us-ascii':
                loggedEvents.append(logging.EncodingMismatch({"charset": "US-ASCII", "encoding": encoding}))
    # The transport charset wins over the in-document declaration (RFC 3023).
    enc = charset or encoding
    if enc is None:
        loggedEvents.append(logging.MissingEncoding({}))
        enc = fallback
    elif not(_hasCodec(enc)):
        # Unknown encoding: report at the declaration's position when known.
        if eo:
            _logEvent(loggedEvents, logging.UnknownEncoding({'encoding': enc}), eo[1])
        else:
            _logEvent(loggedEvents, logging.UnknownEncoding({'encoding': enc}))
        enc = fallback
    if enc is None:
        return enc, None
    dec = getdecoder(enc)
    try:
        return enc, dec(bs)[0]
    except UnicodeError, ue:
        # Decoding failed part-way: salvage with replacement characters so
        # validation can continue, and log the failure's position.
        salvage = dec(bs, 'replace')[0]
        if 'start' in ue.__dict__:
            # XXX 'start' is in bytes, not characters. This is wrong for multibyte
            # encodings
            pos = _position(salvage, ue.start)
        else:
            pos = None
        _logEvent(loggedEvents, logging.UnicodeError({"exception":ue}), pos)
        return enc, salvage
# Module-level UTF-8 encoder, looked up once and reused by asUTF8().
_encUTF8 = codecs.getencoder('UTF-8')
def asUTF8(x):
    """Accept a Unicode string and return a UTF-8 encoded string, with
    its encoding declaration removed, suitable for parsing."""
    # Python 2 'unicode': promotes byte strings via the default codec.
    x = removeDeclaration(unicode(x))
    return _encUTF8(x)[0]
if __name__ == '__main__':
    # Command-line smoke test: print the detected encoding for each file
    # argument, or dump the logged events when detection fails.
    from sys import argv
    from os.path import isfile
    for x in argv[1:]:
        if isfile(x):
            f = open(x, 'r')
            # The declaration/BOM must appear at the very start, so the
            # first 1024 bytes are enough for detection.
            l = f.read(1024)
            log = []
            eo = detect(l, log)
            if eo:
                print x,eo
            else:
                print repr(log)