/Lib/encodings/utf_8_sig.py

http://unladen-swallow.googlecode.com/ · Python · 111 lines · 111 code · 0 blank · 0 comment · 0 complexity · 1c442edae1b5cf9ed992c087251f002f MD5 · raw file

  1. """ Python 'utf-8-sig' Codec
  2. This works similarly to UTF-8 with the following changes:
  3. * On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
  4. first three bytes.
  5. * On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
  6. bytes will be skipped.
  7. """
  8. import codecs
  9. ### Codec APIs
  10. def encode(input, errors='strict'):
  11. return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
  12. def decode(input, errors='strict'):
  13. prefix = 0
  14. if input[:3] == codecs.BOM_UTF8:
  15. input = input[3:]
  16. prefix = 3
  17. (output, consumed) = codecs.utf_8_decode(input, errors, True)
  18. return (output, consumed+prefix)
  19. class IncrementalEncoder(codecs.IncrementalEncoder):
  20. def __init__(self, errors='strict'):
  21. codecs.IncrementalEncoder.__init__(self, errors)
  22. self.first = True
  23. def encode(self, input, final=False):
  24. if self.first:
  25. self.first = False
  26. return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]
  27. else:
  28. return codecs.utf_8_encode(input, self.errors)[0]
  29. def reset(self):
  30. codecs.IncrementalEncoder.reset(self)
  31. self.first = True
  32. class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
  33. def __init__(self, errors='strict'):
  34. codecs.BufferedIncrementalDecoder.__init__(self, errors)
  35. self.first = True
  36. def _buffer_decode(self, input, errors, final):
  37. if self.first:
  38. if len(input) < 3:
  39. if codecs.BOM_UTF8.startswith(input):
  40. # not enough data to decide if this really is a BOM
  41. # => try again on the next call
  42. return (u"", 0)
  43. else:
  44. self.first = None
  45. else:
  46. self.first = None
  47. if input[:3] == codecs.BOM_UTF8:
  48. (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
  49. return (output, consumed+3)
  50. return codecs.utf_8_decode(input, errors, final)
  51. def reset(self):
  52. codecs.BufferedIncrementalDecoder.reset(self)
  53. self.first = True
  54. class StreamWriter(codecs.StreamWriter):
  55. def reset(self):
  56. codecs.StreamWriter.reset(self)
  57. try:
  58. del self.encode
  59. except AttributeError:
  60. pass
  61. def encode(self, input, errors='strict'):
  62. self.encode = codecs.utf_8_encode
  63. return encode(input, errors)
  64. class StreamReader(codecs.StreamReader):
  65. def reset(self):
  66. codecs.StreamReader.reset(self)
  67. try:
  68. del self.decode
  69. except AttributeError:
  70. pass
  71. def decode(self, input, errors='strict'):
  72. if len(input) < 3:
  73. if codecs.BOM_UTF8.startswith(input):
  74. # not enough data to decide if this is a BOM
  75. # => try again on the next call
  76. return (u"", 0)
  77. elif input[:3] == codecs.BOM_UTF8:
  78. self.decode = codecs.utf_8_decode
  79. (output, consumed) = codecs.utf_8_decode(input[3:],errors)
  80. return (output, consumed+3)
  81. # (else) no BOM present
  82. self.decode = codecs.utf_8_decode
  83. return codecs.utf_8_decode(input, errors)
  84. ### encodings module API
  85. def getregentry():
  86. return codecs.CodecInfo(
  87. name='utf-8-sig',
  88. encode=encode,
  89. decode=decode,
  90. incrementalencoder=IncrementalEncoder,
  91. incrementaldecoder=IncrementalDecoder,
  92. streamreader=StreamReader,
  93. streamwriter=StreamWriter,
  94. )