PageRenderTime 53ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 1ms

/pypy/module/_multibytecodec/c_codecs.py

https://bitbucket.org/pypy/pypy/
Python | 270 lines | 237 code | 22 blank | 11 comment | 15 complexity | fa0f00fd7a967301a62ea2cc28d9b760 MD5 | raw file
Possible License(s): AGPL-3.0, BSD-3-Clause, Apache-2.0
  1. import py
  2. from rpython.rtyper.lltypesystem import lltype, rffi
  3. from rpython.translator.tool.cbuild import ExternalCompilationInfo
  4. from rpython.translator import cdir
  5. UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD'
  6. class EncodeDecodeError(Exception):
  7. def __init__(self, start, end, reason):
  8. self.start = start
  9. self.end = end
  10. self.reason = reason
  11. def __repr__(self):
  12. return 'EncodeDecodeError(%r, %r, %r)' % (self.start, self.end,
  13. self.reason)
  14. srcdir = py.path.local(__file__).dirpath()
  15. codecs = [
  16. # _codecs_cn
  17. 'gb2312', 'gbk', 'gb18030', 'hz',
  18. # _codecs_hk
  19. 'big5hkscs',
  20. # _codecs_iso2022
  21. 'iso2022_kr', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
  22. 'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext',
  23. # _codecs_jp
  24. 'shift_jis', 'cp932', 'euc_jp', 'shift_jis_2004',
  25. 'euc_jis_2004', 'euc_jisx0213', 'shift_jisx0213',
  26. # _codecs_kr
  27. 'euc_kr', 'cp949', 'johab',
  28. # _codecs_tw
  29. 'big5', 'cp950',
  30. ]
  31. eci = ExternalCompilationInfo(
  32. separate_module_files = [
  33. srcdir.join('src', 'cjkcodecs', '_codecs_cn.c'),
  34. srcdir.join('src', 'cjkcodecs', '_codecs_hk.c'),
  35. srcdir.join('src', 'cjkcodecs', '_codecs_iso2022.c'),
  36. srcdir.join('src', 'cjkcodecs', '_codecs_jp.c'),
  37. srcdir.join('src', 'cjkcodecs', '_codecs_kr.c'),
  38. srcdir.join('src', 'cjkcodecs', '_codecs_tw.c'),
  39. srcdir.join('src', 'cjkcodecs', 'multibytecodec.c'),
  40. ],
  41. includes = ['src/cjkcodecs/multibytecodec.h'],
  42. include_dirs = [str(srcdir), cdir],
  43. )
  44. MBERR_TOOSMALL = -1 # insufficient output buffer space
  45. MBERR_TOOFEW = -2 # incomplete input buffer
  46. MBERR_INTERNAL = -3 # internal runtime error
  47. MBERR_NOMEMORY = -4 # out of memory
  48. MULTIBYTECODEC_P = rffi.COpaquePtr('struct MultibyteCodec_s',
  49. compilation_info=eci)
  50. def llexternal(*args, **kwds):
  51. kwds.setdefault('compilation_info', eci)
  52. kwds.setdefault('sandboxsafe', True)
  53. kwds.setdefault('_nowrapper', True)
  54. return rffi.llexternal(*args, **kwds)
  55. def getter_for(name):
  56. return llexternal('pypy_cjkcodec_%s' % name, [], MULTIBYTECODEC_P)
  57. _codecs_getters = dict([(name, getter_for(name)) for name in codecs])
  58. assert len(_codecs_getters) == len(codecs)
  59. def getcodec(name):
  60. getter = _codecs_getters[name]
  61. return getter()
  62. # ____________________________________________________________
  63. # Decoding
  64. DECODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_dec_s', compilation_info=eci)
  65. pypy_cjk_dec_new = llexternal('pypy_cjk_dec_new',
  66. [MULTIBYTECODEC_P], DECODEBUF_P)
  67. pypy_cjk_dec_init = llexternal('pypy_cjk_dec_init',
  68. [DECODEBUF_P, rffi.CCHARP, rffi.SSIZE_T],
  69. rffi.SSIZE_T)
  70. pypy_cjk_dec_free = llexternal('pypy_cjk_dec_free', [DECODEBUF_P],
  71. lltype.Void)
  72. pypy_cjk_dec_chunk = llexternal('pypy_cjk_dec_chunk', [DECODEBUF_P],
  73. rffi.SSIZE_T)
  74. pypy_cjk_dec_outbuf = llexternal('pypy_cjk_dec_outbuf', [DECODEBUF_P],
  75. rffi.CWCHARP)
  76. pypy_cjk_dec_outlen = llexternal('pypy_cjk_dec_outlen', [DECODEBUF_P],
  77. rffi.SSIZE_T)
  78. pypy_cjk_dec_inbuf_remaining = llexternal('pypy_cjk_dec_inbuf_remaining',
  79. [DECODEBUF_P], rffi.SSIZE_T)
  80. pypy_cjk_dec_inbuf_consumed = llexternal('pypy_cjk_dec_inbuf_consumed',
  81. [DECODEBUF_P], rffi.SSIZE_T)
  82. pypy_cjk_dec_replace_on_error = llexternal('pypy_cjk_dec_replace_on_error',
  83. [DECODEBUF_P, rffi.CWCHARP,
  84. rffi.SSIZE_T, rffi.SSIZE_T],
  85. rffi.SSIZE_T)
  86. def decode(codec, stringdata, errors="strict", errorcb=None, namecb=None):
  87. decodebuf = pypy_cjk_dec_new(codec)
  88. if not decodebuf:
  89. raise MemoryError
  90. try:
  91. return decodeex(decodebuf, stringdata, errors, errorcb, namecb)
  92. finally:
  93. pypy_cjk_dec_free(decodebuf)
  94. def decodeex(decodebuf, stringdata, errors="strict", errorcb=None, namecb=None,
  95. ignore_error=0):
  96. inleft = len(stringdata)
  97. with rffi.scoped_nonmovingbuffer(stringdata) as inbuf:
  98. if pypy_cjk_dec_init(decodebuf, inbuf, inleft) < 0:
  99. raise MemoryError
  100. while True:
  101. r = pypy_cjk_dec_chunk(decodebuf)
  102. if r == 0 or r == ignore_error:
  103. break
  104. multibytecodec_decerror(decodebuf, r, errors,
  105. errorcb, namecb, stringdata)
  106. src = pypy_cjk_dec_outbuf(decodebuf)
  107. length = pypy_cjk_dec_outlen(decodebuf)
  108. return rffi.wcharpsize2unicode(src, length)
  109. def multibytecodec_decerror(decodebuf, e, errors,
  110. errorcb, namecb, stringdata):
  111. if e > 0:
  112. reason = "illegal multibyte sequence"
  113. esize = e
  114. elif e == MBERR_TOOFEW:
  115. reason = "incomplete multibyte sequence"
  116. esize = pypy_cjk_dec_inbuf_remaining(decodebuf)
  117. elif e == MBERR_NOMEMORY:
  118. raise MemoryError
  119. else:
  120. raise RuntimeError
  121. #
  122. # compute the unicode to use as a replacement -> 'replace', and
  123. # the current position in the input 'unicodedata' -> 'end'
  124. start = pypy_cjk_dec_inbuf_consumed(decodebuf)
  125. end = start + esize
  126. if errors == "strict":
  127. raise EncodeDecodeError(start, end, reason)
  128. elif errors == "ignore":
  129. replace = u""
  130. elif errors == "replace":
  131. replace = UNICODE_REPLACEMENT_CHARACTER
  132. else:
  133. assert errorcb
  134. replace, end = errorcb(errors, namecb, reason,
  135. stringdata, start, end)
  136. with rffi.scoped_nonmoving_unicodebuffer(replace) as inbuf:
  137. r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, len(replace), end)
  138. if r == MBERR_NOMEMORY:
  139. raise MemoryError
  140. # ____________________________________________________________
  141. # Encoding
  142. ENCODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_enc_s', compilation_info=eci)
  143. pypy_cjk_enc_new = llexternal('pypy_cjk_enc_new',
  144. [MULTIBYTECODEC_P], ENCODEBUF_P)
  145. pypy_cjk_enc_init = llexternal('pypy_cjk_enc_init',
  146. [ENCODEBUF_P, rffi.CWCHARP, rffi.SSIZE_T],
  147. rffi.SSIZE_T)
  148. pypy_cjk_enc_free = llexternal('pypy_cjk_enc_free', [ENCODEBUF_P],
  149. lltype.Void)
  150. pypy_cjk_enc_chunk = llexternal('pypy_cjk_enc_chunk',
  151. [ENCODEBUF_P, rffi.SSIZE_T], rffi.SSIZE_T)
  152. pypy_cjk_enc_reset = llexternal('pypy_cjk_enc_reset', [ENCODEBUF_P],
  153. rffi.SSIZE_T)
  154. pypy_cjk_enc_outbuf = llexternal('pypy_cjk_enc_outbuf', [ENCODEBUF_P],
  155. rffi.CCHARP)
  156. pypy_cjk_enc_outlen = llexternal('pypy_cjk_enc_outlen', [ENCODEBUF_P],
  157. rffi.SSIZE_T)
  158. pypy_cjk_enc_inbuf_remaining = llexternal('pypy_cjk_enc_inbuf_remaining',
  159. [ENCODEBUF_P], rffi.SSIZE_T)
  160. pypy_cjk_enc_inbuf_consumed = llexternal('pypy_cjk_enc_inbuf_consumed',
  161. [ENCODEBUF_P], rffi.SSIZE_T)
  162. pypy_cjk_enc_replace_on_error = llexternal('pypy_cjk_enc_replace_on_error',
  163. [ENCODEBUF_P, rffi.CCHARP,
  164. rffi.SSIZE_T, rffi.SSIZE_T],
  165. rffi.SSIZE_T)
  166. pypy_cjk_enc_getcodec = llexternal('pypy_cjk_enc_getcodec',
  167. [ENCODEBUF_P], MULTIBYTECODEC_P)
  168. MBENC_FLUSH = 1
  169. MBENC_RESET = 2
  170. def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None):
  171. encodebuf = pypy_cjk_enc_new(codec)
  172. if not encodebuf:
  173. raise MemoryError
  174. try:
  175. return encodeex(encodebuf, unicodedata, errors, errorcb, namecb)
  176. finally:
  177. pypy_cjk_enc_free(encodebuf)
  178. def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
  179. namecb=None, ignore_error=0):
  180. inleft = len(unicodedata)
  181. with rffi.scoped_nonmoving_unicodebuffer(unicodedata) as inbuf:
  182. if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0:
  183. raise MemoryError
  184. if ignore_error == 0:
  185. flags = MBENC_FLUSH | MBENC_RESET
  186. else:
  187. flags = 0
  188. while True:
  189. r = pypy_cjk_enc_chunk(encodebuf, flags)
  190. if r == 0 or r == ignore_error:
  191. break
  192. multibytecodec_encerror(encodebuf, r, errors,
  193. errorcb, namecb, unicodedata)
  194. while flags & MBENC_RESET:
  195. r = pypy_cjk_enc_reset(encodebuf)
  196. if r == 0:
  197. break
  198. multibytecodec_encerror(encodebuf, r, errors,
  199. errorcb, namecb, unicodedata)
  200. src = pypy_cjk_enc_outbuf(encodebuf)
  201. length = pypy_cjk_enc_outlen(encodebuf)
  202. return rffi.charpsize2str(src, length)
  203. def multibytecodec_encerror(encodebuf, e, errors,
  204. errorcb, namecb, unicodedata):
  205. if e > 0:
  206. reason = "illegal multibyte sequence"
  207. esize = e
  208. elif e == MBERR_TOOFEW:
  209. reason = "incomplete multibyte sequence"
  210. esize = pypy_cjk_enc_inbuf_remaining(encodebuf)
  211. elif e == MBERR_NOMEMORY:
  212. raise MemoryError
  213. else:
  214. raise RuntimeError
  215. #
  216. # compute the string to use as a replacement -> 'replace', and
  217. # the current position in the input 'unicodedata' -> 'end'
  218. start = pypy_cjk_enc_inbuf_consumed(encodebuf)
  219. end = start + esize
  220. if errors == "strict":
  221. raise EncodeDecodeError(start, end, reason)
  222. elif errors == "ignore":
  223. replace = ""
  224. elif errors == "replace":
  225. codec = pypy_cjk_enc_getcodec(encodebuf)
  226. try:
  227. replace = encode(codec, u"?")
  228. except EncodeDecodeError:
  229. replace = "?"
  230. else:
  231. assert errorcb
  232. retu, rets, end = errorcb(errors, namecb, reason,
  233. unicodedata, start, end)
  234. if rets is not None:
  235. # py3k only
  236. replace = rets
  237. else:
  238. assert retu is not None
  239. codec = pypy_cjk_enc_getcodec(encodebuf)
  240. replace = encode(codec, retu, "strict", errorcb, namecb)
  241. with rffi.scoped_nonmovingbuffer(replace) as inbuf:
  242. r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
  243. if r == MBERR_NOMEMORY:
  244. raise MemoryError