PageRenderTime 124ms CodeModel.GetById 60ms app.highlight 29ms RepoModel.GetById 32ms app.codeStats 0ms

/pypy/module/_multibytecodec/c_codecs.py

https://bitbucket.org/pypy/pypy/
Python | 270 lines | 237 code | 22 blank | 11 comment | 10 complexity | fa0f00fd7a967301a62ea2cc28d9b760 MD5 | raw file
  1import py
  2from rpython.rtyper.lltypesystem import lltype, rffi
  3from rpython.translator.tool.cbuild import ExternalCompilationInfo
  4from rpython.translator import cdir
  5
# U+FFFD; multibytecodec_decerror() substitutes it for undecodable input
# when the error policy is "replace".
UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD'
  7
  8
  9class EncodeDecodeError(Exception):
 10    def __init__(self, start, end, reason):
 11        self.start = start
 12        self.end = end
 13        self.reason = reason
 14    def __repr__(self):
 15        return 'EncodeDecodeError(%r, %r, %r)' % (self.start, self.end,
 16                                                  self.reason)
 17
# Directory that contains this module; the C sources and the header used
# for the external compilation are located relative to it.
srcdir = py.path.local(__file__).dirpath()

# Names of the supported codecs.  The comments group them under the name
# of the matching _codecs_* C source.  A C getter 'pypy_cjkcodec_<name>'
# is declared for every entry (see getter_for / _codecs_getters).
codecs = [
    # _codecs_cn
    'gb2312', 'gbk', 'gb18030', 'hz',

    # _codecs_hk
    'big5hkscs',

    # _codecs_iso2022
    'iso2022_kr', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
    'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext',

    # _codecs_jp
    'shift_jis', 'cp932', 'euc_jp', 'shift_jis_2004',
    'euc_jis_2004', 'euc_jisx0213', 'shift_jisx0213',

    # _codecs_kr
    'euc_kr', 'cp949', 'johab',

    # _codecs_tw
    'big5', 'cp950',
]
 41
# Compilation info attached to every external declaration in this module
# (it is the default 'compilation_info' of the local llexternal() wrapper
# and is passed explicitly to the COpaquePtr types).
eci = ExternalCompilationInfo(
    # C files compiled alongside: one per codec family plus the shared
    # multibytecodec.c.
    separate_module_files = [
        srcdir.join('src', 'cjkcodecs', '_codecs_cn.c'),
        srcdir.join('src', 'cjkcodecs', '_codecs_hk.c'),
        srcdir.join('src', 'cjkcodecs', '_codecs_iso2022.c'),
        srcdir.join('src', 'cjkcodecs', '_codecs_jp.c'),
        srcdir.join('src', 'cjkcodecs', '_codecs_kr.c'),
        srcdir.join('src', 'cjkcodecs', '_codecs_tw.c'),
        srcdir.join('src', 'cjkcodecs', 'multibytecodec.c'),
    ],
    # The header path is given relative to srcdir, which is on the
    # include path below.
    includes = ['src/cjkcodecs/multibytecodec.h'],
    include_dirs = [str(srcdir), cdir],
)
 55
# Negative status codes.  The error handlers in this module test
# explicitly for MBERR_TOOFEW and MBERR_NOMEMORY; any other non-positive
# status reaching them is turned into RuntimeError.
# NOTE(review): presumably these mirror the values in multibytecodec.h --
# confirm they stay in sync.
MBERR_TOOSMALL = -1  # insufficient output buffer space
MBERR_TOOFEW   = -2  # incomplete input buffer
MBERR_INTERNAL = -3  # internal runtime error
MBERR_NOMEMORY = -4  # out of memory

# Opaque pointer type for the C 'struct MultibyteCodec_s'; values of this
# type are returned by the per-codec getters and passed to the *_new calls.
MULTIBYTECODEC_P = rffi.COpaquePtr('struct MultibyteCodec_s',
                                   compilation_info=eci)
 63
 64def llexternal(*args, **kwds):
 65    kwds.setdefault('compilation_info', eci)
 66    kwds.setdefault('sandboxsafe', True)
 67    kwds.setdefault('_nowrapper', True)
 68    return rffi.llexternal(*args, **kwds)
 69
 70def getter_for(name):
 71    return llexternal('pypy_cjkcodec_%s' % name, [], MULTIBYTECODEC_P)
 72
 73_codecs_getters = dict([(name, getter_for(name)) for name in codecs])
 74assert len(_codecs_getters) == len(codecs)
 75
 76def getcodec(name):
 77    getter = _codecs_getters[name]
 78    return getter()
 79
 80# ____________________________________________________________
 81# Decoding
 82
# Opaque pointer type for the C 'struct pypy_cjk_dec_s' (the decoder
# state handed to all pypy_cjk_dec_* functions below).
DECODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_dec_s', compilation_info=eci)
# Allocates a decoder state for a codec; decode() treats a null result as
# out-of-memory.
pypy_cjk_dec_new = llexternal('pypy_cjk_dec_new',
                              [MULTIBYTECODEC_P], DECODEBUF_P)
# (state, input bytes, input length); decodeex() treats a negative result
# as out-of-memory.
pypy_cjk_dec_init = llexternal('pypy_cjk_dec_init',
                               [DECODEBUF_P, rffi.CCHARP, rffi.SSIZE_T],
                               rffi.SSIZE_T)
# Releases a state obtained from pypy_cjk_dec_new.
pypy_cjk_dec_free = llexternal('pypy_cjk_dec_free', [DECODEBUF_P],
                               lltype.Void)
# One decoding step.  As interpreted by the callers: 0 means finished,
# a positive value is the size of an illegal sequence, a negative value
# is one of the MBERR_* codes.
pypy_cjk_dec_chunk = llexternal('pypy_cjk_dec_chunk', [DECODEBUF_P],
                                rffi.SSIZE_T)
# Output buffer (wide characters) and its length; decodeex() combines the
# two to build the resulting unicode string.
pypy_cjk_dec_outbuf = llexternal('pypy_cjk_dec_outbuf', [DECODEBUF_P],
                                 rffi.CWCHARP)
pypy_cjk_dec_outlen = llexternal('pypy_cjk_dec_outlen', [DECODEBUF_P],
                                 rffi.SSIZE_T)
# Input bookkeeping used by the error handler: 'remaining' gives the size
# of an incomplete trailing sequence, 'consumed' the error start position.
pypy_cjk_dec_inbuf_remaining = llexternal('pypy_cjk_dec_inbuf_remaining',
                                          [DECODEBUF_P], rffi.SSIZE_T)
pypy_cjk_dec_inbuf_consumed = llexternal('pypy_cjk_dec_inbuf_consumed',
                                         [DECODEBUF_P], rffi.SSIZE_T)
# (state, replacement text, replacement length, new input position);
# the error handler checks the result for MBERR_NOMEMORY.
pypy_cjk_dec_replace_on_error = llexternal('pypy_cjk_dec_replace_on_error',
                                           [DECODEBUF_P, rffi.CWCHARP,
                                            rffi.SSIZE_T, rffi.SSIZE_T],
                                           rffi.SSIZE_T)
105
def decode(codec, stringdata, errors="strict", errorcb=None, namecb=None):
    """Decode 'stringdata' with 'codec' using a temporary decoder state.

    A fresh state is allocated with pypy_cjk_dec_new, used for a single
    decodeex() call and always released again, even if decodeex() raises.
    Raises MemoryError when the state cannot be allocated.
    """
    handle = pypy_cjk_dec_new(codec)
    if not handle:
        raise MemoryError
    try:
        result = decodeex(handle, stringdata, errors, errorcb, namecb)
    finally:
        # release the C-side state on success and on error alike
        pypy_cjk_dec_free(handle)
    return result
114
def decodeex(decodebuf, stringdata, errors="strict", errorcb=None, namecb=None,
             ignore_error=0):
    """Decode 'stringdata' through the existing decoder state 'decodebuf'.

    pypy_cjk_dec_chunk is called repeatedly until it reports 0 or the
    status given as 'ignore_error'.  Every other status is passed to
    multibytecodec_decerror(), which either raises or installs a
    replacement before the next attempt.  Returns the unicode string
    built from the decoder's output buffer.  Raises MemoryError when
    pypy_cjk_dec_init reports a negative status.
    """
    nbytes = len(stringdata)
    with rffi.scoped_nonmovingbuffer(stringdata) as rawinput:
        init_status = pypy_cjk_dec_init(decodebuf, rawinput, nbytes)
        if init_status < 0:
            raise MemoryError
        status = pypy_cjk_dec_chunk(decodebuf)
        while status != 0 and status != ignore_error:
            multibytecodec_decerror(decodebuf, status, errors,
                                    errorcb, namecb, stringdata)
            status = pypy_cjk_dec_chunk(decodebuf)
        outbuf = pypy_cjk_dec_outbuf(decodebuf)
        outlen = pypy_cjk_dec_outlen(decodebuf)
        return rffi.wcharpsize2unicode(outbuf, outlen)
130
def multibytecodec_decerror(decodebuf, e, errors,
                            errorcb, namecb, stringdata):
    """Handle the non-zero status 'e' reported by a decoding step.

    A positive 'e' is the size of an illegal sequence; MBERR_TOOFEW means
    the rest of the input is an incomplete sequence.  MBERR_NOMEMORY
    raises MemoryError and any other status raises RuntimeError.

    Under "strict" an EncodeDecodeError is raised.  Otherwise a
    replacement (empty for "ignore", U+FFFD for "replace", or whatever
    'errorcb' returns for other policies) is handed to
    pypy_cjk_dec_replace_on_error together with the input position at
    which to resume.
    """
    if e == MBERR_NOMEMORY:
        raise MemoryError
    if e > 0:
        esize = e
        reason = "illegal multibyte sequence"
    elif e == MBERR_TOOFEW:
        esize = pypy_cjk_dec_inbuf_remaining(decodebuf)
        reason = "incomplete multibyte sequence"
    else:
        raise RuntimeError
    #
    # work out the unicode replacement text -> 'replacement', and the
    # position in the input 'stringdata' at which to resume -> 'resume'
    start = pypy_cjk_dec_inbuf_consumed(decodebuf)
    resume = start + esize
    if errors == "strict":
        raise EncodeDecodeError(start, resume, reason)
    if errors == "ignore":
        replacement = u""
    elif errors == "replace":
        replacement = UNICODE_REPLACEMENT_CHARACTER
    else:
        # custom error policy: the callback picks both text and position
        assert errorcb
        replacement, resume = errorcb(errors, namecb, reason,
                                      stringdata, start, resume)
    with rffi.scoped_nonmoving_unicodebuffer(replacement) as rawrepl:
        status = pypy_cjk_dec_replace_on_error(decodebuf, rawrepl,
                                               len(replacement), resume)
    if status == MBERR_NOMEMORY:
        raise MemoryError
162
163# ____________________________________________________________
164# Encoding
# Opaque pointer type for the C 'struct pypy_cjk_enc_s' (the encoder
# state handed to all pypy_cjk_enc_* functions below).
ENCODEBUF_P = rffi.COpaquePtr('struct pypy_cjk_enc_s', compilation_info=eci)
# Allocates an encoder state for a codec; encode() treats a null result as
# out-of-memory.
pypy_cjk_enc_new = llexternal('pypy_cjk_enc_new',
                               [MULTIBYTECODEC_P], ENCODEBUF_P)
# (state, input wide characters, input length); encodeex() treats a
# negative result as out-of-memory.
pypy_cjk_enc_init = llexternal('pypy_cjk_enc_init',
                               [ENCODEBUF_P, rffi.CWCHARP, rffi.SSIZE_T],
                               rffi.SSIZE_T)
# Releases a state obtained from pypy_cjk_enc_new.
pypy_cjk_enc_free = llexternal('pypy_cjk_enc_free', [ENCODEBUF_P],
                               lltype.Void)
# One encoding step; the second argument is a combination of the MBENC_*
# flags.  As interpreted by the callers: 0 means finished, a positive
# value is the size of the offending input, a negative value is one of
# the MBERR_* codes.
pypy_cjk_enc_chunk = llexternal('pypy_cjk_enc_chunk',
                                [ENCODEBUF_P, rffi.SSIZE_T], rffi.SSIZE_T)
# Called by encodeex() after the chunk loop when MBENC_RESET is set; the
# result is interpreted like the one of pypy_cjk_enc_chunk.
pypy_cjk_enc_reset = llexternal('pypy_cjk_enc_reset', [ENCODEBUF_P],
                                rffi.SSIZE_T)
# Output buffer (bytes) and its length; encodeex() combines the two to
# build the resulting string.
pypy_cjk_enc_outbuf = llexternal('pypy_cjk_enc_outbuf', [ENCODEBUF_P],
                                 rffi.CCHARP)
pypy_cjk_enc_outlen = llexternal('pypy_cjk_enc_outlen', [ENCODEBUF_P],
                                 rffi.SSIZE_T)
# Input bookkeeping used by the error handler: 'remaining' gives the size
# of an incomplete trailing sequence, 'consumed' the error start position.
pypy_cjk_enc_inbuf_remaining = llexternal('pypy_cjk_enc_inbuf_remaining',
                                          [ENCODEBUF_P], rffi.SSIZE_T)
pypy_cjk_enc_inbuf_consumed = llexternal('pypy_cjk_enc_inbuf_consumed',
                                         [ENCODEBUF_P], rffi.SSIZE_T)
# (state, replacement bytes, replacement length, new input position);
# the error handler checks the result for MBERR_NOMEMORY.
pypy_cjk_enc_replace_on_error = llexternal('pypy_cjk_enc_replace_on_error',
                                           [ENCODEBUF_P, rffi.CCHARP,
                                            rffi.SSIZE_T, rffi.SSIZE_T],
                                           rffi.SSIZE_T)
# Returns the codec a state was created for; the error handler uses it to
# encode replacement text with the same codec.
pypy_cjk_enc_getcodec = llexternal('pypy_cjk_enc_getcodec',
                                   [ENCODEBUF_P], MULTIBYTECODEC_P)
# Flag bits for pypy_cjk_enc_chunk.  encodeex() passes both when
# ignore_error == 0 and passes 0 otherwise.
MBENC_FLUSH = 1
MBENC_RESET = 2
193
def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None):
    """Encode 'unicodedata' with 'codec' using a temporary encoder state.

    A fresh state is allocated with pypy_cjk_enc_new, used for a single
    encodeex() call and always released again, even if encodeex() raises.
    Raises MemoryError when the state cannot be allocated.
    """
    handle = pypy_cjk_enc_new(codec)
    if not handle:
        raise MemoryError
    try:
        result = encodeex(handle, unicodedata, errors, errorcb, namecb)
    finally:
        # release the C-side state on success and on error alike
        pypy_cjk_enc_free(handle)
    return result
202
def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
             namecb=None, ignore_error=0):
    """Encode 'unicodedata' through the existing encoder state 'encodebuf'.

    With ignore_error == 0 the chunk call gets MBENC_FLUSH | MBENC_RESET
    and pypy_cjk_enc_reset is run afterwards; otherwise no flags are
    passed and the reset step is skipped.  The chunk loop stops on status
    0 or 'ignore_error'.  Every other status, from either step, is passed
    to multibytecodec_encerror(), which either raises or installs a
    replacement before the next attempt.  Returns the string built from
    the encoder's output buffer.  Raises MemoryError when
    pypy_cjk_enc_init reports a negative status.
    """
    nchars = len(unicodedata)
    with rffi.scoped_nonmoving_unicodebuffer(unicodedata) as rawinput:
        init_status = pypy_cjk_enc_init(encodebuf, rawinput, nchars)
        if init_status < 0:
            raise MemoryError
        flags = (MBENC_FLUSH | MBENC_RESET) if ignore_error == 0 else 0
        status = pypy_cjk_enc_chunk(encodebuf, flags)
        while status != 0 and status != ignore_error:
            multibytecodec_encerror(encodebuf, status, errors,
                                    errorcb, namecb, unicodedata)
            status = pypy_cjk_enc_chunk(encodebuf, flags)
        if flags & MBENC_RESET:
            # keep resetting until the encoder reports success
            status = pypy_cjk_enc_reset(encodebuf)
            while status != 0:
                multibytecodec_encerror(encodebuf, status, errors,
                                        errorcb, namecb, unicodedata)
                status = pypy_cjk_enc_reset(encodebuf)
        outbuf = pypy_cjk_enc_outbuf(encodebuf)
        outlen = pypy_cjk_enc_outlen(encodebuf)
        return rffi.charpsize2str(outbuf, outlen)
228
def multibytecodec_encerror(encodebuf, e, errors,
                            errorcb, namecb, unicodedata):
    """Handle the non-zero status 'e' reported by an encoding step.

    A positive 'e' is the size of the offending input; MBERR_TOOFEW means
    the rest of the input is an incomplete sequence.  MBERR_NOMEMORY
    raises MemoryError and any other status raises RuntimeError.

    Under "strict" an EncodeDecodeError is raised.  Otherwise a byte
    string replacement is computed -- empty for "ignore", an encoded "?"
    for "replace", or the result of 'errorcb' for other policies -- and
    handed to pypy_cjk_enc_replace_on_error together with the input
    position at which to resume.
    """
    if e == MBERR_NOMEMORY:
        raise MemoryError
    if e > 0:
        esize = e
        reason = "illegal multibyte sequence"
    elif e == MBERR_TOOFEW:
        esize = pypy_cjk_enc_inbuf_remaining(encodebuf)
        reason = "incomplete multibyte sequence"
    else:
        raise RuntimeError
    #
    # work out the byte string replacement -> 'replacement', and the
    # position in the input 'unicodedata' at which to resume -> 'resume'
    start = pypy_cjk_enc_inbuf_consumed(encodebuf)
    resume = start + esize
    if errors == "strict":
        raise EncodeDecodeError(start, resume, reason)
    if errors == "ignore":
        replacement = ""
    elif errors == "replace":
        codec = pypy_cjk_enc_getcodec(encodebuf)
        try:
            replacement = encode(codec, u"?")
        except EncodeDecodeError:
            # the codec cannot encode "?" itself: use the raw byte
            replacement = "?"
    else:
        assert errorcb
        retu, rets, resume = errorcb(errors, namecb, reason,
                                     unicodedata, start, resume)
        if rets is None:
            # the callback answered with unicode: encode it strictly
            # with the codec of this encoder state
            assert retu is not None
            codec = pypy_cjk_enc_getcodec(encodebuf)
            replacement = encode(codec, retu, "strict", errorcb, namecb)
        else:
            # py3k only
            replacement = rets
    with rffi.scoped_nonmovingbuffer(replacement) as rawrepl:
        status = pypy_cjk_enc_replace_on_error(encodebuf, rawrepl,
                                               len(replacement), resume)
    if status == MBERR_NOMEMORY:
        raise MemoryError