PageRenderTime 70ms CodeModel.GetById 24ms app.highlight 39ms RepoModel.GetById 1ms app.codeStats 0ms

/pypy/module/cpyext/unicodeobject.py

https://bitbucket.org/pypy/pypy/
Python | 761 lines | 697 code | 31 blank | 33 comment | 5 complexity | 00bf2400d9e5043a347a37a65d762767 MD5 | raw file
  1from pypy.interpreter.error import OperationError, oefmt
  2from rpython.rtyper.lltypesystem import rffi, lltype
  3from pypy.module.unicodedata import unicodedb
  4from pypy.module.cpyext.api import (
  5    CANNOT_FAIL, Py_ssize_t, build_type_checkers, cpython_api,
  6    bootstrap_function, PyObjectFields, cpython_struct, CONST_STRING,
  7    CONST_WSTRING)
  8from pypy.module.cpyext.pyerrors import PyErr_BadArgument
  9from pypy.module.cpyext.pyobject import (
 10    PyObject, PyObjectP, Py_DecRef, make_ref, from_ref, track_reference,
 11    make_typedescr, get_typedescr, as_pyobj)
 12from pypy.module.cpyext.bytesobject import PyString_Check
 13from pypy.module.sys.interp_encoding import setdefaultencoding
 14from pypy.module._codecs.interp_codecs import CodecState
 15from pypy.objspace.std import unicodeobject
 16from rpython.rlib import rstring, runicode
 17from rpython.tool.sourcetools import func_renamer
 18import sys
 19
 20## See comment in bytesobject.py.
 21
 22PyUnicodeObjectStruct = lltype.ForwardReference()
 23PyUnicodeObject = lltype.Ptr(PyUnicodeObjectStruct)
 24PyUnicodeObjectFields = (PyObjectFields +
 25    (("str", rffi.CWCHARP), ("length", Py_ssize_t),
 26     ("hash", rffi.LONG), ("defenc", PyObject)))
 27cpython_struct("PyUnicodeObject", PyUnicodeObjectFields, PyUnicodeObjectStruct)
 28
 29@bootstrap_function
 30def init_unicodeobject(space):
 31    make_typedescr(space.w_unicode.layout.typedef,
 32                   basestruct=PyUnicodeObject.TO,
 33                   attach=unicode_attach,
 34                   dealloc=unicode_dealloc,
 35                   realize=unicode_realize)
 36
 37# Buffer for the default encoding (used by PyUnicde_GetDefaultEncoding)
 38DEFAULT_ENCODING_SIZE = 100
 39default_encoding = lltype.malloc(rffi.CCHARP.TO, DEFAULT_ENCODING_SIZE,
 40                                 flavor='raw', zero=True)
 41
 42PyUnicode_Check, PyUnicode_CheckExact = build_type_checkers("Unicode", "w_unicode")
 43
 44Py_UNICODE = lltype.UniChar
 45
 46def new_empty_unicode(space, length):
 47    """
 48    Allocate a PyUnicodeObject and its buffer, but without a corresponding
 49    interpreter object.  The buffer may be mutated, until unicode_realize() is
 50    called.  Refcount of the result is 1.
 51    """
 52    typedescr = get_typedescr(space.w_unicode.layout.typedef)
 53    py_obj = typedescr.allocate(space, space.w_unicode)
 54    py_uni = rffi.cast(PyUnicodeObject, py_obj)
 55
 56    buflen = length + 1
 57    py_uni.c_length = length
 58    py_uni.c_str = lltype.malloc(rffi.CWCHARP.TO, buflen,
 59                                 flavor='raw', zero=True,
 60                                 add_memory_pressure=True)
 61    py_uni.c_hash = -1
 62    py_uni.c_defenc = lltype.nullptr(PyObject.TO)
 63    return py_uni
 64
 65def unicode_attach(space, py_obj, w_obj):
 66    "Fills a newly allocated PyUnicodeObject with a unicode string"
 67    py_unicode = rffi.cast(PyUnicodeObject, py_obj)
 68    py_unicode.c_length = len(space.unicode_w(w_obj))
 69    py_unicode.c_str = lltype.nullptr(rffi.CWCHARP.TO)
 70    py_unicode.c_hash = space.hash_w(w_obj)
 71    py_unicode.c_defenc = lltype.nullptr(PyObject.TO)
 72
 73def unicode_realize(space, py_obj):
 74    """
 75    Creates the unicode in the interpreter. The PyUnicodeObject buffer must not
 76    be modified after this call.
 77    """
 78    py_uni = rffi.cast(PyUnicodeObject, py_obj)
 79    s = rffi.wcharpsize2unicode(py_uni.c_str, py_uni.c_length)
 80    w_type = from_ref(space, rffi.cast(PyObject, py_obj.c_ob_type))
 81    w_obj = space.allocate_instance(unicodeobject.W_UnicodeObject, w_type)
 82    w_obj.__init__(s)
 83    py_uni.c_hash = space.hash_w(w_obj)
 84    track_reference(space, py_obj, w_obj)
 85    return w_obj
 86
 87@cpython_api([PyObject], lltype.Void, header=None)
 88def unicode_dealloc(space, py_obj):
 89    py_unicode = rffi.cast(PyUnicodeObject, py_obj)
 90    Py_DecRef(space, py_unicode.c_defenc)
 91    if py_unicode.c_str:
 92        lltype.free(py_unicode.c_str, flavor="raw")
 93
 94    from pypy.module.cpyext.object import _dealloc
 95    _dealloc(space, py_obj)
 96
 97@cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
 98def Py_UNICODE_ISSPACE(space, ch):
 99    """Return 1 or 0 depending on whether ch is a whitespace character."""
100    return unicodedb.isspace(ord(ch))
101
@cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
def Py_UNICODE_ISALPHA(space, ch):
    """Tell whether ch is an alphabetic character (1 if so, 0 otherwise)."""
    codepoint = ord(ch)
    return unicodedb.isalpha(codepoint)
106
@cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
def Py_UNICODE_ISALNUM(space, ch):
    """Tell whether ch is an alphanumeric character (1 if so, 0
    otherwise)."""
    codepoint = ord(ch)
    return unicodedb.isalnum(codepoint)
111
@cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
def Py_UNICODE_ISLINEBREAK(space, ch):
    """Tell whether ch is a linebreak character (1 if so, 0 otherwise)."""
    codepoint = ord(ch)
    return unicodedb.islinebreak(codepoint)
116
@cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
def Py_UNICODE_ISDECIMAL(space, ch):
    """Tell whether ch is a decimal character (1 if so, 0 otherwise)."""
    codepoint = ord(ch)
    return unicodedb.isdecimal(codepoint)
121
@cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
def Py_UNICODE_ISDIGIT(space, ch):
    """Tell whether ch is a digit character (1 if so, 0 otherwise)."""
    codepoint = ord(ch)
    return unicodedb.isdigit(codepoint)
126
@cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
def Py_UNICODE_ISNUMERIC(space, ch):
    """Tell whether ch is a numeric character (1 if so, 0 otherwise)."""
    codepoint = ord(ch)
    return unicodedb.isnumeric(codepoint)
131
@cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
def Py_UNICODE_ISLOWER(space, ch):
    """Tell whether ch is a lowercase character (1 if so, 0 otherwise)."""
    codepoint = ord(ch)
    return unicodedb.islower(codepoint)
136
@cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
def Py_UNICODE_ISUPPER(space, ch):
    """Tell whether ch is an uppercase character (1 if so, 0 otherwise)."""
    codepoint = ord(ch)
    return unicodedb.isupper(codepoint)
141
@cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
def Py_UNICODE_ISTITLE(space, ch):
    """Tell whether ch is a titlecase character (1 if so, 0 otherwise)."""
    codepoint = ord(ch)
    return unicodedb.istitle(codepoint)
146
@cpython_api([Py_UNICODE], Py_UNICODE, error=CANNOT_FAIL)
def Py_UNICODE_TOLOWER(space, ch):
    """Give back the lower-case counterpart of the character ch."""
    lowered = unicodedb.tolower(ord(ch))
    return unichr(lowered)
151
@cpython_api([Py_UNICODE], Py_UNICODE, error=CANNOT_FAIL)
def Py_UNICODE_TOUPPER(space, ch):
    """Give back the upper-case counterpart of the character ch."""
    uppered = unicodedb.toupper(ord(ch))
    return unichr(uppered)
156
@cpython_api([Py_UNICODE], Py_UNICODE, error=CANNOT_FAIL)
def Py_UNICODE_TOTITLE(space, ch):
    """Give back the title-case counterpart of the character ch."""
    titled = unicodedb.totitle(ord(ch))
    return unichr(titled)
161
@cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
def Py_UNICODE_TODECIMAL(space, ch):
    """Convert the character ch to its decimal value.  Yields -1 when ch has
    no decimal value; never raises."""
    codepoint = ord(ch)
    try:
        value = unicodedb.decimal(codepoint)
    except KeyError:
        # The database has no decimal value for this code point.
        return -1
    return value
170
@cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
def Py_UNICODE_TODIGIT(space, ch):
    """Convert the character ch to its single-digit value.  Yields -1 when
    ch has no digit value; never raises."""
    codepoint = ord(ch)
    try:
        value = unicodedb.digit(codepoint)
    except KeyError:
        # The database has no digit value for this code point.
        return -1
    return value
179
@cpython_api([Py_UNICODE], rffi.DOUBLE, error=CANNOT_FAIL)
def Py_UNICODE_TONUMERIC(space, ch):
    """Convert the character ch to its numeric value as a double.  Yields
    -1.0 when ch has no numeric value; never raises."""
    codepoint = ord(ch)
    try:
        value = unicodedb.numeric(codepoint)
    except KeyError:
        # The database has no numeric value for this code point.
        return -1.0
    return value
188
@cpython_api([], Py_UNICODE, error=CANNOT_FAIL)
def PyUnicode_GetMax(space):
    """Give the character with the largest ordinal a Unicode character may
    have."""
    highest = runicode.MAXUNICODE
    return runicode.UNICHR(highest)
193
@cpython_api([rffi.VOIDP], rffi.CCHARP, error=CANNOT_FAIL)
def PyUnicode_AS_DATA(space, ref):
    """Give the object's internal character buffer viewed as a char
    pointer.  ref must be a PyUnicodeObject; this is not verified."""
    wide_buf = PyUnicode_AS_UNICODE(space, ref)
    return rffi.cast(rffi.CCHARP, wide_buf)
199
@cpython_api([rffi.VOIDP], Py_ssize_t, error=CANNOT_FAIL)
def PyUnicode_GET_DATA_SIZE(space, w_obj):
    """Give the byte size of the object's internal buffer: the number of
    characters times the size of one character.  w_obj must be a unicode
    object; this is not verified here."""
    char_size = rffi.sizeof(lltype.UniChar)
    n_chars = PyUnicode_GET_SIZE(space, w_obj)
    return char_size * n_chars
205
@cpython_api([rffi.VOIDP], Py_ssize_t, error=CANNOT_FAIL)
def PyUnicode_GET_SIZE(space, w_obj):
    """Give the length of the unicode object w_obj.  Only an assertion
    guards the object's type."""
    assert isinstance(w_obj, unicodeobject.W_UnicodeObject)
    n_chars = space.len_w(w_obj)
    return n_chars
212
@cpython_api([rffi.VOIDP], rffi.CWCHARP, error=CANNOT_FAIL)
def PyUnicode_AS_UNICODE(space, ref):
    """Give a pointer to the object's internal Py_UNICODE buffer, creating
    the buffer from the interpreter-level string on first use.  ref must be
    a PyUnicodeObject; this is not verified."""
    py_uni = rffi.cast(PyUnicodeObject, ref)
    if py_uni.c_str:
        return py_uni.c_str
    # No buffer yet: copy the characters of the interpreter-level object.
    w_unicode = from_ref(space, rffi.cast(PyObject, ref))
    py_uni.c_str = rffi.unicode2wcharp(space.unicode_w(w_unicode))
    return py_uni.c_str
224
@cpython_api([PyObject], rffi.CWCHARP)
def PyUnicode_AsUnicode(space, ref):
    """Give a read-only pointer to the internal Py_UNICODE buffer of a
    unicode object.  A TypeError is raised when ref is not an instance of
    unicode (or of a subtype)."""
    # PyUnicode_Check is avoided on purpose: it would realize the object.
    w_type = from_ref(space, rffi.cast(PyObject, ref.c_ob_type))
    if space.issubtype_w(w_type, space.w_unicode):
        return PyUnicode_AS_UNICODE(space, rffi.cast(rffi.VOIDP, ref))
    raise oefmt(space.w_TypeError, "expected unicode object")
234
@cpython_api([PyObject], Py_ssize_t, error=-1)
def PyUnicode_GetSize(space, ref):
    """Give the length of ref.  For an exact unicode object the length field
    of the C struct is read directly; anything else goes through the
    interpreter-level object."""
    w_type = from_ref(space, rffi.cast(PyObject, ref.c_ob_type))
    if w_type is not space.w_unicode:
        return space.len_w(from_ref(space, ref))
    py_uni = rffi.cast(PyUnicodeObject, ref)
    return py_uni.c_length
243
@cpython_api([PyUnicodeObject, rffi.CWCHARP, Py_ssize_t], Py_ssize_t, error=-1)
def PyUnicode_AsWideChar(space, ref, buf, size):
    """Copy the characters of the unicode object into the wchar_t buffer
    buf.  No more than size characters are written, not counting a possible
    trailing 0 terminator.  The result is the number of characters copied.
    The output is 0-terminated only when buf has room for more than the
    string itself; callers needing a terminator must ensure it
    themselves."""
    src = PyUnicode_AS_UNICODE(space, rffi.cast(rffi.VOIDP, ref))
    n_chars = ref.c_length

    if size > n_chars:
        # There is spare room: take the 0 terminator along as well, but
        # do not count it in the result.
        to_write = n_chars + 1
        copied = n_chars
    else:
        to_write = size
        copied = size

    for i in range(to_write):
        buf[i] = src[i]
    return copied
270
@cpython_api([], rffi.CCHARP, error=CANNOT_FAIL)
def PyUnicode_GetDefaultEncoding(space):
    """Returns the currently active default encoding.

    The name is copied into the module-level ``default_encoding`` buffer the
    first time it is needed (an empty buffer means "not cached") and that
    buffer is returned.  Names longer than the buffer are truncated so that
    the result is always 0-terminated."""
    if default_encoding[0] == '\x00':
        encoding = unicodeobject.getdefaultencoding(space)
        # Keep the last slot free for the terminator.
        count = min(len(encoding), DEFAULT_ENCODING_SIZE - 1)
        for i in range(count):
            default_encoding[i] = encoding[i]
        # Terminate explicitly: the buffer may still hold the tail of a
        # longer name cached before PyUnicode_SetDefaultEncoding() cleared
        # only its first byte.
        default_encoding[count] = '\x00'
    return default_encoding
281
@cpython_api([CONST_STRING], rffi.INT_real, error=-1)
def PyUnicode_SetDefaultEncoding(space, encoding):
    """Change the currently active default encoding.  The result is 0 on
    success and -1 when an error occurred."""
    if not encoding:
        PyErr_BadArgument(space)
    name = rffi.charp2str(encoding)
    setdefaultencoding(space, space.wrap(name))
    # Empty the cached C copy so PyUnicode_GetDefaultEncoding() refills it.
    default_encoding[0] = '\x00'
    return 0
292
@cpython_api([PyObject, CONST_STRING, CONST_STRING], PyObject)
def PyUnicode_AsEncodedObject(space, w_unicode, llencoding, llerrors):
    """Encode a unicode object and hand back whatever object the codec
    produced.  llencoding and llerrors mean the same as the corresponding
    arguments of the unicode encode() method; either may be NULL, in which
    case None is passed on.  NULL is returned if the codec raised an
    exception."""
    if not PyUnicode_Check(space, w_unicode):
        PyErr_BadArgument(space)

    encoding = rffi.charp2str(llencoding) if llencoding else None
    errors = rffi.charp2str(llerrors) if llerrors else None
    return unicodeobject.encode_object(space, w_unicode, encoding, errors)
309
@cpython_api([PyObject, CONST_STRING, CONST_STRING], PyObject)
def PyUnicode_AsEncodedString(space, w_unicode, llencoding, llerrors):
    """Encode a unicode object into a Python string object.  Works like
    PyUnicode_AsEncodedObject() but additionally raises TypeError when the
    codec's result is not a string.  NULL is returned if an exception was
    raised."""
    w_encoded = PyUnicode_AsEncodedObject(space, w_unicode,
                                          llencoding, llerrors)
    if PyString_Check(space, w_encoded):
        return w_encoded
    raise oefmt(space.w_TypeError,
                "encoder did not return a string object")
322
@cpython_api([PyObject], PyObject)
def PyUnicode_AsUnicodeEscapeString(space, w_unicode):
    """Encode a unicode object with the Unicode-Escape codec, using "strict"
    error handling, and hand back the resulting Python string object.  NULL
    is returned if the codec raised an exception."""
    is_unicode = PyUnicode_Check(space, w_unicode)
    if not is_unicode:
        PyErr_BadArgument(space)

    return unicodeobject.encode_object(
        space, w_unicode, 'unicode-escape', 'strict')
332
@cpython_api([CONST_WSTRING, Py_ssize_t], PyObject, result_is_ll=True)
def PyUnicode_FromUnicode(space, wchar_p, length):
    """Create a unicode object of the given length from a Py_UNICODE buffer.
    When wchar_p is NULL an object with a zero-filled, still writable buffer
    is handed back and the caller is expected to fill it in.  Otherwise the
    characters are copied into a new interpreter-level string; such a result
    must not be modified."""
    if not wchar_p:
        return rffi.cast(PyObject, new_empty_unicode(space, length))
    value = rffi.wcharpsize2unicode(wchar_p, length)
    return make_ref(space, space.wrap(value))
346
@cpython_api([CONST_WSTRING, Py_ssize_t], PyObject, result_is_ll=True)
def PyUnicode_FromWideChar(space, wchar_p, length):
    """Create a unicode object from a wchar_t buffer of the given length.
    NULL is returned on failure."""
    # Py_UNICODE and wchar_t are taken to be the same type in PyPy, so this
    # simply forwards to PyUnicode_FromUnicode().
    py_result = PyUnicode_FromUnicode(space, wchar_p, length)
    return py_result
353
@cpython_api([PyObject, CONST_STRING], PyObject, result_is_ll=True)
def _PyUnicode_AsDefaultEncodedString(space, ref, errors):
    """Give the string obtained by encoding ref with a NULL encoding name.
    The string is computed once and kept in the object's defenc field; the
    returned reference is borrowed."""
    py_uni = rffi.cast(PyUnicodeObject, ref)
    if py_uni.c_defenc:
        return py_uni.c_defenc
    no_encoding = lltype.nullptr(rffi.CCHARP.TO)
    w_encoded = PyUnicode_AsEncodedString(space, ref, no_encoding, errors)
    py_uni.c_defenc = make_ref(space, w_encoded)
    return py_uni.c_defenc
364
@cpython_api([CONST_STRING, Py_ssize_t, CONST_STRING, CONST_STRING], PyObject)
def PyUnicode_Decode(space, s, size, encoding, errors):
    """Create a unicode object by decoding the first size bytes of s.
    encoding and errors mean the same as the corresponding arguments of the
    unicode() built-in; the codec comes from the Python codec registry.
    NULL is returned if the codec raised an exception."""
    if not encoding:
        # Follows CPython 2.7; CPython 3.4 hardcodes 'utf-8' here instead.
        encoding = PyUnicode_GetDefaultEncoding(space)
    w_data = space.newbytes(rffi.charpsize2str(s, size))
    w_encoding = space.wrap(rffi.charp2str(encoding))
    w_errors = space.newbytes(rffi.charp2str(errors)) if errors else None
    return space.call_method(w_data, 'decode', w_encoding, w_errors)
382
@cpython_api([PyObject], PyObject)
def PyUnicode_FromObject(space, w_obj):
    """Coerce w_obj to unicode.  An object whose type is exactly unicode is
    handed back unchanged; anything else is passed to the unicode type's
    constructor."""
    if not space.is_w(space.type(w_obj), space.w_unicode):
        return space.call_function(space.w_unicode, w_obj)
    return w_obj
391
@cpython_api([PyObject, CONST_STRING, CONST_STRING], PyObject)
def PyUnicode_FromEncodedObject(space, w_obj, encoding, errors):
    """Coerce an encoded object obj to an Unicode object and return a reference with
    incremented refcount.

    The object is decoded by calling its decode() method with the given
    encoding and errors.  Both can be NULL: a NULL encoding selects the
    interpreter's default encoding, a NULL errors passes None on.

    Unicode objects, and objects that have no decode() method, cause a
    TypeError to be set."""
    if encoding:
        w_encoding = space.wrap(rffi.charp2str(encoding))
    else:
        # NULL means "use the default encoding"; it is not an error.
        w_encoding = space.wrap(unicodeobject.getdefaultencoding(space))
    if errors:
        w_errors = space.wrap(rffi.charp2str(errors))
    else:
        w_errors = None

    # Unicode input is rejected outright.
    if space.isinstance_w(w_obj, space.w_unicode):
        raise oefmt(space.w_TypeError, "decoding Unicode is not supported")
    try:
        w_meth = space.getattr(w_obj, space.wrap('decode'))
    except OperationError as e:
        if not e.match(space, space.w_AttributeError):
            raise
        # No decode() method: report it as a TypeError, like unicode input.
        raise oefmt(space.w_TypeError, "decoding Unicode is not supported")
    return space.call_function(w_meth, w_encoding, w_errors)
426
@cpython_api([CONST_STRING], PyObject)
def PyUnicode_FromString(space, s):
    """Create a unicode object by decoding a null-terminated char buffer as
    UTF-8."""
    w_bytes = space.newbytes(rffi.charp2str(s))
    w_codec = space.wrap("utf-8")
    return space.call_method(w_bytes, 'decode', w_codec)
432
@cpython_api([CONST_STRING, Py_ssize_t], PyObject, result_is_ll=True)
def PyUnicode_FromStringAndSize(space, s, size):
    """Create a unicode object from size bytes of the char buffer s, which
    are decoded as UTF-8.  When s is NULL an object with a zero-filled,
    still writable buffer of size characters is handed back and the caller
    is expected to fill it in; a result built from a non-NULL s must not be
    modified."""
    if not s:
        return rffi.cast(PyObject, new_empty_unicode(space, size))
    default_errors = lltype.nullptr(rffi.CCHARP.TO)
    w_decoded = PyUnicode_DecodeUTF8(space, s, size, default_errors)
    return make_ref(space, w_decoded)
446
@cpython_api([rffi.INT_real], PyObject)
def PyUnicode_FromOrdinal(space, ordinal):
    """Create a one-character unicode object for the code point ordinal by
    calling the unichr() builtin.

    Valid ordinals are range(0x10000) on narrow (UCS2) builds and
    range(0x110000) on wide (UCS4) builds; anything else makes unichr()
    raise a ValueError."""
    w_unichr = space.builtin.get('unichr')
    w_ordinal = space.wrap(rffi.cast(lltype.Signed, ordinal))
    return space.call_function(w_unichr, w_ordinal)
456
@cpython_api([PyObjectP, Py_ssize_t], rffi.INT_real, error=-1)
def PyUnicode_Resize(space, ref, newsize):
    """Replace the unicode object in ref[0] by a new one of newsize
    characters, carrying over as many characters as fit.  The old object
    must still own a C buffer, otherwise SystemError is raised.  On
    MemoryError the old reference is dropped and ref[0] is set to NULL.
    The result is 0 on success."""
    # XXX always create a new string so far
    py_old = rffi.cast(PyUnicodeObject, ref[0])
    if not py_old.c_str:
        raise oefmt(space.w_SystemError,
                    "PyUnicode_Resize called on already created string")
    try:
        py_new = new_empty_unicode(space, newsize)
    except MemoryError:
        Py_DecRef(space, ref[0])
        ref[0] = lltype.nullptr(PyObject.TO)
        raise
    # Copy whatever fits: the shorter of the old and the new length.
    n_copy = min(newsize, py_old.c_length)
    i = 0
    while i < n_copy:
        py_new.c_str[i] = py_old.c_str[i]
        i += 1
    Py_DecRef(space, ref[0])
    ref[0] = rffi.cast(PyObject, py_new)
    return 0
479
def make_conversion_functions(suffix, encoding):
    """Define the PyUnicode_As<suffix>String, PyUnicode_Decode<suffix> and
    PyUnicode_Encode<suffix> API functions for the codec named encoding.
    The Decode and Encode variants are also stored in the module globals
    under their final names."""
    as_string_name = 'PyUnicode_As%sString' % suffix
    decode_name = 'PyUnicode_Decode%s' % suffix
    encode_name = 'PyUnicode_Encode%s' % suffix

    @cpython_api([PyObject], PyObject)
    @func_renamer(as_string_name)
    def PyUnicode_AsXXXString(space, w_unicode):
        """Encode a unicode object with "strict" error handling and hand
        back the resulting Python string object.  NULL is returned if the
        codec raised an exception."""
        is_unicode = PyUnicode_Check(space, w_unicode)
        if not is_unicode:
            PyErr_BadArgument(space)
        return unicodeobject.encode_object(space, w_unicode, encoding, "strict")

    @cpython_api([CONST_STRING, Py_ssize_t, CONST_STRING], PyObject)
    @func_renamer(decode_name)
    def PyUnicode_DecodeXXX(space, s, size, errors):
        """Create a unicode object by decoding the first size bytes of
        the encoded string s.  NULL is returned if the codec raised an
        exception.
        """
        w_bytes = space.newbytes(rffi.charpsize2str(s, size))
        w_errors = space.wrap(rffi.charp2str(errors)) if errors else None
        w_codec = space.wrap(encoding)
        return space.call_method(w_bytes, 'decode', w_codec, w_errors)
    globals()[decode_name] = PyUnicode_DecodeXXX

    @cpython_api([CONST_WSTRING, Py_ssize_t, CONST_STRING], PyObject)
    @func_renamer(encode_name)
    def PyUnicode_EncodeXXX(space, s, size, errors):
        """Encode size characters of the Py_UNICODE buffer s and hand back
        a Python string object.  NULL is returned if the codec raised an
        exception."""
        w_text = space.wrap(rffi.wcharpsize2unicode(s, size))
        w_errors = space.wrap(rffi.charp2str(errors)) if errors else None
        w_codec = space.wrap(encoding)
        return space.call_method(w_text, 'encode', w_codec, w_errors)
    globals()[encode_name] = PyUnicode_EncodeXXX

# Instantiate the function families for the supported codecs; MBCS only
# exists on Windows.
make_conversion_functions('UTF8', 'utf-8')
make_conversion_functions('ASCII', 'ascii')
make_conversion_functions('Latin1', 'latin-1')
if sys.platform == 'win32':
    make_conversion_functions('MBCS', 'mbcs')
525
@cpython_api([rffi.CCHARP, Py_ssize_t, rffi.CCHARP, rffi.INTP], PyObject)
def PyUnicode_DecodeUTF16(space, s, size, llerrors, pbyteorder):
    """Decode size bytes from a UTF-16 encoded buffer and return the
    corresponding Unicode object.  llerrors (if non-NULL) names the error
    handling; NULL passes None on to the decoder.

    If pbyteorder is non-NULL, decoding starts in the byte order it points
    to:

    *pbyteorder < 0:  little endian
    *pbyteorder == 0: native order
    *pbyteorder > 0:  big endian

    If *pbyteorder is zero and the input starts with a byte order mark
    (BOM), the decoder switches to that byte order and the BOM is not
    copied into the result.  With an explicit byte order any BOM is copied
    to the output as an ordinary character.

    After completion, *pbyteorder is set to the byte order reported by the
    decoder.  If pbyteorder is NULL, the codec starts in native order mode
    and nothing is written back.

    Return NULL if an exception was raised by the codec."""

    string = rffi.charpsize2str(s, size)

    # Test the pointer for truth rather than against None: a NULL coming
    # from C is a null low-level pointer, which is not None but is false.
    if pbyteorder:
        llbyteorder = rffi.cast(lltype.Signed, pbyteorder[0])
        if llbyteorder < 0:
            byteorder = "little"
        elif llbyteorder > 0:
            byteorder = "big"
        else:
            byteorder = "native"
    else:
        byteorder = "native"

    if llerrors:
        errors = rffi.charp2str(llerrors)
    else:
        errors = None

    result, _consumed, byteorder = runicode.str_decode_utf_16_helper(
        string, size, errors,
        True, # final ? false for multiple passes?
        None, # errorhandler
        byteorder)
    if pbyteorder:
        pbyteorder[0] = rffi.cast(rffi.INT, byteorder)

    return space.wrap(result)
579
@cpython_api([rffi.CCHARP, Py_ssize_t, rffi.CCHARP, rffi.INTP], PyObject)
def PyUnicode_DecodeUTF32(space, s, size, llerrors, pbyteorder):
    """Decode size bytes from a UTF-32 encoded buffer and return the
    corresponding Unicode object.  llerrors (if non-NULL) names the error
    handling; NULL passes None on to the decoder.

    If pbyteorder is non-NULL, decoding starts in the byte order it points
    to:

    *pbyteorder < 0:  little endian
    *pbyteorder == 0: native order
    *pbyteorder > 0:  big endian

    If *pbyteorder is zero and the input starts with a byte order mark
    (BOM), the decoder switches to that byte order and the BOM is not
    copied into the result.  With an explicit byte order any BOM is copied
    to the output.

    After completion, *pbyteorder is set to the byte order reported by the
    decoder.  If pbyteorder is NULL, the codec starts in native order mode
    and nothing is written back.

    In a narrow build codepoints outside the BMP will be decoded as
    surrogate pairs.

    Return NULL if an exception was raised by the codec.
    """
    string = rffi.charpsize2str(s, size)

    if pbyteorder:
        llbyteorder = rffi.cast(lltype.Signed, pbyteorder[0])
        if llbyteorder < 0:
            byteorder = "little"
        elif llbyteorder > 0:
            byteorder = "big"
        else:
            byteorder = "native"
    else:
        byteorder = "native"

    if llerrors:
        errors = rffi.charp2str(llerrors)
    else:
        errors = None

    result, _consumed, byteorder = runicode.str_decode_utf_32_helper(
        string, size, errors,
        True, # final ? false for multiple passes?
        None, # errorhandler
        byteorder)
    # Same truth test as on entry: a NULL coming from C is a null low-level
    # pointer, which is not None, and must not be written through.
    if pbyteorder:
        pbyteorder[0] = rffi.cast(rffi.INT, byteorder)

    return space.wrap(result)
635
@cpython_api([rffi.CWCHARP, Py_ssize_t, rffi.CCHARP, rffi.CCHARP],
             rffi.INT_real, error=-1)
def PyUnicode_EncodeDecimal(space, s, length, output, llerrors):
    """Convert a unicode string holding a decimal value into standard ASCII
    digit codes, written to the output buffer.

    The caller must provide at least length+1 bytes of storage in output;
    the written string is 0-terminated.

    Whitespace becomes ' ', decimal characters become their ASCII digit and
    the remaining Latin-1 characters other than NUL are kept unchanged.
    Anything else, embedded NUL characters included, is handled as an
    encoding error.

    The result is 0 on success and -1 on failure.
    """
    text = rffi.wcharpsize2unicode(s, length)
    errors = rffi.charp2str(llerrors) if llerrors else None
    state = space.fromcache(CodecState)
    encoded = runicode.unicode_encode_decimal(text, length, errors,
                                              state.encode_error_handler)
    n_bytes = len(encoded)
    for i in range(n_bytes):
        output[i] = encoded[i]
    output[n_bytes] = '\0'
    return 0
667
@cpython_api([PyObject, PyObject], rffi.INT_real, error=-2)
def PyUnicode_Compare(space, w_left, w_right):
    """Compare two strings; the result is -1, 0 or 1 for less than, equal
    and greater than, respectively."""
    w_order = space.cmp(w_left, w_right)
    return space.int_w(w_order)
673
@cpython_api([PyObject, PyObject], PyObject)
def PyUnicode_Concat(space, w_left, w_right):
    """Concatenate two strings into a new unicode string."""
    w_joined = space.add(w_left, w_right)
    return w_joined
678
@cpython_api([rffi.CWCHARP, rffi.CWCHARP, Py_ssize_t], lltype.Void)
def Py_UNICODE_COPY(space, target, source, length):
    """Copy length Py_UNICODE characters from source to target, one
    character at a time; roughly a memcpy() whose unit is Py_UNICODE."""
    i = 0
    while i < length:
        target[i] = source[i]
        i += 1
685
@cpython_api([PyObject, PyObject], PyObject)
def PyUnicode_Format(space, w_format, w_args):
    """Build a new string object from a format and its arguments, just like
    format % args does.  w_args is expected to be a tuple."""
    w_formatted = space.mod(w_format, w_args)
    return w_formatted
691
@cpython_api([PyObject, PyObject], PyObject)
def PyUnicode_Join(space, w_sep, w_seq):
    """Join the strings of a sequence with the given separator, by calling
    the separator's join() method, and hand back the result."""
    w_joined = space.call_method(w_sep, 'join', w_seq)
    return w_joined
697
@cpython_api([PyObject, PyObject, PyObject, Py_ssize_t], PyObject)
def PyUnicode_Replace(space, w_str, w_substr, w_replstr, maxcount):
    """Replace up to maxcount occurrences of substr in str by replstr and
    hand back the resulting unicode object.  A maxcount of -1 replaces
    every occurrence."""
    w_maxcount = space.wrap(maxcount)
    return space.call_method(w_str, "replace",
                             w_substr, w_replstr, w_maxcount)
705
@cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t, rffi.INT_real],
             rffi.INT_real, error=-1)
def PyUnicode_Tailmatch(space, w_str, w_substr, start, end, direction):
    """Tell whether substr matches str[start:end] at one of its ends.  A
    positive direction requests a suffix match, any other value a prefix
    match.  The result is 1 on a match and 0 otherwise; -1 signals an
    error."""
    text = space.unicode_w(w_str)
    pattern = space.unicode_w(w_substr)
    if rffi.cast(lltype.Signed, direction) > 0:
        return rstring.endswith(text, pattern, start, end)
    return rstring.startswith(text, pattern, start, end)
718
@cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t], Py_ssize_t, error=-1)
def PyUnicode_Count(space, w_str, w_substr, start, end):
    """Count the non-overlapping occurrences of substr within
    str[start:end], using the string's count() method.  -1 signals an
    error."""
    w_start = space.wrap(start)
    w_end = space.wrap(end)
    w_count = space.call_method(w_str, "count", w_substr, w_start, w_end)
    return space.int_w(w_count)
726
@cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t, rffi.INT_real],
             Py_ssize_t, error=-2)
def PyUnicode_Find(space, w_str, w_substr, start, end, direction):
    """Locate substr within str[start:end].  A positive direction searches
    forward with find(); any other value searches backward with rfind().
    The result is the index of the match, -1 when there is none, and -2
    when an error occurred and an exception has been set."""
    w_start = space.wrap(start)
    w_end = space.wrap(end)
    if rffi.cast(lltype.Signed, direction) <= 0:
        w_pos = space.call_method(w_str, "rfind", w_substr, w_start, w_end)
    else:
        w_pos = space.call_method(w_str, "find", w_substr, w_start, w_end)
    return space.int_w(w_pos)
743
@cpython_api([PyObject, PyObject, Py_ssize_t], PyObject)
def PyUnicode_Split(space, w_str, w_sep, maxsplit):
    """Split a string into a list of unicode strings using its split()
    method.  A NULL sep is passed on as None, which splits at runs of
    whitespace; otherwise splitting happens at the given separator.  No
    more than maxsplit splits are made, and a negative value means no
    limit.  Separators do not appear in the result."""
    w_separator = space.w_None if w_sep is None else w_sep
    w_maxsplit = space.wrap(maxsplit)
    return space.call_method(w_str, "split", w_separator, w_maxsplit)
754
@cpython_api([PyObject, rffi.INT_real], PyObject)
def PyUnicode_Splitlines(space, w_str, keepend):
    """Split a unicode string at its line breaks, using its splitlines()
    method, into a list of unicode strings.  CRLF counts as a single line
    break.  With keepend equal to 0 the line break characters are left out
    of the resulting strings."""
    w_keepend = space.wrap(keepend)
    return space.call_method(w_str, "splitlines", w_keepend)