unicodeobject.c - This C code implements a Unicode characte…

/Objects/unicodeobject.c

http://unladen-swallow.googlecode.com/ · C · 9106 lines · 7348 code · 957 blank · 801 comment · 1907 complexity · 9db69b4d6a66f4afb0d8ed2b2d32eecc MD5 · raw file
Large files are truncated click here to view the full file

/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

#define PY_SSIZE_T_CLEAN
#include "Python.h"

#include "unicodeobject.h"
#include "ucnhash.h"

#ifdef MS_WINDOWS
#include <windows.h>
#endif

/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/


#ifdef __cplusplus
extern "C" {
#endif

/* Free list for Unicode objects */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];

/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * HORIZONTAL TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * VERTICAL TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

/* Same for linebreaks */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};


Py_UNICODE
PyUnicode_GetMax(void)
{
#ifdef Py_UNICODE_WIDE
    return 0x10FFFF;
#else
    /* This is actually an illegal character, so it should
       not be passed to unichr. */
    return 0xFFFF;
#endif
}

/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode characters as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))

#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))

Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
{
    /* calculate simple bloom-style bitmask for a given unicode string */

    long mask;
    Py_ssize_t i;

    mask = 0;
    for (i = 0; i < len; i++)
        mask |= (1 << (ptr[i] & 0x1F));

    return mask;
}

Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
{
    Py_ssize_t i;

    for (i = 0; i < setlen; i++)
        if (set[i] == chr)
            return 1;

    return 0;
}

#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    BLOOM(mask, chr) && unicode_member(chr, set, setlen)

/* --- Unicode Object ----------------------------------------------------- */

static
int unicode_resize(register PyUnicodeObject *unicode,
                   Py_ssize_t length)
{
    void *oldstr;

    /* Shortcut if there's nothing much to do. */
    if (unicode->length == length)
        goto reset;

    /* Resizing shared object (unicode_empty or single character
       objects) in-place is not allowed. Use PyUnicode_Resize()
       instead ! */

    if (unicode == unicode_empty ||
        (unicode->length == 1 &&
         unicode->str[0] < 256U &&
         unicode_latin1[unicode->str[0]] == unicode)) {
        PyErr_SetString(PyExc_SystemError,
                        "can't resize shared unicode objects");
        return -1;
    }

    /* We allocate one more byte to make sure the string is Ux0000 terminated.
       The overallocation is also used by fastsearch, which assumes that it's
       safe to look at str[length] (without making any assumptions about what
       it contains). */

    oldstr = unicode->str;
    unicode->str = PyObject_REALLOC(unicode->str,
                                    sizeof(Py_UNICODE) * (length + 1));
    if (!unicode->str) {
        unicode->str = (Py_UNICODE *)oldstr;
        PyErr_NoMemory();
        return -1;
    }
    unicode->str[length] = 0;
    unicode->length = length;

  reset:
    /* Reset the object caches */
    if (unicode->defenc) {
        Py_DECREF(unicode->defenc);
        unicode->defenc = NULL;
    }
    unicode->hash = -1;

    return 0;
}

/* We allocate one more byte to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/

static
PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
{
    register PyUnicodeObject *unicode;

    /* Optimization for empty strings */
    if (length == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* Ensure we won't overflow the size. */
    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
        return (PyUnicodeObject *)PyErr_NoMemory();
    }

    /* Unicode freelist & memory allocation */
    if (free_list) {
        unicode = free_list;
        free_list = *(PyUnicodeObject **)unicode;
        numfree--;
        if (unicode->str) {
            /* Keep-Alive optimization: we only upsize the buffer,
               never downsize it. */
            if ((unicode->length < length) &&
                unicode_resize(unicode, length) < 0) {
                PyObject_DEL(unicode->str);
                unicode->str = NULL;
            }
        }
        else {
            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
        }
        PyObject_INIT(unicode, &PyUnicode_Type);
    }
    else {
        size_t new_size;
        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
        if (unicode == NULL)
            return NULL;
        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
    }

    if (!unicode->str) {
        PyErr_NoMemory();
        goto onError;
    }
    /* Initialize the first element to guard against cases where
     * the caller fails before initializing str -- unicode_resize()
     * reads str[0], and the Keep-Alive optimization can keep memory
     * allocated for str alive across a call to unicode_dealloc(unicode).
     * We don't want unicode_resize to read uninitialized memory in
     * that case.
     */
    unicode->str[0] = 0;
    unicode->str[length] = 0;
    unicode->length = length;
    unicode->hash = -1;
    unicode->defenc = NULL;
    return unicode;

  onError:
    /* XXX UNREF/NEWREF interface should be more symmetrical */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference((PyObject *)unicode);
    PyObject_Del(unicode);
    return NULL;
}

static
void unicode_dealloc(register PyUnicodeObject *unicode)
{
    if (PyUnicode_CheckExact(unicode) &&
        numfree < PyUnicode_MAXFREELIST) {
        /* Keep-Alive optimization */
        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
            PyObject_DEL(unicode->str);
            unicode->str = NULL;
            unicode->length = 0;
        }
        if (unicode->defenc) {
            Py_DECREF(unicode->defenc);
            unicode->defenc = NULL;
        }
        /* Add to free list */
        *(PyUnicodeObject **)unicode = free_list;
        free_list = unicode;
        numfree++;
    }
    else {
        PyObject_DEL(unicode->str);
        Py_XDECREF(unicode->defenc);
        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
    }
}

static
int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
{
    register PyUnicodeObject *v;

    /* Argument checks */
    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }
    v = *unicode;
    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
        PyErr_BadInternalCall();
        return -1;
    }

    /* Resizing unicode_empty and single character objects is not
       possible since these are being shared. We simply return a fresh
       copy with the same Unicode content. */
    if (v->length != length &&
        (v == unicode_empty || v->length == 1)) {
        PyUnicodeObject *w = _PyUnicode_New(length);
        if (w == NULL)
            return -1;
        Py_UNICODE_COPY(w->str, v->str,
                        length < v->length ? length : v->length);
        Py_DECREF(*unicode);
        *unicode = w;
        return 0;
    }

    /* Note that we don't have to modify *unicode for unshared Unicode
       objects, since we can modify them in-place. */
    return unicode_resize(v, length);
}

int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
{
    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
}

PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
                                Py_ssize_t size)
{
    PyUnicodeObject *unicode;

    /* If the Unicode data is known at construction time, we can apply
       some optimizations which share commonly used objects. */
    if (u != NULL) {

        /* Optimization for empty strings */
        if (size == 0 && unicode_empty != NULL) {
            Py_INCREF(unicode_empty);
            return (PyObject *)unicode_empty;
        }

        /* Single character Unicode objects in the Latin-1 range are
           shared when using this constructor */
        if (size == 1 && *u < 256) {
            unicode = unicode_latin1[*u];
            if (!unicode) {
                unicode = _PyUnicode_New(1);
                if (!unicode)
                    return NULL;
                unicode->str[0] = *u;
                unicode_latin1[*u] = unicode;
            }
            Py_INCREF(unicode);
            return (PyObject *)unicode;
        }
    }

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;

    /* Copy the Unicode data into the new object */
    if (u != NULL)
        Py_UNICODE_COPY(unicode->str, u, size);

    return (PyObject *)unicode;
}

PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
{
    PyUnicodeObject *unicode;

    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to PyUnicode_FromStringAndSize");
        return NULL;
    }

    /* If the Unicode data is known at construction time, we can apply
       some optimizations which share commonly used objects.
       Also, this means the input must be UTF-8, so fall back to the
       UTF-8 decoder at the end. */
    if (u != NULL) {

        /* Optimization for empty strings */
        if (size == 0 && unicode_empty != NULL) {
            Py_INCREF(unicode_empty);
            return (PyObject *)unicode_empty;
        }

        /* Single characters are shared when using this constructor.
           Restrict to ASCII, since the input must be UTF-8. */
        if (size == 1 && Py_CHARMASK(*u) < 128) {
            unicode = unicode_latin1[Py_CHARMASK(*u)];
            if (!unicode) {
                unicode = _PyUnicode_New(1);
                if (!unicode)
                    return NULL;
                unicode->str[0] = Py_CHARMASK(*u);
                unicode_latin1[Py_CHARMASK(*u)] = unicode;
            }
            Py_INCREF(unicode);
            return (PyObject *)unicode;
        }

        return PyUnicode_DecodeUTF8(u, size, NULL);
    }

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;

    return (PyObject *)unicode;
}

PyObject *PyUnicode_FromString(const char *u)
{
    size_t size = strlen(u);
    if (size > PY_SSIZE_T_MAX) {
        PyErr_SetString(PyExc_OverflowError, "input too long");
        return NULL;
    }

    return PyUnicode_FromStringAndSize(u, size);
}

#ifdef HAVE_WCHAR_H

PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
                                 Py_ssize_t size)
{
    PyUnicodeObject *unicode;

    if (w == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;

    /* Copy the wchar_t data into the new object */
#ifdef HAVE_USABLE_WCHAR_T
    memcpy(unicode->str, w, size * sizeof(wchar_t));
#else
    {
        register Py_UNICODE *u;
        register Py_ssize_t i;
        u = PyUnicode_AS_UNICODE(unicode);
        for (i = size; i > 0; i--)
            *u++ = *w++;
    }
#endif

    return (PyObject *)unicode;
}

static void
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
{
    *fmt++ = '%';
    if (width) {
        if (zeropad)
            *fmt++ = '0';
        fmt += sprintf(fmt, "%d", width);
    }
    if (precision)
        fmt += sprintf(fmt, ".%d", precision);
    if (longflag)
        *fmt++ = 'l';
    else if (size_tflag) {
        char *f = PY_FORMAT_SIZE_T;
        while (*f)
            *fmt++ = *f++;
    }
    *fmt++ = c;
    *fmt = '\0';
}

#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}

PyObject *
PyUnicode_FromFormatV(const char *format, va_list vargs)
{
    va_list count;
    Py_ssize_t callcount = 0;
    PyObject **callresults = NULL;
    PyObject **callresult = NULL;
    Py_ssize_t n = 0;
    int width = 0;
    int precision = 0;
    int zeropad;
    const char* f;
    Py_UNICODE *s;
    PyObject *string;
    /* used by sprintf */
    char buffer[21];
    /* use abuffer instead of buffer, if we need more space
     * (which can happen if there's a format specifier with width). */
    char *abuffer = NULL;
    char *realbuffer;
    Py_ssize_t abuffersize = 0;
    char fmt[60]; /* should be enough for %0width.precisionld */
    const char *copy;

#ifdef VA_LIST_IS_ARRAY
    Py_MEMCPY(count, vargs, sizeof(va_list));
#else
#ifdef  __va_copy
    __va_copy(count, vargs);
#else
    count = vargs;
#endif
#endif
     /* step 1: count the number of %S/%R/%s format specifications
      * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
      * objects once during step 3 and put the result in an array) */
    for (f = format; *f; f++) {
         if (*f == '%') {
             if (*(f+1)=='%')
                 continue;
             if (*(f+1)=='S' || *(f+1)=='R')
                 ++callcount;
             while (isdigit((unsigned)*f))
                 width = (width*10) + *f++ - '0';
             while (*++f && *f != '%' && !isalpha((unsigned)*f))
                 ;
             if (*f == 's')
                 ++callcount;
         }
    }
    /* step 2: allocate memory for the results of
     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
    if (callcount) {
        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
        if (!callresults) {
            PyErr_NoMemory();
            return NULL;
        }
        callresult = callresults;
    }
    /* step 3: figure out how large a buffer we need */
    for (f = format; *f; f++) {
        if (*f == '%') {
            const char* p = f;
            width = 0;
            while (isdigit((unsigned)*f))
                width = (width*10) + *f++ - '0';
            while (*++f && *f != '%' && !isalpha((unsigned)*f))
                ;

            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
             * they don't affect the amount of space we reserve.
             */
            if ((*f == 'l' || *f == 'z') &&
                (f[1] == 'd' || f[1] == 'u'))
                ++f;

            switch (*f) {
            case 'c':
                (void)va_arg(count, int);
                /* fall through... */
            case '%':
                n++;
                break;
            case 'd': case 'u': case 'i': case 'x':
                (void) va_arg(count, int);
                /* 20 bytes is enough to hold a 64-bit
                   integer.  Decimal takes the most space.
                   This isn't enough for octal.
                   If a width is specified we need more
                   (which we allocate later). */
                if (width < 20)
                    width = 20;
                n += width;
                if (abuffersize < width)
                    abuffersize = width;
                break;
            case 's':
            {
                /* UTF-8 */
                unsigned char *s = va_arg(count, unsigned char*);
                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
                if (!str)
                    goto fail;
                n += PyUnicode_GET_SIZE(str);
                /* Remember the str and switch to the next slot */
                *callresult++ = str;
                break;
            }
            case 'U':
            {
                PyObject *obj = va_arg(count, PyObject *);
                assert(obj && PyUnicode_Check(obj));
                n += PyUnicode_GET_SIZE(obj);
                break;
            }
            case 'V':
            {
                PyObject *obj = va_arg(count, PyObject *);
                const char *str = va_arg(count, const char *);
                assert(obj || str);
                assert(!obj || PyUnicode_Check(obj));
                if (obj)
                    n += PyUnicode_GET_SIZE(obj);
                else
                    n += strlen(str);
                break;
            }
            case 'S':
            {
                PyObject *obj = va_arg(count, PyObject *);
                PyObject *str;
                assert(obj);
                str = PyObject_Str(obj);
                if (!str)
                    goto fail;
                n += PyUnicode_GET_SIZE(str);
                /* Remember the str and switch to the next slot */
                *callresult++ = str;
                break;
            }
            case 'R':
            {
                PyObject *obj = va_arg(count, PyObject *);
                PyObject *repr;
                assert(obj);
                repr = PyObject_Repr(obj);
                if (!repr)
                    goto fail;
                n += PyUnicode_GET_SIZE(repr);
                /* Remember the repr and switch to the next slot */
                *callresult++ = repr;
                break;
            }
            case 'p':
                (void) va_arg(count, int);
                /* maximum 64-bit pointer representation:
                 * 0xffffffffffffffff
                 * so 19 characters is enough.
                 * XXX I count 18 -- what's the extra for?
                 */
                n += 19;
                break;
            default:
                /* if we stumble upon an unknown
                   formatting code, copy the rest of
                   the format string to the output
                   string. (we cannot just skip the
                   code, since there's no way to know
                   what's in the argument list) */
                n += strlen(p);
                goto expand;
            }
        } else
            n++;
    }
  expand:
    if (abuffersize > 20) {
        abuffer = PyObject_Malloc(abuffersize);
        if (!abuffer) {
            PyErr_NoMemory();
            goto fail;
        }
        realbuffer = abuffer;
    }
    else
        realbuffer = buffer;
    /* step 4: fill the buffer */
    /* Since we've analyzed how much space we need for the worst case,
       we don't have to resize the string.
       There can be no errors beyond this point. */
    string = PyUnicode_FromUnicode(NULL, n);
    if (!string)
        goto fail;

    s = PyUnicode_AS_UNICODE(string);
    callresult = callresults;

    for (f = format; *f; f++) {
        if (*f == '%') {
            const char* p = f++;
            int longflag = 0;
            int size_tflag = 0;
            zeropad = (*f == '0');
            /* parse the width.precision part */
            width = 0;
            while (isdigit((unsigned)*f))
                width = (width*10) + *f++ - '0';
            precision = 0;
            if (*f == '.') {
                f++;
                while (isdigit((unsigned)*f))
                    precision = (precision*10) + *f++ - '0';
            }
            /* handle the long flag, but only for %ld and %lu.
               others can be added when necessary. */
            if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
                longflag = 1;
                ++f;
            }
            /* handle the size_t flag. */
            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
                size_tflag = 1;
                ++f;
            }

            switch (*f) {
            case 'c':
                *s++ = va_arg(vargs, int);
                break;
            case 'd':
                makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
                if (longflag)
                    sprintf(realbuffer, fmt, va_arg(vargs, long));
                else if (size_tflag)
                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
                else
                    sprintf(realbuffer, fmt, va_arg(vargs, int));
                appendstring(realbuffer);
                break;
            case 'u':
                makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
                if (longflag)
                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
                else if (size_tflag)
                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
                else
                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
                appendstring(realbuffer);
                break;
            case 'i':
                makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
                sprintf(realbuffer, fmt, va_arg(vargs, int));
                appendstring(realbuffer);
                break;
            case 'x':
                makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
                sprintf(realbuffer, fmt, va_arg(vargs, int));
                appendstring(realbuffer);
                break;
            case 's':
            {
                /* unused, since we already have the result */
                (void) va_arg(vargs, char *);
                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
                                PyUnicode_GET_SIZE(*callresult));
                s += PyUnicode_GET_SIZE(*callresult);
                /* We're done with the unicode()/repr() => forget it */
                Py_DECREF(*callresult);
                /* switch to next unicode()/repr() result */
                ++callresult;
                break;
            }
            case 'U':
            {
                PyObject *obj = va_arg(vargs, PyObject *);
                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
                s += size;
                break;
            }
            case 'V':
            {
                PyObject *obj = va_arg(vargs, PyObject *);
                const char *str = va_arg(vargs, const char *);
                if (obj) {
                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
                    s += size;
                } else {
                    appendstring(str);
                }
                break;
            }
            case 'S':
            case 'R':
            {
                Py_UNICODE *ucopy;
                Py_ssize_t usize;
                Py_ssize_t upos;
                /* unused, since we already have the result */
                (void) va_arg(vargs, PyObject *);
                ucopy = PyUnicode_AS_UNICODE(*callresult);
                usize = PyUnicode_GET_SIZE(*callresult);
                for (upos = 0; upos<usize;)
                    *s++ = ucopy[upos++];
                /* We're done with the unicode()/repr() => forget it */
                Py_DECREF(*callresult);
                /* switch to next unicode()/repr() result */
                ++callresult;
                break;
            }
            case 'p':
                sprintf(buffer, "%p", va_arg(vargs, void*));
                /* %p is ill-defined:  ensure leading 0x. */
                if (buffer[1] == 'X')
                    buffer[1] = 'x';
                else if (buffer[1] != 'x') {
                    memmove(buffer+2, buffer, strlen(buffer)+1);
                    buffer[0] = '0';
                    buffer[1] = 'x';
                }
                appendstring(buffer);
                break;
            case '%':
                *s++ = '%';
                break;
            default:
                appendstring(p);
                goto end;
            }
        } else
            *s++ = *f;
    }

  end:
    if (callresults)
        PyObject_Free(callresults);
    if (abuffer)
        PyObject_Free(abuffer);
    PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
    return string;
  fail:
    if (callresults) {
        PyObject **callresult2 = callresults;
        while (callresult2 < callresult) {
            Py_DECREF(*callresult2);
            ++callresult2;
        }
        PyObject_Free(callresults);
    }
    if (abuffer)
        PyObject_Free(abuffer);
    return NULL;
}

#undef appendstring

PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    PyObject* ret;
    va_list vargs;

#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    ret = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    return ret;
}

Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
                                wchar_t *w,
                                Py_ssize_t size)
{
    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }

    /* If possible, try to copy the 0-termination as well */
    if (size > PyUnicode_GET_SIZE(unicode))
        size = PyUnicode_GET_SIZE(unicode) + 1;

#ifdef HAVE_USABLE_WCHAR_T
    memcpy(w, unicode->str, size * sizeof(wchar_t));
#else
    {
        register Py_UNICODE *u;
        register Py_ssize_t i;
        u = PyUnicode_AS_UNICODE(unicode);
        for (i = size; i > 0; i--)
            *w++ = *u++;
    }
#endif

    if (size > PyUnicode_GET_SIZE(unicode))
        return PyUnicode_GET_SIZE(unicode);
    else
        return size;
}

#endif

PyObject *PyUnicode_FromOrdinal(int ordinal)
{
    Py_UNICODE s[1];

#ifdef Py_UNICODE_WIDE
    if (ordinal < 0 || ordinal > 0x10ffff) {
        PyErr_SetString(PyExc_ValueError,
                        "unichr() arg not in range(0x110000) "
                        "(wide Python build)");
        return NULL;
    }
#else
    if (ordinal < 0 || ordinal > 0xffff) {
        PyErr_SetString(PyExc_ValueError,
                        "unichr() arg not in range(0x10000) "
                        "(narrow Python build)");
        return NULL;
    }
#endif

    s[0] = (Py_UNICODE)ordinal;
    return PyUnicode_FromUnicode(s, 1);
}

PyObject *PyUnicode_FromObject(register PyObject *obj)
{
    /* XXX Perhaps we should make this API an alias of
       PyObject_Unicode() instead ?! */
    if (PyUnicode_CheckExact(obj)) {
        Py_INCREF(obj);
        return obj;
    }
    if (PyUnicode_Check(obj)) {
        /* For a Unicode subtype that's not a Unicode object,
           return a true Unicode object with the same data. */
        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
                                     PyUnicode_GET_SIZE(obj));
    }
    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
}

PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
                                      const char *encoding,
                                      const char *errors)
{
    const char *s = NULL;
    Py_ssize_t len;
    PyObject *v;

    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

#if 0
    /* For b/w compatibility we also accept Unicode objects provided
       that no encodings is given and then redirect to
       PyObject_Unicode() which then applies the additional logic for
       Unicode subclasses.

       NOTE: This API should really only be used for object which
       represent *encoded* Unicode !

    */
    if (PyUnicode_Check(obj)) {
        if (encoding) {
            PyErr_SetString(PyExc_TypeError,
                            "decoding Unicode is not supported");
            return NULL;
        }
        return PyObject_Unicode(obj);
    }
#else
    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding Unicode is not supported");
        return NULL;
    }
#endif

    /* Coerce object */
    if (PyString_Check(obj)) {
        s = PyString_AS_STRING(obj);
        len = PyString_GET_SIZE(obj);
    }
    else if (PyByteArray_Check(obj)) {
        /* Python 2.x specific */
        PyErr_Format(PyExc_TypeError,
                     "decoding bytearray is not supported");
        return NULL;
    }
    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
        /* Overwrite the error message with something more useful in
           case of a TypeError. */
        if (PyErr_ExceptionMatches(PyExc_TypeError))
            PyErr_Format(PyExc_TypeError,
                         "coercing to Unicode: need string or buffer, "
                         "%.80s found",
                         Py_TYPE(obj)->tp_name);
        goto onError;
    }

    /* Convert to Unicode */
    if (len == 0) {
        Py_INCREF(unicode_empty);
        v = (PyObject *)unicode_empty;
    }
    else
        v = PyUnicode_Decode(s, len, encoding, errors);

    return v;

  onError:
    return NULL;
}

PyObject *PyUnicode_Decode(const char *s,
                           Py_ssize_t size,
                           const char *encoding,
                           const char *errors)
{
    PyObject *buffer = NULL, *unicode;

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Shortcuts for common default encodings */
    if (strcmp(encoding, "utf-8") == 0)
        return PyUnicode_DecodeUTF8(s, size, errors);
    else if (strcmp(encoding, "latin-1") == 0)
        return PyUnicode_DecodeLatin1(s, size, errors);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
    else if (strcmp(encoding, "mbcs") == 0)
        return PyUnicode_DecodeMBCS(s, size, errors);
#endif
    else if (strcmp(encoding, "ascii") == 0)
        return PyUnicode_DecodeASCII(s, size, errors);

    /* Decode via the codec registry */
    buffer = PyBuffer_FromMemory((void *)s, size);
    if (buffer == NULL)
        goto onError;
    unicode = PyCodec_Decode(buffer, encoding, errors);
    if (unicode == NULL)
        goto onError;
    if (!PyUnicode_Check(unicode)) {
        PyErr_Format(PyExc_TypeError,
                     "decoder did not return an unicode object (type=%.400s)",
                     Py_TYPE(unicode)->tp_name);
        Py_DECREF(unicode);
        goto onError;
    }
    Py_DECREF(buffer);
    return unicode;

  onError:
    Py_XDECREF(buffer);
    return NULL;
}

PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        goto onError;
    return v;

  onError:
    return NULL;
}

PyObject *PyUnicode_Encode(const Py_UNICODE *s,
                           Py_ssize_t size,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v, *unicode;

    unicode = PyUnicode_FromUnicode(s, size);
    if (unicode == NULL)
        return NULL;
    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
    Py_DECREF(unicode);
    return v;
}

PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry */
    v = PyCodec_Encode(unicode, encoding, errors);
    if (v == NULL)
        goto onError;
    return v;

  onError:
    return NULL;
}

PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Shortcuts for common default encodings */
    if (errors == NULL) {
        if (strcmp(encoding, "utf-8") == 0)
            return PyUnicode_AsUTF8String(unicode);
        else if (strcmp(encoding, "latin-1") == 0)
            return PyUnicode_AsLatin1String(unicode);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
        else if (strcmp(encoding, "mbcs") == 0)
            return PyUnicode_AsMBCSString(unicode);
#endif
        else if (strcmp(encoding, "ascii") == 0)
            return PyUnicode_AsASCIIString(unicode);
    }

    /* Encode via the codec registry */
    v = PyCodec_Encode(unicode, encoding, errors);
    if (v == NULL)
        goto onError;
    if (!PyString_Check(v)) {
        PyErr_Format(PyExc_TypeError,
                     "encoder did not return a string object (type=%.400s)",
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        goto onError;
    }
    return v;

  onError:
    return NULL;
}

PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
                                            const char *errors)
{
    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;

    if (v)
        return v;
    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
    if (v && errors == NULL)
        ((PyUnicodeObject *)unicode)->defenc = v;
    return v;
}

Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }
    return PyUnicode_AS_UNICODE(unicode);

  onError:
    return NULL;
}

Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }
    return PyUnicode_GET_SIZE(unicode);

  onError:
    return -1;
}

const char *PyUnicode_GetDefaultEncoding(void)
{
    return unicode_default_encoding;
}

int PyUnicode_SetDefaultEncoding(const char *encoding)
{
    PyObject *v;

    /* Make sure the encoding is valid. As side effect, this also
       loads the encoding into the codec registry cache. */
    v = _PyCodec_Lookup(encoding);
    if (v == NULL)
        goto onError;
    Py_DECREF(v);
    strncpy(unicode_default_encoding,
            encoding,
            sizeof(unicode_default_encoding));
    return 0;

  onError:
    return -1;
}

/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error
*/

static
int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
                                     const char *encoding, const char *reason,
                                     const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
                                     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
                                     PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
{
    static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    Py_UNICODE *repptr;
    Py_ssize_t repsize;
    int res = -1;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    if (*exceptionObject == NULL) {
        *exceptionObject = PyUnicodeDecodeError_Create(
            encoding, input, insize, *startinpos, *endinpos, reason);
        if (*exceptionObject == NULL)
            goto onError;
    }
    else {
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
            goto onError;
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
            goto onError;
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
            goto onError;
    }

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    repptr = PyUnicode_AS_UNICODE(repunicode);
    repsize = PyUnicode_GET_SIZE(repunicode);
    requiredsize = *outpos + repsize + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize<2*outsize)
            requiredsize = 2*outsize;
        if (_PyUnicode_Resize(output, requiredsize) < 0)
            goto onError;
        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
    }
    *endinpos = newpos;
    *inptr = input + newpos;
    Py_UNICODE_COPY(*outptr, repptr, repsize);
    *outptr += repsize;
    *outpos += repsize;
    /* we made it! */
    res = 0;

  onError:
    Py_XDECREF(restuple);
    return res;
}

/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */

static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) ||          \
     (encodeO && (utf7_special[(c)] == 3)))

#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c)                              \
    (isalnum(c) || (c) == '+' || (c) == '/')
#define UB64(c)                                         \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }

PyObject *PyUnicode_DecodeUTF7(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}

PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;
    unsigned int bitsleft = 0;
    unsigned long charsleft = 0;
    int surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
      restart:
        ch = (unsigned char) *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here so indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else  {
                    *p++ = ch;
                }
            } else {
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            *p++ = ch;
            s++;
        }
        continue;
      utf7Error:
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }

    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", "unterminated shift sequence",
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
        if (s < e)
            goto restart;
    }
    if (consumed) {
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    if (_PyUnicode_Resiz…
Summary ✨

This C code implements a Unicode character set for Python. It defines a PyUnicodeObject struct to represent individual characters, and initializes various data structures such as an array of line breaks, a bloom filter for efficient lookup, and a free list for memory management. The code also sets up the Python type system with a custom unicode type and its associated methods and functions.
Alerts (3)

Complexity hotspot; lines 1440 to 1442 (total complexity: 9)
1440 1441 1442