/python/src/Objects/unicodeobject.c
C | 9087 lines | 7330 code | 956 blank | 801 comment | 1899 complexity | 6404833e5dd99f3adf7d746448f7db96 MD5 | raw file
Possible License(s): GPL-3.0, 0BSD, GPL-2.0, Apache-2.0, LGPL-3.0, AGPL-1.0, BSD-3-Clause
Large files are truncated, but you can click here to view the full file
- /*
- Unicode implementation based on original code by Fredrik Lundh,
- modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
- Unicode Integration Proposal (see file Misc/unicode.txt).
- Major speed upgrades to the method implementations at the Reykjavik
- NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
- Copyright (c) Corporation for National Research Initiatives.
- --------------------------------------------------------------------
- The original string type implementation is:
- Copyright (c) 1999 by Secret Labs AB
- Copyright (c) 1999 by Fredrik Lundh
- By obtaining, using, and/or copying this software and/or its
- associated documentation, you agree that you have read, understood,
- and will comply with the following terms and conditions:
- Permission to use, copy, modify, and distribute this software and its
- associated documentation for any purpose and without fee is hereby
- granted, provided that the above copyright notice appears in all
- copies, and that both that copyright notice and this permission notice
- appear in supporting documentation, and that the name of Secret Labs
- AB or the author not be used in advertising or publicity pertaining to
- distribution of the software without specific, written prior
- permission.
- SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
- THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
- ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
- OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- --------------------------------------------------------------------
- */
- #define PY_SSIZE_T_CLEAN
- #include "Python.h"
- #include "unicodeobject.h"
- #include "ucnhash.h"
- #ifdef MS_WINDOWS
- #include <windows.h>
- #endif
- /* Limit for the Unicode object free list */
- #define PyUnicode_MAXFREELIST 1024
- /* Limit for the Unicode object free list stay alive optimization.
- The implementation will keep allocated Unicode memory intact for
- all objects on the free list having a size less than this
- limit. This reduces malloc() overhead for small Unicode objects.
- At worst this will result in PyUnicode_MAXFREELIST *
- (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
- malloc()-overhead) bytes of unused garbage.
- Setting the limit to 0 effectively turns the feature off.
- Note: This is an experimental feature ! If you get core dumps when
- using Unicode objects, turn this feature off.
- */
- #define KEEPALIVE_SIZE_LIMIT 9
- /* Endianness switches; defaults to little endian */
- #ifdef WORDS_BIGENDIAN
- # define BYTEORDER_IS_BIG_ENDIAN
- #else
- # define BYTEORDER_IS_LITTLE_ENDIAN
- #endif
- /* --- Globals ------------------------------------------------------------
- The globals are initialized by the _PyUnicode_Init() API and should
- not be used before calling that API.
- */
- #ifdef __cplusplus
- extern "C" {
- #endif
- /* Free list for Unicode objects */
- static PyUnicodeObject *free_list;
- static int numfree;
- /* The empty Unicode object is shared to improve performance. */
- static PyUnicodeObject *unicode_empty;
- /* Single character Unicode strings in the Latin-1 range are being
- shared as well. */
- static PyUnicodeObject *unicode_latin1[256];
- /* Default encoding to use and assume when NULL is passed as encoding
- parameter; it is initialized by _PyUnicode_Init().
- Always use the PyUnicode_SetDefaultEncoding() and
- PyUnicode_GetDefaultEncoding() APIs to access this global.
- */
- static char unicode_default_encoding[100];
- /* Fast detection of the most frequent whitespace characters */
/* Lookup table for the most frequent whitespace characters: entry i is 1
   when ASCII code point i is treated as whitespace, 0 otherwise.
   The table has 128 entries, laid out 16 per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: HORIZONTAL TABULATION (0x09), LINE FEED (0x0A),
       VERTICAL TABULATION (0x0B), FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE (0x1C), GROUP (0x1D), RECORD (0x1E) and
       UNIT (0x1F) SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: nothing here is whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
- /* Same for linebreaks */
/* Lookup table for line break characters in the ASCII range: entry i is 1
   when code point i ends a line, 0 otherwise.  128 entries, 16 per row. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: LINE FEED (0x0A), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10-0x1F: FILE (0x1C), GROUP (0x1D) and RECORD (0x1E) SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
- Py_UNICODE
- PyUnicode_GetMax(void)
- {
- #ifdef Py_UNICODE_WIDE
- return 0x10FFFF;
- #else
- /* This is actually an illegal character, so it should
- not be passed to unichr. */
- return 0xFFFF;
- #endif
- }
- /* --- Bloom Filters ----------------------------------------------------- */
- /* stuff to implement simple "bloom filters" for Unicode characters.
- to keep things simple, we use a single bitmask, using the least 5
- bits from each unicode characters as the bit index. */
- /* the linebreak mask is set up by Unicode_Init below */
/* Type of a bloom bitmask: one bit per possible value of the low five
   bits of a character. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low five bits of `ch` is set in
   `mask`.  The shifted constant is unsigned long so that bit 31 can be
   selected without shifting into the sign bit of an int, and `mask` is
   parenthesized so that compound expressions may be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: table lookup for code points below 128, otherwise the
   bloom mask is consulted first and Py_UNICODE_ISLINEBREAK() confirms. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
- Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
- {
- /* calculate simple bloom-style bitmask for a given unicode string */
- long mask;
- Py_ssize_t i;
- mask = 0;
- for (i = 0; i < len; i++)
- mask |= (1 << (ptr[i] & 0x1F));
- return mask;
- }
- Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
- {
- Py_ssize_t i;
- for (i = 0; i < setlen; i++)
- if (set[i] == chr)
- return 1;
- return 0;
- }
/* Membership test: the cheap bloom check runs first, and only on a hit is
   the set scanned.  The whole expansion is parenthesized so the macro can
   be negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
- /* --- Unicode Object ----------------------------------------------------- */
- static
- int unicode_resize(register PyUnicodeObject *unicode,
- Py_ssize_t length)
- {
- void *oldstr;
- /* Shortcut if there's nothing much to do. */
- if (unicode->length == length)
- goto reset;
- /* Resizing shared object (unicode_empty or single character
- objects) in-place is not allowed. Use PyUnicode_Resize()
- instead ! */
- if (unicode == unicode_empty ||
- (unicode->length == 1 &&
- unicode->str[0] < 256U &&
- unicode_latin1[unicode->str[0]] == unicode)) {
- PyErr_SetString(PyExc_SystemError,
- "can't resize shared unicode objects");
- return -1;
- }
- /* We allocate one more byte to make sure the string is Ux0000 terminated.
- The overallocation is also used by fastsearch, which assumes that it's
- safe to look at str[length] (without making any assumptions about what
- it contains). */
- oldstr = unicode->str;
- unicode->str = PyObject_REALLOC(unicode->str,
- sizeof(Py_UNICODE) * (length + 1));
- if (!unicode->str) {
- unicode->str = (Py_UNICODE *)oldstr;
- PyErr_NoMemory();
- return -1;
- }
- unicode->str[length] = 0;
- unicode->length = length;
- reset:
- /* Reset the object caches */
- if (unicode->defenc) {
- Py_DECREF(unicode->defenc);
- unicode->defenc = NULL;
- }
- unicode->hash = -1;
- return 0;
- }
- /* We allocate one more Py_UNICODE element to make sure the string is
- U+0000 terminated -- XXX is this needed ?
- XXX This allocator could further be enhanced by ensuring that the
- free list never reduces its size below 1.
- */
- static
- PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
- {
- register PyUnicodeObject *unicode;
- /* Optimization for empty strings */
- if (length == 0 && unicode_empty != NULL) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
- /* Ensure we won't overflow the size. */
- if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
- return (PyUnicodeObject *)PyErr_NoMemory();
- }
- /* Unicode freelist & memory allocation */
- if (free_list) {
- unicode = free_list;
- free_list = *(PyUnicodeObject **)unicode;
- numfree--;
- if (unicode->str) {
- /* Keep-Alive optimization: we only upsize the buffer,
- never downsize it. */
- if ((unicode->length < length) &&
- unicode_resize(unicode, length) < 0) {
- PyObject_DEL(unicode->str);
- unicode->str = NULL;
- }
- }
- else {
- size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
- unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
- }
- PyObject_INIT(unicode, &PyUnicode_Type);
- }
- else {
- size_t new_size;
- unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
- if (unicode == NULL)
- return NULL;
- new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
- unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
- }
- if (!unicode->str) {
- PyErr_NoMemory();
- goto onError;
- }
- /* Initialize the first element to guard against cases where
- * the caller fails before initializing str -- unicode_resize()
- * reads str[0], and the Keep-Alive optimization can keep memory
- * allocated for str alive across a call to unicode_dealloc(unicode).
- * We don't want unicode_resize to read uninitialized memory in
- * that case.
- */
- unicode->str[0] = 0;
- unicode->str[length] = 0;
- unicode->length = length;
- unicode->hash = -1;
- unicode->defenc = NULL;
- return unicode;
- onError:
- /* XXX UNREF/NEWREF interface should be more symmetrical */
- _Py_DEC_REFTOTAL;
- _Py_ForgetReference((PyObject *)unicode);
- PyObject_Del(unicode);
- return NULL;
- }
- static
- void unicode_dealloc(register PyUnicodeObject *unicode)
- {
- if (PyUnicode_CheckExact(unicode) &&
- numfree < PyUnicode_MAXFREELIST) {
- /* Keep-Alive optimization */
- if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
- PyObject_DEL(unicode->str);
- unicode->str = NULL;
- unicode->length = 0;
- }
- if (unicode->defenc) {
- Py_DECREF(unicode->defenc);
- unicode->defenc = NULL;
- }
- /* Add to free list */
- *(PyUnicodeObject **)unicode = free_list;
- free_list = unicode;
- numfree++;
- }
- else {
- PyObject_DEL(unicode->str);
- Py_XDECREF(unicode->defenc);
- Py_TYPE(unicode)->tp_free((PyObject *)unicode);
- }
- }
- static
- int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
- {
- register PyUnicodeObject *v;
- /* Argument checks */
- if (unicode == NULL) {
- PyErr_BadInternalCall();
- return -1;
- }
- v = *unicode;
- if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
- PyErr_BadInternalCall();
- return -1;
- }
- /* Resizing unicode_empty and single character objects is not
- possible since these are being shared. We simply return a fresh
- copy with the same Unicode content. */
- if (v->length != length &&
- (v == unicode_empty || v->length == 1)) {
- PyUnicodeObject *w = _PyUnicode_New(length);
- if (w == NULL)
- return -1;
- Py_UNICODE_COPY(w->str, v->str,
- length < v->length ? length : v->length);
- Py_DECREF(*unicode);
- *unicode = w;
- return 0;
- }
- /* Note that we don't have to modify *unicode for unshared Unicode
- objects, since we can modify them in-place. */
- return unicode_resize(v, length);
- }
- int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
- {
- return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
- }
- PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
- Py_ssize_t size)
- {
- PyUnicodeObject *unicode;
- /* If the Unicode data is known at construction time, we can apply
- some optimizations which share commonly used objects. */
- if (u != NULL) {
- /* Optimization for empty strings */
- if (size == 0 && unicode_empty != NULL) {
- Py_INCREF(unicode_empty);
- return (PyObject *)unicode_empty;
- }
- /* Single character Unicode objects in the Latin-1 range are
- shared when using this constructor */
- if (size == 1 && *u < 256) {
- unicode = unicode_latin1[*u];
- if (!unicode) {
- unicode = _PyUnicode_New(1);
- if (!unicode)
- return NULL;
- unicode->str[0] = *u;
- unicode_latin1[*u] = unicode;
- }
- Py_INCREF(unicode);
- return (PyObject *)unicode;
- }
- }
- unicode = _PyUnicode_New(size);
- if (!unicode)
- return NULL;
- /* Copy the Unicode data into the new object */
- if (u != NULL)
- Py_UNICODE_COPY(unicode->str, u, size);
- return (PyObject *)unicode;
- }
- PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
- {
- PyUnicodeObject *unicode;
- if (size < 0) {
- PyErr_SetString(PyExc_SystemError,
- "Negative size passed to PyUnicode_FromStringAndSize");
- return NULL;
- }
- /* If the Unicode data is known at construction time, we can apply
- some optimizations which share commonly used objects.
- Also, this means the input must be UTF-8, so fall back to the
- UTF-8 decoder at the end. */
- if (u != NULL) {
- /* Optimization for empty strings */
- if (size == 0 && unicode_empty != NULL) {
- Py_INCREF(unicode_empty);
- return (PyObject *)unicode_empty;
- }
- /* Single characters are shared when using this constructor.
- Restrict to ASCII, since the input must be UTF-8. */
- if (size == 1 && Py_CHARMASK(*u) < 128) {
- unicode = unicode_latin1[Py_CHARMASK(*u)];
- if (!unicode) {
- unicode = _PyUnicode_New(1);
- if (!unicode)
- return NULL;
- unicode->str[0] = Py_CHARMASK(*u);
- unicode_latin1[Py_CHARMASK(*u)] = unicode;
- }
- Py_INCREF(unicode);
- return (PyObject *)unicode;
- }
- return PyUnicode_DecodeUTF8(u, size, NULL);
- }
- unicode = _PyUnicode_New(size);
- if (!unicode)
- return NULL;
- return (PyObject *)unicode;
- }
- PyObject *PyUnicode_FromString(const char *u)
- {
- size_t size = strlen(u);
- if (size > PY_SSIZE_T_MAX) {
- PyErr_SetString(PyExc_OverflowError, "input too long");
- return NULL;
- }
- return PyUnicode_FromStringAndSize(u, size);
- }
- #ifdef HAVE_WCHAR_H
- PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
- Py_ssize_t size)
- {
- PyUnicodeObject *unicode;
- if (w == NULL) {
- PyErr_BadInternalCall();
- return NULL;
- }
- unicode = _PyUnicode_New(size);
- if (!unicode)
- return NULL;
- /* Copy the wchar_t data into the new object */
- #ifdef HAVE_USABLE_WCHAR_T
- memcpy(unicode->str, w, size * sizeof(wchar_t));
- #else
- {
- register Py_UNICODE *u;
- register Py_ssize_t i;
- u = PyUnicode_AS_UNICODE(unicode);
- for (i = size; i > 0; i--)
- *u++ = *w++;
- }
- #endif
- return (PyObject *)unicode;
- }
- static void
- makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
- {
- *fmt++ = '%';
- if (width) {
- if (zeropad)
- *fmt++ = '0';
- fmt += sprintf(fmt, "%d", width);
- }
- if (precision)
- fmt += sprintf(fmt, ".%d", precision);
- if (longflag)
- *fmt++ = 'l';
- else if (size_tflag) {
- char *f = PY_FORMAT_SIZE_T;
- while (*f)
- *fmt++ = *f++;
- }
- *fmt++ = c;
- *fmt = '\0';
- }
- #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
- PyObject *
- PyUnicode_FromFormatV(const char *format, va_list vargs)
- {
- va_list count;
- Py_ssize_t callcount = 0;
- PyObject **callresults = NULL;
- PyObject **callresult = NULL;
- Py_ssize_t n = 0;
- int width = 0;
- int precision = 0;
- int zeropad;
- const char* f;
- Py_UNICODE *s;
- PyObject *string;
- /* used by sprintf */
- char buffer[21];
- /* use abuffer instead of buffer, if we need more space
- * (which can happen if there's a format specifier with width). */
- char *abuffer = NULL;
- char *realbuffer;
- Py_ssize_t abuffersize = 0;
- char fmt[60]; /* should be enough for %0width.precisionld */
- const char *copy;
- #ifdef VA_LIST_IS_ARRAY
- Py_MEMCPY(count, vargs, sizeof(va_list));
- #else
- #ifdef __va_copy
- __va_copy(count, vargs);
- #else
- count = vargs;
- #endif
- #endif
- /* step 1: count the number of %S/%R format specifications
- * (we call PyObject_Str()/PyObject_Repr() for these objects
- * once during step 3 and put the result in an array) */
- for (f = format; *f; f++) {
- if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
- ++callcount;
- }
- /* step 2: allocate memory for the results of
- * PyObject_Str()/PyObject_Repr() calls */
- if (callcount) {
- callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
- if (!callresults) {
- PyErr_NoMemory();
- return NULL;
- }
- callresult = callresults;
- }
- /* step 3: figure out how large a buffer we need */
- for (f = format; *f; f++) {
- if (*f == '%') {
- const char* p = f;
- width = 0;
- while (isdigit((unsigned)*f))
- width = (width*10) + *f++ - '0';
- while (*++f && *f != '%' && !isalpha((unsigned)*f))
- ;
- /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
- * they don't affect the amount of space we reserve.
- */
- if ((*f == 'l' || *f == 'z') &&
- (f[1] == 'd' || f[1] == 'u'))
- ++f;
- switch (*f) {
- case 'c':
- (void)va_arg(count, int);
- /* fall through... */
- case '%':
- n++;
- break;
- case 'd': case 'u': case 'i': case 'x':
- (void) va_arg(count, int);
- /* 20 bytes is enough to hold a 64-bit
- integer. Decimal takes the most space.
- This isn't enough for octal.
- If a width is specified we need more
- (which we allocate later). */
- if (width < 20)
- width = 20;
- n += width;
- if (abuffersize < width)
- abuffersize = width;
- break;
- case 's':
- {
- /* UTF-8 */
- unsigned char*s;
- s = va_arg(count, unsigned char*);
- while (*s) {
- if (*s < 128) {
- n++; s++;
- } else if (*s < 0xc0) {
- /* invalid UTF-8 */
- n++; s++;
- } else if (*s < 0xc0) {
- n++;
- s++; if(!*s)break;
- s++;
- } else if (*s < 0xe0) {
- n++;
- s++; if(!*s)break;
- s++; if(!*s)break;
- s++;
- } else {
- #ifdef Py_UNICODE_WIDE
- n++;
- #else
- n+=2;
- #endif
- s++; if(!*s)break;
- s++; if(!*s)break;
- s++; if(!*s)break;
- s++;
- }
- }
- break;
- }
- case 'U':
- {
- PyObject *obj = va_arg(count, PyObject *);
- assert(obj && PyUnicode_Check(obj));
- n += PyUnicode_GET_SIZE(obj);
- break;
- }
- case 'V':
- {
- PyObject *obj = va_arg(count, PyObject *);
- const char *str = va_arg(count, const char *);
- assert(obj || str);
- assert(!obj || PyUnicode_Check(obj));
- if (obj)
- n += PyUnicode_GET_SIZE(obj);
- else
- n += strlen(str);
- break;
- }
- case 'S':
- {
- PyObject *obj = va_arg(count, PyObject *);
- PyObject *str;
- assert(obj);
- str = PyObject_Str(obj);
- if (!str)
- goto fail;
- n += PyUnicode_GET_SIZE(str);
- /* Remember the str and switch to the next slot */
- *callresult++ = str;
- break;
- }
- case 'R':
- {
- PyObject *obj = va_arg(count, PyObject *);
- PyObject *repr;
- assert(obj);
- repr = PyObject_Repr(obj);
- if (!repr)
- goto fail;
- n += PyUnicode_GET_SIZE(repr);
- /* Remember the repr and switch to the next slot */
- *callresult++ = repr;
- break;
- }
- case 'p':
- (void) va_arg(count, int);
- /* maximum 64-bit pointer representation:
- * 0xffffffffffffffff
- * so 19 characters is enough.
- * XXX I count 18 -- what's the extra for?
- */
- n += 19;
- break;
- default:
- /* if we stumble upon an unknown
- formatting code, copy the rest of
- the format string to the output
- string. (we cannot just skip the
- code, since there's no way to know
- what's in the argument list) */
- n += strlen(p);
- goto expand;
- }
- } else
- n++;
- }
- expand:
- if (abuffersize > 20) {
- abuffer = PyObject_Malloc(abuffersize);
- if (!abuffer) {
- PyErr_NoMemory();
- goto fail;
- }
- realbuffer = abuffer;
- }
- else
- realbuffer = buffer;
- /* step 4: fill the buffer */
- /* Since we've analyzed how much space we need for the worst case,
- we don't have to resize the string.
- There can be no errors beyond this point. */
- string = PyUnicode_FromUnicode(NULL, n);
- if (!string)
- goto fail;
- s = PyUnicode_AS_UNICODE(string);
- callresult = callresults;
- for (f = format; *f; f++) {
- if (*f == '%') {
- const char* p = f++;
- int longflag = 0;
- int size_tflag = 0;
- zeropad = (*f == '0');
- /* parse the width.precision part */
- width = 0;
- while (isdigit((unsigned)*f))
- width = (width*10) + *f++ - '0';
- precision = 0;
- if (*f == '.') {
- f++;
- while (isdigit((unsigned)*f))
- precision = (precision*10) + *f++ - '0';
- }
- /* handle the long flag, but only for %ld and %lu.
- others can be added when necessary. */
- if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
- longflag = 1;
- ++f;
- }
- /* handle the size_t flag. */
- if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
- size_tflag = 1;
- ++f;
- }
- switch (*f) {
- case 'c':
- *s++ = va_arg(vargs, int);
- break;
- case 'd':
- makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
- if (longflag)
- sprintf(realbuffer, fmt, va_arg(vargs, long));
- else if (size_tflag)
- sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
- else
- sprintf(realbuffer, fmt, va_arg(vargs, int));
- appendstring(realbuffer);
- break;
- case 'u':
- makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
- if (longflag)
- sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
- else if (size_tflag)
- sprintf(realbuffer, fmt, va_arg(vargs, size_t));
- else
- sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
- appendstring(realbuffer);
- break;
- case 'i':
- makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
- sprintf(realbuffer, fmt, va_arg(vargs, int));
- appendstring(realbuffer);
- break;
- case 'x':
- makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
- sprintf(realbuffer, fmt, va_arg(vargs, int));
- appendstring(realbuffer);
- break;
- case 's':
- {
- /* Parameter must be UTF-8 encoded.
- In case of encoding errors, use
- the replacement character. */
- PyObject *u;
- p = va_arg(vargs, char*);
- u = PyUnicode_DecodeUTF8(p, strlen(p),
- "replace");
- if (!u)
- goto fail;
- Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
- PyUnicode_GET_SIZE(u));
- s += PyUnicode_GET_SIZE(u);
- Py_DECREF(u);
- break;
- }
- case 'U':
- {
- PyObject *obj = va_arg(vargs, PyObject *);
- Py_ssize_t size = PyUnicode_GET_SIZE(obj);
- Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
- s += size;
- break;
- }
- case 'V':
- {
- PyObject *obj = va_arg(vargs, PyObject *);
- const char *str = va_arg(vargs, const char *);
- if (obj) {
- Py_ssize_t size = PyUnicode_GET_SIZE(obj);
- Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
- s += size;
- } else {
- appendstring(str);
- }
- break;
- }
- case 'S':
- case 'R':
- {
- Py_UNICODE *ucopy;
- Py_ssize_t usize;
- Py_ssize_t upos;
- /* unused, since we already have the result */
- (void) va_arg(vargs, PyObject *);
- ucopy = PyUnicode_AS_UNICODE(*callresult);
- usize = PyUnicode_GET_SIZE(*callresult);
- for (upos = 0; upos<usize;)
- *s++ = ucopy[upos++];
- /* We're done with the unicode()/repr() => forget it */
- Py_DECREF(*callresult);
- /* switch to next unicode()/repr() result */
- ++callresult;
- break;
- }
- case 'p':
- sprintf(buffer, "%p", va_arg(vargs, void*));
- /* %p is ill-defined: ensure leading 0x. */
- if (buffer[1] == 'X')
- buffer[1] = 'x';
- else if (buffer[1] != 'x') {
- memmove(buffer+2, buffer, strlen(buffer)+1);
- buffer[0] = '0';
- buffer[1] = 'x';
- }
- appendstring(buffer);
- break;
- case '%':
- *s++ = '%';
- break;
- default:
- appendstring(p);
- goto end;
- }
- } else
- *s++ = *f;
- }
- end:
- if (callresults)
- PyObject_Free(callresults);
- if (abuffer)
- PyObject_Free(abuffer);
- PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
- return string;
- fail:
- if (callresults) {
- PyObject **callresult2 = callresults;
- while (callresult2 < callresult) {
- Py_DECREF(*callresult2);
- ++callresult2;
- }
- PyObject_Free(callresults);
- }
- if (abuffer)
- PyObject_Free(abuffer);
- return NULL;
- }
- #undef appendstring
- PyObject *
- PyUnicode_FromFormat(const char *format, ...)
- {
- PyObject* ret;
- va_list vargs;
- #ifdef HAVE_STDARG_PROTOTYPES
- va_start(vargs, format);
- #else
- va_start(vargs);
- #endif
- ret = PyUnicode_FromFormatV(format, vargs);
- va_end(vargs);
- return ret;
- }
- Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
- wchar_t *w,
- Py_ssize_t size)
- {
- if (unicode == NULL) {
- PyErr_BadInternalCall();
- return -1;
- }
- /* If possible, try to copy the 0-termination as well */
- if (size > PyUnicode_GET_SIZE(unicode))
- size = PyUnicode_GET_SIZE(unicode) + 1;
- #ifdef HAVE_USABLE_WCHAR_T
- memcpy(w, unicode->str, size * sizeof(wchar_t));
- #else
- {
- register Py_UNICODE *u;
- register Py_ssize_t i;
- u = PyUnicode_AS_UNICODE(unicode);
- for (i = size; i > 0; i--)
- *w++ = *u++;
- }
- #endif
- if (size > PyUnicode_GET_SIZE(unicode))
- return PyUnicode_GET_SIZE(unicode);
- else
- return size;
- }
- #endif
- PyObject *PyUnicode_FromOrdinal(int ordinal)
- {
- Py_UNICODE s[1];
- #ifdef Py_UNICODE_WIDE
- if (ordinal < 0 || ordinal > 0x10ffff) {
- PyErr_SetString(PyExc_ValueError,
- "unichr() arg not in range(0x110000) "
- "(wide Python build)");
- return NULL;
- }
- #else
- if (ordinal < 0 || ordinal > 0xffff) {
- PyErr_SetString(PyExc_ValueError,
- "unichr() arg not in range(0x10000) "
- "(narrow Python build)");
- return NULL;
- }
- #endif
- s[0] = (Py_UNICODE)ordinal;
- return PyUnicode_FromUnicode(s, 1);
- }
- PyObject *PyUnicode_FromObject(register PyObject *obj)
- {
- /* XXX Perhaps we should make this API an alias of
- PyObject_Unicode() instead ?! */
- if (PyUnicode_CheckExact(obj)) {
- Py_INCREF(obj);
- return obj;
- }
- if (PyUnicode_Check(obj)) {
- /* For a Unicode subtype that's not a Unicode object,
- return a true Unicode object with the same data. */
- return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
- PyUnicode_GET_SIZE(obj));
- }
- return PyUnicode_FromEncodedObject(obj, NULL, "strict");
- }
- PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
- const char *encoding,
- const char *errors)
- {
- const char *s = NULL;
- Py_ssize_t len;
- PyObject *v;
- if (obj == NULL) {
- PyErr_BadInternalCall();
- return NULL;
- }
- #if 0
- /* For b/w compatibility we also accept Unicode objects provided
- that no encodings is given and then redirect to
- PyObject_Unicode() which then applies the additional logic for
- Unicode subclasses.
- NOTE: This API should really only be used for object which
- represent *encoded* Unicode !
- */
- if (PyUnicode_Check(obj)) {
- if (encoding) {
- PyErr_SetString(PyExc_TypeError,
- "decoding Unicode is not supported");
- return NULL;
- }
- return PyObject_Unicode(obj);
- }
- #else
- if (PyUnicode_Check(obj)) {
- PyErr_SetString(PyExc_TypeError,
- "decoding Unicode is not supported");
- return NULL;
- }
- #endif
- /* Coerce object */
- if (PyString_Check(obj)) {
- s = PyString_AS_STRING(obj);
- len = PyString_GET_SIZE(obj);
- }
- else if (PyByteArray_Check(obj)) {
- /* Python 2.x specific */
- PyErr_Format(PyExc_TypeError,
- "decoding bytearray is not supported");
- return NULL;
- }
- else if (PyObject_AsCharBuffer(obj, &s, &len)) {
- /* Overwrite the error message with something more useful in
- case of a TypeError. */
- if (PyErr_ExceptionMatches(PyExc_TypeError))
- PyErr_Format(PyExc_TypeError,
- "coercing to Unicode: need string or buffer, "
- "%.80s found",
- Py_TYPE(obj)->tp_name);
- goto onError;
- }
- /* Convert to Unicode */
- if (len == 0) {
- Py_INCREF(unicode_empty);
- v = (PyObject *)unicode_empty;
- }
- else
- v = PyUnicode_Decode(s, len, encoding, errors);
- return v;
- onError:
- return NULL;
- }
- PyObject *PyUnicode_Decode(const char *s,
- Py_ssize_t size,
- const char *encoding,
- const char *errors)
- {
- PyObject *buffer = NULL, *unicode;
- if (encoding == NULL)
- encoding = PyUnicode_GetDefaultEncoding();
- /* Shortcuts for common default encodings */
- if (strcmp(encoding, "utf-8") == 0)
- return PyUnicode_DecodeUTF8(s, size, errors);
- else if (strcmp(encoding, "latin-1") == 0)
- return PyUnicode_DecodeLatin1(s, size, errors);
- #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
- else if (strcmp(encoding, "mbcs") == 0)
- return PyUnicode_DecodeMBCS(s, size, errors);
- #endif
- else if (strcmp(encoding, "ascii") == 0)
- return PyUnicode_DecodeASCII(s, size, errors);
- /* Decode via the codec registry */
- buffer = PyBuffer_FromMemory((void *)s, size);
- if (buffer == NULL)
- goto onError;
- unicode = PyCodec_Decode(buffer, encoding, errors);
- if (unicode == NULL)
- goto onError;
- if (!PyUnicode_Check(unicode)) {
- PyErr_Format(PyExc_TypeError,
- "decoder did not return an unicode object (type=%.400s)",
- Py_TYPE(unicode)->tp_name);
- Py_DECREF(unicode);
- goto onError;
- }
- Py_DECREF(buffer);
- return unicode;
- onError:
- Py_XDECREF(buffer);
- return NULL;
- }
- PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
- const char *encoding,
- const char *errors)
- {
- PyObject *v;
- if (!PyUnicode_Check(unicode)) {
- PyErr_BadArgument();
- goto onError;
- }
- if (encoding == NULL)
- encoding = PyUnicode_GetDefaultEncoding();
- /* Decode via the codec registry */
- v = PyCodec_Decode(unicode, encoding, errors);
- if (v == NULL)
- goto onError;
- return v;
- onError:
- return NULL;
- }
- PyObject *PyUnicode_Encode(const Py_UNICODE *s,
- Py_ssize_t size,
- const char *encoding,
- const char *errors)
- {
- PyObject *v, *unicode;
- unicode = PyUnicode_FromUnicode(s, size);
- if (unicode == NULL)
- return NULL;
- v = PyUnicode_AsEncodedString(unicode, encoding, errors);
- Py_DECREF(unicode);
- return v;
- }
- PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
- const char *encoding,
- const char *errors)
- {
- PyObject *v;
- if (!PyUnicode_Check(unicode)) {
- PyErr_BadArgument();
- goto onError;
- }
- if (encoding == NULL)
- encoding = PyUnicode_GetDefaultEncoding();
- /* Encode via the codec registry */
- v = PyCodec_Encode(unicode, encoding, errors);
- if (v == NULL)
- goto onError;
- return v;
- onError:
- return NULL;
- }
- PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
- const char *encoding,
- const char *errors)
- {
- PyObject *v;
- if (!PyUnicode_Check(unicode)) {
- PyErr_BadArgument();
- goto onError;
- }
- if (encoding == NULL)
- encoding = PyUnicode_GetDefaultEncoding();
- /* Shortcuts for common default encodings */
- if (errors == NULL) {
- if (strcmp(encoding, "utf-8") == 0)
- return PyUnicode_AsUTF8String(unicode);
- else if (strcmp(encoding, "latin-1") == 0)
- return PyUnicode_AsLatin1String(unicode);
- #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
- else if (strcmp(encoding, "mbcs") == 0)
- return PyUnicode_AsMBCSString(unicode);
- #endif
- else if (strcmp(encoding, "ascii") == 0)
- return PyUnicode_AsASCIIString(unicode);
- }
- /* Encode via the codec registry */
- v = PyCodec_Encode(unicode, encoding, errors);
- if (v == NULL)
- goto onError;
- if (!PyString_Check(v)) {
- PyErr_Format(PyExc_TypeError,
- "encoder did not return a string object (type=%.400s)",
- Py_TYPE(v)->tp_name);
- Py_DECREF(v);
- goto onError;
- }
- return v;
- onError:
- return NULL;
- }
- PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
- const char *errors)
- {
- PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
- if (v)
- return v;
- v = PyUnicode_AsEncodedString(unicode, NULL, errors);
- if (v && errors == NULL)
- ((PyUnicodeObject *)unicode)->defenc = v;
- return v;
- }
- Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
- {
- if (!PyUnicode_Check(unicode)) {
- PyErr_BadArgument();
- goto onError;
- }
- return PyUnicode_AS_UNICODE(unicode);
- onError:
- return NULL;
- }
- Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
- {
- if (!PyUnicode_Check(unicode)) {
- PyErr_BadArgument();
- goto onError;
- }
- return PyUnicode_GET_SIZE(unicode);
- onError:
- return -1;
- }
- const char *PyUnicode_GetDefaultEncoding(void)
- {
- return unicode_default_encoding;
- }
- int PyUnicode_SetDefaultEncoding(const char *encoding)
- {
- PyObject *v;
- /* Make sure the encoding is valid. As side effect, this also
- loads the encoding into the codec registry cache. */
- v = _PyCodec_Lookup(encoding);
- if (v == NULL)
- goto onError;
- Py_DECREF(v);
- strncpy(unicode_default_encoding,
- encoding,
- sizeof(unicode_default_encoding));
- return 0;
- onError:
- return -1;
- }
- /* error handling callback helper:
- build arguments, call the callback and check the arguments,
- if no exception occurred, copy the replacement to the output
- and adjust various state variables.
- return 0 on success, -1 on error
- */
- static
- int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
- const char *encoding, const char *reason,
- const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
- Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
- PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
- {
- static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
- PyObject *restuple = NULL;
- PyObject *repunicode = NULL;
- Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
- Py_ssize_t requiredsize;
- Py_ssize_t newpos;
- Py_UNICODE *repptr;
- Py_ssize_t repsize;
- int res = -1;
- if (*errorHandler == NULL) {
- *errorHandler = PyCodec_LookupError(errors);
- if (*errorHandler == NULL)
- goto onError;
- }
- if (*exceptionObject == NULL) {
- *exceptionObject = PyUnicodeDecodeError_Create(
- encoding, input, insize, *startinpos, *endinpos, reason);
- if (*exceptionObject == NULL)
- goto onError;
- }
- else {
- if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
- goto onError;
- if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
- goto onError;
- if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
- goto onError;
- }
- restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
- if (restuple == NULL)
- goto onError;
- if (!PyTuple_Check(restuple)) {
- PyErr_SetString(PyExc_TypeError, &argparse[4]);
- goto onError;
- }
- if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
- goto onError;
- if (newpos<0)
- newpos = insize+newpos;
- if (newpos<0 || newpos>insize) {
- PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
- goto onError;
- }
- /* need more space? (at least enough for what we
- have+the replacement+the rest of the string (starting
- at the new input position), so we won't have to check space
- when there are no errors in the rest of the string) */
- repptr = PyUnicode_AS_UNICODE(repunicode);
- repsize = PyUnicode_GET_SIZE(repunicode);
- requiredsize = *outpos + repsize + insize-newpos;
- if (requiredsize > outsize) {
- if (requiredsize<2*outsize)
- requiredsize = 2*outsize;
- if (_PyUnicode_Resize(output, requiredsize) < 0)
- goto onError;
- *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
- }
- *endinpos = newpos;
- *inptr = input + newpos;
- Py_UNICODE_COPY(*outptr, repptr, repsize);
- *outptr += repsize;
- *outpos += repsize;
- /* we made it! */
- res = 0;
- onError:
- Py_XDECREF(restuple);
- return res;
- }
- /* --- UTF-7 Codec -------------------------------------------------------- */
- /* see RFC2152 for details */
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   The table is only read (through the SPECIAL macro), so it is const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    /* 0x00-0x0F: controls; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1F: controls */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F: space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3F: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4F: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6F: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
/* SPECIAL(c, encodeO, encodeWS): true when code point c cannot be written
   directly in UTF-7.  Anything outside 1..127 is special; inside that
   range the utf7_special table decides, with whitespace (class 2) and
   Set O (class 3) counted as special only when encodeWS / encodeO are
   non-zero.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */
#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     (encodeWS && (utf7_special[(c)] == 2)) ||                  \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): the base64 digit for the low six bits of n. */
#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is one of the 64 base64 digits.
   Written with explicit ASCII ranges instead of isalnum(): isalnum() is
   locale dependent and may accept characters above 127, which UB64()
   below would then map to values outside 0..63.  UB64() already assumes
   ASCII, so nothing is lost. */
#define B64CHAR(c)                              \
    (((c) >= 'A' && (c) <= 'Z') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= '0' && (c) <= '9') ||              \
     (c) == '+' || (c) == '/')

/* UB64(c): the 6-bit value of base64 digit c (ASCII arithmetic; only
   meaningful when B64CHAR(c) is true). */
#define UB64(c)                                                 \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?           \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch, emit
   the top six as a base64 digit through out and decrease bits by six.
   Modifies out and bits. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, take the top sixteen as one character, decrease bits by
   sixteen and store the character through out.  Relies on an `errmsg`
   variable and a `utf7Error` label in the enclosing function.
   NOTE(review): the range tested below, 0xDC00..0xDFFF, is the low
   surrogate range, while the inline comments speak of the high
   surrogate -- confirm which is intended. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
- PyObject *PyUnicode_DecodeUTF7(const char *s,
- Py_ssize_t size,
- const char *errors)
- {
- return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
- }
- PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
- Py_ssize_t size,
- const char *errors,
- Py_ssize_t *consumed)
- {
- const char *starts = s;
- Py_ssize_t startinpos;
- Py_ssize_t endinpos;
- Py_ssize_t outpos;
- const char *e;
- PyUnicodeObject *unicode;
- Py_UNICODE *p;
- const char *errmsg = "";
- int inShift = 0;
- unsigned int bitsleft = 0;
- unsigned long charsleft = 0;
- int surrogate = 0;
- PyObject *errorHandler = NULL;
- PyObject *exc = NULL;
- unicode = _PyUnicode_New(size);
- if (!unicode)
- return NULL;
- if (size == 0) {
- if (consumed)
- *consumed = 0;
- return (PyObject *)unicode;
- }
- p = unicode->str;
- e = s + size;
- while (s < e) {
- Py_UNICODE ch;
- restart:
- ch = (unsigned char) *s;
- if (inShift) {
- if ((ch == '-') || !B64CHAR(ch)) {
- inShift = 0;
- s++;
- /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
- if (bitsleft >= 6) {
- /* The shift sequence has a partial character in it. If
- bitsleft < 6 then we could just classify it as padding
- but that is not the case here */
- errmsg = "partial character in shift sequence";
- goto utf7Error;
- }
- /* According to RFC2152 the remaining bits should be zero. We
- choose to signal an error/insert a replacement character
- here so indicate the potential of a misencoded character. */
- /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
- if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
- errmsg = "non-zero padding bits in shift sequence";
- goto utf7Error;
- }
- if (ch == '-') {
- if ((s < e) && (*(s) == '-')) {
- *p++ = '-';
- inShift = 1;
- }
- } else if (SPECIAL(ch,0,0)) {
- errmsg = "unexpected special character";
- goto utf7Error;
- } else {
- *p++ = ch;
- }
- } else {
- charsleft = (charsleft << 6) | UB64(ch);
- bitsleft += 6;
- s++;
- /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
- }
- }
- else if ( ch == '+' ) {
- startinpos = s-starts;
- s++;
- if (s < e && *s == '-') {
- s++;
- *p++ = '+';
- } else
- {
- inShift = 1;
- bitsleft = 0;
- }
- }
- else if (SPECIAL(ch,0,0)) {
- startinpos = s-starts;
- errmsg = "unexpected special character";
- s++;
- goto utf7Error;
- }
- else {
- *p++ = ch;
- s++;
- }
- continue;
- utf7Error:
- outpos = p-PyUnicode_AS_UNICODE(unicode);
- endinpos = s-starts;
- if (unicode_decode_call_errorhandler(
- errors, &errorHandler,
- "utf7", errmsg,
- starts, size, &startinpos, &endinpos, &exc, &s,
- &unicode, &outpos, &p))
- goto onError;
- }
- if (inShift && !consumed) {
- outpos = p-PyUnicode_AS_UNICODE(unicode);
- endinpos = size;
- if (unicode_decode_call_errorhandler(
- errors, &errorHandler,
- "utf7", "unterminated shift sequence",
- starts, size,…
Large files files are truncated, but you can click here to view the full file