PageRenderTime 184ms CodeModel.GetById 19ms app.highlight 118ms RepoModel.GetById 1ms app.codeStats 4ms

/Objects/unicodeobject.c

http://unladen-swallow.googlecode.com/
C | 9106 lines | 7348 code | 957 blank | 801 comment | 1905 complexity | 9db69b4d6a66f4afb0d8ed2b2d32eecc MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/*
   2
   3Unicode implementation based on original code by Fredrik Lundh,
   4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7Major speed upgrades to the method implementations at the Reykjavik
   8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10Copyright (c) Corporation for National Research Initiatives.
  11
  12--------------------------------------------------------------------
  13The original string type implementation is:
  14
  15  Copyright (c) 1999 by Secret Labs AB
  16  Copyright (c) 1999 by Fredrik Lundh
  17
  18By obtaining, using, and/or copying this software and/or its
  19associated documentation, you agree that you have read, understood,
  20and will comply with the following terms and conditions:
  21
  22Permission to use, copy, modify, and distribute this software and its
  23associated documentation for any purpose and without fee is hereby
  24granted, provided that the above copyright notice appears in all
  25copies, and that both that copyright notice and this permission notice
  26appear in supporting documentation, and that the name of Secret Labs
  27AB or the author not be used in advertising or publicity pertaining to
  28distribution of the software without specific, written prior
  29permission.
  30
  31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38--------------------------------------------------------------------
  39
  40*/
  41
  42#define PY_SSIZE_T_CLEAN
  43#include "Python.h"
  44
  45#include "unicodeobject.h"
  46#include "ucnhash.h"
  47
  48#ifdef MS_WINDOWS
  49#include <windows.h>
  50#endif
  51
  52/* Limit for the Unicode object free list */
  53
  54#define PyUnicode_MAXFREELIST       1024
  55
  56/* Limit for the Unicode object free list stay alive optimization.
  57
  58   The implementation will keep allocated Unicode memory intact for
  59   all objects on the free list having a size less than this
  60   limit. This reduces malloc() overhead for small Unicode objects.
  61
  62   At worst this will result in PyUnicode_MAXFREELIST *
  63   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  64   malloc()-overhead) bytes of unused garbage.
  65
  66   Setting the limit to 0 effectively turns the feature off.
  67
  68   Note: This is an experimental feature ! If you get core dumps when
  69   using Unicode objects, turn this feature off.
  70
  71*/
  72
  73#define KEEPALIVE_SIZE_LIMIT       9
  74
  75/* Endianness switches; defaults to little endian */
  76
  77#ifdef WORDS_BIGENDIAN
  78# define BYTEORDER_IS_BIG_ENDIAN
  79#else
  80# define BYTEORDER_IS_LITTLE_ENDIAN
  81#endif
  82
  83/* --- Globals ------------------------------------------------------------
  84
  85   The globals are initialized by the _PyUnicode_Init() API and should
  86   not be used before calling that API.
  87
  88*/
  89
  90
  91#ifdef __cplusplus
  92extern "C" {
  93#endif
  94
   95/* Free list for Unicode objects.  Freed objects are chained through
        their first pointer-sized word (see unicode_dealloc() and
        _PyUnicode_New()); numfree counts the objects currently on the list
        and is compared against PyUnicode_MAXFREELIST on deallocation. */
   96static PyUnicodeObject *free_list;
   97static int numfree;
   98
   99/* The empty Unicode object is shared to improve performance. */
  100static PyUnicodeObject *unicode_empty;
  101
  102/* Single character Unicode strings in the Latin-1 range are being
  103   shared as well. */
  104static PyUnicodeObject *unicode_latin1[256]; /* indexed by character value; slots are filled on first use by the constructors */
  105
  106/* Default encoding to use and assume when NULL is passed as encoding
  107   parameter; it is initialized by _PyUnicode_Init().
  108
  109   Always use the PyUnicode_SetDefaultEncoding() and
  110   PyUnicode_GetDefaultEncoding() APIs to access this global.
  111
  112*/
  113static char unicode_default_encoding[100];
 114
 115/* Fast detection of the most frequent whitespace characters */
 116const unsigned char _Py_ascii_whitespace[] = {
 117    0, 0, 0, 0, 0, 0, 0, 0,
 118/*     case 0x0009: * HORIZONTAL TABULATION */
 119/*     case 0x000A: * LINE FEED */
 120/*     case 0x000B: * VERTICAL TABULATION */
 121/*     case 0x000C: * FORM FEED */
 122/*     case 0x000D: * CARRIAGE RETURN */
 123    0, 1, 1, 1, 1, 1, 0, 0,
 124    0, 0, 0, 0, 0, 0, 0, 0,
 125/*     case 0x001C: * FILE SEPARATOR */
 126/*     case 0x001D: * GROUP SEPARATOR */
 127/*     case 0x001E: * RECORD SEPARATOR */
 128/*     case 0x001F: * UNIT SEPARATOR */
 129    0, 0, 0, 0, 1, 1, 1, 1,
 130/*     case 0x0020: * SPACE */
 131    1, 0, 0, 0, 0, 0, 0, 0,
 132    0, 0, 0, 0, 0, 0, 0, 0,
 133    0, 0, 0, 0, 0, 0, 0, 0,
 134    0, 0, 0, 0, 0, 0, 0, 0,
 135
 136    0, 0, 0, 0, 0, 0, 0, 0,
 137    0, 0, 0, 0, 0, 0, 0, 0,
 138    0, 0, 0, 0, 0, 0, 0, 0,
 139    0, 0, 0, 0, 0, 0, 0, 0,
 140    0, 0, 0, 0, 0, 0, 0, 0,
 141    0, 0, 0, 0, 0, 0, 0, 0,
 142    0, 0, 0, 0, 0, 0, 0, 0,
 143    0, 0, 0, 0, 0, 0, 0, 0
 144};
 145
 146/* Same for linebreaks */
 147static unsigned char ascii_linebreak[] = {
 148    0, 0, 0, 0, 0, 0, 0, 0,
 149/*         0x000A, * LINE FEED */
 150/*         0x000D, * CARRIAGE RETURN */
 151    0, 0, 1, 0, 0, 1, 0, 0,
 152    0, 0, 0, 0, 0, 0, 0, 0,
 153/*         0x001C, * FILE SEPARATOR */
 154/*         0x001D, * GROUP SEPARATOR */
 155/*         0x001E, * RECORD SEPARATOR */
 156    0, 0, 0, 0, 1, 1, 1, 0,
 157    0, 0, 0, 0, 0, 0, 0, 0,
 158    0, 0, 0, 0, 0, 0, 0, 0,
 159    0, 0, 0, 0, 0, 0, 0, 0,
 160    0, 0, 0, 0, 0, 0, 0, 0,
 161
 162    0, 0, 0, 0, 0, 0, 0, 0,
 163    0, 0, 0, 0, 0, 0, 0, 0,
 164    0, 0, 0, 0, 0, 0, 0, 0,
 165    0, 0, 0, 0, 0, 0, 0, 0,
 166    0, 0, 0, 0, 0, 0, 0, 0,
 167    0, 0, 0, 0, 0, 0, 0, 0,
 168    0, 0, 0, 0, 0, 0, 0, 0,
 169    0, 0, 0, 0, 0, 0, 0, 0
 170};
 171
 172
 173Py_UNICODE
 174PyUnicode_GetMax(void)
 175{
 176#ifdef Py_UNICODE_WIDE
 177    return 0x10FFFF;
 178#else
 179    /* This is actually an illegal character, so it should
 180       not be passed to unichr. */
 181    return 0xFFFF;
 182#endif
 183}
 184
 185/* --- Bloom Filters ----------------------------------------------------- */
 186
 187/* stuff to implement simple "bloom filters" for Unicode characters.
 188   to keep things simple, we use a single bitmask, using the least 5
 189   bits from each unicode characters as the bit index. */
 190
 191/* the linebreak mask is set up by Unicode_Init below */
 192
 193#define BLOOM_MASK unsigned long
 194
 195static BLOOM_MASK bloom_linebreak;
 196
 197#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
 198
 199#define BLOOM_LINEBREAK(ch)                                             \
 200    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
 201     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 202
 203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 204{
 205    /* calculate simple bloom-style bitmask for a given unicode string */
 206
 207    long mask;
 208    Py_ssize_t i;
 209
 210    mask = 0;
 211    for (i = 0; i < len; i++)
 212        mask |= (1 << (ptr[i] & 0x1F));
 213
 214    return mask;
 215}
 216
 217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 218{
 219    Py_ssize_t i;
 220
 221    for (i = 0; i < setlen; i++)
 222        if (set[i] == chr)
 223            return 1;
 224
 225    return 0;
 226}
 227
 228#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
 229    BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 230
 231/* --- Unicode Object ----------------------------------------------------- */
 232
 233static
 234int unicode_resize(register PyUnicodeObject *unicode,
 235                   Py_ssize_t length)
 236{
 237    void *oldstr;
 238
 239    /* Shortcut if there's nothing much to do. */
 240    if (unicode->length == length)
 241        goto reset;
 242
 243    /* Resizing shared object (unicode_empty or single character
 244       objects) in-place is not allowed. Use PyUnicode_Resize()
 245       instead ! */
 246
 247    if (unicode == unicode_empty ||
 248        (unicode->length == 1 &&
 249         unicode->str[0] < 256U &&
 250         unicode_latin1[unicode->str[0]] == unicode)) {
 251        PyErr_SetString(PyExc_SystemError,
 252                        "can't resize shared unicode objects");
 253        return -1;
 254    }
 255
 256    /* We allocate one more byte to make sure the string is Ux0000 terminated.
 257       The overallocation is also used by fastsearch, which assumes that it's
 258       safe to look at str[length] (without making any assumptions about what
 259       it contains). */
 260
 261    oldstr = unicode->str;
 262    unicode->str = PyObject_REALLOC(unicode->str,
 263                                    sizeof(Py_UNICODE) * (length + 1));
 264    if (!unicode->str) {
 265        unicode->str = (Py_UNICODE *)oldstr;
 266        PyErr_NoMemory();
 267        return -1;
 268    }
 269    unicode->str[length] = 0;
 270    unicode->length = length;
 271
 272  reset:
 273    /* Reset the object caches */
 274    if (unicode->defenc) {
 275        Py_DECREF(unicode->defenc);
 276        unicode->defenc = NULL;
 277    }
 278    unicode->hash = -1;
 279
 280    return 0;
 281}
 282
 283/* We allocate one more byte to make sure the string is
 284   Ux0000 terminated -- XXX is this needed ?
 285
 286   XXX This allocator could further be enhanced by assuring that the
 287   free list never reduces its size below 1.
 288
 289*/
 290
 291static
 292PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 293{
 294    register PyUnicodeObject *unicode;
 295
 296    /* Optimization for empty strings */
 297    if (length == 0 && unicode_empty != NULL) {
 298        Py_INCREF(unicode_empty);
 299        return unicode_empty;
 300    }
 301
 302    /* Ensure we won't overflow the size. */
 303    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
 304        return (PyUnicodeObject *)PyErr_NoMemory();
 305    }
 306
 307    /* Unicode freelist & memory allocation */
 308    if (free_list) {
 309        unicode = free_list;
 310        free_list = *(PyUnicodeObject **)unicode;
 311        numfree--;
 312        if (unicode->str) {
 313            /* Keep-Alive optimization: we only upsize the buffer,
 314               never downsize it. */
 315            if ((unicode->length < length) &&
 316                unicode_resize(unicode, length) < 0) {
 317                PyObject_DEL(unicode->str);
 318                unicode->str = NULL;
 319            }
 320        }
 321        else {
 322            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 323            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 324        }
 325        PyObject_INIT(unicode, &PyUnicode_Type);
 326    }
 327    else {
 328        size_t new_size;
 329        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 330        if (unicode == NULL)
 331            return NULL;
 332        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 333        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 334    }
 335
 336    if (!unicode->str) {
 337        PyErr_NoMemory();
 338        goto onError;
 339    }
 340    /* Initialize the first element to guard against cases where
 341     * the caller fails before initializing str -- unicode_resize()
 342     * reads str[0], and the Keep-Alive optimization can keep memory
 343     * allocated for str alive across a call to unicode_dealloc(unicode).
 344     * We don't want unicode_resize to read uninitialized memory in
 345     * that case.
 346     */
 347    unicode->str[0] = 0;
 348    unicode->str[length] = 0;
 349    unicode->length = length;
 350    unicode->hash = -1;
 351    unicode->defenc = NULL;
 352    return unicode;
 353
 354  onError:
 355    /* XXX UNREF/NEWREF interface should be more symmetrical */
 356    _Py_DEC_REFTOTAL;
 357    _Py_ForgetReference((PyObject *)unicode);
 358    PyObject_Del(unicode);
 359    return NULL;
 360}
 361
 362static
 363void unicode_dealloc(register PyUnicodeObject *unicode)
 364{
 365    if (PyUnicode_CheckExact(unicode) &&
 366        numfree < PyUnicode_MAXFREELIST) {
 367        /* Keep-Alive optimization */
 368        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 369            PyObject_DEL(unicode->str);
 370            unicode->str = NULL;
 371            unicode->length = 0;
 372        }
 373        if (unicode->defenc) {
 374            Py_DECREF(unicode->defenc);
 375            unicode->defenc = NULL;
 376        }
 377        /* Add to free list */
 378        *(PyUnicodeObject **)unicode = free_list;
 379        free_list = unicode;
 380        numfree++;
 381    }
 382    else {
 383        PyObject_DEL(unicode->str);
 384        Py_XDECREF(unicode->defenc);
 385        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 386    }
 387}
 388
 389static
 390int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
 391{
 392    register PyUnicodeObject *v;
 393
 394    /* Argument checks */
 395    if (unicode == NULL) {
 396        PyErr_BadInternalCall();
 397        return -1;
 398    }
 399    v = *unicode;
 400    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 401        PyErr_BadInternalCall();
 402        return -1;
 403    }
 404
 405    /* Resizing unicode_empty and single character objects is not
 406       possible since these are being shared. We simply return a fresh
 407       copy with the same Unicode content. */
 408    if (v->length != length &&
 409        (v == unicode_empty || v->length == 1)) {
 410        PyUnicodeObject *w = _PyUnicode_New(length);
 411        if (w == NULL)
 412            return -1;
 413        Py_UNICODE_COPY(w->str, v->str,
 414                        length < v->length ? length : v->length);
 415        Py_DECREF(*unicode);
 416        *unicode = w;
 417        return 0;
 418    }
 419
 420    /* Note that we don't have to modify *unicode for unshared Unicode
 421       objects, since we can modify them in-place. */
 422    return unicode_resize(v, length);
 423}
 424
 425int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 426{
 427    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
 428}
 429
 430PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 431                                Py_ssize_t size)
 432{
 433    PyUnicodeObject *unicode;
 434
 435    /* If the Unicode data is known at construction time, we can apply
 436       some optimizations which share commonly used objects. */
 437    if (u != NULL) {
 438
 439        /* Optimization for empty strings */
 440        if (size == 0 && unicode_empty != NULL) {
 441            Py_INCREF(unicode_empty);
 442            return (PyObject *)unicode_empty;
 443        }
 444
 445        /* Single character Unicode objects in the Latin-1 range are
 446           shared when using this constructor */
 447        if (size == 1 && *u < 256) {
 448            unicode = unicode_latin1[*u];
 449            if (!unicode) {
 450                unicode = _PyUnicode_New(1);
 451                if (!unicode)
 452                    return NULL;
 453                unicode->str[0] = *u;
 454                unicode_latin1[*u] = unicode;
 455            }
 456            Py_INCREF(unicode);
 457            return (PyObject *)unicode;
 458        }
 459    }
 460
 461    unicode = _PyUnicode_New(size);
 462    if (!unicode)
 463        return NULL;
 464
 465    /* Copy the Unicode data into the new object */
 466    if (u != NULL)
 467        Py_UNICODE_COPY(unicode->str, u, size);
 468
 469    return (PyObject *)unicode;
 470}
 471
 472PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 473{
 474    PyUnicodeObject *unicode;
 475
 476    if (size < 0) {
 477        PyErr_SetString(PyExc_SystemError,
 478                        "Negative size passed to PyUnicode_FromStringAndSize");
 479        return NULL;
 480    }
 481
 482    /* If the Unicode data is known at construction time, we can apply
 483       some optimizations which share commonly used objects.
 484       Also, this means the input must be UTF-8, so fall back to the
 485       UTF-8 decoder at the end. */
 486    if (u != NULL) {
 487
 488        /* Optimization for empty strings */
 489        if (size == 0 && unicode_empty != NULL) {
 490            Py_INCREF(unicode_empty);
 491            return (PyObject *)unicode_empty;
 492        }
 493
 494        /* Single characters are shared when using this constructor.
 495           Restrict to ASCII, since the input must be UTF-8. */
 496        if (size == 1 && Py_CHARMASK(*u) < 128) {
 497            unicode = unicode_latin1[Py_CHARMASK(*u)];
 498            if (!unicode) {
 499                unicode = _PyUnicode_New(1);
 500                if (!unicode)
 501                    return NULL;
 502                unicode->str[0] = Py_CHARMASK(*u);
 503                unicode_latin1[Py_CHARMASK(*u)] = unicode;
 504            }
 505            Py_INCREF(unicode);
 506            return (PyObject *)unicode;
 507        }
 508
 509        return PyUnicode_DecodeUTF8(u, size, NULL);
 510    }
 511
 512    unicode = _PyUnicode_New(size);
 513    if (!unicode)
 514        return NULL;
 515
 516    return (PyObject *)unicode;
 517}
 518
 519PyObject *PyUnicode_FromString(const char *u)
 520{
 521    size_t size = strlen(u);
 522    if (size > PY_SSIZE_T_MAX) {
 523        PyErr_SetString(PyExc_OverflowError, "input too long");
 524        return NULL;
 525    }
 526
 527    return PyUnicode_FromStringAndSize(u, size);
 528}
 529
 530#ifdef HAVE_WCHAR_H
 531
 532PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 533                                 Py_ssize_t size)
 534{
 535    PyUnicodeObject *unicode;
 536
 537    if (w == NULL) {
 538        PyErr_BadInternalCall();
 539        return NULL;
 540    }
 541
 542    unicode = _PyUnicode_New(size);
 543    if (!unicode)
 544        return NULL;
 545
 546    /* Copy the wchar_t data into the new object */
 547#ifdef HAVE_USABLE_WCHAR_T
 548    memcpy(unicode->str, w, size * sizeof(wchar_t));
 549#else
 550    {
 551        register Py_UNICODE *u;
 552        register Py_ssize_t i;
 553        u = PyUnicode_AS_UNICODE(unicode);
 554        for (i = size; i > 0; i--)
 555            *u++ = *w++;
 556    }
 557#endif
 558
 559    return (PyObject *)unicode;
 560}
 561
 562static void
 563makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 564{
 565    *fmt++ = '%';
 566    if (width) {
 567        if (zeropad)
 568            *fmt++ = '0';
 569        fmt += sprintf(fmt, "%d", width);
 570    }
 571    if (precision)
 572        fmt += sprintf(fmt, ".%d", precision);
 573    if (longflag)
 574        *fmt++ = 'l';
 575    else if (size_tflag) {
 576        char *f = PY_FORMAT_SIZE_T;
 577        while (*f)
 578            *fmt++ = *f++;
 579    }
 580    *fmt++ = c;
 581    *fmt = '\0';
 582}
 583
 584#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 585
 586PyObject *
 587PyUnicode_FromFormatV(const char *format, va_list vargs)
 588{
 589    va_list count;
 590    Py_ssize_t callcount = 0;
 591    PyObject **callresults = NULL;
 592    PyObject **callresult = NULL;
 593    Py_ssize_t n = 0;
 594    int width = 0;
 595    int precision = 0;
 596    int zeropad;
 597    const char* f;
 598    Py_UNICODE *s;
 599    PyObject *string;
 600    /* used by sprintf */
 601    char buffer[21];
 602    /* use abuffer instead of buffer, if we need more space
 603     * (which can happen if there's a format specifier with width). */
 604    char *abuffer = NULL;
 605    char *realbuffer;
 606    Py_ssize_t abuffersize = 0;
 607    char fmt[60]; /* should be enough for %0width.precisionld */
 608    const char *copy;
 609
 610#ifdef VA_LIST_IS_ARRAY
 611    Py_MEMCPY(count, vargs, sizeof(va_list));
 612#else
 613#ifdef  __va_copy
 614    __va_copy(count, vargs);
 615#else
 616    count = vargs;
 617#endif
 618#endif
 619     /* step 1: count the number of %S/%R/%s format specifications
 620      * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
 621      * objects once during step 3 and put the result in an array) */
 622    for (f = format; *f; f++) {
 623         if (*f == '%') {
 624             if (*(f+1)=='%')
 625                 continue;
 626             if (*(f+1)=='S' || *(f+1)=='R')
 627                 ++callcount;
 628             while (isdigit((unsigned)*f))
 629                 width = (width*10) + *f++ - '0';
 630             while (*++f && *f != '%' && !isalpha((unsigned)*f))
 631                 ;
 632             if (*f == 's')
 633                 ++callcount;
 634         }
 635    }
 636    /* step 2: allocate memory for the results of
 637     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
 638    if (callcount) {
 639        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 640        if (!callresults) {
 641            PyErr_NoMemory();
 642            return NULL;
 643        }
 644        callresult = callresults;
 645    }
 646    /* step 3: figure out how large a buffer we need */
 647    for (f = format; *f; f++) {
 648        if (*f == '%') {
 649            const char* p = f;
 650            width = 0;
 651            while (isdigit((unsigned)*f))
 652                width = (width*10) + *f++ - '0';
 653            while (*++f && *f != '%' && !isalpha((unsigned)*f))
 654                ;
 655
 656            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 657             * they don't affect the amount of space we reserve.
 658             */
 659            if ((*f == 'l' || *f == 'z') &&
 660                (f[1] == 'd' || f[1] == 'u'))
 661                ++f;
 662
 663            switch (*f) {
 664            case 'c':
 665                (void)va_arg(count, int);
 666                /* fall through... */
 667            case '%':
 668                n++;
 669                break;
 670            case 'd': case 'u': case 'i': case 'x':
 671                (void) va_arg(count, int);
 672                /* 20 bytes is enough to hold a 64-bit
 673                   integer.  Decimal takes the most space.
 674                   This isn't enough for octal.
 675                   If a width is specified we need more
 676                   (which we allocate later). */
 677                if (width < 20)
 678                    width = 20;
 679                n += width;
 680                if (abuffersize < width)
 681                    abuffersize = width;
 682                break;
 683            case 's':
 684            {
 685                /* UTF-8 */
 686                unsigned char *s = va_arg(count, unsigned char*);
 687                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
 688                if (!str)
 689                    goto fail;
 690                n += PyUnicode_GET_SIZE(str);
 691                /* Remember the str and switch to the next slot */
 692                *callresult++ = str;
 693                break;
 694            }
 695            case 'U':
 696            {
 697                PyObject *obj = va_arg(count, PyObject *);
 698                assert(obj && PyUnicode_Check(obj));
 699                n += PyUnicode_GET_SIZE(obj);
 700                break;
 701            }
 702            case 'V':
 703            {
 704                PyObject *obj = va_arg(count, PyObject *);
 705                const char *str = va_arg(count, const char *);
 706                assert(obj || str);
 707                assert(!obj || PyUnicode_Check(obj));
 708                if (obj)
 709                    n += PyUnicode_GET_SIZE(obj);
 710                else
 711                    n += strlen(str);
 712                break;
 713            }
 714            case 'S':
 715            {
 716                PyObject *obj = va_arg(count, PyObject *);
 717                PyObject *str;
 718                assert(obj);
 719                str = PyObject_Str(obj);
 720                if (!str)
 721                    goto fail;
 722                n += PyUnicode_GET_SIZE(str);
 723                /* Remember the str and switch to the next slot */
 724                *callresult++ = str;
 725                break;
 726            }
 727            case 'R':
 728            {
 729                PyObject *obj = va_arg(count, PyObject *);
 730                PyObject *repr;
 731                assert(obj);
 732                repr = PyObject_Repr(obj);
 733                if (!repr)
 734                    goto fail;
 735                n += PyUnicode_GET_SIZE(repr);
 736                /* Remember the repr and switch to the next slot */
 737                *callresult++ = repr;
 738                break;
 739            }
 740            case 'p':
 741                (void) va_arg(count, int);
 742                /* maximum 64-bit pointer representation:
 743                 * 0xffffffffffffffff
 744                 * so 19 characters is enough.
 745                 * XXX I count 18 -- what's the extra for?
 746                 */
 747                n += 19;
 748                break;
 749            default:
 750                /* if we stumble upon an unknown
 751                   formatting code, copy the rest of
 752                   the format string to the output
 753                   string. (we cannot just skip the
 754                   code, since there's no way to know
 755                   what's in the argument list) */
 756                n += strlen(p);
 757                goto expand;
 758            }
 759        } else
 760            n++;
 761    }
 762  expand:
 763    if (abuffersize > 20) {
 764        abuffer = PyObject_Malloc(abuffersize);
 765        if (!abuffer) {
 766            PyErr_NoMemory();
 767            goto fail;
 768        }
 769        realbuffer = abuffer;
 770    }
 771    else
 772        realbuffer = buffer;
 773    /* step 4: fill the buffer */
 774    /* Since we've analyzed how much space we need for the worst case,
 775       we don't have to resize the string.
 776       There can be no errors beyond this point. */
 777    string = PyUnicode_FromUnicode(NULL, n);
 778    if (!string)
 779        goto fail;
 780
 781    s = PyUnicode_AS_UNICODE(string);
 782    callresult = callresults;
 783
 784    for (f = format; *f; f++) {
 785        if (*f == '%') {
 786            const char* p = f++;
 787            int longflag = 0;
 788            int size_tflag = 0;
 789            zeropad = (*f == '0');
 790            /* parse the width.precision part */
 791            width = 0;
 792            while (isdigit((unsigned)*f))
 793                width = (width*10) + *f++ - '0';
 794            precision = 0;
 795            if (*f == '.') {
 796                f++;
 797                while (isdigit((unsigned)*f))
 798                    precision = (precision*10) + *f++ - '0';
 799            }
 800            /* handle the long flag, but only for %ld and %lu.
 801               others can be added when necessary. */
 802            if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 803                longflag = 1;
 804                ++f;
 805            }
 806            /* handle the size_t flag. */
 807            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 808                size_tflag = 1;
 809                ++f;
 810            }
 811
 812            switch (*f) {
 813            case 'c':
 814                *s++ = va_arg(vargs, int);
 815                break;
 816            case 'd':
 817                makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 818                if (longflag)
 819                    sprintf(realbuffer, fmt, va_arg(vargs, long));
 820                else if (size_tflag)
 821                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 822                else
 823                    sprintf(realbuffer, fmt, va_arg(vargs, int));
 824                appendstring(realbuffer);
 825                break;
 826            case 'u':
 827                makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 828                if (longflag)
 829                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 830                else if (size_tflag)
 831                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 832                else
 833                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 834                appendstring(realbuffer);
 835                break;
 836            case 'i':
 837                makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 838                sprintf(realbuffer, fmt, va_arg(vargs, int));
 839                appendstring(realbuffer);
 840                break;
 841            case 'x':
 842                makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 843                sprintf(realbuffer, fmt, va_arg(vargs, int));
 844                appendstring(realbuffer);
 845                break;
 846            case 's':
 847            {
 848                /* unused, since we already have the result */
 849                (void) va_arg(vargs, char *);
 850                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
 851                                PyUnicode_GET_SIZE(*callresult));
 852                s += PyUnicode_GET_SIZE(*callresult);
 853                /* We're done with the unicode()/repr() => forget it */
 854                Py_DECREF(*callresult);
 855                /* switch to next unicode()/repr() result */
 856                ++callresult;
 857                break;
 858            }
 859            case 'U':
 860            {
 861                PyObject *obj = va_arg(vargs, PyObject *);
 862                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 863                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 864                s += size;
 865                break;
 866            }
 867            case 'V':
 868            {
 869                PyObject *obj = va_arg(vargs, PyObject *);
 870                const char *str = va_arg(vargs, const char *);
 871                if (obj) {
 872                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 873                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 874                    s += size;
 875                } else {
 876                    appendstring(str);
 877                }
 878                break;
 879            }
 880            case 'S':
 881            case 'R':
 882            {
 883                Py_UNICODE *ucopy;
 884                Py_ssize_t usize;
 885                Py_ssize_t upos;
 886                /* unused, since we already have the result */
 887                (void) va_arg(vargs, PyObject *);
 888                ucopy = PyUnicode_AS_UNICODE(*callresult);
 889                usize = PyUnicode_GET_SIZE(*callresult);
 890                for (upos = 0; upos<usize;)
 891                    *s++ = ucopy[upos++];
 892                /* We're done with the unicode()/repr() => forget it */
 893                Py_DECREF(*callresult);
 894                /* switch to next unicode()/repr() result */
 895                ++callresult;
 896                break;
 897            }
 898            case 'p':
 899                sprintf(buffer, "%p", va_arg(vargs, void*));
 900                /* %p is ill-defined:  ensure leading 0x. */
 901                if (buffer[1] == 'X')
 902                    buffer[1] = 'x';
 903                else if (buffer[1] != 'x') {
 904                    memmove(buffer+2, buffer, strlen(buffer)+1);
 905                    buffer[0] = '0';
 906                    buffer[1] = 'x';
 907                }
 908                appendstring(buffer);
 909                break;
 910            case '%':
 911                *s++ = '%';
 912                break;
 913            default:
 914                appendstring(p);
 915                goto end;
 916            }
 917        } else
 918            *s++ = *f;
 919    }
 920
 921  end:
 922    if (callresults)
 923        PyObject_Free(callresults);
 924    if (abuffer)
 925        PyObject_Free(abuffer);
 926    PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
 927    return string;
 928  fail:
 929    if (callresults) {
 930        PyObject **callresult2 = callresults;
 931        while (callresult2 < callresult) {
 932            Py_DECREF(*callresult2);
 933            ++callresult2;
 934        }
 935        PyObject_Free(callresults);
 936    }
 937    if (abuffer)
 938        PyObject_Free(abuffer);
 939    return NULL;
 940}
 941
 942#undef appendstring
 943
 944PyObject *
 945PyUnicode_FromFormat(const char *format, ...)
 946{
 947    PyObject* ret;
 948    va_list vargs;
 949
 950#ifdef HAVE_STDARG_PROTOTYPES
 951    va_start(vargs, format);
 952#else
 953    va_start(vargs);
 954#endif
 955    ret = PyUnicode_FromFormatV(format, vargs);
 956    va_end(vargs);
 957    return ret;
 958}
 959
 960Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 961                                wchar_t *w,
 962                                Py_ssize_t size)
 963{
 964    if (unicode == NULL) {
 965        PyErr_BadInternalCall();
 966        return -1;
 967    }
 968
 969    /* If possible, try to copy the 0-termination as well */
 970    if (size > PyUnicode_GET_SIZE(unicode))
 971        size = PyUnicode_GET_SIZE(unicode) + 1;
 972
 973#ifdef HAVE_USABLE_WCHAR_T
 974    memcpy(w, unicode->str, size * sizeof(wchar_t));
 975#else
 976    {
 977        register Py_UNICODE *u;
 978        register Py_ssize_t i;
 979        u = PyUnicode_AS_UNICODE(unicode);
 980        for (i = size; i > 0; i--)
 981            *w++ = *u++;
 982    }
 983#endif
 984
 985    if (size > PyUnicode_GET_SIZE(unicode))
 986        return PyUnicode_GET_SIZE(unicode);
 987    else
 988        return size;
 989}
 990
 991#endif
 992
 993PyObject *PyUnicode_FromOrdinal(int ordinal)
 994{
 995    Py_UNICODE s[1];
 996
 997#ifdef Py_UNICODE_WIDE
 998    if (ordinal < 0 || ordinal > 0x10ffff) {
 999        PyErr_SetString(PyExc_ValueError,
1000                        "unichr() arg not in range(0x110000) "
1001                        "(wide Python build)");
1002        return NULL;
1003    }
1004#else
1005    if (ordinal < 0 || ordinal > 0xffff) {
1006        PyErr_SetString(PyExc_ValueError,
1007                        "unichr() arg not in range(0x10000) "
1008                        "(narrow Python build)");
1009        return NULL;
1010    }
1011#endif
1012
1013    s[0] = (Py_UNICODE)ordinal;
1014    return PyUnicode_FromUnicode(s, 1);
1015}
1016
1017PyObject *PyUnicode_FromObject(register PyObject *obj)
1018{
1019    /* XXX Perhaps we should make this API an alias of
1020       PyObject_Unicode() instead ?! */
1021    if (PyUnicode_CheckExact(obj)) {
1022        Py_INCREF(obj);
1023        return obj;
1024    }
1025    if (PyUnicode_Check(obj)) {
1026        /* For a Unicode subtype that's not a Unicode object,
1027           return a true Unicode object with the same data. */
1028        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1029                                     PyUnicode_GET_SIZE(obj));
1030    }
1031    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1032}
1033
1034PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1035                                      const char *encoding,
1036                                      const char *errors)
1037{
1038    const char *s = NULL;
1039    Py_ssize_t len;
1040    PyObject *v;
1041
1042    if (obj == NULL) {
1043        PyErr_BadInternalCall();
1044        return NULL;
1045    }
1046
1047#if 0
1048    /* For b/w compatibility we also accept Unicode objects provided
1049       that no encodings is given and then redirect to
1050       PyObject_Unicode() which then applies the additional logic for
1051       Unicode subclasses.
1052
1053       NOTE: This API should really only be used for object which
1054       represent *encoded* Unicode !
1055
1056    */
1057    if (PyUnicode_Check(obj)) {
1058        if (encoding) {
1059            PyErr_SetString(PyExc_TypeError,
1060                            "decoding Unicode is not supported");
1061            return NULL;
1062        }
1063        return PyObject_Unicode(obj);
1064    }
1065#else
1066    if (PyUnicode_Check(obj)) {
1067        PyErr_SetString(PyExc_TypeError,
1068                        "decoding Unicode is not supported");
1069        return NULL;
1070    }
1071#endif
1072
1073    /* Coerce object */
1074    if (PyString_Check(obj)) {
1075        s = PyString_AS_STRING(obj);
1076        len = PyString_GET_SIZE(obj);
1077    }
1078    else if (PyByteArray_Check(obj)) {
1079        /* Python 2.x specific */
1080        PyErr_Format(PyExc_TypeError,
1081                     "decoding bytearray is not supported");
1082        return NULL;
1083    }
1084    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1085        /* Overwrite the error message with something more useful in
1086           case of a TypeError. */
1087        if (PyErr_ExceptionMatches(PyExc_TypeError))
1088            PyErr_Format(PyExc_TypeError,
1089                         "coercing to Unicode: need string or buffer, "
1090                         "%.80s found",
1091                         Py_TYPE(obj)->tp_name);
1092        goto onError;
1093    }
1094
1095    /* Convert to Unicode */
1096    if (len == 0) {
1097        Py_INCREF(unicode_empty);
1098        v = (PyObject *)unicode_empty;
1099    }
1100    else
1101        v = PyUnicode_Decode(s, len, encoding, errors);
1102
1103    return v;
1104
1105  onError:
1106    return NULL;
1107}
1108
1109PyObject *PyUnicode_Decode(const char *s,
1110                           Py_ssize_t size,
1111                           const char *encoding,
1112                           const char *errors)
1113{
1114    PyObject *buffer = NULL, *unicode;
1115
1116    if (encoding == NULL)
1117        encoding = PyUnicode_GetDefaultEncoding();
1118
1119    /* Shortcuts for common default encodings */
1120    if (strcmp(encoding, "utf-8") == 0)
1121        return PyUnicode_DecodeUTF8(s, size, errors);
1122    else if (strcmp(encoding, "latin-1") == 0)
1123        return PyUnicode_DecodeLatin1(s, size, errors);
1124#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1125    else if (strcmp(encoding, "mbcs") == 0)
1126        return PyUnicode_DecodeMBCS(s, size, errors);
1127#endif
1128    else if (strcmp(encoding, "ascii") == 0)
1129        return PyUnicode_DecodeASCII(s, size, errors);
1130
1131    /* Decode via the codec registry */
1132    buffer = PyBuffer_FromMemory((void *)s, size);
1133    if (buffer == NULL)
1134        goto onError;
1135    unicode = PyCodec_Decode(buffer, encoding, errors);
1136    if (unicode == NULL)
1137        goto onError;
1138    if (!PyUnicode_Check(unicode)) {
1139        PyErr_Format(PyExc_TypeError,
1140                     "decoder did not return an unicode object (type=%.400s)",
1141                     Py_TYPE(unicode)->tp_name);
1142        Py_DECREF(unicode);
1143        goto onError;
1144    }
1145    Py_DECREF(buffer);
1146    return unicode;
1147
1148  onError:
1149    Py_XDECREF(buffer);
1150    return NULL;
1151}
1152
1153PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1154                                    const char *encoding,
1155                                    const char *errors)
1156{
1157    PyObject *v;
1158
1159    if (!PyUnicode_Check(unicode)) {
1160        PyErr_BadArgument();
1161        goto onError;
1162    }
1163
1164    if (encoding == NULL)
1165        encoding = PyUnicode_GetDefaultEncoding();
1166
1167    /* Decode via the codec registry */
1168    v = PyCodec_Decode(unicode, encoding, errors);
1169    if (v == NULL)
1170        goto onError;
1171    return v;
1172
1173  onError:
1174    return NULL;
1175}
1176
1177PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1178                           Py_ssize_t size,
1179                           const char *encoding,
1180                           const char *errors)
1181{
1182    PyObject *v, *unicode;
1183
1184    unicode = PyUnicode_FromUnicode(s, size);
1185    if (unicode == NULL)
1186        return NULL;
1187    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1188    Py_DECREF(unicode);
1189    return v;
1190}
1191
1192PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1193                                    const char *encoding,
1194                                    const char *errors)
1195{
1196    PyObject *v;
1197
1198    if (!PyUnicode_Check(unicode)) {
1199        PyErr_BadArgument();
1200        goto onError;
1201    }
1202
1203    if (encoding == NULL)
1204        encoding = PyUnicode_GetDefaultEncoding();
1205
1206    /* Encode via the codec registry */
1207    v = PyCodec_Encode(unicode, encoding, errors);
1208    if (v == NULL)
1209        goto onError;
1210    return v;
1211
1212  onError:
1213    return NULL;
1214}
1215
1216PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1217                                    const char *encoding,
1218                                    const char *errors)
1219{
1220    PyObject *v;
1221
1222    if (!PyUnicode_Check(unicode)) {
1223        PyErr_BadArgument();
1224        goto onError;
1225    }
1226
1227    if (encoding == NULL)
1228        encoding = PyUnicode_GetDefaultEncoding();
1229
1230    /* Shortcuts for common default encodings */
1231    if (errors == NULL) {
1232        if (strcmp(encoding, "utf-8") == 0)
1233            return PyUnicode_AsUTF8String(unicode);
1234        else if (strcmp(encoding, "latin-1") == 0)
1235            return PyUnicode_AsLatin1String(unicode);
1236#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1237        else if (strcmp(encoding, "mbcs") == 0)
1238            return PyUnicode_AsMBCSString(unicode);
1239#endif
1240        else if (strcmp(encoding, "ascii") == 0)
1241            return PyUnicode_AsASCIIString(unicode);
1242    }
1243
1244    /* Encode via the codec registry */
1245    v = PyCodec_Encode(unicode, encoding, errors);
1246    if (v == NULL)
1247        goto onError;
1248    if (!PyString_Check(v)) {
1249        PyErr_Format(PyExc_TypeError,
1250                     "encoder did not return a string object (type=%.400s)",
1251                     Py_TYPE(v)->tp_name);
1252        Py_DECREF(v);
1253        goto onError;
1254    }
1255    return v;
1256
1257  onError:
1258    return NULL;
1259}
1260
1261PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1262                                            const char *errors)
1263{
1264    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1265
1266    if (v)
1267        return v;
1268    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1269    if (v && errors == NULL)
1270        ((PyUnicodeObject *)unicode)->defenc = v;
1271    return v;
1272}
1273
1274Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1275{
1276    if (!PyUnicode_Check(unicode)) {
1277        PyErr_BadArgument();
1278        goto onError;
1279    }
1280    return PyUnicode_AS_UNICODE(unicode);
1281
1282  onError:
1283    return NULL;
1284}
1285
1286Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1287{
1288    if (!PyUnicode_Check(unicode)) {
1289        PyErr_BadArgument();
1290        goto onError;
1291    }
1292    return PyUnicode_GET_SIZE(unicode);
1293
1294  onError:
1295    return -1;
1296}
1297
1298const char *PyUnicode_GetDefaultEncoding(void)
1299{
1300    return unicode_default_encoding;
1301}
1302
1303int PyUnicode_SetDefaultEncoding(const char *encoding)
1304{
1305    PyObject *v;
1306
1307    /* Make sure the encoding is valid. As side effect, this also
1308       loads the encoding into the codec registry cache. */
1309    v = _PyCodec_Lookup(encoding);
1310    if (v == NULL)
1311        goto onError;
1312    Py_DECREF(v);
1313    strncpy(unicode_default_encoding,
1314            encoding,
1315            sizeof(unicode_default_encoding));
1316    return 0;
1317
1318  onError:
1319    return -1;
1320}
1321
1322/* error handling callback helper:
1323   build arguments, call the callback and check the arguments,
1324   if no exception occurred, copy the replacement to the output
1325   and adjust various state variables.
1326   return 0 on success, -1 on error
1327*/
1328
1329static
1330int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1331                                     const char *encoding, const char *reason,
1332                                     const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1333                                     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1334                                     PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1335{
1336    static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1337
1338    PyObject *restuple = NULL;
1339    PyObject *repunicode = NULL;
1340    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1341    Py_ssize_t requiredsize;
1342    Py_ssize_t newpos;
1343    Py_UNICODE *repptr;
1344    Py_ssize_t repsize;
1345    int res = -1;
1346
1347    if (*errorHandler == NULL) {
1348        *errorHandler = PyCodec_LookupError(errors);
1349        if (*errorHandler == NULL)
1350            goto onError;
1351    }
1352
1353    if (*exceptionObject == NULL) {
1354        *exceptionObject = PyUnicodeDecodeError_Create(
1355            encoding, input, insize, *startinpos, *endinpos, reason);
1356        if (*exceptionObject == NULL)
1357            goto onError;
1358    }
1359    else {
1360        if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1361            goto onError;
1362        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1363            goto onError;
1364        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1365            goto onError;
1366    }
1367
1368    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1369    if (restuple == NULL)
1370        goto onError;
1371    if (!PyTuple_Check(restuple)) {
1372        PyErr_SetString(PyExc_TypeError, &argparse[4]);
1373        goto onError;
1374    }
1375    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1376        goto onError;
1377    if (newpos<0)
1378        newpos = insize+newpos;
1379    if (newpos<0 || newpos>insize) {
1380        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1381        goto onError;
1382    }
1383
1384    /* need more space? (at least enough for what we
1385       have+the replacement+the rest of the string (starting
1386       at the new input position), so we won't have to check space
1387       when there are no errors in the rest of the string) */
1388    repptr = PyUnicode_AS_UNICODE(repunicode);
1389    repsize = PyUnicode_GET_SIZE(repunicode);
1390    requiredsize = *outpos + repsize + insize-newpos;
1391    if (requiredsize > outsize) {
1392        if (requiredsize<2*outsize)
1393            requiredsize = 2*outsize;
1394        if (_PyUnicode_Resize(output, requiredsize) < 0)
1395            goto onError;
1396        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1397    }
1398    *endinpos = newpos;
1399    *inptr = input + newpos;
1400    Py_UNICODE_COPY(*outptr, repptr, repsize);
1401    *outptr += repsize;
1402    *outpos += repsize;
1403    /* we made it! */
1404    res = 0;
1405
1406  onError:
1407    Py_XDECREF(restuple);
1408    return res;
1409}
1410
1411/* --- UTF-7 Codec -------------------------------------------------------- */
1412
1413/* see RFC2152 for details */
1414
/* Per-character classification for the UTF-7 codec, indexed by the
   ASCII code (0..127).  Indicates whether a character is special,
   i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f:  space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f:  0 .. 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f:  @ A .. O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f:  P .. Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f:  ` a .. o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f:  p .. z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1433
/* SPECIAL(c, encodeO, encodeWS) is true when character c has to be
   treated as special by the UTF-7 codec: it lies outside 1..127, or
   utf7_special[] classifies it as 1, or as 2 while encodeWS is set, or
   as 3 while encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 || (c) <= 0 ||                           \
     utf7_special[(c)] == 1 ||                          \
     ((encodeWS) && utf7_special[(c)] == 2) ||          \
     ((encodeO) && utf7_special[(c)] == 3))

/* B64(n): base64 digit for the low 6 bits of n */
#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is an alphanumeric character, '+' or '/' */
#define B64CHAR(c)                              \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* UB64(c): value of the base64 digit c ('A'..'Z' -> 0..25,
   'a'..'z' -> 26..51, '0'..'9' -> 52..61, '+' -> 62, '/' -> 63) */
#define UB64(c)                                         \
    ((c) == '+' ? 62 :                                  \
     (c) == '/' ? 63 :                                  \
     (c) >= 'a' ? (c) - 71 :                            \
     (c) >= 'A' ? (c) - 65 :                            \
                  (c) + 4)

/* ENCODE(out, ch, bits): while at least 6 of the `bits` pending bits in
   `ch` remain, emit the topmost 6 of them as one base64 digit through
   `out` and drop them from the count. */
#define ENCODE(out, ch, bits)                   \
    while ((bits) >= 6) {                       \
        (bits) -= 6;                            \
        *(out)++ = B64((ch) >> (bits));         \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 of the `bits`
   pending bits in `ch` remain, take the topmost 16 as one code unit.
   Relies on `errmsg` and the label `utf7Error` of the enclosing
   function. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE unit;                                                \
        bits -= 16;                                                     \
        unit = (Py_UNICODE) ((ch >> bits) & 0xffff);                    \
        if (surrogate) {                                                \
            /* An error was already generated for the preceding unit,   \
               so this one is dropped without further checks */         \
            surrogate = 0;                                              \
        } else if (unit < 0xDC00 || unit > 0xDFFF) {                    \
            *out++ = unit;                                              \
        } else {                                                        \
            /* Code unit in 0xDC00..0xDFFF: part of a surrogate pair,   \
               which can't be represented in a 16-bit character */      \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        }                                                               \
    }
1476
1477PyObject *PyUnicode_DecodeUTF7(const char *s,
1478                               Py_ssize_t size,
1479                               const char *errors)
1480{
1481    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1482}
1483
1484PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1485                                       Py_ssize_t size,
1486                                       const char *errors,
1487                                       Py_ssize_t *consumed)
1488{
1489    const char *starts = s;
1490    Py_ssize_t startinpos;
1491    Py_ssize_t endinpos;
1492    Py_ssize_t outpos;
1493    const char *e;
1494    PyUnicodeObject *unicode;
1495    Py_UNICODE *p;
1496    const char *errmsg = "";
1497    int inShift = 0;
1498    unsigned int bitsleft = 0;
1499    unsigned long charsleft = 0;
1500    int surrogate = 0;
1501    PyObject *errorHandler = NULL;
1502    PyObject *exc = NULL;
1503
1504    unicode = _PyUnicode_New(size);
1505    if (!unicode)
1506        return NULL;
1507    if (size == 0) {
1508        if (consumed)
1509            *consumed = 0;
1510        return (PyObject *)unicode;
1511    }
1512
1513    p = unicode->str;
1514    e = s + size;
1515
1516    while (s < e) {
1517        Py_UNICODE ch;
1518      restart:
1519        ch = (unsigned char) *s;
1520
1521        if (inShift) {
1522            if ((ch == '-') || !B64CHAR(ch)) {
1523                inShift = 0;
1524                s++;
1525
1526                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1527                if (bitsleft >= 6) {
1528                    /* The shift sequence has a partial character in it. If
1529                       bitsleft < 6 then we could just classify it as padding
1530                       but that is not the case here */
1531
1532                    errmsg = "partial character in shift sequence";
1533                    goto utf7Error;
1534                }
1535                /* According to RFC2152 the remaining bits should be zero. We
1536                   choose to signal an error/insert a replacement character
1537                   here so indicate the potential of a misencoded character. */
1538
1539                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1540                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1541                    errmsg = "non-zero padding bits in shift sequence";
1542                    goto utf7Error;
1543                }
1544
1545                if (ch == '-') {
1546                    if ((s < e) && (*(s) == '-')) {
1547                        *p++ = '-';
1548                        inShift = 1;
1549                    }
1550                } else if (SPECIAL(ch,0,0)) {
1551                    errmsg = "unexpected special character";
1552                    goto utf7Error;
1553                } else  {
1554                    *p++ = ch;
1555                }
1556            } else {
1557                charsleft = (charsleft << 6) | UB64(ch);
1558                bitsleft += 6;
1559                s++;
1560                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1561            }
1562        }
1563        else if ( ch == '+' ) {
1564            startinpos = s-starts;
1565            s++;
1566            if (s < e && *s == '-') {
1567                s++;
1568                *p++ = '+';
1569            } else
1570            {
1571                inShift = 1;
1572                bitsleft = 0;
1573            }
1574        }
1575        else if (SPECIAL(ch,0,0)) {
1576            startinpos = s-starts;
1577            errmsg = "unexpected special character";
1578            s++;
1579            goto utf7Error;
1580        }
1581        else {
1582            *p++ = ch;
1583            s++;
1584        }
1585        continue;
1586      utf7Error:
1587        outpos = p-PyUnicode_AS_UNICODE(unicode);
1588        endinpos = s-starts;
1589        if (unicode_decode_call_errorhandler(
1590                errors, &errorHandler,
1591                "utf7", errmsg,
1592                starts, size, &startinpos, &endinpos, &exc, &s,
1593                &unicode, &outpos, &p))
1594            goto onError;
1595    }
1596
1597    if (inShift && !consumed) {
1598        outpos = p-PyUnicode_AS_UNICODE(unicode);
1599        endinpos = size;
1600        if (unicode_decode_call_errorhandler(
1601                errors, &errorHandler,
1602                "utf7", "unterminated shift sequence",
1603                starts, size, &startinpos, &endinpos, &exc, &s,
1604                &unicode, &outpos, &p))
1605            goto onError;
1606        if (s < e)
1607            goto restart;
1608    }
1609    if (consumed) {
1610        if(inShift)
1611            *consumed = startinpos;
1612        else
1613            *consumed = s-starts;
1614    }
1615
1616    if (_PyUnicode_Resiz

Large files are truncated, but you can click here to view the full file