PageRenderTime 73ms CodeModel.GetById 10ms app.highlight 58ms RepoModel.GetById 1ms app.codeStats 0ms

/Modules/_json.c

http://unladen-swallow.googlecode.com/
C | 621 lines | 569 code | 19 blank | 33 comment | 116 complexity | e4f23e56d00556b0331a604f2a61b6e6 MD5 | raw file
  1#include "Python.h"
  2
  3#define DEFAULT_ENCODING "utf-8"
  4#define S_CHAR(c) (c >= ' ' && c <= '~' && c != '\\' && c != '"')
  5#define MIN_EXPANSION 6
  6
  7#ifdef Py_UNICODE_WIDE
  8#define MAX_EXPANSION (2 * MIN_EXPANSION)
  9#else
 10#define MAX_EXPANSION MIN_EXPANSION
 11#endif
 12
 13static Py_ssize_t
 14ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars)
 15{
 16    Py_UNICODE x;
 17    output[chars++] = '\\';
 18    switch (c) {
 19        case '\\': output[chars++] = (char)c; break;
 20        case '"': output[chars++] = (char)c; break;
 21        case '\b': output[chars++] = 'b'; break;
 22        case '\f': output[chars++] = 'f'; break;
 23        case '\n': output[chars++] = 'n'; break;
 24        case '\r': output[chars++] = 'r'; break;
 25        case '\t': output[chars++] = 't'; break;
 26        default:
 27#ifdef Py_UNICODE_WIDE
 28            if (c >= 0x10000) {
 29                /* UTF-16 surrogate pair */
 30                Py_UNICODE v = c - 0x10000;
 31                c = 0xd800 | ((v >> 10) & 0x3ff);
 32                output[chars++] = 'u';
 33                x = (c & 0xf000) >> 12;
 34                output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
 35                x = (c & 0x0f00) >> 8;
 36                output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
 37                x = (c & 0x00f0) >> 4;
 38                output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
 39                x = (c & 0x000f);
 40                output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
 41                c = 0xdc00 | (v & 0x3ff);
 42                output[chars++] = '\\';
 43            }
 44#endif
 45            output[chars++] = 'u';
 46            x = (c & 0xf000) >> 12;
 47            output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
 48            x = (c & 0x0f00) >> 8;
 49            output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
 50            x = (c & 0x00f0) >> 4;
 51            output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
 52            x = (c & 0x000f);
 53            output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
 54    }
 55    return chars;
 56}
 57
 58static PyObject *
 59ascii_escape_unicode(PyObject *pystr)
 60{
 61    Py_ssize_t i;
 62    Py_ssize_t input_chars;
 63    Py_ssize_t output_size;
 64    Py_ssize_t chars;
 65    PyObject *rval;
 66    char *output;
 67    Py_UNICODE *input_unicode;
 68
 69    input_chars = PyUnicode_GET_SIZE(pystr);
 70    input_unicode = PyUnicode_AS_UNICODE(pystr);
 71    /* One char input can be up to 6 chars output, estimate 4 of these */
 72    output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
 73    rval = PyString_FromStringAndSize(NULL, output_size);
 74    if (rval == NULL) {
 75        return NULL;
 76    }
 77    output = PyString_AS_STRING(rval);
 78    chars = 0;
 79    output[chars++] = '"';
 80    for (i = 0; i < input_chars; i++) {
 81        Py_UNICODE c = input_unicode[i];
 82        if (S_CHAR(c)) {
 83            output[chars++] = (char)c;
 84        }
 85	else {
 86            chars = ascii_escape_char(c, output, chars);
 87        }
 88        if (output_size - chars < (1 + MAX_EXPANSION)) {
 89            /* There's more than four, so let's resize by a lot */
 90            output_size *= 2;
 91            /* This is an upper bound */
 92            if (output_size > 2 + (input_chars * MAX_EXPANSION)) {
 93                output_size = 2 + (input_chars * MAX_EXPANSION);
 94            }
 95            if (_PyString_Resize(&rval, output_size) == -1) {
 96                return NULL;
 97            }
 98            output = PyString_AS_STRING(rval);
 99        }
100    }
101    output[chars++] = '"';
102    if (_PyString_Resize(&rval, chars) == -1) {
103        return NULL;
104    }
105    return rval;
106}
107
108static PyObject *
109ascii_escape_str(PyObject *pystr)
110{
111    Py_ssize_t i;
112    Py_ssize_t input_chars;
113    Py_ssize_t output_size;
114    Py_ssize_t chars;
115    PyObject *rval;
116    char *output;
117    char *input_str;
118
119    input_chars = PyString_GET_SIZE(pystr);
120    input_str = PyString_AS_STRING(pystr);
121    /* One char input can be up to 6 chars output, estimate 4 of these */
122    output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
123    rval = PyString_FromStringAndSize(NULL, output_size);
124    if (rval == NULL) {
125        return NULL;
126    }
127    output = PyString_AS_STRING(rval);
128    chars = 0;
129    output[chars++] = '"';
130    for (i = 0; i < input_chars; i++) {
131        Py_UNICODE c = (Py_UNICODE)input_str[i];
132        if (S_CHAR(c)) {
133            output[chars++] = (char)c;
134        }
135	else if (c > 0x7F) {
136            /* We hit a non-ASCII character, bail to unicode mode */
137            PyObject *uni;
138            Py_DECREF(rval);
139            uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict");
140            if (uni == NULL) {
141                return NULL;
142            }
143            rval = ascii_escape_unicode(uni);
144            Py_DECREF(uni);
145            return rval;
146        }
147	else {
148            chars = ascii_escape_char(c, output, chars);
149        }
150        /* An ASCII char can't possibly expand to a surrogate! */
151        if (output_size - chars < (1 + MIN_EXPANSION)) {
152            /* There's more than four, so let's resize by a lot */
153            output_size *= 2;
154            if (output_size > 2 + (input_chars * MIN_EXPANSION)) {
155                output_size = 2 + (input_chars * MIN_EXPANSION);
156            }
157            if (_PyString_Resize(&rval, output_size) == -1) {
158                return NULL;
159            }
160            output = PyString_AS_STRING(rval);
161        }
162    }
163    output[chars++] = '"';
164    if (_PyString_Resize(&rval, chars) == -1) {
165        return NULL;
166    }
167    return rval;
168}
169
170void
171raise_errmsg(char *msg, PyObject *s, Py_ssize_t end)
172{
173    static PyObject *errmsg_fn = NULL;
174    PyObject *pymsg;
175    if (errmsg_fn == NULL) {
176        PyObject *decoder = PyImport_ImportModule("json.decoder");
177        if (decoder == NULL)
178            return;
179        errmsg_fn = PyObject_GetAttrString(decoder, "errmsg");
180        if (errmsg_fn == NULL)
181            return;
182        Py_DECREF(decoder);
183    }
184    pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end);
185    if (pymsg) {
186        PyErr_SetObject(PyExc_ValueError, pymsg);
187        Py_DECREF(pymsg);
188    }
189/*
190
191def linecol(doc, pos):
192    lineno = doc.count('\n', 0, pos) + 1
193    if lineno == 1:
194        colno = pos
195    else:
196        colno = pos - doc.rindex('\n', 0, pos)
197    return lineno, colno
198
199def errmsg(msg, doc, pos, end=None):
200    lineno, colno = linecol(doc, pos)
201    if end is None:
202        return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
203    endlineno, endcolno = linecol(doc, end)
204    return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
205        msg, lineno, colno, endlineno, endcolno, pos, end)
206
207*/
208}
209
210static PyObject *
211join_list_unicode(PyObject *lst)
212{
213    static PyObject *ustr = NULL;
214    static PyObject *joinstr = NULL;
215    if (ustr == NULL) {
216        Py_UNICODE c = 0;
217        ustr = PyUnicode_FromUnicode(&c, 0);
218    }
219    if (joinstr == NULL) {
220        joinstr = PyString_InternFromString("join");
221    }
222    if (joinstr == NULL || ustr == NULL) {
223        return NULL;
224    }
225    return PyObject_CallMethodObjArgs(ustr, joinstr, lst, NULL);
226}
227
228static PyObject *
229scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict)
230{
231    PyObject *rval;
232    Py_ssize_t len = PyString_GET_SIZE(pystr);
233    Py_ssize_t begin = end - 1;
234    Py_ssize_t next = begin;
235    char *buf = PyString_AS_STRING(pystr);
236    PyObject *chunks = PyList_New(0);
237    if (chunks == NULL) {
238        goto bail;
239    }
240    if (end < 0 || len <= end) {
241        PyErr_SetString(PyExc_ValueError, "end is out of bounds");
242        goto bail;
243    }
244    while (1) {
245        /* Find the end of the string or the next escape */
246        Py_UNICODE c = 0;
247        PyObject *chunk = NULL;
248        for (next = end; next < len; next++) {
249            c = buf[next];
250            if (c == '"' || c == '\\') {
251                break;
252            }
253            else if (strict && c <= 0x1f) {
254                raise_errmsg("Invalid control character at", pystr, next);
255                goto bail;
256            }
257        }
258        if (!(c == '"' || c == '\\')) {
259            raise_errmsg("Unterminated string starting at", pystr, begin);
260            goto bail;
261        }
262        /* Pick up this chunk if it's not zero length */
263        if (next != end) {
264            PyObject *strchunk = PyBuffer_FromMemory(&buf[end], next - end);
265            if (strchunk == NULL) {
266                goto bail;
267            }
268            chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL);
269            Py_DECREF(strchunk);
270            if (chunk == NULL) {
271                goto bail;
272            }
273            if (PyList_Append(chunks, chunk)) {
274                Py_DECREF(chunk);
275                goto bail;
276            }
277            Py_DECREF(chunk);
278        }
279        next++;
280        if (c == '"') {
281            end = next;
282            break;
283        }
284        if (next == len) {
285            raise_errmsg("Unterminated string starting at", pystr, begin);
286            goto bail;
287        }
288        c = buf[next];
289        if (c != 'u') {
290            /* Non-unicode backslash escapes */
291            end = next + 1;
292            switch (c) {
293                case '"': break;
294                case '\\': break;
295                case '/': break;
296                case 'b': c = '\b'; break;
297                case 'f': c = '\f'; break;
298                case 'n': c = '\n'; break;
299                case 'r': c = '\r'; break;
300                case 't': c = '\t'; break;
301                default: c = 0;
302            }
303            if (c == 0) {
304                raise_errmsg("Invalid \\escape", pystr, end - 2);
305                goto bail;
306            }
307        }
308        else {
309            c = 0;
310            next++;
311            end = next + 4;
312            if (end >= len) {
313                raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
314                goto bail;
315            }
316            /* Decode 4 hex digits */
317            for (; next < end; next++) {
318                Py_ssize_t shl = (end - next - 1) << 2;
319                Py_UNICODE digit = buf[next];
320                switch (digit) {
321                    case '0': case '1': case '2': case '3': case '4':
322                    case '5': case '6': case '7': case '8': case '9':
323                        c |= (digit - '0') << shl; break;
324                    case 'a': case 'b': case 'c': case 'd': case 'e':
325                    case 'f':
326                        c |= (digit - 'a' + 10) << shl; break;
327                    case 'A': case 'B': case 'C': case 'D': case 'E':
328                    case 'F':
329                        c |= (digit - 'A' + 10) << shl; break;
330                    default:
331                        raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
332                        goto bail;
333                }
334            }
335#ifdef Py_UNICODE_WIDE
336            /* Surrogate pair */
337            if (c >= 0xd800 && c <= 0xdbff) {
338                Py_UNICODE c2 = 0;
339                if (end + 6 >= len) {
340                    raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
341                        end - 5);
342                }
343                if (buf[next++] != '\\' || buf[next++] != 'u') {
344                    raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
345                        end - 5);
346                }
347                end += 6;
348                /* Decode 4 hex digits */
349                for (; next < end; next++) {
350                    Py_ssize_t shl = (end - next - 1) << 2;
351                    Py_UNICODE digit = buf[next];
352                    switch (digit) {
353                        case '0': case '1': case '2': case '3': case '4':
354                        case '5': case '6': case '7': case '8': case '9':
355                            c2 |= (digit - '0') << shl; break;
356                        case 'a': case 'b': case 'c': case 'd': case 'e':
357                        case 'f':
358                            c2 |= (digit - 'a' + 10) << shl; break;
359                        case 'A': case 'B': case 'C': case 'D': case 'E':
360                        case 'F':
361                            c2 |= (digit - 'A' + 10) << shl; break;
362                        default:
363                            raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
364                            goto bail;
365                    }
366                }
367                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
368            }
369#endif
370        }
371        chunk = PyUnicode_FromUnicode(&c, 1);
372        if (chunk == NULL) {
373            goto bail;
374        }
375        if (PyList_Append(chunks, chunk)) {
376            Py_DECREF(chunk);
377            goto bail;
378        }
379        Py_DECREF(chunk);
380    }
381
382    rval = join_list_unicode(chunks);
383    if (rval == NULL) {
384        goto bail;
385    }
386    Py_CLEAR(chunks);
387    return Py_BuildValue("(Nn)", rval, end);
388bail:
389    Py_XDECREF(chunks);
390    return NULL;
391}
392
393
394static PyObject *
395scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict)
396{
397    PyObject *rval;
398    Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
399    Py_ssize_t begin = end - 1;
400    Py_ssize_t next = begin;
401    const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
402    PyObject *chunks = PyList_New(0);
403    if (chunks == NULL) {
404        goto bail;
405    }
406    if (end < 0 || len <= end) {
407        PyErr_SetString(PyExc_ValueError, "end is out of bounds");
408        goto bail;
409    }
410    while (1) {
411        /* Find the end of the string or the next escape */
412        Py_UNICODE c = 0;
413        PyObject *chunk = NULL;
414        for (next = end; next < len; next++) {
415            c = buf[next];
416            if (c == '"' || c == '\\') {
417                break;
418            }
419            else if (strict && c <= 0x1f) {
420                raise_errmsg("Invalid control character at", pystr, next);
421                goto bail;
422            }
423        }
424        if (!(c == '"' || c == '\\')) {
425            raise_errmsg("Unterminated string starting at", pystr, begin);
426            goto bail;
427        }
428        /* Pick up this chunk if it's not zero length */
429        if (next != end) {
430            chunk = PyUnicode_FromUnicode(&buf[end], next - end);
431            if (chunk == NULL) {
432                goto bail;
433            }
434            if (PyList_Append(chunks, chunk)) {
435                Py_DECREF(chunk);
436                goto bail;
437            }
438            Py_DECREF(chunk);
439        }
440        next++;
441        if (c == '"') {
442            end = next;
443            break;
444        }
445        if (next == len) {
446            raise_errmsg("Unterminated string starting at", pystr, begin);
447            goto bail;
448        }
449        c = buf[next];
450        if (c != 'u') {
451            /* Non-unicode backslash escapes */
452            end = next + 1;
453            switch (c) {
454                case '"': break;
455                case '\\': break;
456                case '/': break;
457                case 'b': c = '\b'; break;
458                case 'f': c = '\f'; break;
459                case 'n': c = '\n'; break;
460                case 'r': c = '\r'; break;
461                case 't': c = '\t'; break;
462                default: c = 0;
463            }
464            if (c == 0) {
465                raise_errmsg("Invalid \\escape", pystr, end - 2);
466                goto bail;
467            }
468        }
469        else {
470            c = 0;
471            next++;
472            end = next + 4;
473            if (end >= len) {
474                raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
475                goto bail;
476            }
477            /* Decode 4 hex digits */
478            for (; next < end; next++) {
479                Py_ssize_t shl = (end - next - 1) << 2;
480                Py_UNICODE digit = buf[next];
481                switch (digit) {
482                    case '0': case '1': case '2': case '3': case '4':
483                    case '5': case '6': case '7': case '8': case '9':
484                        c |= (digit - '0') << shl; break;
485                    case 'a': case 'b': case 'c': case 'd': case 'e':
486                    case 'f':
487                        c |= (digit - 'a' + 10) << shl; break;
488                    case 'A': case 'B': case 'C': case 'D': case 'E':
489                    case 'F':
490                        c |= (digit - 'A' + 10) << shl; break;
491                    default:
492                        raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
493                        goto bail;
494                }
495            }
496#ifdef Py_UNICODE_WIDE
497            /* Surrogate pair */
498            if (c >= 0xd800 && c <= 0xdbff) {
499                Py_UNICODE c2 = 0;
500                if (end + 6 >= len) {
501                    raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
502                        end - 5);
503                }
504                if (buf[next++] != '\\' || buf[next++] != 'u') {
505                    raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
506                        end - 5);
507                }
508                end += 6;
509                /* Decode 4 hex digits */
510                for (; next < end; next++) {
511                    Py_ssize_t shl = (end - next - 1) << 2;
512                    Py_UNICODE digit = buf[next];
513                    switch (digit) {
514                        case '0': case '1': case '2': case '3': case '4':
515                        case '5': case '6': case '7': case '8': case '9':
516                            c2 |= (digit - '0') << shl; break;
517                        case 'a': case 'b': case 'c': case 'd': case 'e':
518                        case 'f':
519                            c2 |= (digit - 'a' + 10) << shl; break;
520                        case 'A': case 'B': case 'C': case 'D': case 'E':
521                        case 'F':
522                            c2 |= (digit - 'A' + 10) << shl; break;
523                        default:
524                            raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
525                            goto bail;
526                    }
527                }
528                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
529            }
530#endif
531        }
532        chunk = PyUnicode_FromUnicode(&c, 1);
533        if (chunk == NULL) {
534            goto bail;
535        }
536        if (PyList_Append(chunks, chunk)) {
537            Py_DECREF(chunk);
538            goto bail;
539        }
540        Py_DECREF(chunk);
541    }
542
543    rval = join_list_unicode(chunks);
544    if (rval == NULL) {
545        goto bail;
546    }
547    Py_CLEAR(chunks);
548    return Py_BuildValue("(Nn)", rval, end);
549bail:
550    Py_XDECREF(chunks);
551    return NULL;
552}
553
554PyDoc_STRVAR(pydoc_scanstring,
555"scanstring(basestring, end, encoding) -> (str, end)\n");
556
557static PyObject *
558py_scanstring(PyObject* self, PyObject *args)
559{
560    PyObject *pystr;
561    Py_ssize_t end;
562    char *encoding = NULL;
563    int strict = 0;
564    if (!PyArg_ParseTuple(args, "On|zi:scanstring", &pystr, &end, &encoding, &strict)) {
565        return NULL;
566    }
567    if (encoding == NULL) {
568        encoding = DEFAULT_ENCODING;
569    }
570    if (PyString_Check(pystr)) {
571        return scanstring_str(pystr, end, encoding, strict);
572    }
573    else if (PyUnicode_Check(pystr)) {
574        return scanstring_unicode(pystr, end, strict);
575    }
576    else {
577        PyErr_Format(PyExc_TypeError, 
578                     "first argument must be a string or unicode, not %.80s",
579                     Py_TYPE(pystr)->tp_name);
580        return NULL;
581    }
582}
583
584PyDoc_STRVAR(pydoc_encode_basestring_ascii,
585"encode_basestring_ascii(basestring) -> str\n");
586
587static PyObject *
588py_encode_basestring_ascii(PyObject* self, PyObject *pystr)
589{
590    /* METH_O */
591    if (PyString_Check(pystr)) {
592        return ascii_escape_str(pystr);
593    }
594    else if (PyUnicode_Check(pystr)) {
595        return ascii_escape_unicode(pystr);
596    }
597    else {
598        PyErr_Format(PyExc_TypeError, 
599                     "first argument must be a string or unicode, not %.80s",
600                     Py_TYPE(pystr)->tp_name);
601        return NULL;
602    }
603}
604
605static PyMethodDef json_methods[] = {
606    {"encode_basestring_ascii", (PyCFunction)py_encode_basestring_ascii,
607     METH_O, pydoc_encode_basestring_ascii},
608    {"scanstring", (PyCFunction)py_scanstring, METH_VARARGS,
609     pydoc_scanstring},
610    {NULL, NULL, 0, NULL}
611};
612
613PyDoc_STRVAR(module_doc,
614"json speedups\n");
615
616void
617init_json(void)
618{
619    PyObject *m;
620    m = Py_InitModule3("_json", json_methods, module_doc);
621}