/Modules/_json.c

http://unladen-swallow.googlecode.com/ · C · 621 lines · 569 code · 19 blank · 33 comment · 116 complexity · e4f23e56d00556b0331a604f2a61b6e6 MD5 · raw file

  1. #include "Python.h"
  /* Encoding assumed for byte strings when the caller does not name one. */
  #define DEFAULT_ENCODING "utf-8"

  /*
   * True when `c` can be copied into JSON output unescaped: printable ASCII
   * (' ' through '~') other than backslash and double quote.
   *
   * The argument is parenthesized at every use so that an expression such as
   * `x & 0x7f` is compared as a whole. It is still evaluated several times,
   * so do not pass an expression with side effects.
   */
  #define S_CHAR(c) \
      ((c) >= ' ' && (c) <= '~' && (c) != '\\' && (c) != '"')

  /* Longest escape for a single UTF-16 code unit: the six bytes of \uXXXX. */
  #define MIN_EXPANSION 6

  /*
   * Longest escape for a single Py_UNICODE element. On wide builds one element
   * may need a surrogate pair, i.e. two \uXXXX escapes.
   */
  #ifdef Py_UNICODE_WIDE
  #define MAX_EXPANSION (2 * MIN_EXPANSION)
  #else
  #define MAX_EXPANSION MIN_EXPANSION
  #endif
  10. static Py_ssize_t
  11. ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars)
  12. {
  13. Py_UNICODE x;
  14. output[chars++] = '\\';
  15. switch (c) {
  16. case '\\': output[chars++] = (char)c; break;
  17. case '"': output[chars++] = (char)c; break;
  18. case '\b': output[chars++] = 'b'; break;
  19. case '\f': output[chars++] = 'f'; break;
  20. case '\n': output[chars++] = 'n'; break;
  21. case '\r': output[chars++] = 'r'; break;
  22. case '\t': output[chars++] = 't'; break;
  23. default:
  24. #ifdef Py_UNICODE_WIDE
  25. if (c >= 0x10000) {
  26. /* UTF-16 surrogate pair */
  27. Py_UNICODE v = c - 0x10000;
  28. c = 0xd800 | ((v >> 10) & 0x3ff);
  29. output[chars++] = 'u';
  30. x = (c & 0xf000) >> 12;
  31. output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  32. x = (c & 0x0f00) >> 8;
  33. output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  34. x = (c & 0x00f0) >> 4;
  35. output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  36. x = (c & 0x000f);
  37. output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  38. c = 0xdc00 | (v & 0x3ff);
  39. output[chars++] = '\\';
  40. }
  41. #endif
  42. output[chars++] = 'u';
  43. x = (c & 0xf000) >> 12;
  44. output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  45. x = (c & 0x0f00) >> 8;
  46. output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  47. x = (c & 0x00f0) >> 4;
  48. output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  49. x = (c & 0x000f);
  50. output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
  51. }
  52. return chars;
  53. }
  54. static PyObject *
  55. ascii_escape_unicode(PyObject *pystr)
  56. {
  57. Py_ssize_t i;
  58. Py_ssize_t input_chars;
  59. Py_ssize_t output_size;
  60. Py_ssize_t chars;
  61. PyObject *rval;
  62. char *output;
  63. Py_UNICODE *input_unicode;
  64. input_chars = PyUnicode_GET_SIZE(pystr);
  65. input_unicode = PyUnicode_AS_UNICODE(pystr);
  66. /* One char input can be up to 6 chars output, estimate 4 of these */
  67. output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
  68. rval = PyString_FromStringAndSize(NULL, output_size);
  69. if (rval == NULL) {
  70. return NULL;
  71. }
  72. output = PyString_AS_STRING(rval);
  73. chars = 0;
  74. output[chars++] = '"';
  75. for (i = 0; i < input_chars; i++) {
  76. Py_UNICODE c = input_unicode[i];
  77. if (S_CHAR(c)) {
  78. output[chars++] = (char)c;
  79. }
  80. else {
  81. chars = ascii_escape_char(c, output, chars);
  82. }
  83. if (output_size - chars < (1 + MAX_EXPANSION)) {
  84. /* There's more than four, so let's resize by a lot */
  85. output_size *= 2;
  86. /* This is an upper bound */
  87. if (output_size > 2 + (input_chars * MAX_EXPANSION)) {
  88. output_size = 2 + (input_chars * MAX_EXPANSION);
  89. }
  90. if (_PyString_Resize(&rval, output_size) == -1) {
  91. return NULL;
  92. }
  93. output = PyString_AS_STRING(rval);
  94. }
  95. }
  96. output[chars++] = '"';
  97. if (_PyString_Resize(&rval, chars) == -1) {
  98. return NULL;
  99. }
  100. return rval;
  101. }
  102. static PyObject *
  103. ascii_escape_str(PyObject *pystr)
  104. {
  105. Py_ssize_t i;
  106. Py_ssize_t input_chars;
  107. Py_ssize_t output_size;
  108. Py_ssize_t chars;
  109. PyObject *rval;
  110. char *output;
  111. char *input_str;
  112. input_chars = PyString_GET_SIZE(pystr);
  113. input_str = PyString_AS_STRING(pystr);
  114. /* One char input can be up to 6 chars output, estimate 4 of these */
  115. output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
  116. rval = PyString_FromStringAndSize(NULL, output_size);
  117. if (rval == NULL) {
  118. return NULL;
  119. }
  120. output = PyString_AS_STRING(rval);
  121. chars = 0;
  122. output[chars++] = '"';
  123. for (i = 0; i < input_chars; i++) {
  124. Py_UNICODE c = (Py_UNICODE)input_str[i];
  125. if (S_CHAR(c)) {
  126. output[chars++] = (char)c;
  127. }
  128. else if (c > 0x7F) {
  129. /* We hit a non-ASCII character, bail to unicode mode */
  130. PyObject *uni;
  131. Py_DECREF(rval);
  132. uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict");
  133. if (uni == NULL) {
  134. return NULL;
  135. }
  136. rval = ascii_escape_unicode(uni);
  137. Py_DECREF(uni);
  138. return rval;
  139. }
  140. else {
  141. chars = ascii_escape_char(c, output, chars);
  142. }
  143. /* An ASCII char can't possibly expand to a surrogate! */
  144. if (output_size - chars < (1 + MIN_EXPANSION)) {
  145. /* There's more than four, so let's resize by a lot */
  146. output_size *= 2;
  147. if (output_size > 2 + (input_chars * MIN_EXPANSION)) {
  148. output_size = 2 + (input_chars * MIN_EXPANSION);
  149. }
  150. if (_PyString_Resize(&rval, output_size) == -1) {
  151. return NULL;
  152. }
  153. output = PyString_AS_STRING(rval);
  154. }
  155. }
  156. output[chars++] = '"';
  157. if (_PyString_Resize(&rval, chars) == -1) {
  158. return NULL;
  159. }
  160. return rval;
  161. }
  162. void
  163. raise_errmsg(char *msg, PyObject *s, Py_ssize_t end)
  164. {
  165. static PyObject *errmsg_fn = NULL;
  166. PyObject *pymsg;
  167. if (errmsg_fn == NULL) {
  168. PyObject *decoder = PyImport_ImportModule("json.decoder");
  169. if (decoder == NULL)
  170. return;
  171. errmsg_fn = PyObject_GetAttrString(decoder, "errmsg");
  172. if (errmsg_fn == NULL)
  173. return;
  174. Py_DECREF(decoder);
  175. }
  176. pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end);
  177. if (pymsg) {
  178. PyErr_SetObject(PyExc_ValueError, pymsg);
  179. Py_DECREF(pymsg);
  180. }
  181. /*
  182. def linecol(doc, pos):
  183. lineno = doc.count('\n', 0, pos) + 1
  184. if lineno == 1:
  185. colno = pos
  186. else:
  187. colno = pos - doc.rindex('\n', 0, pos)
  188. return lineno, colno
  189. def errmsg(msg, doc, pos, end=None):
  190. lineno, colno = linecol(doc, pos)
  191. if end is None:
  192. return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
  193. endlineno, endcolno = linecol(doc, end)
  194. return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
  195. msg, lineno, colno, endlineno, endcolno, pos, end)
  196. */
  197. }
  198. static PyObject *
  199. join_list_unicode(PyObject *lst)
  200. {
  201. static PyObject *ustr = NULL;
  202. static PyObject *joinstr = NULL;
  203. if (ustr == NULL) {
  204. Py_UNICODE c = 0;
  205. ustr = PyUnicode_FromUnicode(&c, 0);
  206. }
  207. if (joinstr == NULL) {
  208. joinstr = PyString_InternFromString("join");
  209. }
  210. if (joinstr == NULL || ustr == NULL) {
  211. return NULL;
  212. }
  213. return PyObject_CallMethodObjArgs(ustr, joinstr, lst, NULL);
  214. }
  215. static PyObject *
  216. scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict)
  217. {
  218. PyObject *rval;
  219. Py_ssize_t len = PyString_GET_SIZE(pystr);
  220. Py_ssize_t begin = end - 1;
  221. Py_ssize_t next = begin;
  222. char *buf = PyString_AS_STRING(pystr);
  223. PyObject *chunks = PyList_New(0);
  224. if (chunks == NULL) {
  225. goto bail;
  226. }
  227. if (end < 0 || len <= end) {
  228. PyErr_SetString(PyExc_ValueError, "end is out of bounds");
  229. goto bail;
  230. }
  231. while (1) {
  232. /* Find the end of the string or the next escape */
  233. Py_UNICODE c = 0;
  234. PyObject *chunk = NULL;
  235. for (next = end; next < len; next++) {
  236. c = buf[next];
  237. if (c == '"' || c == '\\') {
  238. break;
  239. }
  240. else if (strict && c <= 0x1f) {
  241. raise_errmsg("Invalid control character at", pystr, next);
  242. goto bail;
  243. }
  244. }
  245. if (!(c == '"' || c == '\\')) {
  246. raise_errmsg("Unterminated string starting at", pystr, begin);
  247. goto bail;
  248. }
  249. /* Pick up this chunk if it's not zero length */
  250. if (next != end) {
  251. PyObject *strchunk = PyBuffer_FromMemory(&buf[end], next - end);
  252. if (strchunk == NULL) {
  253. goto bail;
  254. }
  255. chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL);
  256. Py_DECREF(strchunk);
  257. if (chunk == NULL) {
  258. goto bail;
  259. }
  260. if (PyList_Append(chunks, chunk)) {
  261. Py_DECREF(chunk);
  262. goto bail;
  263. }
  264. Py_DECREF(chunk);
  265. }
  266. next++;
  267. if (c == '"') {
  268. end = next;
  269. break;
  270. }
  271. if (next == len) {
  272. raise_errmsg("Unterminated string starting at", pystr, begin);
  273. goto bail;
  274. }
  275. c = buf[next];
  276. if (c != 'u') {
  277. /* Non-unicode backslash escapes */
  278. end = next + 1;
  279. switch (c) {
  280. case '"': break;
  281. case '\\': break;
  282. case '/': break;
  283. case 'b': c = '\b'; break;
  284. case 'f': c = '\f'; break;
  285. case 'n': c = '\n'; break;
  286. case 'r': c = '\r'; break;
  287. case 't': c = '\t'; break;
  288. default: c = 0;
  289. }
  290. if (c == 0) {
  291. raise_errmsg("Invalid \\escape", pystr, end - 2);
  292. goto bail;
  293. }
  294. }
  295. else {
  296. c = 0;
  297. next++;
  298. end = next + 4;
  299. if (end >= len) {
  300. raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
  301. goto bail;
  302. }
  303. /* Decode 4 hex digits */
  304. for (; next < end; next++) {
  305. Py_ssize_t shl = (end - next - 1) << 2;
  306. Py_UNICODE digit = buf[next];
  307. switch (digit) {
  308. case '0': case '1': case '2': case '3': case '4':
  309. case '5': case '6': case '7': case '8': case '9':
  310. c |= (digit - '0') << shl; break;
  311. case 'a': case 'b': case 'c': case 'd': case 'e':
  312. case 'f':
  313. c |= (digit - 'a' + 10) << shl; break;
  314. case 'A': case 'B': case 'C': case 'D': case 'E':
  315. case 'F':
  316. c |= (digit - 'A' + 10) << shl; break;
  317. default:
  318. raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
  319. goto bail;
  320. }
  321. }
  322. #ifdef Py_UNICODE_WIDE
  323. /* Surrogate pair */
  324. if (c >= 0xd800 && c <= 0xdbff) {
  325. Py_UNICODE c2 = 0;
  326. if (end + 6 >= len) {
  327. raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
  328. end - 5);
  329. }
  330. if (buf[next++] != '\\' || buf[next++] != 'u') {
  331. raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
  332. end - 5);
  333. }
  334. end += 6;
  335. /* Decode 4 hex digits */
  336. for (; next < end; next++) {
  337. Py_ssize_t shl = (end - next - 1) << 2;
  338. Py_UNICODE digit = buf[next];
  339. switch (digit) {
  340. case '0': case '1': case '2': case '3': case '4':
  341. case '5': case '6': case '7': case '8': case '9':
  342. c2 |= (digit - '0') << shl; break;
  343. case 'a': case 'b': case 'c': case 'd': case 'e':
  344. case 'f':
  345. c2 |= (digit - 'a' + 10) << shl; break;
  346. case 'A': case 'B': case 'C': case 'D': case 'E':
  347. case 'F':
  348. c2 |= (digit - 'A' + 10) << shl; break;
  349. default:
  350. raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
  351. goto bail;
  352. }
  353. }
  354. c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
  355. }
  356. #endif
  357. }
  358. chunk = PyUnicode_FromUnicode(&c, 1);
  359. if (chunk == NULL) {
  360. goto bail;
  361. }
  362. if (PyList_Append(chunks, chunk)) {
  363. Py_DECREF(chunk);
  364. goto bail;
  365. }
  366. Py_DECREF(chunk);
  367. }
  368. rval = join_list_unicode(chunks);
  369. if (rval == NULL) {
  370. goto bail;
  371. }
  372. Py_CLEAR(chunks);
  373. return Py_BuildValue("(Nn)", rval, end);
  374. bail:
  375. Py_XDECREF(chunks);
  376. return NULL;
  377. }
  378. static PyObject *
  379. scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict)
  380. {
  381. PyObject *rval;
  382. Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
  383. Py_ssize_t begin = end - 1;
  384. Py_ssize_t next = begin;
  385. const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
  386. PyObject *chunks = PyList_New(0);
  387. if (chunks == NULL) {
  388. goto bail;
  389. }
  390. if (end < 0 || len <= end) {
  391. PyErr_SetString(PyExc_ValueError, "end is out of bounds");
  392. goto bail;
  393. }
  394. while (1) {
  395. /* Find the end of the string or the next escape */
  396. Py_UNICODE c = 0;
  397. PyObject *chunk = NULL;
  398. for (next = end; next < len; next++) {
  399. c = buf[next];
  400. if (c == '"' || c == '\\') {
  401. break;
  402. }
  403. else if (strict && c <= 0x1f) {
  404. raise_errmsg("Invalid control character at", pystr, next);
  405. goto bail;
  406. }
  407. }
  408. if (!(c == '"' || c == '\\')) {
  409. raise_errmsg("Unterminated string starting at", pystr, begin);
  410. goto bail;
  411. }
  412. /* Pick up this chunk if it's not zero length */
  413. if (next != end) {
  414. chunk = PyUnicode_FromUnicode(&buf[end], next - end);
  415. if (chunk == NULL) {
  416. goto bail;
  417. }
  418. if (PyList_Append(chunks, chunk)) {
  419. Py_DECREF(chunk);
  420. goto bail;
  421. }
  422. Py_DECREF(chunk);
  423. }
  424. next++;
  425. if (c == '"') {
  426. end = next;
  427. break;
  428. }
  429. if (next == len) {
  430. raise_errmsg("Unterminated string starting at", pystr, begin);
  431. goto bail;
  432. }
  433. c = buf[next];
  434. if (c != 'u') {
  435. /* Non-unicode backslash escapes */
  436. end = next + 1;
  437. switch (c) {
  438. case '"': break;
  439. case '\\': break;
  440. case '/': break;
  441. case 'b': c = '\b'; break;
  442. case 'f': c = '\f'; break;
  443. case 'n': c = '\n'; break;
  444. case 'r': c = '\r'; break;
  445. case 't': c = '\t'; break;
  446. default: c = 0;
  447. }
  448. if (c == 0) {
  449. raise_errmsg("Invalid \\escape", pystr, end - 2);
  450. goto bail;
  451. }
  452. }
  453. else {
  454. c = 0;
  455. next++;
  456. end = next + 4;
  457. if (end >= len) {
  458. raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
  459. goto bail;
  460. }
  461. /* Decode 4 hex digits */
  462. for (; next < end; next++) {
  463. Py_ssize_t shl = (end - next - 1) << 2;
  464. Py_UNICODE digit = buf[next];
  465. switch (digit) {
  466. case '0': case '1': case '2': case '3': case '4':
  467. case '5': case '6': case '7': case '8': case '9':
  468. c |= (digit - '0') << shl; break;
  469. case 'a': case 'b': case 'c': case 'd': case 'e':
  470. case 'f':
  471. c |= (digit - 'a' + 10) << shl; break;
  472. case 'A': case 'B': case 'C': case 'D': case 'E':
  473. case 'F':
  474. c |= (digit - 'A' + 10) << shl; break;
  475. default:
  476. raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
  477. goto bail;
  478. }
  479. }
  480. #ifdef Py_UNICODE_WIDE
  481. /* Surrogate pair */
  482. if (c >= 0xd800 && c <= 0xdbff) {
  483. Py_UNICODE c2 = 0;
  484. if (end + 6 >= len) {
  485. raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
  486. end - 5);
  487. }
  488. if (buf[next++] != '\\' || buf[next++] != 'u') {
  489. raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
  490. end - 5);
  491. }
  492. end += 6;
  493. /* Decode 4 hex digits */
  494. for (; next < end; next++) {
  495. Py_ssize_t shl = (end - next - 1) << 2;
  496. Py_UNICODE digit = buf[next];
  497. switch (digit) {
  498. case '0': case '1': case '2': case '3': case '4':
  499. case '5': case '6': case '7': case '8': case '9':
  500. c2 |= (digit - '0') << shl; break;
  501. case 'a': case 'b': case 'c': case 'd': case 'e':
  502. case 'f':
  503. c2 |= (digit - 'a' + 10) << shl; break;
  504. case 'A': case 'B': case 'C': case 'D': case 'E':
  505. case 'F':
  506. c2 |= (digit - 'A' + 10) << shl; break;
  507. default:
  508. raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
  509. goto bail;
  510. }
  511. }
  512. c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
  513. }
  514. #endif
  515. }
  516. chunk = PyUnicode_FromUnicode(&c, 1);
  517. if (chunk == NULL) {
  518. goto bail;
  519. }
  520. if (PyList_Append(chunks, chunk)) {
  521. Py_DECREF(chunk);
  522. goto bail;
  523. }
  524. Py_DECREF(chunk);
  525. }
  526. rval = join_list_unicode(chunks);
  527. if (rval == NULL) {
  528. goto bail;
  529. }
  530. Py_CLEAR(chunks);
  531. return Py_BuildValue("(Nn)", rval, end);
  532. bail:
  533. Py_XDECREF(chunks);
  534. return NULL;
  535. }
  536. PyDoc_STRVAR(pydoc_scanstring,
  537. "scanstring(basestring, end, encoding) -> (str, end)\n");
  538. static PyObject *
  539. py_scanstring(PyObject* self, PyObject *args)
  540. {
  541. PyObject *pystr;
  542. Py_ssize_t end;
  543. char *encoding = NULL;
  544. int strict = 0;
  545. if (!PyArg_ParseTuple(args, "On|zi:scanstring", &pystr, &end, &encoding, &strict)) {
  546. return NULL;
  547. }
  548. if (encoding == NULL) {
  549. encoding = DEFAULT_ENCODING;
  550. }
  551. if (PyString_Check(pystr)) {
  552. return scanstring_str(pystr, end, encoding, strict);
  553. }
  554. else if (PyUnicode_Check(pystr)) {
  555. return scanstring_unicode(pystr, end, strict);
  556. }
  557. else {
  558. PyErr_Format(PyExc_TypeError,
  559. "first argument must be a string or unicode, not %.80s",
  560. Py_TYPE(pystr)->tp_name);
  561. return NULL;
  562. }
  563. }
  564. PyDoc_STRVAR(pydoc_encode_basestring_ascii,
  565. "encode_basestring_ascii(basestring) -> str\n");
  566. static PyObject *
  567. py_encode_basestring_ascii(PyObject* self, PyObject *pystr)
  568. {
  569. /* METH_O */
  570. if (PyString_Check(pystr)) {
  571. return ascii_escape_str(pystr);
  572. }
  573. else if (PyUnicode_Check(pystr)) {
  574. return ascii_escape_unicode(pystr);
  575. }
  576. else {
  577. PyErr_Format(PyExc_TypeError,
  578. "first argument must be a string or unicode, not %.80s",
  579. Py_TYPE(pystr)->tp_name);
  580. return NULL;
  581. }
  582. }
  583. static PyMethodDef json_methods[] = {
  584. {"encode_basestring_ascii", (PyCFunction)py_encode_basestring_ascii,
  585. METH_O, pydoc_encode_basestring_ascii},
  586. {"scanstring", (PyCFunction)py_scanstring, METH_VARARGS,
  587. pydoc_scanstring},
  588. {NULL, NULL, 0, NULL}
  589. };
  590. PyDoc_STRVAR(module_doc,
  591. "json speedups\n");
  592. void
  593. init_json(void)
  594. {
  595. PyObject *m;
  596. m = Py_InitModule3("_json", json_methods, module_doc);
  597. }