/jansson/src/load.c

http://github.com/nicolasff/webdis · C · 885 lines · 778 code · 85 blank · 22 comment · 111 complexity · 8f07c2a0caaffca1507a9a2a8ac6c2fc MD5 · raw file

  1. /*
  2. * Copyright (c) 2009, 2010 Petri Lehtinen <petri@digip.org>
  3. *
  4. * Jansson is free software; you can redistribute it and/or modify
  5. * it under the terms of the MIT license. See LICENSE for details.
  6. */
  7. #define _GNU_SOURCE
  8. #include <ctype.h>
  9. #include <errno.h>
  10. #include <limits.h>
  11. #include <stdio.h>
  12. #include <stdlib.h>
  13. #include <string.h>
  14. #include <stdarg.h>
  15. #include <assert.h>
  16. #include <jansson.h>
  17. #include "jansson_private.h"
  18. #include "strbuffer.h"
  19. #include "utf.h"
  20. #define TOKEN_INVALID -1
  21. #define TOKEN_EOF 0
  22. #define TOKEN_STRING 256
  23. #define TOKEN_INTEGER 257
  24. #define TOKEN_REAL 258
  25. #define TOKEN_TRUE 259
  26. #define TOKEN_FALSE 260
  27. #define TOKEN_NULL 261
  28. /* read one byte from stream, return EOF on end of file */
  29. typedef int (*get_func)(void *data);
  30. /* return non-zero if end of file has been reached */
  31. typedef int (*eof_func)(void *data);
  32. typedef struct {
  33. get_func get;
  34. eof_func eof;
  35. void *data;
  36. int stream_pos;
  37. char buffer[5];
  38. int buffer_pos;
  39. } stream_t;
  40. typedef struct {
  41. stream_t stream;
  42. strbuffer_t saved_text;
  43. int token;
  44. int line, column;
  45. union {
  46. char *string;
  47. json_int_t integer;
  48. double real;
  49. } value;
  50. } lex_t;
  51. /*** error reporting ***/
  52. static void error_set(json_error_t *error, const lex_t *lex,
  53. const char *msg, ...)
  54. {
  55. va_list ap;
  56. char msg_text[JSON_ERROR_TEXT_LENGTH];
  57. int line = -1, col = -1;
  58. const char *result = msg_text;
  59. if(!error)
  60. return;
  61. va_start(ap, msg);
  62. vsnprintf(msg_text, JSON_ERROR_TEXT_LENGTH, msg, ap);
  63. va_end(ap);
  64. if(lex)
  65. {
  66. const char *saved_text = strbuffer_value(&lex->saved_text);
  67. char msg_with_context[JSON_ERROR_TEXT_LENGTH];
  68. line = lex->line;
  69. if(saved_text && saved_text[0])
  70. {
  71. if(lex->saved_text.length <= 20) {
  72. snprintf(msg_with_context, JSON_ERROR_TEXT_LENGTH,
  73. "%s near '%s'", msg_text, saved_text);
  74. result = msg_with_context;
  75. }
  76. }
  77. else
  78. {
  79. snprintf(msg_with_context, JSON_ERROR_TEXT_LENGTH,
  80. "%s near end of file", msg_text);
  81. result = msg_with_context;
  82. }
  83. }
  84. jsonp_error_set(error, line, col, "%s", result);
  85. }
  86. /*** lexical analyzer ***/
  87. static void
  88. stream_init(stream_t *stream, get_func get, eof_func eof, void *data)
  89. {
  90. stream->get = get;
  91. stream->eof = eof;
  92. stream->data = data;
  93. stream->stream_pos = 0;
  94. stream->buffer[0] = '\0';
  95. stream->buffer_pos = 0;
  96. }
  97. static char stream_get(stream_t *stream, json_error_t *error)
  98. {
  99. char c;
  100. if(!stream->buffer[stream->buffer_pos])
  101. {
  102. stream->buffer[0] = stream->get(stream->data);
  103. stream->buffer_pos = 0;
  104. c = stream->buffer[0];
  105. if((unsigned char)c >= 0x80 && c != (char)EOF)
  106. {
  107. /* multi-byte UTF-8 sequence */
  108. int i, count;
  109. count = utf8_check_first(c);
  110. if(!count)
  111. goto out;
  112. assert(count >= 2);
  113. for(i = 1; i < count; i++)
  114. stream->buffer[i] = stream->get(stream->data);
  115. if(!utf8_check_full(stream->buffer, count, NULL))
  116. goto out;
  117. stream->stream_pos += count;
  118. stream->buffer[count] = '\0';
  119. }
  120. else {
  121. stream->buffer[1] = '\0';
  122. stream->stream_pos++;
  123. }
  124. }
  125. return stream->buffer[stream->buffer_pos++];
  126. out:
  127. error_set(error, NULL, "unable to decode byte 0x%x at position %d",
  128. (unsigned char)c, stream->stream_pos);
  129. stream->buffer[0] = EOF;
  130. stream->buffer[1] = '\0';
  131. stream->buffer_pos = 1;
  132. return EOF;
  133. }
  134. static void stream_unget(stream_t *stream, char c)
  135. {
  136. assert(stream->buffer_pos > 0);
  137. stream->buffer_pos--;
  138. assert(stream->buffer[stream->buffer_pos] == c);
  139. }
  140. static int lex_get(lex_t *lex, json_error_t *error)
  141. {
  142. return stream_get(&lex->stream, error);
  143. }
  144. static int lex_eof(lex_t *lex)
  145. {
  146. return lex->stream.eof(lex->stream.data);
  147. }
  148. static void lex_save(lex_t *lex, char c)
  149. {
  150. strbuffer_append_byte(&lex->saved_text, c);
  151. }
  152. static int lex_get_save(lex_t *lex, json_error_t *error)
  153. {
  154. char c = stream_get(&lex->stream, error);
  155. lex_save(lex, c);
  156. return c;
  157. }
  158. static void lex_unget_unsave(lex_t *lex, char c)
  159. {
  160. char d;
  161. stream_unget(&lex->stream, c);
  162. d = strbuffer_pop(&lex->saved_text);
  163. assert(c == d);
  164. }
  165. static void lex_save_cached(lex_t *lex)
  166. {
  167. while(lex->stream.buffer[lex->stream.buffer_pos] != '\0')
  168. {
  169. lex_save(lex, lex->stream.buffer[lex->stream.buffer_pos]);
  170. lex->stream.buffer_pos++;
  171. }
  172. }
  173. /* assumes that str points to 'u' plus at least 4 valid hex digits */
  174. static int32_t decode_unicode_escape(const char *str)
  175. {
  176. int i;
  177. int32_t value = 0;
  178. assert(str[0] == 'u');
  179. for(i = 1; i <= 4; i++) {
  180. char c = str[i];
  181. value <<= 4;
  182. if(isdigit(c))
  183. value += c - '0';
  184. else if(islower(c))
  185. value += c - 'a' + 10;
  186. else if(isupper(c))
  187. value += c - 'A' + 10;
  188. else
  189. assert(0);
  190. }
  191. return value;
  192. }
  193. static void lex_scan_string(lex_t *lex, json_error_t *error)
  194. {
  195. char c;
  196. const char *p;
  197. char *t;
  198. int i;
  199. lex->value.string = NULL;
  200. lex->token = TOKEN_INVALID;
  201. c = lex_get_save(lex, error);
  202. while(c != '"') {
  203. if(c == (char)EOF) {
  204. lex_unget_unsave(lex, c);
  205. if(lex_eof(lex))
  206. error_set(error, lex, "premature end of input");
  207. goto out;
  208. }
  209. else if((unsigned char)c <= 0x1F) {
  210. /* control character */
  211. lex_unget_unsave(lex, c);
  212. if(c == '\n')
  213. error_set(error, lex, "unexpected newline", c);
  214. else
  215. error_set(error, lex, "control character 0x%x", c);
  216. goto out;
  217. }
  218. else if(c == '\\') {
  219. c = lex_get_save(lex, error);
  220. if(c == 'u') {
  221. c = lex_get_save(lex, error);
  222. for(i = 0; i < 4; i++) {
  223. if(!isxdigit(c)) {
  224. lex_unget_unsave(lex, c);
  225. error_set(error, lex, "invalid escape");
  226. goto out;
  227. }
  228. c = lex_get_save(lex, error);
  229. }
  230. }
  231. else if(c == '"' || c == '\\' || c == '/' || c == 'b' ||
  232. c == 'f' || c == 'n' || c == 'r' || c == 't')
  233. c = lex_get_save(lex, error);
  234. else {
  235. lex_unget_unsave(lex, c);
  236. error_set(error, lex, "invalid escape");
  237. goto out;
  238. }
  239. }
  240. else
  241. c = lex_get_save(lex, error);
  242. }
  243. /* the actual value is at most of the same length as the source
  244. string, because:
  245. - shortcut escapes (e.g. "\t") (length 2) are converted to 1 byte
  246. - a single \uXXXX escape (length 6) is converted to at most 3 bytes
  247. - two \uXXXX escapes (length 12) forming an UTF-16 surrogate pair
  248. are converted to 4 bytes
  249. */
  250. lex->value.string = malloc(lex->saved_text.length + 1);
  251. if(!lex->value.string) {
  252. /* this is not very nice, since TOKEN_INVALID is returned */
  253. goto out;
  254. }
  255. /* the target */
  256. t = lex->value.string;
  257. /* + 1 to skip the " */
  258. p = strbuffer_value(&lex->saved_text) + 1;
  259. while(*p != '"') {
  260. if(*p == '\\') {
  261. p++;
  262. if(*p == 'u') {
  263. char buffer[4];
  264. int length;
  265. int32_t value;
  266. value = decode_unicode_escape(p);
  267. p += 5;
  268. if(0xD800 <= value && value <= 0xDBFF) {
  269. /* surrogate pair */
  270. if(*p == '\\' && *(p + 1) == 'u') {
  271. int32_t value2 = decode_unicode_escape(++p);
  272. p += 5;
  273. if(0xDC00 <= value2 && value2 <= 0xDFFF) {
  274. /* valid second surrogate */
  275. value =
  276. ((value - 0xD800) << 10) +
  277. (value2 - 0xDC00) +
  278. 0x10000;
  279. }
  280. else {
  281. /* invalid second surrogate */
  282. error_set(error, lex,
  283. "invalid Unicode '\\u%04X\\u%04X'",
  284. value, value2);
  285. goto out;
  286. }
  287. }
  288. else {
  289. /* no second surrogate */
  290. error_set(error, lex, "invalid Unicode '\\u%04X'",
  291. value);
  292. goto out;
  293. }
  294. }
  295. else if(0xDC00 <= value && value <= 0xDFFF) {
  296. error_set(error, lex, "invalid Unicode '\\u%04X'", value);
  297. goto out;
  298. }
  299. else if(value == 0)
  300. {
  301. error_set(error, lex, "\\u0000 is not allowed");
  302. goto out;
  303. }
  304. if(utf8_encode(value, buffer, &length))
  305. assert(0);
  306. memcpy(t, buffer, length);
  307. t += length;
  308. }
  309. else {
  310. switch(*p) {
  311. case '"': case '\\': case '/':
  312. *t = *p; break;
  313. case 'b': *t = '\b'; break;
  314. case 'f': *t = '\f'; break;
  315. case 'n': *t = '\n'; break;
  316. case 'r': *t = '\r'; break;
  317. case 't': *t = '\t'; break;
  318. default: assert(0);
  319. }
  320. t++;
  321. p++;
  322. }
  323. }
  324. else
  325. *(t++) = *(p++);
  326. }
  327. *t = '\0';
  328. lex->token = TOKEN_STRING;
  329. return;
  330. out:
  331. free(lex->value.string);
  332. }
  333. #if JSON_INTEGER_IS_LONG_LONG
  334. #define json_strtoint strtoll
  335. #else
  336. #define json_strtoint strtol
  337. #endif
  338. static int lex_scan_number(lex_t *lex, char c, json_error_t *error)
  339. {
  340. const char *saved_text;
  341. char *end;
  342. double value;
  343. lex->token = TOKEN_INVALID;
  344. if(c == '-')
  345. c = lex_get_save(lex, error);
  346. if(c == '0') {
  347. c = lex_get_save(lex, error);
  348. if(isdigit(c)) {
  349. lex_unget_unsave(lex, c);
  350. goto out;
  351. }
  352. }
  353. else if(isdigit(c)) {
  354. c = lex_get_save(lex, error);
  355. while(isdigit(c))
  356. c = lex_get_save(lex, error);
  357. }
  358. else {
  359. lex_unget_unsave(lex, c);
  360. goto out;
  361. }
  362. if(c != '.' && c != 'E' && c != 'e') {
  363. json_int_t value;
  364. lex_unget_unsave(lex, c);
  365. saved_text = strbuffer_value(&lex->saved_text);
  366. errno = 0;
  367. value = json_strtoint(saved_text, &end, 10);
  368. if(errno == ERANGE) {
  369. if(value < 0)
  370. error_set(error, lex, "too big negative integer");
  371. else
  372. error_set(error, lex, "too big integer");
  373. goto out;
  374. }
  375. assert(end == saved_text + lex->saved_text.length);
  376. lex->token = TOKEN_INTEGER;
  377. lex->value.integer = value;
  378. return 0;
  379. }
  380. if(c == '.') {
  381. c = lex_get(lex, error);
  382. if(!isdigit(c))
  383. goto out;
  384. lex_save(lex, c);
  385. c = lex_get_save(lex, error);
  386. while(isdigit(c))
  387. c = lex_get_save(lex, error);
  388. }
  389. if(c == 'E' || c == 'e') {
  390. c = lex_get_save(lex, error);
  391. if(c == '+' || c == '-')
  392. c = lex_get_save(lex, error);
  393. if(!isdigit(c)) {
  394. lex_unget_unsave(lex, c);
  395. goto out;
  396. }
  397. c = lex_get_save(lex, error);
  398. while(isdigit(c))
  399. c = lex_get_save(lex, error);
  400. }
  401. lex_unget_unsave(lex, c);
  402. saved_text = strbuffer_value(&lex->saved_text);
  403. value = strtod(saved_text, &end);
  404. assert(end == saved_text + lex->saved_text.length);
  405. if(errno == ERANGE && value != 0) {
  406. error_set(error, lex, "real number overflow");
  407. goto out;
  408. }
  409. lex->token = TOKEN_REAL;
  410. lex->value.real = value;
  411. return 0;
  412. out:
  413. return -1;
  414. }
  415. static int lex_scan(lex_t *lex, json_error_t *error)
  416. {
  417. char c;
  418. strbuffer_clear(&lex->saved_text);
  419. if(lex->token == TOKEN_STRING) {
  420. free(lex->value.string);
  421. lex->value.string = NULL;
  422. }
  423. c = lex_get(lex, error);
  424. while(c == ' ' || c == '\t' || c == '\n' || c == '\r')
  425. {
  426. if(c == '\n')
  427. lex->line++;
  428. c = lex_get(lex, error);
  429. }
  430. if(c == (char)EOF) {
  431. if(lex_eof(lex))
  432. lex->token = TOKEN_EOF;
  433. else
  434. lex->token = TOKEN_INVALID;
  435. goto out;
  436. }
  437. lex_save(lex, c);
  438. if(c == '{' || c == '}' || c == '[' || c == ']' || c == ':' || c == ',')
  439. lex->token = c;
  440. else if(c == '"')
  441. lex_scan_string(lex, error);
  442. else if(isdigit(c) || c == '-') {
  443. if(lex_scan_number(lex, c, error))
  444. goto out;
  445. }
  446. else if(isupper(c) || islower(c)) {
  447. /* eat up the whole identifier for clearer error messages */
  448. const char *saved_text;
  449. c = lex_get_save(lex, error);
  450. while(isupper(c) || islower(c))
  451. c = lex_get_save(lex, error);
  452. lex_unget_unsave(lex, c);
  453. saved_text = strbuffer_value(&lex->saved_text);
  454. if(strcmp(saved_text, "true") == 0)
  455. lex->token = TOKEN_TRUE;
  456. else if(strcmp(saved_text, "false") == 0)
  457. lex->token = TOKEN_FALSE;
  458. else if(strcmp(saved_text, "null") == 0)
  459. lex->token = TOKEN_NULL;
  460. else
  461. lex->token = TOKEN_INVALID;
  462. }
  463. else {
  464. /* save the rest of the input UTF-8 sequence to get an error
  465. message of valid UTF-8 */
  466. lex_save_cached(lex);
  467. lex->token = TOKEN_INVALID;
  468. }
  469. out:
  470. return lex->token;
  471. }
  472. static char *lex_steal_string(lex_t *lex)
  473. {
  474. char *result = NULL;
  475. if(lex->token == TOKEN_STRING)
  476. {
  477. result = lex->value.string;
  478. lex->value.string = NULL;
  479. }
  480. return result;
  481. }
  482. static int lex_init(lex_t *lex, get_func get, eof_func eof, void *data)
  483. {
  484. stream_init(&lex->stream, get, eof, data);
  485. if(strbuffer_init(&lex->saved_text))
  486. return -1;
  487. lex->token = TOKEN_INVALID;
  488. lex->line = 1;
  489. return 0;
  490. }
  491. static void lex_close(lex_t *lex)
  492. {
  493. if(lex->token == TOKEN_STRING)
  494. free(lex->value.string);
  495. strbuffer_close(&lex->saved_text);
  496. }
  497. /*** parser ***/
  498. static json_t *parse_value(lex_t *lex, json_error_t *error);
  499. static json_t *parse_object(lex_t *lex, json_error_t *error)
  500. {
  501. json_t *object = json_object();
  502. if(!object)
  503. return NULL;
  504. lex_scan(lex, error);
  505. if(lex->token == '}')
  506. return object;
  507. while(1) {
  508. char *key;
  509. json_t *value;
  510. if(lex->token != TOKEN_STRING) {
  511. error_set(error, lex, "string or '}' expected");
  512. goto error;
  513. }
  514. key = lex_steal_string(lex);
  515. if(!key)
  516. return NULL;
  517. lex_scan(lex, error);
  518. if(lex->token != ':') {
  519. free(key);
  520. error_set(error, lex, "':' expected");
  521. goto error;
  522. }
  523. lex_scan(lex, error);
  524. value = parse_value(lex, error);
  525. if(!value) {
  526. free(key);
  527. goto error;
  528. }
  529. if(json_object_set_nocheck(object, key, value)) {
  530. free(key);
  531. json_decref(value);
  532. goto error;
  533. }
  534. json_decref(value);
  535. free(key);
  536. lex_scan(lex, error);
  537. if(lex->token != ',')
  538. break;
  539. lex_scan(lex, error);
  540. }
  541. if(lex->token != '}') {
  542. error_set(error, lex, "'}' expected");
  543. goto error;
  544. }
  545. return object;
  546. error:
  547. json_decref(object);
  548. return NULL;
  549. }
  550. static json_t *parse_array(lex_t *lex, json_error_t *error)
  551. {
  552. json_t *array = json_array();
  553. if(!array)
  554. return NULL;
  555. lex_scan(lex, error);
  556. if(lex->token == ']')
  557. return array;
  558. while(lex->token) {
  559. json_t *elem = parse_value(lex, error);
  560. if(!elem)
  561. goto error;
  562. if(json_array_append(array, elem)) {
  563. json_decref(elem);
  564. goto error;
  565. }
  566. json_decref(elem);
  567. lex_scan(lex, error);
  568. if(lex->token != ',')
  569. break;
  570. lex_scan(lex, error);
  571. }
  572. if(lex->token != ']') {
  573. error_set(error, lex, "']' expected");
  574. goto error;
  575. }
  576. return array;
  577. error:
  578. json_decref(array);
  579. return NULL;
  580. }
  581. static json_t *parse_value(lex_t *lex, json_error_t *error)
  582. {
  583. json_t *json;
  584. switch(lex->token) {
  585. case TOKEN_STRING: {
  586. json = json_string_nocheck(lex->value.string);
  587. break;
  588. }
  589. case TOKEN_INTEGER: {
  590. json = json_integer(lex->value.integer);
  591. break;
  592. }
  593. case TOKEN_REAL: {
  594. json = json_real(lex->value.real);
  595. break;
  596. }
  597. case TOKEN_TRUE:
  598. json = json_true();
  599. break;
  600. case TOKEN_FALSE:
  601. json = json_false();
  602. break;
  603. case TOKEN_NULL:
  604. json = json_null();
  605. break;
  606. case '{':
  607. json = parse_object(lex, error);
  608. break;
  609. case '[':
  610. json = parse_array(lex, error);
  611. break;
  612. case TOKEN_INVALID:
  613. error_set(error, lex, "invalid token");
  614. return NULL;
  615. default:
  616. error_set(error, lex, "unexpected token");
  617. return NULL;
  618. }
  619. if(!json)
  620. return NULL;
  621. return json;
  622. }
  623. static json_t *parse_json(lex_t *lex, json_error_t *error)
  624. {
  625. lex_scan(lex, error);
  626. if(lex->token != '[' && lex->token != '{') {
  627. error_set(error, lex, "'[' or '{' expected");
  628. return NULL;
  629. }
  630. return parse_value(lex, error);
  631. }
  632. typedef struct
  633. {
  634. const char *data;
  635. int pos;
  636. } string_data_t;
  637. static int string_get(void *data)
  638. {
  639. char c;
  640. string_data_t *stream = (string_data_t *)data;
  641. c = stream->data[stream->pos];
  642. if(c == '\0')
  643. return EOF;
  644. else
  645. {
  646. stream->pos++;
  647. return c;
  648. }
  649. }
  650. static int string_eof(void *data)
  651. {
  652. string_data_t *stream = (string_data_t *)data;
  653. return (stream->data[stream->pos] == '\0');
  654. }
  655. json_t *json_loads(const char *string, size_t flags, json_error_t *error)
  656. {
  657. lex_t lex;
  658. json_t *result;
  659. (void)flags; /* unused */
  660. string_data_t stream_data = {string, 0};
  661. if(lex_init(&lex, string_get, string_eof, (void *)&stream_data))
  662. return NULL;
  663. jsonp_error_init(error, "<string>");
  664. result = parse_json(&lex, error);
  665. if(!result)
  666. goto out;
  667. lex_scan(&lex, error);
  668. if(lex.token != TOKEN_EOF) {
  669. error_set(error, &lex, "end of file expected");
  670. json_decref(result);
  671. result = NULL;
  672. }
  673. out:
  674. lex_close(&lex);
  675. return result;
  676. }
  677. json_t *json_loadf(FILE *input, size_t flags, json_error_t *error)
  678. {
  679. lex_t lex;
  680. const char *source;
  681. json_t *result;
  682. (void)flags; /* unused */
  683. if(lex_init(&lex, (get_func)fgetc, (eof_func)feof, input))
  684. return NULL;
  685. if(input == stdin)
  686. source = "<stdin>";
  687. else
  688. source = "<stream>";
  689. jsonp_error_init(error, source);
  690. result = parse_json(&lex, error);
  691. if(!result)
  692. goto out;
  693. lex_scan(&lex, error);
  694. if(lex.token != TOKEN_EOF) {
  695. error_set(error, &lex, "end of file expected");
  696. json_decref(result);
  697. result = NULL;
  698. }
  699. out:
  700. lex_close(&lex);
  701. return result;
  702. }
  703. json_t *json_load_file(const char *path, size_t flags, json_error_t *error)
  704. {
  705. json_t *result;
  706. FILE *fp;
  707. jsonp_error_init(error, path);
  708. fp = fopen(path, "r");
  709. if(!fp)
  710. {
  711. error_set(error, NULL, "unable to open %s: %s",
  712. path, strerror(errno));
  713. return NULL;
  714. }
  715. result = json_loadf(fp, flags, error);
  716. fclose(fp);
  717. return result;
  718. }