/src/libyaml/reader.c

https://code.google.com/ · C · 367 lines · 203 code · 74 blank · 90 comment · 79 complexity · a7f073ee4eed7197686c386178fb97dd MD5 · raw file

  1. #include "yaml_private.h"
  2. /*
  3. * Declarations.
  4. */
  5. static int
  6. yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
  7. size_t offset, int value);
  8. static int
  9. yaml_parser_update_raw_buffer(yaml_parser_t *parser);
  10. static int
  11. yaml_parser_determine_encoding(yaml_parser_t *parser);
  12. YAML_DECLARE(int)
  13. yaml_parser_update_buffer(yaml_parser_t *parser, size_t length);
  14. /*
  15. * Set the reader error and return 0.
  16. */
  17. static int
  18. yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
  19. size_t offset, int value)
  20. {
  21. parser->error = YAML_READER_ERROR;
  22. parser->problem = problem;
  23. parser->problem_offset = offset;
  24. parser->problem_value = value;
  25. return 0;
  26. }
  27. /*
  28. * Byte order marks.
  29. */
  30. #define BOM_UTF8 "\xef\xbb\xbf"
  31. #define BOM_UTF16LE "\xff\xfe"
  32. #define BOM_UTF16BE "\xfe\xff"
  33. /*
  34. * Determine the input stream encoding by checking the BOM symbol. If no BOM is
  35. * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
  36. */
  37. static int
  38. yaml_parser_determine_encoding(yaml_parser_t *parser)
  39. {
  40. /* Ensure that we had enough bytes in the raw buffer. */
  41. while (!parser->eof
  42. && parser->raw_buffer.last - parser->raw_buffer.pointer < 3) {
  43. if (!yaml_parser_update_raw_buffer(parser)) {
  44. return 0;
  45. }
  46. }
  47. /* Determine the encoding. */
  48. if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
  49. && !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) {
  50. parser->encoding = YAML_UTF16LE_ENCODING;
  51. parser->raw_buffer.pointer += 2;
  52. parser->offset += 2;
  53. }
  54. else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
  55. && !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) {
  56. parser->encoding = YAML_UTF16BE_ENCODING;
  57. parser->raw_buffer.pointer += 2;
  58. parser->offset += 2;
  59. }
  60. else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3
  61. && !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) {
  62. parser->encoding = YAML_UTF8_ENCODING;
  63. parser->raw_buffer.pointer += 3;
  64. parser->offset += 3;
  65. }
  66. else {
  67. parser->encoding = YAML_UTF8_ENCODING;
  68. }
  69. return 1;
  70. }
  71. /*
  72. * Update the raw buffer.
  73. */
  74. static int
  75. yaml_parser_update_raw_buffer(yaml_parser_t *parser)
  76. {
  77. size_t size_read = 0;
  78. /* Return if the raw buffer is full. */
  79. if (parser->raw_buffer.start == parser->raw_buffer.pointer
  80. && parser->raw_buffer.last == parser->raw_buffer.end)
  81. return 1;
  82. /* Return on EOF. */
  83. if (parser->eof) return 1;
  84. /* Move the remaining bytes in the raw buffer to the beginning. */
  85. if (parser->raw_buffer.start < parser->raw_buffer.pointer
  86. && parser->raw_buffer.pointer < parser->raw_buffer.last) {
  87. memmove(parser->raw_buffer.start, parser->raw_buffer.pointer,
  88. parser->raw_buffer.last - parser->raw_buffer.pointer);
  89. }
  90. parser->raw_buffer.last -=
  91. parser->raw_buffer.pointer - parser->raw_buffer.start;
  92. parser->raw_buffer.pointer = parser->raw_buffer.start;
  93. /* Call the read handler to fill the buffer. */
  94. if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last,
  95. parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) {
  96. return yaml_parser_set_reader_error(parser, "input error",
  97. parser->offset, -1);
  98. }
  99. parser->raw_buffer.last += size_read;
  100. if (!size_read) {
  101. parser->eof = 1;
  102. }
  103. return 1;
  104. }
  105. /*
  106. * Ensure that the buffer contains at least `length` characters.
  107. * Return 1 on success, 0 on failure.
  108. *
  109. * The length is supposed to be significantly less that the buffer size.
  110. */
  111. YAML_DECLARE(int)
  112. yaml_parser_update_buffer(yaml_parser_t *parser, size_t length)
  113. {
  114. int first = 1;
  115. assert(parser->read_handler); /* Read handler must be set. */
  116. /* If the EOF flag is set and the raw buffer is empty, do nothing. */
  117. if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last)
  118. return 1;
  119. /* Return if the buffer contains enough characters. */
  120. if (parser->unread >= length)
  121. return 1;
  122. /* Determine the input encoding if it is not known yet. */
  123. if (!parser->encoding) {
  124. if (!yaml_parser_determine_encoding(parser))
  125. return 0;
  126. }
  127. /* Move the unread characters to the beginning of the buffer. */
  128. if (parser->buffer.start < parser->buffer.pointer
  129. && parser->buffer.pointer < parser->buffer.last) {
  130. size_t size = parser->buffer.last - parser->buffer.pointer;
  131. memmove(parser->buffer.start, parser->buffer.pointer, size);
  132. parser->buffer.pointer = parser->buffer.start;
  133. parser->buffer.last = parser->buffer.start + size;
  134. }
  135. else if (parser->buffer.pointer == parser->buffer.last) {
  136. parser->buffer.pointer = parser->buffer.start;
  137. parser->buffer.last = parser->buffer.start;
  138. }
  139. /* Fill the buffer until it has enough characters. */
  140. while (parser->unread < length)
  141. {
  142. /* Fill the raw buffer if necessary. */
  143. if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) {
  144. if (!yaml_parser_update_raw_buffer(parser)) return 0;
  145. }
  146. first = 0;
  147. /* Decode the raw buffer. */
  148. while (parser->raw_buffer.pointer != parser->raw_buffer.last)
  149. {
  150. unsigned int value = 0, value2 = 0;
  151. unsigned char octet;
  152. unsigned int width = 0;
  153. int low, high;
  154. size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer;
  155. /* Decode the next character. */
  156. switch (parser->encoding)
  157. {
  158. case YAML_UTF8_ENCODING:
  159. octet = parser->raw_buffer.pointer[0];
  160. /*
  161. We can only disallow characters without the high
  162. bit set. Characters with the high bit set are required
  163. for invalid UTF-8 strings to be encoded, as we
  164. cannot rely on any backslash sequences working.
  165. */
  166. if (! (octet == 0x09 || octet == 0x0A || octet == 0x0D
  167. || (octet >= 0x20 && octet != 0x7F) ))
  168. return yaml_parser_set_reader_error(parser,
  169. "Control characters are not allowed",
  170. parser->offset, value);
  171. parser->raw_buffer.pointer++;
  172. parser->offset++;
  173. *(parser->buffer.last++) = octet;
  174. break;
  175. case YAML_UTF16LE_ENCODING:
  176. case YAML_UTF16BE_ENCODING:
  177. low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1);
  178. high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0);
  179. /*
  180. * The UTF-16 encoding is not as simple as one might
  181. * naively think. Check RFC 2781
  182. * (http://www.ietf.org/rfc/rfc2781.txt).
  183. *
  184. * Normally, two subsequent bytes describe a Unicode
  185. * character. However a special technique (called a
  186. * surrogate pair) is used for specifying character
  187. * values larger than 0xFFFF.
  188. *
  189. * A surrogate pair consists of two pseudo-characters:
  190. * high surrogate area (0xD800-0xDBFF)
  191. * low surrogate area (0xDC00-0xDFFF)
  192. *
  193. * The following formulas are used for decoding
  194. * and encoding characters using surrogate pairs:
  195. *
  196. * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF)
  197. * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF)
  198. * W1 = 110110yyyyyyyyyy
  199. * W2 = 110111xxxxxxxxxx
  200. *
  201. * where U is the character value, W1 is the high surrogate
  202. * area, W2 is the low surrogate area.
  203. */
  204. /* Check for incomplete UTF-16 character. */
  205. if (raw_unread < 2) {
  206. if (parser->eof) {
  207. return yaml_parser_set_reader_error(parser,
  208. "incomplete UTF-16 character",
  209. parser->offset, -1);
  210. }
  211. break;
  212. }
  213. /* Get the character. */
  214. value = parser->raw_buffer.pointer[low]
  215. + (parser->raw_buffer.pointer[high] << 8);
  216. width = 2;
  217. /* Check for a high surrogate area. */
  218. if ((value & 0xFC00) == 0xD800) {
  219. /* Check for incomplete surrogate pair. */
  220. if (raw_unread < 4) {
  221. if (parser->eof) { /* trailing high surrogate */
  222. width = 2;
  223. } else {
  224. break; /* Can't tell until we have more raw characters */
  225. }
  226. } else {
  227. /* Get the next character. */
  228. value2 = parser->raw_buffer.pointer[low+2]
  229. + (parser->raw_buffer.pointer[high+2] << 8);
  230. /* Check for a low surrogate area. */
  231. if ((value2 & 0xFC00) == 0xDC00) {
  232. width = 4;
  233. /* Generate the value of the surrogate pair. */
  234. value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
  235. }
  236. }
  237. }
  238. /* Check if the raw buffer contains enough bytes to form a character. */
  239. /*
  240. * Check if the character is in the allowed range:
  241. * #x9 | #xA | #xD | [#x20-#x7E] (8 bit)
  242. * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit)
  243. * | [#x10000-#x10FFFF] (32 bit)
  244. */
  245. /* Modified to allow all 16-bit values as \uNNNN may not
  246. work for some parsers for these values. */
  247. if (! (value == 0x09 || value == 0x0A || value == 0x0D
  248. || (value >= 0x20 && value <= 0x7E)
  249. || (value == 0x85) || value >= 0xA0))
  250. return yaml_parser_set_reader_error(parser,
  251. "Control characters are not allowed",
  252. parser->offset, value);
  253. /* Move the raw pointers. */
  254. parser->raw_buffer.pointer += width;
  255. parser->offset += width;
  256. /* Finally put the character into the buffer. */
  257. /* 0000 0000-0000 007F -> 0xxxxxxx */
  258. if (value <= 0x7F) {
  259. *(parser->buffer.last++) = value;
  260. }
  261. /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
  262. else if (value <= 0x7FF) {
  263. *(parser->buffer.last++) = 0xC0 + (value >> 6);
  264. *(parser->buffer.last++) = 0x80 + (value & 0x3F);
  265. }
  266. /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
  267. else if (value <= 0xFFFF) {
  268. *(parser->buffer.last++) = 0xE0 + (value >> 12);
  269. *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
  270. *(parser->buffer.last++) = 0x80 + (value & 0x3F);
  271. }
  272. /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  273. else {
  274. *(parser->buffer.last++) = 0xF0 + (value >> 18);
  275. *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F);
  276. *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
  277. *(parser->buffer.last++) = 0x80 + (value & 0x3F);
  278. }
  279. break;
  280. default:
  281. assert(1); /* Impossible. */
  282. }
  283. parser->unread ++;
  284. }
  285. /* On EOF, put NUL into the buffer and return. */
  286. if (parser->eof) {
  287. *(parser->buffer.last++) = '\0';
  288. parser->unread ++;
  289. return 1;
  290. }
  291. }
  292. return 1;
  293. }