/*
 * /src/libyaml/reader.c
 * C | 367 lines | 203 code | 74 blank | 90 comment | 79 complexity | a7f073ee4eed7197686c386178fb97dd MD5 | raw file
 */
1 2#include "yaml_private.h" 3 4/* 5 * Declarations. 6 */ 7 8static int 9yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, 10 size_t offset, int value); 11 12static int 13yaml_parser_update_raw_buffer(yaml_parser_t *parser); 14 15static int 16yaml_parser_determine_encoding(yaml_parser_t *parser); 17 18YAML_DECLARE(int) 19yaml_parser_update_buffer(yaml_parser_t *parser, size_t length); 20 21/* 22 * Set the reader error and return 0. 23 */ 24 25static int 26yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem, 27 size_t offset, int value) 28{ 29 parser->error = YAML_READER_ERROR; 30 parser->problem = problem; 31 parser->problem_offset = offset; 32 parser->problem_value = value; 33 34 return 0; 35} 36 37/* 38 * Byte order marks. 39 */ 40 41#define BOM_UTF8 "\xef\xbb\xbf" 42#define BOM_UTF16LE "\xff\xfe" 43#define BOM_UTF16BE "\xfe\xff" 44 45/* 46 * Determine the input stream encoding by checking the BOM symbol. If no BOM is 47 * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure. 48 */ 49 50static int 51yaml_parser_determine_encoding(yaml_parser_t *parser) 52{ 53 /* Ensure that we had enough bytes in the raw buffer. */ 54 55 while (!parser->eof 56 && parser->raw_buffer.last - parser->raw_buffer.pointer < 3) { 57 if (!yaml_parser_update_raw_buffer(parser)) { 58 return 0; 59 } 60 } 61 62 /* Determine the encoding. 
*/ 63 64 if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 65 && !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) { 66 parser->encoding = YAML_UTF16LE_ENCODING; 67 parser->raw_buffer.pointer += 2; 68 parser->offset += 2; 69 } 70 else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2 71 && !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) { 72 parser->encoding = YAML_UTF16BE_ENCODING; 73 parser->raw_buffer.pointer += 2; 74 parser->offset += 2; 75 } 76 else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3 77 && !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) { 78 parser->encoding = YAML_UTF8_ENCODING; 79 parser->raw_buffer.pointer += 3; 80 parser->offset += 3; 81 } 82 else { 83 parser->encoding = YAML_UTF8_ENCODING; 84 } 85 86 return 1; 87} 88 89/* 90 * Update the raw buffer. 91 */ 92 93static int 94yaml_parser_update_raw_buffer(yaml_parser_t *parser) 95{ 96 size_t size_read = 0; 97 98 /* Return if the raw buffer is full. */ 99 100 if (parser->raw_buffer.start == parser->raw_buffer.pointer 101 && parser->raw_buffer.last == parser->raw_buffer.end) 102 return 1; 103 104 /* Return on EOF. */ 105 106 if (parser->eof) return 1; 107 108 /* Move the remaining bytes in the raw buffer to the beginning. */ 109 110 if (parser->raw_buffer.start < parser->raw_buffer.pointer 111 && parser->raw_buffer.pointer < parser->raw_buffer.last) { 112 memmove(parser->raw_buffer.start, parser->raw_buffer.pointer, 113 parser->raw_buffer.last - parser->raw_buffer.pointer); 114 } 115 parser->raw_buffer.last -= 116 parser->raw_buffer.pointer - parser->raw_buffer.start; 117 parser->raw_buffer.pointer = parser->raw_buffer.start; 118 119 /* Call the read handler to fill the buffer. 
*/ 120 121 if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last, 122 parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) { 123 return yaml_parser_set_reader_error(parser, "input error", 124 parser->offset, -1); 125 } 126 parser->raw_buffer.last += size_read; 127 if (!size_read) { 128 parser->eof = 1; 129 } 130 131 return 1; 132} 133 134/* 135 * Ensure that the buffer contains at least `length` characters. 136 * Return 1 on success, 0 on failure. 137 * 138 * The length is supposed to be significantly less that the buffer size. 139 */ 140 141YAML_DECLARE(int) 142yaml_parser_update_buffer(yaml_parser_t *parser, size_t length) 143{ 144 int first = 1; 145 146 assert(parser->read_handler); /* Read handler must be set. */ 147 148 /* If the EOF flag is set and the raw buffer is empty, do nothing. */ 149 150 if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last) 151 return 1; 152 153 /* Return if the buffer contains enough characters. */ 154 155 if (parser->unread >= length) 156 return 1; 157 158 /* Determine the input encoding if it is not known yet. */ 159 160 if (!parser->encoding) { 161 if (!yaml_parser_determine_encoding(parser)) 162 return 0; 163 } 164 165 /* Move the unread characters to the beginning of the buffer. */ 166 167 if (parser->buffer.start < parser->buffer.pointer 168 && parser->buffer.pointer < parser->buffer.last) { 169 size_t size = parser->buffer.last - parser->buffer.pointer; 170 memmove(parser->buffer.start, parser->buffer.pointer, size); 171 parser->buffer.pointer = parser->buffer.start; 172 parser->buffer.last = parser->buffer.start + size; 173 } 174 else if (parser->buffer.pointer == parser->buffer.last) { 175 parser->buffer.pointer = parser->buffer.start; 176 parser->buffer.last = parser->buffer.start; 177 } 178 179 /* Fill the buffer until it has enough characters. */ 180 181 while (parser->unread < length) 182 { 183 /* Fill the raw buffer if necessary. 
*/ 184 185 if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) { 186 if (!yaml_parser_update_raw_buffer(parser)) return 0; 187 } 188 first = 0; 189 190 /* Decode the raw buffer. */ 191 192 while (parser->raw_buffer.pointer != parser->raw_buffer.last) 193 { 194 unsigned int value = 0, value2 = 0; 195 unsigned char octet; 196 unsigned int width = 0; 197 int low, high; 198 size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer; 199 200 /* Decode the next character. */ 201 202 switch (parser->encoding) 203 { 204 case YAML_UTF8_ENCODING: 205 octet = parser->raw_buffer.pointer[0]; 206 /* 207 We can only disallow characters without the high 208 bit set. Characters with the high bit set are required 209 for invalid UTF-8 strings to be encoded, as we 210 cannot rely on any backslash sequences working. 211 */ 212 if (! (octet == 0x09 || octet == 0x0A || octet == 0x0D 213 || (octet >= 0x20 && octet != 0x7F) )) 214 return yaml_parser_set_reader_error(parser, 215 "Control characters are not allowed", 216 parser->offset, value); 217 218 parser->raw_buffer.pointer++; 219 parser->offset++; 220 *(parser->buffer.last++) = octet; 221 break; 222 223 case YAML_UTF16LE_ENCODING: 224 case YAML_UTF16BE_ENCODING: 225 226 low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1); 227 high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0); 228 229 /* 230 * The UTF-16 encoding is not as simple as one might 231 * naively think. Check RFC 2781 232 * (http://www.ietf.org/rfc/rfc2781.txt). 233 * 234 * Normally, two subsequent bytes describe a Unicode 235 * character. However a special technique (called a 236 * surrogate pair) is used for specifying character 237 * values larger than 0xFFFF. 
238 * 239 * A surrogate pair consists of two pseudo-characters: 240 * high surrogate area (0xD800-0xDBFF) 241 * low surrogate area (0xDC00-0xDFFF) 242 * 243 * The following formulas are used for decoding 244 * and encoding characters using surrogate pairs: 245 * 246 * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF) 247 * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF) 248 * W1 = 110110yyyyyyyyyy 249 * W2 = 110111xxxxxxxxxx 250 * 251 * where U is the character value, W1 is the high surrogate 252 * area, W2 is the low surrogate area. 253 */ 254 255 /* Check for incomplete UTF-16 character. */ 256 257 if (raw_unread < 2) { 258 if (parser->eof) { 259 return yaml_parser_set_reader_error(parser, 260 "incomplete UTF-16 character", 261 parser->offset, -1); 262 } 263 break; 264 } 265 266 /* Get the character. */ 267 268 value = parser->raw_buffer.pointer[low] 269 + (parser->raw_buffer.pointer[high] << 8); 270 width = 2; 271 272 /* Check for a high surrogate area. */ 273 274 if ((value & 0xFC00) == 0xD800) { 275 276 /* Check for incomplete surrogate pair. */ 277 278 if (raw_unread < 4) { 279 if (parser->eof) { /* trailing high surrogate */ 280 width = 2; 281 } else { 282 break; /* Can't tell until we have more raw characters */ 283 } 284 } else { 285 286 /* Get the next character. */ 287 288 value2 = parser->raw_buffer.pointer[low+2] 289 + (parser->raw_buffer.pointer[high+2] << 8); 290 291 /* Check for a low surrogate area. */ 292 if ((value2 & 0xFC00) == 0xDC00) { 293 width = 4; 294 /* Generate the value of the surrogate pair. */ 295 value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF); 296 } 297 } 298 } 299 300 /* Check if the raw buffer contains enough bytes to form a character. 
*/ 301 302 /* 303 * Check if the character is in the allowed range: 304 * #x9 | #xA | #xD | [#x20-#x7E] (8 bit) 305 * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit) 306 * | [#x10000-#x10FFFF] (32 bit) 307 */ 308 /* Modified to allow all 16-bit values as \uNNNN may not 309 work for some parsers for these values. */ 310 if (! (value == 0x09 || value == 0x0A || value == 0x0D 311 || (value >= 0x20 && value <= 0x7E) 312 || (value == 0x85) || value >= 0xA0)) 313 return yaml_parser_set_reader_error(parser, 314 "Control characters are not allowed", 315 parser->offset, value); 316 317 /* Move the raw pointers. */ 318 319 parser->raw_buffer.pointer += width; 320 parser->offset += width; 321 322 /* Finally put the character into the buffer. */ 323 324 /* 0000 0000-0000 007F -> 0xxxxxxx */ 325 if (value <= 0x7F) { 326 *(parser->buffer.last++) = value; 327 } 328 /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ 329 else if (value <= 0x7FF) { 330 *(parser->buffer.last++) = 0xC0 + (value >> 6); 331 *(parser->buffer.last++) = 0x80 + (value & 0x3F); 332 } 333 /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ 334 else if (value <= 0xFFFF) { 335 *(parser->buffer.last++) = 0xE0 + (value >> 12); 336 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); 337 *(parser->buffer.last++) = 0x80 + (value & 0x3F); 338 } 339 /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 340 else { 341 *(parser->buffer.last++) = 0xF0 + (value >> 18); 342 *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F); 343 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); 344 *(parser->buffer.last++) = 0x80 + (value & 0x3F); 345 } 346 break; 347 348 default: 349 assert(1); /* Impossible. */ 350 } 351 352 parser->unread ++; 353 } 354 355 /* On EOF, put NUL into the buffer and return. */ 356 357 if (parser->eof) { 358 *(parser->buffer.last++) = '\0'; 359 parser->unread ++; 360 return 1; 361 } 362 363 } 364 365 return 1; 366} 367