/peek-build/src/netdepends/hubbub-0.0.2/src/tokeniser/tokeniser.c
C | 3370 lines | 2902 code | 319 blank | 149 comment | 615 complexity | 6530fbcb7409a5b2eafc2432fd3ae059 MD5 | raw file
Large files are truncated, but you can click here to view the full file
/*
 * This file is part of Hubbub.
 * Licensed under the MIT License,
 * http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
 */
#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include <stdio.h>

#include <parserutils/charset/utf8.h>

#include "utils/parserutilserror.h"
#include "utils/utils.h"

#include "tokeniser/entities.h"
#include "tokeniser/tokeniser.h"

/**
 * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
 */
static const uint32_t cp1252Table[32] = {
	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
};

/**
 * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
 */
static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };


/**
 * String for when we want to emit newlines
 */
static const uint8_t lf = '\n';
static const hubbub_string lf_str = { &lf, 1 };


/**
 * Tokeniser states
 */
typedef enum hubbub_tokeniser_state {
	STATE_DATA,
	STATE_CHARACTER_REFERENCE_DATA,
	STATE_TAG_OPEN,
	STATE_CLOSE_TAG_OPEN,
	STATE_TAG_NAME,
	STATE_BEFORE_ATTRIBUTE_NAME,
	STATE_ATTRIBUTE_NAME,
	STATE_AFTER_ATTRIBUTE_NAME,
	STATE_BEFORE_ATTRIBUTE_VALUE,
	STATE_ATTRIBUTE_VALUE_DQ,
	STATE_ATTRIBUTE_VALUE_SQ,
	STATE_ATTRIBUTE_VALUE_UQ,
	STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
	STATE_AFTER_ATTRIBUTE_VALUE_Q,
	STATE_SELF_CLOSING_START_TAG,
	STATE_BOGUS_COMMENT,
	STATE_MARKUP_DECLARATION_OPEN,
	STATE_MATCH_COMMENT,
	STATE_COMMENT_START,
	STATE_COMMENT_START_DASH,
	STATE_COMMENT,
	STATE_COMMENT_END_DASH,
	STATE_COMMENT_END,
	STATE_MATCH_DOCTYPE,
	STATE_DOCTYPE,
	STATE_BEFORE_DOCTYPE_NAME,
	STATE_DOCTYPE_NAME,
	STATE_AFTER_DOCTYPE_NAME,
	STATE_MATCH_PUBLIC,
	STATE_BEFORE_DOCTYPE_PUBLIC,
	STATE_DOCTYPE_PUBLIC_DQ,
	STATE_DOCTYPE_PUBLIC_SQ,
	STATE_AFTER_DOCTYPE_PUBLIC,
	STATE_MATCH_SYSTEM,
	STATE_BEFORE_DOCTYPE_SYSTEM,
	STATE_DOCTYPE_SYSTEM_DQ,
	STATE_DOCTYPE_SYSTEM_SQ,
	STATE_AFTER_DOCTYPE_SYSTEM,
	STATE_BOGUS_DOCTYPE,
	STATE_MATCH_CDATA,
	STATE_CDATA_BLOCK,
	STATE_NUMBERED_ENTITY,
	STATE_NAMED_ENTITY
} hubbub_tokeniser_state;

/**
 * Context for tokeniser
 */
typedef struct hubbub_tokeniser_context {
	size_t pending;				/**< Count of pending chars */

	hubbub_string current_comment;		/**< Current comment text */

	hubbub_token_type current_tag_type;	/**< Type of current_tag */
	hubbub_tag current_tag;			/**< Current tag */
	hubbub_doctype current_doctype;		/**< Current doctype */
	hubbub_tokeniser_state prev_state;	/**< Previous state */

	uint8_t last_start_tag_name[10];	/**< Name of the last start tag
						 * emitted */
	size_t last_start_tag_len;		/**< Length of last start tag */

	struct {
		uint32_t count;			/**< Bytes of tag name matched
						 * so far */
		bool match;			/**< True if name matched */
	} close_tag_match;			/**< State for matching close
						 * tags */

	struct {
		uint32_t count;			/**< Index into "DOCTYPE" */
	} match_doctype;			/**< State for matching doctype */

	struct {
		uint32_t count;			/**< Index into "[CDATA[" */
		uint32_t end;			/**< Index into "]]>" */
	} match_cdata;				/**< State for matching cdata */

	struct {
		size_t offset;			/**< Offset in buffer */
		uint32_t length;		/**< Length of entity */
		uint32_t codepoint;		/**< UCS4 codepoint */
		bool complete;			/**< True if match complete */

		uint32_t poss_length;		/**< Optimistic length
						 * when matching named
						 * character references */
		uint8_t base;			/**< Base for numeric
						 * entities */
		void *context;			/**< Context for named
						 * entity search */
		size_t prev_len;		/**< Previous byte length
						 * of str */
		bool had_data;			/**< Whether we read
						 * anything after &#(x)? */
		bool overflow;			/**< Whether this entity has
						 * overflowed the maximum
						 * numeric entity value */
		hubbub_tokeniser_state return_state;	/**< State we were
							 * called from */
	} match_entity;				/**< Entity matching state */

	struct {
		uint32_t line;			/**< Current line of input */
		uint32_t col;			/**< Current character in
						 * line */
	} position;				/**< Position in source data */

	uint32_t allowed_char;			/**< Used for quote matching */
} hubbub_tokeniser_context;

/**
 * Tokeniser data structure
 */
struct hubbub_tokeniser {
	hubbub_tokeniser_state state;	/**< Current tokeniser state */
	hubbub_content_model content_model;	/**< Current content
						 * model flag */
	bool escape_flag;		/**< Escape flag */
	bool process_cdata_section;	/**< Whether to process CDATA sections*/

	parserutils_inputstream *input;	/**< Input stream */
	parserutils_buffer *buffer;	/**< Input buffer */

	hubbub_tokeniser_context context;	/**< Tokeniser context */

	hubbub_token_handler token_handler;	/**< Token handling callback */
	void *token_pw;				/**< Token handler data */

	hubbub_error_handler error_handler;	/**< Error handling callback */
	void *error_pw;				/**< Error handler data */

	hubbub_allocator_fn alloc;	/**< Memory (de)allocation function */
	void *alloc_pw;			/**< Client private data */
};

/* One handler per tokeniser state; each is dispatched by
 * hubbub_tokeniser_run() and returns HUBBUB_OK to continue,
 * HUBBUB_NEEDDATA when the input is exhausted, or an error. */
static hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_close_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_cdata(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_cdata_block(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t off);
static hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_named_entity(
		hubbub_tokeniser *tokeniser);

/* Token emission helpers (defined later in the file) */
static inline hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
		const hubbub_string *chars);
static inline hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
		bool force_quirks);
static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token);

/**
 * Create a hubbub tokeniser
 *
 * \param input      Input stream instance
 * \param alloc      Memory (de)allocation function
 * \param pw         Pointer to client-specific private data (may be NULL)
 * \param tokeniser  Pointer to location to receive tokeniser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion
 */
hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input,
		hubbub_allocator_fn alloc, void *pw,
		hubbub_tokeniser **tokeniser)
{
	parserutils_error perror;
	hubbub_tokeniser *tok;

	if (input == NULL || alloc == NULL || tokeniser == NULL)
		return HUBBUB_BADPARM;

	tok = alloc(NULL, sizeof(hubbub_tokeniser), pw);
	if (tok == NULL)
		return HUBBUB_NOMEM;

	perror = parserutils_buffer_create(alloc, pw, &tok->buffer);
	if (perror != PARSERUTILS_OK) {
		/* Free the partially-constructed tokeniser */
		alloc(tok, 0, pw);
		return hubbub_error_from_parserutils_error(perror);
	}

	tok->state = STATE_DATA;
	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;

	tok->escape_flag = false;
	tok->process_cdata_section = false;

	tok->input = input;

	tok->token_handler = NULL;
	tok->token_pw = NULL;

	tok->error_handler = NULL;
	tok->error_pw = NULL;

	tok->alloc = alloc;
	tok->alloc_pw = pw;

	memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));

	*tokeniser = tok;

	return HUBBUB_OK;
}

/**
 * Destroy a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to destroy
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
{
	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	/* The attribute array is the only context member that is
	 * heap-allocated via the client allocator */
	if (tokeniser->context.current_tag.attributes != NULL) {
		tokeniser->alloc(tokeniser->context.current_tag.attributes,
				0, tokeniser->alloc_pw);
	}

	parserutils_buffer_destroy(tokeniser->buffer);

	tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw);

	return HUBBUB_OK;
}

/**
 * Configure a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to configure
 * \param type       The option type to set
 * \param params     Option-specific parameters
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
		hubbub_tokeniser_opttype type,
		hubbub_tokeniser_optparams *params)
{
	if (tokeniser == NULL || params == NULL)
		return HUBBUB_BADPARM;

	switch (type) {
	case HUBBUB_TOKENISER_TOKEN_HANDLER:
		tokeniser->token_handler = params->token_handler.handler;
		tokeniser->token_pw = params->token_handler.pw;
		break;
	case HUBBUB_TOKENISER_ERROR_HANDLER:
		tokeniser->error_handler = params->error_handler.handler;
		tokeniser->error_pw = params->error_handler.pw;
		break;
	case HUBBUB_TOKENISER_CONTENT_MODEL:
		tokeniser->content_model = params->content_model.model;
		break;
	case HUBBUB_TOKENISER_PROCESS_CDATA:
		tokeniser->process_cdata_section = params->process_cdata;
		break;
	}
	/* NB: unrecognised option types are silently ignored */

	return HUBBUB_OK;
}

/**
 * Process remaining data in the input stream
 *
 * \param tokeniser  The tokeniser instance to invoke
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
{
	hubbub_error cont = HUBBUB_OK;

	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

/* Flip to "#if 1" to trace state transitions to stdout */
#if 0
#define state(x) \
	case x: \
		printf( #x "\n");
#else
#define state(x) \
	case x:
#endif

	while (cont == HUBBUB_OK) {
		switch (tokeniser->state) {
		state(STATE_DATA)
			cont = hubbub_tokeniser_handle_data(tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_DATA)
			cont = hubbub_tokeniser_handle_character_reference_data(
					tokeniser);
			break;
		state(STATE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_tag_open(tokeniser);
			break;
		state(STATE_CLOSE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_close_tag_open(
					tokeniser);
			break;
		state(STATE_TAG_NAME)
			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_before_attribute_name(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_attribute_name(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_after_attribute_name(
					tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_before_attribute_value(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_DQ)
			cont = hubbub_tokeniser_handle_attribute_value_dq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_SQ)
			cont = hubbub_tokeniser_handle_attribute_value_sq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_UQ)
			cont = hubbub_tokeniser_handle_attribute_value_uq(
					tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_character_reference_in_attribute_value(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_VALUE_Q)
			cont = hubbub_tokeniser_handle_after_attribute_value_q(
					tokeniser);
			break;
		state(STATE_SELF_CLOSING_START_TAG)
			cont = hubbub_tokeniser_handle_self_closing_start_tag(
					tokeniser);
			break;
		state(STATE_BOGUS_COMMENT)
			cont = hubbub_tokeniser_handle_bogus_comment(
					tokeniser);
			break;
		state(STATE_MARKUP_DECLARATION_OPEN)
			cont = hubbub_tokeniser_handle_markup_declaration_open(
					tokeniser);
			break;
		state(STATE_MATCH_COMMENT)
			cont = hubbub_tokeniser_handle_match_comment(
					tokeniser);
			break;
		/* All comment sub-states share one handler */
		case STATE_COMMENT_START:
		case STATE_COMMENT_START_DASH:
		case STATE_COMMENT:
		case STATE_COMMENT_END_DASH:
		case STATE_COMMENT_END:
			cont = hubbub_tokeniser_handle_comment(tokeniser);
			break;
		state(STATE_MATCH_DOCTYPE)
			cont = hubbub_tokeniser_handle_match_doctype(
					tokeniser);
			break;
		state(STATE_DOCTYPE)
			cont = hubbub_tokeniser_handle_doctype(tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_before_doctype_name(
					tokeniser);
			break;
		state(STATE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_doctype_name(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_after_doctype_name(
					tokeniser);
			break;

		state(STATE_MATCH_PUBLIC)
			cont = hubbub_tokeniser_handle_match_public(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_before_doctype_public(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_DQ)
			cont = hubbub_tokeniser_handle_doctype_public_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_SQ)
			cont = hubbub_tokeniser_handle_doctype_public_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_after_doctype_public(
					tokeniser);
			break;
		state(STATE_MATCH_SYSTEM)
			cont = hubbub_tokeniser_handle_match_system(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_before_doctype_system(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_DQ)
			cont = hubbub_tokeniser_handle_doctype_system_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_SQ)
			cont = hubbub_tokeniser_handle_doctype_system_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_after_doctype_system(
					tokeniser);
			break;
		state(STATE_BOGUS_DOCTYPE)
			cont = hubbub_tokeniser_handle_bogus_doctype(
					tokeniser);
			break;
		state(STATE_MATCH_CDATA)
			cont = hubbub_tokeniser_handle_match_cdata(
					tokeniser);
			break;
		state(STATE_CDATA_BLOCK)
			cont = hubbub_tokeniser_handle_cdata_block(
					tokeniser);
			break;
		state(STATE_NUMBERED_ENTITY)
			cont = hubbub_tokeniser_handle_numbered_entity(
					tokeniser);
			break;
		state(STATE_NAMED_ENTITY)
			cont = hubbub_tokeniser_handle_named_entity(
					tokeniser);
			break;
		}
	}

	/* Running out of input is not an error from the caller's view */
	return (cont == HUBBUB_NEEDDATA) ? HUBBUB_OK : cont;
}


/**
 * Various macros for manipulating buffers.
 *
 * START_BUF   -- append bytes to the accumulation buffer and start a new
 *                string over them (sets str.len = length).
 * COLLECT     -- append bytes to an already-started string (asserts that
 *                the string is non-empty) and extend its length.
 * COLLECT_MS  -- as COLLECT, but permits the string to be empty.
 *
 * All three return from the enclosing function on allocation failure.
 *
 * \todo make some of these inline functions (type-safety)
 * \todo document them properly here
 */

#define START_BUF(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len = (length); \
	} while (0)

#define COLLECT(str, cptr, length) \
	do { \
		parserutils_error perror; \
		assert(str.len != 0); \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

#define COLLECT_MS(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)


/* this should always be called with an empty "chars" buffer */
hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
{
	parserutils_error error;
	hubbub_token token;
	const uint8_t *cptr;
	size_t len;

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len)) ==
					PARSERUTILS_OK) {
		const uint8_t c = *cptr;

		if (c == '&' &&
				(tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA ||
				tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA) &&
				tokeniser->escape_flag == false) {
			tokeniser->state =
					STATE_CHARACTER_REFERENCE_DATA;
			/* Don't eat the '&'; it'll be handled by entity
			 * consumption */
			break;
		} else if (c == '-' &&
				tokeniser->escape_flag == false &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->context.pending >= 3) {
			/* Look back for "<!--"; if found, enter escaped mode */
			size_t ignore;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 3,
					&cptr,
					&ignore);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *)cptr,
					"<!--", SLEN("<!--")) == 0) {
				tokeniser->escape_flag = true;
			}

			tokeniser->context.pending += len;
		} else if (c == '<' && (tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_PCDATA ||
					((tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
					tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
					tokeniser->escape_flag == false))) {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Buffer '<' */
			tokeniser->context.pending = len;
			tokeniser->state = STATE_TAG_OPEN;
			break;
		} else if (c == '>' && tokeniser->escape_flag == true &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA)) {
			/* no need to check that there are enough characters,
			 * since you can only run into this if the flag is
			 * true in the first place, which requires four
			 * characters. */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 2,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr, "-->", SLEN("-->")) == 0) {
				tokeniser->escape_flag = false;
			}

			tokeniser->context.pending += len;
		} else if (c == '\0') {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Emit a replacement character */
			emit_character_token(tokeniser, &u_fffd_str);

			/* Advance past NUL */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else if (c == '\r') {
			/* Peek at the following character to collapse CRLF */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				break;
			}

			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			if (error == PARSERUTILS_EOF || *cptr != '\n') {
				/* Emit newline */
				emit_character_token(tokeniser, &lf_str);
			}

			/* Advance over */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else {
			/* Just collect into buffer */
			tokeniser->context.pending += len;
		}
	}

	if (tokeniser->state != STATE_TAG_OPEN &&
			(tokeniser->state != STATE_DATA ||
					error == PARSERUTILS_EOF) &&
			tokeniser->context.pending > 0) {
		/* Emit any pending characters */
		emit_current_chars(tokeniser);
	}

	if (error == PARSERUTILS_EOF) {
		token.type = HUBBUB_TOKEN_EOF;
		hubbub_tokeniser_emit_token(tokeniser, &token);
	}

	if (error == PARSERUTILS_EOF) {
		return HUBBUB_NEEDDATA;
	} else {
		return hubbub_error_from_parserutils_error(error);
	}
}

/* emit any pending tokens before calling */
hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser)
{
	assert(tokeniser->context.pending == 0);

	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_token token;

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		token.type = HUBBUB_TOKEN_CHARACTER;

		if (tokeniser->context.match_entity.codepoint) {
			/* Matched an entity: emit its UTF-8 encoding */
			parserutils_charset_utf8_from_ucs4(
					tokeniser->context.match_entity.codepoint,
					&utf8ptr, &len);

			token.data.character.ptr = utf8;
			token.data.character.len = sizeof(utf8) - len;

			hubbub_tokeniser_emit_token(tokeniser, &token);

			/* +1 for ampersand */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.match_entity.length
							+ 1);
		} else {
			/* No entity matched: emit the '&' literally */
			parserutils_error error;
			const uint8_t *cptr = NULL;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			token.data.character.ptr = cptr;
			token.data.character.len = len;

			hubbub_tokeniser_emit_token(tokeniser, &token);
			parserutils_inputstream_advance(tokeniser->input, len);
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		tokeniser->state = STATE_DATA;
	}

	return HUBBUB_OK;
}

/* this state always switches to another state straight away */
/* this state expects the current character to be '<' */
hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 1);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '/') {
		tokeniser->context.pending += len;

		tokeniser->context.close_tag_match.match = false;
		tokeniser->context.close_tag_match.count = 0;

		tokeniser->state = STATE_CLOSE_TAG_OPEN;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		/* Return to data state with '<' still in "chars" */
		tokeniser->state = STATE_DATA;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
		if (c == '!') {
			parserutils_inputstream_advance(tokeniser->input,
					SLEN("<!"));

			tokeniser->context.pending = 0;
			tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
		} else if ('A' <= c && c <= 'Z') {
			/* Lowercase ASCII tag name on the fly */
			uint8_t lc = (c + 0x20);

			START_BUF(ctag->name, &lc, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(ctag->name, cptr, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/** \todo parse error */

			tokeniser->context.pending += len;
			tokeniser->state = STATE_DATA;
		} else if (c == '?') {
			/** \todo parse error */

			/* Cursor still at "<", need to advance past it */
			parserutils_inputstream_advance(
					tokeniser->input, SLEN("<"));
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		} else {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.chars to be "</" */
/* this state never stays in this state for more than one character */
hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 2);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
/*	assert(tokeniser->context.chars.ptr[1] == '/'); */

	/**\todo fragment case */

	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		uint8_t *start_tag_name =
				tokeniser->context.last_start_tag_name;
		size_t start_tag_len =
				tokeniser->context.last_start_tag_len;

		/* In (R)CDATA a close tag is only honoured if it matches
		 * the last start tag emitted, case-insensitively */
		while ((error = parserutils_inputstream_peek(tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len)) == PARSERUTILS_OK) {
			c = *cptr;

			/* &~0x20 folds ASCII case for letters */
			if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
					!= (c & ~0x20)) {
				break;
			}

			ctx->close_tag_match.count += len;

			if (ctx->close_tag_match.count == start_tag_len) {
				ctx->close_tag_match.match = true;
				break;
			}
		}

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (ctx->close_tag_match.match == true) {
			/* Name matched: the next character must end the
			 * tag name for the match to stand */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error != PARSERUTILS_EOF) {
				c = *cptr;

				if (c != '\t' && c != '\n' && c != '\f' &&
						c != ' ' && c != '>' &&
						c != '/') {
					ctx->close_tag_match.match = false;
				}
			}
		}
	}

	if (ctx->close_tag_match.match == false &&
			tokeniser->content_model !=
					HUBBUB_CONTENT_MODEL_PCDATA) {
		/* We should emit "</" here, but instead we leave it in the
		 * buffer so the data state emits it with any characters
		 * following it */
		tokeniser->state = STATE_DATA;
	} else {
		error = parserutils_inputstream_peek(tokeniser->input,
				tokeniser->context.pending, &cptr, &len);

		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */

			/* Return to data state with "</" pending */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else if (error != PARSERUTILS_OK) {
			return hubbub_error_from_parserutils_error(error);
		}

		c = *cptr;

		if ('A' <= c && c <= 'Z') {
			/* Lowercase ASCII tag name on the fly */
			uint8_t lc = (c + 0x20);
			START_BUF(tokeniser->context.current_tag.name,
					&lc, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(tokeniser->context.current_tag.name,
					cptr, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/* Cursor still at "</", need to collect ">" */
			tokeniser->context.pending += len;

			/* Now need to advance past "</>" */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			/** \todo parse error */
			tokeniser->state = STATE_DATA;
		} else {
			/** \todo parse error */

			/* Cursor still at "</", need to advance past it */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.current_tag to already have its
   first character set */
hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending > 0);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
	assert(ctag->name.len > 0);
/*	assert(ctag->name.ptr); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* EOF ends the tag: emit what we have */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		/* NUL in a tag name becomes U+FFFD */
		COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if ('A' <= c && c <= 'Z') {
		/* Lowercase ASCII tag name on the fly */
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error
hubbub_tokeniser_handle_before_attribute_name( 1107 hubbub_tokeniser *tokeniser) 1108{ 1109 hubbub_tag *ctag = &tokeniser->context.current_tag; 1110 1111 size_t len; 1112 const uint8_t *cptr; 1113 parserutils_error error; 1114 uint8_t c; 1115 1116 error = parserutils_inputstream_peek(tokeniser->input, 1117 tokeniser->context.pending, &cptr, &len); 1118 1119 if (error != PARSERUTILS_OK) { 1120 if (error == PARSERUTILS_EOF) { 1121 tokeniser->state = STATE_DATA; 1122 return emit_current_tag(tokeniser); 1123 } else { 1124 return hubbub_error_from_parserutils_error(error); 1125 } 1126 } 1127 1128 c = *cptr; 1129 1130 if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { 1131 /* pass over in silence */ 1132 tokeniser->context.pending += len; 1133 } else if (c == '>') { 1134 tokeniser->context.pending += len; 1135 tokeniser->state = STATE_DATA; 1136 return emit_current_tag(tokeniser); 1137 } else if (c == '/') { 1138 tokeniser->context.pending += len; 1139 tokeniser->state = STATE_SELF_CLOSING_START_TAG; 1140 } else { 1141 hubbub_attribute *attr; 1142 1143 if (c == '"' || c == '\'' || c == '=') { 1144 /** \todo parse error */ 1145 } 1146 1147 attr = tokeniser->alloc(ctag->attributes, 1148 (ctag->n_attributes + 1) * 1149 sizeof(hubbub_attribute), 1150 tokeniser->alloc_pw); 1151 if (attr == NULL) 1152 return HUBBUB_NOMEM; 1153 1154 ctag->attributes = attr; 1155 1156 if ('A' <= c && c <= 'Z') { 1157 uint8_t lc = (c + 0x20); 1158 START_BUF(attr[ctag->n_attributes].name, &lc, len); 1159 } else if (c == '\0') { 1160 START_BUF(attr[ctag->n_attributes].name, 1161 u_fffd, sizeof(u_fffd)); 1162 } else { 1163 START_BUF(attr[ctag->n_attributes].name, cptr, len); 1164 } 1165 1166 attr[ctag->n_attributes].ns = HUBBUB_NS_NULL; 1167 attr[ctag->n_attributes].value.ptr = NULL; 1168 attr[ctag->n_attributes].value.len = 0; 1169 1170 ctag->n_attributes++; 1171 1172 tokeniser->context.pending += len; 1173 tokeniser->state = STATE_ATTRIBUTE_NAME; 1174 } 1175 1176 return HUBBUB_OK; 
1177} 1178 1179hubbub_error hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser) 1180{ 1181 hubbub_tag *ctag = &tokeniser->context.current_tag; 1182 1183 size_t len; 1184 const uint8_t *cptr; 1185 parserutils_error error; 1186 uint8_t c; 1187 1188 assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0); 1189 1190 error = parserutils_inputstream_peek(tokeniser->input, 1191 tokeniser->context.pending, &cptr, &len); 1192 1193 if (error != PARSERUTILS_OK) { 1194 if (error == PARSERUTILS_EOF) { 1195 tokeniser->state = STATE_DATA; 1196 return emit_current_tag(tokeniser); 1197 } else { 1198 return hubbub_error_from_parserutils_error(error); 1199 } 1200 } 1201 1202 c = *cptr; 1203 1204 if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { 1205 tokeniser->context.pending += len; 1206 tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME; 1207 } else if (c == '=') { 1208 tokeniser->context.pending += len; 1209 tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE; 1210 } else if (c == '>') { 1211 tokeniser->context.pending += len; 1212 tokeniser->state = STATE_DATA; 1213 return emit_current_tag(tokeniser); 1214 } else if (c == '/') { 1215 tokeniser->context.pending += len; 1216 tokeniser->state = STATE_SELF_CLOSING_START_TAG; 1217 } else if (c == '\0') { 1218 COLLECT(ctag->attributes[ctag->n_attributes - 1].name, 1219 u_fffd, sizeof(u_fffd)); 1220 tokeniser->context.pending += len; 1221 } else if ('A' <= c && c <= 'Z') { 1222 uint8_t lc = (c + 0x20); 1223 COLLECT(ctag->attributes[ctag->n_attributes - 1].name, 1224 &lc, len); 1225 tokeniser->context.pending += len; 1226 } else { 1227 COLLECT(ctag->attributes[ctag->n_attributes - 1].name, 1228 cptr, len); 1229 tokeniser->context.pending += len; 1230 } 1231 1232 return HUBBUB_OK; 1233} 1234 1235hubbub_error hubbub_tokeniser_handle_after_attribute_name( 1236 hubbub_tokeniser *tokeniser) 1237{ 1238 hubbub_tag *ctag = &tokeniser->context.current_tag; 1239 1240 size_t len; 1241 const uint8_t *cptr; 1242 
parserutils_error error; 1243 uint8_t c; 1244 1245 error = parserutils_inputstream_peek(tokeniser->input, 1246 tokeniser->context.pending, &cptr, &len); 1247 1248 if (error != PARSERUTILS_OK) { 1249 if (error == PARSERUTILS_EOF) { 1250 tokeniser->state = STATE_DATA; 1251 return emit_current_tag(tokeniser); 1252 } else { 1253 return hubbub_error_from_parserutils_error(error); 1254 } 1255 } 1256 1257 c = *cptr; 1258 1259 if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { 1260 tokeniser->context.pending += len; 1261 } else if (c == '=') { 1262 tokeniser->context.pending += len; 1263 tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE; 1264 } else if (c == '>') { 1265 tokeniser->context.pending += len; 1266 1267 tokeniser->state = STATE_DATA; 1268 return emit_current_tag(tokeniser); 1269 } else if (c == '/') { 1270 tokeniser->context.pending += len; 1271 tokeniser->state = STATE_SELF_CLOSING_START_TAG; 1272 } else { 1273 hubbub_attribute *attr; 1274 1275 if (c == '"' || c == '\'') { 1276 /** \todo parse error */ 1277 } 1278 1279 attr = tokeniser->alloc(ctag->attributes, 1280 (ctag->n_attributes + 1) * 1281 sizeof(hubbub_attribute), 1282 tokeniser->alloc_pw); 1283 if (attr == NULL) 1284 return HUBBUB_NOMEM; 1285 1286 ctag->attributes = attr; 1287 1288 if ('A' <= c && c <= 'Z') { 1289 uint8_t lc = (c + 0x20); 1290 START_BUF(attr[ctag->n_attributes].name, &lc, len); 1291 } else if (c == '\0') { 1292 START_BUF(attr[ctag->n_attributes].name, 1293 u_fffd, sizeof(u_fffd)); 1294 } else { 1295 START_BUF(attr[ctag->n_attributes].name, cptr, len); 1296 } 1297 1298 attr[ctag->n_attributes].ns = HUBBUB_NS_NULL; 1299 attr[ctag->n_attributes].value.ptr = NULL; 1300 attr[ctag->n_attributes].value.len = 0; 1301 1302 ctag->n_attributes++; 1303 1304 tokeniser->context.pending += len; 1305 tokeniser->state = STATE_ATTRIBUTE_NAME; 1306 } 1307 1308 return HUBBUB_OK; 1309} 1310 1311/* this state is only ever triggered by an '=' */ 1312hubbub_error 
hubbub_tokeniser_handle_before_attribute_value( 1313 hubbub_tokeniser *tokeniser) 1314{ 1315 hubbub_tag *ctag = &tokeniser->context.current_tag; 1316 1317 size_t len; 1318 const uint8_t *cptr; 1319 parserutils_error error; 1320 uint8_t c; 1321 1322 error = parserutils_inputstream_peek(tokeniser->input, 1323 tokeniser->context.pending, &cptr, &len); 1324 1325 if (error != PARSERUTILS_OK) { 1326 if (error == PARSERUTILS_EOF) { 1327 /** \todo parse error */ 1328 tokeniser->state = STATE_DATA; 1329 return emit_current_tag(tokeniser); 1330 } else { 1331 return hubbub_error_from_parserutils_error(error); 1332 } 1333 } 1334 1335 c = *cptr; 1336 1337 if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { 1338 tokeniser->context.pending += len; 1339 } else if (c == '"') { 1340 tokeniser->context.pending += len; 1341 tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ; 1342 } else if (c == '&') { 1343 /* Don't consume the '&' -- reprocess in UQ state */ 1344 tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ; 1345 } else if (c == '\'') { 1346 tokeniser->context.pending += len; 1347 tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ; 1348 } else if (c == '>') { 1349 /** \todo parse error */ 1350 tokeniser->context.pending += len; 1351 1352 tokeniser->state = STATE_DATA; 1353 return emit_current_tag(tokeniser); 1354 } else if (c == '\0') { 1355 START_BUF(ctag->attributes[ctag->n_attributes - 1].value, 1356 u_fffd, sizeof(u_fffd)); 1357 tokeniser->context.pending += len; 1358 tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ; 1359 } else { 1360 if (c == '=') { 1361 /** \todo parse error */ 1362 } 1363 1364 START_BUF(ctag->attributes[ctag->n_attributes - 1].value, 1365 cptr, len); 1366 1367 tokeniser->context.pending += len; 1368 tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ; 1369 } 1370 1371 return HUBBUB_OK; 1372} 1373 1374hubbub_error hubbub_tokeniser_handle_attribute_value_dq( 1375 hubbub_tokeniser *tokeniser) 1376{ 1377 hubbub_tag *ctag = &tokeniser->context.current_tag; 1378 1379 size_t 
len; 1380 const uint8_t *cptr; 1381 parserutils_error error; 1382 uint8_t c; 1383 1384 error = parserutils_inputstream_peek(tokeniser->input, 1385 tokeniser->context.pending, &cptr, &len); 1386 1387 if (error != PARSERUTILS_OK) { 1388 if (error == PARSERUTILS_EOF) { 1389 tokeniser->state = STATE_DATA; 1390 return emit_current_tag(tokeniser); 1391 } else { 1392 return hubbub_error_from_parserutils_error(error); 1393 } 1394 } 1395 1396 c = *cptr; 1397 1398 if (c == '"') { 1399 tokeniser->context.pending += len; 1400 tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q; 1401 } else if (c == '&') { 1402 tokeniser->context.prev_state = tokeniser->state; 1403 tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE; 1404 tokeniser->context.allowed_char = '"'; 1405 /* Don't eat the '&'; it'll be handled by entity consumption */ 1406 } else if (c == '\0') { 1407 COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value, 1408 u_fffd, sizeof(u_fffd)); 1409 tokeniser->context.pending += len; 1410 } else if (c == '\r') { 1411 error = parserutils_inputstream_peek( 1412 tokeniser->input, 1413 tokeniser->context.pending + len, 1414 &cptr, 1415 &len); 1416 1417 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) { 1418 return hubbub_error_from_parserutils_error(error); 1419 } else if (error == PARSERUTILS_EOF || *cptr != '\n') { 1420 COLLECT_MS(ctag->attributes[ 1421 ctag->n_attributes - 1].value, 1422 &lf, sizeof(lf)); 1423 } 1424 1425 /* Consume '\r' */ 1426 tokeniser->context.pending += 1; 1427 } else { 1428 COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value, 1429 cptr, len); 1430 tokeniser->context.pending += len; 1431 } 1432 1433 return HUBBUB_OK; 1434} 1435 1436hubbub_error hubbub_tokeniser_handle_attribute_value_sq( 1437 hubbub_tokeniser *tokeniser) 1438{ 1439 hubbub_tag *ctag = &tokeniser->context.current_tag; 1440 1441 size_t len; 1442 const uint8_t *cptr; 1443 parserutils_error error; 1444 uint8_t c; 1445 1446 error = 
parserutils_inputstream_peek(tokeniser->input, 1447 tokeniser->context.pending, &cptr, &len); 1448 1449 if (error != PARSERUTILS_OK) { 1450 if (error == PARSERUTILS_EOF) { 1451 tokeniser->state = STATE_DATA; 1452 return emit_current_tag(tokeniser); 1453 } else { 1454 return hubbub_error_from_parserutils_error(error); 1455 } 1456 } 1457 1458 c = *cptr; 1459 1460 if (c == '\'') { 1461 tokeniser->context.pending += len; 1462 tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q; 1463 } else if (c == '&') { 1464 tokeniser->context.prev_state = tokeniser->state; 1465 tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE; 1466 tokeniser->context.allowed_char = '\''; 1467 /* Don't eat the '&'; it'll be handled by entity consumption */ 1468 } else if (c == '\0') { 1469 COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value, 1470 u_fffd, sizeof(u_fffd)); 1471 tokeniser->context.pending += len; 1472 } else if (c == '\r') { 1473 error = parserutils_inputstream_peek( 1474 tokeniser->input, 1475 tokeniser->context.pending + len, 1476 &cptr, 1477 &len); 1478 1479 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) { 1480 return hubbub_error_from_parserutils_error(error); 1481 } else if (error == PARSERUTILS_EOF || *cptr != '\n') { 1482 COLLECT_MS(ctag->attributes[ 1483 ctag->n_attributes - 1].value, 1484 &lf, sizeof(lf)); 1485 } 1486 1487 /* Consume \r */ 1488 tokeniser->context.pending += 1; 1489 } else { 1490 COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value, 1491 cptr, len); 1492 tokeniser->context.pending += len; 1493 } 1494 1495 return HUBBUB_OK; 1496} 1497 1498hubbub_error hubbub_tokeniser_handle_attribute_value_uq( 1499 hubbub_tokeniser *tokeniser) 1500{ 1501 hubbub_tag *ctag = &tokeniser->context.current_tag; 1502 uint8_t c; 1503 1504 size_t len; 1505 const uint8_t *cptr; 1506 parserutils_error error; 1507 1508 error = parserutils_inputstream_peek(tokeniser->input, 1509 tokeniser->context.pending, &cptr, &len); 1510 1511 if (error != PARSERUTILS_OK) 
{ 1512 if (error == PARSERUTILS_EOF) { 1513 tokeniser->state = STATE_DATA; 1514 return emit_current_tag(tokeniser); 1515 } else { 1516 return hubbub_error_from_parserutils_error(error); 1517 } 1518 } 1519 1520 c = *cptr; 1521 1522 assert(c == '&' || 1523 ctag->attributes[ctag->n_attributes - 1].value.len >= 1); 1524 1525 if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { 1526 tokeniser->context.pending += len; 1527 tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME; 1528 } else if (c == '&') { 1529 tokeniser->context.prev_state = tokeniser->state; 1530 tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE; 1531 /* Don't eat the '&'; it'll be handled by entity consumption */ 1532 } else if (c == '>') { 1533 tokeniser->context.pending += len; 1534 tokeniser->state = STATE_DATA; 1535 return emit_current_tag(tokeniser); 1536 } else if (c == '\0') { 1537 COLLECT(ctag->attributes[ctag->n_attributes - 1].value, 1538 u_fffd, sizeof(u_fffd)); 1539 tokeniser->context.pending += len; 1540 } else { 1541 if (c == '"' || c == '\'' || c == '=') { 1542 /** \todo parse error */ 1543 } 1544 1545 COLLECT(ctag->attributes[ctag->n_attributes - 1].value, 1546 cptr, len); 1547 tokeniser->context.pending += len; 1548 } 1549 1550 return HUBBUB_OK; 1551} 1552 1553hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value( 1554 hubbub_tokeniser *tokeniser) 1555{ 1556 if (tokeniser->context.match_entity.complete == false) { 1557 return hubbub_tokeniser_consume_character_reference(tokeniser, 1558 tokeniser->context.pending); 1559 } else { 1560 hubbub_tag *ctag = &tokeniser->context.current_tag; 1561 hubbub_attribute *attr = &ctag->attributes[ 1562 ctag->n_attributes - 1]; 1563 1564 uint8_t utf8[6]; 1565 uint8_t *utf8ptr = utf8; 1566 size_t len = sizeof(utf8); 1567 1568 if (tokeniser->context.match_entity.codepoint) { 1569 parserutils_charset_utf8_from_ucs4( 1570 tokeniser->context.match_entity.codepoint, 1571 &utf8ptr, &len); 1572 1573 
COLLECT_MS(attr->value, utf8, sizeof(utf8) - len); 1574 1575 /* +1 for the ampersand */ 1576 tokeniser->context.pending += 1577 tokeniser->context.match_entity.length 1578 + 1; 1579 } else { 1580 size_t len = 0; 1581 const uint8_t *cptr = NULL; 1582 parserutils_error error; 1583 1584 error = parserutils_inputstream_peek( 1585 tokeniser->input, 1586 tokeniser->context.pending, 1587 &cptr, 1588 &len); 1589 1590 assert(error == PARSERUTILS_OK); 1591 1592 /* Insert the ampersand */ 1593 COLLECT_MS(attr->value, cptr, len); 1594 tokeniser->context.pending += len; 1595 } 1596 1597 /* Reset for next time */ 1598 tokeniser->context.match_entity.complete = false; 1599 1600 /* And back to the previous state */ 1601 tokeniser->state = tokeniser->context.prev_state; 1602 } 1603 1604 return HUBBUB_OK; 1605} 1606 1607/* always switches state */ 1608hubbub_error hubbub_tokeniser_handle_after_attribute_value_q( 1609 hubbub_tokeniser *tokeniser) 1610{ 1611 size_t len; 1612 const uint8_t *cptr; 1613 parserutils_error error; 1614 uint8_t c; 1615 1616 error = parserutils_inputstream_peek(tokeniser->input, 1617 tokeniser->context.pending, &cptr, &len); 1618 1619 if (error != PARSERUTILS_OK) { 1620 if (error == PARSERUTILS_EOF) { 1621 tokeniser->state = STATE_DATA; 1622 return emit_current_tag(tokeniser); 1623 } else { 1624 return hubbub_error_from_parserutils_error(error); 1625 } 1626 } 1627 1628 c = *cptr; 1629 1630 if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { 1631 tokeniser->context.pending += len; 1632 tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME; 1633 } else if (c == '>') { 1634 tokeniser->context.pending += len; 1635 1636 tokeniser->state = STATE_DATA; 1637 return emit_current_tag(tokeniser); 1638 } else if (c == '/') { 1639 tokeniser->context.pending += len; 1640 tokeniser->state = STATE_SELF_CLOSING_START_TAG; 1641 } else { 1642 /** \todo parse error */ 1643 /* Reprocess character in before attribute name state */ 1644 tokeniser->state = 
STATE_BEFORE_ATTRIBUTE_NAME; 1645 } 1646 1647 return HUBBUB_OK; 1648} 1649 1650hubbub_error hubbub_tokeniser_handle_self_closing_start_tag( 1651 hubbub_tokeniser *tokeniser) 1652{ 1653 size_t len; 1654 const uint8_t *cptr; 1655 parserutils_error error; 1656 uint8_t c; 1657 1658 error = parserutils_inputstream_peek(tokeniser->input, 1659 tokeniser->context.pending, &cptr, &len); 1660 1661 if (error != PARSERUTILS_OK) { 1662 if (error == PARSERUTILS_EOF) { 1663 tokeniser->state = STATE_DATA; 1664 return emit_current_tag(tokeniser); 1665 } else { 1666 return hubbub_error_from_parserutils_error(error); 1667 } 1668 } 1669 1670 c = *cptr; 1671 1672 if (c == '>') { 1673 tokeniser->context.pending += len; 1674 tokeniser->state = STATE_DATA; 1675 1676 tokeniser->context.current_tag.self_closing = true; 1677 return emit_current_tag(tokeniser); 1678 } else { 1679 /* Reprocess character in before attribute name state */ 1680 tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME; 1681 } 1682 1683 return HUBBUB_OK; 1684} 1685 1686/* this state expects tokeniser->context.chars to be empty on first entry */ 1687hubbub_error hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser) 1688{ 1689 size_t len; 1690 const uint8_t *cptr; 1691 parserutils_error error; 1692 uint8_t c; 1693 1694 error = parserutils_inputstream_peek(tokeniser->input, 1695 tokeniser->context.pending, &cptr, &len); 1696 1697 if (error != PARSERUTILS_OK) { 1698 if (error == PARSERUTILS_EOF) { 1699 tokeniser->state = STATE_DATA; 1700 return emit_current_comment(tokeniser); 1701 } else { 1702 return hubbub_error_from_parserutils_error(error); 1703 } 1704 } 1705 1706 c = *cptr; 1707 1708 if (c == '>') { 1709 tokeniser->context.pending += len; 1710 tokeniser->state = STATE_DATA; 1711 return emit_current_comment(tokeniser); 1712 } else if (c == '\0') { 1713 error = parserutils_buffer_append(tokeniser->buffer, 1714 u_fffd, sizeof(u_fffd)); 1715 if (error != PARSERUTILS_OK) 1716 return 
hubbub_error_from_parserutils_error(error); 1717 1718 tokeniser->context.pending += len; 1719 } else if (c == '\r') { 1720 error = parserutils_inputstream_peek( 1721 tokeniser->input, 1722 tokeniser->context.pending, 1723 &cptr, 1724 &len); 1725 1726 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) { 1727 return hubbub_error_from_parserutils_error(error); 1728 } else if (error == PARSERUTILS_EOF || *cptr != '\n') { 1729 error = parserutils_buffer_append(tokeniser->buffer, 1730 &lf, sizeof(lf)); 1731 if (error != PARSERUTILS_OK) { 1732 return hubbub_error_from_parserutils_error( 1733 error); 1734 } 1735 } 1736 tokeniser->context.pending += len; 1737 } else { 1738 error = parserutils_buffer_append(tokeniser->buffer, 1739 (uint8_t *) cptr, len); 1740 if (error != PARSERUTILS_OK) 1741 return hubbub_error_from_parserutils_error(error); 1742 1743 tokeniser->context.pending += len; 1744 } 1745 1746 return HUBBUB_OK; 1747} 1748 1749/* this state always switches to another state straight away */ 1750hubbub_error hubbub_tokeniser_handle_markup_declaration_open( 1751 hubbub_tokeniser *tokeniser) 1752{ 1753 size_t len; 1754 const uint8_t *cptr; 1755 parserutils_error error; 1756 uint8_t c; 1757 1758 assert(tokeniser->context.pending == 0); 1759 1760 error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len); 1761 1762 if (error != PARSERUTILS_OK) { 1763 if (error == PARSERUTILS_EOF) { 1764 tokeniser->state = STATE_BOGUS_COMMENT; 1765 return HUBBUB_OK; 1766 } else { 1767 return hubbub_error_from_parserutils_error(error); 1768 } 1769 } 1770 1771 c = *cptr; 1772 1773 if (c == '-') { 1774 tokeniser->context.pending = len; 1775 tokeniser->state = STATE_MATCH_COMMENT; 1776 } else if ((c & ~0x20) == 'D') { 1777 tokeniser->context.pending = len; 1778 tokeniser->context.match_doctype.count = len; 1779 tokeniser->state = STATE_MATCH_DOCTYPE; 1780 } else if (tokeniser->process_cdata_section == true && c == '[') { 1781 tokeniser->context.pending = len; 1782 
tokeniser->context.match_cdata.count = len; 1783 tokeniser->state = STATE_MATCH_CDATA; 1784 } else { 1785 tokeniser->state = STATE_BOGUS_COMMENT; 1786 } 1787 1788 return HUBBUB_OK; 1789} 1790 1791 1792hubbub_error hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser) 1793{ 1794 size_t len; 1795 const uint8_t *cptr; 1796 parserutils_error error; 1797 1798 error = parserutils_inputstream_peek(tokeniser->input, 1799 tokeniser->context.pending, &cptr, &len); 1800 1801 if (error != PARSERUTILS_OK) { 1802 if (error == PARSERUTI…
Large files files are truncated, but you can click here to view the full file