/peek-build/src/netdepends/hubbub-0.0.2/src/tokeniser/tokeniser.c
/*
 * This file is part of Hubbub.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
 */
#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include <stdio.h>

#include <parserutils/charset/utf8.h>

#include "utils/parserutilserror.h"
#include "utils/utils.h"

#include "tokeniser/entities.h"
#include "tokeniser/tokeniser.h"

/**
 * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
 */
static const uint32_t cp1252Table[32] = {
	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
};
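/*
 * Example (illustrative sketch, not part of the original source): looking
 * up a Windows-1252 byte in the 128-159 range via the table above. Byte
 * 0x80, for instance, yields U+20AC EURO SIGN, while bytes with no
 * Windows-1252 assignment (e.g. 0x81) yield U+FFFD.
 */
#if 0
static uint32_t cp1252_to_ucs4(uint8_t b)
{
	assert(0x80 <= b && b <= 0x9F);

	return cp1252Table[b - 0x80];
}
#endif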
/**
 * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
 */
static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };


/**
 * String for when we want to emit newlines
 */
static const uint8_t lf = '\n';
static const hubbub_string lf_str = { &lf, 1 };


/**
 * Tokeniser states
 */
typedef enum hubbub_tokeniser_state {
	STATE_DATA,
	STATE_CHARACTER_REFERENCE_DATA,
	STATE_TAG_OPEN,
	STATE_CLOSE_TAG_OPEN,
	STATE_TAG_NAME,
	STATE_BEFORE_ATTRIBUTE_NAME,
	STATE_ATTRIBUTE_NAME,
	STATE_AFTER_ATTRIBUTE_NAME,
	STATE_BEFORE_ATTRIBUTE_VALUE,
	STATE_ATTRIBUTE_VALUE_DQ,
	STATE_ATTRIBUTE_VALUE_SQ,
	STATE_ATTRIBUTE_VALUE_UQ,
	STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
	STATE_AFTER_ATTRIBUTE_VALUE_Q,
	STATE_SELF_CLOSING_START_TAG,
	STATE_BOGUS_COMMENT,
	STATE_MARKUP_DECLARATION_OPEN,
	STATE_MATCH_COMMENT,
	STATE_COMMENT_START,
	STATE_COMMENT_START_DASH,
	STATE_COMMENT,
	STATE_COMMENT_END_DASH,
	STATE_COMMENT_END,
	STATE_MATCH_DOCTYPE,
	STATE_DOCTYPE,
	STATE_BEFORE_DOCTYPE_NAME,
	STATE_DOCTYPE_NAME,
	STATE_AFTER_DOCTYPE_NAME,
	STATE_MATCH_PUBLIC,
	STATE_BEFORE_DOCTYPE_PUBLIC,
	STATE_DOCTYPE_PUBLIC_DQ,
	STATE_DOCTYPE_PUBLIC_SQ,
	STATE_AFTER_DOCTYPE_PUBLIC,
	STATE_MATCH_SYSTEM,
	STATE_BEFORE_DOCTYPE_SYSTEM,
	STATE_DOCTYPE_SYSTEM_DQ,
	STATE_DOCTYPE_SYSTEM_SQ,
	STATE_AFTER_DOCTYPE_SYSTEM,
	STATE_BOGUS_DOCTYPE,
	STATE_MATCH_CDATA,
	STATE_CDATA_BLOCK,
	STATE_NUMBERED_ENTITY,
	STATE_NAMED_ENTITY
} hubbub_tokeniser_state;

/**
 * Context for tokeniser
 */
typedef struct hubbub_tokeniser_context {
	size_t pending;				/**< Count of pending chars */

	hubbub_string current_comment;		/**< Current comment text */

	hubbub_token_type current_tag_type;	/**< Type of current_tag */
	hubbub_tag current_tag;			/**< Current tag */
	hubbub_doctype current_doctype;		/**< Current doctype */
	hubbub_tokeniser_state prev_state;	/**< Previous state */

	uint8_t last_start_tag_name[10];	/**< Name of the last start tag
						 * emitted */
	size_t last_start_tag_len;		/**< Length of last start tag */

	struct {
		uint32_t count;
		bool match;
	} close_tag_match;			/**< State for matching close
						 * tags */

	struct {
		uint32_t count;			/**< Index into "DOCTYPE" */
	} match_doctype;			/**< State for matching doctype */

	struct {
		uint32_t count;			/**< Index into "[CDATA[" */
		uint32_t end;			/**< Index into "]]>" */
	} match_cdata;				/**< State for matching cdata */

	struct {
		size_t offset;			/**< Offset in buffer */
		uint32_t length;		/**< Length of entity */
		uint32_t codepoint;		/**< UCS4 codepoint */
		bool complete;			/**< True if match complete */

		uint32_t poss_length;		/**< Optimistic length
						 * when matching named
						 * character references */
		uint8_t base;			/**< Base for numeric
						 * entities */
		void *context;			/**< Context for named
						 * entity search */
		size_t prev_len;		/**< Previous byte length
						 * of str */
		bool had_data;			/**< Whether we read
						 * anything after &#(x)? */
		bool overflow;			/**< Whether this entity has
						 * overflowed the maximum
						 * numeric entity value */
		hubbub_tokeniser_state return_state;	/**< State we were
							 * called from */
	} match_entity;				/**< Entity matching state */

	struct {
		uint32_t line;			/**< Current line of input */
		uint32_t col;			/**< Current character in
						 * line */
	} position;				/**< Position in source data */

	uint32_t allowed_char;			/**< Used for quote matching */
} hubbub_tokeniser_context;

/**
 * Tokeniser data structure
 */
struct hubbub_tokeniser {
	hubbub_tokeniser_state state;	/**< Current tokeniser state */
	hubbub_content_model content_model;	/**< Current content
						 * model flag */
	bool escape_flag;		/**< Escape flag */
	bool process_cdata_section;	/**< Whether to process CDATA sections */

	parserutils_inputstream *input;	/**< Input stream */
	parserutils_buffer *buffer;	/**< Input buffer */

	hubbub_tokeniser_context context;	/**< Tokeniser context */

	hubbub_token_handler token_handler;	/**< Token handling callback */
	void *token_pw;				/**< Token handler data */

	hubbub_error_handler error_handler;	/**< Error handling callback */
	void *error_pw;				/**< Error handler data */

	hubbub_allocator_fn alloc;	/**< Memory (de)allocation function */
	void *alloc_pw;			/**< Client private data */
};

static hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_close_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_cdata(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_cdata_block(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t off);
static hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_named_entity(
		hubbub_tokeniser *tokeniser);

static inline hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
		const hubbub_string *chars);
static inline hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
		bool force_quirks);
static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token);

/**
 * Create a hubbub tokeniser
 *
 * \param input      Input stream instance
 * \param alloc      Memory (de)allocation function
 * \param pw         Pointer to client-specific private data (may be NULL)
 * \param tokeniser  Pointer to location to receive tokeniser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion
 */
hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input,
		hubbub_allocator_fn alloc, void *pw,
		hubbub_tokeniser **tokeniser)
{
	parserutils_error perror;
	hubbub_tokeniser *tok;

	if (input == NULL || alloc == NULL || tokeniser == NULL)
		return HUBBUB_BADPARM;

	tok = alloc(NULL, sizeof(hubbub_tokeniser), pw);
	if (tok == NULL)
		return HUBBUB_NOMEM;

	perror = parserutils_buffer_create(alloc, pw, &tok->buffer);
	if (perror != PARSERUTILS_OK) {
		alloc(tok, 0, pw);
		return hubbub_error_from_parserutils_error(perror);
	}

	tok->state = STATE_DATA;
	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;

	tok->escape_flag = false;
	tok->process_cdata_section = false;

	tok->input = input;

	tok->token_handler = NULL;
	tok->token_pw = NULL;

	tok->error_handler = NULL;
	tok->error_pw = NULL;

	tok->alloc = alloc;
	tok->alloc_pw = pw;

	memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));

	*tokeniser = tok;

	return HUBBUB_OK;
}

/**
 * Destroy a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to destroy
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
{
	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->context.current_tag.attributes != NULL) {
		tokeniser->alloc(tokeniser->context.current_tag.attributes,
				0, tokeniser->alloc_pw);
	}

	parserutils_buffer_destroy(tokeniser->buffer);

	tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw);

	return HUBBUB_OK;
}

/**
 * Configure a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to configure
 * \param type       The option type to set
 * \param params     Option-specific parameters
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
		hubbub_tokeniser_opttype type,
		hubbub_tokeniser_optparams *params)
{
	if (tokeniser == NULL || params == NULL)
		return HUBBUB_BADPARM;

	switch (type) {
	case HUBBUB_TOKENISER_TOKEN_HANDLER:
		tokeniser->token_handler = params->token_handler.handler;
		tokeniser->token_pw = params->token_handler.pw;
		break;
	case HUBBUB_TOKENISER_ERROR_HANDLER:
		tokeniser->error_handler = params->error_handler.handler;
		tokeniser->error_pw = params->error_handler.pw;
		break;
	case HUBBUB_TOKENISER_CONTENT_MODEL:
		tokeniser->content_model = params->content_model.model;
		break;
	case HUBBUB_TOKENISER_PROCESS_CDATA:
		tokeniser->process_cdata_section = params->process_cdata;
		break;
	}

	return HUBBUB_OK;
}
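/*
 * Example (illustrative sketch, not part of the original source): typical
 * client-side lifecycle of a tokeniser. The names my_token_handler and
 * example_lifecycle are hypothetical; the handler signature below is
 * assumed to match hubbub's hubbub_token_handler callback type.
 */
#if 0
static hubbub_error my_token_handler(const hubbub_token *token, void *pw)
{
	(void) token;
	(void) pw;

	/* Inspect token->type / token->data here */
	return HUBBUB_OK;
}

static void example_lifecycle(parserutils_inputstream *stream,
		hubbub_allocator_fn alloc, void *pw)
{
	hubbub_tokeniser *tok;
	hubbub_tokeniser_optparams params;

	if (hubbub_tokeniser_create(stream, alloc, pw, &tok) != HUBBUB_OK)
		return;

	params.token_handler.handler = my_token_handler;
	params.token_handler.pw = NULL;
	hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER, &params);

	/* After appending data to the input stream: */
	hubbub_tokeniser_run(tok);

	hubbub_tokeniser_destroy(tok);
}
#endif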
/**
 * Process remaining data in the input stream
 *
 * \param tokeniser  The tokeniser instance to invoke
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
{
	hubbub_error cont = HUBBUB_OK;

	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

#if 0
#define state(x) \
	case x: \
		printf( #x "\n");
#else
#define state(x) \
	case x:
#endif

	while (cont == HUBBUB_OK) {
		switch (tokeniser->state) {
		state(STATE_DATA)
			cont = hubbub_tokeniser_handle_data(tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_DATA)
			cont = hubbub_tokeniser_handle_character_reference_data(
					tokeniser);
			break;
		state(STATE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_tag_open(tokeniser);
			break;
		state(STATE_CLOSE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_close_tag_open(
					tokeniser);
			break;
		state(STATE_TAG_NAME)
			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_before_attribute_name(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_attribute_name(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_after_attribute_name(
					tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_before_attribute_value(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_DQ)
			cont = hubbub_tokeniser_handle_attribute_value_dq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_SQ)
			cont = hubbub_tokeniser_handle_attribute_value_sq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_UQ)
			cont = hubbub_tokeniser_handle_attribute_value_uq(
					tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_character_reference_in_attribute_value(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_VALUE_Q)
			cont = hubbub_tokeniser_handle_after_attribute_value_q(
					tokeniser);
			break;
		state(STATE_SELF_CLOSING_START_TAG)
			cont = hubbub_tokeniser_handle_self_closing_start_tag(
					tokeniser);
			break;
		state(STATE_BOGUS_COMMENT)
			cont = hubbub_tokeniser_handle_bogus_comment(
					tokeniser);
			break;
		state(STATE_MARKUP_DECLARATION_OPEN)
			cont = hubbub_tokeniser_handle_markup_declaration_open(
					tokeniser);
			break;
		state(STATE_MATCH_COMMENT)
			cont = hubbub_tokeniser_handle_match_comment(
					tokeniser);
			break;
		case STATE_COMMENT_START:
		case STATE_COMMENT_START_DASH:
		case STATE_COMMENT:
		case STATE_COMMENT_END_DASH:
		case STATE_COMMENT_END:
			cont = hubbub_tokeniser_handle_comment(tokeniser);
			break;
		state(STATE_MATCH_DOCTYPE)
			cont = hubbub_tokeniser_handle_match_doctype(
					tokeniser);
			break;
		state(STATE_DOCTYPE)
			cont = hubbub_tokeniser_handle_doctype(tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_before_doctype_name(
					tokeniser);
			break;
		state(STATE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_doctype_name(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_after_doctype_name(
					tokeniser);
			break;

		state(STATE_MATCH_PUBLIC)
			cont = hubbub_tokeniser_handle_match_public(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_before_doctype_public(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_DQ)
			cont = hubbub_tokeniser_handle_doctype_public_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_SQ)
			cont = hubbub_tokeniser_handle_doctype_public_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_after_doctype_public(
					tokeniser);
			break;
		state(STATE_MATCH_SYSTEM)
			cont = hubbub_tokeniser_handle_match_system(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_before_doctype_system(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_DQ)
			cont = hubbub_tokeniser_handle_doctype_system_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_SQ)
			cont = hubbub_tokeniser_handle_doctype_system_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_after_doctype_system(
					tokeniser);
			break;
		state(STATE_BOGUS_DOCTYPE)
			cont = hubbub_tokeniser_handle_bogus_doctype(
					tokeniser);
			break;
		state(STATE_MATCH_CDATA)
			cont = hubbub_tokeniser_handle_match_cdata(
					tokeniser);
			break;
		state(STATE_CDATA_BLOCK)
			cont = hubbub_tokeniser_handle_cdata_block(
					tokeniser);
			break;
		state(STATE_NUMBERED_ENTITY)
			cont = hubbub_tokeniser_handle_numbered_entity(
					tokeniser);
			break;
		state(STATE_NAMED_ENTITY)
			cont = hubbub_tokeniser_handle_named_entity(
					tokeniser);
			break;
		}
	}

	return (cont == HUBBUB_NEEDDATA) ? HUBBUB_OK : cont;
}


/**
 * Various macros for manipulating buffers.
 *
 * \todo make some of these inline functions (type-safety)
 * \todo document them properly here
 */

#define START_BUF(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len = (length); \
	} while (0)

#define COLLECT(str, cptr, length) \
	do { \
		parserutils_error perror; \
		assert(str.len != 0); \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

#define COLLECT_MS(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)
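/*
 * Note (illustrative): all three macros above append bytes to
 * tokeniser->buffer, so they require a local variable named "tokeniser" in
 * scope, and they return from the enclosing function on allocation failure,
 * so they may only appear where a hubbub_error return is valid. START_BUF
 * begins a string and sets its length; COLLECT appends to a string that
 * must already be non-empty (it asserts str.len != 0); COLLECT_MS is
 * identical to COLLECT minus that assertion. A typical use, sketched with
 * cptr/len as obtained from an inputstream peek:
 */
#if 0
	START_BUF(ctag->name, cptr, len);	/* first character of a name */
	COLLECT(ctag->name, cptr, len);		/* each subsequent character */
#endif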
/* this should always be called with an empty "chars" buffer */
hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
{
	parserutils_error error;
	hubbub_token token;
	const uint8_t *cptr;
	size_t len;

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len)) ==
			PARSERUTILS_OK) {
		const uint8_t c = *cptr;

		if (c == '&' &&
				(tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_PCDATA ||
				tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_RCDATA) &&
				tokeniser->escape_flag == false) {
			tokeniser->state = STATE_CHARACTER_REFERENCE_DATA;
			/* Don't eat the '&'; it'll be handled by entity
			 * consumption */
			break;
		} else if (c == '-' &&
				tokeniser->escape_flag == false &&
				(tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->context.pending >= 3) {
			size_t ignore;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 3,
					&cptr,
					&ignore);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr,
					"<!--", SLEN("<!--")) == 0) {
				tokeniser->escape_flag = true;
			}

			tokeniser->context.pending += len;
		} else if (c == '<' && (tokeniser->content_model ==
				HUBBUB_CONTENT_MODEL_PCDATA ||
				((tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->escape_flag == false))) {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Buffer '<' */
			tokeniser->context.pending = len;
			tokeniser->state = STATE_TAG_OPEN;
			break;
		} else if (c == '>' && tokeniser->escape_flag == true &&
				(tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA)) {
			/* no need to check that there are enough characters,
			 * since you can only run into this if the flag is
			 * true in the first place, which requires four
			 * characters. */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 2,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr, "-->", SLEN("-->")) == 0) {
				tokeniser->escape_flag = false;
			}

			tokeniser->context.pending += len;
		} else if (c == '\0') {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Emit a replacement character */
			emit_character_token(tokeniser, &u_fffd_str);

			/* Advance past NUL */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else if (c == '\r') {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				break;
			}

			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			if (error == PARSERUTILS_EOF || *cptr != '\n') {
				/* Emit newline */
				emit_character_token(tokeniser, &lf_str);
			}

			/* Advance over */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else {
			/* Just collect into buffer */
			tokeniser->context.pending += len;
		}
	}

	if (tokeniser->state != STATE_TAG_OPEN &&
			(tokeniser->state != STATE_DATA ||
				error == PARSERUTILS_EOF) &&
			tokeniser->context.pending > 0) {
		/* Emit any pending characters */
		emit_current_chars(tokeniser);
	}

	if (error == PARSERUTILS_EOF) {
		token.type = HUBBUB_TOKEN_EOF;
		hubbub_tokeniser_emit_token(tokeniser, &token);
	}

	if (error == PARSERUTILS_EOF) {
		return HUBBUB_NEEDDATA;
	} else {
		return hubbub_error_from_parserutils_error(error);
	}
}

/* emit any pending tokens before calling */
hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser)
{
	assert(tokeniser->context.pending == 0);

	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_token token;

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		token.type = HUBBUB_TOKEN_CHARACTER;

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
					tokeniser->context.match_entity.codepoint,
					&utf8ptr, &len);

			token.data.character.ptr = utf8;
			token.data.character.len = sizeof(utf8) - len;

			hubbub_tokeniser_emit_token(tokeniser, &token);

			/* +1 for ampersand */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.match_entity.length
					+ 1);
		} else {
			parserutils_error error;
			const uint8_t *cptr = NULL;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			token.data.character.ptr = cptr;
			token.data.character.len = len;

			hubbub_tokeniser_emit_token(tokeniser, &token);
			parserutils_inputstream_advance(tokeniser->input, len);
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		tokeniser->state = STATE_DATA;
	}

	return HUBBUB_OK;
}
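/*
 * Note (illustrative): the handler above converts a matched character
 * reference to UTF-8 with parserutils_charset_utf8_from_ucs4, which writes
 * through the supplied pointer and decrements the remaining length, so the
 * number of bytes produced is sizeof(utf8) minus the residual length. A
 * standalone sketch of the same call pattern:
 */
#if 0
	uint8_t utf8[6];
	uint8_t *utf8ptr = utf8;
	size_t len = sizeof(utf8);

	parserutils_charset_utf8_from_ucs4(0x20AC /* U+20AC */, &utf8ptr, &len);
	/* utf8 now begins 0xE2 0x82 0xAC, and sizeof(utf8) - len == 3 */
#endif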
/* this state always switches to another state straight away */
/* this state expects the current character to be '<' */
hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 1);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '/') {
		tokeniser->context.pending += len;

		tokeniser->context.close_tag_match.match = false;
		tokeniser->context.close_tag_match.count = 0;

		tokeniser->state = STATE_CLOSE_TAG_OPEN;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		/* Return to data state with '<' still in "chars" */
		tokeniser->state = STATE_DATA;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
		if (c == '!') {
			parserutils_inputstream_advance(tokeniser->input,
					SLEN("<!"));

			tokeniser->context.pending = 0;
			tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
		} else if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);

			START_BUF(ctag->name, &lc, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(ctag->name, cptr, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/** \todo parse error */

			tokeniser->context.pending += len;
			tokeniser->state = STATE_DATA;
		} else if (c == '?') {
			/** \todo parse error */

			/* Cursor still at "<", need to advance past it */
			parserutils_inputstream_advance(
					tokeniser->input, SLEN("<"));
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		} else {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.chars to be "</" */
/* this state never stays in this state for more than one character */
hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 2);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
/*	assert(tokeniser->context.chars.ptr[1] == '/'); */

	/**\todo fragment case */

	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		uint8_t *start_tag_name =
				tokeniser->context.last_start_tag_name;
		size_t start_tag_len =
				tokeniser->context.last_start_tag_len;

		while ((error = parserutils_inputstream_peek(tokeniser->input,
				ctx->pending + ctx->close_tag_match.count,
				&cptr,
				&len)) == PARSERUTILS_OK) {
			c = *cptr;

			if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
					!= (c & ~0x20)) {
				break;
			}

			ctx->close_tag_match.count += len;

			if (ctx->close_tag_match.count == start_tag_len) {
				ctx->close_tag_match.match = true;
				break;
			}
		}

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (ctx->close_tag_match.match == true) {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error != PARSERUTILS_EOF) {
				c = *cptr;

				if (c != '\t' && c != '\n' && c != '\f' &&
						c != ' ' && c != '>' &&
						c != '/') {
					ctx->close_tag_match.match = false;
				}
			}
		}
	}

	if (ctx->close_tag_match.match == false &&
			tokeniser->content_model !=
					HUBBUB_CONTENT_MODEL_PCDATA) {
		/* We should emit "</" here, but instead we leave it in the
		 * buffer so the data state emits it with any characters
		 * following it */
		tokeniser->state = STATE_DATA;
	} else {
		error = parserutils_inputstream_peek(tokeniser->input,
				tokeniser->context.pending, &cptr, &len);

		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */

			/* Return to data state with "</" pending */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else if (error != PARSERUTILS_OK) {
			return hubbub_error_from_parserutils_error(error);
		}

		c = *cptr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(tokeniser->context.current_tag.name,
					&lc, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(tokeniser->context.current_tag.name,
					cptr, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/* Cursor still at "</", need to collect ">" */
			tokeniser->context.pending += len;

			/* Now need to advance past "</>" */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			/** \todo parse error */
			tokeniser->state = STATE_DATA;
		} else {
			/** \todo parse error */

			/* Cursor still at "</", need to advance past it */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		}
	}

	return HUBBUB_OK;
}
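/*
 * Note (illustrative): the close-tag matcher above compares bytes using
 * (c & ~0x20). ASCII upper- and lower-case letters differ only in bit 0x20,
 * so masking it off folds both cases to a common value, e.g.
 * ('d' & ~0x20) == ('D' & ~0x20) == 0x44. This is only a valid
 * case-insensitive comparison for ASCII letters, which suffices for the
 * element names that can trigger the RCDATA/CDATA content models here
 * (e.g. "title", "script").
 */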
/* this state expects tokeniser->context.current_tag to already have its
   first character set */
hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending > 0);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
	assert(ctag->name.len > 0);
/*	assert(ctag->name.ptr); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
		tokeniser->context.pending += len;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0);

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				&lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'') {
			/** \todo parse error */
		}

		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}
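/*
 * Note (illustrative): the attribute-array growth above relies on
 * hubbub_allocator_fn having realloc-style semantics, as this file's own
 * calls demonstrate: alloc(NULL, size, pw) allocates, alloc(ptr, size, pw)
 * resizes, and alloc(ptr, 0, pw) frees. A minimal client allocator might
 * therefore look like this (sketch; my_alloc is a hypothetical name and
 * needs <stdlib.h>):
 */
#if 0
static void *my_alloc(void *ptr, size_t size, void *pw)
{
	(void) pw;	/* no per-client state in this sketch */

	if (size == 0) {
		free(ptr);
		return NULL;
	}

	return realloc(ptr, size);
}
#endif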
/* this state is only ever triggered by an '=' */
hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ;
	} else if (c == '&') {
		/* Don't consume the '&' -- reprocess in UQ state */
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ;
	} else if (c == '>') {
		/** \todo parse error */
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else {
		if (c == '=') {
			/** \todo parse error */
		}

		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '"';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '\'';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume \r */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;
	uint8_t c;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(c == '&' ||
			ctag->attributes[ctag->n_attributes - 1].value.len >= 1);

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else {
		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_tag *ctag = &tokeniser->context.current_tag;
		hubbub_attribute *attr = &ctag->attributes[
				ctag->n_attributes - 1];

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
					tokeniser->context.match_entity.codepoint,
					&utf8ptr, &len);

			COLLECT_MS(attr->value, utf8, sizeof(utf8) - len);

			/* +1 for the ampersand */
			tokeniser->context.pending +=
					tokeniser->context.match_entity.length
					+ 1;
		} else {
			size_t len = 0;
			const uint8_t *cptr = NULL;
			parserutils_error error;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			/* Insert the ampersand */
			COLLECT_MS(attr->value, cptr, len);
			tokeniser->context.pending += len;
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		/* And back to the previous state */
		tokeniser->state = tokeniser->context.prev_state;
	}

	return HUBBUB_OK;
}
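/*
 * Note (illustrative): the '\r' branches in the attribute-value handlers
 * above implement newline normalisation: a lone CR becomes LF, and a CRLF
 * pair collapses to a single LF (the CR is consumed here and the LF is
 * collected on the next pass). In pseudo-form:
 *
 *	if (c == '\r') {
 *		peek at the next character;
 *		if (at EOF or next != '\n')
 *			collect '\n';	(lone CR becomes LF)
 *		consume the CR;		(a following LF is kept as-is)
 *	}
 */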
STATE_BEFORE_ATTRIBUTE_NAME; 1645 } 1646 1647 return HUBBUB_OK; 1648} 1649 1650hubbub_error hubbub_tokeniser_handle_self_closing_start_tag( 1651 hubbub_tokeniser *tokeniser) 1652{ 1653 size_t len; 1654 const uint8_t *cptr; 1655 parserutils_error error; 1656 uint8_t c; 1657 1658 error = parserutils_inputstream_peek(tokeniser->input, 1659 tokeniser->context.pending, &cptr, &len); 1660 1661 if (error != PARSERUTILS_OK) { 1662 if (error == PARSERUTILS_EOF) { 1663 tokeniser->state = STATE_DATA; 1664 return emit_current_tag(tokeniser); 1665 } else { 1666 return hubbub_error_from_parserutils_error(error); 1667 } 1668 } 1669 1670 c = *cptr; 1671 1672 if (c == '>') { 1673 tokeniser->context.pending += len; 1674 tokeniser->state = STATE_DATA; 1675 1676 tokeniser->context.current_tag.self_closing = true; 1677 return emit_current_tag(tokeniser); 1678 } else { 1679 /* Reprocess character in before attribute name state */ 1680 tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME; 1681 } 1682 1683 return HUBBUB_OK; 1684} 1685 1686/* this state expects tokeniser->context.chars to be empty on first entry */ 1687hubbub_error hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser) 1688{ 1689 size_t len; 1690 const uint8_t *cptr; 1691 parserutils_error error; 1692 uint8_t c; 1693 1694 error = parserutils_inputstream_peek(tokeniser->input, 1695 tokeniser->context.pending, &cptr, &len); 1696 1697 if (error != PARSERUTILS_OK) { 1698 if (error == PARSERUTILS_EOF) { 1699 tokeniser->state = STATE_DATA; 1700 return emit_current_comment(tokeniser); 1701 } else { 1702 return hubbub_error_from_parserutils_error(error); 1703 } 1704 } 1705 1706 c = *cptr; 1707 1708 if (c == '>') { 1709 tokeniser->context.pending += len; 1710 tokeniser->state = STATE_DATA; 1711 return emit_current_comment(tokeniser); 1712 } else if (c == '\0') { 1713 error = parserutils_buffer_append(tokeniser->buffer, 1714 u_fffd, sizeof(u_fffd)); 1715 if (error != PARSERUTILS_OK) 1716 return hubbub_error_from_parserutils_error(error); 1717 1718 tokeniser->context.pending += len; 1719 } else if (c == '\r') { 1720 error = parserutils_inputstream_peek( 1721 tokeniser->input, 1722 tokeniser->context.pending, 1723 &cptr, 1724 &len); 1725 1726 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) { 1727 return hubbub_error_from_parserutils_error(error); 1728 } else if (error == PARSERUTILS_EOF || *cptr != '\n') { 1729 error = parserutils_buffer_append(tokeniser->buffer, 1730 &lf, sizeof(lf)); 1731 if (error != PARSERUTILS_OK) { 1732 return hubbub_error_from_parserutils_error( 1733 error); 1734 } 1735 } 1736 tokeniser->context.pending += len; 1737 } else { 1738 error = parserutils_buffer_append(tokeniser->buffer, 1739 (uint8_t *) cptr, len); 1740 if (error != PARSERUTILS_OK) 1741 return hubbub_error_from_parserutils_error(error); 1742 1743 tokeniser->context.pending += len; 1744 } 1745 1746 return HUBBUB_OK; 1747} 1748 1749/* this state always switches to another state straight away */ 1750hubbub_error hubbub_tokeniser_handle_markup_declaration_open( 1751 hubbub_tokeniser *tokeniser) 1752{ 1753 size_t len; 1754 const uint8_t *cptr; 1755 parserutils_error error; 1756 uint8_t c; 1757 1758 assert(tokeniser->context.pending == 0); 1759 1760 error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len); 1761 1762 if (error != PARSERUTILS_OK) { 1763 if (error == PARSERUTILS_EOF) { 1764 tokeniser->state = STATE_BOGUS_COMMENT; 1765 return HUBBUB_OK; 1766 } else { 1767 return hubbub_error_from_parserutils_error(error); 1768 } 1769 } 1770 1771 c = 
*cptr; 1772 1773 if (c == '-') { 1774 tokeniser->context.pending = len; 1775 tokeniser->state = STATE_MATCH_COMMENT; 1776 } else if ((c & ~0x20) == 'D') { 1777 tokeniser->context.pending = len; 1778 tokeniser->context.match_doctype.count = len; 1779 tokeniser->state = STATE_MATCH_DOCTYPE; 1780 } else if (tokeniser->process_cdata_section == true && c == '[') { 1781 tokeniser->context.pending = len; 1782 tokeniser->context.match_cdata.count = len; 1783 tokeniser->state = STATE_MATCH_CDATA; 1784 } else { 1785 tokeniser->state = STATE_BOGUS_COMMENT; 1786 } 1787 1788 return HUBBUB_OK; 1789} 1790 1791 1792hubbub_error hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser) 1793{ 1794 size_t len; 1795 const uint8_t *cptr; 1796 parserutils_error error; 1797 1798 error = parserutils_inputstream_peek(tokeniser->input, 1799 tokeniser->context.pending, &cptr, &len); 1800 1801 if (error != PARSERUTILS_OK) { 1802 if (error == PARSERUTILS_EOF) { 1803 tokeniser->context.pending = 1804 tokeniser->context.current_comment.len = 0; 1805 tokeniser->state = STATE_BOGUS_COMMENT; 1806 return HUBBUB_OK; 1807 } else { 1808 return hubbub_error_from_parserutils_error(error); 1809 } 1810 } 1811 1812 tokeniser->context.pending = tokeniser->context.current_comment.len = 0; 1813 1814 if (*cptr == '-') { 1815 parserutils_inputstream_advance(tokeniser->input, SLEN("--")); 1816 tokeniser->state = STATE_COMMENT_START; 1817 } else { 1818 tokeniser->state = STATE_BOGUS_COMMENT; 1819 } 1820 1821 return HUBBUB_OK; 1822} 1823 1824 1825hubbub_error hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser) 1826{ 1827 size_t len; 1828 const uint8_t *cptr; 1829 parserutils_error error; 1830 uint8_t c; 1831 1832 error = parserutils_inputstream_peek(tokeniser->input, 1833 tokeniser->context.pending, &cptr, &len); 1834 1835 if (error != PARSERUTILS_OK) { 1836 if (error == PARSERUTILS_EOF) { 1837 tokeniser->state = STATE_DATA; 1838 return emit_current_comment(tokeniser); 1839 } else { 1840 return hubbub_error_from_parserutils_error(error); 1841 } 1842 } 1843 1844 c = *cptr; 1845 1846 if (c == '>' && (tokeniser->state == STATE_COMMENT_START_DASH || 1847 tokeniser->state == STATE_COMMENT_START || 1848 tokeniser->state == STATE_COMMENT_END)) { 1849 tokeniser->context.pending += len; 1850 1851 /** \todo parse error if state != COMMENT_END */ 1852 tokeniser->state = STATE_DATA; 1853 return emit_current_comment(tokeniser); 1854 } else if (c == '-') { 1855 if (tokeniser->state == STATE_COMMENT_START) { 1856 tokeniser->state = STATE_COMMENT_START_DASH; 1857 } else if (tokeniser->state == STATE_COMMENT_START_DASH) { 1858 tokeniser->state = STATE_COMMENT_END; 1859 } else if (tokeniser->state == STATE_COMMENT) { 1860 tokeniser->state = STATE_COMMENT_END_DASH; 1861 } else if (tokeniser->state == STATE_COMMENT_END_DASH) { 1862 tokeniser->state = STATE_COMMENT_END; 1863 } else if (tokeniser->state == STATE_COMMENT_END) { 1864 error = parserutils_buffer_append(tokeniser->buffer, 1865 (uint8_t *) "-", SLEN("-")); 1866 if (error != PARSERUTILS_OK) { 1867 return hubbub_error_from_parserutils_error( 1868 error); 1869 } 1870 } 1871 1872 tokeniser->context.pending += len; 1873 } else { 1874 if (tokeniser->state == STATE_COMMENT_START_DASH || 1875 tokeniser->state == STATE_COMMENT_END_DASH) { 1876 error = parserutils_buffer_append(tokeniser->buffer, 1877 (uint8_t *) "-", SLEN("-")); 1878 if (error != PARSERUTILS_OK) { 1879 return hubbub_error_from_parserutils_error( 1880 error); 1881 } 1882 } else if (tokeniser->state == STATE_COMMENT_END) { 
hubbub_error hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_comment(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>' && (tokeniser->state == STATE_COMMENT_START_DASH ||
			tokeniser->state == STATE_COMMENT_START ||
			tokeniser->state == STATE_COMMENT_END)) {
		tokeniser->context.pending += len;

		/** \todo parse error if state != COMMENT_END */
		tokeniser->state = STATE_DATA;
		return emit_current_comment(tokeniser);
	} else if (c == '-') {
		if (tokeniser->state == STATE_COMMENT_START) {
			tokeniser->state = STATE_COMMENT_START_DASH;
		} else if (tokeniser->state == STATE_COMMENT_START_DASH) {
			tokeniser->state = STATE_COMMENT_END;
		} else if (tokeniser->state == STATE_COMMENT) {
			tokeniser->state = STATE_COMMENT_END_DASH;
		} else if (tokeniser->state == STATE_COMMENT_END_DASH) {
			tokeniser->state = STATE_COMMENT_END;
		} else if (tokeniser->state == STATE_COMMENT_END) {
			error = parserutils_buffer_append(tokeniser->buffer,
					(uint8_t *) "-", SLEN("-"));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		tokeniser->context.pending += len;
	} else {
		if (tokeniser->state == STATE_COMMENT_START_DASH ||
				tokeniser->state == STATE_COMMENT_END_DASH) {
			error = parserutils_buffer_append(tokeniser->buffer,
					(uint8_t *) "-", SLEN("-"));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		} else if (tokeniser->state == STATE_COMMENT_END) {
			error = parserutils_buffer_append(tokeniser->buffer,
					(uint8_t *) "--", SLEN("--"));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		if (c == '\0') {
			error = parserutils_buffer_append(tokeniser->buffer,
					u_fffd, sizeof(u_fffd));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		} else if (c == '\r') {
			size_t next_len;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&next_len);
			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
				/* CR at EOF, or CR not followed by LF,
				 * normalises to LF (matching the other
				 * handlers in this file) */
				error = parserutils_buffer_append(
						tokeniser->buffer,
						&lf, sizeof(lf));
				if (error != PARSERUTILS_OK) {
					return hubbub_error_from_parserutils_error(
							error);
				}
			}
		} else {
			error = parserutils_buffer_append(tokeniser->buffer,
					cptr, len);
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		tokeniser->context.pending += len;
		tokeniser->state = STATE_COMMENT;
	}

	return HUBBUB_OK;
}



#define DOCTYPE "DOCTYPE"
#define DOCTYPE_LEN (SLEN(DOCTYPE) - 1)

hubbub_error hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.match_doctype.count, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.current_comment.len =
					tokeniser->context.pending = 0;
			tokeniser->state = STATE_BOGUS_COMMENT;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_doctype.count <= DOCTYPE_LEN);

	if (DOCTYPE[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
		tokeniser->context.current_comment.len =
				tokeniser->context.pending = 0;
		tokeniser->state = STATE_BOGUS_COMMENT;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_doctype.count == DOCTYPE_LEN) {
		/* Skip over the DOCTYPE bit */
		parserutils_inputstream_advance(tokeniser->input,
				tokeniser->context.pending);

		memset(&tokeniser->context.current_doctype, 0,
				sizeof tokeniser->context.current_doctype);
		tokeniser->context.current_doctype.public_missing = true;
		tokeniser->context.current_doctype.system_missing = true;
		tokeniser->context.pending = 0;

		tokeniser->state = STATE_DOCTYPE;
	}

	tokeniser->context.match_doctype.count++;

	return HUBBUB_OK;
}

#undef DOCTYPE
#undef DOCTYPE_LEN
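/*
 * STATE_DOCTYPE consumes at most one character: whitespace after the
 * "DOCTYPE" keyword is swallowed, anything else is left to be
 * reprocessed, and either way we move on to BEFORE_DOCTYPE_NAME
 * (which skips any remaining whitespace itself).
 */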
hubbub_error hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	}

	tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */
			/* Emit current doctype, force-quirks on */
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
		tokeniser->context.pending += len;
	} else if (c == '>') {
		/** \todo parse error */
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else {
		if (c == '\0') {
			START_BUF(cdoc->name, u_fffd, sizeof(u_fffd));
		} else if ('A' <= c && c <= 'Z') {
			uint8_t lc = c + 0x20;

			START_BUF(cdoc->name, &lc, len);
		} else {
			START_BUF(cdoc->name, cptr, len);
		}

		tokeniser->context.pending += len;
		tokeniser->state = STATE_DOCTYPE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	} else if (c == '\0') {
		COLLECT(cdoc->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = c + 0x20;
		COLLECT(cdoc->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(cdoc->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}
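/*
 * After the name the only meaningful continuations are '>' (emit the
 * doctype) or the keywords "PUBLIC" / "SYSTEM", matched
 * case-insensitively by the STATE_MATCH_* handlers below with
 * match_doctype.count primed to 1 since the leading 'P'/'S' is
 * consumed here. Anything else is a bogus doctype with force-quirks
 * set.
 */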
hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
	} else if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	} else if ((c & ~0x20) == 'P') {
		tokeniser->context.match_doctype.count = 1;
		tokeniser->state = STATE_MATCH_PUBLIC;
	} else if ((c & ~0x20) == 'S') {
		tokeniser->context.match_doctype.count = 1;
		tokeniser->state = STATE_MATCH_SYSTEM;
	} else {
		tokeniser->state = STATE_BOGUS_DOCTYPE;
		tokeniser->context.current_doctype.force_quirks = true;
	}

	return HUBBUB_OK;
}

#define PUBLIC "PUBLIC"
#define PUBLIC_LEN (SLEN(PUBLIC) - 1)

hubbub_error hubbub_tokeniser_handle_match_public(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.current_doctype.force_quirks = true;
			tokeniser->state = STATE_BOGUS_DOCTYPE;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_doctype.count <= PUBLIC_LEN);

	if (PUBLIC[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
		tokeniser->context.current_doctype.force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_doctype.count == PUBLIC_LEN) {
		tokeniser->state = STATE_BEFORE_DOCTYPE_PUBLIC;
	}

	tokeniser->context.match_doctype.count++;

	return HUBBUB_OK;
}

#undef PUBLIC
#undef PUBLIC_LEN

hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
	} else if (c == '"') {
		cdoc->public_missing = false;
		cdoc->public_id.len = 0;
		tokeniser->state = STATE_DOCTYPE_PUBLIC_DQ;
	} else if (c == '\'') {
		cdoc->public_missing = false;
		cdoc->public_id.len = 0;
		tokeniser->state = STATE_DOCTYPE_PUBLIC_SQ;
	} else if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else {
		cdoc->force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
	}

	return HUBBUB_OK;
}
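/*
 * The quoted public identifier collectors normalise as they copy: NUL
 * becomes U+FFFD, a lone CR becomes LF, and the LF of a CRLF pair is
 * collected on the following pass so the pair yields a single LF.
 * COLLECT_MS accumulates into the shared collection buffer; the
 * identifier's pointer is fixed up in emit_current_doctype().
 */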
hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else if (c == '\0') {
		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look ahead past the CR itself for a following LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
		}

		/* Collect '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(cdoc->public_id, cptr, len);

		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else if (c == '\0') {
		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look ahead past the CR itself for a following LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
		}

		/* Collect '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(cdoc->public_id, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}
hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
	} else if (c == '"') {
		cdoc->system_missing = false;
		cdoc->system_id.len = 0;

		tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
	} else if (c == '\'') {
		cdoc->system_missing = false;
		cdoc->system_id.len = 0;

		tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
	} else if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	} else {
		cdoc->force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
	}

	return HUBBUB_OK;
}



#define SYSTEM "SYSTEM"
#define SYSTEM_LEN (SLEN(SYSTEM) - 1)

hubbub_error hubbub_tokeniser_handle_match_system(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.current_doctype.force_quirks = true;
			tokeniser->state = STATE_BOGUS_DOCTYPE;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_doctype.count <= SYSTEM_LEN);

	if (SYSTEM[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
		tokeniser->context.current_doctype.force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_doctype.count == SYSTEM_LEN) {
		tokeniser->state = STATE_BEFORE_DOCTYPE_SYSTEM;
	}

	tokeniser->context.match_doctype.count++;

	return HUBBUB_OK;
}

#undef SYSTEM
#undef SYSTEM_LEN

hubbub_error hubbub_tokeniser_handle_before_doctype_system(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over */
	} else if (c == '"') {
		cdoc->system_missing = false;
		cdoc->system_id.len = 0;

		tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
	} else if (c == '\'') {
		cdoc->system_missing = false;
		cdoc->system_id.len = 0;

		tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
	} else if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else {
		cdoc->force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
	}

	return HUBBUB_OK;
}
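/*
 * The system identifier collectors below mirror the public ones
 * exactly, differing only in the terminating quote character and in
 * collecting into system_id rather than public_id.
 */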
hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else if (c == '\0') {
		COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look ahead past the CR itself for a following LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
		}

		/* Collect '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(cdoc->system_id, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else if (c == '\0') {
		COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look ahead past the CR itself for a following LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
		}

		/* Collect '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(cdoc->system_id, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}
hubbub_error hubbub_tokeniser_handle_after_doctype_system(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
	} else if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	} else {
		tokeniser->state = STATE_BOGUS_DOCTYPE;
	}

	return HUBBUB_OK;
}


hubbub_error hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, false);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	}

	return HUBBUB_OK;
}



#define CDATA "[CDATA["
#define CDATA_LEN (SLEN(CDATA) - 1)

hubbub_error hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.current_comment.len =
					tokeniser->context.pending = 0;
			tokeniser->state = STATE_BOGUS_COMMENT;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_cdata.count <= CDATA_LEN);

	if (CDATA[tokeniser->context.match_cdata.count] != (c & ~0x20)) {
		tokeniser->context.current_comment.len =
				tokeniser->context.pending = 0;
		tokeniser->state = STATE_BOGUS_COMMENT;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_cdata.count == CDATA_LEN) {
		parserutils_inputstream_advance(tokeniser->input,
				tokeniser->context.match_cdata.count + len);
		tokeniser->context.pending = 0;
		tokeniser->context.match_cdata.end = 0;
		tokeniser->state = STATE_CDATA_BLOCK;
	}

	tokeniser->context.match_cdata.count += len;

	return HUBBUB_OK;
}

#undef CDATA
#undef CDATA_LEN
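/*
 * CDATA content is emitted as character tokens straight out of the
 * input stream: ordinary bytes just extend context.pending, while
 * match_cdata.end tracks the length of a trailing "]]" run so that a
 * '>' completing "]]>" can drop those two bytes from the pending run,
 * flush it, and advance the stream past the terminator. NUL and CR
 * force an early flush because their replacements (U+FFFD and LF) are
 * not present in the stream and must be emitted from static buffers.
 * (Note that the keyword match above also uses (c & ~0x20), so it
 * incidentally accepts '{' where '[' is expected.)
 */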
hubbub_error hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_chars(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == ']' && (tokeniser->context.match_cdata.end == 0 ||
			tokeniser->context.match_cdata.end == 1)) {
		tokeniser->context.pending += len;
		tokeniser->context.match_cdata.end += len;
	} else if (c == '>' && tokeniser->context.match_cdata.end == 2) {
		/* Remove the previous two "]]" */
		tokeniser->context.pending -= 2;

		/* Emit any pending characters */
		emit_current_chars(tokeniser);

		/* Now move past the "]]>" bit */
		parserutils_inputstream_advance(tokeniser->input, SLEN("]]>"));

		tokeniser->state = STATE_DATA;
	} else if (c == '\0') {
		if (tokeniser->context.pending > 0) {
			/* Emit any pending characters */
			emit_current_chars(tokeniser);
		}

		/* Perform NUL-byte replacement */
		emit_character_token(tokeniser, &u_fffd_str);

		parserutils_inputstream_advance(tokeniser->input, len);
		tokeniser->context.match_cdata.end = 0;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (tokeniser->context.pending > 0) {
			/* Emit any pending characters */
			emit_current_chars(tokeniser);
		}

		if (error == PARSERUTILS_EOF || *cptr != '\n') {
			/* Emit newline */
			emit_character_token(tokeniser, &lf_str);
		}

		/* Advance over \r */
		parserutils_inputstream_advance(tokeniser->input, 1);
		tokeniser->context.match_cdata.end = 0;
	} else {
		tokeniser->context.pending += len;
		tokeniser->context.match_cdata.end = 0;
	}

	return HUBBUB_OK;
}
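/*
 * Character references are resolved by lookahead only: nothing is
 * consumed here. The caller passes pos, the offset of the '&' within
 * the unconsumed input, and the outcome is parked in
 * context.match_entity for the calling state to act on once
 * match_entity.complete is set (a codepoint of 0 meaning "not an
 * entity"). context.allowed_char is an extra terminating character
 * supplied by the attribute-value states, and is reset on every call.
 */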
hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t pos)
{
	uint32_t allowed_char = tokeniser->context.allowed_char;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;
	size_t off;

	error = parserutils_inputstream_peek(tokeniser->input, pos,
			&cptr, &len);

	/* We should always start on an ampersand */
	assert(error == PARSERUTILS_OK);
	assert(len == 1 && *cptr == '&');

	off = pos + len;

	/* Look at the character after the ampersand */
	error = parserutils_inputstream_peek(tokeniser->input, off,
			&cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.match_entity.complete = true;
			tokeniser->context.match_entity.codepoint = 0;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	/* Set things up */
	tokeniser->context.match_entity.offset = off;
	tokeniser->context.match_entity.poss_length = 0;
	tokeniser->context.match_entity.length = 0;
	tokeniser->context.match_entity.base = 0;
	tokeniser->context.match_entity.codepoint = 0;
	tokeniser->context.match_entity.had_data = false;
	tokeniser->context.match_entity.return_state = tokeniser->state;
	tokeniser->context.match_entity.complete = false;
	tokeniser->context.match_entity.overflow = false;
	tokeniser->context.match_entity.context = NULL;
	tokeniser->context.match_entity.prev_len = len;

	/* Reset allowed character for future calls */
	tokeniser->context.allowed_char = '\0';

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
			c == '<' || c == '&' ||
			(allowed_char && c == allowed_char)) {
		tokeniser->context.match_entity.complete = true;
		tokeniser->context.match_entity.codepoint = 0;
	} else if (c == '#') {
		tokeniser->context.match_entity.length += len;
		tokeniser->state = STATE_NUMBERED_ENTITY;
	} else {
		tokeniser->state = STATE_NAMED_ENTITY;
	}

	return HUBBUB_OK;
}


hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			ctx->match_entity.offset + ctx->match_entity.length,
			&cptr, &len);

	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
		return hubbub_error_from_parserutils_error(error);
	}

	if (error != PARSERUTILS_EOF && ctx->match_entity.base == 0) {
		uint8_t c = *cptr;
		if ((c & ~0x20) == 'X') {
			ctx->match_entity.base = 16;
			ctx->match_entity.length += len;
		} else {
			ctx->match_entity.base = 10;
		}
	}

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			ctx->match_entity.offset + ctx->match_entity.length,
			&cptr, &len)) == PARSERUTILS_OK) {
		uint8_t c = *cptr;

		if (ctx->match_entity.base == 10 &&
				('0' <= c && c <= '9')) {
			ctx->match_entity.had_data = true;
			ctx->match_entity.codepoint =
				ctx->match_entity.codepoint * 10 + (c - '0');

			ctx->match_entity.length += len;
		} else if (ctx->match_entity.base == 16 &&
				(('0' <= c && c <= '9') ||
				('A' <= (c & ~0x20) &&
						(c & ~0x20) <= 'F'))) {
			ctx->match_entity.had_data = true;
			ctx->match_entity.codepoint *= 16;

			if ('0' <= c && c <= '9') {
				ctx->match_entity.codepoint += (c - '0');
			} else {
				ctx->match_entity.codepoint +=
						((c & ~0x20) - 'A' + 10);
			}

			ctx->match_entity.length += len;
		} else {
			break;
		}

		if (ctx->match_entity.codepoint >= 0x10FFFF) {
			ctx->match_entity.overflow = true;
		}
	}

	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
		return hubbub_error_from_parserutils_error(error);
	}

	/* Eat trailing semicolon, if any */
	if (error != PARSERUTILS_EOF && *cptr == ';') {
		ctx->match_entity.length += len;
	}

	/* Had data, so calculate final codepoint */
	if (ctx->match_entity.had_data) {
		uint32_t cp = ctx->match_entity.codepoint;

		if (0x80 <= cp && cp <= 0x9F) {
			cp = cp1252Table[cp - 0x80];
		} else if (cp == 0x0D) {
			cp = 0x000A;
		} else if (ctx->match_entity.overflow ||
				cp <= 0x0008 || cp == 0x000B ||
				(0x000E <= cp && cp <= 0x001F) ||
				(0x007F <= cp && cp <= 0x009F) ||
				(0xD800 <= cp && cp <= 0xDFFF) ||
				(0xFDD0 <= cp && cp <= 0xFDEF) ||
				(cp & 0xFFFE) == 0xFFFE) {
			/* the check for cp > 0x10FFFF per spec is performed
			 * in the loop above to avoid overflow */
			cp = 0xFFFD;
		}

		ctx->match_entity.codepoint = cp;
	}

	/* Flag completion */
	ctx->match_entity.complete = true;

	/* And back to the state we were entered in */
	tokeniser->state = ctx->match_entity.return_state;

	return HUBBUB_OK;
}
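/*
 * Named entities are matched byte-by-byte against the entity
 * dictionary via hubbub_entities_search_step, longest match wins:
 * each time the walk lands on a complete name, the codepoint and
 * length are snapshotted, and the walk continues until the dictionary
 * reports no further continuation. This is how "&notit;" resolves as
 * "&not" followed by literal "it;", per the spec's longest-match rule.
 */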
hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			ctx->match_entity.offset +
					ctx->match_entity.poss_length,
			&cptr, &len)) == PARSERUTILS_OK) {
		uint32_t cp;

		uint8_t c = *cptr;
		/* Note: deliberately shadows the outer parserutils_error */
		hubbub_error error;

		if (c > 0x7F) {
			/* Entity names are ASCII only */
			break;
		}

		error = hubbub_entities_search_step(c, &cp,
				&ctx->match_entity.context);
		if (error == HUBBUB_OK) {
			/* Had a match - store it for later */
			ctx->match_entity.codepoint = cp;

			ctx->match_entity.length =
					ctx->match_entity.poss_length + len;
			ctx->match_entity.poss_length =
					ctx->match_entity.length;
		} else if (error == HUBBUB_INVALID) {
			/* No further matches - use last found */
			break;
		} else {
			/* Need more data */
			ctx->match_entity.poss_length += len;
		}
	}

	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
		return hubbub_error_from_parserutils_error(error);
	}

	if (ctx->match_entity.length > 0) {
		uint8_t c;
		error = parserutils_inputstream_peek(tokeniser->input,
				ctx->match_entity.offset +
						ctx->match_entity.length - 1,
				&cptr, &len);
		/* We're re-reading a character we've already read after.
		 * Therefore, there's no way that an error may occur as
		 * a result. */
		assert(error == PARSERUTILS_OK);

		c = *cptr;

		if ((tokeniser->context.match_entity.return_state ==
				STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) &&
				c != ';') {
			error = parserutils_inputstream_peek(tokeniser->input,
					ctx->match_entity.offset +
							ctx->match_entity.length,
					&cptr, &len);
			/* We must have attempted to read one more character
			 * than was present in the entity name, as that is the
			 * only way to break out of the loop above. If that
			 * failed, then any non-EOF case will have been handled
			 * by the if statement after the loop thus it cannot
			 * occur here. */
			assert(error == PARSERUTILS_OK ||
					error == PARSERUTILS_EOF);

			if (error == PARSERUTILS_EOF) {
				ctx->match_entity.codepoint = 0;
			}

			c = *cptr;
			if ((0x0030 <= c && c <= 0x0039) ||
					(0x0041 <= c && c <= 0x005A) ||
					(0x0061 <= c && c <= 0x007A)) {
				ctx->match_entity.codepoint = 0;
			}
		}
	}

	/* Flag completion */
	ctx->match_entity.complete = true;

	/* And back to the state from whence we came */
	tokeniser->state = ctx->match_entity.return_state;

	return HUBBUB_OK;
}



/*** Token emitting bits ***/

/**
 * Emit a character token.
 *
 * \param tokeniser Tokeniser instance
 * \param chars Pointer to hubbub_string to emit
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
		const hubbub_string *chars)
{
	hubbub_token token;

	token.type = HUBBUB_TOKEN_CHARACTER;
	token.data.character = *chars;

	return hubbub_tokeniser_emit_token(tokeniser, &token);
}
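/*
 * Pending characters are emitted zero-copy: the token's string points
 * directly at the input stream's buffered data (the peek at offset 0
 * below cannot fail, since context.pending bytes are known to be
 * buffered), and the stream is only advanced once the token handler
 * has returned, in hubbub_tokeniser_emit_token().
 */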
/**
 * Emit the current pending characters being stored in the tokeniser context.
 *
 * \param tokeniser Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser)
{
	hubbub_token token;
	size_t len;
	const uint8_t *cptr = NULL;
	parserutils_error error;

	/* Calling this with nothing to output is a probable bug */
	assert(tokeniser->context.pending > 0);

	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);

	assert(error == PARSERUTILS_OK);

	token.type = HUBBUB_TOKEN_CHARACTER;
	token.data.character.ptr = cptr;
	token.data.character.len = tokeniser->context.pending;

	return hubbub_tokeniser_emit_token(tokeniser, &token);
}

/**
 * Emit the current tag token being stored in the tokeniser context.
 *
 * \param tokeniser Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser)
{
	hubbub_error err;
	hubbub_token token;
	uint32_t n_attributes;
	hubbub_attribute *attrs;
	uint8_t *ptr;
	uint32_t i, j;

	/* Emit current tag */
	token.type = tokeniser->context.current_tag_type;
	token.data.tag = tokeniser->context.current_tag;
	token.data.tag.ns = HUBBUB_NS_HTML;


	n_attributes = token.data.tag.n_attributes;
	attrs = token.data.tag.attributes;

	/* Set pointers correctly... */
	ptr = tokeniser->buffer->data;
	token.data.tag.name.ptr = tokeniser->buffer->data;
	ptr += token.data.tag.name.len;

	for (i = 0; i < n_attributes; i++) {
		attrs[i].name.ptr = ptr;
		ptr += attrs[i].name.len;
		attrs[i].value.ptr = ptr;
		ptr += attrs[i].value.len;
	}


	/* Discard duplicate attributes */
	for (i = 0; i < n_attributes; i++) {
		for (j = 0; j < n_attributes; j++) {
			uint32_t move;

			if (j == i ||
					attrs[i].name.len !=
							attrs[j].name.len ||
					strncmp((char *) attrs[i].name.ptr,
							(char *) attrs[j].name.ptr,
							attrs[i].name.len) != 0) {
				/* Attributes don't match */
				continue;
			}

			assert(i < j);

			/* Calculate amount to move */
			move = (n_attributes - 1 - j) *
					sizeof(hubbub_attribute);

			if (move > 0) {
				memmove(&attrs[j], &attrs[j+1], move);
			}

			/* We've deleted an item, so we need to
			 * reprocess this index */
			j--;

			/* And reduce the number of attributes */
			n_attributes--;
		}
	}

	token.data.tag.n_attributes = n_attributes;

	err = hubbub_tokeniser_emit_token(tokeniser, &token);

	if (token.type == HUBBUB_TOKEN_START_TAG) {
		/* Save start tag name for R?CDATA */
		if (token.data.tag.name.len <
				sizeof(tokeniser->context.last_start_tag_name)) {
			strncpy((char *) tokeniser->context.last_start_tag_name,
					(const char *) token.data.tag.name.ptr,
					token.data.tag.name.len);
			tokeniser->context.last_start_tag_len =
					token.data.tag.name.len;
		} else {
			tokeniser->context.last_start_tag_name[0] = '\0';
			tokeniser->context.last_start_tag_len = 0;
		}
	} else /* if (token->type == HUBBUB_TOKEN_END_TAG) */ {
		/* Reset content model after R?CDATA elements */
		tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
	}

	/* Reset the self-closing flag */
	tokeniser->context.current_tag.self_closing = false;

	return err;
}
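/*
 * Comment and doctype tokens point into the shared collection buffer:
 * a comment is simply the whole buffer, while a doctype packs name,
 * public id and system id back to back, so emit_current_doctype()
 * (like emit_current_tag() above) recomputes the component pointers
 * from the stored lengths just before handing the token out.
 */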
/**
 * Emit the current comment token being stored in the tokeniser context.
 *
 * \param tokeniser Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser)
{
	hubbub_token token;

	token.type = HUBBUB_TOKEN_COMMENT;
	token.data.comment.ptr = tokeniser->buffer->data;
	token.data.comment.len = tokeniser->buffer->length;

	return hubbub_tokeniser_emit_token(tokeniser, &token);
}

/**
 * Emit the current doctype token being stored in the tokeniser context.
 *
 * \param tokeniser Tokeniser instance
 * \param force_quirks Force quirks mode on this document
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
		bool force_quirks)
{
	hubbub_token token;

	/* Emit doctype */
	token.type = HUBBUB_TOKEN_DOCTYPE;
	token.data.doctype = tokeniser->context.current_doctype;
	if (force_quirks == true)
		token.data.doctype.force_quirks = true;

	/* Set pointers correctly */
	token.data.doctype.name.ptr = tokeniser->buffer->data;

	if (token.data.doctype.public_missing == false) {
		token.data.doctype.public_id.ptr = tokeniser->buffer->data +
				token.data.doctype.name.len;
	}

	if (token.data.doctype.system_missing == false) {
		token.data.doctype.system_id.ptr = tokeniser->buffer->data +
				token.data.doctype.name.len +
				token.data.doctype.public_id.len;
	}

	return hubbub_tokeniser_emit_token(tokeniser, &token);
}

/**
 * Emit a token, performing sanity checks if necessary
 *
 * \param tokeniser Tokeniser instance
 * \param token Token to emit
 * \return The token handler's result, or HUBBUB_OK if no handler is set
 */
hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token)
{
	hubbub_error err = HUBBUB_OK;

	assert(tokeniser != NULL);
	assert(token != NULL);

#ifndef NDEBUG
	/* Sanity checks */
	switch (token->type) {
	case HUBBUB_TOKEN_DOCTYPE:
		assert(memchr(token->data.doctype.name.ptr, 0xff,
				token->data.doctype.name.len) == NULL);
		if (token->data.doctype.public_missing == false)
			assert(memchr(token->data.doctype.public_id.ptr, 0xff,
					token->data.doctype.public_id.len) == NULL);
		if (token->data.doctype.system_missing == false)
			assert(memchr(token->data.doctype.system_id.ptr, 0xff,
					token->data.doctype.system_id.len) == NULL);
		break;
	case HUBBUB_TOKEN_START_TAG:
	case HUBBUB_TOKEN_END_TAG:
	{
		uint32_t i;
		assert(memchr(token->data.tag.name.ptr, 0xff,
				token->data.tag.name.len) == NULL);
		for (i = 0; i < token->data.tag.n_attributes; i++) {
			hubbub_attribute *attr = &token->data.tag.attributes[i];

			assert(memchr(attr->name.ptr, 0xff, attr->name.len) ==
					NULL);
			assert(memchr(attr->value.ptr, 0xff, attr->value.len) ==
					NULL);
		}
	}
		break;
	case HUBBUB_TOKEN_COMMENT:
		assert(memchr(token->data.comment.ptr, 0xff,
				token->data.comment.len) == NULL);
		break;
	case HUBBUB_TOKEN_CHARACTER:
		assert(memchr(token->data.character.ptr, 0xff,
				token->data.character.len) == NULL);
		break;
	case HUBBUB_TOKEN_EOF:
		break;
	}
#endif

	/* Emit the token */
	if (tokeniser->token_handler) {
		err = tokeniser->token_handler(token, tokeniser->token_pw);
	}

	/* Discard current buffer */
	if (tokeniser->buffer->length) {
		parserutils_buffer_discard(tokeniser->buffer, 0,
				tokeniser->buffer->length);
	}

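	/* The stream is advanced only now that the handler has seen the
	 * token: character tokens reference the stream's window directly,
	 * so advancing any earlier would invalidate their data. */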
	/* Advance the pointer */
	if (tokeniser->context.pending) {
		parserutils_inputstream_advance(tokeniser->input,
				tokeniser->context.pending);
		tokeniser->context.pending = 0;
	}

	return err;
}