/peek-build/src/netdepends/hubbub-0.0.2/src/tokeniser/tokeniser.c

https://bitbucket.org/C0deMaver1ck/peeklinux
C | 3370 lines | 2902 code | 319 blank | 149 comment

/*
 * This file is part of Hubbub.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
 */
#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include <stdio.h>

#include <parserutils/charset/utf8.h>

#include "utils/parserutilserror.h"
#include "utils/utils.h"

#include "tokeniser/entities.h"
#include "tokeniser/tokeniser.h"

/**
 * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
 */
static const uint32_t cp1252Table[32] = {
	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
};
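
/* Illustrative sketch (not part of the build): HTML defines numeric
 * character references in the 128-159 range in terms of Windows-1252
 * rather than the C1 controls, so the (truncated) numbered-entity
 * handler would be expected to remap such codepoints via this table:
 *
 *	if (0x80 <= cp && cp <= 0x9F)
 *		cp = cp1252Table[cp - 0x80];
 */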

/**
 * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
 */
static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };


/**
 * String for when we want to emit newlines
 */
static const uint8_t lf = '\n';
static const hubbub_string lf_str = { &lf, 1 };


/**
 * Tokeniser states
 */
typedef enum hubbub_tokeniser_state {
	STATE_DATA,
	STATE_CHARACTER_REFERENCE_DATA,
	STATE_TAG_OPEN,
	STATE_CLOSE_TAG_OPEN,
	STATE_TAG_NAME,
	STATE_BEFORE_ATTRIBUTE_NAME,
	STATE_ATTRIBUTE_NAME,
	STATE_AFTER_ATTRIBUTE_NAME,
	STATE_BEFORE_ATTRIBUTE_VALUE,
	STATE_ATTRIBUTE_VALUE_DQ,
	STATE_ATTRIBUTE_VALUE_SQ,
	STATE_ATTRIBUTE_VALUE_UQ,
	STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
	STATE_AFTER_ATTRIBUTE_VALUE_Q,
	STATE_SELF_CLOSING_START_TAG,
	STATE_BOGUS_COMMENT,
	STATE_MARKUP_DECLARATION_OPEN,
	STATE_MATCH_COMMENT,
	STATE_COMMENT_START,
	STATE_COMMENT_START_DASH,
	STATE_COMMENT,
	STATE_COMMENT_END_DASH,
	STATE_COMMENT_END,
	STATE_MATCH_DOCTYPE,
	STATE_DOCTYPE,
	STATE_BEFORE_DOCTYPE_NAME,
	STATE_DOCTYPE_NAME,
	STATE_AFTER_DOCTYPE_NAME,
	STATE_MATCH_PUBLIC,
	STATE_BEFORE_DOCTYPE_PUBLIC,
	STATE_DOCTYPE_PUBLIC_DQ,
	STATE_DOCTYPE_PUBLIC_SQ,
	STATE_AFTER_DOCTYPE_PUBLIC,
	STATE_MATCH_SYSTEM,
	STATE_BEFORE_DOCTYPE_SYSTEM,
	STATE_DOCTYPE_SYSTEM_DQ,
	STATE_DOCTYPE_SYSTEM_SQ,
	STATE_AFTER_DOCTYPE_SYSTEM,
	STATE_BOGUS_DOCTYPE,
	STATE_MATCH_CDATA,
	STATE_CDATA_BLOCK,
	STATE_NUMBERED_ENTITY,
	STATE_NAMED_ENTITY
} hubbub_tokeniser_state;
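
/* These states mirror the HTML5 tokeniser states of the same names; the
 * STATE_MATCH_* entries appear to be hubbub-specific helpers that consume
 * multi-character lexemes ("<!--", "DOCTYPE", "[CDATA[", ...) one character
 * at a time, so matching can pause and resume at input buffer boundaries. */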

/**
 * Context for tokeniser
 */
typedef struct hubbub_tokeniser_context {
	size_t pending;				/**< Count of pending chars */

	hubbub_string current_comment;		/**< Current comment text */

	hubbub_token_type current_tag_type;	/**< Type of current_tag */
	hubbub_tag current_tag;			/**< Current tag */
	hubbub_doctype current_doctype;		/**< Current doctype */
	hubbub_tokeniser_state prev_state;	/**< Previous state */

	uint8_t last_start_tag_name[10];	/**< Name of the last start tag
						 * emitted */
	size_t last_start_tag_len;		/**< Length of last start tag */

	struct {
		uint32_t count;
		bool match;
	} close_tag_match;			/**< State for matching close
						 * tags */

	struct {
		uint32_t count;			/**< Index into "DOCTYPE" */
	} match_doctype;			/**< State for matching doctype */

	struct {
		uint32_t count;			/**< Index into "[CDATA[" */
		uint32_t end;			/**< Index into "]]>" */
	} match_cdata;				/**< State for matching cdata */

	struct {
		size_t offset;			/**< Offset in buffer */
		uint32_t length;		/**< Length of entity */
		uint32_t codepoint;		/**< UCS4 codepoint */
		bool complete;			/**< True if match complete */

		uint32_t poss_length;		/**< Optimistic length
						 * when matching named
						 * character references */
		uint8_t base;			/**< Base for numeric
						 * entities */
		void *context;			/**< Context for named
						 * entity search */
		size_t prev_len;		/**< Previous byte length
						 * of str */
		bool had_data;			/**< Whether we read
						 * anything after &#(x)? */
		bool overflow;			/**< Whether this entity
						 * has overflowed the maximum
						 * numeric entity value */
		hubbub_tokeniser_state return_state;	/**< State we were
							 * called from */
	} match_entity;				/**< Entity matching state */

	struct {
		uint32_t line;			/**< Current line of input */
		uint32_t col;			/**< Current character in
						 * line */
	} position;				/**< Position in source data */

	uint32_t allowed_char;			/**< Used for quote matching */
} hubbub_tokeniser_context;

/**
 * Tokeniser data structure
 */
struct hubbub_tokeniser {
	hubbub_tokeniser_state state;	/**< Current tokeniser state */
	hubbub_content_model content_model;	/**< Current content
						 * model flag */
	bool escape_flag;		/**< Escape flag */
	bool process_cdata_section;	/**< Whether to process CDATA sections */

	parserutils_inputstream *input;	/**< Input stream */
	parserutils_buffer *buffer;	/**< Input buffer */

	hubbub_tokeniser_context context;	/**< Tokeniser context */

	hubbub_token_handler token_handler;	/**< Token handling callback */
	void *token_pw;				/**< Token handler data */

	hubbub_error_handler error_handler;	/**< Error handling callback */
	void *error_pw;				/**< Error handler data */

	hubbub_allocator_fn alloc;	/**< Memory (de)allocation function */
	void *alloc_pw;			/**< Client private data */
};
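
/* Hypothetical client callback (the exact prototype lives in hubbub's
 * public headers; the shape assumed here is a token pointer plus the
 * registered private word):
 *
 *	static void my_token_handler(const hubbub_token *token, void *pw)
 *	{
 *		struct my_ctx *ctx = pw;
 *		... inspect token->type and token->data ...
 *	}
 */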

static hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_close_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_cdata(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_cdata_block(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t off);
static hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_named_entity(
		hubbub_tokeniser *tokeniser);

static inline hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
		const hubbub_string *chars);
static inline hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
		bool force_quirks);
static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token);

/**
 * Create a hubbub tokeniser
 *
 * \param input      Input stream instance
 * \param alloc      Memory (de)allocation function
 * \param pw         Pointer to client-specific private data (may be NULL)
 * \param tokeniser  Pointer to location to receive tokeniser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion
 */
hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input,
		hubbub_allocator_fn alloc, void *pw,
		hubbub_tokeniser **tokeniser)
{
	parserutils_error perror;
	hubbub_tokeniser *tok;

	if (input == NULL || alloc == NULL || tokeniser == NULL)
		return HUBBUB_BADPARM;

	tok = alloc(NULL, sizeof(hubbub_tokeniser), pw);
	if (tok == NULL)
		return HUBBUB_NOMEM;

	perror = parserutils_buffer_create(alloc, pw, &tok->buffer);
	if (perror != PARSERUTILS_OK) {
		alloc(tok, 0, pw);
		return hubbub_error_from_parserutils_error(perror);
	}

	tok->state = STATE_DATA;
	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;

	tok->escape_flag = false;
	tok->process_cdata_section = false;

	tok->input = input;

	tok->token_handler = NULL;
	tok->token_pw = NULL;

	tok->error_handler = NULL;
	tok->error_pw = NULL;

	tok->alloc = alloc;
	tok->alloc_pw = pw;

	memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));

	*tokeniser = tok;

	return HUBBUB_OK;
}
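
/* Usage sketch (hypothetical client code, not part of hubbub): the calls
 * above show the allocator is realloc-like -- alloc(NULL, size, pw)
 * allocates and alloc(ptr, 0, pw) frees -- so a minimal client might do:
 *
 *	static void *my_alloc(void *ptr, size_t size, void *pw)
 *	{
 *		(void) pw;
 *		if (size == 0) {
 *			free(ptr);
 *			return NULL;
 *		}
 *		return realloc(ptr, size);
 *	}
 *
 *	hubbub_tokeniser *tok;
 *	hubbub_error err = hubbub_tokeniser_create(stream, my_alloc,
 *			NULL, &tok);
 */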

/**
 * Destroy a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to destroy
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
{
	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->context.current_tag.attributes != NULL) {
		tokeniser->alloc(tokeniser->context.current_tag.attributes,
				0, tokeniser->alloc_pw);
	}

	parserutils_buffer_destroy(tokeniser->buffer);

	tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw);

	return HUBBUB_OK;
}

/**
 * Configure a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to configure
 * \param type       The option type to set
 * \param params     Option-specific parameters
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
		hubbub_tokeniser_opttype type,
		hubbub_tokeniser_optparams *params)
{
	if (tokeniser == NULL || params == NULL)
		return HUBBUB_BADPARM;

	switch (type) {
	case HUBBUB_TOKENISER_TOKEN_HANDLER:
		tokeniser->token_handler = params->token_handler.handler;
		tokeniser->token_pw = params->token_handler.pw;
		break;
	case HUBBUB_TOKENISER_ERROR_HANDLER:
		tokeniser->error_handler = params->error_handler.handler;
		tokeniser->error_pw = params->error_handler.pw;
		break;
	case HUBBUB_TOKENISER_CONTENT_MODEL:
		tokeniser->content_model = params->content_model.model;
		break;
	case HUBBUB_TOKENISER_PROCESS_CDATA:
		tokeniser->process_cdata_section = params->process_cdata;
		break;
	}

	return HUBBUB_OK;
}
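
/* Usage sketch (hypothetical, continuing the example above): registering
 * the token handler uses the option parameter fields referenced in the
 * switch above:
 *
 *	hubbub_tokeniser_optparams params;
 *	params.token_handler.handler = my_token_handler;
 *	params.token_handler.pw = my_ctx;
 *	hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER,
 *			&params);
 */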

/**
 * Process remaining data in the input stream
 *
 * \param tokeniser  The tokeniser instance to invoke
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
{
	hubbub_error cont = HUBBUB_OK;

	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

#if 0
#define state(x) \
		case x: \
			printf( #x "\n");
#else
#define state(x) \
		case x:
#endif

	while (cont == HUBBUB_OK) {
		switch (tokeniser->state) {
		state(STATE_DATA)
			cont = hubbub_tokeniser_handle_data(tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_DATA)
			cont = hubbub_tokeniser_handle_character_reference_data(
					tokeniser);
			break;
		state(STATE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_tag_open(tokeniser);
			break;
		state(STATE_CLOSE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_close_tag_open(
					tokeniser);
			break;
		state(STATE_TAG_NAME)
			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_before_attribute_name(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_attribute_name(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_after_attribute_name(
					tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_before_attribute_value(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_DQ)
			cont = hubbub_tokeniser_handle_attribute_value_dq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_SQ)
			cont = hubbub_tokeniser_handle_attribute_value_sq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_UQ)
			cont = hubbub_tokeniser_handle_attribute_value_uq(
					tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_character_reference_in_attribute_value(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_VALUE_Q)
			cont = hubbub_tokeniser_handle_after_attribute_value_q(
					tokeniser);
			break;
		state(STATE_SELF_CLOSING_START_TAG)
			cont = hubbub_tokeniser_handle_self_closing_start_tag(
					tokeniser);
			break;
		state(STATE_BOGUS_COMMENT)
			cont = hubbub_tokeniser_handle_bogus_comment(
					tokeniser);
			break;
		state(STATE_MARKUP_DECLARATION_OPEN)
			cont = hubbub_tokeniser_handle_markup_declaration_open(
					tokeniser);
			break;
		state(STATE_MATCH_COMMENT)
			cont = hubbub_tokeniser_handle_match_comment(
					tokeniser);
			break;
		case STATE_COMMENT_START:
		case STATE_COMMENT_START_DASH:
		case STATE_COMMENT:
		case STATE_COMMENT_END_DASH:
		case STATE_COMMENT_END:
			cont = hubbub_tokeniser_handle_comment(tokeniser);
			break;
		state(STATE_MATCH_DOCTYPE)
			cont = hubbub_tokeniser_handle_match_doctype(
					tokeniser);
			break;
		state(STATE_DOCTYPE)
			cont = hubbub_tokeniser_handle_doctype(tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_before_doctype_name(
					tokeniser);
			break;
		state(STATE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_doctype_name(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_after_doctype_name(
					tokeniser);
			break;

		state(STATE_MATCH_PUBLIC)
			cont = hubbub_tokeniser_handle_match_public(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_before_doctype_public(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_DQ)
			cont = hubbub_tokeniser_handle_doctype_public_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_SQ)
			cont = hubbub_tokeniser_handle_doctype_public_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_after_doctype_public(
					tokeniser);
			break;
		state(STATE_MATCH_SYSTEM)
			cont = hubbub_tokeniser_handle_match_system(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_before_doctype_system(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_DQ)
			cont = hubbub_tokeniser_handle_doctype_system_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_SQ)
			cont = hubbub_tokeniser_handle_doctype_system_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_after_doctype_system(
					tokeniser);
			break;
		state(STATE_BOGUS_DOCTYPE)
			cont = hubbub_tokeniser_handle_bogus_doctype(
					tokeniser);
			break;
		state(STATE_MATCH_CDATA)
			cont = hubbub_tokeniser_handle_match_cdata(
					tokeniser);
			break;
		state(STATE_CDATA_BLOCK)
			cont = hubbub_tokeniser_handle_cdata_block(
					tokeniser);
			break;
		state(STATE_NUMBERED_ENTITY)
			cont = hubbub_tokeniser_handle_numbered_entity(
					tokeniser);
			break;
		state(STATE_NAMED_ENTITY)
			cont = hubbub_tokeniser_handle_named_entity(
					tokeniser);
			break;
		}
	}

	return (cont == HUBBUB_NEEDDATA) ? HUBBUB_OK : cont;
}
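
/* Note: HUBBUB_NEEDDATA is mapped to HUBBUB_OK above, so when run()
 * returns the client simply appends more bytes to the input stream and
 * calls hubbub_tokeniser_run() again; the pending count and match_*
 * context let the handlers resume mid-construct. */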


/**
 * Various macros for manipulating buffers.
 *
 * \todo make some of these inline functions (type-safety)
 * \todo document them properly here
 */

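/* START_BUF(str, cptr, length): start collecting a string -- append
 * length bytes at cptr to tokeniser->buffer and set str.len to length.
 * The string's data lives in the tokeniser's buffer, not at cptr. */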
#define START_BUF(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len = (length); \
	} while (0)

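/* COLLECT(str, cptr, length): append length bytes at cptr to a string
 * already started with START_BUF (asserts it is non-empty) and grow
 * str.len accordingly. */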
#define COLLECT(str, cptr, length) \
	do { \
		parserutils_error perror; \
		assert(str.len != 0); \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

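/* COLLECT_MS(str, cptr, length): as COLLECT, but without the non-empty
 * assertion, so it may also start an empty string (presumably
 * "maybe start"). */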
#define COLLECT_MS(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)


/* this should always be called with an empty "chars" buffer */
hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
{
	parserutils_error error;
	hubbub_token token;
	const uint8_t *cptr;
	size_t len;

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len)) ==
					PARSERUTILS_OK) {
		const uint8_t c = *cptr;

		if (c == '&' &&
				(tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA ||
				tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA) &&
				tokeniser->escape_flag == false) {
			tokeniser->state =
					STATE_CHARACTER_REFERENCE_DATA;
			/* Don't eat the '&'; it'll be handled by entity
			 * consumption */
			break;
		} else if (c == '-' &&
				tokeniser->escape_flag == false &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->context.pending >= 3) {
			size_t ignore;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 3,
					&cptr,
					&ignore);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *)cptr,
					"<!--", SLEN("<!--")) == 0) {
				tokeniser->escape_flag = true;
			}

			tokeniser->context.pending += len;
		} else if (c == '<' && (tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_PCDATA ||
					((tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
					tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->escape_flag == false))) {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Buffer '<' */
			tokeniser->context.pending = len;
			tokeniser->state = STATE_TAG_OPEN;
			break;
		} else if (c == '>' && tokeniser->escape_flag == true &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA)) {
			/* no need to check that there are enough characters,
			 * since you can only run into this if the flag is
			 * true in the first place, which requires four
			 * characters. */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 2,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr, "-->", SLEN("-->")) == 0) {
				tokeniser->escape_flag = false;
			}

			tokeniser->context.pending += len;
		} else if (c == '\0') {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Emit a replacement character */
			emit_character_token(tokeniser, &u_fffd_str);

			/* Advance past NUL */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else if (c == '\r') {
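			/* Normalise line endings: a lone CR (or CR at
			 * EOF) is emitted as LF; for CRLF the CR is
			 * swallowed here and the LF is handled on a
			 * later iteration. */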
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				break;
			}

			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			if (error == PARSERUTILS_EOF || *cptr != '\n') {
				/* Emit newline */
				emit_character_token(tokeniser, &lf_str);
			}

			/* Advance over */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else {
			/* Just collect into buffer */
			tokeniser->context.pending += len;
		}
	}

	if (tokeniser->state != STATE_TAG_OPEN &&
		(tokeniser->state != STATE_DATA || error == PARSERUTILS_EOF) &&
			tokeniser->context.pending > 0) {
		/* Emit any pending characters */
		emit_current_chars(tokeniser);
	}

	if (error == PARSERUTILS_EOF) {
		token.type = HUBBUB_TOKEN_EOF;
		hubbub_tokeniser_emit_token(tokeniser, &token);

		return HUBBUB_NEEDDATA;
	}

	return hubbub_error_from_parserutils_error(error);
}

/* emit any pending tokens before calling */
hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser)
{
	assert(tokeniser->context.pending == 0);

	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_token token;

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		token.type = HUBBUB_TOKEN_CHARACTER;

		if (tokeniser->context.match_entity.codepoint) {
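			/* parserutils_charset_utf8_from_ucs4() treats len
			 * as space available on entry and space remaining
			 * on exit, so the encoded length below is
			 * sizeof(utf8) - len. */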
			parserutils_charset_utf8_from_ucs4(
				tokeniser->context.match_entity.codepoint,
				&utf8ptr, &len);

			token.data.character.ptr = utf8;
			token.data.character.len = sizeof(utf8) - len;

			hubbub_tokeniser_emit_token(tokeniser, &token);

			/* +1 for ampersand */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.match_entity.length
							+ 1);
		} else {
			parserutils_error error;
			const uint8_t *cptr = NULL;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			token.data.character.ptr = cptr;
			token.data.character.len = len;

			hubbub_tokeniser_emit_token(tokeniser, &token);
			parserutils_inputstream_advance(tokeniser->input, len);
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		tokeniser->state = STATE_DATA;
	}

	return HUBBUB_OK;
}

/* this state always switches to another state straight away */
/* this state expects the current character to be '<' */
hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 1);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '/') {
		tokeniser->context.pending += len;

		tokeniser->context.close_tag_match.match = false;
		tokeniser->context.close_tag_match.count = 0;

		tokeniser->state = STATE_CLOSE_TAG_OPEN;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		/* Return to data state with '<' still in "chars" */
		tokeniser->state = STATE_DATA;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
		if (c == '!') {
			parserutils_inputstream_advance(tokeniser->input,
					SLEN("<!"));

			tokeniser->context.pending = 0;
			tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
		} else if ('A' <= c && c <= 'Z') {
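			/* ASCII-lowercase the tag name's first character:
			 * 'A'..'Z' + 0x20 maps onto 'a'..'z'. */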
			uint8_t lc = (c + 0x20);

			START_BUF(ctag->name, &lc, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(ctag->name, cptr, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/** \todo parse error */

			tokeniser->context.pending += len;
			tokeniser->state = STATE_DATA;
		} else if (c == '?') {
			/** \todo parse error */

			/* Cursor still at "<", need to advance past it */
			parserutils_inputstream_advance(
					tokeniser->input, SLEN("<"));
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		} else {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.chars to be "</" */
/* this state never stays in this state for more than one character */
hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 2);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
/*	assert(tokeniser->context.chars.ptr[1] == '/'); */

	/**\todo fragment case */

	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		uint8_t *start_tag_name =
			tokeniser->context.last_start_tag_name;
		size_t start_tag_len =
			tokeniser->context.last_start_tag_len;

		while ((error = parserutils_inputstream_peek(tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len)) == PARSERUTILS_OK) {
			c = *cptr;

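			/* Case-insensitive ASCII compare: clearing bit 5
			 * (& ~0x20) folds 'a'..'z' onto 'A'..'Z', e.g.
			 * 'a' (0x61) -> 'A' (0x41). */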
			if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
					!= (c & ~0x20)) {
				break;
			}

			ctx->close_tag_match.count += len;

			if (ctx->close_tag_match.count == start_tag_len) {
				ctx->close_tag_match.match = true;
				break;
			}
		}

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (ctx->close_tag_match.match == true) {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error != PARSERUTILS_EOF) {
				c = *cptr;

				if (c != '\t' && c != '\n' && c != '\f' &&
						c != ' ' && c != '>' &&
						c != '/') {
					ctx->close_tag_match.match = false;
				}
			}
		}
	}

	if (ctx->close_tag_match.match == false &&
			tokeniser->content_model !=
					HUBBUB_CONTENT_MODEL_PCDATA) {
		/* We should emit "</" here, but instead we leave it in the
		 * buffer so the data state emits it with any characters
		 * following it */
		tokeniser->state = STATE_DATA;
	} else {
		error = parserutils_inputstream_peek(tokeniser->input,
				tokeniser->context.pending, &cptr, &len);

		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */

			/* Return to data state with "</" pending */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else if (error != PARSERUTILS_OK) {
			return hubbub_error_from_parserutils_error(error);
		}

		c = *cptr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(tokeniser->context.current_tag.name,
					&lc, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(tokeniser->context.current_tag.name,
					cptr, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/* Cursor still at "</", need to collect ">" */
			tokeniser->context.pending += len;

			/* Now need to advance past "</>" */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			/** \todo parse error */
			tokeniser->state = STATE_DATA;
		} else {
			/** \todo parse error */

			/* Cursor still at "</", need to advance past it */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.current_tag to already have its
   first character set */
hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending > 0);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
	assert(ctag->name.len > 0);
/*	assert(ctag->name.ptr); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
		tokeniser->context.pending += len;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0);

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				&lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'') {
			/** \todo parse error */
		}

		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

/* this state is only ever triggered by an '=' */
hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ;
	} else if (c == '&') {
		/* Don't consume the '&' -- reprocess in UQ state */
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ;
	} else if (c == '>') {
		/** \todo parse error */
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else {
		if (c == '=') {
			/** \todo parse error */
		}

		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '"';
		/* Don't eat the '&'; it'll be handled by entity consumption */
1406	} else if (c == '\0') {
1407		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
1408				u_fffd, sizeof(u_fffd));
1409		tokeniser->context.pending += len;
1410	} else if (c == '\r') {
1411		error = parserutils_inputstream_peek(
1412				tokeniser->input,
1413				tokeniser->context.pending + len,
1414				&cptr,
1415				&len);
1416
1417		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
1418			return hubbub_error_from_parserutils_error(error);
1419		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
1420			COLLECT_MS(ctag->attributes[
1421					ctag->n_attributes - 1].value,
1422					&lf, sizeof(lf));
1423		}
1424
1425		/* Consume '\r' */
1426		tokeniser->context.pending += 1;
1427	} else {
1428		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
1429				cptr, len);
1430		tokeniser->context.pending += len;
1431	}
1432
1433	return HUBBUB_OK;
1434}
1435
1436hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
1437		hubbub_tokeniser *tokeniser)
1438{
1439	hubbub_tag *ctag = &tokeniser->context.current_tag;
1440
1441	size_t len;
1442	const uint8_t *cptr;
1443	parserutils_error error;
1444	uint8_t c;
1445
1446	error = parserutils_inputstream_peek(tokeniser->input, 
1447			tokeniser->context.pending, &cptr, &len);
1448
1449	if (error != PARSERUTILS_OK) {
1450		if (error == PARSERUTILS_EOF) {
1451			tokeniser->state = STATE_DATA;
1452			return emit_current_tag(tokeniser);
1453		} else {
1454			return hubbub_error_from_parserutils_error(error);
1455		}
1456	}
1457
1458	c = *cptr;
1459
1460	if (c == '\'') {
1461		tokeniser->context.pending += len;
1462		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
1463	} else if (c == '&') {
1464		tokeniser->context.prev_state = tokeniser->state;
1465		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
1466		tokeniser->context.allowed_char = '\'';
1467		/* Don't eat the '&'; it'll be handled by entity consumption */
1468	} else if (c == '\0') {
1469		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
1470				u_fffd, sizeof(u_fffd));
1471		tokeniser->context.pending += len;
1472	} else if (c == '\r') {
1473		error = parserutils_inputstream_peek(
1474				tokeniser->input,
1475				tokeniser->context.pending + len,
1476				&cptr,
1477				&len);
1478
1479		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
1480			return hubbub_error_from_parserutils_error(error);
1481		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
1482			COLLECT_MS(ctag->attributes[
1483					ctag->n_attributes - 1].value,
1484					&lf, sizeof(lf));
1485		}
1486
1487		/* Consume \r */
1488		tokeniser->context.pending += 1;
1489	} else {
1490		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
1491				cptr, len);
1492		tokeniser->context.pending += len;
1493	}
1494
1495	return HUBBUB_OK;
1496}
1497
1498hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
1499		hubbub_tokeniser *tokeniser)
1500{
1501	hubbub_tag *ctag = &tokeniser->context.current_tag;
1502	uint8_t c;
1503
1504	size_t len;
1505	const uint8_t *cptr;
1506	parserutils_error error;
1507
1508	error = parserutils_inputstream_peek(tokeniser->input, 
1509			tokeniser->context.pending, &cptr, &len);
1510
1511	if (error != PARSERUTILS_OK) {
1512		if (error == PARSERUTILS_EOF) {
1513			tokeniser->state = STATE_DATA;
1514			return emit_current_tag(tokeniser);
1515		} else {
1516			return hubbub_error_from_parserutils_error(error);
1517		}
1518	}
1519
1520	c = *cptr;
1521
1522	assert(c == '&' ||
1523		ctag->attributes[ctag->n_attributes - 1].value.len >= 1);
1524
1525	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
1526		tokeniser->context.pending += len;
1527		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
1528	} else if (c == '&') {
1529		tokeniser->context.prev_state = tokeniser->state;
1530		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
1531		/* Don't eat the '&'; it'll be handled by entity consumption */
1532	} else if (c == '>') {
1533		tokeniser->context.pending += len;
1534		tokeniser->state = STATE_DATA;
1535		return emit_current_tag(tokeniser);
1536	} else if (c == '\0') {
1537		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
1538				u_fffd, sizeof(u_fffd));
1539		tokeniser->context.pending += len;
1540	} else {
1541		if (c == '"' || c == '\'' || c == '=') {
1542			/** \todo parse error */
1543		}
1544
1545		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
1546				cptr, len);
1547		tokeniser->context.pending += len;
1548	}
1549
1550	return HUBBUB_OK;
1551}
1552
1553hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
1554		hubbub_tokeniser *tokeniser)
1555{
1556	if (tokeniser->context.match_entity.complete == false) {
1557		return hubbub_tokeniser_consume_character_reference(tokeniser,
1558				tokeniser->context.pending);
1559	} else {
1560		hubbub_tag *ctag = &tokeniser->context.current_tag;
1561		hubbub_attribute *attr = &ctag->attributes[
1562				ctag->n_attributes - 1];
1563
1564		uint8_t utf8[6];
1565		uint8_t *utf8ptr = utf8;
1566		size_t len = sizeof(utf8);
1567
1568		if (tokeniser->context.match_entity.codepoint) {
1569			parserutils_charset_utf8_from_ucs4(
1570				tokeniser->context.match_entity.codepoint,
1571				&utf8ptr, &len);
1572
1573			COLLECT_MS(attr->value, utf8, sizeof(utf8) - len);
1574
1575			/* +1 for the ampersand */
1576			tokeniser->context.pending +=
1577					tokeniser->context.match_entity.length
1578					+ 1;
1579		} else {
1580			size_t len = 0;
1581			const uint8_t *cptr = NULL;
1582			parserutils_error error;
1583
1584			error = parserutils_inputstream_peek(
1585					tokeniser->input,
1586					tokeniser->context.pending, 
1587					&cptr,
1588					&len);
1589
1590			assert(error == PARSERUTILS_OK);
1591
1592			/* Insert the ampersand */
1593			COLLECT_MS(attr->value, cptr, len);
1594			tokeniser->context.pending += len;
1595		}
1596
1597		/* Reset for next time */
1598		tokeniser->context.match_entity.complete = false;
1599
1600		/* And back to the previous state */
1601		tokeniser->state = tokeniser->context.prev_state;
1602	}
1603
1604	return HUBBUB_OK;
1605}
1606
1607/* always switches state */
1608hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
1609		hubbub_tokeniser *tokeniser)
1610{
1611	size_t len;
1612	const uint8_t *cptr;
1613	parserutils_error error;
1614	uint8_t c;
1615
1616	error = parserutils_inputstream_peek(tokeniser->input, 
1617			tokeniser->context.pending, &cptr, &len);
1618
1619	if (error != PARSERUTILS_OK) {
1620		if (error == PARSERUTILS_EOF) {
1621			tokeniser->state = STATE_DATA;
1622			return emit_current_tag(tokeniser);
1623		} else {
1624			return hubbub_error_from_parserutils_error(error);
1625		}
1626	}
1627
1628	c = *cptr;
1629
1630	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
1631		tokeniser->context.pending += len;
1632		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
1633	} else if (c == '>') {
1634		tokeniser->context.pending += len;
1635
1636		tokeniser->state = STATE_DATA;
1637		return emit_current_tag(tokeniser);
1638	} else if (c == '/') {
1639		tokeniser->context.pending += len;
1640		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
1641	} else {
1642		/** \todo parse error */
1643		/* Reprocess character in before attribute name state */
1644		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
1645	}
1646
1647	return HUBBUB_OK;
1648}
1649
1650hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
1651		hubbub_tokeniser *tokeniser)
1652{
1653	size_t len;
1654	const uint8_t *cptr;
1655	parserutils_error error;
1656	uint8_t c;
1657
1658	error = parserutils_inputstream_peek(tokeniser->input,
1659			tokeniser->context.pending, &cptr, &len);
1660
1661	if (error != PARSERUTILS_OK) {
1662		if (error == PARSERUTILS_EOF) {
1663			tokeniser->state = STATE_DATA;
1664			return emit_current_tag(tokeniser);
1665		} else {
1666			return hubbub_error_from_parserutils_error(error);
1667		}
1668	}
1669
1670	c = *cptr;
1671
1672	if (c == '>') {
1673		tokeniser->context.pending += len;
1674		tokeniser->state = STATE_DATA;
1675
1676		tokeniser->context.current_tag.self_closing = true;
1677		return emit_current_tag(tokeniser);
1678	} else {
1679		/* Reprocess character in before attribute name state */
1680		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
1681	}
1682
1683	return HUBBUB_OK;
1684}
1685
1686/* this state expects tokeniser->context.chars to be empty on first entry */
1687hubbub_error hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
1688{
1689	size_t len;
1690	const uint8_t *cptr;
1691	parserutils_error error;
1692	uint8_t c;
1693
1694	error = parserutils_inputstream_peek(tokeniser->input,
1695			tokeniser->context.pending, &cptr, &len);
1696
1697	if (error != PARSERUTILS_OK) {
1698		if (error == PARSERUTILS_EOF) {
1699			tokeniser->state = STATE_DATA;
1700			return emit_current_comment(tokeniser);
1701		} else {
1702			return hubbub_error_from_parserutils_error(error);
1703		}
1704	}
1705
1706	c = *cptr;
1707
1708	if (c == '>') {
1709		tokeniser->context.pending += len;
1710		tokeniser->state = STATE_DATA;
1711		return emit_current_comment(tokeniser);
1712	} else if (c == '\0') {
1713		error = parserutils_buffer_append(tokeniser->buffer,
1714				u_fffd, sizeof(u_fffd));
1715		if (error != PARSERUTILS_OK)
1716			return hubbub_error_from_parserutils_error(error);
1717
1718		tokeniser->context.pending += len;
1719	} else if (c == '\r') {
1720		error = parserutils_inputstream_peek(
1721				tokeniser->input,
1722				tokeniser->context.pending,
1723				&cptr,
1724				&len);
1725
1726		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
1727			return hubbub_error_from_parserutils_error(error);
1728		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
1729			error = parserutils_buffer_append(tokeniser->buffer,
1730					&lf, sizeof(lf));
1731			if (error != PARSERUTILS_OK) {
1732				return hubbub_error_from_parserutils_error(
1733						error);
1734			}
1735		}
1736		tokeniser->context.pending += len;
1737	} else {
1738		error = parserutils_buffer_append(tokeniser->buffer,
1739				(uint8_t *) cptr, len);
1740		if (error != PARSERUTILS_OK)
1741			return hubbub_error_from_parserutils_error(error);
1742
1743		tokeniser->context.pending += len;
1744	}
1745
1746	return HUBBUB_OK;
1747}
1748
1749/* this state always switches to another state straight away */
1750hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
1751		hubbub_tokeniser *tokeniser)
1752{
1753	size_t len;
1754	const uint8_t *cptr;
1755	parserutils_error error;
1756	uint8_t c;
1757
1758	assert(tokeniser->context.pending == 0);
1759
1760	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);
1761
1762	if (error != PARSERUTILS_OK) {
1763		if (error == PARSERUTILS_EOF) {
1764			tokeniser->state = STATE_BOGUS_COMMENT;
1765			return HUBBUB_OK;
1766		} else {
1767			return hubbub_error_from_parserutils_error(error);
1768		}
1769	}
1770
1771	c = *cptr;
1772
1773	if (c == '-') {
1774		tokeniser->context.pending = len;
1775		tokeniser->state = STATE_MATCH_COMMENT;
1776	} else if ((c & ~0x20) == 'D') {
1777		tokeniser->context.pending = len;
1778		tokeniser->context.match_doctype.count = len;
1779		tokeniser->state = STATE_MATCH_DOCTYPE;
1780	} else if (tokeniser->process_cdata_section == true && c == '[') {
1781		tokeniser->context.pending = len;
1782		tokeniser->context.match_cdata.count = len;
1783		tokeniser->state = STATE_MATCH_CDATA;
1784	} else {
1785		tokeniser->state = STATE_BOGUS_COMMENT;
1786	}
1787
1788	return HUBBUB_OK;
1789}
1790
1791
1792hubbub_error hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser)
1793{
1794	size_t len;
1795	const uint8_t *cptr;
1796	parserutils_error error;
1797
1798	error = parserutils_inputstream_peek(tokeniser->input, 
1799			tokeniser->context.pending, &cptr, &len);
1800
1801	if (error != PARSERUTILS_OK) {
1802		if (error == PARSERUTI

[The viewer truncates large files at this point; the remainder of tokeniser.c (3370 lines in total) is not shown.]