
/peek-build/src/netdepends/hubbub-0.0.2/src/tokeniser/tokeniser.c

https://bitbucket.org/C0deMaver1ck/peeklinux
/*
 * This file is part of Hubbub.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
 */
#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include <stdio.h>

#include <parserutils/charset/utf8.h>

#include "utils/parserutilserror.h"
#include "utils/utils.h"

#include "tokeniser/entities.h"
#include "tokeniser/tokeniser.h"

/**
 * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
 */
static const uint32_t cp1252Table[32] = {
	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
};
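/*
 * For example, byte 0x80 (table index 0) maps to U+20AC EURO SIGN, while
 * bytes with no Windows-1252 mapping (e.g. 0x81, table index 1) map to
 * U+FFFD REPLACEMENT CHARACTER.
 */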

/**
 * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
 */
static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };


/**
 * String for when we want to emit newlines
 */
static const uint8_t lf = '\n';
static const hubbub_string lf_str = { &lf, 1 };


/**
 * Tokeniser states
 */
typedef enum hubbub_tokeniser_state {
	STATE_DATA,
	STATE_CHARACTER_REFERENCE_DATA,
	STATE_TAG_OPEN,
	STATE_CLOSE_TAG_OPEN,
	STATE_TAG_NAME,
	STATE_BEFORE_ATTRIBUTE_NAME,
	STATE_ATTRIBUTE_NAME,
	STATE_AFTER_ATTRIBUTE_NAME,
	STATE_BEFORE_ATTRIBUTE_VALUE,
	STATE_ATTRIBUTE_VALUE_DQ,
	STATE_ATTRIBUTE_VALUE_SQ,
	STATE_ATTRIBUTE_VALUE_UQ,
	STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
	STATE_AFTER_ATTRIBUTE_VALUE_Q,
	STATE_SELF_CLOSING_START_TAG,
	STATE_BOGUS_COMMENT,
	STATE_MARKUP_DECLARATION_OPEN,
	STATE_MATCH_COMMENT,
	STATE_COMMENT_START,
	STATE_COMMENT_START_DASH,
	STATE_COMMENT,
	STATE_COMMENT_END_DASH,
	STATE_COMMENT_END,
	STATE_MATCH_DOCTYPE,
	STATE_DOCTYPE,
	STATE_BEFORE_DOCTYPE_NAME,
	STATE_DOCTYPE_NAME,
	STATE_AFTER_DOCTYPE_NAME,
	STATE_MATCH_PUBLIC,
	STATE_BEFORE_DOCTYPE_PUBLIC,
	STATE_DOCTYPE_PUBLIC_DQ,
	STATE_DOCTYPE_PUBLIC_SQ,
	STATE_AFTER_DOCTYPE_PUBLIC,
	STATE_MATCH_SYSTEM,
	STATE_BEFORE_DOCTYPE_SYSTEM,
	STATE_DOCTYPE_SYSTEM_DQ,
	STATE_DOCTYPE_SYSTEM_SQ,
	STATE_AFTER_DOCTYPE_SYSTEM,
	STATE_BOGUS_DOCTYPE,
	STATE_MATCH_CDATA,
	STATE_CDATA_BLOCK,
	STATE_NUMBERED_ENTITY,
	STATE_NAMED_ENTITY
} hubbub_tokeniser_state;

/**
 * Context for tokeniser
 */
typedef struct hubbub_tokeniser_context {
	size_t pending;				/**< Count of pending chars */

	hubbub_string current_comment;		/**< Current comment text */

	hubbub_token_type current_tag_type;	/**< Type of current_tag */
	hubbub_tag current_tag;			/**< Current tag */
	hubbub_doctype current_doctype;		/**< Current doctype */
	hubbub_tokeniser_state prev_state;	/**< Previous state */

	uint8_t last_start_tag_name[10];	/**< Name of the last start tag
						 * emitted */
	size_t last_start_tag_len;		/**< Length of last start tag */

	struct {
		uint32_t count;
		bool match;
	} close_tag_match;			/**< State for matching close
						 * tags */

	struct {
		uint32_t count;			/**< Index into "DOCTYPE" */
	} match_doctype;			/**< State for matching doctype */

	struct {
		uint32_t count;			/**< Index into "[CDATA[" */
		uint32_t end;			/**< Index into "]]>" */
	} match_cdata;				/**< State for matching cdata */

	struct {
		size_t offset;			/**< Offset in buffer */
		uint32_t length;		/**< Length of entity */
		uint32_t codepoint;		/**< UCS4 codepoint */
		bool complete;			/**< True if match complete */

		uint32_t poss_length;		/**< Optimistic length
						 * when matching named
						 * character references */
		uint8_t base;			/**< Base for numeric
						 * entities */
		void *context;			/**< Context for named
						 * entity search */
		size_t prev_len;		/**< Previous byte length
						 * of str */
		bool had_data;			/**< Whether we read
						 * anything after &#(x)? */
		bool overflow;			/**< Whether this entity
						 * has overflowed the maximum
						 * numeric entity value */
		hubbub_tokeniser_state return_state;	/**< State we were
							 * called from */
	} match_entity;				/**< Entity matching state */

	struct {
		uint32_t line;			/**< Current line of input */
		uint32_t col;			/**< Current character in
						 * line */
	} position;				/**< Position in source data */

	uint32_t allowed_char;			/**< Used for quote matching */
} hubbub_tokeniser_context;

/**
 * Tokeniser data structure
 */
struct hubbub_tokeniser {
	hubbub_tokeniser_state state;	/**< Current tokeniser state */
	hubbub_content_model content_model;	/**< Current content
						 * model flag */
	bool escape_flag;		/**< Escape flag */
	bool process_cdata_section;	/**< Whether to process CDATA sections */

	parserutils_inputstream *input;	/**< Input stream */
	parserutils_buffer *buffer;	/**< Input buffer */

	hubbub_tokeniser_context context;	/**< Tokeniser context */

	hubbub_token_handler token_handler;	/**< Token handling callback */
	void *token_pw;				/**< Token handler data */

	hubbub_error_handler error_handler;	/**< Error handling callback */
	void *error_pw;				/**< Error handler data */

	hubbub_allocator_fn alloc;	/**< Memory (de)allocation function */
	void *alloc_pw;			/**< Client private data */
};

static hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_close_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_cdata(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_cdata_block(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t off);
static hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_named_entity(
		hubbub_tokeniser *tokeniser);

static inline hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
		const hubbub_string *chars);
static inline hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
		bool force_quirks);
static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token);

/**
 * Create a hubbub tokeniser
 *
 * \param input      Input stream instance
 * \param alloc      Memory (de)allocation function
 * \param pw         Pointer to client-specific private data (may be NULL)
 * \param tokeniser  Pointer to location to receive tokeniser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion
 */
hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input,
		hubbub_allocator_fn alloc, void *pw,
		hubbub_tokeniser **tokeniser)
{
	parserutils_error perror;
	hubbub_tokeniser *tok;

	if (input == NULL || alloc == NULL || tokeniser == NULL)
		return HUBBUB_BADPARM;

	tok = alloc(NULL, sizeof(hubbub_tokeniser), pw);
	if (tok == NULL)
		return HUBBUB_NOMEM;

	perror = parserutils_buffer_create(alloc, pw, &tok->buffer);
	if (perror != PARSERUTILS_OK) {
		alloc(tok, 0, pw);
		return hubbub_error_from_parserutils_error(perror);
	}

	tok->state = STATE_DATA;
	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;

	tok->escape_flag = false;
	tok->process_cdata_section = false;

	tok->input = input;

	tok->token_handler = NULL;
	tok->token_pw = NULL;

	tok->error_handler = NULL;
	tok->error_pw = NULL;

	tok->alloc = alloc;
	tok->alloc_pw = pw;

	memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));

	*tokeniser = tok;

	return HUBBUB_OK;
}

/**
 * Destroy a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to destroy
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
{
	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->context.current_tag.attributes != NULL) {
		tokeniser->alloc(tokeniser->context.current_tag.attributes,
				0, tokeniser->alloc_pw);
	}

	parserutils_buffer_destroy(tokeniser->buffer);

	tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw);

	return HUBBUB_OK;
}

/**
 * Configure a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to configure
 * \param type       The option type to set
 * \param params     Option-specific parameters
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
		hubbub_tokeniser_opttype type,
		hubbub_tokeniser_optparams *params)
{
	if (tokeniser == NULL || params == NULL)
		return HUBBUB_BADPARM;

	switch (type) {
	case HUBBUB_TOKENISER_TOKEN_HANDLER:
		tokeniser->token_handler = params->token_handler.handler;
		tokeniser->token_pw = params->token_handler.pw;
		break;
	case HUBBUB_TOKENISER_ERROR_HANDLER:
		tokeniser->error_handler = params->error_handler.handler;
		tokeniser->error_pw = params->error_handler.pw;
		break;
	case HUBBUB_TOKENISER_CONTENT_MODEL:
		tokeniser->content_model = params->content_model.model;
		break;
	case HUBBUB_TOKENISER_PROCESS_CDATA:
		tokeniser->process_cdata_section = params->process_cdata;
		break;
	}

	return HUBBUB_OK;
}

/**
 * Process remaining data in the input stream
 *
 * \param tokeniser  The tokeniser instance to invoke
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
{
	hubbub_error cont = HUBBUB_OK;

	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

#if 0
#define state(x) \
		case x: \
			printf( #x "\n");
#else
#define state(x) \
		case x:
#endif

	while (cont == HUBBUB_OK) {
		switch (tokeniser->state) {
		state(STATE_DATA)
			cont = hubbub_tokeniser_handle_data(tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_DATA)
			cont = hubbub_tokeniser_handle_character_reference_data(
					tokeniser);
			break;
		state(STATE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_tag_open(tokeniser);
			break;
		state(STATE_CLOSE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_close_tag_open(
					tokeniser);
			break;
		state(STATE_TAG_NAME)
			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_before_attribute_name(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_attribute_name(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_after_attribute_name(
					tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_before_attribute_value(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_DQ)
			cont = hubbub_tokeniser_handle_attribute_value_dq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_SQ)
			cont = hubbub_tokeniser_handle_attribute_value_sq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_UQ)
			cont = hubbub_tokeniser_handle_attribute_value_uq(
					tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_character_reference_in_attribute_value(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_VALUE_Q)
			cont = hubbub_tokeniser_handle_after_attribute_value_q(
					tokeniser);
			break;
		state(STATE_SELF_CLOSING_START_TAG)
			cont = hubbub_tokeniser_handle_self_closing_start_tag(
					tokeniser);
			break;
		state(STATE_BOGUS_COMMENT)
			cont = hubbub_tokeniser_handle_bogus_comment(
					tokeniser);
			break;
		state(STATE_MARKUP_DECLARATION_OPEN)
			cont = hubbub_tokeniser_handle_markup_declaration_open(
					tokeniser);
			break;
		state(STATE_MATCH_COMMENT)
			cont = hubbub_tokeniser_handle_match_comment(
					tokeniser);
			break;
		case STATE_COMMENT_START:
		case STATE_COMMENT_START_DASH:
		case STATE_COMMENT:
		case STATE_COMMENT_END_DASH:
		case STATE_COMMENT_END:
			cont = hubbub_tokeniser_handle_comment(tokeniser);
			break;
		state(STATE_MATCH_DOCTYPE)
			cont = hubbub_tokeniser_handle_match_doctype(
					tokeniser);
			break;
		state(STATE_DOCTYPE)
			cont = hubbub_tokeniser_handle_doctype(tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_before_doctype_name(
					tokeniser);
			break;
		state(STATE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_doctype_name(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_after_doctype_name(
					tokeniser);
			break;

		state(STATE_MATCH_PUBLIC)
			cont = hubbub_tokeniser_handle_match_public(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_before_doctype_public(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_DQ)
			cont = hubbub_tokeniser_handle_doctype_public_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_SQ)
			cont = hubbub_tokeniser_handle_doctype_public_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_after_doctype_public(
					tokeniser);
			break;
		state(STATE_MATCH_SYSTEM)
			cont = hubbub_tokeniser_handle_match_system(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_before_doctype_system(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_DQ)
			cont = hubbub_tokeniser_handle_doctype_system_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_SQ)
			cont = hubbub_tokeniser_handle_doctype_system_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_after_doctype_system(
					tokeniser);
			break;
		state(STATE_BOGUS_DOCTYPE)
			cont = hubbub_tokeniser_handle_bogus_doctype(
					tokeniser);
			break;
		state(STATE_MATCH_CDATA)
			cont = hubbub_tokeniser_handle_match_cdata(
					tokeniser);
			break;
		state(STATE_CDATA_BLOCK)
			cont = hubbub_tokeniser_handle_cdata_block(
					tokeniser);
			break;
		state(STATE_NUMBERED_ENTITY)
			cont = hubbub_tokeniser_handle_numbered_entity(
					tokeniser);
			break;
		state(STATE_NAMED_ENTITY)
			cont = hubbub_tokeniser_handle_named_entity(
					tokeniser);
			break;
		}
	}

	return (cont == HUBBUB_NEEDDATA) ? HUBBUB_OK : cont;
}
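
/*
 * Example use of the public API above (an editor's sketch, not part of
 * the original source; the token handler signature and the input stream
 * setup are assumptions based on the declarations in this file and may
 * differ in detail from hubbub's public headers):
 *
 *	static hubbub_error token_cb(const hubbub_token *token, void *pw)
 *	{
 *		return HUBBUB_OK;
 *	}
 *
 *	hubbub_tokeniser *tok;
 *	hubbub_tokeniser_optparams params;
 *
 *	hubbub_tokeniser_create(input, alloc, pw, &tok);
 *
 *	params.token_handler.handler = token_cb;
 *	params.token_handler.pw = NULL;
 *	hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER, &params);
 *
 *	hubbub_tokeniser_run(tok);	(after appending data to the stream)
 *
 *	hubbub_tokeniser_destroy(tok);
 */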


/**
 * Various macros for manipulating buffers.
 *
 * START_BUF initialises a hubbub_string: it appends (length) bytes at
 * (cptr) to the tokeniser's buffer and sets the string's length.
 * COLLECT appends to a string that has already been started (it asserts
 * a non-zero length); COLLECT_MS does the same but permits an empty
 * string. All three return from the enclosing function on failure.
 *
 * \todo make some of these inline functions (type-safety)
 */

#define START_BUF(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len = (length); \
	} while (0)

#define COLLECT(str, cptr, length) \
	do { \
		parserutils_error perror; \
		assert(str.len != 0); \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

#define COLLECT_MS(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)


/* this should always be called with an empty "chars" buffer */
hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
{
	parserutils_error error;
	hubbub_token token;
	const uint8_t *cptr;
	size_t len;

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len)) ==
					PARSERUTILS_OK) {
		const uint8_t c = *cptr;

		if (c == '&' &&
				(tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA ||
				tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA) &&
				tokeniser->escape_flag == false) {
			tokeniser->state =
					STATE_CHARACTER_REFERENCE_DATA;
			/* Don't eat the '&'; it'll be handled by entity
			 * consumption */
			break;
		} else if (c == '-' &&
				tokeniser->escape_flag == false &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->context.pending >= 3) {
			size_t ignore;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 3,
					&cptr,
					&ignore);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr,
					"<!--", SLEN("<!--")) == 0) {
				tokeniser->escape_flag = true;
			}

			tokeniser->context.pending += len;
		} else if (c == '<' && (tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_PCDATA ||
					((tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
					tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->escape_flag == false))) {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Buffer '<' */
			tokeniser->context.pending = len;
			tokeniser->state = STATE_TAG_OPEN;
			break;
		} else if (c == '>' && tokeniser->escape_flag == true &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA)) {
			/* no need to check that there are enough characters,
			 * since you can only run into this if the flag is
			 * true in the first place, which requires four
			 * characters. */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 2,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr, "-->", SLEN("-->")) == 0) {
				tokeniser->escape_flag = false;
			}

			tokeniser->context.pending += len;
		} else if (c == '\0') {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Emit a replacement character */
			emit_character_token(tokeniser, &u_fffd_str);

			/* Advance past NUL */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else if (c == '\r') {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				break;
			}

			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			if (error == PARSERUTILS_EOF || *cptr != '\n') {
				/* Emit newline */
				emit_character_token(tokeniser, &lf_str);
			}

			/* Advance over */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else {
			/* Just collect into buffer */
			tokeniser->context.pending += len;
		}
	}

	if (tokeniser->state != STATE_TAG_OPEN &&
		(tokeniser->state != STATE_DATA || error == PARSERUTILS_EOF) &&
			tokeniser->context.pending > 0) {
		/* Emit any pending characters */
		emit_current_chars(tokeniser);
	}

	if (error == PARSERUTILS_EOF) {
		token.type = HUBBUB_TOKEN_EOF;
		hubbub_tokeniser_emit_token(tokeniser, &token);
	}

	if (error == PARSERUTILS_EOF) {
		return HUBBUB_NEEDDATA;
	} else {
		return hubbub_error_from_parserutils_error(error);
	}
}

/* emit any pending tokens before calling */
hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser)
{
	assert(tokeniser->context.pending == 0);

	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_token token;

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		token.type = HUBBUB_TOKEN_CHARACTER;

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
				tokeniser->context.match_entity.codepoint,
				&utf8ptr, &len);

			token.data.character.ptr = utf8;
			token.data.character.len = sizeof(utf8) - len;

			hubbub_tokeniser_emit_token(tokeniser, &token);

			/* +1 for ampersand */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.match_entity.length
							+ 1);
		} else {
			parserutils_error error;
			const uint8_t *cptr = NULL;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			token.data.character.ptr = cptr;
			token.data.character.len = len;

			hubbub_tokeniser_emit_token(tokeniser, &token);
			parserutils_inputstream_advance(tokeniser->input, len);
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		tokeniser->state = STATE_DATA;
	}

	return HUBBUB_OK;
}

/* this state always switches to another state straight away */
/* this state expects the current character to be '<' */
hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 1);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '/') {
		tokeniser->context.pending += len;

		tokeniser->context.close_tag_match.match = false;
		tokeniser->context.close_tag_match.count = 0;

		tokeniser->state = STATE_CLOSE_TAG_OPEN;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		/* Return to data state with '<' still in "chars" */
		tokeniser->state = STATE_DATA;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
		if (c == '!') {
			parserutils_inputstream_advance(tokeniser->input,
					SLEN("<!"));

			tokeniser->context.pending = 0;
			tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
		} else if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);

			START_BUF(ctag->name, &lc, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(ctag->name, cptr, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/** \todo parse error */

			tokeniser->context.pending += len;
			tokeniser->state = STATE_DATA;
		} else if (c == '?') {
			/** \todo parse error */

			/* Cursor still at "<", need to advance past it */
			parserutils_inputstream_advance(
					tokeniser->input, SLEN("<"));
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		} else {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.chars to be "</" */
/* the tokeniser never stays in this state for more than one character */
hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 2);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
/*	assert(tokeniser->context.chars.ptr[1] == '/'); */

	/** \todo fragment case */

	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		uint8_t *start_tag_name =
			tokeniser->context.last_start_tag_name;
		size_t start_tag_len =
			tokeniser->context.last_start_tag_len;

		while ((error = parserutils_inputstream_peek(tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len)) == PARSERUTILS_OK) {
			c = *cptr;

			if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
					!= (c & ~0x20)) {
				break;
			}

			ctx->close_tag_match.count += len;

			if (ctx->close_tag_match.count == start_tag_len) {
				ctx->close_tag_match.match = true;
				break;
			}
		}

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (ctx->close_tag_match.match == true) {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error != PARSERUTILS_EOF) {
				c = *cptr;

				if (c != '\t' && c != '\n' && c != '\f' &&
						c != ' ' && c != '>' &&
						c != '/') {
					ctx->close_tag_match.match = false;
				}
			}
		}
	}

	if (ctx->close_tag_match.match == false &&
			tokeniser->content_model !=
					HUBBUB_CONTENT_MODEL_PCDATA) {
		/* We should emit "</" here, but instead we leave it in the
		 * buffer so the data state emits it with any characters
		 * following it */
		tokeniser->state = STATE_DATA;
	} else {
		error = parserutils_inputstream_peek(tokeniser->input,
				tokeniser->context.pending, &cptr, &len);

		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */

			/* Return to data state with "</" pending */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else if (error != PARSERUTILS_OK) {
			return hubbub_error_from_parserutils_error(error);
		}

		c = *cptr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(tokeniser->context.current_tag.name,
					&lc, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(tokeniser->context.current_tag.name,
					cptr, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/* Cursor still at "</", need to collect ">" */
			tokeniser->context.pending += len;

			/* Now need to advance past "</>" */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			/** \todo parse error */
			tokeniser->state = STATE_DATA;
		} else {
			/** \todo parse error */

			/* Cursor still at "</", need to advance past it */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.current_tag to already have its
   first character set */
hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending > 0);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
	assert(ctag->name.len > 0);
/*	assert(ctag->name.ptr); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
		tokeniser->context.pending += len;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0);

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				&lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'') {
			/** \todo parse error */
		}

		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

/* this state is only ever triggered by an '=' */
hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ;
	} else if (c == '&') {
		/* Don't consume the '&' -- reprocess in UQ state */
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ;
	} else if (c == '>') {
		/** \todo parse error */
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else {
		if (c == '=') {
			/** \todo parse error */
		}

		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '"';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '\'';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume \r */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;
	uint8_t c;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(c == '&' ||
		ctag->attributes[ctag->n_attributes - 1].value.len >= 1);

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else {
		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_tag *ctag = &tokeniser->context.current_tag;
		hubbub_attribute *attr = &ctag->attributes[
				ctag->n_attributes - 1];

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
				tokeniser->context.match_entity.codepoint,
				&utf8ptr, &len);

			COLLECT_MS(attr->value, utf8, sizeof(utf8) - len);

			/* +1 for the ampersand */
			tokeniser->context.pending +=
					tokeniser->context.match_entity.length
					+ 1;
		} else {
			size_t len = 0;
			const uint8_t *cptr = NULL;
			parserutils_error error;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			/* Insert the ampersand */
			COLLECT_MS(attr->value, cptr, len);
			tokeniser->context.pending += len;
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		/* And back to the previous state */
		tokeniser->state = tokeniser->context.prev_state;
	}

	return HUBBUB_OK;
}

/* always switches state */
hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		/** \todo parse error */
		/* Reprocess character in before attribute name state */
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;

		tokeniser->context.current_tag.self_closing = true;
		return emit_current_tag(tokeniser);
	} else {
		/* Reprocess character in before attribute name state */
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.chars to be empty on first entry */
hubbub_error hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_comment(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_comment(tokeniser);
	} else if (c == '\0') {
		error = parserutils_buffer_append(tokeniser->buffer,
				u_fffd, sizeof(u_fffd));
		if (error != PARSERUTILS_OK)
			return hubbub_error_from_parserutils_error(error);

		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look at the character after the '\r' (the other CR
		 * handlers in this file likewise peek at pending + len) */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			error = parserutils_buffer_append(tokeniser->buffer,
					&lf, sizeof(lf));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		/* Consume '\r' */
		tokeniser->context.pending += 1;
1737	} else {
1738		error = parserutils_buffer_append(tokeniser->buffer,
1739				(uint8_t *) cptr, len);
1740		if (error != PARSERUTILS_OK)
1741			return hubbub_error_from_parserutils_error(error);
1742
1743		tokeniser->context.pending += len;
1744	}
1745
1746	return HUBBUB_OK;
1747}
1748
1749/* this state always switches to another state straight away */
1750hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
1751		hubbub_tokeniser *tokeniser)
1752{
1753	size_t len;
1754	const uint8_t *cptr;
1755	parserutils_error error;
1756	uint8_t c;
1757
1758	assert(tokeniser->context.pending == 0);
1759
1760	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);
1761
1762	if (error != PARSERUTILS_OK) {
1763		if (error == PARSERUTILS_EOF) {
1764			tokeniser->state = STATE_BOGUS_COMMENT;
1765			return HUBBUB_OK;
1766		} else {
1767			return hubbub_error_from_parserutils_error(error);
1768		}
1769	}
1770
1771	c = *cptr;
1772
1773	if (c == '-') {
1774		tokeniser->context.pending = len;
1775		tokeniser->state = STATE_MATCH_COMMENT;
1776	} else if ((c & ~0x20) == 'D') {
1777		tokeniser->context.pending = len;
1778		tokeniser->context.match_doctype.count = len;
1779		tokeniser->state = STATE_MATCH_DOCTYPE;
1780	} else if (tokeniser->process_cdata_section == true && c == '[') {
1781		tokeniser->context.pending = len;
1782		tokeniser->context.match_cdata.count = len;
1783		tokeniser->state = STATE_MATCH_CDATA;
1784	} else {
1785		tokeniser->state = STATE_BOGUS_COMMENT;
1786	}
1787
1788	return HUBBUB_OK;
1789}
1790
1791
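/**
 * Handle the character after a "<!-" sequence: a second '-' completes
 * "<!--" (both dashes are consumed and the comment start state is
 * entered); anything else demotes the construct to a bogus comment.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
 */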
1792hubbub_error hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser)
1793{
1794	size_t len;
1795	const uint8_t *cptr;
1796	parserutils_error error;
1797
1798	error = parserutils_inputstream_peek(tokeniser->input, 
1799			tokeniser->context.pending, &cptr, &len);
1800
1801	if (error != PARSERUTILS_OK) {
1802		if (error == PARSERUTILS_EOF) {
1803			tokeniser->context.pending =
1804				tokeniser->context.current_comment.len = 0;
1805			tokeniser->state = STATE_BOGUS_COMMENT;
1806			return HUBBUB_OK;
1807		} else {
1808			return hubbub_error_from_parserutils_error(error);
1809		}
1810	}
1811
1812	tokeniser->context.pending = tokeniser->context.current_comment.len = 0;
1813
1814	if (*cptr == '-') {
1815		parserutils_inputstream_advance(tokeniser->input, SLEN("--"));
1816		tokeniser->state = STATE_COMMENT_START;
1817	} else {
1818		tokeniser->state = STATE_BOGUS_COMMENT;
1819	}
1820
1821	return HUBBUB_OK;
1822}
1823
1824
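/**
 * Handle input in any of the five comment states (comment start,
 * comment start dash, comment, comment end dash, comment end); the
 * current sub-state is read from tokeniser->state, and dashes are
 * re-appended to the buffer when they turn out not to close the
 * comment.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
 */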
1825hubbub_error hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
1826{
1827	size_t len;
1828	const uint8_t *cptr;
1829	parserutils_error error;
1830	uint8_t c;
1831
1832	error = parserutils_inputstream_peek(tokeniser->input, 
1833			tokeniser->context.pending, &cptr, &len);
1834
1835	if (error != PARSERUTILS_OK) {
1836		if (error == PARSERUTILS_EOF) {
1837			tokeniser->state = STATE_DATA;
1838			return emit_current_comment(tokeniser);
1839		} else {
1840			return hubbub_error_from_parserutils_error(error);
1841		}
1842	}
1843
1844	c = *cptr;
1845
1846	if (c == '>' && (tokeniser->state == STATE_COMMENT_START_DASH ||
1847			tokeniser->state == STATE_COMMENT_START ||
1848			tokeniser->state == STATE_COMMENT_END)) {
1849		tokeniser->context.pending += len;
1850
1851		/** \todo parse error if state != COMMENT_END */
1852		tokeniser->state = STATE_DATA;
1853		return emit_current_comment(tokeniser);
1854	} else if (c == '-') {
1855		if (tokeniser->state == STATE_COMMENT_START) {
1856			tokeniser->state = STATE_COMMENT_START_DASH;
1857		} else if (tokeniser->state == STATE_COMMENT_START_DASH) {
1858			tokeniser->state = STATE_COMMENT_END;
1859		} else if (tokeniser->state == STATE_COMMENT) {
1860			tokeniser->state = STATE_COMMENT_END_DASH;
1861		} else if (tokeniser->state == STATE_COMMENT_END_DASH) {
1862			tokeniser->state = STATE_COMMENT_END;
1863		} else if (tokeniser->state == STATE_COMMENT_END) {
1864			error = parserutils_buffer_append(tokeniser->buffer,
1865					(uint8_t *) "-", SLEN("-"));
1866			if (error != PARSERUTILS_OK) {
1867				return hubbub_error_from_parserutils_error(
1868						error);
1869			}
1870		}
1871
1872		tokeniser->context.pending += len;
1873	} else {
1874		if (tokeniser->state == STATE_COMMENT_START_DASH ||
1875				tokeniser->state == STATE_COMMENT_END_DASH) {
1876			error = parserutils_buffer_append(tokeniser->buffer,
1877					(uint8_t *) "-", SLEN("-"));
1878			if (error != PARSERUTILS_OK) {
1879				return hubbub_error_from_parserutils_error(
1880						error);
1881			}
1882		} else if (tokeniser->state == STATE_COMMENT_END) {
1883			error = parserutils_buffer_append(tokeniser->buffer,
1884					(uint8_t *) "--", SLEN("--"));
1885			if (error != PARSERUTILS_OK) {
1886				return hubbub_error_from_parserutils_error(
1887						error);
1888			}
1889		}
1890
1891		if (c == '\0') {
1892			error = parserutils_buffer_append(tokeniser->buffer,
1893					u_fffd, sizeof(u_fffd));
1894			if (error != PARSERUTILS_OK) {
1895				return hubbub_error_from_parserutils_error(
1896						error);
1897			}
1898		} else if (c == '\r') {
1899			size_t next_len;
1900			error = parserutils_inputstream_peek(
1901					tokeniser->input,
1902					tokeniser->context.pending + len,
1903					&cptr,
1904					&next_len);
1905			if (error != PARSERUTILS_OK && 
1906					error != PARSERUTILS_EOF) {
1907				return hubbub_error_from_parserutils_error(
1908						error);
			} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
1910				error = parserutils_buffer_append(
1911						tokeniser->buffer,
1912						&lf, sizeof(lf));
1913				if (error != PARSERUTILS_OK) {
1914					return hubbub_error_from_parserutils_error(
1915							error);
1916				}
1917			}
1918		} else {
1919			error = parserutils_buffer_append(tokeniser->buffer, 
1920					cptr, len);
1921			if (error != PARSERUTILS_OK) {
1922				return hubbub_error_from_parserutils_error(
1923						error);
1924			}
1925		}
1926
1927		tokeniser->context.pending += len;
1928		tokeniser->state = STATE_COMMENT;
1929	}
1930
1931	return HUBBUB_OK;
1932}
1933
1934
1937#define DOCTYPE		"DOCTYPE"
1938#define DOCTYPE_LEN	(SLEN(DOCTYPE) - 1)
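/* DOCTYPE_LEN is one less than the keyword length because the initial
 * 'D' was already consumed in the markup declaration open state, so
 * matching resumes at index 1 with match_doctype.count == 1. */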
1939
1940hubbub_error hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
1941{
1942	size_t len;
1943	const uint8_t *cptr;
1944	parserutils_error error;
1945	uint8_t c;
1946
1947	error = parserutils_inputstream_peek(tokeniser->input,
1948			tokeniser->context.match_doctype.count, &cptr, &len);
1949
1950	if (error != PARSERUTILS_OK) {
1951		if (error == PARSERUTILS_EOF) {
1952			tokeniser->context.current_comment.len =
1953					tokeniser->context.pending = 0;
1954			tokeniser->state = STATE_BOGUS_COMMENT;
1955			return HUBBUB_OK;
1956		} else {
1957			return hubbub_error_from_parserutils_error(error);
1958		}
1959	}
1960
1961	c = *cptr;
1962
1963	assert(tokeniser->context.match_doctype.count <= DOCTYPE_LEN);
1964
1965	if (DOCTYPE[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
1966		tokeniser->context.current_comment.len =
1967				tokeniser->context.pending = 0;
1968		tokeniser->state = STATE_BOGUS_COMMENT;
1969		return HUBBUB_OK;
1970	}
1971
1972	tokeniser->context.pending += len;
1973
1974	if (tokeniser->context.match_doctype.count == DOCTYPE_LEN) {
1975		/* Skip over the DOCTYPE bit */
1976		parserutils_inputstream_advance(tokeniser->input,
1977				tokeniser->context.pending);
1978
1979		memset(&tokeniser->context.current_doctype, 0,
1980				sizeof tokeniser->context.current_doctype);
1981		tokeniser->context.current_doctype.public_missing = true;
1982		tokeniser->context.current_doctype.system_missing = true;
1983		tokeniser->context.pending = 0;
1984
1985		tokeniser->state = STATE_DOCTYPE;
1986	}
1987
1988	tokeniser->context.match_doctype.count++;
1989
1990	return HUBBUB_OK;
1991}
1992
1993#undef DOCTYPE
1994#undef DOCTYPE_LEN
1995
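/**
 * Handle the character immediately after the DOCTYPE keyword: a single
 * whitespace character is skipped; anything else is left to be
 * reprocessed in the before doctype name state.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
 */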
1996hubbub_error hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
1997{
1998	size_t len;
1999	const uint8_t *cptr;
2000	parserutils_error error;
2001	uint8_t c;
2002
2003	error = parserutils_inputstream_peek(tokeniser->input,
2004			tokeniser->context.pending, &cptr, &len);
2005
2006	if (error != PARSERUTILS_OK) {
2007		if (error == PARSERUTILS_EOF) {
2008			tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
2009			return HUBBUB_OK;
2010		} else {
2011			return hubbub_error_from_parserutils_error(error);
2012		}
2013	}
2014
2015	c = *cptr;
2016
2017	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2018		tokeniser->context.pending += len;
2019	}
2020
2021	tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
2022
2023	return HUBBUB_OK;
2024}
2025
2026hubbub_error hubbub_tokeniser_handle_before_doctype_name(
2027		hubbub_tokeniser *tokeniser)
2028{
2029	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2030	size_t len;
2031	const uint8_t *cptr;
2032	parserutils_error error;
2033	uint8_t c;
2034
2035	error = parserutils_inputstream_peek(tokeniser->input,
2036			tokeniser->context.pending, &cptr, &len);
2037
2038	if (error != PARSERUTILS_OK) {
2039		if (error == PARSERUTILS_EOF) {
2040			/** \todo parse error */
2041			/* Emit current doctype, force-quirks on */
2042			tokeniser->state = STATE_DATA;
2043			return emit_current_doctype(tokeniser, true);
2044		} else {
2045			return hubbub_error_from_parserutils_error(error);
2046		}
2047	}
2048
2049	c = *cptr;
2050
2051	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2052		/* pass over in silence */
2053		tokeniser->context.pending += len;
2054	} else if (c == '>') {
2055		/** \todo parse error */
2056		tokeniser->context.pending += len;
2057		tokeniser->state = STATE_DATA;
2058		return emit_current_doctype(tokeniser, true);
2059	} else {
2060		if (c == '\0') {
2061			START_BUF(cdoc->name, u_fffd, sizeof(u_fffd));
2062		} else if ('A' <= c && c <= 'Z') {
2063			uint8_t lc = c + 0x20;
2064
2065			START_BUF(cdoc->name, &lc, len);
2066		} else {
2067			START_BUF(cdoc->name, cptr, len);
2068		}
2069
2070		tokeniser->context.pending += len;
2071		tokeniser->state = STATE_DOCTYPE_NAME;
2072	}
2073
2074	return HUBBUB_OK;
2075}
2076
2077hubbub_error hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
2078{
2079	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2080	size_t len;
2081	const uint8_t *cptr;
2082	parserutils_error error;
2083	uint8_t c;
2084
2085	error = parserutils_inputstream_peek(tokeniser->input,
2086			tokeniser->context.pending, &cptr, &len);
2087
2088	if (error != PARSERUTILS_OK) {
2089		if (error == PARSERUTILS_EOF) {
2090			tokeniser->state = STATE_DATA;
2091			return emit_current_doctype(tokeniser, true);
2092		} else {
2093			return hubbub_error_from_parserutils_error(error);
2094		}
2095	}
2096
2097	c = *cptr;
2098
2099	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2100		tokeniser->context.pending += len;
2101		tokeniser->state = STATE_AFTER_DOCTYPE_NAME;
2102	} else if (c == '>') {
2103		tokeniser->context.pending += len;
2104		tokeniser->state = STATE_DATA;
2105		return emit_current_doctype(tokeniser, false);
2106	} else if (c == '\0') {
2107		COLLECT(cdoc->name, u_fffd, sizeof(u_fffd));
2108		tokeniser->context.pending += len;
2109	} else if ('A' <= c && c <= 'Z') {
2110		uint8_t lc = c + 0x20;
2111		COLLECT(cdoc->name, &lc, len);
2112		tokeniser->context.pending += len;
2113	} else {
2114		COLLECT(cdoc->name, cptr, len);
2115		tokeniser->context.pending += len;
2116	}
2117
2118	return HUBBUB_OK;
2119}
2120
2121hubbub_error hubbub_tokeniser_handle_after_doctype_name(
2122		hubbub_tokeniser *tokeniser)
2123{
2124	size_t len;
2125	const uint8_t *cptr;
2126	parserutils_error error;
2127	uint8_t c;
2128
2129	error = parserutils_inputstream_peek(tokeniser->input,
2130			tokeniser->context.pending, &cptr, &len);
2131
2132	if (error != PARSERUTILS_OK) {
2133		if (error == PARSERUTILS_EOF) {
2134			tokeniser->state = STATE_DATA;
2135			return emit_current_doctype(tokeniser, true);
2136		} else {
2137			return hubbub_error_from_parserutils_error(error);
2138		}
2139	}
2140
2141	c = *cptr;
2142	tokeniser->context.pending += len;
2143
2144	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2145		/* pass over in silence */
2146	} else if (c == '>') {
2147		tokeniser->state = STATE_DATA;
2148		return emit_current_doctype(tokeniser, false);
2149	} else if ((c & ~0x20) == 'P') {
2150		tokeniser->context.match_doctype.count = 1;
2151		tokeniser->state = STATE_MATCH_PUBLIC;
2152	} else if ((c & ~0x20) == 'S') {
2153		tokeniser->context.match_doctype.count = 1;
2154		tokeniser->state = STATE_MATCH_SYSTEM;
2155	} else {
2156		tokeniser->state = STATE_BOGUS_DOCTYPE;
2157		tokeniser->context.current_doctype.force_quirks = true;
2158	}
2159
2160	return HUBBUB_OK;
2161}
2162
2163#define PUBLIC		"PUBLIC"
2164#define PUBLIC_LEN	(SLEN(PUBLIC) - 1)
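/* As with DOCTYPE_LEN above: the initial 'P' was consumed in the after
 * doctype name state, so matching resumes at index 1. */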
2165
2166hubbub_error hubbub_tokeniser_handle_match_public(hubbub_tokeniser *tokeniser)
2167{
2168	size_t len;
2169	const uint8_t *cptr;
2170	parserutils_error error;
2171	uint8_t c;
2172
2173	error = parserutils_inputstream_peek(tokeniser->input,
2174			tokeniser->context.pending, &cptr, &len);
2175
2176	if (error != PARSERUTILS_OK) {
2177		if (error == PARSERUTILS_EOF) {
2178			tokeniser->context.current_doctype.force_quirks = true;
2179			tokeniser->state = STATE_BOGUS_DOCTYPE;
2180			return HUBBUB_OK;
2181		} else {
2182			return hubbub_error_from_parserutils_error(error);
2183		}
2184	}
2185
2186	c = *cptr;
2187
2188	assert(tokeniser->context.match_doctype.count <= PUBLIC_LEN);
2189
2190	if (PUBLIC[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
2191		tokeniser->context.current_doctype.force_quirks = true;
2192		tokeniser->state = STATE_BOGUS_DOCTYPE;
2193		return HUBBUB_OK;
2194	}
2195
2196	tokeniser->context.pending += len;
2197
2198	if (tokeniser->context.match_doctype.count == PUBLIC_LEN) {
2199		tokeniser->state = STATE_BEFORE_DOCTYPE_PUBLIC;
2200	}
2201
2202	tokeniser->context.match_doctype.count++;
2203
2204	return HUBBUB_OK;
2205}
2206
2207#undef PUBLIC
2208#undef PUBLIC_LEN
2209
2210hubbub_error hubbub_tokeniser_handle_before_doctype_public(
2211		hubbub_tokeniser *tokeniser)
2212{
2213	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2214	size_t len;
2215	const uint8_t *cptr;
2216	parserutils_error error;
2217	uint8_t c;
2218
2219	error = parserutils_inputstream_peek(tokeniser->input,
2220			tokeniser->context.pending, &cptr, &len);
2221
2222	if (error != PARSERUTILS_OK) {
2223		if (error == PARSERUTILS_EOF) {
2224			tokeniser->state = STATE_DATA;
2225			return emit_current_doctype(tokeniser, true);
2226		} else {
2227			return hubbub_error_from_parserutils_error(error);
2228		}
2229	}
2230
2231	c = *cptr;
2232	tokeniser->context.pending += len;
2233
2234	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2235		/* pass over in silence */
2236	} else if (c == '"') {
2237		cdoc->public_missing = false;
2238		cdoc->public_id.len = 0;
2239		tokeniser->state = STATE_DOCTYPE_PUBLIC_DQ;
2240	} else if (c == '\'') {
2241		cdoc->public_missing = false;
2242		cdoc->public_id.len = 0;
2243		tokeniser->state = STATE_DOCTYPE_PUBLIC_SQ;
2244	} else if (c == '>') {
2245		tokeniser->state = STATE_DATA;
2246		return emit_current_doctype(tokeniser, true);
2247	} else {
2248		cdoc->force_quirks = true;
2249		tokeniser->state = STATE_BOGUS_DOCTYPE;
2250	}
2251
2252	return HUBBUB_OK;
2253}
2254
2255hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
2256		hubbub_tokeniser *tokeniser)
2257{
2258	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2259	size_t len;
2260	const uint8_t *cptr;
2261	parserutils_error error;
2262	uint8_t c;
2263
2264	error = parserutils_inputstream_peek(tokeniser->input,
2265			tokeniser->context.pending, &cptr, &len);
2266
2267	if (error != PARSERUTILS_OK) {
2268		if (error == PARSERUTILS_EOF) {
2269			tokeniser->state = STATE_DATA;
2270			return emit_current_doctype(tokeniser, true);
2271		} else {
2272			return hubbub_error_from_parserutils_error(error);
2273		}
2274	}
2275
2276	c = *cptr;
2277
2278	if (c == '"') {
2279		tokeniser->context.pending += len;
2280		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
2281	} else if (c == '>') {
2282		tokeniser->context.pending += len;
2283		tokeniser->state = STATE_DATA;
2284		return emit_current_doctype(tokeniser, true);
2285	} else if (c == '\0') {
2286		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
2287		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look at the character after the '\r' so a CRLF pair
		 * collapses to a single LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);
2294
2295		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2296			return hubbub_error_from_parserutils_error(error);
2297		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2298			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
2299		}
2300
2301		/* Collect '\r' */
2302		tokeniser->context.pending += 1;
2303	} else {
2304		COLLECT_MS(cdoc->public_id, cptr, len);
2305
2306		tokeniser->context.pending += len;
2307	}
2308
2309	return HUBBUB_OK;
2310}
2311
2312hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
2313		hubbub_tokeniser *tokeniser)
2314{
2315	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2316	size_t len;
2317	const uint8_t *cptr;
2318	parserutils_error error;
2319	uint8_t c;
2320
2321	error = parserutils_inputstream_peek(tokeniser->input,
2322			tokeniser->context.pending, &cptr, &len);
2323
2324	if (error != PARSERUTILS_OK) {
2325		if (error == PARSERUTILS_EOF) {
2326			tokeniser->state = STATE_DATA;
2327			return emit_current_doctype(tokeniser, true);
2328		} else {
2329			return hubbub_error_from_parserutils_error(error);
2330		}
2331	}
2332
2333	c = *cptr;
2334
2335	if (c == '\'') {
2336		tokeniser->context.pending += len;
2337		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
2338	} else if (c == '>') {
2339		tokeniser->context.pending += len;
2340		tokeniser->state = STATE_DATA;
2341		return emit_current_doctype(tokeniser, true);
2342	} else if (c == '\0') {
2343		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
2344		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look at the character after the '\r' so a CRLF pair
		 * collapses to a single LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
		}

2358		/* Collect '\r' */
2359		tokeniser->context.pending += 1;
2360	} else {
2361		COLLECT_MS(cdoc->public_id, cptr, len);
2362		tokeniser->context.pending += len;
2363	}
2364
2365	return HUBBUB_OK;
2366}
2367
2368
2369hubbub_error hubbub_tokeniser_handle_after_doctype_public(
2370		hubbub_tokeniser *tokeniser)
2371{
2372	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2373	size_t len;
2374	const uint8_t *cptr;
2375	parserutils_error error;
2376	uint8_t c;
2377
2378	error = parserutils_inputstream_peek(tokeniser->input,
2379			tokeniser->context.pending, &cptr, &len);
2380
2381	if (error != PARSERUTILS_OK) {
2382		if (error == PARSERUTILS_EOF) {
2383			tokeniser->state = STATE_DATA;
2384			return emit_current_doctype(tokeniser, true);
2385		} else {
2386			return hubbub_error_from_parserutils_error(error);
2387		}
2388	}
2389
2390	c = *cptr;
2391	tokeniser->context.pending += len;
2392
2393	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2394		/* pass over in silence */
2395	} else if (c == '"') {
2396		cdoc->system_missing = false;
2397		cdoc->system_id.len = 0;
2398
2399		tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
2400	} else if (c == '\'') {
2401		cdoc->system_missing = false;
2402		cdoc->system_id.len = 0;
2403
2404		tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
2405	} else if (c == '>') {
2406		tokeniser->state = STATE_DATA;
2407		return emit_current_doctype(tokeniser, false);
2408	} else {
2409		cdoc->force_quirks = true;
2410		tokeniser->state = STATE_BOGUS_DOCTYPE;
2411	}
2412
2413	return HUBBUB_OK;
2414}
2415
2416
2417
2418#define SYSTEM		"SYSTEM"
2419#define SYSTEM_LEN	(SLEN(SYSTEM) - 1)
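/* As with DOCTYPE_LEN above: the initial 'S' was consumed in the after
 * doctype name state, so matching resumes at index 1. */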
2420
2421hubbub_error hubbub_tokeniser_handle_match_system(hubbub_tokeniser *tokeniser)
2422{
2423	size_t len;
2424	const uint8_t *cptr;
2425	parserutils_error error;
2426	uint8_t c;
2427
2428	error = parserutils_inputstream_peek(tokeniser->input,
2429			tokeniser->context.pending, &cptr, &len);
2430
	if (error != PARSERUTILS_OK) {
2432		if (error == PARSERUTILS_EOF) {
2433			tokeniser->context.current_doctype.force_quirks = true;
2434			tokeniser->state = STATE_BOGUS_DOCTYPE;
2435			return HUBBUB_OK;
2436		} else {
2437			return hubbub_error_from_parserutils_error(error);
2438		}
2439	}
2440
2441	c = *cptr;
2442
2443	assert(tokeniser->context.match_doctype.count <= SYSTEM_LEN);
2444
2445	if (SYSTEM[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
2446		tokeniser->context.current_doctype.force_quirks = true;
2447		tokeniser->state = STATE_BOGUS_DOCTYPE;
2448		return HUBBUB_OK;
2449	}
2450
2451	tokeniser->context.pending += len;
2452
2453	if (tokeniser->context.match_doctype.count == SYSTEM_LEN) {
2454		tokeniser->state = STATE_BEFORE_DOCTYPE_SYSTEM;
2455	}
2456
2457	tokeniser->context.match_doctype.count++;
2458
2459	return HUBBUB_OK;
2460}
2461
2462#undef SYSTEM
2463#undef SYSTEM_LEN
2464
2465hubbub_error hubbub_tokeniser_handle_before_doctype_system(
2466		hubbub_tokeniser *tokeniser)
2467{
2468	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2469	size_t len;
2470	const uint8_t *cptr;
2471	parserutils_error error;
2472	uint8_t c;
2473
2474	error = parserutils_inputstream_peek(tokeniser->input,
2475			tokeniser->context.pending, &cptr, &len);
2476
2477	if (error != PARSERUTILS_OK) {
2478		if (error == PARSERUTILS_EOF) {
2479			tokeniser->state = STATE_DATA;
2480			return emit_current_doctype(tokeniser, true);
2481		} else {
2482			return hubbub_error_from_parserutils_error(error);
2483		}
2484	}
2485
2486	c = *cptr;
2487	tokeniser->context.pending += len;
2488
2489	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2490		/* pass over */
2491	} else if (c == '"') {
2492		cdoc->system_missing = false;
2493		cdoc->system_id.len = 0;
2494
2495		tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
2496	} else if (c == '\'') {
2497		cdoc->system_missing = false;
2498		cdoc->system_id.len = 0;
2499
2500		tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
2501	} else if (c == '>') {
2502		tokeniser->state = STATE_DATA;
2503		return emit_current_doctype(tokeniser, true);
2504	} else {
2505		cdoc->force_quirks = true;
2506		tokeniser->state = STATE_BOGUS_DOCTYPE;
2507	}
2508
2509	return HUBBUB_OK;
2510}
2511
2512hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
2513		hubbub_tokeniser *tokeniser)
2514{
2515	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2516	size_t len;
2517	const uint8_t *cptr;
2518	parserutils_error error;
2519	uint8_t c;
2520
2521	error = parserutils_inputstream_peek(tokeniser->input,
2522			tokeniser->context.pending, &cptr, &len);
2523
2524	if (error != PARSERUTILS_OK) {
2525		if (error == PARSERUTILS_EOF) {
2526			tokeniser->state = STATE_DATA;
2527			return emit_current_doctype(tokeniser, true);
2528		} else {
2529			return hubbub_error_from_parserutils_error(error);
2530		}
2531	}
2532
2533	c = *cptr;
2534
2535	if (c == '"') {
2536		tokeniser->context.pending += len;
2537		tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
2538	} else if (c == '>') {
2539		tokeniser->context.pending += len;
2540		tokeniser->state = STATE_DATA;
2541		return emit_current_doctype(tokeniser, true);
2542	} else if (c == '\0') {
2543		COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
2544		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look at the character after the '\r' so a CRLF pair
		 * collapses to a single LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);
2551
2552		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2553			return hubbub_error_from_parserutils_error(error);
2554		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2555			COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
2556		}
2557
2558		/* Collect '\r' */
2559		tokeniser->context.pending += 1;
2560	} else {
2561		COLLECT_MS(cdoc->system_id, cptr, len);
2562		tokeniser->context.pending += len;
2563	}
2564
2565	return HUBBUB_OK;
2566}
2567
2568hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
2569		hubbub_tokeniser *tokeniser)
2570{
2571	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2572	size_t len;
2573	const uint8_t *cptr;
2574	parserutils_error error;
2575	uint8_t c;
2576
2577	error = parserutils_inputstream_peek(tokeniser->input,
2578			tokeniser->context.pending, &cptr, &len);
2579
2580	if (error != PARSERUTILS_OK) {
2581		if (error == PARSERUTILS_EOF) {
2582			tokeniser->state = STATE_DATA;
2583			return emit_current_doctype(tokeniser, true);
2584		} else {
2585			return hubbub_error_from_parserutils_error(error);
2586		}
2587	}
2588
2589	c = *cptr;
2590
2591	if (c == '\'') {
2592		tokeniser->context.pending += len;
2593		tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
2594	} else if (c == '>') {
2595		tokeniser->context.pending += len;
2596		tokeniser->state = STATE_DATA;
2597		return emit_current_doctype(tokeniser, true);
2598	} else if (c == '\0') {
2599		COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
2600		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look at the character after the '\r' so a CRLF pair
		 * collapses to a single LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);
2607
2608		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2609			return hubbub_error_from_parserutils_error(error);
2610		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2611			COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
2612		}
2613
2614		/* Collect '\r' */
2615		tokeniser->context.pending += 1;
2616	} else {
2617		COLLECT_MS(cdoc->system_id, cptr, len);
2618		tokeniser->context.pending += len;
2619	}
2620
2621	return HUBBUB_OK;
2622}
2623
2624hubbub_error hubbub_tokeniser_handle_after_doctype_system(
2625		hubbub_tokeniser *tokeniser)
2626{
2627	size_t len;
2628	const uint8_t *cptr;
2629	parserutils_error error;
2630	uint8_t c;
2631
2632	error = parserutils_inputstream_peek(tokeniser->input,
2633			tokeniser->context.pending, &cptr, &len);
2634
2635	if (error != PARSERUTILS_OK) {
2636		if (error == PARSERUTILS_EOF) {
2637			tokeniser->state = STATE_DATA;
2638			return emit_current_doctype(tokeniser, true);
2639		} else {
2640			return hubbub_error_from_parserutils_error(error);
2641		}
2642	}
2643
2644	c = *cptr;
2645	tokeniser->context.pending += len;
2646
2647	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2648		/* pass over in silence */
2649	} else if (c == '>') {
2650		tokeniser->state = STATE_DATA;
2651		return emit_current_doctype(tokeniser, false);
2652	} else {
2653		tokeniser->state = STATE_BOGUS_DOCTYPE;
2654	}
2655
2656	return HUBBUB_OK;
2657}
2658
2659
2660hubbub_error hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
2661{
2662	size_t len;
2663	const uint8_t *cptr;
2664	parserutils_error error;
2665	uint8_t c;
2666
2667	error = parserutils_inputstream_peek(tokeniser->input,
2668			tokeniser->context.pending, &cptr, &len);
2669
2670	if (error != PARSERUTILS_OK) {
2671		if (error == PARSERUTILS_EOF) {
2672			tokeniser->state = STATE_DATA;
2673			return emit_current_doctype(tokeniser, false);
2674		} else {
2675			return hubbub_error_from_parserutils_error(error);
2676		}
2677	}
2678
2679	c = *cptr;
2680	tokeniser->context.pending += len;
2681
2682	if (c == '>') {
2683		tokeniser->state = STATE_DATA;
2684		return emit_current_doctype(tokeniser, false);
2685	}
2686
2687	return HUBBUB_OK;
2688}
2689
2690
2691
2692#define CDATA		"[CDATA["
2693#define CDATA_LEN	(SLEN(CDATA) - 1)
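/* As with DOCTYPE_LEN above: the initial '[' was consumed in the markup
 * declaration open state, so matching resumes at index 1. */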
2694
2695hubbub_error hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser)
2696{
2697	size_t len;
2698	const uint8_t *cptr;
2699	parserutils_error error;
2700	uint8_t c;
2701
2702	error = parserutils_inputstream_peek(tokeniser->input,
2703			tokeniser->context.pending, &cptr, &len);
2704
2705	if (error != PARSERUTILS_OK) {
2706		if (error == PARSERUTILS_EOF) {
2707			tokeniser->context.current_comment.len =
2708					tokeniser->context.pending = 0;
2709			tokeniser->state = STATE_BOGUS_COMMENT;
2710			return HUBBUB_OK;
2711		} else {
2712			return hubbub_error_from_parserutils_error(error);
2713		}
2714	}
2715
2716	c = *cptr;
2717
2718	assert(tokeniser->context.match_cdata.count <= CDATA_LEN);
2719
2720	if (CDATA[tokeniser->context.match_cdata.count] != (c & ~0x20)) {
		tokeniser->context.current_comment.len =
				tokeniser->context.pending = 0;
2724		tokeniser->state = STATE_BOGUS_COMMENT;
2725		return HUBBUB_OK;
2726	}
2727
2728	tokeniser->context.pending += len;
2729
2730	if (tokeniser->context.match_cdata.count == CDATA_LEN) {
2731		parserutils_inputstream_advance(tokeniser->input,
2732				tokeniser->context.match_cdata.count + len);
2733		tokeniser->context.pending = 0;
2734		tokeniser->context.match_cdata.end = 0;
2735		tokeniser->state = STATE_CDATA_BLOCK;
2736	}
2737
2738	tokeniser->context.match_cdata.count += len;
2739
2740	return HUBBUB_OK;
2741}
2742
2743#undef CDATA
2744#undef CDATA_LEN
2745
2746
2747hubbub_error hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser)
2748{
2749	size_t len;
2750	const uint8_t *cptr;
2751	parserutils_error error;
2752	uint8_t c;
2753
2754	error = parserutils_inputstream_peek(tokeniser->input,
2755			tokeniser->context.pending, &cptr, &len);
2756
2757	if (error != PARSERUTILS_OK) {
2758		if (error == PARSERUTILS_EOF) {
2759			tokeniser->state = STATE_DATA;
2760			return emit_current_chars(tokeniser);
2761		} else {
2762			return hubbub_error_from_parserutils_error(error);
2763		}
2764	}
2765
2766	c = *cptr;
2767
2768	if (c == ']' && (tokeniser->context.match_cdata.end == 0 ||
2769			tokeniser->context.match_cdata.end == 1)) {
2770		tokeniser->context.pending += len;
2771		tokeniser->context.match_cdata.end += len;
2772	} else if (c == '>' && tokeniser->context.match_cdata.end == 2) {
2773		/* Remove the previous two "]]" */
2774		tokeniser->context.pending -= 2;
2775
2776		/* Emit any pending characters */
2777		emit_current_chars(tokeniser);
2778
2779		/* Now move past the "]]>" bit */
2780		parserutils_inputstream_advance(tokeniser->input, SLEN("]]>"));
2781
2782		tokeniser->state = STATE_DATA;
2783	} else if (c == '\0') {
2784		if (tokeniser->context.pending > 0) {
2785			/* Emit any pending characters */
2786			emit_current_chars(tokeniser);
2787		}
2788
2789		/* Perform NUL-byte replacement */
2790		emit_character_token(tokeniser, &u_fffd_str);
2791
2792		parserutils_inputstream_advance(tokeniser->input, len);
2793		tokeniser->context.match_cdata.end = 0;
2794	} else if (c == '\r') {
2795		error = parserutils_inputstream_peek(
2796				tokeniser->input,
2797				tokeniser->context.pending + len,
2798				&cptr,
2799				&len);
2800
2801		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2802			return hubbub_error_from_parserutils_error(error);
2803		}
2804
2805		if (tokeniser->context.pending > 0) {
2806			/* Emit any pending characters */
2807			emit_current_chars(tokeniser);
2808		}
2809
2810		if (error == PARSERUTILS_EOF || *cptr != '\n') {
2811			/* Emit newline */
2812			emit_character_token(tokeniser, &lf_str);
2813		}
2814
2815		/* Advance over \r */
2816		parserutils_inputstream_advance(tokeniser->input, 1);
2817		tokeniser->context.match_cdata.end = 0;
2818	} else {
2819		tokeniser->context.pending += len;
2820		tokeniser->context.match_cdata.end = 0;
2821	}
2822
2823	return HUBBUB_OK;
2824}
2825
2826
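/**
 * Begin consuming a (possible) character reference.
 *
 * \param tokeniser	Tokeniser instance
 * \param pos		Byte offset of the introducing '&' in the input
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
 *
 * Resets the match_entity context, then inspects the character after
 * the ampersand: whitespace, '<', '&', EOF, or the additional allowed
 * character (used for attribute values) flags an immediate non-match;
 * '#' dispatches to the numbered entity state; anything else to the
 * named entity state. Callers pick up the result from
 * tokeniser->context.match_entity once its complete flag is set.
 */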
2827hubbub_error hubbub_tokeniser_consume_character_reference(
2828		hubbub_tokeniser *tokeniser, size_t pos)
2829{
2830	uint32_t allowed_char = tokeniser->context.allowed_char;
2831
2832	size_t len;
2833	const uint8_t *cptr;
2834	parserutils_error error;
2835	uint8_t c;
2836	size_t off;
2837
2838	error = parserutils_inputstream_peek(tokeniser->input, pos, 
2839			&cptr, &len);
2840
2841	/* We should always start on an ampersand */
2842	assert(error == PARSERUTILS_OK);
2843	assert(len == 1 && *cptr == '&');
2844
2845	off = pos + len;
2846
2847	/* Look at the character after the ampersand */
2848	error = parserutils_inputstream_peek(tokeniser->input, off, 
2849			&cptr, &len);
2850
2851	if (error != PARSERUTILS_OK) {
2852		if (error == PARSERUTILS_EOF) {
2853			tokeniser->context.match_entity.complete = true;
2854			tokeniser->context.match_entity.codepoint = 0;
2855			return HUBBUB_OK;
2856		} else {
2857			return hubbub_error_from_parserutils_error(error);
2858		}
2859	}
2860
2861	c = *cptr;
2862
2863	/* Set things up */
2864	tokeniser->context.match_entity.offset = off;
2865	tokeniser->context.match_entity.poss_length = 0;
2866	tokeniser->context.match_entity.length = 0;
2867	tokeniser->context.match_entity.base = 0;
2868	tokeniser->context.match_entity.codepoint = 0;
2869	tokeniser->context.match_entity.had_data = false;
2870	tokeniser->context.match_entity.return_state = tokeniser->state;
2871	tokeniser->context.match_entity.complete = false;
2872	tokeniser->context.match_entity.overflow = false;
2873	tokeniser->context.match_entity.context = NULL;
2874	tokeniser->context.match_entity.prev_len = len;
2875
2876	/* Reset allowed character for future calls */
2877	tokeniser->context.allowed_char = '\0';
2878
2879	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
2880			c == '<' || c == '&' ||
2881			(allowed_char && c == allowed_char)) {
2882		tokeniser->context.match_entity.complete = true;
2883		tokeniser->context.match_entity.codepoint = 0;
2884	} else if (c == '#') {
2885		tokeniser->context.match_entity.length += len;
2886		tokeniser->state = STATE_NUMBERED_ENTITY;
2887	} else {
2888		tokeniser->state = STATE_NAMED_ENTITY;
2889	}
2890
2891	return HUBBUB_OK;
2892}
2893
2894
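/**
 * Handle a numeric character reference ("&#...;" or "&#x...;"),
 * accumulating digits into match_entity.codepoint and applying the
 * fixups below (Windows-1252 remapping of 0x80-0x9F, CR to LF, and
 * replacement of invalid codepoints with U+FFFD). For example, both
 * "&#128;" and "&#x80;" yield U+20AC via cp1252Table.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
 */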
2895hubbub_error hubbub_tokeniser_handle_numbered_entity(
2896		hubbub_tokeniser *tokeniser)
2897{
2898	hubbub_tokeniser_context *ctx = &tokeniser->context;
2899
2900	size_t len;
2901	const uint8_t *cptr;
2902	parserutils_error error;
2903
2904	error = parserutils_inputstream_peek(tokeniser->input,
2905			ctx->match_entity.offset + ctx->match_entity.length,
2906			&cptr, &len);
2907
2908	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2909		return hubbub_error_from_parserutils_error(error);
2910	}
2911
2912	if (error != PARSERUTILS_EOF && ctx->match_entity.base == 0) {
2913		uint8_t c = *cptr;
2914		if ((c & ~0x20) == 'X') {
2915			ctx->match_entity.base = 16;
2916			ctx->match_entity.length += len;
2917		} else {
2918			ctx->match_entity.base = 10;
2919		}
2920	}
2921
2922	while ((error = parserutils_inputstream_peek(tokeniser->input,
2923			ctx->match_entity.offset + ctx->match_entity.length,
2924			&cptr, &len)) == PARSERUTILS_OK) {
2925		uint8_t c = *cptr;
2926
2927		if (ctx->match_entity.base == 10 &&
2928				('0' <= c && c <= '9')) {
2929			ctx->match_entity.had_data = true;
2930			ctx->match_entity.codepoint =
2931				ctx->match_entity.codepoint * 10 + (c - '0');
2932
2933			ctx->match_entity.length += len;
2934		} else if (ctx->match_entity.base == 16 &&
2935				(('0' <= c && c <= '9') ||
2936				('A' <= (c & ~0x20) &&
2937						(c & ~0x20) <= 'F'))) {
2938			ctx->match_entity.had_data = true;
2939			ctx->match_entity.codepoint *= 16;
2940
2941			if ('0' <= c && c <= '9') {
2942				ctx->match_entity.codepoint += (c - '0');
2943			} else {
2944				ctx->match_entity.codepoint +=
2945						((c & ~0x20) - 'A' + 10);
2946			}
2947
2948			ctx->match_entity.length += len;
2949		} else {
2950			break;
2951		}
2952
2953		if (ctx->match_entity.codepoint >= 0x10FFFF) {
2954			ctx->match_entity.overflow = true;
2955		}
2956	}
2957
2958	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2959		return hubbub_error_from_parserutils_error(error);
2960	}
2961
2962	/* Eat trailing semicolon, if any */
2963	if (error != PARSERUTILS_EOF && *cptr == ';') {
2964		ctx->match_entity.length += len;
2965	}
2966
2967	/* Had data, so calculate final codepoint */
2968	if (ctx->match_entity.had_data) {
2969		uint32_t cp = ctx->match_entity.codepoint;
2970
2971		if (0x80 <= cp && cp <= 0x9F) {
2972			cp = cp1252Table[cp - 0x80];
2973		} else if (cp == 0x0D) {
2974			cp = 0x000A;
2975		} else if (ctx->match_entity.overflow || 
2976				cp <= 0x0008 || cp == 0x000B ||
2977				(0x000E <= cp && cp <= 0x001F) ||
2978				(0x007F <= cp && cp <= 0x009F) ||
2979				(0xD800 <= cp && cp <= 0xDFFF) ||
2980				(0xFDD0 <= cp && cp <= 0xFDEF) ||
2981				(cp & 0xFFFE) == 0xFFFE) {
			/* the spec's cp > 0x10FFFF check is folded into the
			 * >= 0x10FFFF overflow test in the loop above to
			 * avoid arithmetic overflow; 0x10FFFF itself is a
			 * noncharacter, so it maps to U+FFFD either way */
2984			cp = 0xFFFD;
2985		}
2986
2987		ctx->match_entity.codepoint = cp;
2988	}
2989
2990	/* Flag completion */
2991	ctx->match_entity.complete = true;
2992
2993	/* And back to the state we were entered in */
2994	tokeniser->state = ctx->match_entity.return_state;
2995
2996	return HUBBUB_OK;
2997}
2998
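/**
 * Handle a named character reference, stepping the entity search one
 * character at a time and remembering the longest match found. Per the
 * spec, a match inside an attribute value that is not terminated by ';'
 * is discarded when the next character is alphanumeric (so "&notit" in
 * an attribute value does not decode its "&not" prefix).
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
 */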
2999hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
3000{
3001	hubbub_tokeniser_context *ctx = &tokeniser->context;
3002
3003	size_t len;
3004	const uint8_t *cptr;
3005	parserutils_error error;
3006
3007	while ((error = parserutils_inputstream_peek(tokeniser->input,
3008			ctx->match_entity.offset +
3009					ctx->match_entity.poss_length,
3010			&cptr, &len)) == PARSERUTILS_OK) {
		uint32_t cp;

		uint8_t c = *cptr;
		hubbub_error herror;	/* distinct from the stream error above */

		if (c > 0x7F) {
			/* Entity names are ASCII only */
			break;
		}

		herror = hubbub_entities_search_step(c, &cp,
				&ctx->match_entity.context);
		if (herror == HUBBUB_OK) {
			/* Had a match - store it for later */
			ctx->match_entity.codepoint = cp;

			ctx->match_entity.length =
					ctx->match_entity.poss_length + len;
			ctx->match_entity.poss_length =
					ctx->match_entity.length;
		} else if (herror == HUBBUB_INVALID) {
			/* No further matches - use last found */
			break;
		} else {
			/* Need more data */
			ctx->match_entity.poss_length += len;
		}
3038	}
3039
3040	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
3041		return hubbub_error_from_parserutils_error(error);
3042	}
3043
3044	if (ctx->match_entity.length > 0) {
3045		uint8_t c;
3046		error = parserutils_inputstream_peek(tokeniser->input,
3047				ctx->match_entity.offset + 
3048					ctx->match_entity.length - 1,
3049				&cptr, &len);
		/* We're re-reading a character we've already read past.
		 * Therefore, there's no way that an error may occur as
		 * a result. */
3053		assert(error == PARSERUTILS_OK);
3054
3055		c = *cptr;
3056
3057		if ((tokeniser->context.match_entity.return_state ==
3058				STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) &&
3059				c != ';') {
3060			error = parserutils_inputstream_peek(tokeniser->input,
3061					ctx->match_entity.offset +
3062						ctx->match_entity.length,
3063					&cptr, &len);
3064			/* We must have attempted to read one more character 
3065			 * than was present in the entity name, as that is the 
3066			 * only way to break out of the loop above. If that 
3067			 * failed, then any non-EOF case will have been handled
3068			 * by the if statement after the loop thus it cannot 
3069			 * occur here. */
3070			assert(error == PARSERUTILS_OK || 
3071					error == PARSERUTILS_EOF);
3072
			if (error == PARSERUTILS_EOF) {
				ctx->match_entity.codepoint = 0;
			} else {
				/* Only dereference cptr if the peek
				 * actually returned a character */
				c = *cptr;
				if ((0x0030 <= c && c <= 0x0039) ||
						(0x0041 <= c && c <= 0x005A) ||
						(0x0061 <= c && c <= 0x007A)) {
					ctx->match_entity.codepoint = 0;
				}
			}
3083		}
3084	}
3085
3086	/* Flag completion */
3087	ctx->match_entity.complete = true;
3088
3089	/* And back to the state from whence we came */
3090	tokeniser->state = ctx->match_entity.return_state;
3091
3092	return HUBBUB_OK;
3093}
3094
3095
3096
3097/*** Token emitting bits ***/
3098
3099/**
3100 * Emit a character token.
3101 *
3102 * \param tokeniser	Tokeniser instance
3103 * \param chars		Pointer to hubbub_string to emit
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3105 */
3106hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
3107		const hubbub_string *chars)
3108{
3109	hubbub_token token;
3110
3111	token.type = HUBBUB_TOKEN_CHARACTER;
3112	token.data.character = *chars;
3113
3114	return hubbub_tokeniser_emit_token(tokeniser, &token);
3115}
3116
3117/**
3118 * Emit the current pending characters being stored in the tokeniser context.
3119 *
3120 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3122 */
3123hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser)
3124{
3125	hubbub_token token;
3126	size_t len;
3127	const uint8_t *cptr = NULL;
3128	parserutils_error error;
3129
3130	/* Calling this with nothing to output is a probable bug */
3131	assert(tokeniser->context.pending > 0);
3132
3133	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);
3134
3135	assert(error == PARSERUTILS_OK);
3136
3137	token.type = HUBBUB_TOKEN_CHARACTER;
3138	token.data.character.ptr = cptr;
3139	token.data.character.len = tokeniser->context.pending;
3140
3141	return hubbub_tokeniser_emit_token(tokeniser, &token);
3142}
3143
3144/**
3145 * Emit the current tag token being stored in the tokeniser context.
3146 *
3147 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3149 */
3150hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser)
3151{
3152	hubbub_error err;
3153	hubbub_token token;
3154	uint32_t n_attributes;
3155	hubbub_attribute *attrs;
3156	uint8_t *ptr;
3157	uint32_t i, j;
3158
3159	/* Emit current tag */
3160	token.type = tokeniser->context.current_tag_type;
3161	token.data.tag = tokeniser->context.current_tag;
	token.data.tag.ns = HUBBUB_NS_HTML;

3165	n_attributes = token.data.tag.n_attributes;
3166	attrs = token.data.tag.attributes;
3167
3168	/* Set pointers correctly... */
3169	ptr = tokeniser->buffer->data;
3170	token.data.tag.name.ptr = tokeniser->buffer->data;
3171	ptr += token.data.tag.name.len;
3172
3173	for (i = 0; i < n_attributes; i++) {
3174		attrs[i].name.ptr = ptr;
3175		ptr += attrs[i].name.len;
3176		attrs[i].value.ptr = ptr;
3177		ptr += attrs[i].value.len;
3178	}
3179
3180
3181	/* Discard duplicate attributes */
3182	for (i = 0; i < n_attributes; i++) {
3183		for (j = 0; j < n_attributes; j++) {
3184			uint32_t move;
3185
3186			if (j == i ||
3187				attrs[i].name.len !=
3188						attrs[j].name.len ||
3189				strncmp((char *) attrs[i].name.ptr,
3190					(char *) attrs[j].name.ptr,
3191					attrs[i].name.len) != 0) {
3192				/* Attributes don't match */
3193				continue;
3194			}
3195
3196			assert(i < j);
3197
3198			/* Calculate amount to move */
3199			move = (n_attributes - 1 - j) *
3200					sizeof(hubbub_attribute);
3201
3202			if (move > 0) {
				memmove(&attrs[j], &attrs[j + 1], move);
3204			}
3205
3206			/* We've deleted an item, so we need to 
3207			 * reprocess this index */
3208			j--;
3209
3210			/* And reduce the number of attributes */
3211			n_attributes--;
3212		}
3213	}
3214
3215	token.data.tag.n_attributes = n_attributes;
3216
3217	err = hubbub_tokeniser_emit_token(tokeniser, &token);
3218
3219	if (token.type == HUBBUB_TOKEN_START_TAG) {
3220		/* Save start tag name for R?CDATA */
3221		if (token.data.tag.name.len <
3222			sizeof(tokeniser->context.last_start_tag_name)) {
3223			strncpy((char *) tokeniser->context.last_start_tag_name,
3224				(const char *) token.data.tag.name.ptr,
3225				token.data.tag.name.len);
3226			tokeniser->context.last_start_tag_len =
3227					token.data.tag.name.len;
3228		} else {
3229			tokeniser->context.last_start_tag_name[0] = '\0';
3230			tokeniser->context.last_start_tag_len = 0;
3231		}
3232	} else /* if (token->type == HUBBUB_TOKEN_END_TAG) */ {
3233		/* Reset content model after R?CDATA elements */
3234		tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
3235	}
3236
3237	/* Reset the self-closing flag */
3238	tokeniser->context.current_tag.self_closing = false;
3239
3240	return err;
3241}
3242
3243/**
3244 * Emit the current comment token being stored in the tokeniser context.
3245 *
3246 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3248 */
3249hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser)
3250{
3251	hubbub_token token;
3252
3253	token.type = HUBBUB_TOKEN_COMMENT;
3254	token.data.comment.ptr = tokeniser->buffer->data;
3255	token.data.comment.len = tokeniser->buffer->length;
3256
3257	return hubbub_tokeniser_emit_token(tokeniser, &token);
3258}
3259
3260/**
3261 * Emit the current doctype token being stored in the tokeniser context.
3262 *
3263 * \param tokeniser	Tokeniser instance
3264 * \param force_quirks	Force quirks mode on this document
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3266 */
3267hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
3268		bool force_quirks)
3269{
3270	hubbub_token token;
3271
3272	/* Emit doctype */
3273	token.type = HUBBUB_TOKEN_DOCTYPE;
3274	token.data.doctype = tokeniser->context.current_doctype;
3275	if (force_quirks == true)
3276		token.data.doctype.force_quirks = true;
3277
3278	/* Set pointers correctly */
3279	token.data.doctype.name.ptr = tokeniser->buffer->data;
3280
3281	if (token.data.doctype.public_missing == false) {
3282		token.data.doctype.public_id.ptr = tokeniser->buffer->data + 
3283				token.data.doctype.name.len;
3284	}
3285
3286	if (token.data.doctype.system_missing == false) {
3287		token.data.doctype.system_id.ptr = tokeniser->buffer->data +
3288				token.data.doctype.name.len +
3289				token.data.doctype.public_id.len;
3290	}
3291
3292	return hubbub_tokeniser_emit_token(tokeniser, &token);
3293}
3294
3295/**
3296 * Emit a token, performing sanity checks if necessary
3297 *
3298 * \param tokeniser  Tokeniser instance
 * \param token      Token to emit
 * \return     HUBBUB_OK on success, appropriate hubbub_error otherwise
 */
3301hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
3302		hubbub_token *token)
3303{
3304	hubbub_error err = HUBBUB_OK;
3305
3306	assert(tokeniser != NULL);
3307	assert(token != NULL);
3308
3309#ifndef NDEBUG
3310	/* Sanity checks */
3311	switch (token->type) {
3312	case HUBBUB_TOKEN_DOCTYPE:
3313		assert(memchr(token->data.doctype.name.ptr, 0xff, 
3314				token->data.doctype.name.len) == NULL);
3315		if (token->data.doctype.public_missing == false)
3316			assert(memchr(token->data.doctype.public_id.ptr, 0xff,
3317				token->data.doctype.public_id.len) == NULL);
3318		if (token->data.doctype.system_missing == false)
3319			assert(memchr(token->data.doctype.system_id.ptr, 0xff,
3320				token->data.doctype.system_id.len) == NULL);
3321		break;
3322	case HUBBUB_TOKEN_START_TAG:
3323	case HUBBUB_TOKEN_END_TAG:
3324	{
3325		uint32_t i;
3326		assert(memchr(token->data.tag.name.ptr, 0xff, 
3327				token->data.tag.name.len) == NULL);
3328		for (i = 0; i < token->data.tag.n_attributes; i++) {
3329			hubbub_attribute *attr = &token->data.tag.attributes[i];
3330
3331			assert(memchr(attr->name.ptr, 0xff, attr->name.len) == 
3332					NULL);
3333			assert(memchr(attr->value.ptr, 0xff, attr->value.len) ==
3334					NULL);
3335		}
3336	}
3337		break;
3338	case HUBBUB_TOKEN_COMMENT:
3339		assert(memchr(token->data.comment.ptr, 0xff, 
3340				token->data.comment.len) == NULL);
3341		break;
3342	case HUBBUB_TOKEN_CHARACTER:
3343		assert(memchr(token->data.character.ptr, 0xff,
3344				token->data.character.len) == NULL);
3345		break;
3346	case HUBBUB_TOKEN_EOF:
3347		break;
3348	}
3349#endif
3350
3351	/* Emit the token */
3352	if (tokeniser->token_handler) {
3353		err = tokeniser->token_handler(token, tokeniser->token_pw);
3354	}
3355
3356	/* Discard current buffer */
3357	if (tokeniser->buffer->length) {
3358		parserutils_buffer_discard(tokeniser->buffer, 0,
3359				tokeniser->buffer->length);
3360	}
3361
3362	/* Advance the pointer */
3363	if (tokeniser->context.pending) {
3364		parserutils_inputstream_advance(tokeniser->input,
3365				tokeniser->context.pending);
3366		tokeniser->context.pending = 0;
3367	}
3368
3369	return err;
3370}
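
/*
 * For illustration only -- a minimal sketch of a client token handler,
 * showing how the tokens emitted above arrive at the other end of
 * tokeniser->token_handler. The handler name and the printing logic are
 * hypothetical, not part of this file; the exact callback prototype is
 * the hubbub_token_handler typedef in tokeniser.h:
 *
 *	static hubbub_error print_token(const hubbub_token *token, void *pw)
 *	{
 *		(void) pw;
 *
 *		switch (token->type) {
 *		case HUBBUB_TOKEN_CHARACTER:
 *			printf("CHARACTER '%.*s'\n",
 *					(int) token->data.character.len,
 *					(const char *) token->data.character.ptr);
 *			break;
 *		case HUBBUB_TOKEN_EOF:
 *			printf("EOF\n");
 *			break;
 *		default:
 *			break;
 *		}
 *
 *		return HUBBUB_OK;
 *	}
 *
 * A handler like this would be registered with the tokeniser via the
 * HUBBUB_TOKENISER_TOKEN_HANDLER option, after which each token emitted
 * by hubbub_tokeniser_emit_token() triggers one callback invocation.
 */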