
/peek-build/src/netdepends/hubbub-0.0.2/src/tokeniser/tokeniser.c

https://bitbucket.org/C0deMaver1ck/peeklinux
/*
 * This file is part of Hubbub.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
 */
#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include <stdio.h>

#include <parserutils/charset/utf8.h>

#include "utils/parserutilserror.h"
#include "utils/utils.h"

#include "tokeniser/entities.h"
#include "tokeniser/tokeniser.h"

/**
 * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
 */
static const uint32_t cp1252Table[32] = {
	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
};
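/*
 * For example, byte 0x80 (table index 0) maps to U+20AC EURO SIGN, while
 * bytes with no Windows-1252 mapping (e.g. 0x81, table index 1) map to
 * U+FFFD REPLACEMENT CHARACTER.
 */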

/**
 * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
 */
static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };


/**
 * String for when we want to emit newlines
 */
static const uint8_t lf = '\n';
static const hubbub_string lf_str = { &lf, 1 };


/**
 * Tokeniser states
 */
typedef enum hubbub_tokeniser_state {
	STATE_DATA,
	STATE_CHARACTER_REFERENCE_DATA,
	STATE_TAG_OPEN,
	STATE_CLOSE_TAG_OPEN,
	STATE_TAG_NAME,
	STATE_BEFORE_ATTRIBUTE_NAME,
	STATE_ATTRIBUTE_NAME,
	STATE_AFTER_ATTRIBUTE_NAME,
	STATE_BEFORE_ATTRIBUTE_VALUE,
	STATE_ATTRIBUTE_VALUE_DQ,
	STATE_ATTRIBUTE_VALUE_SQ,
	STATE_ATTRIBUTE_VALUE_UQ,
	STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
	STATE_AFTER_ATTRIBUTE_VALUE_Q,
	STATE_SELF_CLOSING_START_TAG,
	STATE_BOGUS_COMMENT,
	STATE_MARKUP_DECLARATION_OPEN,
	STATE_MATCH_COMMENT,
	STATE_COMMENT_START,
	STATE_COMMENT_START_DASH,
	STATE_COMMENT,
	STATE_COMMENT_END_DASH,
	STATE_COMMENT_END,
	STATE_MATCH_DOCTYPE,
	STATE_DOCTYPE,
	STATE_BEFORE_DOCTYPE_NAME,
	STATE_DOCTYPE_NAME,
	STATE_AFTER_DOCTYPE_NAME,
	STATE_MATCH_PUBLIC,
	STATE_BEFORE_DOCTYPE_PUBLIC,
	STATE_DOCTYPE_PUBLIC_DQ,
	STATE_DOCTYPE_PUBLIC_SQ,
	STATE_AFTER_DOCTYPE_PUBLIC,
	STATE_MATCH_SYSTEM,
	STATE_BEFORE_DOCTYPE_SYSTEM,
	STATE_DOCTYPE_SYSTEM_DQ,
	STATE_DOCTYPE_SYSTEM_SQ,
	STATE_AFTER_DOCTYPE_SYSTEM,
	STATE_BOGUS_DOCTYPE,
	STATE_MATCH_CDATA,
	STATE_CDATA_BLOCK,
	STATE_NUMBERED_ENTITY,
	STATE_NAMED_ENTITY
} hubbub_tokeniser_state;

/**
 * Context for tokeniser
 */
typedef struct hubbub_tokeniser_context {
	size_t pending;				/**< Count of pending chars */

	hubbub_string current_comment;		/**< Current comment text */

	hubbub_token_type current_tag_type;	/**< Type of current_tag */
	hubbub_tag current_tag;			/**< Current tag */
	hubbub_doctype current_doctype;		/**< Current doctype */
	hubbub_tokeniser_state prev_state;	/**< Previous state */

	uint8_t last_start_tag_name[10];	/**< Name of the last start tag
						 * emitted */
	size_t last_start_tag_len;		/**< Length of last start tag */

	struct {
		uint32_t count;
		bool match;
	} close_tag_match;			/**< State for matching close
						 * tags */

	struct {
		uint32_t count;			/**< Index into "DOCTYPE" */
	} match_doctype;			/**< State for matching doctype */

	struct {
		uint32_t count;			/**< Index into "[CDATA[" */
		uint32_t end;			/**< Index into "]]>" */
	} match_cdata;				/**< State for matching cdata */

	struct {
		size_t offset;			/**< Offset in buffer */
		uint32_t length;		/**< Length of entity */
		uint32_t codepoint;		/**< UCS4 codepoint */
		bool complete;			/**< True if match complete */

		uint32_t poss_length;		/**< Optimistic length
						 * when matching named
						 * character references */
		uint8_t base;			/**< Base for numeric
						 * entities */
		void *context;			/**< Context for named
						 * entity search */
		size_t prev_len;		/**< Previous byte length
						 * of str */
		bool had_data;			/**< Whether we read
						 * anything after &#(x)? */
		bool overflow;			/**< Whether this entity
						 * has overflowed the maximum
						 * numeric entity value */
		hubbub_tokeniser_state return_state;	/**< State we were
							 * called from */
	} match_entity;				/**< Entity matching state */

	struct {
		uint32_t line;			/**< Current line of input */
		uint32_t col;			/**< Current character in
						 * line */
	} position;				/**< Position in source data */

	uint32_t allowed_char;			/**< Used for quote matching */
} hubbub_tokeniser_context;

/**
 * Tokeniser data structure
 */
struct hubbub_tokeniser {
	hubbub_tokeniser_state state;	/**< Current tokeniser state */
	hubbub_content_model content_model;	/**< Current content
						 * model flag */
	bool escape_flag;		/**< Escape flag */
	bool process_cdata_section;	/**< Whether to process CDATA sections */

	parserutils_inputstream *input;	/**< Input stream */
	parserutils_buffer *buffer;	/**< Input buffer */

	hubbub_tokeniser_context context;	/**< Tokeniser context */

	hubbub_token_handler token_handler;	/**< Token handling callback */
	void *token_pw;				/**< Token handler data */

	hubbub_error_handler error_handler;	/**< Error handling callback */
	void *error_pw;				/**< Error handler data */

	hubbub_allocator_fn alloc;	/**< Memory (de)allocation function */
	void *alloc_pw;			/**< Client private data */
};

static hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_close_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_cdata(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_cdata_block(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t off);
static hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_named_entity(
		hubbub_tokeniser *tokeniser);

static inline hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
		const hubbub_string *chars);
static inline hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
		bool force_quirks);
static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token);

/**
 * Create a hubbub tokeniser
 *
 * \param input      Input stream instance
 * \param alloc      Memory (de)allocation function
 * \param pw         Pointer to client-specific private data (may be NULL)
 * \param tokeniser  Pointer to location to receive tokeniser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion
 */
hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input,
		hubbub_allocator_fn alloc, void *pw,
		hubbub_tokeniser **tokeniser)
{
	parserutils_error perror;
	hubbub_tokeniser *tok;

	if (input == NULL || alloc == NULL || tokeniser == NULL)
		return HUBBUB_BADPARM;

	tok = alloc(NULL, sizeof(hubbub_tokeniser), pw);
	if (tok == NULL)
		return HUBBUB_NOMEM;

	perror = parserutils_buffer_create(alloc, pw, &tok->buffer);
	if (perror != PARSERUTILS_OK) {
		alloc(tok, 0, pw);
		return hubbub_error_from_parserutils_error(perror);
	}

	tok->state = STATE_DATA;
	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;

	tok->escape_flag = false;
	tok->process_cdata_section = false;

	tok->input = input;

	tok->token_handler = NULL;
	tok->token_pw = NULL;

	tok->error_handler = NULL;
	tok->error_pw = NULL;

	tok->alloc = alloc;
	tok->alloc_pw = pw;

	memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));

	*tokeniser = tok;

	return HUBBUB_OK;
}

/**
 * Destroy a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to destroy
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
{
	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->context.current_tag.attributes != NULL) {
		tokeniser->alloc(tokeniser->context.current_tag.attributes,
				0, tokeniser->alloc_pw);
	}

	parserutils_buffer_destroy(tokeniser->buffer);

	tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw);

	return HUBBUB_OK;
}

/**
 * Configure a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to configure
 * \param type       The option type to set
 * \param params     Option-specific parameters
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
		hubbub_tokeniser_opttype type,
		hubbub_tokeniser_optparams *params)
{
	if (tokeniser == NULL || params == NULL)
		return HUBBUB_BADPARM;

	switch (type) {
	case HUBBUB_TOKENISER_TOKEN_HANDLER:
		tokeniser->token_handler = params->token_handler.handler;
		tokeniser->token_pw = params->token_handler.pw;
		break;
	case HUBBUB_TOKENISER_ERROR_HANDLER:
		tokeniser->error_handler = params->error_handler.handler;
		tokeniser->error_pw = params->error_handler.pw;
		break;
	case HUBBUB_TOKENISER_CONTENT_MODEL:
		tokeniser->content_model = params->content_model.model;
		break;
	case HUBBUB_TOKENISER_PROCESS_CDATA:
		tokeniser->process_cdata_section = params->process_cdata;
		break;
	}

	return HUBBUB_OK;
}

/**
 * Process remaining data in the input stream
 *
 * \param tokeniser  The tokeniser instance to invoke
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
{
	hubbub_error cont = HUBBUB_OK;

	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

#if 0
#define state(x) \
		case x: \
			printf( #x "\n");
#else
#define state(x) \
		case x:
#endif

	while (cont == HUBBUB_OK) {
		switch (tokeniser->state) {
		state(STATE_DATA)
			cont = hubbub_tokeniser_handle_data(tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_DATA)
			cont = hubbub_tokeniser_handle_character_reference_data(
					tokeniser);
			break;
		state(STATE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_tag_open(tokeniser);
			break;
		state(STATE_CLOSE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_close_tag_open(
					tokeniser);
			break;
		state(STATE_TAG_NAME)
			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_before_attribute_name(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_attribute_name(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_after_attribute_name(
					tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_before_attribute_value(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_DQ)
			cont = hubbub_tokeniser_handle_attribute_value_dq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_SQ)
			cont = hubbub_tokeniser_handle_attribute_value_sq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_UQ)
			cont = hubbub_tokeniser_handle_attribute_value_uq(
					tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_character_reference_in_attribute_value(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_VALUE_Q)
			cont = hubbub_tokeniser_handle_after_attribute_value_q(
					tokeniser);
			break;
		state(STATE_SELF_CLOSING_START_TAG)
			cont = hubbub_tokeniser_handle_self_closing_start_tag(
					tokeniser);
			break;
		state(STATE_BOGUS_COMMENT)
			cont = hubbub_tokeniser_handle_bogus_comment(
					tokeniser);
			break;
		state(STATE_MARKUP_DECLARATION_OPEN)
			cont = hubbub_tokeniser_handle_markup_declaration_open(
					tokeniser);
			break;
		state(STATE_MATCH_COMMENT)
			cont = hubbub_tokeniser_handle_match_comment(
					tokeniser);
			break;
		case STATE_COMMENT_START:
		case STATE_COMMENT_START_DASH:
		case STATE_COMMENT:
		case STATE_COMMENT_END_DASH:
		case STATE_COMMENT_END:
			cont = hubbub_tokeniser_handle_comment(tokeniser);
			break;
		state(STATE_MATCH_DOCTYPE)
			cont = hubbub_tokeniser_handle_match_doctype(
					tokeniser);
			break;
		state(STATE_DOCTYPE)
			cont = hubbub_tokeniser_handle_doctype(tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_before_doctype_name(
					tokeniser);
			break;
		state(STATE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_doctype_name(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_after_doctype_name(
					tokeniser);
			break;

		state(STATE_MATCH_PUBLIC)
			cont = hubbub_tokeniser_handle_match_public(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_before_doctype_public(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_DQ)
			cont = hubbub_tokeniser_handle_doctype_public_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_SQ)
			cont = hubbub_tokeniser_handle_doctype_public_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_after_doctype_public(
					tokeniser);
			break;
		state(STATE_MATCH_SYSTEM)
			cont = hubbub_tokeniser_handle_match_system(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_before_doctype_system(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_DQ)
			cont = hubbub_tokeniser_handle_doctype_system_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_SQ)
			cont = hubbub_tokeniser_handle_doctype_system_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_after_doctype_system(
					tokeniser);
			break;
		state(STATE_BOGUS_DOCTYPE)
			cont = hubbub_tokeniser_handle_bogus_doctype(
					tokeniser);
			break;
		state(STATE_MATCH_CDATA)
			cont = hubbub_tokeniser_handle_match_cdata(
					tokeniser);
			break;
		state(STATE_CDATA_BLOCK)
			cont = hubbub_tokeniser_handle_cdata_block(
					tokeniser);
			break;
		state(STATE_NUMBERED_ENTITY)
			cont = hubbub_tokeniser_handle_numbered_entity(
					tokeniser);
			break;
		state(STATE_NAMED_ENTITY)
			cont = hubbub_tokeniser_handle_named_entity(
					tokeniser);
			break;
		}
	}

	return (cont == HUBBUB_NEEDDATA) ? HUBBUB_OK : cont;
}
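
/*
 * Example use of the public API above (an editor's sketch, not part of
 * the original source; the token handler signature and the input stream
 * setup are assumptions based on the declarations in this file and may
 * differ in detail from hubbub's public headers):
 *
 *	static hubbub_error token_cb(const hubbub_token *token, void *pw)
 *	{
 *		return HUBBUB_OK;
 *	}
 *
 *	hubbub_tokeniser *tok;
 *	hubbub_tokeniser_optparams params;
 *
 *	hubbub_tokeniser_create(input, alloc, pw, &tok);
 *
 *	params.token_handler.handler = token_cb;
 *	params.token_handler.pw = NULL;
 *	hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER, &params);
 *
 *	hubbub_tokeniser_run(tok);	(after appending data to the stream)
 *
 *	hubbub_tokeniser_destroy(tok);
 */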


/**
 * Various macros for manipulating buffers.
 *
 * START_BUF initialises a hubbub_string: it appends (length) bytes at
 * (cptr) to the tokeniser's buffer and sets the string's length.
 * COLLECT appends to a string that has already been started (it asserts
 * a non-zero length); COLLECT_MS does the same but permits an empty
 * string. All three return from the enclosing function on failure.
 *
 * \todo make some of these inline functions (type-safety)
 */

#define START_BUF(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len = (length); \
	} while (0)

#define COLLECT(str, cptr, length) \
	do { \
		parserutils_error perror; \
		assert(str.len != 0); \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

#define COLLECT_MS(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)


/* this should always be called with an empty "chars" buffer */
hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
{
	parserutils_error error;
	hubbub_token token;
	const uint8_t *cptr;
	size_t len;

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len)) ==
					PARSERUTILS_OK) {
		const uint8_t c = *cptr;

		if (c == '&' &&
				(tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA ||
				tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA) &&
				tokeniser->escape_flag == false) {
			tokeniser->state =
					STATE_CHARACTER_REFERENCE_DATA;
			/* Don't eat the '&'; it'll be handled by entity
			 * consumption */
			break;
		} else if (c == '-' &&
				tokeniser->escape_flag == false &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->context.pending >= 3) {
			size_t ignore;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 3,
					&cptr,
					&ignore);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr,
					"<!--", SLEN("<!--")) == 0) {
				tokeniser->escape_flag = true;
			}

			tokeniser->context.pending += len;
		} else if (c == '<' && (tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_PCDATA ||
					((tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
					tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->escape_flag == false))) {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Buffer '<' */
			tokeniser->context.pending = len;
			tokeniser->state = STATE_TAG_OPEN;
			break;
		} else if (c == '>' && tokeniser->escape_flag == true &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA)) {
			/* no need to check that there are enough characters,
			 * since you can only run into this if the flag is
			 * true in the first place, which requires four
			 * characters. */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 2,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr, "-->", SLEN("-->")) == 0) {
				tokeniser->escape_flag = false;
			}

			tokeniser->context.pending += len;
		} else if (c == '\0') {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Emit a replacement character */
			emit_character_token(tokeniser, &u_fffd_str);

			/* Advance past NUL */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else if (c == '\r') {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				break;
			}

			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			if (error == PARSERUTILS_EOF || *cptr != '\n') {
				/* Emit newline */
				emit_character_token(tokeniser, &lf_str);
			}

			/* Advance over */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else {
			/* Just collect into buffer */
			tokeniser->context.pending += len;
		}
	}

	if (tokeniser->state != STATE_TAG_OPEN &&
		(tokeniser->state != STATE_DATA || error == PARSERUTILS_EOF) &&
			tokeniser->context.pending > 0) {
		/* Emit any pending characters */
		emit_current_chars(tokeniser);
	}

	if (error == PARSERUTILS_EOF) {
		token.type = HUBBUB_TOKEN_EOF;
		hubbub_tokeniser_emit_token(tokeniser, &token);
	}

	if (error == PARSERUTILS_EOF) {
		return HUBBUB_NEEDDATA;
	} else {
		return hubbub_error_from_parserutils_error(error);
	}
}

/* emit any pending tokens before calling */
hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser)
{
	assert(tokeniser->context.pending == 0);

	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_token token;

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		token.type = HUBBUB_TOKEN_CHARACTER;

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
				tokeniser->context.match_entity.codepoint,
				&utf8ptr, &len);

			token.data.character.ptr = utf8;
			token.data.character.len = sizeof(utf8) - len;

			hubbub_tokeniser_emit_token(tokeniser, &token);

			/* +1 for ampersand */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.match_entity.length
							+ 1);
		} else {
			parserutils_error error;
			const uint8_t *cptr = NULL;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			token.data.character.ptr = cptr;
			token.data.character.len = len;

			hubbub_tokeniser_emit_token(tokeniser, &token);
			parserutils_inputstream_advance(tokeniser->input, len);
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		tokeniser->state = STATE_DATA;
	}

	return HUBBUB_OK;
}

/* this state always switches to another state straight away */
/* this state expects the current character to be '<' */
hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 1);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '/') {
		tokeniser->context.pending += len;

		tokeniser->context.close_tag_match.match = false;
		tokeniser->context.close_tag_match.count = 0;

		tokeniser->state = STATE_CLOSE_TAG_OPEN;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		/* Return to data state with '<' still in "chars" */
		tokeniser->state = STATE_DATA;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
		if (c == '!') {
			parserutils_inputstream_advance(tokeniser->input,
					SLEN("<!"));

			tokeniser->context.pending = 0;
			tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
		} else if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);

			START_BUF(ctag->name, &lc, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(ctag->name, cptr, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/** \todo parse error */

			tokeniser->context.pending += len;
			tokeniser->state = STATE_DATA;
		} else if (c == '?') {
			/** \todo parse error */

			/* Cursor still at "<", need to advance past it */
			parserutils_inputstream_advance(
					tokeniser->input, SLEN("<"));
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		} else {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.chars to be "</" */
/* the tokeniser never stays in this state for more than one character */
hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 2);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
/*	assert(tokeniser->context.chars.ptr[1] == '/'); */

	/** \todo fragment case */

	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		uint8_t *start_tag_name =
			tokeniser->context.last_start_tag_name;
		size_t start_tag_len =
			tokeniser->context.last_start_tag_len;

		while ((error = parserutils_inputstream_peek(tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len)) == PARSERUTILS_OK) {
			c = *cptr;

			if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
					!= (c & ~0x20)) {
				break;
			}

			ctx->close_tag_match.count += len;

			if (ctx->close_tag_match.count == start_tag_len) {
				ctx->close_tag_match.match = true;
				break;
			}
		}

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (ctx->close_tag_match.match == true) {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error != PARSERUTILS_EOF) {
				c = *cptr;

				if (c != '\t' && c != '\n' && c != '\f' &&
						c != ' ' && c != '>' &&
						c != '/') {
					ctx->close_tag_match.match = false;
				}
			}
		}
	}

	if (ctx->close_tag_match.match == false &&
			tokeniser->content_model !=
					HUBBUB_CONTENT_MODEL_PCDATA) {
		/* We should emit "</" here, but instead we leave it in the
		 * buffer so the data state emits it with any characters
		 * following it */
		tokeniser->state = STATE_DATA;
	} else {
		error = parserutils_inputstream_peek(tokeniser->input,
				tokeniser->context.pending, &cptr, &len);

		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */

			/* Return to data state with "</" pending */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else if (error != PARSERUTILS_OK) {
			return hubbub_error_from_parserutils_error(error);
		}

		c = *cptr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(tokeniser->context.current_tag.name,
					&lc, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(tokeniser->context.current_tag.name,
					cptr, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/* Cursor still at "</", need to collect ">" */
			tokeniser->context.pending += len;

			/* Now need to advance past "</>" */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			/** \todo parse error */
			tokeniser->state = STATE_DATA;
		} else {
			/** \todo parse error */

			/* Cursor still at "</", need to advance past it */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.current_tag to already have its
   first character set */
hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending > 0);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
	assert(ctag->name.len > 0);
/*	assert(ctag->name.ptr); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
		tokeniser->context.pending += len;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0);

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				&lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'') {
			/** \todo parse error */
		}

		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

/* this state is only ever triggered by an '=' */
hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ;
	} else if (c == '&') {
		/* Don't consume the '&' -- reprocess in UQ state */
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ;
	} else if (c == '>') {
		/** \todo parse error */
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else {
		if (c == '=') {
			/** \todo parse error */
		}

		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '"';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '\'';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume \r */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;
	uint8_t c;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(c == '&' ||
		ctag->attributes[ctag->n_attributes - 1].value.len >= 1);

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else {
		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_tag *ctag = &tokeniser->context.current_tag;
		hubbub_attribute *attr = &ctag->attributes[
				ctag->n_attributes - 1];

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
				tokeniser->context.match_entity.codepoint,
				&utf8ptr, &len);

			COLLECT_MS(attr->value, utf8, sizeof(utf8) - len);

			/* +1 for the ampersand */
			tokeniser->context.pending +=
					tokeniser->context.match_entity.length
					+ 1;
		} else {
			size_t len = 0;
			const uint8_t *cptr = NULL;
			parserutils_error error;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			/* Insert the ampersand */
			COLLECT_MS(attr->value, cptr, len);
			tokeniser->context.pending += len;
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		/* And back to the previous state */
		tokeniser->state = tokeniser->context.prev_state;
	}

	return HUBBUB_OK;
}

/* always switches state */
hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		/** \todo parse error */
		/* Reprocess character in before attribute name state */
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;

		tokeniser->context.current_tag.self_closing = true;
		return emit_current_tag(tokeniser);
	} else {
		/* Reprocess character in before attribute name state */
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.chars to be empty on first entry */
hubbub_error hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_comment(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_comment(tokeniser);
	} else if (c == '\0') {
		error = parserutils_buffer_append(tokeniser->buffer,
				u_fffd, sizeof(u_fffd));
		if (error != PARSERUTILS_OK)
			return hubbub_error_from_parserutils_error(error);

		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look at the character after the '\r' (the other CR
		 * handlers in this file likewise peek at pending + len) */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			error = parserutils_buffer_append(tokeniser->buffer,
					&lf, sizeof(lf));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		/* Consume '\r' */
		tokeniser->context.pending += 1;
1737	} else {
1738		error = parserutils_buffer_append(tokeniser->buffer,
1739				(uint8_t *) cptr, len);
1740		if (error != PARSERUTILS_OK)
1741			return hubbub_error_from_parserutils_error(error);
1742
1743		tokeniser->context.pending += len;
1744	}
1745
1746	return HUBBUB_OK;
1747}
1748
1749/* this state always switches to another state straight away */
1750hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
1751		hubbub_tokeniser *tokeniser)
1752{
1753	size_t len;
1754	const uint8_t *cptr;
1755	parserutils_error error;
1756	uint8_t c;
1757
1758	assert(tokeniser->context.pending == 0);
1759
1760	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);
1761
1762	if (error != PARSERUTILS_OK) {
1763		if (error == PARSERUTILS_EOF) {
1764			tokeniser->state = STATE_BOGUS_COMMENT;
1765			return HUBBUB_OK;
1766		} else {
1767			return hubbub_error_from_parserutils_error(error);
1768		}
1769	}
1770
1771	c = *cptr;
1772
1773	if (c == '-') {
1774		tokeniser->context.pending = len;
1775		tokeniser->state = STATE_MATCH_COMMENT;
1776	} else if ((c & ~0x20) == 'D') {
1777		tokeniser->context.pending = len;
1778		tokeniser->context.match_doctype.count = len;
1779		tokeniser->state = STATE_MATCH_DOCTYPE;
1780	} else if (tokeniser->process_cdata_section == true && c == '[') {
1781		tokeniser->context.pending = len;
1782		tokeniser->context.match_cdata.count = len;
1783		tokeniser->state = STATE_MATCH_CDATA;
1784	} else {
1785		tokeniser->state = STATE_BOGUS_COMMENT;
1786	}
1787
1788	return HUBBUB_OK;
1789}
1790
1791
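/**
 * Handle the character after a "<!-" sequence: a second '-' completes
 * "<!--" (both dashes are consumed and the comment start state is
 * entered); anything else demotes the construct to a bogus comment.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
 */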
1792hubbub_error hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser)
1793{
1794	size_t len;
1795	const uint8_t *cptr;
1796	parserutils_error error;
1797
1798	error = parserutils_inputstream_peek(tokeniser->input, 
1799			tokeniser->context.pending, &cptr, &len);
1800
1801	if (error != PARSERUTILS_OK) {
1802		if (error == PARSERUTILS_EOF) {
1803			tokeniser->context.pending =
1804				tokeniser->context.current_comment.len = 0;
1805			tokeniser->state = STATE_BOGUS_COMMENT;
1806			return HUBBUB_OK;
1807		} else {
1808			return hubbub_error_from_parserutils_error(error);
1809		}
1810	}
1811
1812	tokeniser->context.pending = tokeniser->context.current_comment.len = 0;
1813
1814	if (*cptr == '-') {
1815		parserutils_inputstream_advance(tokeniser->input, SLEN("--"));
1816		tokeniser->state = STATE_COMMENT_START;
1817	} else {
1818		tokeniser->state = STATE_BOGUS_COMMENT;
1819	}
1820
1821	return HUBBUB_OK;
1822}
1823
1824
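/**
 * Handle input in any of the five comment states (comment start,
 * comment start dash, comment, comment end dash, comment end); the
 * current sub-state is read from tokeniser->state, and dashes are
 * re-appended to the buffer when they turn out not to close the
 * comment.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
 */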
1825hubbub_error hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
1826{
1827	size_t len;
1828	const uint8_t *cptr;
1829	parserutils_error error;
1830	uint8_t c;
1831
1832	error = parserutils_inputstream_peek(tokeniser->input, 
1833			tokeniser->context.pending, &cptr, &len);
1834
1835	if (error != PARSERUTILS_OK) {
1836		if (error == PARSERUTILS_EOF) {
1837			tokeniser->state = STATE_DATA;
1838			return emit_current_comment(tokeniser);
1839		} else {
1840			return hubbub_error_from_parserutils_error(error);
1841		}
1842	}
1843
1844	c = *cptr;
1845
1846	if (c == '>' && (tokeniser->state == STATE_COMMENT_START_DASH ||
1847			tokeniser->state == STATE_COMMENT_START ||
1848			tokeniser->state == STATE_COMMENT_END)) {
1849		tokeniser->context.pending += len;
1850
1851		/** \todo parse error if state != COMMENT_END */
1852		tokeniser->state = STATE_DATA;
1853		return emit_current_comment(tokeniser);
1854	} else if (c == '-') {
1855		if (tokeniser->state == STATE_COMMENT_START) {
1856			tokeniser->state = STATE_COMMENT_START_DASH;
1857		} else if (tokeniser->state == STATE_COMMENT_START_DASH) {
1858			tokeniser->state = STATE_COMMENT_END;
1859		} else if (tokeniser->state == STATE_COMMENT) {
1860			tokeniser->state = STATE_COMMENT_END_DASH;
1861		} else if (tokeniser->state == STATE_COMMENT_END_DASH) {
1862			tokeniser->state = STATE_COMMENT_END;
1863		} else if (tokeniser->state == STATE_COMMENT_END) {
1864			error = parserutils_buffer_append(tokeniser->buffer,
1865					(uint8_t *) "-", SLEN("-"));
1866			if (error != PARSERUTILS_OK) {
1867				return hubbub_error_from_parserutils_error(
1868						error);
1869			}
1870		}
1871
1872		tokeniser->context.pending += len;
1873	} else {
1874		if (tokeniser->state == STATE_COMMENT_START_DASH ||
1875				tokeniser->state == STATE_COMMENT_END_DASH) {
1876			error = parserutils_buffer_append(tokeniser->buffer,
1877					(uint8_t *) "-", SLEN("-"));
1878			if (error != PARSERUTILS_OK) {
1879				return hubbub_error_from_parserutils_error(
1880						error);
1881			}
1882		} else if (tokeniser->state == STATE_COMMENT_END) {
1883			error = parserutils_buffer_append(tokeniser->buffer,
1884					(uint8_t *) "--", SLEN("--"));
1885			if (error != PARSERUTILS_OK) {
1886				return hubbub_error_from_parserutils_error(
1887						error);
1888			}
1889		}
1890
1891		if (c == '\0') {
1892			error = parserutils_buffer_append(tokeniser->buffer,
1893					u_fffd, sizeof(u_fffd));
1894			if (error != PARSERUTILS_OK) {
1895				return hubbub_error_from_parserutils_error(
1896						error);
1897			}
1898		} else if (c == '\r') {
1899			size_t next_len;
1900			error = parserutils_inputstream_peek(
1901					tokeniser->input,
1902					tokeniser->context.pending + len,
1903					&cptr,
1904					&next_len);
1905			if (error != PARSERUTILS_OK && 
1906					error != PARSERUTILS_EOF) {
1907				return hubbub_error_from_parserutils_error(
1908						error);
			} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
1910				error = parserutils_buffer_append(
1911						tokeniser->buffer,
1912						&lf, sizeof(lf));
1913				if (error != PARSERUTILS_OK) {
1914					return hubbub_error_from_parserutils_error(
1915							error);
1916				}
1917			}
1918		} else {
1919			error = parserutils_buffer_append(tokeniser->buffer, 
1920					cptr, len);
1921			if (error != PARSERUTILS_OK) {
1922				return hubbub_error_from_parserutils_error(
1923						error);
1924			}
1925		}
1926
1927		tokeniser->context.pending += len;
1928		tokeniser->state = STATE_COMMENT;
1929	}
1930
1931	return HUBBUB_OK;
1932}
1933
1934
1937#define DOCTYPE		"DOCTYPE"
1938#define DOCTYPE_LEN	(SLEN(DOCTYPE) - 1)
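/* DOCTYPE_LEN is one less than the keyword length because the initial
 * 'D' was already consumed in the markup declaration open state, so
 * matching resumes at index 1 with match_doctype.count == 1. */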
1939
1940hubbub_error hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
1941{
1942	size_t len;
1943	const uint8_t *cptr;
1944	parserutils_error error;
1945	uint8_t c;
1946
1947	error = parserutils_inputstream_peek(tokeniser->input,
1948			tokeniser->context.match_doctype.count, &cptr, &len);
1949
1950	if (error != PARSERUTILS_OK) {
1951		if (error == PARSERUTILS_EOF) {
1952			tokeniser->context.current_comment.len =
1953					tokeniser->context.pending = 0;
1954			tokeniser->state = STATE_BOGUS_COMMENT;
1955			return HUBBUB_OK;
1956		} else {
1957			return hubbub_error_from_parserutils_error(error);
1958		}
1959	}
1960
1961	c = *cptr;
1962
1963	assert(tokeniser->context.match_doctype.count <= DOCTYPE_LEN);
1964
1965	if (DOCTYPE[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
1966		tokeniser->context.current_comment.len =
1967				tokeniser->context.pending = 0;
1968		tokeniser->state = STATE_BOGUS_COMMENT;
1969		return HUBBUB_OK;
1970	}
1971
1972	tokeniser->context.pending += len;
1973
1974	if (tokeniser->context.match_doctype.count == DOCTYPE_LEN) {
1975		/* Skip over the DOCTYPE bit */
1976		parserutils_inputstream_advance(tokeniser->input,
1977				tokeniser->context.pending);
1978
1979		memset(&tokeniser->context.current_doctype, 0,
1980				sizeof tokeniser->context.current_doctype);
1981		tokeniser->context.current_doctype.public_missing = true;
1982		tokeniser->context.current_doctype.system_missing = true;
1983		tokeniser->context.pending = 0;
1984
1985		tokeniser->state = STATE_DOCTYPE;
1986	}
1987
1988	tokeniser->context.match_doctype.count++;
1989
1990	return HUBBUB_OK;
1991}
1992
1993#undef DOCTYPE
1994#undef DOCTYPE_LEN
1995
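/**
 * Handle the character immediately after the DOCTYPE keyword: a single
 * whitespace character is skipped; anything else is left to be
 * reprocessed in the before doctype name state.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
 */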
1996hubbub_error hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
1997{
1998	size_t len;
1999	const uint8_t *cptr;
2000	parserutils_error error;
2001	uint8_t c;
2002
2003	error = parserutils_inputstream_peek(tokeniser->input,
2004			tokeniser->context.pending, &cptr, &len);
2005
2006	if (error != PARSERUTILS_OK) {
2007		if (error == PARSERUTILS_EOF) {
2008			tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
2009			return HUBBUB_OK;
2010		} else {
2011			return hubbub_error_from_parserutils_error(error);
2012		}
2013	}
2014
2015	c = *cptr;
2016
2017	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2018		tokeniser->context.pending += len;
2019	}
2020
2021	tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
2022
2023	return HUBBUB_OK;
2024}
2025
2026hubbub_error hubbub_tokeniser_handle_before_doctype_name(
2027		hubbub_tokeniser *tokeniser)
2028{
2029	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2030	size_t len;
2031	const uint8_t *cptr;
2032	parserutils_error error;
2033	uint8_t c;
2034
2035	error = parserutils_inputstream_peek(tokeniser->input,
2036			tokeniser->context.pending, &cptr, &len);
2037
2038	if (error != PARSERUTILS_OK) {
2039		if (error == PARSERUTILS_EOF) {
2040			/** \todo parse error */
2041			/* Emit current doctype, force-quirks on */
2042			tokeniser->state = STATE_DATA;
2043			return emit_current_doctype(tokeniser, true);
2044		} else {
2045			return hubbub_error_from_parserutils_error(error);
2046		}
2047	}
2048
2049	c = *cptr;
2050
2051	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2052		/* pass over in silence */
2053		tokeniser->context.pending += len;
2054	} else if (c == '>') {
2055		/** \todo parse error */
2056		tokeniser->context.pending += len;
2057		tokeniser->state = STATE_DATA;
2058		return emit_current_doctype(tokeniser, true);
2059	} else {
2060		if (c == '\0') {
2061			START_BUF(cdoc->name, u_fffd, sizeof(u_fffd));
2062		} else if ('A' <= c && c <= 'Z') {
2063			uint8_t lc = c + 0x20;
2064
2065			START_BUF(cdoc->name, &lc, len);
2066		} else {
2067			START_BUF(cdoc->name, cptr, len);
2068		}
2069
2070		tokeniser->context.pending += len;
2071		tokeniser->state = STATE_DOCTYPE_NAME;
2072	}
2073
2074	return HUBBUB_OK;
2075}
2076
2077hubbub_error hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
2078{
2079	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2080	size_t len;
2081	const uint8_t *cptr;
2082	parserutils_error error;
2083	uint8_t c;
2084
2085	error = parserutils_inputstream_peek(tokeniser->input,
2086			tokeniser->context.pending, &cptr, &len);
2087
2088	if (error != PARSERUTILS_OK) {
2089		if (error == PARSERUTILS_EOF) {
2090			tokeniser->state = STATE_DATA;
2091			return emit_current_doctype(tokeniser, true);
2092		} else {
2093			return hubbub_error_from_parserutils_error(error);
2094		}
2095	}
2096
2097	c = *cptr;
2098
2099	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2100		tokeniser->context.pending += len;
2101		tokeniser->state = STATE_AFTER_DOCTYPE_NAME;
2102	} else if (c == '>') {
2103		tokeniser->context.pending += len;
2104		tokeniser->state = STATE_DATA;
2105		return emit_current_doctype(tokeniser, false);
2106	} else if (c == '\0') {
2107		COLLECT(cdoc->name, u_fffd, sizeof(u_fffd));
2108		tokeniser->context.pending += len;
2109	} else if ('A' <= c && c <= 'Z') {
2110		uint8_t lc = c + 0x20;
2111		COLLECT(cdoc->name, &lc, len);
2112		tokeniser->context.pending += len;
2113	} else {
2114		COLLECT(cdoc->name, cptr, len);
2115		tokeniser->context.pending += len;
2116	}
2117
2118	return HUBBUB_OK;
2119}
2120
2121hubbub_error hubbub_tokeniser_handle_after_doctype_name(
2122		hubbub_tokeniser *tokeniser)
2123{
2124	size_t len;
2125	const uint8_t *cptr;
2126	parserutils_error error;
2127	uint8_t c;
2128
2129	error = parserutils_inputstream_peek(tokeniser->input,
2130			tokeniser->context.pending, &cptr, &len);
2131
2132	if (error != PARSERUTILS_OK) {
2133		if (error == PARSERUTILS_EOF) {
2134			tokeniser->state = STATE_DATA;
2135			return emit_current_doctype(tokeniser, true);
2136		} else {
2137			return hubbub_error_from_parserutils_error(error);
2138		}
2139	}
2140
2141	c = *cptr;
2142	tokeniser->context.pending += len;
2143
2144	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2145		/* pass over in silence */
2146	} else if (c == '>') {
2147		tokeniser->state = STATE_DATA;
2148		return emit_current_doctype(tokeniser, false);
2149	} else if ((c & ~0x20) == 'P') {
2150		tokeniser->context.match_doctype.count = 1;
2151		tokeniser->state = STATE_MATCH_PUBLIC;
2152	} else if ((c & ~0x20) == 'S') {
2153		tokeniser->context.match_doctype.count = 1;
2154		tokeniser->state = STATE_MATCH_SYSTEM;
2155	} else {
2156		tokeniser->state = STATE_BOGUS_DOCTYPE;
2157		tokeniser->context.current_doctype.force_quirks = true;
2158	}
2159
2160	return HUBBUB_OK;
2161}
2162
2163#define PUBLIC		"PUBLIC"
2164#define PUBLIC_LEN	(SLEN(PUBLIC) - 1)
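/* As with DOCTYPE_LEN above: the initial 'P' was consumed in the after
 * doctype name state, so matching resumes at index 1. */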
2165
2166hubbub_error hubbub_tokeniser_handle_match_public(hubbub_tokeniser *tokeniser)
2167{
2168	size_t len;
2169	const uint8_t *cptr;
2170	parserutils_error error;
2171	uint8_t c;
2172
2173	error = parserutils_inputstream_peek(tokeniser->input,
2174			tokeniser->context.pending, &cptr, &len);
2175
2176	if (error != PARSERUTILS_OK) {
2177		if (error == PARSERUTILS_EOF) {
2178			tokeniser->context.current_doctype.force_quirks = true;
2179			tokeniser->state = STATE_BOGUS_DOCTYPE;
2180			return HUBBUB_OK;
2181		} else {
2182			return hubbub_error_from_parserutils_error(error);
2183		}
2184	}
2185
2186	c = *cptr;
2187
2188	assert(tokeniser->context.match_doctype.count <= PUBLIC_LEN);
2189
2190	if (PUBLIC[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
2191		tokeniser->context.current_doctype.force_quirks = true;
2192		tokeniser->state = STATE_BOGUS_DOCTYPE;
2193		return HUBBUB_OK;
2194	}
2195
2196	tokeniser->context.pending += len;
2197
2198	if (tokeniser->context.match_doctype.count == PUBLIC_LEN) {
2199		tokeniser->state = STATE_BEFORE_DOCTYPE_PUBLIC;
2200	}
2201
2202	tokeniser->context.match_doctype.count++;
2203
2204	return HUBBUB_OK;
2205}
2206
2207#undef PUBLIC
2208#undef PUBLIC_LEN
2209
2210hubbub_error hubbub_tokeniser_handle_before_doctype_public(
2211		hubbub_tokeniser *tokeniser)
2212{
2213	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2214	size_t len;
2215	const uint8_t *cptr;
2216	parserutils_error error;
2217	uint8_t c;
2218
2219	error = parserutils_inputstream_peek(tokeniser->input,
2220			tokeniser->context.pending, &cptr, &len);
2221
2222	if (error != PARSERUTILS_OK) {
2223		if (error == PARSERUTILS_EOF) {
2224			tokeniser->state = STATE_DATA;
2225			return emit_current_doctype(tokeniser, true);
2226		} else {
2227			return hubbub_error_from_parserutils_error(error);
2228		}
2229	}
2230
2231	c = *cptr;
2232	tokeniser->context.pending += len;
2233
2234	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2235		/* pass over in silence */
2236	} else if (c == '"') {
2237		cdoc->public_missing = false;
2238		cdoc->public_id.len = 0;
2239		tokeniser->state = STATE_DOCTYPE_PUBLIC_DQ;
2240	} else if (c == '\'') {
2241		cdoc->public_missing = false;
2242		cdoc->public_id.len = 0;
2243		tokeniser->state = STATE_DOCTYPE_PUBLIC_SQ;
2244	} else if (c == '>') {
2245		tokeniser->state = STATE_DATA;
2246		return emit_current_doctype(tokeniser, true);
2247	} else {
2248		cdoc->force_quirks = true;
2249		tokeniser->state = STATE_BOGUS_DOCTYPE;
2250	}
2251
2252	return HUBBUB_OK;
2253}
2254
2255hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
2256		hubbub_tokeniser *tokeniser)
2257{
2258	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2259	size_t len;
2260	const uint8_t *cptr;
2261	parserutils_error error;
2262	uint8_t c;
2263
2264	error = parserutils_inputstream_peek(tokeniser->input,
2265			tokeniser->context.pending, &cptr, &len);
2266
2267	if (error != PARSERUTILS_OK) {
2268		if (error == PARSERUTILS_EOF) {
2269			tokeniser->state = STATE_DATA;
2270			return emit_current_doctype(tokeniser, true);
2271		} else {
2272			return hubbub_error_from_parserutils_error(error);
2273		}
2274	}
2275
2276	c = *cptr;
2277
2278	if (c == '"') {
2279		tokeniser->context.pending += len;
2280		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
2281	} else if (c == '>') {
2282		tokeniser->context.pending += len;
2283		tokeniser->state = STATE_DATA;
2284		return emit_current_doctype(tokeniser, true);
2285	} else if (c == '\0') {
2286		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
2287		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look at the character after the '\r' so a CRLF pair
		 * collapses to a single LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);
2294
2295		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2296			return hubbub_error_from_parserutils_error(error);
2297		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2298			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
2299		}
2300
2301		/* Collect '\r' */
2302		tokeniser->context.pending += 1;
2303	} else {
2304		COLLECT_MS(cdoc->public_id, cptr, len);
2305
2306		tokeniser->context.pending += len;
2307	}
2308
2309	return HUBBUB_OK;
2310}
2311
2312hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
2313		hubbub_tokeniser *tokeniser)
2314{
2315	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2316	size_t len;
2317	const uint8_t *cptr;
2318	parserutils_error error;
2319	uint8_t c;
2320
2321	error = parserutils_inputstream_peek(tokeniser->input,
2322			tokeniser->context.pending, &cptr, &len);
2323
2324	if (error != PARSERUTILS_OK) {
2325		if (error == PARSERUTILS_EOF) {
2326			tokeniser->state = STATE_DATA;
2327			return emit_current_doctype(tokeniser, true);
2328		} else {
2329			return hubbub_error_from_parserutils_error(error);
2330		}
2331	}
2332
2333	c = *cptr;
2334
2335	if (c == '\'') {
2336		tokeniser->context.pending += len;
2337		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
2338	} else if (c == '>') {
2339		tokeniser->context.pending += len;
2340		tokeniser->state = STATE_DATA;
2341		return emit_current_doctype(tokeniser, true);
2342	} else if (c == '\0') {
2343		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
2344		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look at the character after the '\r' so a CRLF pair
		 * collapses to a single LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
		}

2358		/* Collect '\r' */
2359		tokeniser->context.pending += 1;
2360	} else {
2361		COLLECT_MS(cdoc->public_id, cptr, len);
2362		tokeniser->context.pending += len;
2363	}
2364
2365	return HUBBUB_OK;
2366}
2367
2368
2369hubbub_error hubbub_tokeniser_handle_after_doctype_public(
2370		hubbub_tokeniser *tokeniser)
2371{
2372	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2373	size_t len;
2374	const uint8_t *cptr;
2375	parserutils_error error;
2376	uint8_t c;
2377
2378	error = parserutils_inputstream_peek(tokeniser->input,
2379			tokeniser->context.pending, &cptr, &len);
2380
2381	if (error != PARSERUTILS_OK) {
2382		if (error == PARSERUTILS_EOF) {
2383			tokeniser->state = STATE_DATA;
2384			return emit_current_doctype(tokeniser, true);
2385		} else {
2386			return hubbub_error_from_parserutils_error(error);
2387		}
2388	}
2389
2390	c = *cptr;
2391	tokeniser->context.pending += len;
2392
2393	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2394		/* pass over in silence */
2395	} else if (c == '"') {
2396		cdoc->system_missing = false;
2397		cdoc->system_id.len = 0;
2398
2399		tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
2400	} else if (c == '\'') {
2401		cdoc->system_missing = false;
2402		cdoc->system_id.len = 0;
2403
2404		tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
2405	} else if (c == '>') {
2406		tokeniser->state = STATE_DATA;
2407		return emit_current_doctype(tokeniser, false);
2408	} else {
2409		cdoc->force_quirks = true;
2410		tokeniser->state = STATE_BOGUS_DOCTYPE;
2411	}
2412
2413	return HUBBUB_OK;
2414}
2415
2416
2417
2418#define SYSTEM		"SYSTEM"
2419#define SYSTEM_LEN	(SLEN(SYSTEM) - 1)
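/* As with DOCTYPE_LEN above: the initial 'S' was consumed in the after
 * doctype name state, so matching resumes at index 1. */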
2420
2421hubbub_error hubbub_tokeniser_handle_match_system(hubbub_tokeniser *tokeniser)
2422{
2423	size_t len;
2424	const uint8_t *cptr;
2425	parserutils_error error;
2426	uint8_t c;
2427
2428	error = parserutils_inputstream_peek(tokeniser->input,
2429			tokeniser->context.pending, &cptr, &len);
2430
	if (error != PARSERUTILS_OK) {
2432		if (error == PARSERUTILS_EOF) {
2433			tokeniser->context.current_doctype.force_quirks = true;
2434			tokeniser->state = STATE_BOGUS_DOCTYPE;
2435			return HUBBUB_OK;
2436		} else {
2437			return hubbub_error_from_parserutils_error(error);
2438		}
2439	}
2440
2441	c = *cptr;
2442
2443	assert(tokeniser->context.match_doctype.count <= SYSTEM_LEN);
2444
2445	if (SYSTEM[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
2446		tokeniser->context.current_doctype.force_quirks = true;
2447		tokeniser->state = STATE_BOGUS_DOCTYPE;
2448		return HUBBUB_OK;
2449	}
2450
2451	tokeniser->context.pending += len;
2452
2453	if (tokeniser->context.match_doctype.count == SYSTEM_LEN) {
2454		tokeniser->state = STATE_BEFORE_DOCTYPE_SYSTEM;
2455	}
2456
2457	tokeniser->context.match_doctype.count++;
2458
2459	return HUBBUB_OK;
2460}
2461
2462#undef SYSTEM
2463#undef SYSTEM_LEN
2464
2465hubbub_error hubbub_tokeniser_handle_before_doctype_system(
2466		hubbub_tokeniser *tokeniser)
2467{
2468	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2469	size_t len;
2470	const uint8_t *cptr;
2471	parserutils_error error;
2472	uint8_t c;
2473
2474	error = parserutils_inputstream_peek(tokeniser->input,
2475			tokeniser->context.pending, &cptr, &len);
2476
2477	if (error != PARSERUTILS_OK) {
2478		if (error == PARSERUTILS_EOF) {
2479			tokeniser->state = STATE_DATA;
2480			return emit_current_doctype(tokeniser, true);
2481		} else {
2482			return hubbub_error_from_parserutils_error(error);
2483		}
2484	}
2485
2486	c = *cptr;
2487	tokeniser->context.pending += len;
2488
2489	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2490		/* pass over */
2491	} else if (c == '"') {
2492		cdoc->system_missing = false;
2493		cdoc->system_id.len = 0;
2494
2495		tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
2496	} else if (c == '\'') {
2497		cdoc->system_missing = false;
2498		cdoc->system_id.len = 0;
2499
2500		tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
2501	} else if (c == '>') {
2502		tokeniser->state = STATE_DATA;
2503		return emit_current_doctype(tokeniser, true);
2504	} else {
2505		cdoc->force_quirks = true;
2506		tokeniser->state = STATE_BOGUS_DOCTYPE;
2507	}
2508
2509	return HUBBUB_OK;
2510}
2511
2512hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
2513		hubbub_tokeniser *tokeniser)
2514{
2515	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2516	size_t len;
2517	const uint8_t *cptr;
2518	parserutils_error error;
2519	uint8_t c;
2520
2521	error = parserutils_inputstream_peek(tokeniser->input,
2522			tokeniser->context.pending, &cptr, &len);
2523
2524	if (error != PARSERUTILS_OK) {
2525		if (error == PARSERUTILS_EOF) {
2526			tokeniser->state = STATE_DATA;
2527			return emit_current_doctype(tokeniser, true);
2528		} else {
2529			return hubbub_error_from_parserutils_error(error);
2530		}
2531	}
2532
2533	c = *cptr;
2534
2535	if (c == '"') {
2536		tokeniser->context.pending += len;
2537		tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
2538	} else if (c == '>') {
2539		tokeniser->context.pending += len;
2540		tokeniser->state = STATE_DATA;
2541		return emit_current_doctype(tokeniser, true);
2542	} else if (c == '\0') {
2543		COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
2544		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look at the character after the '\r' so a CRLF pair
		 * collapses to a single LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);
2551
2552		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2553			return hubbub_error_from_parserutils_error(error);
2554		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2555			COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
2556		}
2557
2558		/* Collect '\r' */
2559		tokeniser->context.pending += 1;
2560	} else {
2561		COLLECT_MS(cdoc->system_id, cptr, len);
2562		tokeniser->context.pending += len;
2563	}
2564
2565	return HUBBUB_OK;
2566}
2567
2568hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
2569		hubbub_tokeniser *tokeniser)
2570{
2571	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2572	size_t len;
2573	const uint8_t *cptr;
2574	parserutils_error error;
2575	uint8_t c;
2576
2577	error = parserutils_inputstream_peek(tokeniser->input,
2578			tokeniser->context.pending, &cptr, &len);
2579
2580	if (error != PARSERUTILS_OK) {
2581		if (error == PARSERUTILS_EOF) {
2582			tokeniser->state = STATE_DATA;
2583			return emit_current_doctype(tokeniser, true);
2584		} else {
2585			return hubbub_error_from_parserutils_error(error);
2586		}
2587	}
2588
2589	c = *cptr;
2590
2591	if (c == '\'') {
2592		tokeniser->context.pending += len;
2593		tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
2594	} else if (c == '>') {
2595		tokeniser->context.pending += len;
2596		tokeniser->state = STATE_DATA;
2597		return emit_current_doctype(tokeniser, true);
2598	} else if (c == '\0') {
2599		COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
2600		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Look at the character after the '\r' so a CRLF pair
		 * collapses to a single LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);
2607
2608		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2609			return hubbub_error_from_parserutils_error(error);
2610		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2611			COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
2612		}
2613
2614		/* Collect '\r' */
2615		tokeniser->context.pending += 1;
2616	} else {
2617		COLLECT_MS(cdoc->system_id, cptr, len);
2618		tokeniser->context.pending += len;
2619	}
2620
2621	return HUBBUB_OK;
2622}
2623
2624hubbub_error hubbub_tokeniser_handle_after_doctype_system(
2625		hubbub_tokeniser *tokeniser)
2626{
2627	size_t len;
2628	const uint8_t *cptr;
2629	parserutils_error error;
2630	uint8_t c;
2631
2632	error = parserutils_inputstream_peek(tokeniser->input,
2633			tokeniser->context.pending, &cptr, &len);
2634
2635	if (error != PARSERUTILS_OK) {
2636		if (error == PARSERUTILS_EOF) {
2637			tokeniser->state = STATE_DATA;
2638			return emit_current_doctype(tokeniser, true);
2639		} else {
2640			return hubbub_error_from_parserutils_error(error);
2641		}
2642	}
2643
2644	c = *cptr;
2645	tokeniser->context.pending += len;
2646
2647	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2648		/* pass over in silence */
2649	} else if (c == '>') {
2650		tokeniser->state = STATE_DATA;
2651		return emit_current_doctype(tokeniser, false);
2652	} else {
2653		tokeniser->state = STATE_BOGUS_DOCTYPE;
2654	}
2655
2656	return HUBBUB_OK;
2657}
2658
2659
2660hubbub_error hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
2661{
2662	size_t len;
2663	const uint8_t *cptr;
2664	parserutils_error error;
2665	uint8_t c;
2666
2667	error = parserutils_inputstream_peek(tokeniser->input,
2668			tokeniser->context.pending, &cptr, &len);
2669
2670	if (error != PARSERUTILS_OK) {
2671		if (error == PARSERUTILS_EOF) {
2672			tokeniser->state = STATE_DATA;
2673			return emit_current_doctype(tokeniser, false);
2674		} else {
2675			return hubbub_error_from_parserutils_error(error);
2676		}
2677	}
2678
2679	c = *cptr;
2680	tokeniser->context.pending += len;
2681
2682	if (c == '>') {
2683		tokeniser->state = STATE_DATA;
2684		return emit_current_doctype(tokeniser, false);
2685	}
2686
2687	return HUBBUB_OK;
2688}
2689
2690
2691
2692#define CDATA		"[CDATA["
2693#define CDATA_LEN	(SLEN(CDATA) - 1)
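/* As with DOCTYPE_LEN above: the initial '[' was consumed in the markup
 * declaration open state, so matching resumes at index 1. */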
2694
2695hubbub_error hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser)
2696{
2697	size_t len;
2698	const uint8_t *cptr;
2699	parserutils_error error;
2700	uint8_t c;
2701
2702	error = parserutils_inputstream_peek(tokeniser->input,
2703			tokeniser->context.pending, &cptr, &len);
2704
2705	if (error != PARSERUTILS_OK) {
2706		if (error == PARSERUTILS_EOF) {
2707			tokeniser->context.current_comment.len =
2708					tokeniser->context.pending = 0;
2709			tokeniser->state = STATE_BOGUS_COMMENT;
2710			return HUBBUB_OK;
2711		} else {
2712			return hubbub_error_from_parserutils_error(error);
2713		}
2714	}
2715
2716	c = *cptr;
2717
2718	assert(tokeniser->context.match_cdata.count <= CDATA_LEN);
2719
2720	if (CDATA[tokeniser->context.match_cdata.count] != (c & ~0x20)) {
		tokeniser->context.current_comment.len =
				tokeniser->context.pending = 0;
2724		tokeniser->state = STATE_BOGUS_COMMENT;
2725		return HUBBUB_OK;
2726	}
2727
2728	tokeniser->context.pending += len;
2729
2730	if (tokeniser->context.match_cdata.count == CDATA_LEN) {
2731		parserutils_inputstream_advance(tokeniser->input,
2732				tokeniser->context.match_cdata.count + len);
2733		tokeniser->context.pending = 0;
2734		tokeniser->context.match_cdata.end = 0;
2735		tokeniser->state = STATE_CDATA_BLOCK;
2736	}
2737
2738	tokeniser->context.match_cdata.count += len;
2739
2740	return HUBBUB_OK;
2741}
2742
2743#undef CDATA
2744#undef CDATA_LEN
2745
2746
2747hubbub_error hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser)
2748{
2749	size_t len;
2750	const uint8_t *cptr;
2751	parserutils_error error;
2752	uint8_t c;
2753
2754	error = parserutils_inputstream_peek(tokeniser->input,
2755			tokeniser->context.pending, &cptr, &len);
2756
2757	if (error != PARSERUTILS_OK) {
2758		if (error == PARSERUTILS_EOF) {
2759			tokeniser->state = STATE_DATA;
2760			return emit_current_chars(tokeniser);
2761		} else {
2762			return hubbub_error_from_parserutils_error(error);
2763		}
2764	}
2765
2766	c = *cptr;
2767
2768	if (c == ']' && (tokeniser->context.match_cdata.end == 0 ||
2769			tokeniser->context.match_cdata.end == 1)) {
2770		tokeniser->context.pending += len;
2771		tokeniser->context.match_cdata.end += len;
2772	} else if (c == '>' && tokeniser->context.match_cdata.end == 2) {
2773		/* Remove the previous two "]]" */
2774		tokeniser->context.pending -= 2;
2775
2776		/* Emit any pending characters */
2777		emit_current_chars(tokeniser);
2778
2779		/* Now move past the "]]>" bit */
2780		parserutils_inputstream_advance(tokeniser->input, SLEN("]]>"));
2781
2782		tokeniser->state = STATE_DATA;
2783	} else if (c == '\0') {
2784		if (tokeniser->context.pending > 0) {
2785			/* Emit any pending characters */
2786			emit_current_chars(tokeniser);
2787		}
2788
2789		/* Perform NUL-byte replacement */
2790		emit_character_token(tokeniser, &u_fffd_str);
2791
2792		parserutils_inputstream_advance(tokeniser->input, len);
2793		tokeniser->context.match_cdata.end = 0;
2794	} else if (c == '\r') {
2795		error = parserutils_inputstream_peek(
2796				tokeniser->input,
2797				tokeniser->context.pending + len,
2798				&cptr,
2799				&len);
2800
2801		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2802			return hubbub_error_from_parserutils_error(error);
2803		}
2804
2805		if (tokeniser->context.pending > 0) {
2806			/* Emit any pending characters */
2807			emit_current_chars(tokeniser);
2808		}
2809
2810		if (error == PARSERUTILS_EOF || *cptr != '\n') {
2811			/* Emit newline */
2812			emit_character_token(tokeniser, &lf_str);
2813		}
2814
2815		/* Advance over \r */
2816		parserutils_inputstream_advance(tokeniser->input, 1);
2817		tokeniser->context.match_cdata.end = 0;
2818	} else {
2819		tokeniser->context.pending += len;
2820		tokeniser->context.match_cdata.end = 0;
2821	}
2822
2823	return HUBBUB_OK;
2824}
2825
2826
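/**
 * Begin consuming a (possible) character reference.
 *
 * \param tokeniser	Tokeniser instance
 * \param pos		Byte offset of the introducing '&' in the input
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
 *
 * Resets the match_entity context, then inspects the character after
 * the ampersand: whitespace, '<', '&', EOF, or the additional allowed
 * character (used for attribute values) flags an immediate non-match;
 * '#' dispatches to the numbered entity state; anything else to the
 * named entity state. Callers pick up the result from
 * tokeniser->context.match_entity once its complete flag is set.
 */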
2827hubbub_error hubbub_tokeniser_consume_character_reference(
2828		hubbub_tokeniser *tokeniser, size_t pos)
2829{
2830	uint32_t allowed_char = tokeniser->context.allowed_char;
2831
2832	size_t len;
2833	const uint8_t *cptr;
2834	parserutils_error error;
2835	uint8_t c;
2836	size_t off;
2837
2838	error = parserutils_inputstream_peek(tokeniser->input, pos, 
2839			&cptr, &len);
2840
2841	/* We should always start on an ampersand */
2842	assert(error == PARSERUTILS_OK);
2843	assert(len == 1 && *cptr == '&');
2844
2845	off = pos + len;
2846
2847	/* Look at the character after the ampersand */
2848	error = parserutils_inputstream_peek(tokeniser->input, off, 
2849			&cptr, &len);
2850
2851	if (error != PARSERUTILS_OK) {
2852		if (error == PARSERUTILS_EOF) {
2853			tokeniser->context.match_entity.complete = true;
2854			tokeniser->context.match_entity.codepoint = 0;
2855			return HUBBUB_OK;
2856		} else {
2857			return hubbub_error_from_parserutils_error(error);
2858		}
2859	}
2860
2861	c = *cptr;
2862
2863	/* Set things up */
2864	tokeniser->context.match_entity.offset = off;
2865	tokeniser->context.match_entity.poss_length = 0;
2866	tokeniser->context.match_entity.length = 0;
2867	tokeniser->context.match_entity.base = 0;
2868	tokeniser->context.match_entity.codepoint = 0;
2869	tokeniser->context.match_entity.had_data = false;
2870	tokeniser->context.match_entity.return_state = tokeniser->state;
2871	tokeniser->context.match_entity.complete = false;
2872	tokeniser->context.match_entity.overflow = false;
2873	tokeniser->context.match_entity.context = NULL;
2874	tokeniser->context.match_entity.prev_len = len;
2875
2876	/* Reset allowed character for future calls */
2877	tokeniser->context.allowed_char = '\0';
2878
2879	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
2880			c == '<' || c == '&' ||
2881			(allowed_char && c == allowed_char)) {
2882		tokeniser->context.match_entity.complete = true;
2883		tokeniser->context.match_entity.codepoint = 0;
2884	} else if (c == '#') {
2885		tokeniser->context.match_entity.length += len;
2886		tokeniser->state = STATE_NUMBERED_ENTITY;
2887	} else {
2888		tokeniser->state = STATE_NAMED_ENTITY;
2889	}
2890
2891	return HUBBUB_OK;
2892}
2893
2894
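/**
 * Handle a numeric character reference ("&#...;" or "&#x...;"),
 * accumulating digits into match_entity.codepoint and applying the
 * fixups below (Windows-1252 remapping of 0x80-0x9F, CR to LF, and
 * replacement of invalid codepoints with U+FFFD). For example, both
 * "&#128;" and "&#x80;" yield U+20AC via cp1252Table.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
 */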
2895hubbub_error hubbub_tokeniser_handle_numbered_entity(
2896		hubbub_tokeniser *tokeniser)
2897{
2898	hubbub_tokeniser_context *ctx = &tokeniser->context;
2899
2900	size_t len;
2901	const uint8_t *cptr;
2902	parserutils_error error;
2903
2904	error = parserutils_inputstream_peek(tokeniser->input,
2905			ctx->match_entity.offset + ctx->match_entity.length,
2906			&cptr, &len);
2907
2908	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2909		return hubbub_error_from_parserutils_error(error);
2910	}
2911
2912	if (error != PARSERUTILS_EOF && ctx->match_entity.base == 0) {
2913		uint8_t c = *cptr;
2914		if ((c & ~0x20) == 'X') {
2915			ctx->match_entity.base = 16;
2916			ctx->match_entity.length += len;
2917		} else {
2918			ctx->match_entity.base = 10;
2919		}
2920	}
2921
2922	while ((error = parserutils_inputstream_peek(tokeniser->input,
2923			ctx->match_entity.offset + ctx->match_entity.length,
2924			&cptr, &len)) == PARSERUTILS_OK) {
2925		uint8_t c = *cptr;
2926
2927		if (ctx->match_entity.base == 10 &&
2928				('0' <= c && c <= '9')) {
2929			ctx->match_entity.had_data = true;
2930			ctx->match_entity.codepoint =
2931				ctx->match_entity.codepoint * 10 + (c - '0');
2932
2933			ctx->match_entity.length += len;
2934		} else if (ctx->match_entity.base == 16 &&
2935				(('0' <= c && c <= '9') ||
2936				('A' <= (c & ~0x20) &&
2937						(c & ~0x20) <= 'F'))) {
2938			ctx->match_entity.had_data = true;
2939			ctx->match_entity.codepoint *= 16;
2940
2941			if ('0' <= c && c <= '9') {
2942				ctx->match_entity.codepoint += (c - '0');
2943			} else {
2944				ctx->match_entity.codepoint +=
2945						((c & ~0x20) - 'A' + 10);
2946			}
2947
2948			ctx->match_entity.length += len;
2949		} else {
2950			break;
2951		}
2952
2953		if (ctx->match_entity.codepoint >= 0x10FFFF) {
2954			ctx->match_entity.overflow = true;
2955		}
2956	}
2957
2958	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2959		return hubbub_error_from_parserutils_error(error);
2960	}
2961
2962	/* Eat trailing semicolon, if any */
2963	if (error != PARSERUTILS_EOF && *cptr == ';') {
2964		ctx->match_entity.length += len;
2965	}
2966
2967	/* Had data, so calculate final codepoint */
2968	if (ctx->match_entity.had_data) {
2969		uint32_t cp = ctx->match_entity.codepoint;
2970
2971		if (0x80 <= cp && cp <= 0x9F) {
2972			cp = cp1252Table[cp - 0x80];
2973		} else if (cp == 0x0D) {
2974			cp = 0x000A;
2975		} else if (ctx->match_entity.overflow || 
2976				cp <= 0x0008 || cp == 0x000B ||
2977				(0x000E <= cp && cp <= 0x001F) ||
2978				(0x007F <= cp && cp <= 0x009F) ||
2979				(0xD800 <= cp && cp <= 0xDFFF) ||
2980				(0xFDD0 <= cp && cp <= 0xFDEF) ||
2981				(cp & 0xFFFE) == 0xFFFE) {
			/* the spec's cp > 0x10FFFF check is folded into the
			 * >= 0x10FFFF overflow test in the loop above to
			 * avoid arithmetic overflow; 0x10FFFF itself is a
			 * noncharacter, so it maps to U+FFFD either way */
2984			cp = 0xFFFD;
2985		}
2986
2987		ctx->match_entity.codepoint = cp;
2988	}
2989
2990	/* Flag completion */
2991	ctx->match_entity.complete = true;
2992
2993	/* And back to the state we were entered in */
2994	tokeniser->state = ctx->match_entity.return_state;
2995
2996	return HUBBUB_OK;
2997}
2998
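/**
 * Handle a named character reference, stepping the entity search one
 * character at a time and remembering the longest match found. Per the
 * spec, a match inside an attribute value that is not terminated by ';'
 * is discarded when the next character is alphanumeric (so "&notit" in
 * an attribute value does not decode its "&not" prefix).
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
 */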
2999hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
3000{
3001	hubbub_tokeniser_context *ctx = &tokeniser->context;
3002
3003	size_t len;
3004	const uint8_t *cptr;
3005	parserutils_error error;
3006
3007	while ((error = parserutils_inputstream_peek(tokeniser->input,
3008			ctx->match_entity.offset +
3009					ctx->match_entity.poss_length,
3010			&cptr, &len)) == PARSERUTILS_OK) {
		uint32_t cp;

		uint8_t c = *cptr;
		hubbub_error herror;	/* distinct from the stream error above */

		if (c > 0x7F) {
			/* Entity names are ASCII only */
			break;
		}

		herror = hubbub_entities_search_step(c, &cp,
				&ctx->match_entity.context);
		if (herror == HUBBUB_OK) {
			/* Had a match - store it for later */
			ctx->match_entity.codepoint = cp;

			ctx->match_entity.length =
					ctx->match_entity.poss_length + len;
			ctx->match_entity.poss_length =
					ctx->match_entity.length;
		} else if (herror == HUBBUB_INVALID) {
			/* No further matches - use last found */
			break;
		} else {
			/* Need more data */
			ctx->match_entity.poss_length += len;
		}
3038	}
3039
3040	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
3041		return hubbub_error_from_parserutils_error(error);
3042	}
3043
3044	if (ctx->match_entity.length > 0) {
3045		uint8_t c;
3046		error = parserutils_inputstream_peek(tokeniser->input,
3047				ctx->match_entity.offset + 
3048					ctx->match_entity.length - 1,
3049				&cptr, &len);
		/* We're re-reading a character we've already read past.
		 * Therefore, there's no way that an error may occur as
		 * a result. */
3053		assert(error == PARSERUTILS_OK);
3054
3055		c = *cptr;
3056
3057		if ((tokeniser->context.match_entity.return_state ==
3058				STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) &&
3059				c != ';') {
3060			error = parserutils_inputstream_peek(tokeniser->input,
3061					ctx->match_entity.offset +
3062						ctx->match_entity.length,
3063					&cptr, &len);
3064			/* We must have attempted to read one more character 
3065			 * than was present in the entity name, as that is the 
3066			 * only way to break out of the loop above. If that 
3067			 * failed, then any non-EOF case will have been handled
3068			 * by the if statement after the loop thus it cannot 
3069			 * occur here. */
3070			assert(error == PARSERUTILS_OK || 
3071					error == PARSERUTILS_EOF);
3072
			if (error == PARSERUTILS_EOF) {
				ctx->match_entity.codepoint = 0;
			} else {
				/* Only dereference cptr if the peek
				 * actually returned a character */
				c = *cptr;
				if ((0x0030 <= c && c <= 0x0039) ||
						(0x0041 <= c && c <= 0x005A) ||
						(0x0061 <= c && c <= 0x007A)) {
					ctx->match_entity.codepoint = 0;
				}
			}
3083		}
3084	}
3085
3086	/* Flag completion */
3087	ctx->match_entity.complete = true;
3088
3089	/* And back to the state from whence we came */
3090	tokeniser->state = ctx->match_entity.return_state;
3091
3092	return HUBBUB_OK;
3093}
3094
3095
3096
3097/*** Token emitting bits ***/
3098
3099/**
3100 * Emit a character token.
3101 *
3102 * \param tokeniser	Tokeniser instance
3103 * \param chars		Pointer to hubbub_string to emit
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3105 */
3106hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
3107		const hubbub_string *chars)
3108{
3109	hubbub_token token;
3110
3111	token.type = HUBBUB_TOKEN_CHARACTER;
3112	token.data.character = *chars;
3113
3114	return hubbub_tokeniser_emit_token(tokeniser, &token);
3115}
3116
3117/**
3118 * Emit the current pending characters being stored in the tokeniser context.
3119 *
3120 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3122 */
3123hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser)
3124{
3125	hubbub_token token;
3126	size_t len;
3127	const uint8_t *cptr = NULL;
3128	parserutils_error error;
3129
3130	/* Calling this with nothing to output is a probable bug */
3131	assert(tokeniser->context.pending > 0);
3132
3133	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);
3134
3135	assert(error == PARSERUTILS_OK);
3136
3137	token.type = HUBBUB_TOKEN_CHARACTER;
3138	token.data.character.ptr = cptr;
3139	token.data.character.len = tokeniser->context.pending;
3140
3141	return hubbub_tokeniser_emit_token(tokeniser, &token);
3142}
3143
3144/**
3145 * Emit the current tag token being stored in the tokeniser context.
3146 *
3147 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3149 */
3150hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser)
3151{
3152	hubbub_error err;
3153	hubbub_token token;
3154	uint32_t n_attributes;
3155	hubbub_attribute *attrs;
3156	uint8_t *ptr;
3157	uint32_t i, j;
3158
3159	/* Emit current tag */
3160	token.type = tokeniser->context.current_tag_type;
3161	token.data.tag = tokeniser->context.current_tag;
	token.data.tag.ns = HUBBUB_NS_HTML;

3165	n_attributes = token.data.tag.n_attributes;
3166	attrs = token.data.tag.attributes;
3167
3168	/* Set pointers correctly... */
3169	ptr = tokeniser->buffer->data;
3170	token.data.tag.name.ptr = tokeniser->buffer->data;
3171	ptr += token.data.tag.name.len;
3172
3173	for (i = 0; i < n_attributes; i++) {
3174		attrs[i].name.ptr = ptr;
3175		ptr += attrs[i].name.len;
3176		attrs[i].value.ptr = ptr;
3177		ptr += attrs[i].value.len;
3178	}
3179
3180
3181	/* Discard duplicate attributes */
3182	for (i = 0; i < n_attributes; i++) {
3183		for (j = 0; j < n_attributes; j++) {
3184			uint32_t move;
3185
3186			if (j == i ||
3187				attrs[i].name.len !=
3188						attrs[j].name.len ||
3189				strncmp((char *) attrs[i].name.ptr,
3190					(char *) attrs[j].name.ptr,
3191					attrs[i].name.len) != 0) {
3192				/* Attributes don't match */
3193				continue;
3194			}
3195
3196			assert(i < j);
3197
3198			/* Calculate amount to move */
3199			move = (n_attributes - 1 - j) *
3200					sizeof(hubbub_attribute);
3201
3202			if (move > 0) {
				memmove(&attrs[j], &attrs[j + 1], move);
3204			}
3205
3206			/* We've deleted an item, so we need to 
3207			 * reprocess this index */
3208			j--;
3209
3210			/* And reduce the number of attributes */
3211			n_attributes--;
3212		}
3213	}
3214
3215	token.data.tag.n_attributes = n_attributes;
3216
3217	err = hubbub_tokeniser_emit_token(tokeniser, &token);
3218
3219	if (token.type == HUBBUB_TOKEN_START_TAG) {
3220		/* Save start tag name for R?CDATA */
3221		if (token.data.tag.name.len <
3222			sizeof(tokeniser->context.last_start_tag_name)) {
3223			strncpy((char *) tokeniser->context.last_start_tag_name,
3224				(const char *) token.data.tag.name.ptr,
3225				token.data.tag.name.len);
3226			tokeniser->context.last_start_tag_len =
3227					token.data.tag.name.len;
3228		} else {
3229			tokeniser->context.last_start_tag_name[0] = '\0';
3230			tokeniser->context.last_start_tag_len = 0;
3231		}
3232	} else /* if (token->type == HUBBUB_TOKEN_END_TAG) */ {
3233		/* Reset content model after R?CDATA elements */
3234		tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
3235	}
3236
3237	/* Reset the self-closing flag */
3238	tokeniser->context.current_tag.self_closing = false;
3239
3240	return err;
3241}
3242
3243/**
3244 * Emit the current comment token being stored in the tokeniser context.
3245 *
3246 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3248 */
3249hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser)
3250{
3251	hubbub_token token;
3252
3253	token.type = HUBBUB_TOKEN_COMMENT;
3254	token.data.comment.ptr = tokeniser->buffer->data;
3255	token.data.comment.len = tokeniser->buffer->length;
3256
3257	return hubbub_tokeniser_emit_token(tokeniser, &token);
3258}
3259
3260/**
3261 * Emit the current doctype token being stored in the tokeniser context.
3262 *
3263 * \param tokeniser	Tokeniser instance
3264 * \param force_quirks	Force quirks mode on this document
 * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3266 */
3267hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
3268		bool force_quirks)
3269{
3270	hubbub_token token;
3271
3272	/* Emit doctype */
3273	token.type = HUBBUB_TOKEN_DOCTYPE;
3274	token.data.doctype = tokeniser->context.current_doctype;
3275	if (force_quirks == true)
3276		token.data.doctype.force_quirks = true;
3277
3278	/* Set pointers correctly */
3279	token.data.doctype.name.ptr = tokeniser->buffer->data;
3280
3281	if (token.data.doctype.public_missing == false) {
3282		token.data.doctype.public_id.ptr = tokeniser->buffer->data + 
3283				token.data.doctype.name.len;
3284	}
3285
3286	if (token.data.doctype.system_missing == false) {
3287		token.data.doctype.system_id.ptr = tokeniser->buffer->data +
3288				token.data.doctype.name.len +
3289				token.data.doctype.public_id.len;
3290	}
3291
3292	return hubbub_tokeniser_emit_token(tokeniser, &token);
3293}
3294
3295/**
3296 * Emit a token, performing sanity checks if necessary
3297 *
3298 * \param tokeniser  Tokeniser instance
 * \param token      Token to emit
 * \return     HUBBUB_OK on success, appropriate hubbub_error otherwise
 */
3301hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
3302		hubbub_token *token)
3303{
3304	hubbub_error err = HUBBUB_OK;
3305
3306	assert(tokeniser != NULL);
3307	assert(token != NULL);
3308
3309#ifndef NDEBUG
3310	/* Sanity checks */
3311	switch (token->type) {
3312	case HUBBUB_TOKEN_DOCTYPE:
3313		assert(memchr(token->data.doctype.name.ptr, 0xff, 
3314				token->data.doctype.name.len) == NULL);
3315		if (token->data.doctype.public_missing == false)
3316			assert(memchr(token->data.doctype.public_id.ptr, 0xff,
3317				token->data.doctype.public_id.len) == NULL);
3318		if (token->data.doctype.system_missing == false)
3319			assert(memchr(token->data.doctype.system_id.ptr, 0xff,
3320				token->data.doctype.system_id.len) == NULL);
3321		break;
3322	case HUBBUB_TOKEN_START_TAG:
3323	case HUBBUB_TOKEN_END_TAG:
3324	{
3325		uint32_t i;
3326		assert(memchr(token->data.tag.name.ptr, 0xff, 
3327				token->data.tag.name.len) == NULL);
3328		for (i = 0; i < token->data.tag.n_attributes; i++) {
3329			hubbub_attribute *attr = &token->data.tag.attributes[i];
3330
3331			assert(memchr(attr->name.ptr, 0xff, attr->name.len) == 
3332					NULL);
3333			assert(memchr(attr->value.ptr, 0xff, attr->value.len) ==
3334					NULL);
3335		}
3336	}
3337		break;
3338	case HUBBUB_TOKEN_COMMENT:
3339		assert(memchr(token->data.comment.ptr, 0xff, 
3340				token->data.comment.len) == NULL);
3341		break;
3342	case HUBBUB_TOKEN_CHARACTER:
3343		assert(memchr(token->data.character.ptr, 0xff,
3344				token->data.character.len) == NULL);
3345		break;
3346	case HUBBUB_TOKEN_EOF:
3347		break;
3348	}
3349#endif
3350
3351	/* Emit the token */
3352	if (tokeniser->token_handler) {
3353		err = tokeniser->token_handler(token, tokeniser->token_pw);
3354	}
3355
3356	/* Discard current buffer */
3357	if (tokeniser->buffer->length) {
3358		parserutils_buffer_discard(tokeniser->buffer, 0,
3359				tokeniser->buffer->length);
3360	}
3361
3362	/* Advance the pointer */
3363	if (tokeniser->context.pending) {
3364		parserutils_inputstream_advance(tokeniser->input,
3365				tokeniser->context.pending);
3366		tokeniser->context.pending = 0;
3367	}
3368
3369	return err;
3370}
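
/*
 * For illustration only -- a minimal sketch of a client token handler,
 * showing how the tokens emitted above arrive at the other end of
 * tokeniser->token_handler. The handler name and the printing logic are
 * hypothetical, not part of this file; the exact callback prototype is
 * the hubbub_token_handler typedef in tokeniser.h:
 *
 *	static hubbub_error print_token(const hubbub_token *token, void *pw)
 *	{
 *		(void) pw;
 *
 *		switch (token->type) {
 *		case HUBBUB_TOKEN_CHARACTER:
 *			printf("CHARACTER '%.*s'\n",
 *					(int) token->data.character.len,
 *					(const char *) token->data.character.ptr);
 *			break;
 *		case HUBBUB_TOKEN_EOF:
 *			printf("EOF\n");
 *			break;
 *		default:
 *			break;
 *		}
 *
 *		return HUBBUB_OK;
 *	}
 *
 * A handler like this would be registered with the tokeniser via the
 * HUBBUB_TOKENISER_TOKEN_HANDLER option, after which each token emitted
 * by hubbub_tokeniser_emit_token() triggers one callback invocation.
 */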