
/peek-build/src/netdepends/hubbub-0.0.2/src/tokeniser/tokeniser.c

https://bitbucket.org/C0deMaver1ck/peeklinux
/*
 * This file is part of Hubbub.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>

#include <parserutils/charset/utf8.h>

#include "utils/parserutilserror.h"
#include "utils/utils.h"

#include "tokeniser/entities.h"
#include "tokeniser/tokeniser.h"

/**
 * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
 */
static const uint32_t cp1252Table[32] = {
	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
};
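
/*
 * Illustrative sketch (not part of the original file): how a table like
 * cp1252Table is typically consulted when a numeric character reference
 * lands in the Windows-1252 range 128-159. The helper name remap_cp1252
 * is hypothetical; hubbub performs the equivalent lookup in its numbered
 * entity handling.
 */
#if 0
static uint32_t remap_cp1252(uint32_t cp)
{
	/* Codepoints 128-159 are C1 controls in Unicode; HTML parsers
	 * reinterpret them as the Windows-1252 glyphs instead. */
	if (cp >= 128 && cp <= 159)
		return cp1252Table[cp - 128];
	return cp;
}
/* remap_cp1252(0x80) == 0x20AC (EURO SIGN); 0x81 maps to U+FFFD */
#endif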
/**
 * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
 */
static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };

/**
 * String for when we want to emit newlines
 */
static const uint8_t lf = '\n';
static const hubbub_string lf_str = { &lf, 1 };

/**
 * Tokeniser states
 */
typedef enum hubbub_tokeniser_state {
	STATE_DATA,
	STATE_CHARACTER_REFERENCE_DATA,
	STATE_TAG_OPEN,
	STATE_CLOSE_TAG_OPEN,
	STATE_TAG_NAME,
	STATE_BEFORE_ATTRIBUTE_NAME,
	STATE_ATTRIBUTE_NAME,
	STATE_AFTER_ATTRIBUTE_NAME,
	STATE_BEFORE_ATTRIBUTE_VALUE,
	STATE_ATTRIBUTE_VALUE_DQ,
	STATE_ATTRIBUTE_VALUE_SQ,
	STATE_ATTRIBUTE_VALUE_UQ,
	STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
	STATE_AFTER_ATTRIBUTE_VALUE_Q,
	STATE_SELF_CLOSING_START_TAG,
	STATE_BOGUS_COMMENT,
	STATE_MARKUP_DECLARATION_OPEN,
	STATE_MATCH_COMMENT,
	STATE_COMMENT_START,
	STATE_COMMENT_START_DASH,
	STATE_COMMENT,
	STATE_COMMENT_END_DASH,
	STATE_COMMENT_END,
	STATE_MATCH_DOCTYPE,
	STATE_DOCTYPE,
	STATE_BEFORE_DOCTYPE_NAME,
	STATE_DOCTYPE_NAME,
	STATE_AFTER_DOCTYPE_NAME,
	STATE_MATCH_PUBLIC,
	STATE_BEFORE_DOCTYPE_PUBLIC,
	STATE_DOCTYPE_PUBLIC_DQ,
	STATE_DOCTYPE_PUBLIC_SQ,
	STATE_AFTER_DOCTYPE_PUBLIC,
	STATE_MATCH_SYSTEM,
	STATE_BEFORE_DOCTYPE_SYSTEM,
	STATE_DOCTYPE_SYSTEM_DQ,
	STATE_DOCTYPE_SYSTEM_SQ,
	STATE_AFTER_DOCTYPE_SYSTEM,
	STATE_BOGUS_DOCTYPE,
	STATE_MATCH_CDATA,
	STATE_CDATA_BLOCK,
	STATE_NUMBERED_ENTITY,
	STATE_NAMED_ENTITY
} hubbub_tokeniser_state;

/**
 * Context for tokeniser
 */
typedef struct hubbub_tokeniser_context {
	size_t pending;				/**< Count of pending chars */

	hubbub_string current_comment;		/**< Current comment text */

	hubbub_token_type current_tag_type;	/**< Type of current_tag */
	hubbub_tag current_tag;			/**< Current tag */
	hubbub_doctype current_doctype;		/**< Current doctype */
	hubbub_tokeniser_state prev_state;	/**< Previous state */

	uint8_t last_start_tag_name[10];	/**< Name of the last start tag
						 * emitted */
	size_t last_start_tag_len;		/**< Length of last start tag */

	struct {
		uint32_t count;
		bool match;
	} close_tag_match;			/**< State for matching close
						 * tags */

	struct {
		uint32_t count;			/**< Index into "DOCTYPE" */
	} match_doctype;			/**< State for matching doctype */

	struct {
		uint32_t count;			/**< Index into "[CDATA[" */
		uint32_t end;			/**< Index into "]]>" */
	} match_cdata;				/**< State for matching cdata */

	struct {
		size_t offset;			/**< Offset in buffer */
		uint32_t length;		/**< Length of entity */
		uint32_t codepoint;		/**< UCS4 codepoint */
		bool complete;			/**< True if match complete */

		uint32_t poss_length;		/**< Optimistic length
						 * when matching named
						 * character references */
		uint8_t base;			/**< Base for numeric
						 * entities */

		void *context;			/**< Context for named
						 * entity search */

		size_t prev_len;		/**< Previous byte length
						 * of str */

		bool had_data;			/**< Whether we read
						 * anything after &#(x)? */

		bool overflow;			/**< Whether this entity
						 * has overflowed the maximum
						 * numeric entity value */

		hubbub_tokeniser_state return_state;	/**< State we were
							 * called from */
	} match_entity;				/**< Entity matching state */

	struct {
		uint32_t line;			/**< Current line of input */
		uint32_t col;			/**< Current character in
						 * line */
	} position;				/**< Position in source data */

	uint32_t allowed_char;			/**< Used for quote matching */
} hubbub_tokeniser_context;

/**
 * Tokeniser data structure
 */
struct hubbub_tokeniser {
	hubbub_tokeniser_state state;	/**< Current tokeniser state */
	hubbub_content_model content_model;	/**< Current content
						 * model flag */
	bool escape_flag;		/**< Escape flag */
	bool process_cdata_section;	/**< Whether to process CDATA sections */

	parserutils_inputstream *input;	/**< Input stream */
	parserutils_buffer *buffer;	/**< Input buffer */

	hubbub_tokeniser_context context;	/**< Tokeniser context */

	hubbub_token_handler token_handler;	/**< Token handling callback */
	void *token_pw;				/**< Token handler data */

	hubbub_error_handler error_handler;	/**< Error handling callback */
	void *error_pw;				/**< Error handler data */

	hubbub_allocator_fn alloc;	/**< Memory (de)allocation function */
	void *alloc_pw;			/**< Client private data */
};

static hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_close_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_cdata(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_cdata_block(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t off);
static hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_named_entity(
		hubbub_tokeniser *tokeniser);

static inline hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
		const hubbub_string *chars);
static inline hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
		bool force_quirks);
static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token);

/**
 * Create a hubbub tokeniser
 *
 * \param input      Input stream instance
 * \param alloc      Memory (de)allocation function
 * \param pw         Pointer to client-specific private data (may be NULL)
 * \param tokeniser  Pointer to location to receive tokeniser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion
 */
hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input,
		hubbub_allocator_fn alloc, void *pw,
		hubbub_tokeniser **tokeniser)
{
	parserutils_error perror;
	hubbub_tokeniser *tok;

	if (input == NULL || alloc == NULL || tokeniser == NULL)
		return HUBBUB_BADPARM;

	tok = alloc(NULL, sizeof(hubbub_tokeniser), pw);
	if (tok == NULL)
		return HUBBUB_NOMEM;

	perror = parserutils_buffer_create(alloc, pw, &tok->buffer);
	if (perror != PARSERUTILS_OK) {
		alloc(tok, 0, pw);
		return hubbub_error_from_parserutils_error(perror);
	}

	tok->state = STATE_DATA;
	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;

	tok->escape_flag = false;
	tok->process_cdata_section = false;

	tok->input = input;

	tok->token_handler = NULL;
	tok->token_pw = NULL;

	tok->error_handler = NULL;
	tok->error_pw = NULL;

	tok->alloc = alloc;
	tok->alloc_pw = pw;

	memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));

	*tokeniser = tok;

	return HUBBUB_OK;
}
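
/*
 * Illustrative usage sketch (not part of the original file). The allocator
 * must follow realloc semantics, which the create/destroy paths above rely
 * on: alloc(NULL, size, pw) allocates, alloc(ptr, 0, pw) frees. The input
 * stream setup is an assumption based on libparserutils' documented
 * parserutils_inputstream_create() interface; check the signature of the
 * version you build against.
 */
#if 0
#include <stdlib.h> /* for realloc; not included by this file */

/* realloc-style allocator matching hubbub_allocator_fn */
static void *my_alloc(void *ptr, size_t size, void *pw)
{
	(void) pw;
	return realloc(ptr, size);
}

static void example_setup(void)
{
	parserutils_inputstream *stream;
	hubbub_tokeniser *tok;

	/* Assumed libparserutils call: create a UTF-8 input stream */
	parserutils_inputstream_create("UTF-8", 0, NULL,
			my_alloc, NULL, &stream);

	if (hubbub_tokeniser_create(stream, my_alloc, NULL, &tok) ==
			HUBBUB_OK) {
		/* ... feed data and call hubbub_tokeniser_run() ... */
		hubbub_tokeniser_destroy(tok);
	}

	parserutils_inputstream_destroy(stream);
}
#endif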
/**
 * Destroy a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to destroy
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
{
	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->context.current_tag.attributes != NULL) {
		tokeniser->alloc(tokeniser->context.current_tag.attributes,
				0, tokeniser->alloc_pw);
	}

	parserutils_buffer_destroy(tokeniser->buffer);

	tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw);

	return HUBBUB_OK;
}

/**
 * Configure a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to configure
 * \param type       The option type to set
 * \param params     Option-specific parameters
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
		hubbub_tokeniser_opttype type,
		hubbub_tokeniser_optparams *params)
{
	if (tokeniser == NULL || params == NULL)
		return HUBBUB_BADPARM;

	switch (type) {
	case HUBBUB_TOKENISER_TOKEN_HANDLER:
		tokeniser->token_handler = params->token_handler.handler;
		tokeniser->token_pw = params->token_handler.pw;
		break;
	case HUBBUB_TOKENISER_ERROR_HANDLER:
		tokeniser->error_handler = params->error_handler.handler;
		tokeniser->error_pw = params->error_handler.pw;
		break;
	case HUBBUB_TOKENISER_CONTENT_MODEL:
		tokeniser->content_model = params->content_model.model;
		break;
	case HUBBUB_TOKENISER_PROCESS_CDATA:
		tokeniser->process_cdata_section = params->process_cdata;
		break;
	}

	return HUBBUB_OK;
}
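
/*
 * Illustrative sketch (not part of the original file): registering a token
 * handler through hubbub_tokeniser_setopt(). The callback name and body are
 * hypothetical, and the hubbub_error return type shown for the handler is an
 * assumption based on hubbub's hubbub_token_handler typedef in later
 * releases; the option plumbing itself matches the switch above.
 */
#if 0
static hubbub_error print_token(const hubbub_token *token, void *pw)
{
	(void) pw;

	if (token->type == HUBBUB_TOKEN_CHARACTER) {
		printf("chars: %.*s\n", (int) token->data.character.len,
				token->data.character.ptr);
	}

	return HUBBUB_OK;
}

static void example_register_handler(hubbub_tokeniser *tok)
{
	hubbub_tokeniser_optparams params;

	params.token_handler.handler = print_token;
	params.token_handler.pw = NULL;

	hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER, &params);
}
#endif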
/**
 * Process remaining data in the input stream
 *
 * \param tokeniser  The tokeniser instance to invoke
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
{
	hubbub_error cont = HUBBUB_OK;

	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

#if 0
#define state(x) \
		case x: \
			printf( #x "\n");
#else
#define state(x) \
		case x:
#endif

	while (cont == HUBBUB_OK) {
		switch (tokeniser->state) {
		state(STATE_DATA)
			cont = hubbub_tokeniser_handle_data(tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_DATA)
			cont = hubbub_tokeniser_handle_character_reference_data(
					tokeniser);
			break;
		state(STATE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_tag_open(tokeniser);
			break;
		state(STATE_CLOSE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_close_tag_open(
					tokeniser);
			break;
		state(STATE_TAG_NAME)
			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_before_attribute_name(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_attribute_name(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_after_attribute_name(
					tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_before_attribute_value(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_DQ)
			cont = hubbub_tokeniser_handle_attribute_value_dq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_SQ)
			cont = hubbub_tokeniser_handle_attribute_value_sq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_UQ)
			cont = hubbub_tokeniser_handle_attribute_value_uq(
					tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_character_reference_in_attribute_value(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_VALUE_Q)
			cont = hubbub_tokeniser_handle_after_attribute_value_q(
					tokeniser);
			break;
		state(STATE_SELF_CLOSING_START_TAG)
			cont = hubbub_tokeniser_handle_self_closing_start_tag(
					tokeniser);
			break;
		state(STATE_BOGUS_COMMENT)
			cont = hubbub_tokeniser_handle_bogus_comment(
					tokeniser);
			break;
		state(STATE_MARKUP_DECLARATION_OPEN)
			cont = hubbub_tokeniser_handle_markup_declaration_open(
					tokeniser);
			break;
		state(STATE_MATCH_COMMENT)
			cont = hubbub_tokeniser_handle_match_comment(
					tokeniser);
			break;
		case STATE_COMMENT_START:
		case STATE_COMMENT_START_DASH:
		case STATE_COMMENT:
		case STATE_COMMENT_END_DASH:
		case STATE_COMMENT_END:
			cont = hubbub_tokeniser_handle_comment(tokeniser);
			break;
		state(STATE_MATCH_DOCTYPE)
			cont = hubbub_tokeniser_handle_match_doctype(
					tokeniser);
			break;
		state(STATE_DOCTYPE)
			cont = hubbub_tokeniser_handle_doctype(tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_before_doctype_name(
					tokeniser);
			break;
		state(STATE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_doctype_name(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_after_doctype_name(
					tokeniser);
			break;
		state(STATE_MATCH_PUBLIC)
			cont = hubbub_tokeniser_handle_match_public(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_before_doctype_public(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_DQ)
			cont = hubbub_tokeniser_handle_doctype_public_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_SQ)
			cont = hubbub_tokeniser_handle_doctype_public_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_after_doctype_public(
					tokeniser);
			break;
		state(STATE_MATCH_SYSTEM)
			cont = hubbub_tokeniser_handle_match_system(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_before_doctype_system(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_DQ)
			cont = hubbub_tokeniser_handle_doctype_system_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_SQ)
			cont = hubbub_tokeniser_handle_doctype_system_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_after_doctype_system(
					tokeniser);
			break;
		state(STATE_BOGUS_DOCTYPE)
			cont = hubbub_tokeniser_handle_bogus_doctype(
					tokeniser);
			break;
		state(STATE_MATCH_CDATA)
			cont = hubbub_tokeniser_handle_match_cdata(
					tokeniser);
			break;
		state(STATE_CDATA_BLOCK)
			cont = hubbub_tokeniser_handle_cdata_block(
					tokeniser);
			break;
		state(STATE_NUMBERED_ENTITY)
			cont = hubbub_tokeniser_handle_numbered_entity(
					tokeniser);
			break;
		state(STATE_NAMED_ENTITY)
			cont = hubbub_tokeniser_handle_named_entity(
					tokeniser);
			break;
		}
	}

	return (cont == HUBBUB_NEEDDATA) ? HUBBUB_OK : cont;
}
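
/*
 * Illustrative sketch (not part of the original file): a typical feed loop
 * around hubbub_tokeniser_run(). Data is appended to the input stream and
 * run() is invoked after each chunk; run() maps HUBBUB_NEEDDATA to HUBBUB_OK
 * when it pauses for more input, as seen in the return above. The
 * parserutils_inputstream_append() call, and appending a NULL chunk to mark
 * end of input, are assumptions based on libparserutils' interface.
 */
#if 0
static void example_feed(hubbub_tokeniser *tok,
		parserutils_inputstream *stream,
		const uint8_t *chunk, size_t len, bool last)
{
	parserutils_inputstream_append(stream, chunk, len);

	if (last) {
		/* Signal EOF so the tokeniser can emit HUBBUB_TOKEN_EOF */
		parserutils_inputstream_append(stream, NULL, 0);
	}

	hubbub_tokeniser_run(tok);
}
#endif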
/**
 * Various macros for manipulating buffers.
 *
 * \todo make some of these inline functions (type-safety)
 * \todo document them properly here
 */
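/*
 * Summary of the macros below (added documentation, derived from their
 * bodies):
 * START_BUF(str, cptr, length)  - append bytes to the tokeniser buffer and
 *                                 begin a new string of that length.
 * COLLECT(str, cptr, length)    - append bytes to an already-started string
 *                                 (asserts the string is non-empty).
 * COLLECT_MS(str, cptr, length) - as COLLECT, but permits an empty string.
 * All three return from the enclosing function on allocation failure.
 */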
#define START_BUF(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len = (length); \
	} while (0)

#define COLLECT(str, cptr, length) \
	do { \
		parserutils_error perror; \
		assert(str.len != 0); \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

#define COLLECT_MS(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

/* this should always be called with an empty "chars" buffer */
hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
{
	parserutils_error error;
	hubbub_token token;
	const uint8_t *cptr;
	size_t len;

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len)) ==
					PARSERUTILS_OK) {
		const uint8_t c = *cptr;

		if (c == '&' &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_PCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA) &&
				tokeniser->escape_flag == false) {
			tokeniser->state =
					STATE_CHARACTER_REFERENCE_DATA;
			/* Don't eat the '&'; it'll be handled by entity
			 * consumption */
			break;
		} else if (c == '-' &&
				tokeniser->escape_flag == false &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->context.pending >= 3) {
			size_t ignore;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 3,
					&cptr,
					&ignore);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *)cptr,
					"<!--", SLEN("<!--")) == 0) {
				tokeniser->escape_flag = true;
			}

			tokeniser->context.pending += len;
		} else if (c == '<' && (tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_PCDATA ||
					((tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
					tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
					tokeniser->escape_flag == false))) {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Buffer '<' */
			tokeniser->context.pending = len;

			tokeniser->state = STATE_TAG_OPEN;
			break;
		} else if (c == '>' && tokeniser->escape_flag == true &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA)) {
			/* no need to check that there are enough characters,
			 * since you can only run into this if the flag is
			 * true in the first place, which requires four
			 * characters. */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 2,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr, "-->", SLEN("-->")) == 0) {
				tokeniser->escape_flag = false;
			}

			tokeniser->context.pending += len;
		} else if (c == '\0') {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Emit a replacement character */
			emit_character_token(tokeniser, &u_fffd_str);

			/* Advance past NUL */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else if (c == '\r') {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				break;
			}

			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			if (error == PARSERUTILS_EOF || *cptr != '\n') {
				/* Emit newline */
				emit_character_token(tokeniser, &lf_str);
			}

			/* Advance over */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else {
			/* Just collect into buffer */
			tokeniser->context.pending += len;
		}
	}

	if (tokeniser->state != STATE_TAG_OPEN &&
			(tokeniser->state != STATE_DATA ||
					error == PARSERUTILS_EOF) &&
			tokeniser->context.pending > 0) {
		/* Emit any pending characters */
		emit_current_chars(tokeniser);
	}

	if (error == PARSERUTILS_EOF) {
		token.type = HUBBUB_TOKEN_EOF;
		hubbub_tokeniser_emit_token(tokeniser, &token);
	}

	if (error == PARSERUTILS_EOF) {
		return HUBBUB_NEEDDATA;
	} else {
		return hubbub_error_from_parserutils_error(error);
	}
}

/* emit any pending tokens before calling */
hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser)
{
	assert(tokeniser->context.pending == 0);

	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_token token;

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		token.type = HUBBUB_TOKEN_CHARACTER;

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
					tokeniser->context.match_entity.codepoint,
					&utf8ptr, &len);

			token.data.character.ptr = utf8;
			token.data.character.len = sizeof(utf8) - len;

			hubbub_tokeniser_emit_token(tokeniser, &token);

			/* +1 for ampersand */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.match_entity.length
							+ 1);
		} else {
			parserutils_error error;
			const uint8_t *cptr = NULL;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);
			assert(error == PARSERUTILS_OK);

			token.data.character.ptr = cptr;
			token.data.character.len = len;

			hubbub_tokeniser_emit_token(tokeniser, &token);
			parserutils_inputstream_advance(tokeniser->input, len);
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		tokeniser->state = STATE_DATA;
	}

	return HUBBUB_OK;
}

/* this state always switches to another state straight away */
/* this state expects the current character to be '<' */
hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 1);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '/') {
		tokeniser->context.pending += len;

		tokeniser->context.close_tag_match.match = false;
		tokeniser->context.close_tag_match.count = 0;

		tokeniser->state = STATE_CLOSE_TAG_OPEN;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		/* Return to data state with '<' still in "chars" */
		tokeniser->state = STATE_DATA;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
		if (c == '!') {
			parserutils_inputstream_advance(tokeniser->input,
					SLEN("<!"));

			tokeniser->context.pending = 0;
			tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
		} else if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);

			START_BUF(ctag->name, &lc, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(ctag->name, cptr, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/** \todo parse error */

			tokeniser->context.pending += len;
			tokeniser->state = STATE_DATA;
		} else if (c == '?') {
			/** \todo parse error */

			/* Cursor still at "<", need to advance past it */
			parserutils_inputstream_advance(
					tokeniser->input, SLEN("<"));
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		} else {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.chars to be "</" */
/* this state never stays in this state for more than one character */
hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 2);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
/*	assert(tokeniser->context.chars.ptr[1] == '/'); */

	/**\todo fragment case */

	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		uint8_t *start_tag_name =
				tokeniser->context.last_start_tag_name;
		size_t start_tag_len =
				tokeniser->context.last_start_tag_len;

		while ((error = parserutils_inputstream_peek(tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len)) == PARSERUTILS_OK) {
			c = *cptr;

			if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
					!= (c & ~0x20)) {
				break;
			}

			ctx->close_tag_match.count += len;

			if (ctx->close_tag_match.count == start_tag_len) {
				ctx->close_tag_match.match = true;
				break;
			}
		}

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (ctx->close_tag_match.match == true) {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error != PARSERUTILS_EOF) {
				c = *cptr;

				if (c != '\t' && c != '\n' && c != '\f' &&
						c != ' ' && c != '>' &&
						c != '/') {
					ctx->close_tag_match.match = false;
				}
			}
		}
	}

	if (ctx->close_tag_match.match == false &&
			tokeniser->content_model !=
					HUBBUB_CONTENT_MODEL_PCDATA) {
		/* We should emit "</" here, but instead we leave it in the
		 * buffer so the data state emits it with any characters
		 * following it */
		tokeniser->state = STATE_DATA;
	} else {
		error = parserutils_inputstream_peek(tokeniser->input,
				tokeniser->context.pending, &cptr, &len);

		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */

			/* Return to data state with "</" pending */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else if (error != PARSERUTILS_OK) {
			return hubbub_error_from_parserutils_error(error);
		}

		c = *cptr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(tokeniser->context.current_tag.name,
					&lc, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(tokeniser->context.current_tag.name,
					cptr, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/* Cursor still at "</", need to collect ">" */
			tokeniser->context.pending += len;

			/* Now need to advance past "</>" */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			/** \todo parse error */
			tokeniser->state = STATE_DATA;
		} else {
			/** \todo parse error */

			/* Cursor still at "</", need to advance past it */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		}
	}

	return HUBBUB_OK;
}
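
/*
 * Illustrative note (not part of the original file): the close-tag matcher
 * above compares bytes with (c & ~0x20), which folds ASCII letter case by
 * clearing bit 0x20 ('A' is 0x41, 'a' is 0x61). A minimal sketch of the
 * same idea, with a hypothetical helper name:
 */
#if 0
static bool ascii_caseless_eq(uint8_t a, uint8_t b)
{
	/* Only valid when at least one side is known to be a letter,
	 * as is the case for the recorded start tag name used above. */
	return (a & ~0x20) == (b & ~0x20);
}
/* ascii_caseless_eq('D', 'd') is true: 0x44 == (0x64 & ~0x20) */
#endif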
  907. /* this state expects tokeniser->context.current_tag to already have its
  908. first character set */
  909. hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
  910. {
  911. hubbub_tag *ctag = &tokeniser->context.current_tag;
  912. size_t len;
  913. const uint8_t *cptr;
  914. parserutils_error error;
  915. uint8_t c;
  916. assert(tokeniser->context.pending > 0);
  917. /* assert(tokeniser->context.chars.ptr[0] == '<'); */
  918. assert(ctag->name.len > 0);
  919. /* assert(ctag->name.ptr); */
  920. error = parserutils_inputstream_peek(tokeniser->input,
  921. tokeniser->context.pending, &cptr, &len);
  922. if (error != PARSERUTILS_OK) {
  923. if (error == PARSERUTILS_EOF) {
  924. tokeniser->state = STATE_DATA;
  925. return emit_current_tag(tokeniser);
  926. } else {
  927. return hubbub_error_from_parserutils_error(error);
  928. }
  929. }
  930. c = *cptr;
  931. if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
  932. tokeniser->context.pending += len;
  933. tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
  934. } else if (c == '>') {
  935. tokeniser->context.pending += len;
  936. tokeniser->state = STATE_DATA;
  937. return emit_current_tag(tokeniser);
  938. } else if (c == '\0') {
  939. COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
  940. tokeniser->context.pending += len;
  941. } else if (c == '/') {
  942. tokeniser->context.pending += len;
  943. tokeniser->state = STATE_SELF_CLOSING_START_TAG;
  944. } else if ('A' <= c && c <= 'Z') {
  945. uint8_t lc = (c + 0x20);
  946. COLLECT(ctag->name, &lc, len);
  947. tokeniser->context.pending += len;
  948. } else {
  949. COLLECT(ctag->name, cptr, len);
  950. tokeniser->context.pending += len;
  951. }
  952. return HUBBUB_OK;
  953. }
  954. hubbub_error hubbub_tokeniser_handle_before_attribute_name(
  955. hubbub_tokeniser *tokeniser)
  956. {
  957. hubbub_tag *ctag = &tokeniser->context.current_tag;
  958. size_t len;
  959. const uint8_t *cptr;
  960. parserutils_error error;
  961. uint8_t c;
  962. error = parserutils_inputstream_peek(tokeniser->input,
  963. tokeniser->context.pending, &cptr, &len);
  964. if (error != PARSERUTILS_OK) {
  965. if (error == PARSERUTILS_EOF) {
  966. tokeniser->state = STATE_DATA;
  967. return emit_current_tag(tokeniser);
  968. } else {
  969. return hubbub_error_from_parserutils_error(error);
  970. }
  971. }
  972. c = *cptr;
  973. if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
  974. /* pass over in silence */
  975. tokeniser->context.pending += len;
  976. } else if (c == '>') {
  977. tokeniser->context.pending += len;
  978. tokeniser->state = STATE_DATA;
  979. return emit_current_tag(tokeniser);
  980. } else if (c == '/') {
  981. tokeniser->context.pending += len;
  982. tokeniser->state = STATE_SELF_CLOSING_START_TAG;
  983. } else {
  984. hubbub_attribute *attr;
  985. if (c == '"' || c == '\'' || c == '=') {
  986. /** \todo parse error */
  987. }
  988. attr = tokeniser->alloc(ctag->attributes,
  989. (ctag->n_attributes + 1) *
  990. sizeof(hubbub_attribute),
  991. tokeniser->alloc_pw);
  992. if (attr == NULL)
  993. return HUBBUB_NOMEM;
  994. ctag->attributes = attr;
  995. if ('A' <= c && c <= 'Z') {
  996. uint8_t lc = (c + 0x20);
  997. START_BUF(attr[ctag->n_attributes].name, &lc, len);
  998. } else if (c == '\0') {
  999. START_BUF(attr[ctag->n_attributes].name,
  1000. u_fffd, sizeof(u_fffd));
  1001. } else {
  1002. START_BUF(attr[ctag->n_attributes].name, cptr, len);
  1003. }
  1004. attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
  1005. attr[ctag->n_attributes].value.ptr = NULL;
  1006. attr[ctag->n_attributes].value.len = 0;
  1007. ctag->n_attributes++;
  1008. tokeniser->context.pending += len;
  1009. tokeniser->state = STATE_ATTRIBUTE_NAME;
  1010. }
  1011. return HUBBUB_OK;
  1012. }
  1013. hubbub_error hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
  1014. {
  1015. hubbub_tag *ctag = &tokeniser->context.current_tag;
  1016. size_t len;
  1017. const uint8_t *cptr;
  1018. parserutils_error error;
  1019. uint8_t c;
  1020. assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0);
  1021. error = parserutils_inputstream_peek(tokeniser->input,
  1022. tokeniser->context.pending, &cptr, &len);
  1023. if (error != PARSERUTILS_OK) {
  1024. if (error == PARSERUTILS_EOF) {
  1025. tokeniser->state = STATE_DATA;
  1026. return emit_current_tag(tokeniser);
  1027. } else {
  1028. return hubbub_error_from_parserutils_error(error);
  1029. }
  1030. }
  1031. c = *cptr;
  1032. if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
  1033. tokeniser->context.pending += len;
  1034. tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME;
  1035. } else if (c == '=') {
  1036. tokeniser->context.pending += len;
  1037. tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
  1038. } else if (c == '>') {
  1039. tokeniser->context.pending += len;
  1040. tokeniser->state = STATE_DATA;
  1041. return emit_current_tag(tokeniser);
  1042. } else if (c == '/') {
  1043. tokeniser->context.pending += len;
  1044. tokeniser->state = STATE_SELF_CLOSING_START_TAG;
  1045. } else if (c == '\0') {
  1046. COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
  1047. u_fffd, sizeof(u_fffd));
  1048. tokeniser->context.pending += len;
  1049. } else if ('A' <= c && c <= 'Z') {
  1050. uint8_t lc = (c + 0x20);
  1051. COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
  1052. &lc, len);
  1053. tokeniser->context.pending += len;
  1054. } else {
  1055. COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
  1056. cptr, len);
  1057. tokeniser->context.pending += len;
  1058. }
  1059. return HUBBUB_OK;
  1060. }
  1061. hubbub_error hubbub_tokeniser_handle_after_attribute_name(
  1062. hubbub_tokeniser *tokeniser)
  1063. {
  1064. hubbub_tag *ctag = &tokeniser->context.current_tag;
  1065. size_t len;
  1066. const uint8_t *cptr;
  1067. parserutils_error error;
  1068. uint8_t c;
  1069. error = parserutils_inputstream_peek(tokeniser->input,
  1070. tokeniser->context.pending, &cptr, &len);
  1071. if (error != PARSERUTILS_OK) {
  1072. if (error == PARSERUTILS_EOF) {
  1073. tokeniser->state = STATE_DATA;
  1074. return emit_current_tag(tokeniser);
  1075. } else {
  1076. return hubbub_error_from_parserutils_error(error);
  1077. }
  1078. }
  1079. c = *cptr;
  1080. if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
  1081. tokeniser->context.pending += len;
  1082. } else if (c == '=') {
  1083. tokeniser->context.pending += len;
  1084. tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
  1085. } else if (c == '>') {
  1086. tokeniser->context.pending += len;
  1087. tokeniser->state = STATE_DATA;
  1088. return emit_current_tag(tokeniser);
  1089. } else if (c == '/') {
  1090. tokeniser->context.pending += len;
  1091. tokeniser->state = STATE_SELF_CLOSING_START_TAG;
  1092. } else {
  1093. hubbub_attribute *attr;
  1094. if (c == '"' || c == '\'') {
  1095. /** \todo parse error */
  1096. }
  1097. attr = tokeniser->alloc(ctag->attributes,
  1098. (ctag->n_attributes + 1) *
  1099. sizeof(hubbub_attribute),
  1100. tokeniser->alloc_pw);
  1101. if (attr == NULL)
  1102. return HUBBUB_NOMEM;
  1103. ctag->attributes = attr;
  1104. if ('A' <= c && c <= 'Z') {
  1105. uint8_t lc = (c + 0x20);
  1106. START_BUF(attr[ctag->n_attributes].name, &lc, len);
  1107. } else if (c == '\0') {
  1108. START_BUF(attr[ctag->n_attributes].name,
  1109. u_fffd, sizeof(u_fffd));
  1110. } else {
  1111. START_BUF(attr[ctag->n_attributes].name, cptr, len);
  1112. }
  1113. attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
  1114. attr[ctag->n_attributes].value.ptr = NULL;
  1115. attr[ctag->n_attributes].value.len = 0;
  1116. ctag->n_attributes++;
  1117. tokeniser->context.pending += len;
  1118. tokeniser->state = STATE_ATTRIBUTE_NAME;
  1119. }
  1120. return HUBBUB_OK;
  1121. }
  1122. /* this state is only ever triggered by an '=' */
  1123. hubbub_error hubbub_tokeniser_handle_before_attribute_value(
  1124. hubbub_tokeniser *tokeniser)
  1125. {
  1126. hubbub_tag *ctag = &tokeniser->context.current_tag;
  1127. size_t len;
  1128. const uint8_t *cptr;
  1129. parserutils_error error;
  1130. uint8_t c;
  1131. error = parserutils_inputstream_peek(tokeniser->input,
  1132. tokeniser->context.pending, &cptr, &len);
  1133. if (error != PARSERUTILS_OK) {
  1134. if (error == PARSERUTILS_EOF) {
  1135. /** \todo parse error */
  1136. tokeniser->state = STATE_DATA;
  1137. return emit_current_tag(tokeniser);
  1138. } else {
  1139. return hubbub_error_from_parserutils_error(error);
  1140. }
  1141. }
  1142. c = *cptr;
  1143. if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
  1144. tokeniser->context.pending += len;
  1145. } else if (c == '"') {
  1146. tokeniser->context.pending += len;
  1147. tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ;
  1148. } else if (c == '&') {
  1149. /* Don't consume the '&' -- reprocess in UQ state */
  1150. tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
  1151. } else if (c == '\'') {
  1152. tokeniser->context.pending += len;
  1153. tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ;
  1154. } else if (c == '>') {
  1155. /** \todo parse error */
  1156. tokeniser->context.pending += len;
  1157. tokeniser->state = STATE_DATA;
  1158. return emit_current_tag(tokeniser);
  1159. } else if (c == '\0') {
  1160. START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
  1161. u_fffd, sizeof(u_fffd));
  1162. tokeniser->context.pending += len;
  1163. tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
  1164. } else {
  1165. if (c == '=') {
  1166. /** \todo parse error */
  1167. }
  1168. START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
  1169. cptr, len);
  1170. tokeniser->context.pending += len;
  1171. tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
  1172. }
  1173. return HUBBUB_OK;
  1174. }
  1175. hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
  1176. hubbub_tokeniser *tokeniser)
  1177. {
  1178. hubbub_tag *ctag = &tokeniser->context.current_tag;
  1179. size_t len;
  1180. const uint8_t *cptr;
  1181. parserutils_error error;
  1182. uint8_t c;
  1183. error = parserutils_inputstream_peek(tokeniser->input,
  1184. tokeniser->context.pending, &cptr, &len);
  1185. if (error != PARSERUTILS_OK) {
  1186. if (error == PARSERUTILS_EOF) {
  1187. tokeniser->state = STATE_DATA;
  1188. return emit_current_tag(tokeniser);
  1189. } else {
  1190. return hubbub_error_from_parserutils_error(error);
  1191. }
  1192. }
  1193. c = *cptr;
  1194. if (c == '"') {
  1195. tokeniser->context.pending += len;
  1196. tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
  1197. } else if (c == '&') {
  1198. tokeniser->context.prev_state = tokeniser->state;
  1199. tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
  1200. tokeniser->context.allowed_char = '"';
  1201. /* Don't eat the '&'; it'll be handled by entity consumption */
  1202. } else if (c == '\0') {
  1203. COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
  1204. u_fffd, sizeof(u_fffd));
  1205. tokeniser->context.pending += len;
  1206. } else if (c == '\r') {
  1207. error = parserutils_inputstream_peek(
  1208. tokeniser->input,
  1209. tokeniser->context.pending + len,
  1210. &cptr,
  1211. &len);
  1212. if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
  1213. return hubbub_error_from_parserutils_error(error);
  1214. } else if (error == PARSERUTILS_EOF || *cptr != '\n') {
  1215. COLLECT_MS(ctag->attributes[
  1216. ctag->n_attributes - 1].value,
  1217. &lf, sizeof(lf));
  1218. }
  1219. /* Consume '\r' */
  1220. tokeniser->context.pending += 1;
  1221. } else {
  1222. COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
  1223. cptr, len);
  1224. tokeniser->context.pending += len;
  1225. }
  1226. return HUBBUB_OK;
  1227. }
  1228. hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
  1229. hubbub_tokeniser *tokeniser)
  1230. {
  1231. hubbub_tag *ctag = &tokeniser->context.current_tag;
  1232. size_t len;
  1233. const uint8_t *cptr;
  1234. parserutils_error error;
  1235. uint8_t c;
  1236. error = parserutils_inputstream_peek(tokeniser->input,
  1237. tokeniser->context.pending, &cptr, &len);
  1238. if (error != PARSERUTILS_OK) {
  1239. if (error == PARSERUTILS_EOF) {
  1240. tokeniser->state = STATE_DATA;
  1241. return emit_current_tag(tokeniser);
  1242. } else {
  1243. return hubbub_error_from_parserutils_error(error);
  1244. }
  1245. }
  1246. c = *cptr;
  1247. if (c == '\'') {
  1248. tokeniser->context.pending += len;
  1249. tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
  1250. } else if (c == '&') {
  1251. tokeniser->context.prev_state = tokeniser->state;
  1252. tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
  1253. tokeniser->context.allowed_char = '\'';
  1254. /* Don't eat the '&'; it'll be handled by entity consumption */
  1255. } else if (c == '\0') {
  1256. COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
  1257. u_fffd, sizeof(u_fffd));
  1258. tokeniser->context.pending += len;
  1259. } else if (c == '\r') {
  1260. error = parserutils_inputstream_peek(
  1261. tokeniser->input,
  1262. tokeniser->context.pending + len,
  1263. &cptr,
  1264. &len);
  1265. if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
  1266. return hubbub_error_from_parserutils_error(error);
  1267. } else if (error == PARSERUTILS_EOF || *cptr != '\n') {
  1268. COLLECT_MS(ctag->attributes[
  1269. ctag->n_attributes - 1].value,
  1270. &lf, sizeof(lf));
  1271. }
  1272. /* Consume \r */
  1273. tokeniser->context.pending += 1;
  1274. } else {
  1275. COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
  1276. cptr, len);
  1277. tokeniser->context.pending += len;
  1278. }
  1279. return HUBBUB_OK;
  1280. }
  1281. hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
  1282. hubbub_tokeniser *tokeniser)
  1283. {
  1284. hubbub_tag *ctag = &tokeniser->context.current_tag;
  1285. uint8_t c;
  1286. size_t len;
  1287. const uint8_t *cptr;
  1288. parserutils_error error;
  1289. error = parserutils_inputstream_peek(tokeniser->input,
  1290. tokeniser->context.pending, &cptr, &len);
  1291. if (error != PARSERUTILS_OK) {
  1292. if (error == PARSERUTILS_EOF) {
  1293. tokeniser->state = STATE_DATA;
  1294. return emit_current_tag(tokeniser);
  1295. } else {
  1296. return hubbub_error_from_parserutils_error(error);
  1297. }
  1298. }
  1299. c = *cptr;
  1300. assert(c == '&' ||
  1301. ctag->attributes[ctag->n_attributes - 1].value.len >= 1);
  1302. if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
  1303. tokeniser->context.pending += len;
  1304. tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
  1305. } else if (c == '&') {
  1306. tokeniser->context.prev_state = tokeniser->state;
  1307. tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
  1308. /* Don't eat the '&'; it'll be handled by entity consumption */
  1309. } else if (c == '>') {
  1310. tokeniser->context.pending += len;
  1311. tokeniser->state = STATE_DATA;
  1312. return emit_current_tag(tokeniser);
  1313. } else if (c == '\0') {
  1314. COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
  1315. u_fffd, sizeof(u_fffd));
  1316. tokeniser->context.pending += len;
  1317. } else {
  1318. if (c == '"' || c == '\'' || c == '=') {
  1319. /** \todo parse error */
  1320. }
  1321. COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
  1322. cptr, len);
  1323. tokeniser->context.pending += len;
  1324. }
  1325. return HUBBUB_OK;
  1326. }
  1327. hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
  1328. hubbub_tokeniser *tokeniser)
  1329. {
  1330. if (tokeniser->context.match_entity.complete == false) {
  1331. return hubbub_tokeniser_consume_character_reference(tokeniser,
  1332. tokeniser->context.pending);
  1333. } else {
  1334. hubbub_tag *ctag = &tokeniser->context.current_tag;
  1335. hubbub_attribute *attr = &ctag->attributes[
  1336. ctag->n_attributes - 1];
  1337. uint8_t utf8[6];
  1338. uint8_t *utf8ptr = utf8;
  1339. size_t len = sizeof(utf8);
  1340. if (tokeniser->context.match_entity.codepoint) {
  1341. parserutils_charset_utf8_from_ucs4(
  1342. tokeniser->context.match_entity.codepoint,
  1343. &utf8ptr, &len);
  1344. COLLECT_MS(attr->value, utf8, sizeof(utf8) - len);
  1345. /* +1 for the ampersand */
  1346. tokeniser->context.pending +=
  1347. tokeniser->context.match_entity.length
  1348. + 1;
  1349. } else {
  1350. size_t len = 0;
  1351. const uint8_t *cptr = NULL;
  1352. parserutils_error error;
  1353. error = parserutils_inputstream_peek(
  1354. tokeniser->input,
  1355. tokeniser->context.pending,
  1356. &cptr,
  1357. &len);
  1358. assert(error == PARSERUTILS_OK);
  1359. /* Insert the ampersand */
  1360. COLLECT_MS(attr->value, cptr, len);
  1361. tokeniser->context.pending += len;
  1362. }
  1363. /* Reset for next time */
  1364. tokeniser->context.match_entity.complete = false;
  1365. /* And back to the previous state */
  1366. tokeniser->state = tokeniser->context.prev_state;
  1367. }
  1368. return HUBBUB_OK;
  1369. }
  1370. /* always switches state */
  1371. hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
  1372. hubbub_tokeniser *tokeniser)
  1373. {
  1374. size_t len;
  1375. const uint8_t *cptr;
  1376. parserutils_error error;
  1377. uint8_t c;
  1378. error = parserutils_inputstream_peek(tokeniser->input,
  1379. tokeniser->context.pending, &cptr, &len);
  1380. if (error != PARSERUTILS_OK) {
  1381. if (error == PARSERUTILS_EOF) {
  1382. tokeniser->state = STATE_DATA;
  1383. return emit_current_tag(tokeniser);
  1384. } else {
  1385. return hubbub_error_from_parserutils_error(error);
  1386. }
  1387. }
  1388. c = *cptr;
  1389. if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
  1390. tokeniser->context.pending += len;
  1391. tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
  1392. } else if (c == '>') {
  1393. tokeniser->context.pending += len;
  1394. tokeniser->state = STATE_DATA;
  1395. return emit_current_tag(tokeniser);
  1396. } else if (c == '/') {
  1397. tokeniser->context.pending += len;
  1398. tokeniser->state = STATE_SELF_CLOSING_START_TAG;
  1399. } else {
  1400. /** \todo parse error */
  1401. /* Reprocess character in before attribute name state */
  1402. tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
  1403. }
  1404. return HUBBUB_OK;
  1405. }
  1406. hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
  1407. hubbub_tokeniser *tokeniser)
  1408. {
  1409. size_t len;
  1410. const uint8_t *cptr;
  1411. parserutils_error error;
  1412. uint8_t c;
  1413. error = parserutils_inputstream_peek(tokeniser->input,
  1414. tokeniser->context.pending, &cptr, &len);
  1415. if (error != PARSERUTILS_OK) {
  1416. if (error == PARSERUTILS_EOF) {
  1417. tokeniser->state = STATE_DATA;
  1418. return emit_current_tag(tokeniser);
  1419. } else {
  1420. return hubbub_error_from_parserutils_error(error);
  1421. }
  1422. }
  1423. c = *cptr;
  1424. if (c == '>') {
  1425. tokeniser->context.pending += len;
  1426. tokeniser->state = STATE_DATA;
  1427. tokeniser->context.current_tag.self_closing = true;
  1428. return emit_current_tag(tokeniser);
  1429. } else {
  1430. /* Reprocess character in before attribute name state */
  1431. tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
  1432. }
  1433. return HUBBUB_OK;
  1434. }
  1435. /* this state expects tokeniser->context.chars to be empty on first entry */
  1436. hubbub_error hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
  1437. {
  1438. size_t len;
  1439. const uint8_t *cptr;
  1440. parserutils_error error;
  1441. uint8_t c;
  1442. error = parserutils_inputstream_peek(tokeniser->input,
  1443. tokeniser->context.pending, &cptr, &len);
  1444. if (error != PARSERUTILS_OK) {
  1445. if (error == PARSERUTILS_EOF) {
  1446. tokeniser->state = STATE_DATA;
  1447. return emit_current_comment(tokeniser);
  1448. } else {
  1449. return hubbub_error_from_parserutils_error(error);
  1450. }
  1451. }
  1452. c = *cptr;
  1453. if (c == '>') {
  1454. tokeniser->context.pending += len;
  1455. tokeniser->state = STATE_DATA;
  1456. return emit_current_comment(tokeniser);
  1457. } else if (c == '\0') {
  1458. error = parserutils_buffer_append(tokeniser->buffer,
  1459. u_fffd, sizeof(u_fffd));
  1460. if (error != PARSERUTILS_OK)
  1461. return hubbub_error_from_parserutils_error(error);
  1462. tokeniser->context.pending += len;
  1463. } else if (c == '\r') {
  1464. error = parserutils_inputstream_peek(
  1465. tokeniser->input,
  1466. tokeniser->context.pending,
  1467. &cptr,
  1468. &len);
  1469. if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
  1470. return hubbub_error_from_parserutils_error(error);
  1471. } else if (error == PARSERUTILS_EOF || *cptr != '\n') {
  1472. error = parserutils_buffer_append(tokeniser->buffer,
  1473. &lf, sizeof(lf));
  1474. if (error != PARSERUTILS_OK) {
  1475. return hubbub_error_from_parserutils_error(
  1476. error);
  1477. }
  1478. }
  1479. tokeniser->context.pending += len;
  1480. } else {
  1481. error = parserutils_buffer_append(tokeniser->buffer,
  1482. (uint8_t *) cptr, len);
  1483. if (error != PARSERUTILS_OK)
  1484. return hubbub_error_from_parserutils_error(error);
  1485. tokeniser->context.pending += len;
  1486. }
  1487. return HUBBUB_OK;
  1488. }

/* this state always switches to another state straight away */
hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
                hubbub_tokeniser *tokeniser)
{
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        assert(tokeniser->context.pending == 0);

        error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_BOGUS_COMMENT;
                        return HUBBUB_OK;
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        if (c == '-') {
                tokeniser->context.pending = len;
                tokeniser->state = STATE_MATCH_COMMENT;
        } else if ((c & ~0x20) == 'D') {
                tokeniser->context.pending = len;
                tokeniser->context.match_doctype.count = len;
                tokeniser->state = STATE_MATCH_DOCTYPE;
        } else if (tokeniser->process_cdata_section == true && c == '[') {
                tokeniser->context.pending = len;
                tokeniser->context.match_cdata.count = len;
                tokeniser->state = STATE_MATCH_CDATA;
        } else {
                tokeniser->state = STATE_BOGUS_COMMENT;
        }

        return HUBBUB_OK;
}
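
/**
 * Handle the "match comment" state: having seen "<!-", check whether a
 * second '-' follows, i.e. whether this is a real comment or a bogus one.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */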
hubbub_error hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser)
{
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->context.pending =
                                tokeniser->context.current_comment.len = 0;
                        tokeniser->state = STATE_BOGUS_COMMENT;
                        return HUBBUB_OK;
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        tokeniser->context.pending = tokeniser->context.current_comment.len = 0;

        if (*cptr == '-') {
                parserutils_inputstream_advance(tokeniser->input, SLEN("--"));
                tokeniser->state = STATE_COMMENT_START;
        } else {
                tokeniser->state = STATE_BOGUS_COMMENT;
        }

        return HUBBUB_OK;
}
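
/**
 * Handle the comment states proper. This single handler services
 * STATE_COMMENT_START, STATE_COMMENT_START_DASH, STATE_COMMENT,
 * STATE_COMMENT_END_DASH and STATE_COMMENT_END, using tokeniser->state
 * to track how many dashes have been seen so far.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */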
hubbub_error hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
{
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_DATA;
                        return emit_current_comment(tokeniser);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        if (c == '>' && (tokeniser->state == STATE_COMMENT_START_DASH ||
                        tokeniser->state == STATE_COMMENT_START ||
                        tokeniser->state == STATE_COMMENT_END)) {
                tokeniser->context.pending += len;

                /** \todo parse error if state != COMMENT_END */
                tokeniser->state = STATE_DATA;
                return emit_current_comment(tokeniser);
        } else if (c == '-') {
                if (tokeniser->state == STATE_COMMENT_START) {
                        tokeniser->state = STATE_COMMENT_START_DASH;
                } else if (tokeniser->state == STATE_COMMENT_START_DASH) {
                        tokeniser->state = STATE_COMMENT_END;
                } else if (tokeniser->state == STATE_COMMENT) {
                        tokeniser->state = STATE_COMMENT_END_DASH;
                } else if (tokeniser->state == STATE_COMMENT_END_DASH) {
                        tokeniser->state = STATE_COMMENT_END;
                } else if (tokeniser->state == STATE_COMMENT_END) {
                        error = parserutils_buffer_append(tokeniser->buffer,
                                        (uint8_t *) "-", SLEN("-"));
                        if (error != PARSERUTILS_OK) {
                                return hubbub_error_from_parserutils_error(
                                                error);
                        }
                }

                tokeniser->context.pending += len;
        } else {
                if (tokeniser->state == STATE_COMMENT_START_DASH ||
                                tokeniser->state == STATE_COMMENT_END_DASH) {
                        error = parserutils_buffer_append(tokeniser->buffer,
                                        (uint8_t *) "-", SLEN("-"));
                        if (error != PARSERUTILS_OK) {
                                return hubbub_error_from_parserutils_error(
                                                error);
                        }
                } else if (tokeniser->state == STATE_COMMENT_END) {
                        error = parserutils_buffer_append(tokeniser->buffer,
                                        (uint8_t *) "--", SLEN("--"));
                        if (error != PARSERUTILS_OK) {
                                return hubbub_error_from_parserutils_error(
                                                error);
                        }
                }

                if (c == '\0') {
                        error = parserutils_buffer_append(tokeniser->buffer,
                                        u_fffd, sizeof(u_fffd));
                        if (error != PARSERUTILS_OK) {
                                return hubbub_error_from_parserutils_error(
                                                error);
                        }
                } else if (c == '\r') {
                        size_t next_len;
                        error = parserutils_inputstream_peek(
                                        tokeniser->input,
                                        tokeniser->context.pending + len,
                                        &cptr,
                                        &next_len);
                        if (error != PARSERUTILS_OK &&
                                        error != PARSERUTILS_EOF) {
                                return hubbub_error_from_parserutils_error(
                                                error);
                        } else if (error != PARSERUTILS_EOF &&
                                        *cptr != '\n') {
                                error = parserutils_buffer_append(
                                                tokeniser->buffer,
                                                &lf, sizeof(lf));
                                if (error != PARSERUTILS_OK) {
                                        return hubbub_error_from_parserutils_error(
                                                        error);
                                }
                        }
                } else {
                        error = parserutils_buffer_append(tokeniser->buffer,
                                        cptr, len);
                        if (error != PARSERUTILS_OK) {
                                return hubbub_error_from_parserutils_error(
                                                error);
                        }
                }

                tokeniser->context.pending += len;
                tokeniser->state = STATE_COMMENT;
        }

        return HUBBUB_OK;
}

#define DOCTYPE "DOCTYPE"
#define DOCTYPE_LEN (SLEN(DOCTYPE) - 1)
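
/**
 * Handle the "match DOCTYPE" state: case-insensitively match the
 * remaining "OCTYPE" of "<!DOCTYPE" (the initial 'D' was consumed by the
 * markup declaration open state), so e.g. "<!doctype html>" matches too.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */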
hubbub_error hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
{
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.match_doctype.count, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->context.current_comment.len =
                                        tokeniser->context.pending = 0;
                        tokeniser->state = STATE_BOGUS_COMMENT;
                        return HUBBUB_OK;
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        assert(tokeniser->context.match_doctype.count <= DOCTYPE_LEN);

        if (DOCTYPE[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
                tokeniser->context.current_comment.len =
                                tokeniser->context.pending = 0;
                tokeniser->state = STATE_BOGUS_COMMENT;
                return HUBBUB_OK;
        }

        tokeniser->context.pending += len;

        if (tokeniser->context.match_doctype.count == DOCTYPE_LEN) {
                /* Skip over the DOCTYPE bit */
                parserutils_inputstream_advance(tokeniser->input,
                                tokeniser->context.pending);

                memset(&tokeniser->context.current_doctype, 0,
                                sizeof tokeniser->context.current_doctype);
                tokeniser->context.current_doctype.public_missing = true;
                tokeniser->context.current_doctype.system_missing = true;
                tokeniser->context.pending = 0;

                tokeniser->state = STATE_DOCTYPE;
        }

        tokeniser->context.match_doctype.count++;

        return HUBBUB_OK;
}
#undef DOCTYPE
#undef DOCTYPE_LEN
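
/**
 * Handle the "DOCTYPE" state: skip a single whitespace character after
 * the DOCTYPE keyword, if present, then move to "before DOCTYPE name".
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */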
hubbub_error hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
{
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
                        return HUBBUB_OK;
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
                tokeniser->context.pending += len;
        }

        tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;

        return HUBBUB_OK;
}
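
/**
 * Handle the "before DOCTYPE name" state: skip whitespace, then start
 * collecting the (lowercased) doctype name, substituting U+FFFD for NUL.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */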
hubbub_error hubbub_tokeniser_handle_before_doctype_name(
                hubbub_tokeniser *tokeniser)
{
        hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        /** \todo parse error */
                        /* Emit current doctype, force-quirks on */
                        tokeniser->state = STATE_DATA;
                        return emit_current_doctype(tokeniser, true);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
                /* pass over in silence */
                tokeniser->context.pending += len;
        } else if (c == '>') {
                /** \todo parse error */
                tokeniser->context.pending += len;
                tokeniser->state = STATE_DATA;
                return emit_current_doctype(tokeniser, true);
        } else {
                if (c == '\0') {
                        START_BUF(cdoc->name, u_fffd, sizeof(u_fffd));
                } else if ('A' <= c && c <= 'Z') {
                        uint8_t lc = c + 0x20;

                        START_BUF(cdoc->name, &lc, len);
                } else {
                        START_BUF(cdoc->name, cptr, len);
                }

                tokeniser->context.pending += len;
                tokeniser->state = STATE_DOCTYPE_NAME;
        }

        return HUBBUB_OK;
}
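
/**
 * Handle the "DOCTYPE name" state: accumulate lowercased name characters
 * until whitespace or '>' ends the name.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */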
hubbub_error hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
{
        hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_DATA;
                        return emit_current_doctype(tokeniser, true);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
                tokeniser->context.pending += len;
                tokeniser->state = STATE_AFTER_DOCTYPE_NAME;
        } else if (c == '>') {
                tokeniser->context.pending += len;
                tokeniser->state = STATE_DATA;
                return emit_current_doctype(tokeniser, false);
        } else if (c == '\0') {
                COLLECT(cdoc->name, u_fffd, sizeof(u_fffd));
                tokeniser->context.pending += len;
        } else if ('A' <= c && c <= 'Z') {
                uint8_t lc = c + 0x20;

                COLLECT(cdoc->name, &lc, len);
                tokeniser->context.pending += len;
        } else {
                COLLECT(cdoc->name, cptr, len);
                tokeniser->context.pending += len;
        }

        return HUBBUB_OK;
}
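
/**
 * Handle the "after DOCTYPE name" state: look for the start of a PUBLIC
 * or SYSTEM identifier, the end of the doctype, or junk (which forces
 * quirks mode and falls into the bogus DOCTYPE state).
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */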
hubbub_error hubbub_tokeniser_handle_after_doctype_name(
                hubbub_tokeniser *tokeniser)
{
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_DATA;
                        return emit_current_doctype(tokeniser, true);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;
        tokeniser->context.pending += len;

        if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
                /* pass over in silence */
        } else if (c == '>') {
                tokeniser->state = STATE_DATA;
                return emit_current_doctype(tokeniser, false);
        } else if ((c & ~0x20) == 'P') {
                tokeniser->context.match_doctype.count = 1;
                tokeniser->state = STATE_MATCH_PUBLIC;
        } else if ((c & ~0x20) == 'S') {
                tokeniser->context.match_doctype.count = 1;
                tokeniser->state = STATE_MATCH_SYSTEM;
        } else {
                tokeniser->state = STATE_BOGUS_DOCTYPE;
                tokeniser->context.current_doctype.force_quirks = true;
        }

        return HUBBUB_OK;
}

#define PUBLIC "PUBLIC"
#define PUBLIC_LEN (SLEN(PUBLIC) - 1)
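
/**
 * Handle the "match PUBLIC" state: case-insensitively match the rest of
 * the "PUBLIC" keyword; on mismatch, force quirks and go bogus.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */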
hubbub_error hubbub_tokeniser_handle_match_public(hubbub_tokeniser *tokeniser)
{
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->context.current_doctype.force_quirks = true;
                        tokeniser->state = STATE_BOGUS_DOCTYPE;
                        return HUBBUB_OK;
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        assert(tokeniser->context.match_doctype.count <= PUBLIC_LEN);

        if (PUBLIC[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
                tokeniser->context.current_doctype.force_quirks = true;
                tokeniser->state = STATE_BOGUS_DOCTYPE;
                return HUBBUB_OK;
        }

        tokeniser->context.pending += len;

        if (tokeniser->context.match_doctype.count == PUBLIC_LEN) {
                tokeniser->state = STATE_BEFORE_DOCTYPE_PUBLIC;
        }

        tokeniser->context.match_doctype.count++;

        return HUBBUB_OK;
}
#undef PUBLIC
#undef PUBLIC_LEN
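
/**
 * Handle the "before DOCTYPE public identifier" state: expect a quoted
 * public identifier, '>' or junk.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */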
hubbub_error hubbub_tokeniser_handle_before_doctype_public(
                hubbub_tokeniser *tokeniser)
{
        hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_DATA;
                        return emit_current_doctype(tokeniser, true);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;
        tokeniser->context.pending += len;

        if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
                /* pass over in silence */
        } else if (c == '"') {
                cdoc->public_missing = false;
                cdoc->public_id.len = 0;
                tokeniser->state = STATE_DOCTYPE_PUBLIC_DQ;
        } else if (c == '\'') {
                cdoc->public_missing = false;
                cdoc->public_id.len = 0;
                tokeniser->state = STATE_DOCTYPE_PUBLIC_SQ;
        } else if (c == '>') {
                tokeniser->state = STATE_DATA;
                return emit_current_doctype(tokeniser, true);
        } else {
                cdoc->force_quirks = true;
                tokeniser->state = STATE_BOGUS_DOCTYPE;
        }

        return HUBBUB_OK;
}
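
/**
 * Handle the double-quoted DOCTYPE public identifier state, collecting
 * characters (with NUL and CR normalisation) up to the closing '"'.
 * The single-quoted variant below mirrors this with '\'' as terminator.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */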
hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
                hubbub_tokeniser *tokeniser)
{
        hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_DATA;
                        return emit_current_doctype(tokeniser, true);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        if (c == '"') {
                tokeniser->context.pending += len;
                tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
        } else if (c == '>') {
                tokeniser->context.pending += len;
                tokeniser->state = STATE_DATA;
                return emit_current_doctype(tokeniser, true);
        } else if (c == '\0') {
                COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
                tokeniser->context.pending += len;
        } else if (c == '\r') {
                /* Peek past the '\r' so a CRLF pair yields a single LF */
                error = parserutils_inputstream_peek(
                                tokeniser->input,
                                tokeniser->context.pending + len,
                                &cptr,
                                &len);
                if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
                        return hubbub_error_from_parserutils_error(error);
                } else if (error == PARSERUTILS_EOF || *cptr != '\n') {
                        COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
                }

                /* Collect '\r' */
                tokeniser->context.pending += 1;
        } else {
                COLLECT_MS(cdoc->public_id, cptr, len);
                tokeniser->context.pending += len;
        }

        return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
                hubbub_tokeniser *tokeniser)
{
        hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_DATA;
                        return emit_current_doctype(tokeniser, true);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        if (c == '\'') {
                tokeniser->context.pending += len;
                tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
        } else if (c == '>') {
                tokeniser->context.pending += len;
                tokeniser->state = STATE_DATA;
                return emit_current_doctype(tokeniser, true);
        } else if (c == '\0') {
                COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
                tokeniser->context.pending += len;
        } else if (c == '\r') {
                /* Peek past the '\r' so a CRLF pair yields a single LF */
                error = parserutils_inputstream_peek(
                                tokeniser->input,
                                tokeniser->context.pending + len,
                                &cptr,
                                &len);
                if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
                        return hubbub_error_from_parserutils_error(error);
                } else if (error == PARSERUTILS_EOF || *cptr != '\n') {
                        COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
                }

                /* Collect '\r' */
                tokeniser->context.pending += 1;
        } else {
                COLLECT_MS(cdoc->public_id, cptr, len);
                tokeniser->context.pending += len;
        }

        return HUBBUB_OK;
}
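
/**
 * Handle the "after DOCTYPE public identifier" state: expect a quoted
 * system identifier, '>' or junk.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */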
hubbub_error hubbub_tokeniser_handle_after_doctype_public(
                hubbub_tokeniser *tokeniser)
{
        hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_DATA;
                        return emit_current_doctype(tokeniser, true);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;
        tokeniser->context.pending += len;

        if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
                /* pass over in silence */
        } else if (c == '"') {
                cdoc->system_missing = false;
                cdoc->system_id.len = 0;

                tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
        } else if (c == '\'') {
                cdoc->system_missing = false;
                cdoc->system_id.len = 0;

                tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
        } else if (c == '>') {
                tokeniser->state = STATE_DATA;
                return emit_current_doctype(tokeniser, false);
        } else {
                cdoc->force_quirks = true;
                tokeniser->state = STATE_BOGUS_DOCTYPE;
        }

        return HUBBUB_OK;
}

#define SYSTEM "SYSTEM"
#define SYSTEM_LEN (SLEN(SYSTEM) - 1)
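
/**
 * Handle the "match SYSTEM" state: case-insensitively match the rest of
 * the "SYSTEM" keyword; on mismatch, force quirks and go bogus.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */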
hubbub_error hubbub_tokeniser_handle_match_system(hubbub_tokeniser *tokeniser)
{
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->context.current_doctype.force_quirks = true;
                        tokeniser->state = STATE_BOGUS_DOCTYPE;
                        return HUBBUB_OK;
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        assert(tokeniser->context.match_doctype.count <= SYSTEM_LEN);

        if (SYSTEM[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
                tokeniser->context.current_doctype.force_quirks = true;
                tokeniser->state = STATE_BOGUS_DOCTYPE;
                return HUBBUB_OK;
        }

        tokeniser->context.pending += len;

        if (tokeniser->context.match_doctype.count == SYSTEM_LEN) {
                tokeniser->state = STATE_BEFORE_DOCTYPE_SYSTEM;
        }

        tokeniser->context.match_doctype.count++;

        return HUBBUB_OK;
}
#undef SYSTEM
#undef SYSTEM_LEN
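
/**
 * Handle the "before DOCTYPE system identifier" state: expect a quoted
 * system identifier, '>' or junk.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */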
hubbub_error hubbub_tokeniser_handle_before_doctype_system(
                hubbub_tokeniser *tokeniser)
{
        hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_DATA;
                        return emit_current_doctype(tokeniser, true);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;
        tokeniser->context.pending += len;

        if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
                /* pass over */
        } else if (c == '"') {
                cdoc->system_missing = false;
                cdoc->system_id.len = 0;

                tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
        } else if (c == '\'') {
                cdoc->system_missing = false;
                cdoc->system_id.len = 0;

                tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
        } else if (c == '>') {
                tokeniser->state = STATE_DATA;
                return emit_current_doctype(tokeniser, true);
        } else {
                cdoc->force_quirks = true;
                tokeniser->state = STATE_BOGUS_DOCTYPE;
        }

        return HUBBUB_OK;
}
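
/**
 * Handle the double-quoted DOCTYPE system identifier state; the
 * single-quoted variant below mirrors it with '\'' as terminator.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */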
hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
                hubbub_tokeniser *tokeniser)
{
        hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_DATA;
                        return emit_current_doctype(tokeniser, true);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        if (c == '"') {
                tokeniser->context.pending += len;
                tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
        } else if (c == '>') {
                tokeniser->context.pending += len;
                tokeniser->state = STATE_DATA;
                return emit_current_doctype(tokeniser, true);
        } else if (c == '\0') {
                COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
                tokeniser->context.pending += len;
        } else if (c == '\r') {
                /* Peek past the '\r' so a CRLF pair yields a single LF */
                error = parserutils_inputstream_peek(
                                tokeniser->input,
                                tokeniser->context.pending + len,
                                &cptr,
                                &len);
                if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
                        return hubbub_error_from_parserutils_error(error);
                } else if (error == PARSERUTILS_EOF || *cptr != '\n') {
                        COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
                }

                /* Collect '\r' */
                tokeniser->context.pending += 1;
        } else {
                COLLECT_MS(cdoc->system_id, cptr, len);
                tokeniser->context.pending += len;
        }

        return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
                hubbub_tokeniser *tokeniser)
{
        hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_DATA;
                        return emit_current_doctype(tokeniser, true);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        if (c == '\'') {
                tokeniser->context.pending += len;
                tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
        } else if (c == '>') {
                tokeniser->context.pending += len;
                tokeniser->state = STATE_DATA;
                return emit_current_doctype(tokeniser, true);
        } else if (c == '\0') {
                COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
                tokeniser->context.pending += len;
        } else if (c == '\r') {
                /* Peek past the '\r' so a CRLF pair yields a single LF */
                error = parserutils_inputstream_peek(
                                tokeniser->input,
                                tokeniser->context.pending + len,
                                &cptr,
                                &len);
                if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
                        return hubbub_error_from_parserutils_error(error);
                } else if (error == PARSERUTILS_EOF || *cptr != '\n') {
                        COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
                }

                /* Collect '\r' */
                tokeniser->context.pending += 1;
        } else {
                COLLECT_MS(cdoc->system_id, cptr, len);
                tokeniser->context.pending += len;
        }

        return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_after_doctype_system(
                hubbub_tokeniser *tokeniser)
{
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_DATA;
                        return emit_current_doctype(tokeniser, true);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;
        tokeniser->context.pending += len;

        if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
                /* pass over in silence */
        } else if (c == '>') {
                tokeniser->state = STATE_DATA;
                return emit_current_doctype(tokeniser, false);
        } else {
                tokeniser->state = STATE_BOGUS_DOCTYPE;
        }

        return HUBBUB_OK;
}
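
/**
 * Handle the "bogus DOCTYPE" state: discard everything up to and
 * including the next '>', then emit the doctype.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */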
hubbub_error hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
{
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_DATA;
                        return emit_current_doctype(tokeniser, false);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;
        tokeniser->context.pending += len;

        if (c == '>') {
                tokeniser->state = STATE_DATA;
                return emit_current_doctype(tokeniser, false);
        }

        return HUBBUB_OK;
}

#define CDATA "[CDATA["
#define CDATA_LEN (SLEN(CDATA) - 1)
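
/**
 * Handle the "match CDATA" state: match the remainder of "<![CDATA["
 * (the initial '[' was consumed by the markup declaration open state).
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */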
hubbub_error hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser)
{
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->context.current_comment.len =
                                        tokeniser->context.pending = 0;
                        tokeniser->state = STATE_BOGUS_COMMENT;
                        return HUBBUB_OK;
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        assert(tokeniser->context.match_cdata.count <= CDATA_LEN);

        if (CDATA[tokeniser->context.match_cdata.count] != (c & ~0x20)) {
                tokeniser->context.current_comment.len =
                                tokeniser->context.pending = 0;
                tokeniser->state = STATE_BOGUS_COMMENT;
                return HUBBUB_OK;
        }

        tokeniser->context.pending += len;

        if (tokeniser->context.match_cdata.count == CDATA_LEN) {
                parserutils_inputstream_advance(tokeniser->input,
                                tokeniser->context.match_cdata.count + len);
                tokeniser->context.pending = 0;
                tokeniser->context.match_cdata.end = 0;
                tokeniser->state = STATE_CDATA_BLOCK;
        }

        tokeniser->context.match_cdata.count += len;

        return HUBBUB_OK;
}
#undef CDATA
#undef CDATA_LEN
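
/**
 * Handle the CDATA block state: emit character tokens for the block's
 * contents (normalising NUL and CR) until "]]>" ends it.
 * context.match_cdata.end counts how many ']' characters of the
 * terminator have been seen so far.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */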
hubbub_error hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser)
{
        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;

        error = parserutils_inputstream_peek(tokeniser->input,
                        tokeniser->context.pending, &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->state = STATE_DATA;
                        return emit_current_chars(tokeniser);
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        if (c == ']' && (tokeniser->context.match_cdata.end == 0 ||
                        tokeniser->context.match_cdata.end == 1)) {
                tokeniser->context.pending += len;
                tokeniser->context.match_cdata.end += len;
        } else if (c == '>' && tokeniser->context.match_cdata.end == 2) {
                /* Remove the previously buffered "]]" */
                tokeniser->context.pending -= 2;

                /* Emit any pending characters */
                emit_current_chars(tokeniser);

                /* Now move past the "]]>" bit */
                parserutils_inputstream_advance(tokeniser->input, SLEN("]]>"));

                tokeniser->state = STATE_DATA;
        } else if (c == '\0') {
                if (tokeniser->context.pending > 0) {
                        /* Emit any pending characters */
                        emit_current_chars(tokeniser);
                }

                /* Perform NUL-byte replacement */
                emit_character_token(tokeniser, &u_fffd_str);

                parserutils_inputstream_advance(tokeniser->input, len);
                tokeniser->context.match_cdata.end = 0;
        } else if (c == '\r') {
                error = parserutils_inputstream_peek(
                                tokeniser->input,
                                tokeniser->context.pending + len,
                                &cptr,
                                &len);
                if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
                        return hubbub_error_from_parserutils_error(error);
                }

                if (tokeniser->context.pending > 0) {
                        /* Emit any pending characters */
                        emit_current_chars(tokeniser);
                }

                if (error == PARSERUTILS_EOF || *cptr != '\n') {
                        /* Emit newline */
                        emit_character_token(tokeniser, &lf_str);
                }

                /* Advance over \r */
                parserutils_inputstream_advance(tokeniser->input, 1);
                tokeniser->context.match_cdata.end = 0;
        } else {
                tokeniser->context.pending += len;
                tokeniser->context.match_cdata.end = 0;
        }

        return HUBBUB_OK;
}
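
/**
 * Entry point for character reference handling: called on '&' from data
 * or attribute-value states. Decides between a numbered ("&#...") and a
 * named ("&amp;"-style) reference and dispatches accordingly; match
 * results are left in context.match_entity for the return state.
 *
 * \param tokeniser  Tokeniser instance
 * \param pos        Byte offset of the '&' in the input stream
 * \return HUBBUB_OK on success, appropriate error otherwise
 */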
hubbub_error hubbub_tokeniser_consume_character_reference(
                hubbub_tokeniser *tokeniser, size_t pos)
{
        uint32_t allowed_char = tokeniser->context.allowed_char;

        size_t len;
        const uint8_t *cptr;
        parserutils_error error;
        uint8_t c;
        size_t off;

        error = parserutils_inputstream_peek(tokeniser->input, pos,
                        &cptr, &len);

        /* We should always start on an ampersand */
        assert(error == PARSERUTILS_OK);
        assert(len == 1 && *cptr == '&');

        off = pos + len;

        /* Look at the character after the ampersand */
        error = parserutils_inputstream_peek(tokeniser->input, off,
                        &cptr, &len);

        if (error != PARSERUTILS_OK) {
                if (error == PARSERUTILS_EOF) {
                        tokeniser->context.match_entity.complete = true;
                        tokeniser->context.match_entity.codepoint = 0;
                        return HUBBUB_OK;
                } else {
                        return hubbub_error_from_parserutils_error(error);
                }
        }

        c = *cptr;

        /* Set things up */
        tokeniser->context.match_entity.offset = off;
        tokeniser->context.match_entity.poss_length = 0;
        tokeniser->context.match_entity.length = 0;
        tokeniser->context.match_entity.base = 0;
        tokeniser->context.match_entity.codepoint = 0;
        tokeniser->context.match_entity.had_data = false;
        tokeniser->context.match_entity.return_state = tokeniser->state;
        tokeniser->context.match_entity.complete = false;
        tokeniser->context.match_entity.overflow = false;
        tokeniser->context.match_entity.context = NULL;
        tokeniser->context.match_entity.prev_len = len;

        /* Reset allowed character for future calls */
        tokeniser->context.allowed_char = '\0';

        if (c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
                        c == '<' || c == '&' ||
                        (allowed_char && c == allowed_char)) {
                tokeniser->context.match_entity.complete = true;
                tokeniser->context.match_entity.codepoint = 0;
        } else if (c == '#') {
                tokeniser->context.match_entity.length += len;
                tokeniser->state = STATE_NUMBERED_ENTITY;
        } else {
                tokeniser->state = STATE_NAMED_ENTITY;
        }

        return HUBBUB_OK;
}
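
/**
 * Handle a numbered character reference ("&#1234;" or "&#xABCD;").
 *
 * Accumulates digits in the indicated base, then sanitises the result:
 * codepoints 0x80-0x9F are remapped through cp1252Table (so, for
 * example, "&#x93;" yields U+201C), CR becomes LF, and out-of-range or
 * non-character codepoints collapse to U+FFFD.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */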
hubbub_error hubbub_tokeniser_handle_numbered_entity(
                hubbub_tokeniser *tokeniser)
{
        hubbub_tokeniser_context *ctx = &tokeniser->context;

        size_t len;
        const uint8_t *cptr;
        parserutils_error error;

        error = parserutils_inputstream_peek(tokeniser->input,
                        ctx->match_entity.offset + ctx->match_entity.length,
                        &cptr, &len);

        if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
                return hubbub_error_from_parserutils_error(error);
        }

        if (error != PARSERUTILS_EOF && ctx->match_entity.base == 0) {
                uint8_t c = *cptr;
                if ((c & ~0x20) == 'X') {
                        ctx->match_entity.base = 16;
                        ctx->match_entity.length += len;
                } else {
                        ctx->match_entity.base = 10;
                }
        }

        while ((error = parserutils_inputstream_peek(tokeniser->input,
                        ctx->match_entity.offset + ctx->match_entity.length,
                        &cptr, &len)) == PARSERUTILS_OK) {
                uint8_t c = *cptr;

                if (ctx->match_entity.base == 10 &&
                                ('0' <= c && c <= '9')) {
                        ctx->match_entity.had_data = true;
                        ctx->match_entity.codepoint =
                                ctx->match_entity.codepoint * 10 + (c - '0');

                        ctx->match_entity.length += len;
                } else if (ctx->match_entity.base == 16 &&
                                (('0' <= c && c <= '9') ||
                                ('A' <= (c & ~0x20) &&
                                                (c & ~0x20) <= 'F'))) {
                        ctx->match_entity.had_data = true;

                        ctx->match_entity.codepoint *= 16;

                        if ('0' <= c && c <= '9') {
                                ctx->match_entity.codepoint += (c - '0');
                        } else {
                                ctx->match_entity.codepoint +=
                                                ((c & ~0x20) - 'A' + 10);
                        }

                        ctx->match_entity.length += len;
                } else {
                        break;
                }

                if (ctx->match_entity.codepoint >= 0x10FFFF) {
                        ctx->match_entity.overflow = true;
                }
        }

        if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
                return hubbub_error_from_parserutils_error(error);
        }

        /* Eat trailing semicolon, if any */
        if (error != PARSERUTILS_EOF && *cptr == ';') {
                ctx->match_entity.length += len;
        }

        /* Had data, so calculate final codepoint */
        if (ctx->match_entity.had_data) {
                uint32_t cp = ctx->match_entity.codepoint;

                if (0x80 <= cp && cp <= 0x9F) {
                        cp = cp1252Table[cp - 0x80];
                } else if (cp == 0x0D) {
                        cp = 0x000A;
                } else if (ctx->match_entity.overflow ||
                                cp <= 0x0008 || cp == 0x000B ||
                                (0x000E <= cp && cp <= 0x001F) ||
                                (0x007F <= cp && cp <= 0x009F) ||
                                (0xD800 <= cp && cp <= 0xDFFF) ||
                                (0xFDD0 <= cp && cp <= 0xFDEF) ||
                                (cp & 0xFFFE) == 0xFFFE) {
                        /* the check for cp > 0x10FFFF per spec is performed
                         * in the loop above to avoid overflow */
                        cp = 0xFFFD;
                }

                ctx->match_entity.codepoint = cp;
        }

        /* Flag completion */
        ctx->match_entity.complete = true;

        /* And back to the state we were entered in */
        tokeniser->state = ctx->match_entity.return_state;

        return HUBBUB_OK;
}
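
/**
 * Handle a named character reference, using longest-match stepping
 * through the entity table (so "&amp;" yields U+0026). A match without
 * a trailing ';' inside an attribute value is discarded when the next
 * character is alphanumeric, per the spec.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */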
hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
{
        hubbub_tokeniser_context *ctx = &tokeniser->context;

        size_t len;
        const uint8_t *cptr;
        parserutils_error error;

        while ((error = parserutils_inputstream_peek(tokeniser->input,
                        ctx->match_entity.offset +
                                        ctx->match_entity.poss_length,
                        &cptr, &len)) == PARSERUTILS_OK) {
                uint32_t cp;

                uint8_t c = *cptr;
                hubbub_error herror;

                if (c > 0x7F) {
                        /* Entity names are ASCII only */
                        break;
                }

                herror = hubbub_entities_search_step(c, &cp,
                                &ctx->match_entity.context);
                if (herror == HUBBUB_OK) {
                        /* Had a match - store it for later */
                        ctx->match_entity.codepoint = cp;

                        ctx->match_entity.length =
                                        ctx->match_entity.poss_length + len;
                        ctx->match_entity.poss_length =
                                        ctx->match_entity.length;
                } else if (herror == HUBBUB_INVALID) {
                        /* No further matches - use last found */
                        break;
                } else {
                        /* Need more data */
                        ctx->match_entity.poss_length += len;
                }
        }

        if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
                return hubbub_error_from_parserutils_error(error);
        }

        if (ctx->match_entity.length > 0) {
                uint8_t c;
                error = parserutils_inputstream_peek(tokeniser->input,
                                ctx->match_entity.offset +
                                                ctx->match_entity.length - 1,
                                &cptr, &len);
                /* We're re-reading a character we've already seen, so no
                 * error can occur as a result. */
                assert(error == PARSERUTILS_OK);

                c = *cptr;

                if ((tokeniser->context.match_entity.return_state ==
                                STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) &&
                                c != ';') {
                        error = parserutils_inputstream_peek(tokeniser->input,
                                        ctx->match_entity.offset +
                                                ctx->match_entity.length,
                                        &cptr, &len);
                        /* We must have attempted to read one more character
                         * than was present in the entity name, as that is the
                         * only way to break out of the loop above. If that
                         * failed, then any non-EOF case will have been handled
                         * by the if statement after the loop, thus it cannot
                         * occur here. */
                        assert(error == PARSERUTILS_OK ||
                                        error == PARSERUTILS_EOF);

                        if (error == PARSERUTILS_EOF) {
                                ctx->match_entity.codepoint = 0;
                        } else {
                                c = *cptr;
                                if (('0' <= c && c <= '9') ||
                                                ('A' <= c && c <= 'Z') ||
                                                ('a' <= c && c <= 'z')) {
                                        ctx->match_entity.codepoint = 0;
                                }
                        }
                }
        }

        /* Flag completion */
        ctx->match_entity.complete = true;

        /* And back to the state from whence we came */
        tokeniser->state = ctx->match_entity.return_state;

        return HUBBUB_OK;
}

/*** Token emitting bits ***/

/**
 * Emit a character token.
 *
 * \param tokeniser  Tokeniser instance
 * \param chars      Pointer to hubbub_string to emit
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
                const hubbub_string *chars)
{
        hubbub_token token;

        token.type = HUBBUB_TOKEN_CHARACTER;
        token.data.character = *chars;

        return hubbub_tokeniser_emit_token(tokeniser, &token);
}

/**
 * Emit the current pending characters being stored in the tokeniser context.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser)
{
        hubbub_token token;
        size_t len;
        const uint8_t *cptr = NULL;
        parserutils_error error;

        /* Calling this with nothing to output is a probable bug */
        assert(tokeniser->context.pending > 0);

        error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);
        assert(error == PARSERUTILS_OK);

        token.type = HUBBUB_TOKEN_CHARACTER;
        token.data.character.ptr = cptr;
        token.data.character.len = tokeniser->context.pending;

        return hubbub_tokeniser_emit_token(tokeniser, &token);
}

/**
 * Emit the current tag token being stored in the tokeniser context.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser)
{
        hubbub_error err;
        hubbub_token token;
        uint32_t n_attributes;
        hubbub_attribute *attrs;
        uint8_t *ptr;
        uint32_t i, j;

        /* Emit current tag */
        token.type = tokeniser->context.current_tag_type;
        token.data.tag = tokeniser->context.current_tag;
        token.data.tag.ns = HUBBUB_NS_HTML;

        n_attributes = token.data.tag.n_attributes;
        attrs = token.data.tag.attributes;

        /* Set pointers correctly... */
        ptr = tokeniser->buffer->data;
        token.data.tag.name.ptr = tokeniser->buffer->data;
        ptr += token.data.tag.name.len;

        for (i = 0; i < n_attributes; i++) {
                attrs[i].name.ptr = ptr;
                ptr += attrs[i].name.len;
                attrs[i].value.ptr = ptr;
                ptr += attrs[i].value.len;
        }

        /* Discard duplicate attributes */
        for (i = 0; i < n_attributes; i++) {
                for (j = 0; j < n_attributes; j++) {
                        uint32_t move;

                        if (j == i ||
                                attrs[i].name.len !=
                                                attrs[j].name.len ||
                                strncmp((char *) attrs[i].name.ptr,
                                        (char *) attrs[j].name.ptr,
                                        attrs[i].name.len) != 0) {
                                /* Attributes don't match */
                                continue;
                        }

                        assert(i < j);

                        /* Calculate amount to move */
                        move = (n_attributes - 1 - j) *
                                        sizeof(hubbub_attribute);

                        if (move > 0) {
                                memmove(&attrs[j], &attrs[j + 1], move);
                        }

                        /* We've deleted an item, so we need to
                         * reprocess this index */
                        j--;

                        /* And reduce the number of attributes */
                        n_attributes--;
                }
        }

        token.data.tag.n_attributes = n_attributes;

        err = hubbub_tokeniser_emit_token(tokeniser, &token);

        if (token.type == HUBBUB_TOKEN_START_TAG) {
                /* Save start tag name for R?CDATA */
                if (token.data.tag.name.len <
                                sizeof(tokeniser->context.last_start_tag_name)) {
                        strncpy((char *) tokeniser->context.last_start_tag_name,
                                (const char *) token.data.tag.name.ptr,
                                token.data.tag.name.len);
                        tokeniser->context.last_start_tag_len =
                                        token.data.tag.name.len;
                } else {
                        tokeniser->context.last_start_tag_name[0] = '\0';
                        tokeniser->context.last_start_tag_len = 0;
                }
        } else /* if (token->type == HUBBUB_TOKEN_END_TAG) */ {
                /* Reset content model after R?CDATA elements */
                tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
        }

        /* Reset the self-closing flag */
        tokeniser->context.current_tag.self_closing = false;

        return err;
}

/**
 * Emit the current comment token being stored in the tokeniser context.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser)
{
        hubbub_token token;

        token.type = HUBBUB_TOKEN_COMMENT;
        token.data.comment.ptr = tokeniser->buffer->data;
        token.data.comment.len = tokeniser->buffer->length;

        return hubbub_tokeniser_emit_token(tokeniser, &token);
}

/**
 * Emit the current doctype token being stored in the tokeniser context.
 *
 * \param tokeniser     Tokeniser instance
 * \param force_quirks  Force quirks mode on this document
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
                bool force_quirks)
{
        hubbub_token token;

        /* Emit doctype */
        token.type = HUBBUB_TOKEN_DOCTYPE;
        token.data.doctype = tokeniser->context.current_doctype;
        if (force_quirks == true)
                token.data.doctype.force_quirks = true;

        /* Set pointers correctly: name, public id and system id are
         * stored consecutively in the tokeniser's buffer */
        token.data.doctype.name.ptr = tokeniser->buffer->data;
        if (token.data.doctype.public_missing == false) {
                token.data.doctype.public_id.ptr = tokeniser->buffer->data +
                                token.data.doctype.name.len;
        }
        if (token.data.doctype.system_missing == false) {
                token.data.doctype.system_id.ptr = tokeniser->buffer->data +
                                token.data.doctype.name.len +
                                token.data.doctype.public_id.len;
        }

        return hubbub_tokeniser_emit_token(tokeniser, &token);
}

/**
 * Emit a token, performing sanity checks if necessary
 *
 * \param tokeniser  Tokeniser instance
 * \param token      Token to emit
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
                hubbub_token *token)
{
        hubbub_error err = HUBBUB_OK;

        assert(tokeniser != NULL);
        assert(token != NULL);

#ifndef NDEBUG
        /* Sanity checks */
        switch (token->type) {
        case HUBBUB_TOKEN_DOCTYPE:
                assert(memchr(token->data.doctype.name.ptr, 0xff,
                                token->data.doctype.name.len) == NULL);
                if (token->data.doctype.public_missing == false)
                        assert(memchr(token->data.doctype.public_id.ptr, 0xff,
                                        token->data.doctype.public_id.len) ==
                                        NULL);
                if (token->data.doctype.system_missing == false)
                        assert(memchr(token->data.doctype.system_id.ptr, 0xff,
                                        token->data.doctype.system_id.len) ==
                                        NULL);
                break;
        case HUBBUB_TOKEN_START_TAG:
        case HUBBUB_TOKEN_END_TAG:
        {
                uint32_t i;
                assert(memchr(token->data.tag.name.ptr, 0xff,
                                token->data.tag.name.len) == NULL);

                for (i = 0; i < token->data.tag.n_attributes; i++) {
                        hubbub_attribute *attr = &token->data.tag.attributes[i];

                        assert(memchr(attr->name.ptr, 0xff, attr->name.len) ==
                                        NULL);
                        assert(memchr(attr->value.ptr, 0xff, attr->value.len) ==
                                        NULL);
                }
        }
                break;
        case HUBBUB_TOKEN_COMMENT:
                assert(memchr(token->data.comment.ptr, 0xff,
                                token->data.comment.len) == NULL);
                break;
        case HUBBUB_TOKEN_CHARACTER:
                assert(memchr(token->data.character.ptr, 0xff,
                                token->data.character.len) == NULL);
                break;
        case HUBBUB_TOKEN_EOF:
                break;
        }
#endif

        /* Emit the token */
        if (tokeniser->token_handler) {
                err = tokeniser->token_handler(token, tokeniser->token_pw);
        }

        /* Discard current buffer */
        if (tokeniser->buffer->length) {
                parserutils_buffer_discard(tokeniser->buffer, 0,
                                tokeniser->buffer->length);
        }

        /* Advance the pointer */
        if (tokeniser->context.pending) {
                parserutils_inputstream_advance(tokeniser->input,
                                tokeniser->context.pending);
                tokeniser->context.pending = 0;
        }

        return err;
}