
/peek-build/src/netdepends/hubbub-0.0.2/src/tokeniser/tokeniser.c

https://bitbucket.org/C0deMaver1ck/peeklinux
C | 3370 lines | 2902 code | 319 blank | 149 comment | 615 complexity | MD5 6530fbcb7409a5b2eafc2432fd3ae059
Possible License(s): GPL-2.0, LGPL-2.1, LGPL-2.0


/*
 * This file is part of Hubbub.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>

#include <parserutils/charset/utf8.h>

#include "utils/parserutilserror.h"
#include "utils/utils.h"

#include "tokeniser/entities.h"
#include "tokeniser/tokeniser.h"

/**
 * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
 */
static const uint32_t cp1252Table[32] = {
	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
};
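
/* Added note (not in the original source): the table is indexed by
 * codepoint - 0x80. For example, Windows-1252 byte 0x93 selects entry 19,
 * i.e. U+201C LEFT DOUBLE QUOTATION MARK; bytes unassigned in Windows-1252
 * map to U+FFFD. */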

/**
 * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
 */
static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };

/**
 * String for when we want to emit newlines
 */
static const uint8_t lf = '\n';
static const hubbub_string lf_str = { &lf, 1 };

/**
 * Tokeniser states
 */
typedef enum hubbub_tokeniser_state {
	STATE_DATA,
	STATE_CHARACTER_REFERENCE_DATA,
	STATE_TAG_OPEN,
	STATE_CLOSE_TAG_OPEN,
	STATE_TAG_NAME,
	STATE_BEFORE_ATTRIBUTE_NAME,
	STATE_ATTRIBUTE_NAME,
	STATE_AFTER_ATTRIBUTE_NAME,
	STATE_BEFORE_ATTRIBUTE_VALUE,
	STATE_ATTRIBUTE_VALUE_DQ,
	STATE_ATTRIBUTE_VALUE_SQ,
	STATE_ATTRIBUTE_VALUE_UQ,
	STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
	STATE_AFTER_ATTRIBUTE_VALUE_Q,
	STATE_SELF_CLOSING_START_TAG,
	STATE_BOGUS_COMMENT,
	STATE_MARKUP_DECLARATION_OPEN,
	STATE_MATCH_COMMENT,
	STATE_COMMENT_START,
	STATE_COMMENT_START_DASH,
	STATE_COMMENT,
	STATE_COMMENT_END_DASH,
	STATE_COMMENT_END,
	STATE_MATCH_DOCTYPE,
	STATE_DOCTYPE,
	STATE_BEFORE_DOCTYPE_NAME,
	STATE_DOCTYPE_NAME,
	STATE_AFTER_DOCTYPE_NAME,
	STATE_MATCH_PUBLIC,
	STATE_BEFORE_DOCTYPE_PUBLIC,
	STATE_DOCTYPE_PUBLIC_DQ,
	STATE_DOCTYPE_PUBLIC_SQ,
	STATE_AFTER_DOCTYPE_PUBLIC,
	STATE_MATCH_SYSTEM,
	STATE_BEFORE_DOCTYPE_SYSTEM,
	STATE_DOCTYPE_SYSTEM_DQ,
	STATE_DOCTYPE_SYSTEM_SQ,
	STATE_AFTER_DOCTYPE_SYSTEM,
	STATE_BOGUS_DOCTYPE,
	STATE_MATCH_CDATA,
	STATE_CDATA_BLOCK,
	STATE_NUMBERED_ENTITY,
	STATE_NAMED_ENTITY
} hubbub_tokeniser_state;

/**
 * Context for tokeniser
 */
typedef struct hubbub_tokeniser_context {
	size_t pending;				/**< Count of pending chars */

	hubbub_string current_comment;		/**< Current comment text */

	hubbub_token_type current_tag_type;	/**< Type of current_tag */
	hubbub_tag current_tag;			/**< Current tag */

	hubbub_doctype current_doctype;		/**< Current doctype */

	hubbub_tokeniser_state prev_state;	/**< Previous state */

	uint8_t last_start_tag_name[10];	/**< Name of the last start tag
						 * emitted */
	size_t last_start_tag_len;		/**< Length of last start tag */

	struct {
		uint32_t count;
		bool match;
	} close_tag_match;			/**< State for matching close
						 * tags */

	struct {
		uint32_t count;			/**< Index into "DOCTYPE" */
	} match_doctype;			/**< State for matching doctype */

	struct {
		uint32_t count;			/**< Index into "[CDATA[" */
		uint32_t end;			/**< Index into "]]>" */
	} match_cdata;				/**< State for matching cdata */

	struct {
		size_t offset;			/**< Offset in buffer */
		uint32_t length;		/**< Length of entity */
		uint32_t codepoint;		/**< UCS4 codepoint */
		bool complete;			/**< True if match complete */

		uint32_t poss_length;		/**< Optimistic length
						 * when matching named
						 * character references */
		uint8_t base;			/**< Base for numeric
						 * entities */
		void *context;			/**< Context for named
						 * entity search */
		size_t prev_len;		/**< Previous byte length
						 * of str */
		bool had_data;			/**< Whether we read
						 * anything after &#(x)? */
		bool overflow;			/**< Whether this entity has
						 * overflowed the maximum
						 * numeric entity value */
		hubbub_tokeniser_state return_state;	/**< State we were
							 * called from */
	} match_entity;				/**< Entity matching state */

	struct {
		uint32_t line;			/**< Current line of input */
		uint32_t col;			/**< Current character in
						 * line */
	} position;				/**< Position in source data */

	uint32_t allowed_char;			/**< Used for quote matching */
} hubbub_tokeniser_context;

/**
 * Tokeniser data structure
 */
struct hubbub_tokeniser {
	hubbub_tokeniser_state state;	/**< Current tokeniser state */
	hubbub_content_model content_model;	/**< Current content
						 * model flag */
	bool escape_flag;		/**< Escape flag */
	bool process_cdata_section;	/**< Whether to process CDATA sections */

	parserutils_inputstream *input;	/**< Input stream */
	parserutils_buffer *buffer;	/**< Input buffer */

	hubbub_tokeniser_context context;	/**< Tokeniser context */

	hubbub_token_handler token_handler;	/**< Token handling callback */
	void *token_pw;				/**< Token handler data */

	hubbub_error_handler error_handler;	/**< Error handling callback */
	void *error_pw;				/**< Error handler data */

	hubbub_allocator_fn alloc;	/**< Memory (de)allocation function */
	void *alloc_pw;			/**< Client private data */
};

static hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_close_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_cdata(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_cdata_block(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t off);
static hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_named_entity(
		hubbub_tokeniser *tokeniser);

static inline hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
		const hubbub_string *chars);
static inline hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
		bool force_quirks);
static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token);

/**
 * Create a hubbub tokeniser
 *
 * \param input      Input stream instance
 * \param alloc      Memory (de)allocation function
 * \param pw         Pointer to client-specific private data (may be NULL)
 * \param tokeniser  Pointer to location to receive tokeniser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion
 */
hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input,
		hubbub_allocator_fn alloc, void *pw,
		hubbub_tokeniser **tokeniser)
{
	parserutils_error perror;
	hubbub_tokeniser *tok;

	if (input == NULL || alloc == NULL || tokeniser == NULL)
		return HUBBUB_BADPARM;

	tok = alloc(NULL, sizeof(hubbub_tokeniser), pw);
	if (tok == NULL)
		return HUBBUB_NOMEM;

	perror = parserutils_buffer_create(alloc, pw, &tok->buffer);
	if (perror != PARSERUTILS_OK) {
		alloc(tok, 0, pw);
		return hubbub_error_from_parserutils_error(perror);
	}

	tok->state = STATE_DATA;
	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;

	tok->escape_flag = false;
	tok->process_cdata_section = false;

	tok->input = input;

	tok->token_handler = NULL;
	tok->token_pw = NULL;

	tok->error_handler = NULL;
	tok->error_pw = NULL;

	tok->alloc = alloc;
	tok->alloc_pw = pw;

	memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));

	*tokeniser = tok;

	return HUBBUB_OK;
}
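
/* Illustrative usage sketch (added, not part of the original file): the
 * hubbub_allocator_fn contract, as used above, is realloc-like, with a
 * size of 0 meaning "free". Error handling is elided; my_alloc and stream
 * are hypothetical names.
 *
 *	static void *my_alloc(void *ptr, size_t size, void *pw)
 *	{
 *		(void) pw;
 *		if (size == 0) {
 *			free(ptr);
 *			return NULL;
 *		}
 *		return realloc(ptr, size);
 *	}
 *
 *	hubbub_tokeniser *tok;
 *	if (hubbub_tokeniser_create(stream, my_alloc, NULL, &tok) ==
 *			HUBBUB_OK) {
 *		...
 *		hubbub_tokeniser_destroy(tok);
 *	}
 */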

/**
 * Destroy a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to destroy
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
{
	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->context.current_tag.attributes != NULL) {
		tokeniser->alloc(tokeniser->context.current_tag.attributes,
				0, tokeniser->alloc_pw);
	}

	parserutils_buffer_destroy(tokeniser->buffer);

	tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw);

	return HUBBUB_OK;
}

/**
 * Configure a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to configure
 * \param type       The option type to set
 * \param params     Option-specific parameters
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
		hubbub_tokeniser_opttype type,
		hubbub_tokeniser_optparams *params)
{
	if (tokeniser == NULL || params == NULL)
		return HUBBUB_BADPARM;

	switch (type) {
	case HUBBUB_TOKENISER_TOKEN_HANDLER:
		tokeniser->token_handler = params->token_handler.handler;
		tokeniser->token_pw = params->token_handler.pw;
		break;
	case HUBBUB_TOKENISER_ERROR_HANDLER:
		tokeniser->error_handler = params->error_handler.handler;
		tokeniser->error_pw = params->error_handler.pw;
		break;
	case HUBBUB_TOKENISER_CONTENT_MODEL:
		tokeniser->content_model = params->content_model.model;
		break;
	case HUBBUB_TOKENISER_PROCESS_CDATA:
		tokeniser->process_cdata_section = params->process_cdata;
		break;
	}

	return HUBBUB_OK;
}
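
/* Illustrative sketch (added, not part of the original file): registering
 * a token handler. This assumes the hubbub_token_handler callback takes
 * the token plus the client pointer registered alongside it and returns a
 * hubbub_error; check tokeniser.h for the exact signature in this release.
 * print_token is a hypothetical name.
 *
 *	static hubbub_error print_token(const hubbub_token *token, void *pw)
 *	{
 *		(void) pw;
 *		if (token->type == HUBBUB_TOKEN_CHARACTER)
 *			printf("%.*s", (int) token->data.character.len,
 *					token->data.character.ptr);
 *		return HUBBUB_OK;
 *	}
 *
 *	hubbub_tokeniser_optparams params;
 *	params.token_handler.handler = print_token;
 *	params.token_handler.pw = NULL;
 *	hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER, &params);
 */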

/**
 * Process remaining data in the input stream
 *
 * \param tokeniser  The tokeniser instance to invoke
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
{
	hubbub_error cont = HUBBUB_OK;

	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

#if 0
#define state(x) \
		case x: \
			printf( #x "\n");
#else
#define state(x) \
		case x:
#endif

	while (cont == HUBBUB_OK) {
		switch (tokeniser->state) {
		state(STATE_DATA)
			cont = hubbub_tokeniser_handle_data(tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_DATA)
			cont = hubbub_tokeniser_handle_character_reference_data(
					tokeniser);
			break;
		state(STATE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_tag_open(tokeniser);
			break;
		state(STATE_CLOSE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_close_tag_open(
					tokeniser);
			break;
		state(STATE_TAG_NAME)
			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_before_attribute_name(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_attribute_name(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_after_attribute_name(
					tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_before_attribute_value(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_DQ)
			cont = hubbub_tokeniser_handle_attribute_value_dq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_SQ)
			cont = hubbub_tokeniser_handle_attribute_value_sq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_UQ)
			cont = hubbub_tokeniser_handle_attribute_value_uq(
					tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_character_reference_in_attribute_value(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_VALUE_Q)
			cont = hubbub_tokeniser_handle_after_attribute_value_q(
					tokeniser);
			break;
		state(STATE_SELF_CLOSING_START_TAG)
			cont = hubbub_tokeniser_handle_self_closing_start_tag(
					tokeniser);
			break;
		state(STATE_BOGUS_COMMENT)
			cont = hubbub_tokeniser_handle_bogus_comment(
					tokeniser);
			break;
		state(STATE_MARKUP_DECLARATION_OPEN)
			cont = hubbub_tokeniser_handle_markup_declaration_open(
					tokeniser);
			break;
		state(STATE_MATCH_COMMENT)
			cont = hubbub_tokeniser_handle_match_comment(
					tokeniser);
			break;
		case STATE_COMMENT_START:
		case STATE_COMMENT_START_DASH:
		case STATE_COMMENT:
		case STATE_COMMENT_END_DASH:
		case STATE_COMMENT_END:
			cont = hubbub_tokeniser_handle_comment(tokeniser);
			break;
		state(STATE_MATCH_DOCTYPE)
			cont = hubbub_tokeniser_handle_match_doctype(
					tokeniser);
			break;
		state(STATE_DOCTYPE)
			cont = hubbub_tokeniser_handle_doctype(tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_before_doctype_name(
					tokeniser);
			break;
		state(STATE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_doctype_name(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_after_doctype_name(
					tokeniser);
			break;
		state(STATE_MATCH_PUBLIC)
			cont = hubbub_tokeniser_handle_match_public(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_before_doctype_public(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_DQ)
			cont = hubbub_tokeniser_handle_doctype_public_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_SQ)
			cont = hubbub_tokeniser_handle_doctype_public_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_after_doctype_public(
					tokeniser);
			break;
		state(STATE_MATCH_SYSTEM)
			cont = hubbub_tokeniser_handle_match_system(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_before_doctype_system(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_DQ)
			cont = hubbub_tokeniser_handle_doctype_system_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_SQ)
			cont = hubbub_tokeniser_handle_doctype_system_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_after_doctype_system(
					tokeniser);
			break;
		state(STATE_BOGUS_DOCTYPE)
			cont = hubbub_tokeniser_handle_bogus_doctype(
					tokeniser);
			break;
		state(STATE_MATCH_CDATA)
			cont = hubbub_tokeniser_handle_match_cdata(
					tokeniser);
			break;
		state(STATE_CDATA_BLOCK)
			cont = hubbub_tokeniser_handle_cdata_block(
					tokeniser);
			break;
		state(STATE_NUMBERED_ENTITY)
			cont = hubbub_tokeniser_handle_numbered_entity(
					tokeniser);
			break;
		state(STATE_NAMED_ENTITY)
			cont = hubbub_tokeniser_handle_named_entity(
					tokeniser);
			break;
		}
	}

	return (cont == HUBBUB_NEEDDATA) ? HUBBUB_OK : cont;
}
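
/* Illustrative driving loop (added sketch, not part of the original file):
 * feed bytes into the parserutils inputstream the tokeniser was created
 * with, then call hubbub_tokeniser_run(). Note that run maps the internal
 * HUBBUB_NEEDDATA back to HUBBUB_OK, so simply running out of input is not
 * an error. This sketch assumes parserutils_inputstream_append() from
 * libparserutils and hypothetical names stream, tok and fp.
 *
 *	uint8_t buf[4096];
 *	size_t n;
 *	while ((n = fread(buf, 1, sizeof(buf), fp)) > 0) {
 *		parserutils_inputstream_append(stream, buf, n);
 *		hubbub_tokeniser_run(tok);
 *	}
 */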

/**
 * Various macros for manipulating buffers.
 *
 * \todo make some of these inline functions (type-safety)
 * \todo document them properly here
 */

#define START_BUF(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len = (length); \
	} while (0)

#define COLLECT(str, cptr, length) \
	do { \
		parserutils_error perror; \
		assert(str.len != 0); \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

#define COLLECT_MS(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)
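
/* Added summary (not in the original source): START_BUF starts a fresh
 * string by appending `length` bytes at `cptr` to the tokeniser's side
 * buffer and setting str.len; COLLECT appends to a string that must
 * already be non-empty (asserted); COLLECT_MS is COLLECT minus that
 * assertion, so it can also grow a string from zero length. It is used
 * below for attribute values, which may legitimately be empty. */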

/* this should always be called with an empty "chars" buffer */
hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
{
	parserutils_error error;
	hubbub_token token;
	const uint8_t *cptr;
	size_t len;

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len)) ==
			PARSERUTILS_OK) {
		const uint8_t c = *cptr;

		if (c == '&' &&
				(tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA ||
				tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA) &&
				tokeniser->escape_flag == false) {
			tokeniser->state = STATE_CHARACTER_REFERENCE_DATA;
			/* Don't eat the '&'; it'll be handled by entity
			 * consumption */
			break;
		} else if (c == '-' &&
				tokeniser->escape_flag == false &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->context.pending >= 3) {
			size_t ignore;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 3,
					&cptr,
					&ignore);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr,
					"<!--", SLEN("<!--")) == 0) {
				tokeniser->escape_flag = true;
			}

			tokeniser->context.pending += len;
		} else if (c == '<' && (tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_PCDATA ||
					((tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
					tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
					tokeniser->escape_flag == false))) {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Buffer '<' */
			tokeniser->context.pending = len;

			tokeniser->state = STATE_TAG_OPEN;
			break;
		} else if (c == '>' && tokeniser->escape_flag == true &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA)) {
			/* no need to check that there are enough characters,
			 * since you can only run into this if the flag is
			 * true in the first place, which requires four
			 * characters. */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 2,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr, "-->", SLEN("-->")) == 0) {
				tokeniser->escape_flag = false;
			}

			tokeniser->context.pending += len;
		} else if (c == '\0') {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Emit a replacement character */
			emit_character_token(tokeniser, &u_fffd_str);

			/* Advance past NUL */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else if (c == '\r') {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				break;
			}

			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			if (error == PARSERUTILS_EOF || *cptr != '\n') {
				/* Emit newline */
				emit_character_token(tokeniser, &lf_str);
			}

			/* Advance over the '\r' */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else {
			/* Just collect into buffer */
			tokeniser->context.pending += len;
		}
	}

	if (tokeniser->state != STATE_TAG_OPEN &&
			(tokeniser->state != STATE_DATA ||
					error == PARSERUTILS_EOF) &&
			tokeniser->context.pending > 0) {
		/* Emit any pending characters */
		emit_current_chars(tokeniser);
	}

	if (error == PARSERUTILS_EOF) {
		token.type = HUBBUB_TOKEN_EOF;
		hubbub_tokeniser_emit_token(tokeniser, &token);
	}

	if (error == PARSERUTILS_EOF) {
		return HUBBUB_NEEDDATA;
	} else {
		return hubbub_error_from_parserutils_error(error);
	}
}
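
/* Example of the data state's behaviour (added note, not in the original):
 * for input "a\0b\r\nc", the loop above emits "a" as character data, then
 * a U+FFFD replacement for the NUL; for the "\r\n" pair only the LF
 * survives (the CR is skipped without emitting an extra newline), so "b",
 * "\n" and "c" reach the token handler as ordinary character data. */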

/* emit any pending tokens before calling */
hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser)
{
	assert(tokeniser->context.pending == 0);

	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_token token;

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		token.type = HUBBUB_TOKEN_CHARACTER;

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
					tokeniser->context.match_entity.codepoint,
					&utf8ptr, &len);

			token.data.character.ptr = utf8;
			token.data.character.len = sizeof(utf8) - len;

			hubbub_tokeniser_emit_token(tokeniser, &token);

			/* +1 for ampersand */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.match_entity.length
							+ 1);
		} else {
			parserutils_error error;
			const uint8_t *cptr = NULL;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);
			assert(error == PARSERUTILS_OK);

			token.data.character.ptr = cptr;
			token.data.character.len = len;

			hubbub_tokeniser_emit_token(tokeniser, &token);
			parserutils_inputstream_advance(tokeniser->input, len);
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		tokeniser->state = STATE_DATA;
	}

	return HUBBUB_OK;
}

/* this state always switches to another state straight away */
/* this state expects the current character to be '<' */
hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 1);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '/') {
		tokeniser->context.pending += len;

		tokeniser->context.close_tag_match.match = false;
		tokeniser->context.close_tag_match.count = 0;

		tokeniser->state = STATE_CLOSE_TAG_OPEN;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		/* Return to data state with '<' still in "chars" */
		tokeniser->state = STATE_DATA;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
		if (c == '!') {
			parserutils_inputstream_advance(tokeniser->input,
					SLEN("<!"));

			tokeniser->context.pending = 0;
			tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
		} else if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);

			START_BUF(ctag->name, &lc, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(ctag->name, cptr, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/** \todo parse error */

			tokeniser->context.pending += len;
			tokeniser->state = STATE_DATA;
		} else if (c == '?') {
			/** \todo parse error */

			/* Cursor still at "<", need to advance past it */
			parserutils_inputstream_advance(
					tokeniser->input, SLEN("<"));
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		} else {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.chars to be "</" */
/* this state never stays in this state for more than one character */
hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 2);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
/*	assert(tokeniser->context.chars.ptr[1] == '/'); */

	/**\todo fragment case */

	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		uint8_t *start_tag_name =
				tokeniser->context.last_start_tag_name;
		size_t start_tag_len =
				tokeniser->context.last_start_tag_len;

		while ((error = parserutils_inputstream_peek(tokeniser->input,
				ctx->pending + ctx->close_tag_match.count,
				&cptr, &len)) == PARSERUTILS_OK) {
			c = *cptr;

			if ((start_tag_name[ctx->close_tag_match.count] &
					~0x20) != (c & ~0x20)) {
				break;
			}

			ctx->close_tag_match.count += len;

			if (ctx->close_tag_match.count == start_tag_len) {
				ctx->close_tag_match.match = true;
				break;
			}
		}

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (ctx->close_tag_match.match == true) {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error != PARSERUTILS_EOF) {
				c = *cptr;

				if (c != '\t' && c != '\n' && c != '\f' &&
						c != ' ' && c != '>' &&
						c != '/') {
					ctx->close_tag_match.match = false;
				}
			}
		}
	}

	if (ctx->close_tag_match.match == false &&
			tokeniser->content_model !=
					HUBBUB_CONTENT_MODEL_PCDATA) {
		/* We should emit "</" here, but instead we leave it in the
		 * buffer so the data state emits it with any characters
		 * following it */
		tokeniser->state = STATE_DATA;
	} else {
		error = parserutils_inputstream_peek(tokeniser->input,
				tokeniser->context.pending, &cptr, &len);

		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */

			/* Return to data state with "</" pending */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else if (error != PARSERUTILS_OK) {
			return hubbub_error_from_parserutils_error(error);
		}

		c = *cptr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(tokeniser->context.current_tag.name,
					&lc, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(tokeniser->context.current_tag.name,
					cptr, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/* Cursor still at "</", need to collect ">" */
			tokeniser->context.pending += len;

			/* Now need to advance past "</>" */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			/** \todo parse error */
			tokeniser->state = STATE_DATA;
		} else {
			/** \todo parse error */

			/* Cursor still at "</", need to advance past it */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.current_tag to already have its
   first character set */
hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending > 0);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
	assert(ctag->name.len > 0);
/*	assert(ctag->name.ptr); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
		tokeniser->context.pending += len;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;

		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0);

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				&lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'') {
			/** \todo parse error */
		}

		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;

		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

/* this state is only ever triggered by an '=' */
hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ;
	} else if (c == '&') {
		/* Don't consume the '&' -- reprocess in UQ state */
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ;
	} else if (c == '>') {
		/** \todo parse error */
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else {
		if (c == '=') {
			/** \todo parse error */
		}

		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '"';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '\'';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume \r */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;
	uint8_t c;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(c == '&' ||
			ctag->attributes[ctag->n_attributes - 1].value.len >= 1);

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else {
		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_tag *ctag = &tokeniser->context.current_tag;
		hubbub_attribute *attr = &ctag->attributes[
				ctag->n_attributes - 1];

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
					tokeniser->context.match_entity.codepoint,
					&utf8ptr, &len);

			COLLECT_MS(attr->value, utf8, sizeof(utf8) - len);

			/* +1 for the ampersand */
			tokeniser->context.pending +=
					tokeniser->context.match_entity.length
					+ 1;
		} else {
			size_t len = 0;
			const uint8_t *cptr = NULL;
			parserutils_error error;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);
			assert(error == PARSERUTILS_OK);

			/* Insert the ampersand */
			COLLECT_MS(attr->value, cptr, len);
			tokeniser->context.pending += len;
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		/* And back to the previous state */
		tokeniser->state = tokeniser->context.prev_state;
	}

	return HUBBUB_OK;
}

/* always switches state */
hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		/** \todo parse error */
		/* Reprocess character in before attribute name state */
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;

		tokeniser->context.current_tag.self_closing = true;
		return emit_current_tag(tokeniser);
	} else {
		/* Reprocess character in before attribute name state */
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.chars to be empty on first entry */
hubbub_error hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_comment(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_comment(tokeniser);
	} else if (c == '\0') {
		error = parserutils_buffer_append(tokeniser->buffer,
				u_fffd, sizeof(u_fffd));
		if (error != PARSERUTILS_OK)
			return hubbub_error_from_parserutils_error(error);

		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			error = parserutils_buffer_append(tokeniser->buffer,
					&lf, sizeof(lf));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}
		tokeniser->context.pending += len;
	} else {
		error = parserutils_buffer_append(tokeniser->buffer,
				(uint8_t *) cptr, len);
		if (error != PARSERUTILS_OK)
			return hubbub_error_from_parserutils_error(error);

		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

/* this state always switches to another state straight away */
hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 0);

	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_BOGUS_COMMENT;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '-') {
		tokeniser->context.pending = len;
		tokeniser->state = STATE_MATCH_COMMENT;
	} else if ((c & ~0x20) == 'D') {
		tokeniser->context.pending = len;
		tokeniser->context.match_doctype.count = len;
		tokeniser->state = STATE_MATCH_DOCTYPE;
	} else if (tokeniser->process_cdata_section == true && c == '[') {
		tokeniser->context.pending = len;
		tokeniser->context.match_cdata.count = len;
		tokeniser->state = STATE_MATCH_CDATA;
	} else {
		tokeniser->state = STATE_BOGUS_COMMENT;
	}

	return HUBBUB_OK;
}
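
/* For reference (added note, not in the original): after "<!", the state
 * above dispatches on the next character: '-' heads towards "<!--"
 * (comment), 'd'/'D' towards "<!DOCTYPE", and, when CDATA processing is
 * enabled, '[' towards "<![CDATA["; anything else falls through to the
 * bogus comment state. */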

hubbub_error hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTI

Large files are truncated; the listing ends at this point.