PageRenderTime 78ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 1ms

/parser/html/javasrc/Tokenizer.java

http://github.com/zpao/v8monkey
Java | 7027 lines | 3908 code | 381 blank | 2738 comment | 760 complexity | fb65ba7ba7f99f659d89e51498e7a0f4 MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-3.0, AGPL-1.0, LGPL-2.1, BSD-3-Clause, GPL-2.0, JSON, Apache-2.0, 0BSD
  1. /*
  2. * Copyright (c) 2005-2007 Henri Sivonen
  3. * Copyright (c) 2007-2010 Mozilla Foundation
  4. * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
  5. * Foundation, and Opera Software ASA.
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining a
  8. * copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included in
  15. * all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  22. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23. * DEALINGS IN THE SOFTWARE.
  24. */
  25. /*
  26. * The comments following this one that use the same comment syntax as this
  27. * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
  28. * amended as of June 18 2008 and May 31 2010.
  29. * That document came with this statement:
  30. * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
  31. * Opera Software ASA. You are granted a license to use, reproduce and
  32. * create derivative works of this document."
  33. */
  34. package nu.validator.htmlparser.impl;
  35. import nu.validator.htmlparser.annotation.Auto;
  36. import nu.validator.htmlparser.annotation.CharacterName;
  37. import nu.validator.htmlparser.annotation.Const;
  38. import nu.validator.htmlparser.annotation.Inline;
  39. import nu.validator.htmlparser.annotation.Local;
  40. import nu.validator.htmlparser.annotation.NoLength;
  41. import nu.validator.htmlparser.common.EncodingDeclarationHandler;
  42. import nu.validator.htmlparser.common.Interner;
  43. import nu.validator.htmlparser.common.TokenHandler;
  44. import nu.validator.htmlparser.common.XmlViolationPolicy;
  45. import org.xml.sax.ErrorHandler;
  46. import org.xml.sax.Locator;
  47. import org.xml.sax.SAXException;
  48. import org.xml.sax.SAXParseException;
  49. /**
  50. * An implementation of
  51. * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
  52. *
  53. * This class implements the <code>Locator</code> interface. This is not an
  54. * incidental implementation detail: Users of this class are encouraged to make
  55. * use of the <code>Locator</code> nature.
  56. *
  57. * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
  58. * can be configured to treat these conditions as fatal or to coerce the infoset
  59. * to something that XML 1.0 allows.
  60. *
  61. * @version $Id$
  62. * @author hsivonen
  63. */
  64. public class Tokenizer implements Locator {
  65. private static final int DATA_AND_RCDATA_MASK = ~1;
  66. public static final int DATA = 0;
  67. public static final int RCDATA = 1;
  68. public static final int SCRIPT_DATA = 2;
  69. public static final int RAWTEXT = 3;
  70. public static final int SCRIPT_DATA_ESCAPED = 4;
  71. public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
  72. public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
  73. public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
  74. public static final int PLAINTEXT = 8;
  75. public static final int TAG_OPEN = 9;
  76. public static final int CLOSE_TAG_OPEN = 10;
  77. public static final int TAG_NAME = 11;
  78. public static final int BEFORE_ATTRIBUTE_NAME = 12;
  79. public static final int ATTRIBUTE_NAME = 13;
  80. public static final int AFTER_ATTRIBUTE_NAME = 14;
  81. public static final int BEFORE_ATTRIBUTE_VALUE = 15;
  82. public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
  83. public static final int BOGUS_COMMENT = 17;
  84. public static final int MARKUP_DECLARATION_OPEN = 18;
  85. public static final int DOCTYPE = 19;
  86. public static final int BEFORE_DOCTYPE_NAME = 20;
  87. public static final int DOCTYPE_NAME = 21;
  88. public static final int AFTER_DOCTYPE_NAME = 22;
  89. public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
  90. public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
  91. public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
  92. public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
  93. public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
  94. public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
  95. public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
  96. public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
  97. public static final int BOGUS_DOCTYPE = 31;
  98. public static final int COMMENT_START = 32;
  99. public static final int COMMENT_START_DASH = 33;
  100. public static final int COMMENT = 34;
  101. public static final int COMMENT_END_DASH = 35;
  102. public static final int COMMENT_END = 36;
  103. public static final int COMMENT_END_BANG = 37;
  104. public static final int NON_DATA_END_TAG_NAME = 38;
  105. public static final int MARKUP_DECLARATION_HYPHEN = 39;
  106. public static final int MARKUP_DECLARATION_OCTYPE = 40;
  107. public static final int DOCTYPE_UBLIC = 41;
  108. public static final int DOCTYPE_YSTEM = 42;
  109. public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
  110. public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
  111. public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
  112. public static final int CONSUME_CHARACTER_REFERENCE = 46;
  113. public static final int CONSUME_NCR = 47;
  114. public static final int CHARACTER_REFERENCE_TAIL = 48;
  115. public static final int HEX_NCR_LOOP = 49;
  116. public static final int DECIMAL_NRC_LOOP = 50;
  117. public static final int HANDLE_NCR_VALUE = 51;
  118. public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
  119. public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
  120. public static final int SELF_CLOSING_START_TAG = 54;
  121. public static final int CDATA_START = 55;
  122. public static final int CDATA_SECTION = 56;
  123. public static final int CDATA_RSQB = 57;
  124. public static final int CDATA_RSQB_RSQB = 58;
  125. public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
  126. public static final int SCRIPT_DATA_ESCAPE_START = 60;
  127. public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
  128. public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
  129. public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
  130. public static final int BOGUS_COMMENT_HYPHEN = 64;
  131. public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
  132. public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
  133. public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
  134. public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
  135. public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
  136. public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
  137. public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
  138. public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
  139. public static final int PROCESSING_INSTRUCTION = 73;
  140. public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
  141. /**
  142. * Magic value for UTF-16 operations.
  143. */
  144. private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
  145. /**
  146. * UTF-16 code unit array containing less than and greater than for emitting
  147. * those characters on certain parse errors.
  148. */
  149. private static final @NoLength char[] LT_GT = { '<', '>' };
  150. /**
  151. * UTF-16 code unit array containing less than and solidus for emitting
  152. * those characters on certain parse errors.
  153. */
  154. private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
  155. /**
  156. * UTF-16 code unit array containing ]] for emitting those characters on
  157. * state transitions.
  158. */
  159. private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
  160. /**
  161. * Array version of U+FFFD.
  162. */
  163. private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
  164. // [NOCPP[
  165. /**
  166. * Array version of space.
  167. */
  168. private static final @NoLength char[] SPACE = { ' ' };
  169. // ]NOCPP]
  170. /**
  171. * Array version of line feed.
  172. */
  173. private static final @NoLength char[] LF = { '\n' };
  174. /**
  175. * Buffer growth parameter.
  176. */
  177. private static final int BUFFER_GROW_BY = 1024;
  178. /**
  179. * "CDATA[" as <code>char[]</code>
  180. */
  181. private static final @NoLength char[] CDATA_LSQB = "CDATA[".toCharArray();
  182. /**
  183. * "octype" as <code>char[]</code>
  184. */
  185. private static final @NoLength char[] OCTYPE = "octype".toCharArray();
  186. /**
  187. * "ublic" as <code>char[]</code>
  188. */
  189. private static final @NoLength char[] UBLIC = "ublic".toCharArray();
  190. /**
  191. * "ystem" as <code>char[]</code>
  192. */
  193. private static final @NoLength char[] YSTEM = "ystem".toCharArray();
  194. private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
  195. private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
  196. private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
  197. private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
  198. 'e', 'x', 't' };
  199. private static final char[] XMP_ARR = { 'x', 'm', 'p' };
  200. private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
  201. 'e', 'a' };
  202. private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
  203. private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
  204. 'd' };
  205. private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
  206. 'p', 't' };
  207. private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
  208. 'e', 's' };
  209. /**
  210. * The token handler.
  211. */
  212. protected final TokenHandler tokenHandler;
  213. protected EncodingDeclarationHandler encodingDeclarationHandler;
  214. // [NOCPP[
  215. /**
  216. * The error handler.
  217. */
  218. protected ErrorHandler errorHandler;
  219. // ]NOCPP]
  220. /**
  221. * Whether the previous char read was CR.
  222. */
  223. protected boolean lastCR;
  224. protected int stateSave;
  225. private int returnStateSave;
  226. protected int index;
  227. private boolean forceQuirks;
  228. private char additional;
  229. private int entCol;
  230. private int firstCharKey;
  231. private int lo;
  232. private int hi;
  233. private int candidate;
  234. private int strBufMark;
  235. private int prevValue;
  236. protected int value;
  237. private boolean seenDigits;
  238. protected int cstart;
  239. /**
  240. * The SAX public id for the resource being tokenized. (Only passed back
  241. * as part of locator data.)
  242. */
  243. private String publicId;
  244. /**
  245. * The SAX system id for the resource being tokenized. (Only passed back
  246. * as part of locator data.)
  247. */
  248. private String systemId;
  249. /**
  250. * Buffer for short identifiers.
  251. */
  252. private @Auto char[] strBuf;
  253. /**
  254. * Number of significant <code>char</code>s in <code>strBuf</code>.
  255. */
  256. private int strBufLen;
  257. /**
  258. * <code>-1</code> to indicate that <code>strBuf</code> is used or otherwise
  259. * an offset to the main buffer.
  260. */
  261. // private int strBufOffset = -1;
  262. /**
  263. * Buffer for long strings.
  264. */
  265. private @Auto char[] longStrBuf;
  266. /**
  267. * Number of significant <code>char</code>s in <code>longStrBuf</code>.
  268. */
  269. private int longStrBufLen;
  270. /**
  271. * <code>-1</code> to indicate that <code>longStrBuf</code> is used or
  272. * otherwise an offset to the main buffer.
  273. */
  274. // private int longStrBufOffset = -1;
  275. /**
  276. * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
  277. */
  278. private final @Auto char[] bmpChar;
  279. /**
  280. * Buffer for expanding astral NCRs.
  281. */
  282. private final @Auto char[] astralChar;
  283. /**
  284. * The element whose end tag closes the current CDATA or RCDATA element.
  285. */
  286. protected ElementName endTagExpectation = null;
  287. private char[] endTagExpectationAsArray; // not @Auto!
  288. /**
  289. * <code>true</code> if tokenizing an end tag
  290. */
  291. protected boolean endTag;
  292. /**
  293. * The current tag token name.
  294. */
  295. private ElementName tagName = null;
  296. /**
  297. * The current attribute name.
  298. */
  299. protected AttributeName attributeName = null;
  300. // [NOCPP[
  301. /**
  302. * Whether comment tokens are emitted.
  303. */
  304. private boolean wantsComments = false;
  305. /**
  306. * <code>true</code> when HTML4-specific additional errors are requested.
  307. */
  308. protected boolean html4;
  309. /**
  310. * Whether the stream is past the first 512 bytes.
  311. */
  312. private boolean metaBoundaryPassed;
  313. // ]NOCPP]
  314. /**
  315. * The name of the current doctype token.
  316. */
  317. private @Local String doctypeName;
  318. /**
  319. * The public id of the current doctype token.
  320. */
  321. private String publicIdentifier;
  322. /**
  323. * The system id of the current doctype token.
  324. */
  325. private String systemIdentifier;
  326. /**
  327. * The attribute holder.
  328. */
  329. private HtmlAttributes attributes;
  330. // [NOCPP[
  331. /**
  332. * The policy for vertical tab and form feed.
  333. */
  334. private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
  335. /**
  336. * The policy for comments.
  337. */
  338. private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
  339. private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
  340. private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
  341. private boolean html4ModeCompatibleWithXhtml1Schemata;
  342. private final boolean newAttributesEachTime;
  343. // ]NOCPP]
  344. private int mappingLangToXmlLang;
  345. private boolean shouldSuspend;
  346. protected boolean confident;
  347. private int line;
  348. private Interner interner;
  349. // CPPONLY: private boolean viewingXmlSource;
  350. // [NOCPP[
  351. protected LocatorImpl ampersandLocation;
  /**
   * Creates a tokenizer with an explicit attribute-holder allocation policy.
   *
   * @param tokenHandler
   *            the handler for receiving tokens
   * @param newAttributesEachTime
   *            stored as-is; consulted when attribute holders are handed out
   *            or reset
   */
  public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
      // Caller-supplied collaborators and configuration.
      this.newAttributesEachTime = newAttributesEachTime;
      this.tokenHandler = tokenHandler;
      this.encodingDeclarationHandler = null;

      // Scratch arrays for expanded character references: one code unit
      // for BMP characters, two for astral ones.
      this.astralChar = new char[2];
      this.bmpChar = new char[1];

      // No token is in progress yet.
      this.attributes = null;
      this.attributeName = null;
      this.tagName = null;
      this.systemIdentifier = null;
      this.publicIdentifier = null;
      this.doctypeName = null;
  }
  365. // ]NOCPP]
  366. /**
  367. * The constructor.
  368. *
  369. * @param tokenHandler
  370. * the handler for receiving tokens
  371. */
  public Tokenizer(TokenHandler tokenHandler
  // CPPONLY: , boolean viewingXmlSource
  ) {
      // [NOCPP[
      this.newAttributesEachTime = false;
      // ]NOCPP]
      // Caller-supplied collaborator; no encoding declaration handler yet.
      this.tokenHandler = tokenHandler;
      this.encodingDeclarationHandler = null;

      // Scratch arrays for expanded character references: one code unit
      // for BMP characters, two for astral ones.
      this.astralChar = new char[2];
      this.bmpChar = new char[1];

      // No token is in progress yet.
      this.attributes = null;
      this.attributeName = null;
      this.tagName = null;
      this.systemIdentifier = null;
      this.publicIdentifier = null;
      this.doctypeName = null;
      // CPPONLY: this.viewingXmlSource = viewingXmlSource;
  }
  390. public void setInterner(Interner interner) {
  391. this.interner = interner;
  392. }
  393. public void initLocation(String newPublicId, String newSystemId) {
  394. this.systemId = newSystemId;
  395. this.publicId = newPublicId;
  396. }
  397. // CPPONLY: boolean isViewingXmlSource() {
  398. // CPPONLY: return viewingXmlSource;
  399. // CPPONLY: }
  400. // [NOCPP[
  401. /**
  402. * Returns the mappingLangToXmlLang.
  403. *
  404. * @return the mappingLangToXmlLang
  405. */
  406. public boolean isMappingLangToXmlLang() {
  407. return mappingLangToXmlLang == AttributeName.HTML_LANG;
  408. }
  409. /**
  410. * Sets the mappingLangToXmlLang.
  411. *
  412. * @param mappingLangToXmlLang
  413. * the mappingLangToXmlLang to set
  414. */
  415. public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
  416. this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
  417. : AttributeName.HTML;
  418. }
  419. /**
  420. * Sets the error handler.
  421. *
  422. * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
  423. */
  424. public void setErrorHandler(ErrorHandler eh) {
  425. this.errorHandler = eh;
  426. }
  427. public ErrorHandler getErrorHandler() {
  428. return this.errorHandler;
  429. }
  430. /**
  431. * Sets the commentPolicy.
  432. *
  433. * @param commentPolicy
  434. * the commentPolicy to set
  435. */
  436. public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
  437. this.commentPolicy = commentPolicy;
  438. }
  439. /**
  440. * Sets the contentNonXmlCharPolicy.
  441. *
  442. * @param contentNonXmlCharPolicy
  443. * the contentNonXmlCharPolicy to set
  444. */
  445. public void setContentNonXmlCharPolicy(
  446. XmlViolationPolicy contentNonXmlCharPolicy) {
  447. if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
  448. throw new IllegalArgumentException(
  449. "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
  450. }
  451. }
  452. /**
  453. * Sets the contentSpacePolicy.
  454. *
  455. * @param contentSpacePolicy
  456. * the contentSpacePolicy to set
  457. */
  458. public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
  459. this.contentSpacePolicy = contentSpacePolicy;
  460. }
  461. /**
  462. * Sets the xmlnsPolicy.
  463. *
  464. * @param xmlnsPolicy
  465. * the xmlnsPolicy to set
  466. */
  467. public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
  468. if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
  469. throw new IllegalArgumentException("Can't use FATAL here.");
  470. }
  471. this.xmlnsPolicy = xmlnsPolicy;
  472. }
  473. public void setNamePolicy(XmlViolationPolicy namePolicy) {
  474. this.namePolicy = namePolicy;
  475. }
  476. /**
  477. * Sets the html4ModeCompatibleWithXhtml1Schemata.
  478. *
  479. * @param html4ModeCompatibleWithXhtml1Schemata
  480. * the html4ModeCompatibleWithXhtml1Schemata to set
  481. */
  482. public void setHtml4ModeCompatibleWithXhtml1Schemata(
  483. boolean html4ModeCompatibleWithXhtml1Schemata) {
  484. this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
  485. }
  486. // ]NOCPP]
  487. // For the token handler to call
  488. /**
  489. * Sets the tokenizer state and the associated element name. This should
  490. * only ever be used to put the tokenizer into one of the states that have
  491. * a special end tag expectation.
  492. *
  493. * @param specialTokenizerState
  494. * the tokenizer state to set
  495. * @param endTagExpectation
  496. * the expected end tag for transitioning back to normal
  497. */
  498. public void setStateAndEndTagExpectation(int specialTokenizerState,
  499. @Local String endTagExpectation) {
  500. this.stateSave = specialTokenizerState;
  501. if (specialTokenizerState == Tokenizer.DATA) {
  502. return;
  503. }
  504. @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
  505. this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
  506. asArray.length, interner);
  507. endTagExpectationToArray();
  508. }
  509. /**
  510. * Sets the tokenizer state and the associated element name. This should
  511. * only ever be used to put the tokenizer into one of the states that have
  512. * a special end tag expectation.
  513. *
  514. * @param specialTokenizerState
  515. * the tokenizer state to set
  516. * @param endTagExpectation
  517. * the expected end tag for transitioning back to normal
  518. */
  519. public void setStateAndEndTagExpectation(int specialTokenizerState,
  520. ElementName endTagExpectation) {
  521. this.stateSave = specialTokenizerState;
  522. this.endTagExpectation = endTagExpectation;
  523. endTagExpectationToArray();
  524. }
  525. private void endTagExpectationToArray() {
  526. switch (endTagExpectation.getGroup()) {
  527. case TreeBuilder.TITLE:
  528. endTagExpectationAsArray = TITLE_ARR;
  529. return;
  530. case TreeBuilder.SCRIPT:
  531. endTagExpectationAsArray = SCRIPT_ARR;
  532. return;
  533. case TreeBuilder.STYLE:
  534. endTagExpectationAsArray = STYLE_ARR;
  535. return;
  536. case TreeBuilder.PLAINTEXT:
  537. endTagExpectationAsArray = PLAINTEXT_ARR;
  538. return;
  539. case TreeBuilder.XMP:
  540. endTagExpectationAsArray = XMP_ARR;
  541. return;
  542. case TreeBuilder.TEXTAREA:
  543. endTagExpectationAsArray = TEXTAREA_ARR;
  544. return;
  545. case TreeBuilder.IFRAME:
  546. endTagExpectationAsArray = IFRAME_ARR;
  547. return;
  548. case TreeBuilder.NOEMBED:
  549. endTagExpectationAsArray = NOEMBED_ARR;
  550. return;
  551. case TreeBuilder.NOSCRIPT:
  552. endTagExpectationAsArray = NOSCRIPT_ARR;
  553. return;
  554. case TreeBuilder.NOFRAMES:
  555. endTagExpectationAsArray = NOFRAMES_ARR;
  556. return;
  557. default:
  558. assert false: "Bad end tag expectation.";
  559. return;
  560. }
  561. }
  562. /**
  563. * For C++ use only.
  564. */
  565. public void setLineNumber(int line) {
  566. this.line = line;
  567. }
  568. // start Locator impl
  569. /**
  570. * @see org.xml.sax.Locator#getLineNumber()
  571. */
  572. @Inline public int getLineNumber() {
  573. return line;
  574. }
  575. // [NOCPP[
  576. /**
  577. * @see org.xml.sax.Locator#getColumnNumber()
  578. */
  579. @Inline public int getColumnNumber() {
  580. return -1;
  581. }
  582. /**
  583. * @see org.xml.sax.Locator#getPublicId()
  584. */
  585. public String getPublicId() {
  586. return publicId;
  587. }
  588. /**
  589. * @see org.xml.sax.Locator#getSystemId()
  590. */
  591. public String getSystemId() {
  592. return systemId;
  593. }
  594. // end Locator impl
  595. // end public API
  596. public void notifyAboutMetaBoundary() {
  597. metaBoundaryPassed = true;
  598. }
  599. void turnOnAdditionalHtml4Errors() {
  600. html4 = true;
  601. }
  602. // ]NOCPP]
  603. HtmlAttributes emptyAttributes() {
  604. // [NOCPP[
  605. if (newAttributesEachTime) {
  606. return new HtmlAttributes(mappingLangToXmlLang);
  607. } else {
  608. // ]NOCPP]
  609. return HtmlAttributes.EMPTY_ATTRIBUTES;
  610. // [NOCPP[
  611. }
  612. // ]NOCPP]
  613. }
  614. @Inline private void clearStrBufAndAppend(char c) {
  615. strBuf[0] = c;
  616. strBufLen = 1;
  617. }
  618. @Inline private void clearStrBuf() {
  619. strBufLen = 0;
  620. }
  621. /**
  622. * Appends to the smaller buffer.
  623. *
  624. * @param c
  625. * the UTF-16 code unit to append
  626. */
  627. private void appendStrBuf(char c) {
  628. if (strBufLen == strBuf.length) {
  629. char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY];
  630. System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
  631. strBuf = newBuf;
  632. }
  633. strBuf[strBufLen++] = c;
  634. }
  635. /**
  636. * The smaller buffer as a String. Currently only used for error reporting.
  637. *
  638. * <p>
  639. * C++ memory note: The return value must be released.
  640. *
  641. * @return the smaller buffer as a string
  642. */
  643. protected String strBufToString() {
  644. return Portability.newStringFromBuffer(strBuf, 0, strBufLen);
  645. }
  646. /**
  647. * Converts the short buffer to a local name and stores it in
  648. * <code>doctypeName</code> (nothing is returned to the caller).
  649. *
  650. * The stored value is released in emitDoctypeToken().
  651. */
  652. private void strBufToDoctypeName() {
  653. doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
  654. interner);
  655. }
  656. /**
  657. * Emits the smaller buffer as character tokens.
  658. *
  659. * @throws SAXException
  660. * if the token handler threw
  661. */
  662. private void emitStrBuf() throws SAXException {
  663. if (strBufLen > 0) {
  664. tokenHandler.characters(strBuf, 0, strBufLen);
  665. }
  666. }
  667. @Inline private void clearLongStrBuf() {
  668. longStrBufLen = 0;
  669. }
  670. @Inline private void clearLongStrBufAndAppend(char c) {
  671. longStrBuf[0] = c;
  672. longStrBufLen = 1;
  673. }
  674. /**
  675. * Appends to the larger buffer.
  676. *
  677. * @param c
  678. * the UTF-16 code unit to append
  679. */
  680. private void appendLongStrBuf(char c) {
  681. if (longStrBufLen == longStrBuf.length) {
  682. char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)];
  683. System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
  684. longStrBuf = newBuf;
  685. }
  686. longStrBuf[longStrBufLen++] = c;
  687. }
  688. @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
  689. // [NOCPP[
  690. switch (commentPolicy) {
  691. case ALTER_INFOSET:
  692. // detachLongStrBuf();
  693. appendLongStrBuf(' ');
  694. // FALLTHROUGH
  695. case ALLOW:
  696. warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
  697. // ]NOCPP]
  698. appendLongStrBuf('-');
  699. // [NOCPP[
  700. break;
  701. case FATAL:
  702. fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
  703. break;
  704. }
  705. // ]NOCPP]
  706. }
  707. // [NOCPP[
  708. private void maybeAppendSpaceToBogusComment() throws SAXException {
  709. switch (commentPolicy) {
  710. case ALTER_INFOSET:
  711. // detachLongStrBuf();
  712. appendLongStrBuf(' ');
  713. // FALLTHROUGH
  714. case ALLOW:
  715. warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
  716. break;
  717. case FATAL:
  718. fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
  719. break;
  720. }
  721. }
  722. // ]NOCPP]
  723. @Inline private void adjustDoubleHyphenAndAppendToLongStrBufAndErr(char c)
  724. throws SAXException {
  725. errConsecutiveHyphens();
  726. // [NOCPP[
  727. switch (commentPolicy) {
  728. case ALTER_INFOSET:
  729. // detachLongStrBuf();
  730. longStrBufLen--;
  731. appendLongStrBuf(' ');
  732. appendLongStrBuf('-');
  733. // FALLTHROUGH
  734. case ALLOW:
  735. warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
  736. // ]NOCPP]
  737. appendLongStrBuf(c);
  738. // [NOCPP[
  739. break;
  740. case FATAL:
  741. fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
  742. break;
  743. }
  744. // ]NOCPP]
  745. }
  746. private void appendLongStrBuf(@NoLength char[] buffer, int offset, int length) {
  747. int reqLen = longStrBufLen + length;
  748. if (longStrBuf.length < reqLen) {
  749. char[] newBuf = new char[reqLen + (reqLen >> 1)];
  750. System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
  751. longStrBuf = newBuf;
  752. }
  753. System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length);
  754. longStrBufLen = reqLen;
  755. }
  756. /**
  757. * Append the contents of the smaller buffer to the larger one.
  758. */
  759. @Inline private void appendStrBufToLongStrBuf() {
  760. appendLongStrBuf(strBuf, 0, strBufLen);
  761. }
  762. /**
  763. * The larger buffer as a string.
  764. *
  765. * <p>
  766. * C++ memory note: The return value must be released.
  767. *
  768. * @return the larger buffer as a string
  769. */
  770. private String longStrBufToString() {
  771. return Portability.newStringFromBuffer(longStrBuf, 0, longStrBufLen);
  772. }
  773. /**
  774. * Emits the current comment token.
  775. *
  776. * @param pos
  777. * TODO
  778. *
  779. * @throws SAXException
  780. */
  781. private void emitComment(int provisionalHyphens, int pos)
  782. throws SAXException {
  783. // [NOCPP[
  784. if (wantsComments) {
  785. // ]NOCPP]
  786. // if (longStrBufOffset != -1) {
  787. // tokenHandler.comment(buf, longStrBufOffset, longStrBufLen
  788. // - provisionalHyphens);
  789. // } else {
  790. tokenHandler.comment(longStrBuf, 0, longStrBufLen
  791. - provisionalHyphens);
  792. // }
  793. // [NOCPP[
  794. }
  795. // ]NOCPP]
  796. cstart = pos + 1;
  797. }
  798. /**
  799. * Flushes coalesced character tokens.
  800. *
  801. * @param buf
  802. * TODO
  803. * @param pos
  804. * TODO
  805. *
  806. * @throws SAXException
  807. */
  808. protected void flushChars(@NoLength char[] buf, int pos)
  809. throws SAXException {
  810. if (pos > cstart) {
  811. tokenHandler.characters(buf, cstart, pos - cstart);
  812. }
  813. cstart = Integer.MAX_VALUE;
  814. }
  815. /**
  816. * Reports a condition that would make the infoset incompatible with XML
  817. * 1.0 as fatal.
  818. *
  819. * @param message
  820. * the message
  821. * @throws SAXException
  822. * @throws SAXParseException
  823. */
  824. public void fatal(String message) throws SAXException {
  825. SAXParseException spe = new SAXParseException(message, this);
  826. if (errorHandler != null) {
  827. errorHandler.fatalError(spe);
  828. }
  829. throw spe;
  830. }
  831. /**
  832. * Reports a Parse Error.
  833. *
  834. * @param message
  835. * the message
  836. * @throws SAXException
  837. */
  838. public void err(String message) throws SAXException {
  839. if (errorHandler == null) {
  840. return;
  841. }
  842. SAXParseException spe = new SAXParseException(message, this);
  843. errorHandler.error(spe);
  844. }
  845. public void errTreeBuilder(String message) throws SAXException {
  846. ErrorHandler eh = null;
  847. if (tokenHandler instanceof TreeBuilder<?>) {
  848. TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
  849. eh = treeBuilder.getErrorHandler();
  850. }
  851. if (eh == null) {
  852. eh = errorHandler;
  853. }
  854. if (eh == null) {
  855. return;
  856. }
  857. SAXParseException spe = new SAXParseException(message, this);
  858. eh.error(spe);
  859. }
  860. /**
  861. * Reports a warning
  862. *
  863. * @param message
  864. * the message
  865. * @throws SAXException
  866. */
  867. public void warn(String message) throws SAXException {
  868. if (errorHandler == null) {
  869. return;
  870. }
  871. SAXParseException spe = new SAXParseException(message, this);
  872. errorHandler.warning(spe);
  873. }
  /**
   * Resets the <code>attributes</code> field after a tag token has been
   * emitted (called from <code>emitCurrentTagToken</code>).
   *
   * When <code>newAttributesEachTime</code> is set, the reference is simply
   * dropped, so <code>attributeNameComplete</code> will allocate a fresh
   * holder for the next tag. Otherwise the existing holder is cleared in
   * place and reused.
   *
   * NOTE(review): the [NOCPP[ / ]NOCPP] markers presumably exclude the
   * enclosed lines from a Java-to-C++ translation, which would leave only
   * the assignment to null in the translated code -- confirm against the
   * translator before moving any of these lines.
   */
  private void resetAttributes() {
      // [NOCPP[
      if (newAttributesEachTime) {
          // ]NOCPP]
          attributes = null;
          // [NOCPP[
      } else {
          // Reuse the same holder object for the next tag.
          attributes.clear(mappingLangToXmlLang);
      }
      // ]NOCPP]
  }
  888. private void strBufToElementNameString() {
  889. // if (strBufOffset != -1) {
  890. // return ElementName.elementNameByBuffer(buf, strBufOffset, strBufLen);
  891. // } else {
  892. tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
  893. interner);
  894. // }
  895. }
  /**
   * Emits the current tag token (start or end tag, depending on
   * <code>endTag</code>) to the token handler and resets the per-tag
   * fields <code>tagName</code> and <code>attributes</code>.
   *
   * @param selfClosing
   *            whether the tag ended with the self-closing syntax; passed
   *            on to <code>startTag</code> and checked for end tags
   * @param pos
   *            the index of the character that completed the tag; the next
   *            run of character tokens is set to begin right after it
   * @return the value of <code>stateSave</code> when the method finishes:
   *         <code>DATA</code> unless it was changed while the token handler
   *         was running
   * @throws SAXException
   *             if error reporting or the token handler throws
   */
  private int emitCurrentTagToken(boolean selfClosing, int pos)
          throws SAXException {
      // Any following character data starts just after the tag's last char.
      cstart = pos + 1;
      maybeErrSlashInEndTag(selfClosing);
      // Default follow-up state; see the comment above the return statement.
      stateSave = Tokenizer.DATA;
      // Never hand a null attribute holder to the token handler.
      HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
              : attributes);
      if (endTag) {
          /*
           * When an end tag token is emitted, the content model flag must be
           * switched to the PCDATA state.
           */
          maybeErrAttributesOnEndTag(attrs);
          // CPPONLY: if (!viewingXmlSource) {
          tokenHandler.endTag(tagName);
          // CPPONLY: }
          // End tags do not pass their attributes on, so they are disposed
          // of here. NOTE(review): presumably only meaningful in the C++
          // translation -- confirm.
          Portability.delete(attributes);
      } else {
          // CPPONLY: if (viewingXmlSource) {
          // CPPONLY: Portability.delete(attributes);
          // CPPONLY: } else {
          tokenHandler.startTag(tagName, attrs, selfClosing);
          // CPPONLY: }
      }
      tagName.release();
      tagName = null;
      resetAttributes();
      /*
       * The token handler may have called setStateAndEndTagExpectation
       * and changed stateSave since the start of this method.
       */
      return stateSave;
  }
  /**
   * Finishes the attribute name accumulated in <code>strBuf</code>.
   *
   * The name is resolved to an <code>AttributeName</code>, the attribute
   * holder is created if the current tag does not have one yet, and a
   * duplicate name is reported and discarded by setting
   * <code>attributeName</code> to null (the add-attribute methods skip a
   * null name).
   *
   * @throws SAXException
   *             if reporting the duplicate-attribute error throws
   */
  private void attributeNameComplete() throws SAXException {
      // if (strBufOffset != -1) {
      // attributeName = AttributeName.nameByBuffer(buf, strBufOffset,
      // strBufLen, namePolicy != XmlViolationPolicy.ALLOW);
      // } else {
      attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
      // [NOCPP[
              , namePolicy != XmlViolationPolicy.ALLOW
              // ]NOCPP]
              , interner);
      // }
      // The holder is created lazily, on the first attribute of a tag.
      if (attributes == null) {
          attributes = new HtmlAttributes(mappingLangToXmlLang);
      }
      /*
       * When the user agent leaves the attribute name state (and before
       * emitting the tag token, if appropriate), the complete attribute's
       * name must be compared to the other attributes on the same token; if
       * there is already an attribute on the token with the exact same name,
       * then this is a parse error and the new attribute must be dropped,
       * along with the value that gets associated with it (if any).
       */
      if (attributes.contains(attributeName)) {
          errDuplicateAttribute();
          attributeName.release();
          // A null name tells the add-attribute methods to drop the value.
          attributeName = null;
      }
  }
  /**
   * Adds the current attribute, which was written without a value, to
   * <code>attributes</code> and clears <code>attributeName</code>.
   *
   * Does not add anything when <code>attributeName</code> is null (the
   * name was dropped as a duplicate). In HTML4 mode a boolean attribute
   * gets either its own local name or the empty string as value, depending
   * on <code>html4ModeCompatibleWithXhtml1Schemata</code>; a non-boolean
   * attribute is reported as an error. Outside HTML4 mode the value is an
   * empty string, with a warning for <code>src</code> and
   * <code>href</code>.
   *
   * @throws SAXException
   *             if error or warning reporting throws
   */
  private void addAttributeWithoutValue() throws SAXException {
      noteAttributeWithoutValue();
      // [NOCPP[
      if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
              && ElementName.META == tagName) {
          err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
      }
      // ]NOCPP]
      if (attributeName != null) {
          // [NOCPP[
          if (html4) {
              if (attributeName.isBoolean()) {
                  if (html4ModeCompatibleWithXhtml1Schemata) {
                      // Expand the minimized form: the value is the
                      // attribute's own local name.
                      attributes.addAttribute(attributeName,
                              attributeName.getLocal(AttributeName.HTML),
                              xmlnsPolicy);
                  } else {
                      attributes.addAttribute(attributeName, "", xmlnsPolicy);
                  }
              } else {
                  // NOTE(review): when the name is BORDER neither err() nor
                  // addAttribute() runs, so a valueless border attribute is
                  // silently dropped in HTML4 mode -- confirm this is
                  // intended.
                  if (AttributeName.BORDER != attributeName) {
                      err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
                      attributes.addAttribute(attributeName, "", xmlnsPolicy);
                  }
              }
          } else {
              if (AttributeName.SRC == attributeName
                      || AttributeName.HREF == attributeName) {
                  warn("Attribute \u201C"
                          + attributeName.getLocal(AttributeName.HTML)
                          + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
              }
              // ]NOCPP]
              attributes.addAttribute(attributeName,
                      Portability.newEmptyString()
                      // [NOCPP[
                      , xmlnsPolicy
              // ]NOCPP]
              );
              // [NOCPP[
          }
          // ]NOCPP]
          attributeName = null; // attributeName has been adopted by the
          // |attributes| object
      }
  }
  /**
   * Adds the current attribute, with the value accumulated in the long
   * string buffer, to <code>attributes</code> and clears
   * <code>attributeName</code>.
   *
   * Does not add anything when <code>attributeName</code> is null (the
   * name was dropped as a duplicate); in that case the buffered value is
   * not even converted to a string.
   *
   * @throws SAXException
   *             if error reporting throws
   */
  private void addAttributeWithValue() throws SAXException {
      // [NOCPP[
      if (metaBoundaryPassed && ElementName.META == tagName
              && AttributeName.CHARSET == attributeName) {
          err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
      }
      // ]NOCPP]
      if (attributeName != null) {
          String val = longStrBufToString(); // Ownership transferred to
          // HtmlAttributes
          // CPPONLY: if (mViewSource) {
          // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
          // CPPONLY: }
          // [NOCPP[
          // In XHTML 1 schema-compatible HTML4 mode, values of case-folded
          // attributes on start tags are lowercased (ASCII only).
          if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
                  && attributeName.isCaseFolded()) {
              val = newAsciiLowerCaseStringFromString(val);
          }
          // ]NOCPP]
          attributes.addAttribute(attributeName, val
          // [NOCPP[
                  , xmlnsPolicy
          // ]NOCPP]
          );
          attributeName = null; // attributeName has been adopted by the
          // |attributes| object
      }
  }
  1031. // [NOCPP[
  1032. private static String newAsciiLowerCaseStringFromString(String str) {
  1033. if (str == null) {
  1034. return null;
  1035. }
  1036. char[] buf = new char[str.length()];
  1037. for (int i = 0; i < str.length(); i++) {
  1038. char c = str.charAt(i);
  1039. if (c >= 'A' && c <= 'Z') {
  1040. c += 0x20;
  1041. }
  1042. buf[i] = c;
  1043. }
  1044. return new String(buf);
  1045. }
  1046. protected void startErrorReporting() throws SAXException {
  1047. }
  1048. // ]NOCPP]
  /**
   * Starts a tokenization run: initializes the tokenizer, tells the token
   * handler that tokenization is starting, and then calls the
   * <code>startErrorReporting()</code> hook.
   *
   * @throws SAXException
   *             if the token handler or the error reporting hook throws
   */
  public void start() throws SAXException {
      initializeWithoutStarting();
      tokenHandler.startTokenization(this);
      // [NOCPP[
      startErrorReporting();
      // ]NOCPP]
  }
  /**
   * Runs the tokenizer state machine over the unconsumed part of
   * <code>buffer</code>, resuming in the state held in
   * <code>stateSave</code> and <code>returnStateSave</code>.
   *
   * On return, the start of <code>buffer</code> has been moved past the
   * characters that were consumed.
   *
   * @param buffer
   *            the buffer to tokenize, from its start to its end
   * @return the value of <code>lastCR</code> after the run; according to
   *         the idiom notes in <code>stateLoop</code> it is true when the
   *         run stopped on a CR, so that the IO driver can swallow an LF
   *         that follows
   * @throws SAXException
   *             if the state loop throws
   */
  public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
      int state = stateSave;
      int returnState = returnStateSave;
      char c = '\u0000';
      shouldSuspend = false;
      lastCR = false;
      int start = buffer.getStart();
      /**
       * The index of the last <code>char</code> read from <code>buf</code>.
       * It starts one before the buffer start because the state loop
       * pre-increments it before reading a character.
       */
      int pos = start - 1;
      /**
       * The switch below initializes the <code>cstart</code> field: the
       * index of the first <code>char</code> in <code>buf</code> that is
       * part of a coalesced run of character tokens or
       * <code>Integer.MAX_VALUE</code> if there is not a current run being
       * coalesced. For the states listed, a run begins at the start of the
       * buffer.
       */
      switch (state) {
          case DATA:
          case RCDATA:
          case SCRIPT_DATA:
          case PLAINTEXT:
          case RAWTEXT:
          case CDATA_SECTION:
          case SCRIPT_DATA_ESCAPED:
          case SCRIPT_DATA_ESCAPE_START:
          case SCRIPT_DATA_ESCAPE_START_DASH:
          case SCRIPT_DATA_ESCAPED_DASH:
          case SCRIPT_DATA_ESCAPED_DASH_DASH:
          case SCRIPT_DATA_DOUBLE_ESCAPE_START:
          case SCRIPT_DATA_DOUBLE_ESCAPED:
          case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
          case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
          case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
          case SCRIPT_DATA_DOUBLE_ESCAPE_END:
              cstart = start;
              break;
          default:
              cstart = Integer.MAX_VALUE;
              break;
      }
      /**
       * The number of <code>char</code>s in <code>buf</code> that have
       * meaning. (The rest of the array is garbage and should not be
       * examined.)
       *
       * NOTE(review): no variable is declared here any more; this
       * presumably describes the end index that is now passed to the
       * state loop as <code>buffer.getEnd()</code> -- confirm.
       */
      // CPPONLY: if (mViewSource) {
      // CPPONLY: mViewSource.SetBuffer(buffer);
      // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
      // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
      // CPPONLY: } else {
      // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
      // CPPONLY: }
      // [NOCPP[
      pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
              buffer.getEnd());
      // ]NOCPP]
      if (pos == buffer.getEnd()) {
          // exiting due to end of buffer
          buffer.setStart(pos);
      } else {
          // Stopped early: the character at pos was consumed, so the next
          // call resumes with the one after it.
          buffer.setStart(pos + 1);
      }
      return lastCR;
  }
  1121. @SuppressWarnings("unused") private int stateLoop(int state, char c,
  1122. int pos, @NoLength char[] buf, boolean reconsume, int returnState,
  1123. int endPos) throws SAXException {
  1124. /*
  1125. * Idioms used in this code:
  1126. *
  1127. *
  1128. * Consuming the next input character
  1129. *
  1130. * To consume the next input character, the code does this: if (++pos ==
  1131. * endPos) { break stateloop; } c = checkChar(buf, pos);
  1132. *
  1133. *
  1134. * Staying in a state
  1135. *
  1136. * When there's a state that the tokenizer may stay in over multiple
  1137. * input characters, the state has a wrapper |for(;;)| loop and staying
  1138. * in the state continues the loop.
  1139. *
  1140. *
  1141. * Switching to another state
  1142. *
  1143. * To switch to another state, the code sets the state variable to the
  1144. * magic number of the new state. Then it either continues stateloop or
  1145. * breaks out of the state's own wrapper loop if the target state is
  1146. * right after the current state in source order. (This is a partial
  1147. * workaround for Java's lack of goto.)
  1148. *
  1149. *
  1150. * Reconsume support
  1151. *
  1152. * The spec sometimes says that an input character is reconsumed in
  1153. * another state. If a state can ever be entered so that an input
  1154. * character can be reconsumed in it, the state's code starts with an
  1155. * |if (reconsume)| that sets reconsume to false and skips over the
  1156. * normal code for consuming a new character.
  1157. *
  1158. * To reconsume the current character in another state, the code sets
  1159. * |reconsume| to true and then switches to the other state.
  1160. *
  1161. *
  1162. * Emitting character tokens
  1163. *
  1164. * This method emits character tokens lazily. Whenever a new range of
  1165. * character tokens starts, the field cstart must be set to the start
  1166. * index of the range. The flushChars() method must be called at the end
  1167. * of a range to flush it.
  1168. *
  1169. *
  1170. * U+0000 handling
  1171. *
  1172. * The various states have to handle the replacement of U+0000 with
  1173. * U+FFFD. However, if U+0000 would be reconsumed in another state, the
  1174. * replacement doesn't need to happen, because it's handled by the
  1175. * reconsuming state.
  1176. *
  1177. *
  1178. * LF handling
  1179. *
  1180. * Every state needs to increment the line number upon LF unless the LF
  1181. * gets reconsumed by another state which increments the line number.
  1182. *
  1183. *
  1184. * CR handling
  1185. *
  1186. * Every state needs to handle CR unless the CR gets reconsumed and is
  1187. * handled by the reconsuming state. The CR needs to be handled as if it
  1188. * were an LF, the lastCR field must be set to true and then this
  1189. * method must return. The IO driver will then swallow the next
  1190. * character if it is an LF to coalesce CRLF.
  1191. */
  1192. stateloop: for (;;) {
  1193. switch (state) {
  1194. case DATA:
  1195. dataloop: for (;;) {
  1196. if (reconsume) {
  1197. reconsume = false;
  1198. } else {
  1199. if (++pos == endPos) {
  1200. break stateloop;
  1201. }
  1202. c = checkChar(buf, pos);
  1203. }
  1204. switch (c) {
  1205. case '&':
  1206. /*
  1207. * U+0026 AMPERSAND (&) Switch to the character
  1208. * reference in data state.
  1209. */
  1210. flushChars(buf, pos);
  1211. clearStrBufAndAppend(c);
  1212. setAdditionalAndRememberAmpersandLocation('\u0000');
  1213. returnState = state;
  1214. state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
  1215. continue stateloop;
  1216. case '<':
  1217. /*
  1218. * U+003C LESS-THAN SIGN (<) Switch to the tag
  1219. * open state.
  1220. */
  1221. flushChars(buf, pos);
  1222. state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
  1223. break dataloop; // FALL THROUGH continue
  1224. // stateloop;
  1225. case '\u0000':
  1226. emitReplacementCharacter(buf, pos);
  1227. continue;
  1228. case '\r':
  1229. emitCarriageReturn(buf, pos);
  1230. break stateloop;
  1231. case '\n':
  1232. silentLineFeed();
  1233. default:
  1234. /*
  1235. * Anything else Emit the input character as a
  1236. * character token.
  1237. *
  1238. * Stay in the data state.
  1239. */
  1240. continue;
  1241. }
  1242. }
  1243. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  1244. case TAG_OPEN:
  1245. tagopenloop: for (;;) {
  1246. /*
  1247. * The behavior of this state depends on the content
  1248. * model flag.
  1249. */
  1250. if (++pos == endPos) {
  1251. break stateloop;
  1252. }
  1253. c = checkChar(buf, pos);
  1254. /*
  1255. * If the content model flag is set to the PCDATA state
  1256. * Consume the next input character:
  1257. */
  1258. if (c >= 'A' && c <= 'Z') {
  1259. /*
  1260. * U+0041 LATIN CAPITAL LETTER A through to U+005A
  1261. * LATIN CAPITAL LETTER Z Create a new start tag
  1262. * token,
  1263. */
  1264. endTag = false;
  1265. /*
  1266. * set its tag name to the lowercase version of the
  1267. * input character (add 0x0020 to the character's
  1268. * code point),
  1269. */
  1270. clearStrBufAndAppend((char) (c + 0x20));
  1271. /* then switch to the tag name state. */
  1272. state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
  1273. /*
  1274. * (Don't emit the token yet; further details will
  1275. * be filled in before it is emitted.)
  1276. */
  1277. break tagopenloop;
  1278. // continue stateloop;
  1279. } else if (c >= 'a' && c <= 'z') {
  1280. /*
  1281. * U+0061 LATIN SMALL LETTER A through to U+007A
  1282. * LATIN SMALL LETTER Z Create a new start tag
  1283. * token,
  1284. */
  1285. endTag = false;
  1286. /*
  1287. * set its tag name to the input character,
  1288. */
  1289. clearStrBufAndAppend(c);
  1290. /* then switch to the tag name state. */
  1291. state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
  1292. /*
  1293. * (Don't emit the token yet; further details will
  1294. * be filled in before it is emitted.)
  1295. */
  1296. break tagopenloop;
  1297. // continue stateloop;
  1298. }
  1299. switch (c) {
  1300. case '!':
  1301. /*
  1302. * U+0021 EXCLAMATION MARK (!) Switch to the
  1303. * markup declaration open state.
  1304. */
  1305. state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
  1306. continue stateloop;
  1307. case '/':
  1308. /*
  1309. * U+002F SOLIDUS (/) Switch to the close tag
  1310. * open state.
  1311. */
  1312. state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
  1313. continue stateloop;
  1314. case '?':
  1315. // CPPONLY: if (viewingXmlSource) {
  1316. // CPPONLY: state = transition(state,
  1317. // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,
  1318. // CPPONLY: reconsume,
  1319. // CPPONLY: pos);
  1320. // CPPONLY: continue stateloop;
  1321. // CPPONLY: }
  1322. /*
  1323. * U+003F QUESTION MARK (?) Parse error.
  1324. */
  1325. errProcessingInstruction();
  1326. /*
  1327. * Switch to the bogus comment state.
  1328. */
  1329. clearLongStrBufAndAppend(c);
  1330. state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  1331. continue stateloop;
  1332. case '>':
  1333. /*
  1334. * U+003E GREATER-THAN SIGN (>) Parse error.
  1335. */
  1336. errLtGt();
  1337. /*
  1338. * Emit a U+003C LESS-THAN SIGN character token
  1339. * and a U+003E GREATER-THAN SIGN character
  1340. * token.
  1341. */
  1342. tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
  1343. /* Switch to the data state. */
  1344. cstart = pos + 1;
  1345. state = transition(state, Tokenizer.DATA, reconsume, pos);
  1346. continue stateloop;
  1347. default:
  1348. /*
  1349. * Anything else Parse error.
  1350. */
  1351. errBadCharAfterLt(c);
  1352. /*
  1353. * Emit a U+003C LESS-THAN SIGN character token
  1354. */
  1355. tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  1356. /*
  1357. * and reconsume the current input character in
  1358. * the data state.
  1359. */
  1360. cstart = pos;
  1361. reconsume = true;
  1362. state = transition(state, Tokenizer.DATA, reconsume, pos);
  1363. continue stateloop;
  1364. }
  1365. }
  1366. // FALL THROUGH DON'T REORDER
  1367. case TAG_NAME:
  1368. tagnameloop: for (;;) {
  1369. if (++pos == endPos) {
  1370. break stateloop;
  1371. }
  1372. c = checkChar(buf, pos);
  1373. /*
  1374. * Consume the next input character:
  1375. */
  1376. switch (c) {
  1377. case '\r':
  1378. silentCarriageReturn();
  1379. strBufToElementNameString();
  1380. state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  1381. break stateloop;
  1382. case '\n':
  1383. silentLineFeed();
  1384. case ' ':
  1385. case '\t':
  1386. case '\u000C':
  1387. /*
  1388. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  1389. * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  1390. * Switch to the before attribute name state.
  1391. */
  1392. strBufToElementNameString();
  1393. state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  1394. break tagnameloop;
  1395. // continue stateloop;
  1396. case '/':
  1397. /*
  1398. * U+002F SOLIDUS (/) Switch to the self-closing
  1399. * start tag state.
  1400. */
  1401. strBufToElementNameString();
  1402. state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
  1403. continue stateloop;
  1404. case '>':
  1405. /*
  1406. * U+003E GREATER-THAN SIGN (>) Emit the current
  1407. * tag token.
  1408. */
  1409. strBufToElementNameString();
  1410. state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  1411. if (shouldSuspend) {
  1412. break stateloop;
  1413. }
  1414. /*
  1415. * Switch to the data state.
  1416. */
  1417. continue stateloop;
  1418. case '\u0000':
  1419. c = '\uFFFD';
  1420. // fall thru
  1421. default:
  1422. if (c >= 'A' && c <= 'Z') {
  1423. /*
  1424. * U+0041 LATIN CAPITAL LETTER A through to
  1425. * U+005A LATIN CAPITAL LETTER Z Append the
  1426. * lowercase version of the current input
  1427. * character (add 0x0020 to the character's
  1428. * code point) to the current tag token's
  1429. * tag name.
  1430. */
  1431. c += 0x20;
  1432. }
  1433. /*
  1434. * Anything else Append the current input
  1435. * character to the current tag token's tag
  1436. * name.
  1437. */
  1438. appendStrBuf(c);
  1439. /*
  1440. * Stay in the tag name state.
  1441. */
  1442. continue;
  1443. }
  1444. }
  1445. // FALLTHRU DON'T REORDER
  1446. case BEFORE_ATTRIBUTE_NAME:
  1447. beforeattributenameloop: for (;;) {
  1448. if (reconsume) {
  1449. reconsume = false;
  1450. } else {
  1451. if (++pos == endPos) {
  1452. break stateloop;
  1453. }
  1454. c = checkChar(buf, pos);
  1455. }
  1456. /*
  1457. * Consume the next input character:
  1458. */
  1459. switch (c) {
  1460. case '\r':
  1461. silentCarriageReturn();
  1462. break stateloop;
  1463. case '\n':
  1464. silentLineFeed();
  1465. // fall thru
  1466. case ' ':
  1467. case '\t':
  1468. case '\u000C':
  1469. /*
  1470. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  1471. * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  1472. * in the before attribute name state.
  1473. */
  1474. continue;
  1475. case '/':
  1476. /*
  1477. * U+002F SOLIDUS (/) Switch to the self-closing
  1478. * start tag state.
  1479. */
  1480. state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
  1481. continue stateloop;
  1482. case '>':
  1483. /*
  1484. * U+003E GREATER-THAN SIGN (>) Emit the current
  1485. * tag token.
  1486. */
  1487. state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  1488. if (shouldSuspend) {
  1489. break stateloop;
  1490. }
  1491. /*
  1492. * Switch to the data state.
  1493. */
  1494. continue stateloop;
  1495. case '\u0000':
  1496. c = '\uFFFD';
  1497. // fall thru
  1498. case '\"':
  1499. case '\'':
  1500. case '<':
  1501. case '=':
  1502. /*
  1503. * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
  1504. * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
  1505. * SIGN (=) Parse error.
  1506. */
  1507. errBadCharBeforeAttributeNameOrNull(c);
  1508. /*
  1509. * Treat it as per the "anything else" entry
  1510. * below.
  1511. */
  1512. default:
  1513. /*
  1514. * Anything else Start a new attribute in the
  1515. * current tag token.
  1516. */
  1517. if (c >= 'A' && c <= 'Z') {
  1518. /*
  1519. * U+0041 LATIN CAPITAL LETTER A through to
  1520. * U+005A LATIN CAPITAL LETTER Z Set that
  1521. * attribute's name to the lowercase version
  1522. * of the current input character (add
  1523. * 0x0020 to the character's code point)
  1524. */
  1525. c += 0x20;
  1526. }
  1527. /*
  1528. * Set that attribute's name to the current
  1529. * input character,
  1530. */
  1531. clearStrBufAndAppend(c);
  1532. /*
  1533. * and its value to the empty string.
  1534. */
  1535. // Will do later.
  1536. /*
  1537. * Switch to the attribute name state.
  1538. */
  1539. state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
  1540. break beforeattributenameloop;
  1541. // continue stateloop;
  1542. }
  1543. }
  1544. // FALLTHRU DON'T REORDER
  1545. case ATTRIBUTE_NAME:
  1546. attributenameloop: for (;;) {
  1547. if (++pos == endPos) {
  1548. break stateloop;
  1549. }
  1550. c = checkChar(buf, pos);
  1551. /*
  1552. * Consume the next input character:
  1553. */
  1554. switch (c) {
  1555. case '\r':
  1556. silentCarriageReturn();
  1557. attributeNameComplete();
  1558. state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
  1559. break stateloop;
  1560. case '\n':
  1561. silentLineFeed();
  1562. // fall thru
  1563. case ' ':
  1564. case '\t':
  1565. case '\u000C':
  1566. /*
  1567. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  1568. * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  1569. * Switch to the after attribute name state.
  1570. */
  1571. attributeNameComplete();
  1572. state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
  1573. continue stateloop;
  1574. case '/':
  1575. /*
  1576. * U+002F SOLIDUS (/) Switch to the self-closing
  1577. * start tag state.
  1578. */
  1579. attributeNameComplete();
  1580. addAttributeWithoutValue();
  1581. state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
  1582. continue stateloop;
  1583. case '=':
  1584. /*
  1585. * U+003D EQUALS SIGN (=) Switch to the before
  1586. * attribute value state.
  1587. */
  1588. attributeNameComplete();
  1589. state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
  1590. break attributenameloop;
  1591. // continue stateloop;
  1592. case '>':
  1593. /*
  1594. * U+003E GREATER-THAN SIGN (>) Emit the current
  1595. * tag token.
  1596. */
  1597. attributeNameComplete();
  1598. addAttributeWithoutValue();
  1599. state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  1600. if (shouldSuspend) {
  1601. break stateloop;
  1602. }
  1603. /*
  1604. * Switch to the data state.
  1605. */
  1606. continue stateloop;
  1607. case '\u0000':
  1608. c = '\uFFFD';
  1609. // fall thru
  1610. case '\"':
  1611. case '\'':
  1612. case '<':
  1613. /*
  1614. * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
  1615. * (') U+003C LESS-THAN SIGN (<) Parse error.
  1616. */
  1617. errQuoteOrLtInAttributeNameOrNull(c);
  1618. /*
  1619. * Treat it as per the "anything else" entry
  1620. * below.
  1621. */
  1622. default:
  1623. if (c >= 'A' && c <= 'Z') {
  1624. /*
  1625. * U+0041 LATIN CAPITAL LETTER A through to
  1626. * U+005A LATIN CAPITAL LETTER Z Append the
  1627. * lowercase version of the current input
  1628. * character (add 0x0020 to the character's
  1629. * code point) to the current attribute's
  1630. * name.
  1631. */
  1632. c += 0x20;
  1633. }
  1634. /*
  1635. * Anything else Append the current input
  1636. * character to the current attribute's name.
  1637. */
  1638. appendStrBuf(c);
  1639. /*
  1640. * Stay in the attribute name state.
  1641. */
  1642. continue;
  1643. }
  1644. }
  1645. // FALLTHRU DON'T REORDER
  1646. case BEFORE_ATTRIBUTE_VALUE:
  1647. beforeattributevalueloop: for (;;) {
  1648. if (++pos == endPos) {
  1649. break stateloop;
  1650. }
  1651. c = checkChar(buf, pos);
  1652. /*
  1653. * Consume the next input character:
  1654. */
  1655. switch (c) {
  1656. case '\r':
  1657. silentCarriageReturn();
  1658. break stateloop;
  1659. case '\n':
  1660. silentLineFeed();
  1661. // fall thru
  1662. case ' ':
  1663. case '\t':
  1664. case '\u000C':
  1665. /*
  1666. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  1667. * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  1668. * in the before attribute value state.
  1669. */
  1670. continue;
  1671. case '"':
  1672. /*
  1673. * U+0022 QUOTATION MARK (") Switch to the
  1674. * attribute value (double-quoted) state.
  1675. */
  1676. clearLongStrBuf();
  1677. state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
  1678. break beforeattributevalueloop;
  1679. // continue stateloop;
  1680. case '&':
  1681. /*
  1682. * U+0026 AMPERSAND (&) Switch to the attribute
  1683. * value (unquoted) state and reconsume this
  1684. * input character.
  1685. */
  1686. clearLongStrBuf();
  1687. reconsume = true;
  1688. state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
  1689. noteUnquotedAttributeValue();
  1690. continue stateloop;
  1691. case '\'':
  1692. /*
  1693. * U+0027 APOSTROPHE (') Switch to the attribute
  1694. * value (single-quoted) state.
  1695. */
  1696. clearLongStrBuf();
  1697. state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
  1698. continue stateloop;
  1699. case '>':
  1700. /*
  1701. * U+003E GREATER-THAN SIGN (>) Parse error.
  1702. */
  1703. errAttributeValueMissing();
  1704. /*
  1705. * Emit the current tag token.
  1706. */
  1707. addAttributeWithoutValue();
  1708. state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  1709. if (shouldSuspend) {
  1710. break stateloop;
  1711. }
  1712. /*
  1713. * Switch to the data state.
  1714. */
  1715. continue stateloop;
  1716. case '\u0000':
  1717. c = '\uFFFD';
  1718. // fall thru
  1719. case '<':
  1720. case '=':
  1721. case '`':
  1722. /*
  1723. * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
  1724. * (=) U+0060 GRAVE ACCENT (`)
  1725. */
  1726. errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
  1727. /*
  1728. * Treat it as per the "anything else" entry
  1729. * below.
  1730. */
  1731. default:
  1732. // [NOCPP[
  1733. errHtml4NonNameInUnquotedAttribute(c);
  1734. // ]NOCPP]
  1735. /*
  1736. * Anything else Append the current input
  1737. * character to the current attribute's value.
  1738. */
  1739. clearLongStrBufAndAppend(c);
  1740. /*
  1741. * Switch to the attribute value (unquoted)
  1742. * state.
  1743. */
  1744. state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
  1745. noteUnquotedAttributeValue();
  1746. continue stateloop;
  1747. }
  1748. }
  1749. // FALLTHRU DON'T REORDER
  1750. case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
  1751. attributevaluedoublequotedloop: for (;;) {
  1752. if (reconsume) {
  1753. reconsume = false;
  1754. } else {
  1755. if (++pos == endPos) {
  1756. break stateloop;
  1757. }
  1758. c = checkChar(buf, pos);
  1759. }
  1760. /*
  1761. * Consume the next input character:
  1762. */
  1763. switch (c) {
  1764. case '"':
  1765. /*
  1766. * U+0022 QUOTATION MARK (") Switch to the after
  1767. * attribute value (quoted) state.
  1768. */
  1769. addAttributeWithValue();
  1770. state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
  1771. break attributevaluedoublequotedloop;
  1772. // continue stateloop;
  1773. case '&':
  1774. /*
  1775. * U+0026 AMPERSAND (&) Switch to the character
  1776. * reference in attribute value state, with the
  1777. * additional allowed character being U+0022
  1778. * QUOTATION MARK (").
  1779. */
  1780. clearStrBufAndAppend(c);
  1781. setAdditionalAndRememberAmpersandLocation('\"');
  1782. returnState = state;
  1783. state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
  1784. continue stateloop;
  1785. case '\r':
  1786. appendLongStrBufCarriageReturn();
  1787. break stateloop;
  1788. case '\n':
  1789. appendLongStrBufLineFeed();
  1790. continue;
  1791. case '\u0000':
  1792. c = '\uFFFD';
  1793. // fall thru
  1794. default:
  1795. /*
  1796. * Anything else Append the current input
  1797. * character to the current attribute's value.
  1798. */
  1799. appendLongStrBuf(c);
  1800. /*
  1801. * Stay in the attribute value (double-quoted)
  1802. * state.
  1803. */
  1804. continue;
  1805. }
  1806. }
  1807. // FALLTHRU DON'T REORDER
  1808. case AFTER_ATTRIBUTE_VALUE_QUOTED:
  1809. afterattributevaluequotedloop: for (;;) {
  1810. if (++pos == endPos) {
  1811. break stateloop;
  1812. }
  1813. c = checkChar(buf, pos);
  1814. /*
  1815. * Consume the next input character:
  1816. */
  1817. switch (c) {
  1818. case '\r':
  1819. silentCarriageReturn();
  1820. state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  1821. break stateloop;
  1822. case '\n':
  1823. silentLineFeed();
  1824. // fall thru
  1825. case ' ':
  1826. case '\t':
  1827. case '\u000C':
  1828. /*
  1829. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  1830. * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  1831. * Switch to the before attribute name state.
  1832. */
  1833. state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  1834. continue stateloop;
  1835. case '/':
  1836. /*
  1837. * U+002F SOLIDUS (/) Switch to the self-closing
  1838. * start tag state.
  1839. */
  1840. state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
  1841. break afterattributevaluequotedloop;
  1842. // continue stateloop;
  1843. case '>':
  1844. /*
  1845. * U+003E GREATER-THAN SIGN (>) Emit the current
  1846. * tag token.
  1847. */
  1848. state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  1849. if (shouldSuspend) {
  1850. break stateloop;
  1851. }
  1852. /*
  1853. * Switch to the data state.
  1854. */
  1855. continue stateloop;
  1856. default:
  1857. /*
  1858. * Anything else Parse error.
  1859. */
  1860. errNoSpaceBetweenAttributes();
  1861. /*
  1862. * Reconsume the character in the before
  1863. * attribute name state.
  1864. */
  1865. reconsume = true;
  1866. state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  1867. continue stateloop;
  1868. }
  1869. }
  1870. // FALLTHRU DON'T REORDER
  1871. case SELF_CLOSING_START_TAG:
  1872. if (++pos == endPos) {
  1873. break stateloop;
  1874. }
  1875. c = checkChar(buf, pos);
  1876. /*
  1877. * Consume the next input character:
  1878. */
  1879. switch (c) {
  1880. case '>':
  1881. /*
  1882. * U+003E GREATER-THAN SIGN (>) Set the self-closing
  1883. * flag of the current tag token. Emit the current
  1884. * tag token.
  1885. */
  1886. // [NOCPP[
  1887. errHtml4XmlVoidSyntax();
  1888. // ]NOCPP]
  1889. state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
  1890. if (shouldSuspend) {
  1891. break stateloop;
  1892. }
  1893. /*
  1894. * Switch to the data state.
  1895. */
  1896. continue stateloop;
  1897. default:
  1898. /* Anything else Parse error. */
  1899. errSlashNotFollowedByGt();
  1900. /*
  1901. * Reconsume the character in the before attribute
  1902. * name state.
  1903. */
  1904. reconsume = true;
  1905. state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  1906. continue stateloop;
  1907. }
  1908. // XXX reorder point
  1909. case ATTRIBUTE_VALUE_UNQUOTED:
  1910. for (;;) {
  1911. if (reconsume) {
  1912. reconsume = false;
  1913. } else {
  1914. if (++pos == endPos) {
  1915. break stateloop;
  1916. }
  1917. c = checkChar(buf, pos);
  1918. }
  1919. /*
  1920. * Consume the next input character:
  1921. */
  1922. switch (c) {
  1923. case '\r':
  1924. silentCarriageReturn();
  1925. addAttributeWithValue();
  1926. state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  1927. break stateloop;
  1928. case '\n':
  1929. silentLineFeed();
  1930. // fall thru
  1931. case ' ':
  1932. case '\t':
  1933. case '\u000C':
  1934. /*
  1935. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  1936. * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  1937. * Switch to the before attribute name state.
  1938. */
  1939. addAttributeWithValue();
  1940. state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  1941. continue stateloop;
  1942. case '&':
  1943. /*
  1944. * U+0026 AMPERSAND (&) Switch to the character
  1945. * reference in attribute value state, with the
  1946. * additional allowed character being U+003E
  1947. * GREATER-THAN SIGN (>)
  1948. */
  1949. clearStrBufAndAppend(c);
  1950. setAdditionalAndRememberAmpersandLocation('>');
  1951. returnState = state;
  1952. state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
  1953. continue stateloop;
  1954. case '>':
  1955. /*
  1956. * U+003E GREATER-THAN SIGN (>) Emit the current
  1957. * tag token.
  1958. */
  1959. addAttributeWithValue();
  1960. state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  1961. if (shouldSuspend) {
  1962. break stateloop;
  1963. }
  1964. /*
  1965. * Switch to the data state.
  1966. */
  1967. continue stateloop;
  1968. case '\u0000':
  1969. c = '\uFFFD';
  1970. // fall thru
  1971. case '<':
  1972. case '\"':
  1973. case '\'':
  1974. case '=':
  1975. case '`':
  1976. /*
  1977. * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
  1978. * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
  1979. * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
  1980. */
  1981. errUnquotedAttributeValOrNull(c);
  1982. /*
  1983. * Treat it as per the "anything else" entry
  1984. * below.
  1985. */
  1986. // fall through
  1987. default:
  1988. // [NOCPP]
  1989. errHtml4NonNameInUnquotedAttribute(c);
  1990. // ]NOCPP]
  1991. /*
  1992. * Anything else Append the current input
  1993. * character to the current attribute's value.
  1994. */
  1995. appendLongStrBuf(c);
  1996. /*
  1997. * Stay in the attribute value (unquoted) state.
  1998. */
  1999. continue;
  2000. }
  2001. }
  2002. // XXX reorder point
  2003. case AFTER_ATTRIBUTE_NAME:
  2004. for (;;) {
  2005. if (++pos == endPos) {
  2006. break stateloop;
  2007. }
  2008. c = checkChar(buf, pos);
  2009. /*
  2010. * Consume the next input character:
  2011. */
  2012. switch (c) {
  2013. case '\r':
  2014. silentCarriageReturn();
  2015. break stateloop;
  2016. case '\n':
  2017. silentLineFeed();
  2018. // fall thru
  2019. case ' ':
  2020. case '\t':
  2021. case '\u000C':
  2022. /*
  2023. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  2024. * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  2025. * in the after attribute name state.
  2026. */
  2027. continue;
  2028. case '/':
  2029. /*
  2030. * U+002F SOLIDUS (/) Switch to the self-closing
  2031. * start tag state.
  2032. */
  2033. addAttributeWithoutValue();
  2034. state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
  2035. continue stateloop;
  2036. case '=':
  2037. /*
  2038. * U+003D EQUALS SIGN (=) Switch to the before
  2039. * attribute value state.
  2040. */
  2041. state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
  2042. continue stateloop;
  2043. case '>':
  2044. /*
  2045. * U+003E GREATER-THAN SIGN (>) Emit the current
  2046. * tag token.
  2047. */
  2048. addAttributeWithoutValue();
  2049. state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  2050. if (shouldSuspend) {
  2051. break stateloop;
  2052. }
  2053. /*
  2054. * Switch to the data state.
  2055. */
  2056. continue stateloop;
  2057. case '\u0000':
  2058. c = '\uFFFD';
  2059. // fall thru
  2060. case '\"':
  2061. case '\'':
  2062. case '<':
  2063. errQuoteOrLtInAttributeNameOrNull(c);
  2064. /*
  2065. * Treat it as per the "anything else" entry
  2066. * below.
  2067. */
  2068. default:
  2069. addAttributeWithoutValue();
  2070. /*
  2071. * Anything else Start a new attribute in the
  2072. * current tag token.
  2073. */
  2074. if (c >= 'A' && c <= 'Z') {
  2075. /*
  2076. * U+0041 LATIN CAPITAL LETTER A through to
  2077. * U+005A LATIN CAPITAL LETTER Z Set that
  2078. * attribute's name to the lowercase version
  2079. * of the current input character (add
  2080. * 0x0020 to the character's code point)
  2081. */
  2082. c += 0x20;
  2083. }
  2084. /*
  2085. * Set that attribute's name to the current
  2086. * input character,
  2087. */
  2088. clearStrBufAndAppend(c);
  2089. /*
  2090. * and its value to the empty string.
  2091. */
  2092. // Will do later.
  2093. /*
  2094. * Switch to the attribute name state.
  2095. */
  2096. state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
  2097. continue stateloop;
  2098. }
  2099. }
  2100. // XXX reorder point
  2101. case MARKUP_DECLARATION_OPEN:
  2102. markupdeclarationopenloop: for (;;) {
  2103. if (++pos == endPos) {
  2104. break stateloop;
  2105. }
  2106. c = checkChar(buf, pos);
  2107. /*
  2108. * If the next two characters are both U+002D
  2109. * HYPHEN-MINUS characters (-), consume those two
  2110. * characters, create a comment token whose data is the
  2111. * empty string, and switch to the comment start state.
  2112. *
  2113. * Otherwise, if the next seven characters are an ASCII
  2114. * case-insensitive match for the word "DOCTYPE", then
  2115. * consume those characters and switch to the DOCTYPE
  2116. * state.
  2117. *
  2118. * Otherwise, if the insertion mode is
  2119. * "in foreign content" and the current node is not an
  2120. * element in the HTML namespace and the next seven
  2121. * characters are a case-sensitive match for the string
  2122. * "[CDATA[" (the five uppercase letters "CDATA" with a
  2123. * U+005B LEFT SQUARE BRACKET character before and
  2124. * after), then consume those characters and switch to
  2125. * the CDATA section state.
  2126. *
  2127. * Otherwise, this is a parse error. Switch to the bogus
  2128. * comment state. The next character that is consumed,
  2129. * if any, is the first character that will be in the
  2130. * comment.
  2131. */
  2132. switch (c) {
  2133. case '-':
  2134. clearLongStrBufAndAppend(c);
  2135. state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
  2136. break markupdeclarationopenloop;
  2137. // continue stateloop;
  2138. case 'd':
  2139. case 'D':
  2140. clearLongStrBufAndAppend(c);
  2141. index = 0;
  2142. state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
  2143. continue stateloop;
  2144. case '[':
  2145. if (tokenHandler.cdataSectionAllowed()) {
  2146. clearLongStrBufAndAppend(c);
  2147. index = 0;
  2148. state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
  2149. continue stateloop;
  2150. }
  2151. // else fall through
  2152. default:
  2153. errBogusComment();
  2154. clearLongStrBuf();
  2155. reconsume = true;
  2156. state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  2157. continue stateloop;
  2158. }
  2159. }
  2160. // FALLTHRU DON'T REORDER
  2161. case MARKUP_DECLARATION_HYPHEN:
  2162. markupdeclarationhyphenloop: for (;;) {
  2163. if (++pos == endPos) {
  2164. break stateloop;
  2165. }
  2166. c = checkChar(buf, pos);
  2167. switch (c) {
  2168. case '\u0000':
  2169. break stateloop;
  2170. case '-':
  2171. clearLongStrBuf();
  2172. state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
  2173. break markupdeclarationhyphenloop;
  2174. // continue stateloop;
  2175. default:
  2176. errBogusComment();
  2177. reconsume = true;
  2178. state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  2179. continue stateloop;
  2180. }
  2181. }
  2182. // FALLTHRU DON'T REORDER
  2183. case COMMENT_START:
  2184. commentstartloop: for (;;) {
  2185. if (++pos == endPos) {
  2186. break stateloop;
  2187. }
  2188. c = checkChar(buf, pos);
  2189. /*
  2190. * Comment start state
  2191. *
  2192. *
  2193. * Consume the next input character:
  2194. */
  2195. switch (c) {
  2196. case '-':
  2197. /*
  2198. * U+002D HYPHEN-MINUS (-) Switch to the comment
  2199. * start dash state.
  2200. */
  2201. appendLongStrBuf(c);
  2202. state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
  2203. continue stateloop;
  2204. case '>':
  2205. /*
  2206. * U+003E GREATER-THAN SIGN (>) Parse error.
  2207. */
  2208. errPrematureEndOfComment();
  2209. /* Emit the comment token. */
  2210. emitComment(0, pos);
  2211. /*
  2212. * Switch to the data state.
  2213. */
  2214. state = transition(state, Tokenizer.DATA, reconsume, pos);
  2215. continue stateloop;
  2216. case '\r':
  2217. appendLongStrBufCarriageReturn();
  2218. state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2219. break stateloop;
  2220. case '\n':
  2221. appendLongStrBufLineFeed();
  2222. state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2223. break commentstartloop;
  2224. case '\u0000':
  2225. c = '\uFFFD';
  2226. // fall thru
  2227. default:
  2228. /*
  2229. * Anything else Append the input character to
  2230. * the comment token's data.
  2231. */
  2232. appendLongStrBuf(c);
  2233. /*
  2234. * Switch to the comment state.
  2235. */
  2236. state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2237. break commentstartloop;
  2238. // continue stateloop;
  2239. }
  2240. }
  2241. // FALLTHRU DON'T REORDER
  2242. case COMMENT:
  2243. commentloop: for (;;) {
  2244. if (++pos == endPos) {
  2245. break stateloop;
  2246. }
  2247. c = checkChar(buf, pos);
  2248. /*
  2249. * Comment state Consume the next input character:
  2250. */
  2251. switch (c) {
  2252. case '-':
  2253. /*
  2254. * U+002D HYPHEN-MINUS (-) Switch to the comment
  2255. * end dash state
  2256. */
  2257. appendLongStrBuf(c);
  2258. state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
  2259. break commentloop;
  2260. // continue stateloop;
  2261. case '\r':
  2262. appendLongStrBufCarriageReturn();
  2263. break stateloop;
  2264. case '\n':
  2265. appendLongStrBufLineFeed();
  2266. continue;
  2267. case '\u0000':
  2268. c = '\uFFFD';
  2269. // fall thru
  2270. default:
  2271. /*
  2272. * Anything else Append the input character to
  2273. * the comment token's data.
  2274. */
  2275. appendLongStrBuf(c);
  2276. /*
  2277. * Stay in the comment state.
  2278. */
  2279. continue;
  2280. }
  2281. }
  2282. // FALLTHRU DON'T REORDER
  2283. case COMMENT_END_DASH:
  2284. commentenddashloop: for (;;) {
  2285. if (++pos == endPos) {
  2286. break stateloop;
  2287. }
  2288. c = checkChar(buf, pos);
  2289. /*
  2290. * Comment end dash state Consume the next input
  2291. * character:
  2292. */
  2293. switch (c) {
  2294. case '-':
  2295. /*
  2296. * U+002D HYPHEN-MINUS (-) Switch to the comment
  2297. * end state
  2298. */
  2299. appendLongStrBuf(c);
  2300. state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
  2301. break commentenddashloop;
  2302. // continue stateloop;
  2303. case '\r':
  2304. appendLongStrBufCarriageReturn();
  2305. state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2306. break stateloop;
  2307. case '\n':
  2308. appendLongStrBufLineFeed();
  2309. state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2310. continue stateloop;
  2311. case '\u0000':
  2312. c = '\uFFFD';
  2313. // fall thru
  2314. default:
  2315. /*
  2316. * Anything else Append a U+002D HYPHEN-MINUS
  2317. * (-) character and the input character to the
  2318. * comment token's data.
  2319. */
  2320. appendLongStrBuf(c);
  2321. /*
  2322. * Switch to the comment state.
  2323. */
  2324. state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2325. continue stateloop;
  2326. }
  2327. }
  2328. // FALLTHRU DON'T REORDER
  2329. case COMMENT_END:
  2330. commentendloop: for (;;) {
  2331. if (++pos == endPos) {
  2332. break stateloop;
  2333. }
  2334. c = checkChar(buf, pos);
  2335. /*
  2336. * Comment end state Consume the next input
  2337. * character:
  2338. */
  2339. switch (c) {
  2340. case '>':
  2341. /*
  2342. * U+003E GREATER-THAN SIGN (>) Emit the comment
  2343. * token.
  2344. */
  2345. emitComment(2, pos);
  2346. /*
  2347. * Switch to the data state.
  2348. */
  2349. state = transition(state, Tokenizer.DATA, reconsume, pos);
  2350. continue stateloop;
  2351. case '-':
  2352. /* U+002D HYPHEN-MINUS (-) Parse error. */
  2353. /*
  2354. * Append a U+002D HYPHEN-MINUS (-) character to
  2355. * the comment token's data.
  2356. */
  2357. adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
  2358. /*
  2359. * Stay in the comment end state.
  2360. */
  2361. continue;
  2362. case '\r':
  2363. adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn();
  2364. state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2365. break stateloop;
  2366. case '\n':
  2367. adjustDoubleHyphenAndAppendToLongStrBufLineFeed();
  2368. state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2369. continue stateloop;
  2370. case '!':
  2371. errHyphenHyphenBang();
  2372. appendLongStrBuf(c);
  2373. state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
  2374. continue stateloop;
  2375. case '\u0000':
  2376. c = '\uFFFD';
  2377. // fall thru
  2378. default:
  2379. /*
  2380. * Append two U+002D HYPHEN-MINUS (-) characters
  2381. * and the input character to the comment
  2382. * token's data.
  2383. */
  2384. adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
  2385. /*
  2386. * Switch to the comment state.
  2387. */
  2388. state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2389. continue stateloop;
  2390. }
  2391. }
  2392. // XXX reorder point
  2393. case COMMENT_END_BANG:
  2394. for (;;) {
  2395. if (++pos == endPos) {
  2396. break stateloop;
  2397. }
  2398. c = checkChar(buf, pos);
  2399. /*
  2400. * Comment end bang state
  2401. *
  2402. * Consume the next input character:
  2403. */
  2404. switch (c) {
  2405. case '>':
  2406. /*
  2407. * U+003E GREATER-THAN SIGN (>) Emit the comment
  2408. * token.
  2409. */
  2410. emitComment(3, pos);
  2411. /*
  2412. * Switch to the data state.
  2413. */
  2414. state = transition(state, Tokenizer.DATA, reconsume, pos);
  2415. continue stateloop;
  2416. case '-':
  2417. /*
  2418. * Append two U+002D HYPHEN-MINUS (-) characters
  2419. * and a U+0021 EXCLAMATION MARK (!) character
  2420. * to the comment token's data.
  2421. */
  2422. appendLongStrBuf(c);
  2423. /*
  2424. * Switch to the comment end dash state.
  2425. */
  2426. state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
  2427. continue stateloop;
  2428. case '\r':
  2429. appendLongStrBufCarriageReturn();
  2430. break stateloop;
  2431. case '\n':
  2432. appendLongStrBufLineFeed();
  2433. continue;
  2434. case '\u0000':
  2435. c = '\uFFFD';
  2436. // fall thru
  2437. default:
  2438. /*
  2439. * Anything else Append two U+002D HYPHEN-MINUS
  2440. * (-) characters, a U+0021 EXCLAMATION MARK (!)
  2441. * character, and the input character to the
  2442. * comment token's data. Switch to the comment
  2443. * state.
  2444. */
  2445. appendLongStrBuf(c);
  2446. /*
  2447. * Switch to the comment state.
  2448. */
  2449. state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2450. continue stateloop;
  2451. }
  2452. }
  2453. // XXX reorder point
  2454. case COMMENT_START_DASH:
  2455. if (++pos == endPos) {
  2456. break stateloop;
  2457. }
  2458. c = checkChar(buf, pos);
  2459. /*
  2460. * Comment start dash state
  2461. *
  2462. * Consume the next input character:
  2463. */
  2464. switch (c) {
  2465. case '-':
  2466. /*
  2467. * U+002D HYPHEN-MINUS (-) Switch to the comment end
  2468. * state
  2469. */
  2470. appendLongStrBuf(c);
  2471. state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
  2472. continue stateloop;
  2473. case '>':
  2474. errPrematureEndOfComment();
  2475. /* Emit the comment token. */
  2476. emitComment(1, pos);
  2477. /*
  2478. * Switch to the data state.
  2479. */
  2480. state = transition(state, Tokenizer.DATA, reconsume, pos);
  2481. continue stateloop;
  2482. case '\r':
  2483. appendLongStrBufCarriageReturn();
  2484. state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2485. break stateloop;
  2486. case '\n':
  2487. appendLongStrBufLineFeed();
  2488. state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2489. continue stateloop;
  2490. case '\u0000':
  2491. c = '\uFFFD';
  2492. // fall thru
  2493. default:
  2494. /*
  2495. * Append a U+002D HYPHEN-MINUS character (-) and
  2496. * the current input character to the comment
  2497. * token's data.
  2498. */
  2499. appendLongStrBuf(c);
  2500. /*
  2501. * Switch to the comment state.
  2502. */
  2503. state = transition(state, Tokenizer.COMMENT, reconsume, pos);
  2504. continue stateloop;
  2505. }
  2506. // XXX reorder point
  2507. case CDATA_START:
  2508. for (;;) {
  2509. if (++pos == endPos) {
  2510. break stateloop;
  2511. }
  2512. c = checkChar(buf, pos);
  2513. if (index < 6) { // CDATA_LSQB.length
  2514. if (c == Tokenizer.CDATA_LSQB[index]) {
  2515. appendLongStrBuf(c);
  2516. } else {
  2517. errBogusComment();
  2518. reconsume = true;
  2519. state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  2520. continue stateloop;
  2521. }
  2522. index++;
  2523. continue;
  2524. } else {
  2525. cstart = pos; // start coalescing
  2526. reconsume = true;
  2527. state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
  2528. break; // FALL THROUGH continue stateloop;
  2529. }
  2530. }
  2531. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  2532. case CDATA_SECTION:
  2533. cdatasectionloop: for (;;) {
  2534. if (reconsume) {
  2535. reconsume = false;
  2536. } else {
  2537. if (++pos == endPos) {
  2538. break stateloop;
  2539. }
  2540. c = checkChar(buf, pos);
  2541. }
  2542. switch (c) {
  2543. case ']':
  2544. flushChars(buf, pos);
  2545. state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
  2546. break cdatasectionloop; // FALL THROUGH
  2547. case '\u0000':
  2548. emitReplacementCharacter(buf, pos);
  2549. continue;
  2550. case '\r':
  2551. emitCarriageReturn(buf, pos);
  2552. break stateloop;
  2553. case '\n':
  2554. silentLineFeed();
  2555. // fall thru
  2556. default:
  2557. continue;
  2558. }
  2559. }
  2560. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  2561. case CDATA_RSQB:
  2562. cdatarsqb: for (;;) {
  2563. if (++pos == endPos) {
  2564. break stateloop;
  2565. }
  2566. c = checkChar(buf, pos);
  2567. switch (c) {
  2568. case ']':
  2569. state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
  2570. break cdatarsqb;
  2571. default:
  2572. tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,
  2573. 1);
  2574. cstart = pos;
  2575. reconsume = true;
  2576. state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
  2577. continue stateloop;
  2578. }
  2579. }
  2580. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  2581. case CDATA_RSQB_RSQB:
  2582. if (++pos == endPos) {
  2583. break stateloop;
  2584. }
  2585. c = checkChar(buf, pos);
  2586. switch (c) {
  2587. case '>':
  2588. cstart = pos + 1;
  2589. state = transition(state, Tokenizer.DATA, reconsume, pos);
  2590. continue stateloop;
  2591. default:
  2592. tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
  2593. cstart = pos;
  2594. reconsume = true;
  2595. state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
  2596. continue stateloop;
  2597. }
  2598. // XXX reorder point
  2599. case ATTRIBUTE_VALUE_SINGLE_QUOTED:
  2600. attributevaluesinglequotedloop: for (;;) {
  2601. if (reconsume) {
  2602. reconsume = false;
  2603. } else {
  2604. if (++pos == endPos) {
  2605. break stateloop;
  2606. }
  2607. c = checkChar(buf, pos);
  2608. }
  2609. /*
  2610. * Consume the next input character:
  2611. */
  2612. switch (c) {
  2613. case '\'':
  2614. /*
  2615. * U+0027 APOSTROPHE (') Switch to the after
  2616. * attribute value (quoted) state.
  2617. */
  2618. addAttributeWithValue();
  2619. state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
  2620. continue stateloop;
  2621. case '&':
  2622. /*
  2623. * U+0026 AMPERSAND (&) Switch to the character
  2624. * reference in attribute value state, with the
  2625. * additional allowed character being U+0027
  2626. * APOSTROPHE (').
  2627. */
  2628. clearStrBufAndAppend(c);
  2629. setAdditionalAndRememberAmpersandLocation('\'');
  2630. returnState = state;
  2631. state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
  2632. break attributevaluesinglequotedloop;
  2633. // continue stateloop;
  2634. case '\r':
  2635. appendLongStrBufCarriageReturn();
  2636. break stateloop;
  2637. case '\n':
  2638. appendLongStrBufLineFeed();
  2639. continue;
  2640. case '\u0000':
  2641. c = '\uFFFD';
  2642. // fall thru
  2643. default:
  2644. /*
  2645. * Anything else Append the current input
  2646. * character to the current attribute's value.
  2647. */
  2648. appendLongStrBuf(c);
  2649. /*
  2650. * Stay in the attribute value (single-quoted)
  2651. * state.
  2652. */
  2653. continue;
  2654. }
  2655. }
  2656. // FALLTHRU DON'T REORDER
  2657. case CONSUME_CHARACTER_REFERENCE:
  2658. if (++pos == endPos) {
  2659. break stateloop;
  2660. }
  2661. c = checkChar(buf, pos);
  2662. if (c == '\u0000') {
  2663. break stateloop;
  2664. }
  2665. /*
  2666. * Unlike the definition in the spec, this state does not
  2667. * return a value and never requires the caller to
  2668. * backtrack. This state takes care of emitting characters
  2669. * or appending to the current attribute value. It also
  2670. * takes care of that in the case when consuming the
  2671. * character reference fails.
  2672. */
  2673. /*
  2674. * This section defines how to consume a character
  2675. * reference. This definition is used when parsing character
  2676. * references in text and in attributes.
  2677. *
  2678. * The behavior depends on the identity of the next
  2679. * character (the one immediately after the U+0026 AMPERSAND
  2680. * character):
  2681. */
  2682. switch (c) {
  2683. case ' ':
  2684. case '\t':
  2685. case '\n':
  2686. case '\r': // we'll reconsume!
  2687. case '\u000C':
  2688. case '<':
  2689. case '&':
  2690. emitOrAppendStrBuf(returnState);
  2691. if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  2692. cstart = pos;
  2693. }
  2694. reconsume = true;
  2695. state = transition(state, returnState, reconsume, pos);
  2696. continue stateloop;
  2697. case '#':
  2698. /*
  2699. * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
  2700. * SIGN.
  2701. */
  2702. appendStrBuf('#');
  2703. state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
  2704. continue stateloop;
  2705. default:
  2706. if (c == additional) {
  2707. emitOrAppendStrBuf(returnState);
  2708. reconsume = true;
  2709. state = transition(state, returnState, reconsume, pos);
  2710. continue stateloop;
  2711. }
  2712. if (c >= 'a' && c <= 'z') {
  2713. firstCharKey = c - 'a' + 26;
  2714. } else if (c >= 'A' && c <= 'Z') {
  2715. firstCharKey = c - 'A';
  2716. } else {
  2717. // No match
  2718. /*
  2719. * If no match can be made, then this is a parse
  2720. * error.
  2721. */
  2722. errNoNamedCharacterMatch();
  2723. emitOrAppendStrBuf(returnState);
  2724. if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  2725. cstart = pos;
  2726. }
  2727. reconsume = true;
  2728. state = transition(state, returnState, reconsume, pos);
  2729. continue stateloop;
  2730. }
  2731. // Didn't fail yet
  2732. appendStrBuf(c);
  2733. state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
  2734. // FALL THROUGH continue stateloop;
  2735. }
  2736. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  2737. case CHARACTER_REFERENCE_HILO_LOOKUP:
  2738. {
  2739. if (++pos == endPos) {
  2740. break stateloop;
  2741. }
  2742. c = checkChar(buf, pos);
  2743. if (c == '\u0000') {
  2744. break stateloop;
  2745. }
  2746. /*
  2747. * The data structure is as follows:
  2748. *
  2749. * HILO_ACCEL is a two-dimensional int array whose major
  2750. * index corresponds to the second character of the
  2751. * character reference (code point as index) and the
  2752. * minor index corresponds to the first character of the
  2753. * character reference (packed so that A-Z runs from 0
  2754. * to 25 and a-z runs from 26 to 51). This layout makes
  2755. * it easier to use the sparseness of the data structure
  2756. * to omit parts of it: The second dimension of the
  2757. * table is null when no character reference starts with
  2758. * the character corresponding to that row.
  2759. *
  2760. * The int value HILO_ACCEL (by these indices) is zero
  2761. * if there exists no character reference starting with
  2762. * that two-letter prefix. Otherwise, the value is an
  2763. * int that packs two shorts so that the higher short is
  2764. * the index of the highest character reference name
  2765. * with that prefix in NAMES and the lower short
  2766. * corresponds to the index of the lowest character
  2767. * reference name with that prefix. (It happens that the
  2768. * first two character reference names share their
  2769. * prefix so the packed int cannot be 0 by packing the
  2770. * two shorts.)
  2771. *
  2772. * NAMES is an array of byte arrays where each byte
  2773. * array encodes the name of a character reference as
  2774. * ASCII. The names omit the first two letters of the
  2775. * name. (Since storing the first two letters would be
  2776. * redundant with the data contained in HILO_ACCEL.) The
  2777. * entries are lexically sorted.
  2778. *
  2779. * For a given index in NAMES, the same index in VALUES
  2780. * contains the corresponding expansion as an array of
  2781. * two UTF-16 code units (either the character and
  2782. * U+0000 or a surrogate pair).
  2783. */
  2784. int hilo = 0;
  2785. if (c <= 'z') {
  2786. @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
  2787. if (row != null) {
  2788. hilo = row[firstCharKey];
  2789. }
  2790. }
  2791. if (hilo == 0) {
  2792. /*
  2793. * If no match can be made, then this is a parse
  2794. * error.
  2795. */
  2796. errNoNamedCharacterMatch();
  2797. emitOrAppendStrBuf(returnState);
  2798. if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  2799. cstart = pos;
  2800. }
  2801. reconsume = true;
  2802. state = transition(state, returnState, reconsume, pos);
  2803. continue stateloop;
  2804. }
  2805. // Didn't fail yet
  2806. appendStrBuf(c);
  2807. lo = hilo & 0xFFFF;
  2808. hi = hilo >> 16;
  2809. entCol = -1;
  2810. candidate = -1;
  2811. strBufMark = 0;
  2812. state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
  2813. // FALL THROUGH continue stateloop;
  2814. }
  2815. case CHARACTER_REFERENCE_TAIL:
  2816. outer: for (;;) {
  2817. if (++pos == endPos) {
  2818. break stateloop;
  2819. }
  2820. c = checkChar(buf, pos);
  2821. if (c == '\u0000') {
  2822. break stateloop;
  2823. }
  2824. entCol++;
  2825. /*
  2826. * Consume the maximum number of characters possible,
  2827. * with the consumed characters matching one of the
  2828. * identifiers in the first column of the named
  2829. * character references table (in a case-sensitive
  2830. * manner).
  2831. */
  2832. loloop: for (;;) {
  2833. if (hi < lo) {
  2834. break outer;
  2835. }
  2836. if (entCol == NamedCharacters.NAMES[lo].length()) {
  2837. candidate = lo;
  2838. strBufMark = strBufLen;
  2839. lo++;
  2840. } else if (entCol > NamedCharacters.NAMES[lo].length()) {
  2841. break outer;
  2842. } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
  2843. lo++;
  2844. } else {
  2845. break loloop;
  2846. }
  2847. }
  2848. hiloop: for (;;) {
  2849. if (hi < lo) {
  2850. break outer;
  2851. }
  2852. if (entCol == NamedCharacters.NAMES[hi].length()) {
  2853. break hiloop;
  2854. }
  2855. if (entCol > NamedCharacters.NAMES[hi].length()) {
  2856. break outer;
  2857. } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
  2858. hi--;
  2859. } else {
  2860. break hiloop;
  2861. }
  2862. }
  2863. if (c == ';') {
  2864. // If we see a semicolon, there cannot be a
  2865. // longer match. Break the loop. However, before
  2866. // breaking, take the longest match so far as the
  2867. // candidate, if we are just about to complete a
  2868. // match.
  2869. if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
  2870. candidate = lo;
  2871. strBufMark = strBufLen;
  2872. }
  2873. break outer;
  2874. }
  2875. if (hi < lo) {
  2876. break outer;
  2877. }
  2878. appendStrBuf(c);
  2879. continue;
  2880. }
  2881. if (candidate == -1) {
  2882. // reconsume deals with CR, LF or nul
  2883. /*
  2884. * If no match can be made, then this is a parse error.
  2885. */
  2886. errNoNamedCharacterMatch();
  2887. emitOrAppendStrBuf(returnState);
  2888. if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  2889. cstart = pos;
  2890. }
  2891. reconsume = true;
  2892. state = transition(state, returnState, reconsume, pos);
  2893. continue stateloop;
  2894. } else {
  2895. // c can't be CR, LF or nul if we got here
  2896. @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
  2897. if (candidateName.length() == 0
  2898. || candidateName.charAt(candidateName.length() - 1) != ';') {
  2899. /*
  2900. * If the last character matched is not a U+003B
  2901. * SEMICOLON (;), there is a parse error.
  2902. */
  2903. if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  2904. /*
  2905. * If the entity is being consumed as part of an
  2906. * attribute, and the last character matched is
  2907. * not a U+003B SEMICOLON (;),
  2908. */
  2909. char ch;
  2910. if (strBufMark == strBufLen) {
  2911. ch = c;
  2912. } else {
  2913. // if (strBufOffset != -1) {
  2914. // ch = buf[strBufOffset + strBufMark];
  2915. // } else {
  2916. ch = strBuf[strBufMark];
  2917. // }
  2918. }
  2919. if (ch == '=' || (ch >= '0' && ch <= '9')
  2920. || (ch >= 'A' && ch <= 'Z')
  2921. || (ch >= 'a' && ch <= 'z')) {
  2922. /*
  2923. * and the next character is either a U+003D
  2924. * EQUALS SIGN character (=) or in the range
  2925. * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
  2926. * U+0041 LATIN CAPITAL LETTER A to U+005A
  2927. * LATIN CAPITAL LETTER Z, or U+0061 LATIN
  2928. * SMALL LETTER A to U+007A LATIN SMALL
  2929. * LETTER Z, then, for historical reasons,
  2930. * all the characters that were matched
  2931. * after the U+0026 AMPERSAND (&) must be
  2932. * unconsumed, and nothing is returned.
  2933. */
  2934. errNoNamedCharacterMatch();
  2935. appendStrBufToLongStrBuf();
  2936. reconsume = true;
  2937. state = transition(state, returnState, reconsume, pos);
  2938. continue stateloop;
  2939. }
  2940. }
  2941. if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  2942. errUnescapedAmpersandInterpretedAsCharacterReference();
  2943. } else {
  2944. errNotSemicolonTerminated();
  2945. }
  2946. }
  2947. /*
  2948. * Otherwise, return a character token for the character
  2949. * corresponding to the entity name (as given by the
  2950. * second column of the named character references
  2951. * table).
  2952. */
  2953. // CPPONLY: completedNamedCharacterReference();
  2954. @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
  2955. if (
  2956. // [NOCPP[
  2957. val.length == 1
  2958. // ]NOCPP]
  2959. // CPPONLY: val[1] == 0
  2960. ) {
  2961. emitOrAppendOne(val, returnState);
  2962. } else {
  2963. emitOrAppendTwo(val, returnState);
  2964. }
  2965. // this is so complicated!
  2966. if (strBufMark < strBufLen) {
  2967. if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  2968. for (int i = strBufMark; i < strBufLen; i++) {
  2969. appendLongStrBuf(strBuf[i]);
  2970. }
  2971. } else {
  2972. tokenHandler.characters(strBuf, strBufMark,
  2973. strBufLen - strBufMark);
  2974. }
  2975. }
  2976. // Check if we broke out early with c being the last
  2977. // character that matched as opposed to being the
  2978. // first one that didn't match. In the case of an
  2979. // early break, the next run on text should start
  2980. // *after* the current character and the current
  2981. // character shouldn't be reconsumed.
  2982. boolean earlyBreak = (c == ';' && strBufMark == strBufLen);
  2983. if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  2984. cstart = earlyBreak ? pos + 1 : pos;
  2985. }
  2986. reconsume = !earlyBreak;
  2987. state = transition(state, returnState, reconsume, pos);
  2988. continue stateloop;
  2989. /*
  2990. * If the markup contains I'm &notit; I tell you, the
  2991. * entity is parsed as "not", as in, I'm ¬it; I tell
  2992. * you. But if the markup was I'm &notin; I tell you,
  2993. * the entity would be parsed as "notin;", resulting in
  2994. * I'm ∉ I tell you.
  2995. */
  2996. }
  2997. // XXX reorder point
  2998. case CONSUME_NCR:
  2999. if (++pos == endPos) {
  3000. break stateloop;
  3001. }
  3002. c = checkChar(buf, pos);
  3003. prevValue = -1;
  3004. value = 0;
  3005. seenDigits = false;
  3006. /*
  3007. * The behavior further depends on the character after the
  3008. * U+0023 NUMBER SIGN:
  3009. */
  3010. switch (c) {
  3011. case 'x':
  3012. case 'X':
  3013. /*
  3014. * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
  3015. * LETTER X Consume the X.
  3016. *
  3017. * Follow the steps below, but using the range of
  3018. * characters U+0030 DIGIT ZERO through to U+0039
  3019. * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
  3020. * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
  3021. * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
  3022. * LETTER F (in other words, 0-9, A-F, a-f).
  3023. *
  3024. * When it comes to interpreting the number,
  3025. * interpret it as a hexadecimal number.
  3026. */
  3027. appendStrBuf(c);
  3028. state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
  3029. continue stateloop;
  3030. default:
  3031. /*
  3032. * Anything else Follow the steps below, but using
  3033. * the range of characters U+0030 DIGIT ZERO through
  3034. * to U+0039 DIGIT NINE (i.e. just 0-9).
  3035. *
  3036. * When it comes to interpreting the number,
  3037. * interpret it as a decimal number.
  3038. */
  3039. reconsume = true;
  3040. state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
  3041. // FALL THROUGH continue stateloop;
  3042. }
  3043. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3044. case DECIMAL_NRC_LOOP:
  3045. decimalloop: for (;;) {
  3046. if (reconsume) {
  3047. reconsume = false;
  3048. } else {
  3049. if (++pos == endPos) {
  3050. break stateloop;
  3051. }
  3052. c = checkChar(buf, pos);
  3053. }
  3054. // Deal with overflow gracefully
  3055. if (value < prevValue) {
  3056. value = 0x110000; // Value above Unicode range but
  3057. // within int
  3058. // range
  3059. }
  3060. prevValue = value;
  3061. /*
  3062. * Consume as many characters as match the range of
  3063. * characters given above.
  3064. */
  3065. if (c >= '0' && c <= '9') {
  3066. seenDigits = true;
  3067. value *= 10;
  3068. value += c - '0';
  3069. continue;
  3070. } else if (c == ';') {
  3071. if (seenDigits) {
  3072. if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3073. cstart = pos + 1;
  3074. }
  3075. state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
  3076. // FALL THROUGH continue stateloop;
  3077. break decimalloop;
  3078. } else {
  3079. errNoDigitsInNCR();
  3080. appendStrBuf(';');
  3081. emitOrAppendStrBuf(returnState);
  3082. if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3083. cstart = pos + 1;
  3084. }
  3085. state = transition(state, returnState, reconsume, pos);
  3086. continue stateloop;
  3087. }
  3088. } else {
  3089. /*
  3090. * If no characters match the range, then don't
  3091. * consume any characters (and unconsume the U+0023
  3092. * NUMBER SIGN character and, if appropriate, the X
  3093. * character). This is a parse error; nothing is
  3094. * returned.
  3095. *
  3096. * Otherwise, if the next character is a U+003B
  3097. * SEMICOLON, consume that too. If it isn't, there
  3098. * is a parse error.
  3099. */
  3100. if (!seenDigits) {
  3101. errNoDigitsInNCR();
  3102. emitOrAppendStrBuf(returnState);
  3103. if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3104. cstart = pos;
  3105. }
  3106. reconsume = true;
  3107. state = transition(state, returnState, reconsume, pos);
  3108. continue stateloop;
  3109. } else {
  3110. errCharRefLacksSemicolon();
  3111. if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3112. cstart = pos;
  3113. }
  3114. reconsume = true;
  3115. state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
  3116. // FALL THROUGH continue stateloop;
  3117. break decimalloop;
  3118. }
  3119. }
  3120. }
  3121. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3122. case HANDLE_NCR_VALUE:
  3123. // WARNING previous state sets reconsume
  3124. // XXX inline this case if the method size can take it
  3125. handleNcrValue(returnState);
  3126. state = transition(state, returnState, reconsume, pos);
  3127. continue stateloop;
  3128. // XXX reorder point
  3129. case HEX_NCR_LOOP:
  3130. for (;;) {
  3131. if (++pos == endPos) {
  3132. break stateloop;
  3133. }
  3134. c = checkChar(buf, pos);
  3135. // Deal with overflow gracefully
  3136. if (value < prevValue) {
  3137. value = 0x110000; // Value above Unicode range but
  3138. // within int
  3139. // range
  3140. }
  3141. prevValue = value;
  3142. /*
  3143. * Consume as many characters as match the range of
  3144. * characters given above.
  3145. */
  3146. if (c >= '0' && c <= '9') {
  3147. seenDigits = true;
  3148. value *= 16;
  3149. value += c - '0';
  3150. continue;
  3151. } else if (c >= 'A' && c <= 'F') {
  3152. seenDigits = true;
  3153. value *= 16;
  3154. value += c - 'A' + 10;
  3155. continue;
  3156. } else if (c >= 'a' && c <= 'f') {
  3157. seenDigits = true;
  3158. value *= 16;
  3159. value += c - 'a' + 10;
  3160. continue;
  3161. } else if (c == ';') {
  3162. if (seenDigits) {
  3163. if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3164. cstart = pos + 1;
  3165. }
  3166. state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
  3167. continue stateloop;
  3168. } else {
  3169. errNoDigitsInNCR();
  3170. appendStrBuf(';');
  3171. emitOrAppendStrBuf(returnState);
  3172. if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3173. cstart = pos + 1;
  3174. }
  3175. state = transition(state, returnState, reconsume, pos);
  3176. continue stateloop;
  3177. }
  3178. } else {
  3179. /*
  3180. * If no characters match the range, then don't
  3181. * consume any characters (and unconsume the U+0023
  3182. * NUMBER SIGN character and, if appropriate, the X
  3183. * character). This is a parse error; nothing is
  3184. * returned.
  3185. *
  3186. * Otherwise, if the next character is a U+003B
  3187. * SEMICOLON, consume that too. If it isn't, there
  3188. * is a parse error.
  3189. */
  3190. if (!seenDigits) {
  3191. errNoDigitsInNCR();
  3192. emitOrAppendStrBuf(returnState);
  3193. if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3194. cstart = pos;
  3195. }
  3196. reconsume = true;
  3197. state = transition(state, returnState, reconsume, pos);
  3198. continue stateloop;
  3199. } else {
  3200. errCharRefLacksSemicolon();
  3201. if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
  3202. cstart = pos;
  3203. }
  3204. reconsume = true;
  3205. state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
  3206. continue stateloop;
  3207. }
  3208. }
  3209. }
  3210. // XXX reorder point
  3211. case PLAINTEXT:
  3212. plaintextloop: for (;;) {
  3213. if (reconsume) {
  3214. reconsume = false;
  3215. } else {
  3216. if (++pos == endPos) {
  3217. break stateloop;
  3218. }
  3219. c = checkChar(buf, pos);
  3220. }
  3221. switch (c) {
  3222. case '\u0000':
  3223. emitPlaintextReplacementCharacter(buf, pos);
  3224. continue;
  3225. case '\r':
  3226. emitCarriageReturn(buf, pos);
  3227. break stateloop;
  3228. case '\n':
  3229. silentLineFeed();
  3230. default:
  3231. /*
  3232. * Anything else Emit the current input
  3233. * character as a character token. Stay in the
  3234. * PLAINTEXT state.
  3235. */
  3236. continue;
  3237. }
  3238. }
  3239. // XXX reorder point
  3240. case CLOSE_TAG_OPEN:
  3241. if (++pos == endPos) {
  3242. break stateloop;
  3243. }
  3244. c = checkChar(buf, pos);
  3245. /*
  3246. * Otherwise, if the content model flag is set to the PCDATA
  3247. * state, or if the next few characters do match that tag
  3248. * name, consume the next input character:
  3249. */
  3250. switch (c) {
  3251. case '>':
  3252. /* U+003E GREATER-THAN SIGN (>) Parse error. */
  3253. errLtSlashGt();
  3254. /*
  3255. * Switch to the data state.
  3256. */
  3257. cstart = pos + 1;
  3258. state = transition(state, Tokenizer.DATA, reconsume, pos);
  3259. continue stateloop;
  3260. case '\r':
  3261. silentCarriageReturn();
  3262. /* Anything else Parse error. */
  3263. errGarbageAfterLtSlash();
  3264. /*
  3265. * Switch to the bogus comment state.
  3266. */
  3267. clearLongStrBufAndAppend('\n');
  3268. state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  3269. break stateloop;
  3270. case '\n':
  3271. silentLineFeed();
  3272. /* Anything else Parse error. */
  3273. errGarbageAfterLtSlash();
  3274. /*
  3275. * Switch to the bogus comment state.
  3276. */
  3277. clearLongStrBufAndAppend('\n');
  3278. state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  3279. continue stateloop;
  3280. case '\u0000':
  3281. c = '\uFFFD';
  3282. // fall thru
  3283. default:
  3284. if (c >= 'A' && c <= 'Z') {
  3285. c += 0x20;
  3286. }
  3287. if (c >= 'a' && c <= 'z') {
  3288. /*
  3289. * U+0061 LATIN SMALL LETTER A through to U+007A
  3290. * LATIN SMALL LETTER Z Create a new end tag
  3291. * token,
  3292. */
  3293. endTag = true;
  3294. /*
  3295. * set its tag name to the input character,
  3296. */
  3297. clearStrBufAndAppend(c);
  3298. /*
  3299. * then switch to the tag name state. (Don't
  3300. * emit the token yet; further details will be
  3301. * filled in before it is emitted.)
  3302. */
  3303. state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
  3304. continue stateloop;
  3305. } else {
  3306. /* Anything else Parse error. */
  3307. errGarbageAfterLtSlash();
  3308. /*
  3309. * Switch to the bogus comment state.
  3310. */
  3311. clearLongStrBufAndAppend(c);
  3312. state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  3313. continue stateloop;
  3314. }
  3315. }
  3316. // XXX reorder point
  3317. case RCDATA:
  3318. rcdataloop: for (;;) {
  3319. if (reconsume) {
  3320. reconsume = false;
  3321. } else {
  3322. if (++pos == endPos) {
  3323. break stateloop;
  3324. }
  3325. c = checkChar(buf, pos);
  3326. }
  3327. switch (c) {
  3328. case '&':
  3329. /*
  3330. * U+0026 AMPERSAND (&) Switch to the character
  3331. * reference in RCDATA state.
  3332. */
  3333. flushChars(buf, pos);
  3334. clearStrBufAndAppend(c);
  3335. additional = '\u0000';
  3336. returnState = state;
  3337. state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
  3338. continue stateloop;
  3339. case '<':
  3340. /*
  3341. * U+003C LESS-THAN SIGN (<) Switch to the
  3342. * RCDATA less-than sign state.
  3343. */
  3344. flushChars(buf, pos);
  3345. returnState = state;
  3346. state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
  3347. continue stateloop;
  3348. case '\u0000':
  3349. emitReplacementCharacter(buf, pos);
  3350. continue;
  3351. case '\r':
  3352. emitCarriageReturn(buf, pos);
  3353. break stateloop;
  3354. case '\n':
  3355. silentLineFeed();
  3356. default:
  3357. /*
  3358. * Emit the current input character as a
  3359. * character token. Stay in the RCDATA state.
  3360. */
  3361. continue;
  3362. }
  3363. }
  3364. // XXX reorder point
  3365. case RAWTEXT:
  3366. rawtextloop: for (;;) {
  3367. if (reconsume) {
  3368. reconsume = false;
  3369. } else {
  3370. if (++pos == endPos) {
  3371. break stateloop;
  3372. }
  3373. c = checkChar(buf, pos);
  3374. }
  3375. switch (c) {
  3376. case '<':
  3377. /*
  3378. * U+003C LESS-THAN SIGN (<) Switch to the
  3379. * RAWTEXT less-than sign state.
  3380. */
  3381. flushChars(buf, pos);
  3382. returnState = state;
  3383. state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
  3384. break rawtextloop;
  3385. // FALL THRU continue stateloop;
  3386. case '\u0000':
  3387. emitReplacementCharacter(buf, pos);
  3388. continue;
  3389. case '\r':
  3390. emitCarriageReturn(buf, pos);
  3391. break stateloop;
  3392. case '\n':
  3393. silentLineFeed();
  3394. default:
  3395. /*
  3396. * Emit the current input character as a
  3397. * character token. Stay in the RAWTEXT state.
  3398. */
  3399. continue;
  3400. }
  3401. }
  3402. // XXX fallthru don't reorder
  3403. case RAWTEXT_RCDATA_LESS_THAN_SIGN:
  3404. rawtextrcdatalessthansignloop: for (;;) {
  3405. if (++pos == endPos) {
  3406. break stateloop;
  3407. }
  3408. c = checkChar(buf, pos);
  3409. switch (c) {
  3410. case '/':
  3411. /*
  3412. * U+002F SOLIDUS (/) Set the temporary buffer
  3413. * to the empty string. Switch to the RCDATA or
  3414. * RAWTEXT end tag open state.
  3415. */
  3416. index = 0;
  3417. clearStrBuf();
  3418. state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
  3419. break rawtextrcdatalessthansignloop;
  3420. // FALL THRU continue stateloop;
  3421. default:
  3422. /*
  3423. * Otherwise, emit a U+003C LESS-THAN SIGN
  3424. * character token
  3425. */
  3426. tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  3427. /*
  3428. * and reconsume the current input character in
  3429. * the state we came from (returnState).
  3430. */
  3431. cstart = pos;
  3432. reconsume = true;
  3433. state = transition(state, returnState, reconsume, pos);
  3434. continue stateloop;
  3435. }
  3436. }
  3437. // XXX fall thru. don't reorder.
  3438. case NON_DATA_END_TAG_NAME:
  3439. for (;;) {
  3440. if (++pos == endPos) {
  3441. break stateloop;
  3442. }
  3443. c = checkChar(buf, pos);
  3444. /*
  3445. * ASSERT! when entering this state, set index to 0 and
  3446. * call clearStrBuf() assert (contentModelElement !=
  3447. * null); Let's implement the above without lookahead.
  3448. * strBuf is the 'temporary buffer'.
  3449. */
  3450. if (index < endTagExpectationAsArray.length) {
  3451. char e = endTagExpectationAsArray[index];
  3452. char folded = c;
  3453. if (c >= 'A' && c <= 'Z') {
  3454. folded += 0x20;
  3455. }
  3456. if (folded != e) {
  3457. // [NOCPP[
  3458. errHtml4LtSlashInRcdata(folded);
  3459. // ]NOCPP]
  3460. tokenHandler.characters(Tokenizer.LT_SOLIDUS,
  3461. 0, 2);
  3462. emitStrBuf();
  3463. cstart = pos;
  3464. reconsume = true;
  3465. state = transition(state, returnState, reconsume, pos);
  3466. continue stateloop;
  3467. }
  3468. appendStrBuf(c);
  3469. index++;
  3470. continue;
  3471. } else {
  3472. endTag = true;
  3473. // XXX replace contentModelElement with different
  3474. // type
  3475. tagName = endTagExpectation;
  3476. switch (c) {
  3477. case '\r':
  3478. silentCarriageReturn();
  3479. state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  3480. break stateloop;
  3481. case '\n':
  3482. silentLineFeed();
  3483. // fall thru
  3484. case ' ':
  3485. case '\t':
  3486. case '\u000C':
  3487. /*
  3488. * U+0009 CHARACTER TABULATION U+000A LINE
  3489. * FEED (LF) U+000C FORM FEED (FF) U+0020
  3490. * SPACE If the current end tag token is an
  3491. * appropriate end tag token, then switch to
  3492. * the before attribute name state.
  3493. */
  3494. state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
  3495. continue stateloop;
  3496. case '/':
  3497. /*
  3498. * U+002F SOLIDUS (/) If the current end tag
  3499. * token is an appropriate end tag token,
  3500. * then switch to the self-closing start tag
  3501. * state.
  3502. */
  3503. state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
  3504. continue stateloop;
  3505. case '>':
  3506. /*
  3507. * U+003E GREATER-THAN SIGN (>) If the
  3508. * current end tag token is an appropriate
  3509. * end tag token, then emit the current tag
  3510. * token and switch to the data state.
  3511. */
  3512. state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
  3513. if (shouldSuspend) {
  3514. break stateloop;
  3515. }
  3516. continue stateloop;
  3517. default:
  3518. /*
  3519. * Emit a U+003C LESS-THAN SIGN character
  3520. * token, a U+002F SOLIDUS character token,
  3521. * a character token for each of the
  3522. * characters in the temporary buffer (in
  3523. * the order they were added to the buffer),
  3524. * and reconsume the current input character
  3525. * in the RAWTEXT state.
  3526. */
  3527. // [NOCPP[
  3528. errWarnLtSlashInRcdata();
  3529. // ]NOCPP]
  3530. tokenHandler.characters(
  3531. Tokenizer.LT_SOLIDUS, 0, 2);
  3532. emitStrBuf();
  3533. if (c == '\u0000') {
  3534. emitReplacementCharacter(buf, pos);
  3535. } else {
  3536. cstart = pos; // don't drop the
  3537. // character
  3538. }
  3539. state = transition(state, returnState, reconsume, pos);
  3540. continue stateloop;
  3541. }
  3542. }
  3543. }
  3544. // XXX reorder point
  3545. // BEGIN HOTSPOT WORKAROUND
  3546. case BOGUS_COMMENT:
  3547. boguscommentloop: for (;;) {
  3548. if (reconsume) {
  3549. reconsume = false;
  3550. } else {
  3551. if (++pos == endPos) {
  3552. break stateloop;
  3553. }
  3554. c = checkChar(buf, pos);
  3555. }
  3556. /*
  3557. * Consume every character up to and including the first
  3558. * U+003E GREATER-THAN SIGN character (>) or the end of
  3559. * the file (EOF), whichever comes first. Emit a comment
  3560. * token whose data is the concatenation of all the
  3561. * characters starting from and including the character
  3562. * that caused the state machine to switch into the
  3563. * bogus comment state, up to and including the
  3564. * character immediately before the last consumed
  3565. * character (i.e. up to the character just before the
  3566. * U+003E or EOF character). (If the comment was started
  3567. * by the end of the file (EOF), the token is empty.)
  3568. *
  3569. * Switch to the data state.
  3570. *
  3571. * If the end of the file was reached, reconsume the EOF
  3572. * character.
  3573. */
  3574. switch (c) {
  3575. case '>':
  3576. emitComment(0, pos);
  3577. state = transition(state, Tokenizer.DATA, reconsume, pos);
  3578. continue stateloop;
  3579. case '-':
  3580. appendLongStrBuf(c);
  3581. state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
  3582. break boguscommentloop;
  3583. case '\r':
  3584. appendLongStrBufCarriageReturn();
  3585. break stateloop;
  3586. case '\n':
  3587. appendLongStrBufLineFeed();
  3588. continue;
  3589. case '\u0000':
  3590. c = '\uFFFD';
  3591. // fall thru
  3592. default:
  3593. appendLongStrBuf(c);
  3594. continue;
  3595. }
  3596. }
  3597. // FALLTHRU DON'T REORDER
  3598. case BOGUS_COMMENT_HYPHEN:
  3599. boguscommenthyphenloop: for (;;) {
  3600. if (++pos == endPos) {
  3601. break stateloop;
  3602. }
  3603. c = checkChar(buf, pos);
  3604. switch (c) {
  3605. case '>':
  3606. // [NOCPP[
  3607. maybeAppendSpaceToBogusComment();
  3608. // ]NOCPP]
  3609. emitComment(0, pos);
  3610. state = transition(state, Tokenizer.DATA, reconsume, pos);
  3611. continue stateloop;
  3612. case '-':
  3613. appendSecondHyphenToBogusComment();
  3614. continue boguscommenthyphenloop;
  3615. case '\r':
  3616. appendLongStrBufCarriageReturn();
  3617. state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  3618. break stateloop;
  3619. case '\n':
  3620. appendLongStrBufLineFeed();
  3621. state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  3622. continue stateloop;
  3623. case '\u0000':
  3624. c = '\uFFFD';
  3625. // fall thru
  3626. default:
  3627. appendLongStrBuf(c);
  3628. state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  3629. continue stateloop;
  3630. }
  3631. }
  3632. // XXX reorder point
  3633. case SCRIPT_DATA:
  3634. scriptdataloop: for (;;) {
  3635. if (reconsume) {
  3636. reconsume = false;
  3637. } else {
  3638. if (++pos == endPos) {
  3639. break stateloop;
  3640. }
  3641. c = checkChar(buf, pos);
  3642. }
  3643. switch (c) {
  3644. case '<':
  3645. /*
  3646. * U+003C LESS-THAN SIGN (<) Switch to the
  3647. * script data less-than sign state.
  3648. */
  3649. flushChars(buf, pos);
  3650. returnState = state;
  3651. state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
  3652. break scriptdataloop; // FALL THRU continue
  3653. // stateloop;
  3654. case '\u0000':
  3655. emitReplacementCharacter(buf, pos);
  3656. continue;
  3657. case '\r':
  3658. emitCarriageReturn(buf, pos);
  3659. break stateloop;
  3660. case '\n':
  3661. silentLineFeed();
  3662. default:
  3663. /*
  3664. * Anything else Emit the current input
  3665. * character as a character token. Stay in the
  3666. * script data state.
  3667. */
  3668. continue;
  3669. }
  3670. }
  3671. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3672. case SCRIPT_DATA_LESS_THAN_SIGN:
  3673. scriptdatalessthansignloop: for (;;) {
  3674. if (++pos == endPos) {
  3675. break stateloop;
  3676. }
  3677. c = checkChar(buf, pos);
  3678. switch (c) {
  3679. case '/':
  3680. /*
  3681. * U+002F SOLIDUS (/) Set the temporary buffer
  3682. * to the empty string. Switch to the script
  3683. * data end tag open state.
  3684. */
  3685. index = 0;
  3686. clearStrBuf();
  3687. state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
  3688. continue stateloop;
  3689. case '!':
  3690. tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  3691. cstart = pos;
  3692. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
  3693. break scriptdatalessthansignloop; // FALL THRU
  3694. // continue
  3695. // stateloop;
  3696. default:
  3697. /*
  3698. * Otherwise, emit a U+003C LESS-THAN SIGN
  3699. * character token
  3700. */
  3701. tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  3702. /*
  3703. * and reconsume the current input character in
  3704. * the script data state.
  3705. */
  3706. cstart = pos;
  3707. reconsume = true;
  3708. state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
  3709. continue stateloop;
  3710. }
  3711. }
  3712. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3713. case SCRIPT_DATA_ESCAPE_START:
  3714. scriptdataescapestartloop: for (;;) {
  3715. if (++pos == endPos) {
  3716. break stateloop;
  3717. }
  3718. c = checkChar(buf, pos);
  3719. /*
  3720. * Consume the next input character:
  3721. */
  3722. switch (c) {
  3723. case '-':
  3724. /*
  3725. * U+002D HYPHEN-MINUS (-) Emit a U+002D
  3726. * HYPHEN-MINUS character token. Switch to the
  3727. * script data escape start dash state.
  3728. */
  3729. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);
  3730. break scriptdataescapestartloop; // FALL THRU
  3731. // continue
  3732. // stateloop;
  3733. default:
  3734. /*
  3735. * Anything else Reconsume the current input
  3736. * character in the script data state.
  3737. */
  3738. reconsume = true;
  3739. state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
  3740. continue stateloop;
  3741. }
  3742. }
  3743. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3744. case SCRIPT_DATA_ESCAPE_START_DASH:
  3745. scriptdataescapestartdashloop: for (;;) {
  3746. if (++pos == endPos) {
  3747. break stateloop;
  3748. }
  3749. c = checkChar(buf, pos);
  3750. /*
  3751. * Consume the next input character:
  3752. */
  3753. switch (c) {
  3754. case '-':
  3755. /*
  3756. * U+002D HYPHEN-MINUS (-) Emit a U+002D
  3757. * HYPHEN-MINUS character token. Switch to the
  3758. * script data escaped dash dash state.
  3759. */
  3760. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
  3761. break scriptdataescapestartdashloop;
  3762. // continue stateloop;
  3763. default:
  3764. /*
  3765. * Anything else Reconsume the current input
  3766. * character in the script data state.
  3767. */
  3768. reconsume = true;
  3769. state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
  3770. continue stateloop;
  3771. }
  3772. }
  3773. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3774. case SCRIPT_DATA_ESCAPED_DASH_DASH:
  3775. scriptdataescapeddashdashloop: for (;;) {
  3776. if (++pos == endPos) {
  3777. break stateloop;
  3778. }
  3779. c = checkChar(buf, pos);
  3780. /*
  3781. * Consume the next input character:
  3782. */
  3783. switch (c) {
  3784. case '-':
  3785. /*
  3786. * U+002D HYPHEN-MINUS (-) Emit a U+002D
  3787. * HYPHEN-MINUS character token. Stay in the
  3788. * script data escaped dash dash state.
  3789. */
  3790. continue;
  3791. case '<':
  3792. /*
  3793. * U+003C LESS-THAN SIGN (<) Switch to the
  3794. * script data escaped less-than sign state.
  3795. */
  3796. flushChars(buf, pos);
  3797. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
  3798. continue stateloop;
  3799. case '>':
  3800. /*
  3801. * U+003E GREATER-THAN SIGN (>) Emit a U+003E
  3802. * GREATER-THAN SIGN character token. Switch to
  3803. * the script data state.
  3804. */
  3805. state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
  3806. continue stateloop;
  3807. case '\u0000':
  3808. emitReplacementCharacter(buf, pos);
  3809. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  3810. break scriptdataescapeddashdashloop;
  3811. case '\r':
  3812. emitCarriageReturn(buf, pos);
  3813. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  3814. break stateloop;
  3815. case '\n':
  3816. silentLineFeed();
  3817. default:
  3818. /*
  3819. * Anything else Emit the current input
  3820. * character as a character token. Switch to the
  3821. * script data escaped state.
  3822. */
  3823. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  3824. break scriptdataescapeddashdashloop;
  3825. // continue stateloop;
  3826. }
  3827. }
  3828. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3829. case SCRIPT_DATA_ESCAPED:
  3830. scriptdataescapedloop: for (;;) {
  3831. if (reconsume) {
  3832. reconsume = false;
  3833. } else {
  3834. if (++pos == endPos) {
  3835. break stateloop;
  3836. }
  3837. c = checkChar(buf, pos);
  3838. }
  3839. /*
  3840. * Consume the next input character:
  3841. */
  3842. switch (c) {
  3843. case '-':
  3844. /*
  3845. * U+002D HYPHEN-MINUS (-) Emit a U+002D
  3846. * HYPHEN-MINUS character token. Switch to the
  3847. * script data escaped dash state.
  3848. */
  3849. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
  3850. break scriptdataescapedloop; // FALL THRU
  3851. // continue
  3852. // stateloop;
  3853. case '<':
  3854. /*
  3855. * U+003C LESS-THAN SIGN (<) Switch to the
  3856. * script data escaped less-than sign state.
  3857. */
  3858. flushChars(buf, pos);
  3859. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
  3860. continue stateloop;
  3861. case '\u0000':
  3862. emitReplacementCharacter(buf, pos);
  3863. continue;
  3864. case '\r':
  3865. emitCarriageReturn(buf, pos);
  3866. break stateloop;
  3867. case '\n':
  3868. silentLineFeed();
  3869. default:
  3870. /*
  3871. * Anything else Emit the current input
  3872. * character as a character token. Stay in the
  3873. * script data escaped state.
  3874. */
  3875. continue;
  3876. }
  3877. }
  3878. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3879. case SCRIPT_DATA_ESCAPED_DASH:
  3880. scriptdataescapeddashloop: for (;;) {
  3881. if (++pos == endPos) {
  3882. break stateloop;
  3883. }
  3884. c = checkChar(buf, pos);
  3885. /*
  3886. * Consume the next input character:
  3887. */
  3888. switch (c) {
  3889. case '-':
  3890. /*
  3891. * U+002D HYPHEN-MINUS (-) Emit a U+002D
  3892. * HYPHEN-MINUS character token. Switch to the
  3893. * script data escaped dash dash state.
  3894. */
  3895. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
  3896. continue stateloop;
  3897. case '<':
  3898. /*
  3899. * U+003C LESS-THAN SIGN (<) Switch to the
  3900. * script data escaped less-than sign state.
  3901. */
  3902. flushChars(buf, pos);
  3903. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
  3904. break scriptdataescapeddashloop;
  3905. // continue stateloop;
  3906. case '\u0000':
  3907. emitReplacementCharacter(buf, pos);
  3908. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  3909. continue stateloop;
  3910. case '\r':
  3911. emitCarriageReturn(buf, pos);
  3912. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  3913. break stateloop;
  3914. case '\n':
  3915. silentLineFeed();
  3916. default:
  3917. /*
  3918. * Anything else Emit the current input
  3919. * character as a character token. Switch to the
  3920. * script data escaped state.
  3921. */
  3922. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  3923. continue stateloop;
  3924. }
  3925. }
  3926. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3927. case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
  3928. scriptdataescapedlessthanloop: for (;;) {
  3929. if (++pos == endPos) {
  3930. break stateloop;
  3931. }
  3932. c = checkChar(buf, pos);
  3933. /*
  3934. * Consume the next input character:
  3935. */
  3936. switch (c) {
  3937. case '/':
  3938. /*
  3939. * U+002F SOLIDUS (/) Set the temporary buffer
  3940. * to the empty string. Switch to the script
  3941. * data escaped end tag open state.
  3942. */
  3943. index = 0;
  3944. clearStrBuf();
  3945. returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
  3946. state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
  3947. continue stateloop;
  3948. case 'S':
  3949. case 's':
  3950. /*
  3951. * U+0041 LATIN CAPITAL LETTER A through to
  3952. * U+005A LATIN CAPITAL LETTER Z Emit a U+003C
  3953. * LESS-THAN SIGN character token and the
  3954. * current input character as a character token.
  3955. */
  3956. tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  3957. cstart = pos;
  3958. index = 1;
  3959. /*
  3960. * Set the temporary buffer to the empty string.
  3961. * Append the lowercase version of the current
  3962. * input character (add 0x0020 to the
  3963. * character's code point) to the temporary
  3964. * buffer. Switch to the script data double
  3965. * escape start state.
  3966. */
  3967. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
  3968. break scriptdataescapedlessthanloop;
  3969. // continue stateloop;
  3970. default:
  3971. /*
  3972. * Anything else Emit a U+003C LESS-THAN SIGN
  3973. * character token and reconsume the current
  3974. * input character in the script data escaped
  3975. * state.
  3976. */
  3977. tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  3978. cstart = pos;
  3979. reconsume = true;
  3980. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  3981. continue stateloop;
  3982. }
  3983. }
  3984. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  3985. case SCRIPT_DATA_DOUBLE_ESCAPE_START:
  3986. scriptdatadoubleescapestartloop: for (;;) {
  3987. if (++pos == endPos) {
  3988. break stateloop;
  3989. }
  3990. c = checkChar(buf, pos);
  3991. assert (index > 0);
  3992. if (index < 6) { // SCRIPT_ARR.length
  3993. char folded = c;
  3994. if (c >= 'A' && c <= 'Z') {
  3995. folded += 0x20;
  3996. }
  3997. if (folded != Tokenizer.SCRIPT_ARR[index]) {
  3998. reconsume = true;
  3999. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4000. continue stateloop;
  4001. }
  4002. index++;
  4003. continue;
  4004. }
  4005. switch (c) {
  4006. case '\r':
  4007. emitCarriageReturn(buf, pos);
  4008. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4009. break stateloop;
  4010. case '\n':
  4011. silentLineFeed();
  4012. case ' ':
  4013. case '\t':
  4014. case '\u000C':
  4015. case '/':
  4016. case '>':
  4017. /*
  4018. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4019. * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  4020. * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
  4021. * (>) Emit the current input character as a
  4022. * character token. If the temporary buffer is
  4023. * the string "script", then switch to the
  4024. * script data double escaped state.
  4025. */
  4026. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4027. break scriptdatadoubleescapestartloop;
  4028. // continue stateloop;
  4029. default:
  4030. /*
  4031. * Anything else Reconsume the current input
  4032. * character in the script data escaped state.
  4033. */
  4034. reconsume = true;
  4035. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4036. continue stateloop;
  4037. }
  4038. }
  4039. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4040. case SCRIPT_DATA_DOUBLE_ESCAPED:
  4041. scriptdatadoubleescapedloop: for (;;) {
  4042. if (reconsume) {
  4043. reconsume = false;
  4044. } else {
  4045. if (++pos == endPos) {
  4046. break stateloop;
  4047. }
  4048. c = checkChar(buf, pos);
  4049. }
  4050. /*
  4051. * Consume the next input character:
  4052. */
  4053. switch (c) {
  4054. case '-':
  4055. /*
  4056. * U+002D HYPHEN-MINUS (-) Emit a U+002D
  4057. * HYPHEN-MINUS character token. Switch to the
  4058. * script data double escaped dash state.
  4059. */
  4060. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
  4061. break scriptdatadoubleescapedloop; // FALL THRU
  4062. // continue
  4063. // stateloop;
  4064. case '<':
  4065. /*
  4066. * U+003C LESS-THAN SIGN (<) Emit a U+003C
  4067. * LESS-THAN SIGN character token. Switch to the
  4068. * script data double escaped less-than sign
  4069. * state.
  4070. */
  4071. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
  4072. continue stateloop;
  4073. case '\u0000':
  4074. emitReplacementCharacter(buf, pos);
  4075. continue;
  4076. case '\r':
  4077. emitCarriageReturn(buf, pos);
  4078. break stateloop;
  4079. case '\n':
  4080. silentLineFeed();
  4081. default:
  4082. /*
  4083. * Anything else Emit the current input
  4084. * character as a character token. Stay in the
  4085. * script data double escaped state.
  4086. */
  4087. continue;
  4088. }
  4089. }
  4090. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4091. case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
  4092. scriptdatadoubleescapeddashloop: for (;;) {
  4093. if (++pos == endPos) {
  4094. break stateloop;
  4095. }
  4096. c = checkChar(buf, pos);
  4097. /*
  4098. * Consume the next input character:
  4099. */
  4100. switch (c) {
  4101. case '-':
  4102. /*
  4103. * U+002D HYPHEN-MINUS (-) Emit a U+002D
  4104. * HYPHEN-MINUS character token. Switch to the
  4105. * script data double escaped dash dash state.
  4106. */
  4107. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
  4108. break scriptdatadoubleescapeddashloop;
  4109. // continue stateloop;
  4110. case '<':
  4111. /*
  4112. * U+003C LESS-THAN SIGN (<) Emit a U+003C
  4113. * LESS-THAN SIGN character token. Switch to the
  4114. * script data double escaped less-than sign
  4115. * state.
  4116. */
  4117. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
  4118. continue stateloop;
  4119. case '\u0000':
  4120. emitReplacementCharacter(buf, pos);
  4121. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4122. continue stateloop;
  4123. case '\r':
  4124. emitCarriageReturn(buf, pos);
  4125. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4126. break stateloop;
  4127. case '\n':
  4128. silentLineFeed();
  4129. default:
  4130. /*
  4131. * Anything else Emit the current input
  4132. * character as a character token. Switch to the
  4133. * script data double escaped state.
  4134. */
  4135. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4136. continue stateloop;
  4137. }
  4138. }
  4139. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4140. case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
  4141. scriptdatadoubleescapeddashdashloop: for (;;) {
  4142. if (++pos == endPos) {
  4143. break stateloop;
  4144. }
  4145. c = checkChar(buf, pos);
  4146. /*
  4147. * Consume the next input character:
  4148. */
  4149. switch (c) {
  4150. case '-':
  4151. /*
  4152. * U+002D HYPHEN-MINUS (-) Emit a U+002D
  4153. * HYPHEN-MINUS character token. Stay in the
  4154. * script data double escaped dash dash state.
  4155. */
  4156. continue;
  4157. case '<':
  4158. /*
  4159. * U+003C LESS-THAN SIGN (<) Emit a U+003C
  4160. * LESS-THAN SIGN character token. Switch to the
  4161. * script data double escaped less-than sign
  4162. * state.
  4163. */
  4164. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
  4165. break scriptdatadoubleescapeddashdashloop;
  4166. case '>':
  4167. /*
  4168. * U+003E GREATER-THAN SIGN (>) Emit a U+003E
  4169. * GREATER-THAN SIGN character token. Switch to
  4170. * the script data state.
  4171. */
  4172. state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
  4173. continue stateloop;
  4174. case '\u0000':
  4175. emitReplacementCharacter(buf, pos);
  4176. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4177. continue stateloop;
  4178. case '\r':
  4179. emitCarriageReturn(buf, pos);
  4180. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4181. break stateloop;
  4182. case '\n':
  4183. silentLineFeed();
  4184. default:
  4185. /*
  4186. * Anything else Emit the current input
  4187. * character as a character token. Switch to the
  4188. * script data double escaped state.
  4189. */
  4190. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4191. continue stateloop;
  4192. }
  4193. }
  4194. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4195. case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
  4196. scriptdatadoubleescapedlessthanloop: for (;;) {
  4197. if (++pos == endPos) {
  4198. break stateloop;
  4199. }
  4200. c = checkChar(buf, pos);
  4201. /*
  4202. * Consume the next input character:
  4203. */
  4204. switch (c) {
  4205. case '/':
  4206. /*
  4207. * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
  4208. * character token. Set the temporary buffer to
  4209. * the empty string. Switch to the script data
  4210. * double escape end state.
  4211. */
  4212. index = 0;
  4213. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
  4214. break scriptdatadoubleescapedlessthanloop;
  4215. default:
  4216. /*
  4217. * Anything else Reconsume the current input
  4218. * character in the script data double escaped
  4219. * state.
  4220. */
  4221. reconsume = true;
  4222. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4223. continue stateloop;
  4224. }
  4225. }
  4226. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  4227. case SCRIPT_DATA_DOUBLE_ESCAPE_END:
  4228. scriptdatadoubleescapeendloop: for (;;) {
  4229. if (++pos == endPos) {
  4230. break stateloop;
  4231. }
  4232. c = checkChar(buf, pos);
  4233. if (index < 6) { // SCRIPT_ARR.length
  4234. char folded = c;
  4235. if (c >= 'A' && c <= 'Z') {
  4236. folded += 0x20;
  4237. }
  4238. if (folded != Tokenizer.SCRIPT_ARR[index]) {
  4239. reconsume = true;
  4240. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4241. continue stateloop;
  4242. }
  4243. index++;
  4244. continue;
  4245. }
  4246. switch (c) {
  4247. case '\r':
  4248. emitCarriageReturn(buf, pos);
  4249. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4250. break stateloop;
  4251. case '\n':
  4252. silentLineFeed();
  4253. case ' ':
  4254. case '\t':
  4255. case '\u000C':
  4256. case '/':
  4257. case '>':
  4258. /*
  4259. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4260. * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  4261. * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
  4262. * (>) Emit the current input character as a
  4263. * character token. If the temporary buffer is
  4264. * the string "script", then switch to the
  4265. * script data escaped state.
  4266. */
  4267. state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
  4268. continue stateloop;
  4269. default:
  4270. /*
  4271. * Reconsume the current input character in the
  4272. * script data double escaped state.
  4273. */
  4274. reconsume = true;
  4275. state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
  4276. continue stateloop;
  4277. }
  4278. }
  4279. // XXX reorder point
  4280. case MARKUP_DECLARATION_OCTYPE:
  4281. markupdeclarationdoctypeloop: for (;;) {
  4282. if (++pos == endPos) {
  4283. break stateloop;
  4284. }
  4285. c = checkChar(buf, pos);
  4286. if (index < 6) { // OCTYPE.length
  4287. char folded = c;
  4288. if (c >= 'A' && c <= 'Z') {
  4289. folded += 0x20;
  4290. }
  4291. if (folded == Tokenizer.OCTYPE[index]) {
  4292. appendLongStrBuf(c);
  4293. } else {
  4294. errBogusComment();
  4295. reconsume = true;
  4296. state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
  4297. continue stateloop;
  4298. }
  4299. index++;
  4300. continue;
  4301. } else {
  4302. reconsume = true;
  4303. state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
  4304. break markupdeclarationdoctypeloop;
  4305. // continue stateloop;
  4306. }
  4307. }
  4308. // FALLTHRU DON'T REORDER
  4309. case DOCTYPE:
  4310. doctypeloop: for (;;) {
  4311. if (reconsume) {
  4312. reconsume = false;
  4313. } else {
  4314. if (++pos == endPos) {
  4315. break stateloop;
  4316. }
  4317. c = checkChar(buf, pos);
  4318. }
  4319. initDoctypeFields();
  4320. /*
  4321. * Consume the next input character:
  4322. */
  4323. switch (c) {
  4324. case '\r':
  4325. silentCarriageReturn();
  4326. state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
  4327. break stateloop;
  4328. case '\n':
  4329. silentLineFeed();
  4330. // fall thru
  4331. case ' ':
  4332. case '\t':
  4333. case '\u000C':
  4334. /*
  4335. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4336. * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  4337. * Switch to the before DOCTYPE name state.
  4338. */
  4339. state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
  4340. break doctypeloop;
  4341. // continue stateloop;
  4342. default:
  4343. /*
  4344. * Anything else Parse error.
  4345. */
  4346. errMissingSpaceBeforeDoctypeName();
  4347. /*
  4348. * Reconsume the current character in the before
  4349. * DOCTYPE name state.
  4350. */
  4351. reconsume = true;
  4352. state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
  4353. break doctypeloop;
  4354. // continue stateloop;
  4355. }
  4356. }
  4357. // FALLTHRU DON'T REORDER
  4358. case BEFORE_DOCTYPE_NAME:
  4359. beforedoctypenameloop: for (;;) {
  4360. if (reconsume) {
  4361. reconsume = false;
  4362. } else {
  4363. if (++pos == endPos) {
  4364. break stateloop;
  4365. }
  4366. c = checkChar(buf, pos);
  4367. }
  4368. /*
  4369. * Consume the next input character:
  4370. */
  4371. switch (c) {
  4372. case '\r':
  4373. silentCarriageReturn();
  4374. break stateloop;
  4375. case '\n':
  4376. silentLineFeed();
  4377. // fall thru
  4378. case ' ':
  4379. case '\t':
  4380. case '\u000C':
  4381. /*
  4382. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4383. * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  4384. * in the before DOCTYPE name state.
  4385. */
  4386. continue;
  4387. case '>':
  4388. /*
  4389. * U+003E GREATER-THAN SIGN (>) Parse error.
  4390. */
  4391. errNamelessDoctype();
  4392. /*
  4393. * Create a new DOCTYPE token. Set its
  4394. * force-quirks flag to on.
  4395. */
  4396. forceQuirks = true;
  4397. /*
  4398. * Emit the token.
  4399. */
  4400. emitDoctypeToken(pos);
  4401. /*
  4402. * Switch to the data state.
  4403. */
  4404. state = transition(state, Tokenizer.DATA, reconsume, pos);
  4405. continue stateloop;
  4406. case '\u0000':
  4407. c = '\uFFFD';
  4408. // fall thru
  4409. default:
  4410. if (c >= 'A' && c <= 'Z') {
  4411. /*
  4412. * U+0041 LATIN CAPITAL LETTER A through to
  4413. * U+005A LATIN CAPITAL LETTER Z Create a
  4414. * new DOCTYPE token. Set the token's name
  4415. * to the lowercase version of the input
  4416. * character (add 0x0020 to the character's
  4417. * code point).
  4418. */
  4419. c += 0x20;
  4420. }
  4421. /* Anything else Create a new DOCTYPE token. */
  4422. /*
  4423. * Set the token's name to the current
  4424. * input character.
  4425. */
  4426. clearStrBufAndAppend(c);
  4427. /*
  4428. * Switch to the DOCTYPE name state.
  4429. */
  4430. state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
  4431. break beforedoctypenameloop;
  4432. // continue stateloop;
  4433. }
  4434. }
  4435. // FALLTHRU DON'T REORDER
  4436. case DOCTYPE_NAME:
  4437. doctypenameloop: for (;;) {
  4438. if (++pos == endPos) {
  4439. break stateloop;
  4440. }
  4441. c = checkChar(buf, pos);
  4442. /*
  4443. * Consume the next input character:
  4444. */
  4445. switch (c) {
  4446. case '\r':
  4447. silentCarriageReturn();
  4448. strBufToDoctypeName();
  4449. state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
  4450. break stateloop;
  4451. case '\n':
  4452. silentLineFeed();
  4453. // fall thru
  4454. case ' ':
  4455. case '\t':
  4456. case '\u000C':
  4457. /*
  4458. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4459. * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  4460. * Switch to the after DOCTYPE name state.
  4461. */
  4462. strBufToDoctypeName();
  4463. state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
  4464. break doctypenameloop;
  4465. // continue stateloop;
  4466. case '>':
  4467. /*
  4468. * U+003E GREATER-THAN SIGN (>) Emit the current
  4469. * DOCTYPE token.
  4470. */
  4471. strBufToDoctypeName();
  4472. emitDoctypeToken(pos);
  4473. /*
  4474. * Switch to the data state.
  4475. */
  4476. state = transition(state, Tokenizer.DATA, reconsume, pos);
  4477. continue stateloop;
  4478. case '\u0000':
  4479. c = '\uFFFD';
  4480. // fall thru
  4481. default:
  4482. /*
  4483. * U+0041 LATIN CAPITAL LETTER A through to
  4484. * U+005A LATIN CAPITAL LETTER Z Append the
  4485. * lowercase version of the input character (add
  4486. * 0x0020 to the character's code point) to the
  4487. * current DOCTYPE token's name.
  4488. */
  4489. if (c >= 'A' && c <= 'Z') {
  4490. c += 0x0020;
  4491. }
  4492. /*
  4493. * Anything else Append the current input
  4494. * character to the current DOCTYPE token's
  4495. * name.
  4496. */
  4497. appendStrBuf(c);
  4498. /*
  4499. * Stay in the DOCTYPE name state.
  4500. */
  4501. continue;
  4502. }
  4503. }
  4504. // FALLTHRU DON'T REORDER
  4505. case AFTER_DOCTYPE_NAME:
  4506. afterdoctypenameloop: for (;;) {
  4507. if (++pos == endPos) {
  4508. break stateloop;
  4509. }
  4510. c = checkChar(buf, pos);
  4511. /*
  4512. * Consume the next input character:
  4513. */
  4514. switch (c) {
  4515. case '\r':
  4516. silentCarriageReturn();
  4517. break stateloop;
  4518. case '\n':
  4519. silentLineFeed();
  4520. // fall thru
  4521. case ' ':
  4522. case '\t':
  4523. case '\u000C':
  4524. /*
  4525. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4526. * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  4527. * in the after DOCTYPE name state.
  4528. */
  4529. continue;
  4530. case '>':
  4531. /*
  4532. * U+003E GREATER-THAN SIGN (>) Emit the current
  4533. * DOCTYPE token.
  4534. */
  4535. emitDoctypeToken(pos);
  4536. /*
  4537. * Switch to the data state.
  4538. */
  4539. state = transition(state, Tokenizer.DATA, reconsume, pos);
  4540. continue stateloop;
  4541. case 'p':
  4542. case 'P':
  4543. index = 0;
  4544. state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
  4545. break afterdoctypenameloop;
  4546. // continue stateloop;
  4547. case 's':
  4548. case 'S':
  4549. index = 0;
  4550. state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
  4551. continue stateloop;
  4552. default:
  4553. /*
  4554. * Otherwise, this is a parse error.
  4555. */
  4556. bogusDoctype();
  4557. /*
  4558. * Set the DOCTYPE token's force-quirks flag to
  4559. * on.
  4560. */
  4561. // done by bogusDoctype();
  4562. /*
  4563. * Switch to the bogus DOCTYPE state.
  4564. */
  4565. state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  4566. continue stateloop;
  4567. }
  4568. }
  4569. // FALLTHRU DON'T REORDER
  4570. case DOCTYPE_UBLIC:
  4571. doctypeublicloop: for (;;) {
  4572. if (++pos == endPos) {
  4573. break stateloop;
  4574. }
  4575. c = checkChar(buf, pos);
  4576. /*
  4577. * If the six characters starting from the current input
  4578. * character are an ASCII case-insensitive match for the
  4579. * word "PUBLIC", then consume those characters and
  4580. * switch to the before DOCTYPE public identifier state.
  4581. */
  4582. if (index < 5) { // UBLIC.length
  4583. char folded = c;
  4584. if (c >= 'A' && c <= 'Z') {
  4585. folded += 0x20;
  4586. }
  4587. if (folded != Tokenizer.UBLIC[index]) {
  4588. bogusDoctype();
  4589. // forceQuirks = true;
  4590. reconsume = true;
  4591. state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  4592. continue stateloop;
  4593. }
  4594. index++;
  4595. continue;
  4596. } else {
  4597. reconsume = true;
  4598. state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
  4599. break doctypeublicloop;
  4600. // continue stateloop;
  4601. }
  4602. }
  4603. // FALLTHRU DON'T REORDER
  4604. case AFTER_DOCTYPE_PUBLIC_KEYWORD:
  4605. afterdoctypepublickeywordloop: for (;;) {
  4606. if (reconsume) {
  4607. reconsume = false;
  4608. } else {
  4609. if (++pos == endPos) {
  4610. break stateloop;
  4611. }
  4612. c = checkChar(buf, pos);
  4613. }
  4614. /*
  4615. * Consume the next input character:
  4616. */
  4617. switch (c) {
  4618. case '\r':
  4619. silentCarriageReturn();
  4620. state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
  4621. break stateloop;
  4622. case '\n':
  4623. silentLineFeed();
  4624. // fall thru
  4625. case ' ':
  4626. case '\t':
  4627. case '\u000C':
  4628. /*
  4629. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4630. * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  4631. * Switch to the before DOCTYPE public
  4632. * identifier state.
  4633. */
  4634. state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
  4635. break afterdoctypepublickeywordloop;
  4636. // FALL THROUGH continue stateloop
  4637. case '"':
  4638. /*
  4639. * U+0022 QUOTATION MARK (") Parse Error.
  4640. */
  4641. errNoSpaceBetweenDoctypePublicKeywordAndQuote();
  4642. /*
  4643. * Set the DOCTYPE token's public identifier to
  4644. * the empty string (not missing),
  4645. */
  4646. clearLongStrBuf();
  4647. /*
  4648. * then switch to the DOCTYPE public identifier
  4649. * (double-quoted) state.
  4650. */
  4651. state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
  4652. continue stateloop;
  4653. case '\'':
  4654. /*
  4655. * U+0027 APOSTROPHE (') Parse Error.
  4656. */
  4657. errNoSpaceBetweenDoctypePublicKeywordAndQuote();
  4658. /*
  4659. * Set the DOCTYPE token's public identifier to
  4660. * the empty string (not missing),
  4661. */
  4662. clearLongStrBuf();
  4663. /*
  4664. * then switch to the DOCTYPE public identifier
  4665. * (single-quoted) state.
  4666. */
  4667. state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
  4668. continue stateloop;
  4669. case '>':
  4670. /* U+003E GREATER-THAN SIGN (>) Parse error. */
  4671. errExpectedPublicId();
  4672. /*
  4673. * Set the DOCTYPE token's force-quirks flag to
  4674. * on.
  4675. */
  4676. forceQuirks = true;
  4677. /*
  4678. * Emit that DOCTYPE token.
  4679. */
  4680. emitDoctypeToken(pos);
  4681. /*
  4682. * Switch to the data state.
  4683. */
  4684. state = transition(state, Tokenizer.DATA, reconsume, pos);
  4685. continue stateloop;
  4686. default:
  4687. bogusDoctype();
  4688. /*
  4689. * Set the DOCTYPE token's force-quirks flag to
  4690. * on.
  4691. */
  4692. // done by bogusDoctype();
  4693. /*
  4694. * Switch to the bogus DOCTYPE state.
  4695. */
  4696. state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  4697. continue stateloop;
  4698. }
  4699. }
  4700. // FALLTHRU DON'T REORDER
  4701. case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
  4702. beforedoctypepublicidentifierloop: for (;;) {
  4703. if (++pos == endPos) {
  4704. break stateloop;
  4705. }
  4706. c = checkChar(buf, pos);
  4707. /*
  4708. * Consume the next input character:
  4709. */
  4710. switch (c) {
  4711. case '\r':
  4712. silentCarriageReturn();
  4713. break stateloop;
  4714. case '\n':
  4715. silentLineFeed();
  4716. // fall thru
  4717. case ' ':
  4718. case '\t':
  4719. case '\u000C':
  4720. /*
  4721. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4722. * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  4723. * in the before DOCTYPE public identifier
  4724. * state.
  4725. */
  4726. continue;
  4727. case '"':
  4728. /*
  4729. * U+0022 QUOTATION MARK (") Set the DOCTYPE
  4730. * token's public identifier to the empty string
  4731. * (not missing),
  4732. */
  4733. clearLongStrBuf();
  4734. /*
  4735. * then switch to the DOCTYPE public identifier
  4736. * (double-quoted) state.
  4737. */
  4738. state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
  4739. break beforedoctypepublicidentifierloop;
  4740. // continue stateloop;
  4741. case '\'':
  4742. /*
  4743. * U+0027 APOSTROPHE (') Set the DOCTYPE token's
  4744. * public identifier to the empty string (not
  4745. * missing),
  4746. */
  4747. clearLongStrBuf();
  4748. /*
  4749. * then switch to the DOCTYPE public identifier
  4750. * (single-quoted) state.
  4751. */
  4752. state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
  4753. continue stateloop;
  4754. case '>':
  4755. /* U+003E GREATER-THAN SIGN (>) Parse error. */
  4756. errExpectedPublicId();
  4757. /*
  4758. * Set the DOCTYPE token's force-quirks flag to
  4759. * on.
  4760. */
  4761. forceQuirks = true;
  4762. /*
  4763. * Emit that DOCTYPE token.
  4764. */
  4765. emitDoctypeToken(pos);
  4766. /*
  4767. * Switch to the data state.
  4768. */
  4769. state = transition(state, Tokenizer.DATA, reconsume, pos);
  4770. continue stateloop;
  4771. default:
  4772. bogusDoctype();
  4773. /*
  4774. * Set the DOCTYPE token's force-quirks flag to
  4775. * on.
  4776. */
  4777. // done by bogusDoctype();
  4778. /*
  4779. * Switch to the bogus DOCTYPE state.
  4780. */
  4781. state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  4782. continue stateloop;
  4783. }
  4784. }
  4785. // FALLTHRU DON'T REORDER
  4786. case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
  4787. doctypepublicidentifierdoublequotedloop: for (;;) {
  4788. if (++pos == endPos) {
  4789. break stateloop;
  4790. }
  4791. c = checkChar(buf, pos);
  4792. /*
  4793. * Consume the next input character:
  4794. */
  4795. switch (c) {
  4796. case '"':
  4797. /*
  4798. * U+0022 QUOTATION MARK (") Switch to the after
  4799. * DOCTYPE public identifier state.
  4800. */
  4801. publicIdentifier = longStrBufToString();
  4802. state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
  4803. break doctypepublicidentifierdoublequotedloop;
  4804. // continue stateloop;
  4805. case '>':
  4806. /*
  4807. * U+003E GREATER-THAN SIGN (>) Parse error.
  4808. */
  4809. errGtInPublicId();
  4810. /*
  4811. * Set the DOCTYPE token's force-quirks flag to
  4812. * on.
  4813. */
  4814. forceQuirks = true;
  4815. /*
  4816. * Emit that DOCTYPE token.
  4817. */
  4818. publicIdentifier = longStrBufToString();
  4819. emitDoctypeToken(pos);
  4820. /*
  4821. * Switch to the data state.
  4822. */
  4823. state = transition(state, Tokenizer.DATA, reconsume, pos);
  4824. continue stateloop;
  4825. case '\r':
  4826. appendLongStrBufCarriageReturn();
  4827. break stateloop;
  4828. case '\n':
  4829. appendLongStrBufLineFeed();
  4830. continue;
  4831. case '\u0000':
  4832. c = '\uFFFD';
  4833. // fall thru
  4834. default:
  4835. /*
  4836. * Anything else Append the current input
  4837. * character to the current DOCTYPE token's
  4838. * public identifier.
  4839. */
  4840. appendLongStrBuf(c);
  4841. /*
  4842. * Stay in the DOCTYPE public identifier
  4843. * (double-quoted) state.
  4844. */
  4845. continue;
  4846. }
  4847. }
  4848. // FALLTHRU DON'T REORDER
  4849. case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
  4850. afterdoctypepublicidentifierloop: for (;;) {
  4851. if (++pos == endPos) {
  4852. break stateloop;
  4853. }
  4854. c = checkChar(buf, pos);
  4855. /*
  4856. * Consume the next input character:
  4857. */
  4858. switch (c) {
  4859. case '\r':
  4860. silentCarriageReturn();
  4861. state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
  4862. break stateloop;
  4863. case '\n':
  4864. silentLineFeed();
  4865. // fall thru
  4866. case ' ':
  4867. case '\t':
  4868. case '\u000C':
  4869. /*
  4870. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4871. * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  4872. * Switch to the between DOCTYPE public and
  4873. * system identifiers state.
  4874. */
  4875. state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
  4876. break afterdoctypepublicidentifierloop;
  4877. // continue stateloop;
  4878. case '>':
  4879. /*
  4880. * U+003E GREATER-THAN SIGN (>) Emit the current
  4881. * DOCTYPE token.
  4882. */
  4883. emitDoctypeToken(pos);
  4884. /*
  4885. * Switch to the data state.
  4886. */
  4887. state = transition(state, Tokenizer.DATA, reconsume, pos);
  4888. continue stateloop;
  4889. case '"':
  4890. /*
  4891. * U+0022 QUOTATION MARK (") Parse error.
  4892. */
  4893. errNoSpaceBetweenPublicAndSystemIds();
  4894. /*
  4895. * Set the DOCTYPE token's system identifier to
  4896. * the empty string (not missing),
  4897. */
  4898. clearLongStrBuf();
  4899. /*
  4900. * then switch to the DOCTYPE system identifier
  4901. * (double-quoted) state.
  4902. */
  4903. state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
  4904. continue stateloop;
  4905. case '\'':
  4906. /*
  4907. * U+0027 APOSTROPHE (') Parse error.
  4908. */
  4909. errNoSpaceBetweenPublicAndSystemIds();
  4910. /*
  4911. * Set the DOCTYPE token's system identifier to
  4912. * the empty string (not missing),
  4913. */
  4914. clearLongStrBuf();
  4915. /*
  4916. * then switch to the DOCTYPE system identifier
  4917. * (single-quoted) state.
  4918. */
  4919. state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
  4920. continue stateloop;
  4921. default:
  4922. bogusDoctype();
  4923. /*
  4924. * Set the DOCTYPE token's force-quirks flag to
  4925. * on.
  4926. */
  4927. // done by bogusDoctype();
  4928. /*
  4929. * Switch to the bogus DOCTYPE state.
  4930. */
  4931. state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  4932. continue stateloop;
  4933. }
  4934. }
  4935. // FALLTHRU DON'T REORDER
  4936. case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
  4937. betweendoctypepublicandsystemidentifiersloop: for (;;) {
  4938. if (++pos == endPos) {
  4939. break stateloop;
  4940. }
  4941. c = checkChar(buf, pos);
  4942. /*
  4943. * Consume the next input character:
  4944. */
  4945. switch (c) {
  4946. case '\r':
  4947. silentCarriageReturn();
  4948. break stateloop;
  4949. case '\n':
  4950. silentLineFeed();
  4951. // fall thru
  4952. case ' ':
  4953. case '\t':
  4954. case '\u000C':
  4955. /*
  4956. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  4957. * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  4958. * in the between DOCTYPE public and system
  4959. * identifiers state.
  4960. */
  4961. continue;
  4962. case '>':
  4963. /*
  4964. * U+003E GREATER-THAN SIGN (>) Emit the current
  4965. * DOCTYPE token.
  4966. */
  4967. emitDoctypeToken(pos);
  4968. /*
  4969. * Switch to the data state.
  4970. */
  4971. state = transition(state, Tokenizer.DATA, reconsume, pos);
  4972. continue stateloop;
  4973. case '"':
  4974. /*
  4975. * U+0022 QUOTATION MARK (") Set the DOCTYPE
  4976. * token's system identifier to the empty string
  4977. * (not missing),
  4978. */
  4979. clearLongStrBuf();
  4980. /*
  4981. * then switch to the DOCTYPE system identifier
  4982. * (double-quoted) state.
  4983. */
  4984. state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
  4985. break betweendoctypepublicandsystemidentifiersloop;
  4986. // continue stateloop;
  4987. case '\'':
  4988. /*
  4989. * U+0027 APOSTROPHE (') Set the DOCTYPE token's
  4990. * system identifier to the empty string (not
  4991. * missing),
  4992. */
  4993. clearLongStrBuf();
  4994. /*
  4995. * then switch to the DOCTYPE system identifier
  4996. * (single-quoted) state.
  4997. */
  4998. state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
  4999. continue stateloop;
  5000. default:
  5001. bogusDoctype();
  5002. /*
  5003. * Set the DOCTYPE token's force-quirks flag to
  5004. * on.
  5005. */
  5006. // done by bogusDoctype();
  5007. /*
  5008. * Switch to the bogus DOCTYPE state.
  5009. */
  5010. state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  5011. continue stateloop;
  5012. }
  5013. }
  5014. // FALLTHRU DON'T REORDER
  5015. case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
  5016. doctypesystemidentifierdoublequotedloop: for (;;) {
  5017. if (++pos == endPos) {
  5018. break stateloop;
  5019. }
  5020. c = checkChar(buf, pos);
  5021. /*
  5022. * Consume the next input character:
  5023. */
  5024. switch (c) {
  5025. case '"':
  5026. /*
  5027. * U+0022 QUOTATION MARK (") Switch to the after
  5028. * DOCTYPE system identifier state.
  5029. */
  5030. systemIdentifier = longStrBufToString();
  5031. state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
  5032. continue stateloop;
  5033. case '>':
  5034. /*
  5035. * U+003E GREATER-THAN SIGN (>) Parse error.
  5036. */
  5037. errGtInSystemId();
  5038. /*
  5039. * Set the DOCTYPE token's force-quirks flag to
  5040. * on.
  5041. */
  5042. forceQuirks = true;
  5043. /*
  5044. * Emit that DOCTYPE token.
  5045. */
  5046. systemIdentifier = longStrBufToString();
  5047. emitDoctypeToken(pos);
  5048. /*
  5049. * Switch to the data state.
  5050. */
  5051. state = transition(state, Tokenizer.DATA, reconsume, pos);
  5052. continue stateloop;
  5053. case '\r':
  5054. appendLongStrBufCarriageReturn();
  5055. break stateloop;
  5056. case '\n':
  5057. appendLongStrBufLineFeed();
  5058. continue;
  5059. case '\u0000':
  5060. c = '\uFFFD';
  5061. // fall thru
  5062. default:
  5063. /*
  5064. * Anything else Append the current input
  5065. * character to the current DOCTYPE token's
  5066. * system identifier.
  5067. */
  5068. appendLongStrBuf(c);
  5069. /*
  5070. * Stay in the DOCTYPE system identifier
  5071. * (double-quoted) state.
  5072. */
  5073. continue;
  5074. }
  5075. }
  5076. // FALLTHRU DON'T REORDER
  5077. case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
  5078. afterdoctypesystemidentifierloop: for (;;) {
  5079. if (++pos == endPos) {
  5080. break stateloop;
  5081. }
  5082. c = checkChar(buf, pos);
  5083. /*
  5084. * Consume the next input character:
  5085. */
  5086. switch (c) {
  5087. case '\r':
  5088. silentCarriageReturn();
  5089. break stateloop;
  5090. case '\n':
  5091. silentLineFeed();
  5092. // fall thru
  5093. case ' ':
  5094. case '\t':
  5095. case '\u000C':
  5096. /*
  5097. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  5098. * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  5099. * in the after DOCTYPE system identifier state.
  5100. */
  5101. continue;
  5102. case '>':
  5103. /*
  5104. * U+003E GREATER-THAN SIGN (>) Emit the current
  5105. * DOCTYPE token.
  5106. */
  5107. emitDoctypeToken(pos);
  5108. /*
  5109. * Switch to the data state.
  5110. */
  5111. state = transition(state, Tokenizer.DATA, reconsume, pos);
  5112. continue stateloop;
  5113. default:
  5114. /*
  5115. * Switch to the bogus DOCTYPE state. (This does
  5116. * not set the DOCTYPE token's force-quirks flag
  5117. * to on.)
  5118. */
  5119. bogusDoctypeWithoutQuirks();
  5120. state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  5121. break afterdoctypesystemidentifierloop;
  5122. // continue stateloop;
  5123. }
  5124. }
  5125. // FALLTHRU DON'T REORDER
  5126. case BOGUS_DOCTYPE:
  5127. for (;;) {
  5128. if (reconsume) {
  5129. reconsume = false;
  5130. } else {
  5131. if (++pos == endPos) {
  5132. break stateloop;
  5133. }
  5134. c = checkChar(buf, pos);
  5135. }
  5136. /*
  5137. * Consume the next input character:
  5138. */
  5139. switch (c) {
  5140. case '>':
  5141. /*
  5142. * U+003E GREATER-THAN SIGN (>) Emit that
  5143. * DOCTYPE token.
  5144. */
  5145. emitDoctypeToken(pos);
  5146. /*
  5147. * Switch to the data state.
  5148. */
  5149. state = transition(state, Tokenizer.DATA, reconsume, pos);
  5150. continue stateloop;
  5151. case '\r':
  5152. silentCarriageReturn();
  5153. break stateloop;
  5154. case '\n':
  5155. silentLineFeed();
  5156. // fall thru
  5157. default:
  5158. /*
  5159. * Anything else Stay in the bogus DOCTYPE
  5160. * state.
  5161. */
  5162. continue;
  5163. }
  5164. }
  5165. // XXX reorder point
  5166. case DOCTYPE_YSTEM:
  5167. doctypeystemloop: for (;;) {
  5168. if (++pos == endPos) {
  5169. break stateloop;
  5170. }
  5171. c = checkChar(buf, pos);
  5172. /*
  5173. * Otherwise, if the six characters starting from the
  5174. * current input character are an ASCII case-insensitive
  5175. * match for the word "SYSTEM", then consume those
  5176. * characters and switch to the before DOCTYPE system
  5177. * identifier state.
  5178. */
  5179. if (index < 5) { // YSTEM.length
  5180. char folded = c;
  5181. if (c >= 'A' && c <= 'Z') {
  5182. folded += 0x20;
  5183. }
  5184. if (folded != Tokenizer.YSTEM[index]) {
  5185. bogusDoctype();
  5186. reconsume = true;
  5187. state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  5188. continue stateloop;
  5189. }
  5190. index++;
  5191. continue stateloop;
  5192. } else {
  5193. reconsume = true;
  5194. state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
  5195. break doctypeystemloop;
  5196. // continue stateloop;
  5197. }
  5198. }
  5199. // FALLTHRU DON'T REORDER
  5200. case AFTER_DOCTYPE_SYSTEM_KEYWORD:
  5201. afterdoctypesystemkeywordloop: for (;;) {
  5202. if (reconsume) {
  5203. reconsume = false;
  5204. } else {
  5205. if (++pos == endPos) {
  5206. break stateloop;
  5207. }
  5208. c = checkChar(buf, pos);
  5209. }
  5210. /*
  5211. * Consume the next input character:
  5212. */
  5213. switch (c) {
  5214. case '\r':
  5215. silentCarriageReturn();
  5216. state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
  5217. break stateloop;
  5218. case '\n':
  5219. silentLineFeed();
  5220. // fall thru
  5221. case ' ':
  5222. case '\t':
  5223. case '\u000C':
  5224. /*
  5225. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  5226. * (LF) U+000C FORM FEED (FF) U+0020 SPACE
  5227. * Switch to the before DOCTYPE public
  5228. * identifier state.
  5229. */
  5230. state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
  5231. break afterdoctypesystemkeywordloop;
  5232. // FALL THROUGH continue stateloop
  5233. case '"':
  5234. /*
  5235. * U+0022 QUOTATION MARK (") Parse Error.
  5236. */
  5237. errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
  5238. /*
  5239. * Set the DOCTYPE token's system identifier to
  5240. * the empty string (not missing),
  5241. */
  5242. clearLongStrBuf();
  5243. /*
  5244. * then switch to the DOCTYPE public identifier
  5245. * (double-quoted) state.
  5246. */
  5247. state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
  5248. continue stateloop;
  5249. case '\'':
  5250. /*
  5251. * U+0027 APOSTROPHE (') Parse Error.
  5252. */
  5253. errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
  5254. /*
  5255. * Set the DOCTYPE token's public identifier to
  5256. * the empty string (not missing),
  5257. */
  5258. clearLongStrBuf();
  5259. /*
  5260. * then switch to the DOCTYPE public identifier
  5261. * (single-quoted) state.
  5262. */
  5263. state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
  5264. continue stateloop;
  5265. case '>':
  5266. /* U+003E GREATER-THAN SIGN (>) Parse error. */
  5267. errExpectedPublicId();
  5268. /*
  5269. * Set the DOCTYPE token's force-quirks flag to
  5270. * on.
  5271. */
  5272. forceQuirks = true;
  5273. /*
  5274. * Emit that DOCTYPE token.
  5275. */
  5276. emitDoctypeToken(pos);
  5277. /*
  5278. * Switch to the data state.
  5279. */
  5280. state = transition(state, Tokenizer.DATA, reconsume, pos);
  5281. continue stateloop;
  5282. default:
  5283. bogusDoctype();
  5284. /*
  5285. * Set the DOCTYPE token's force-quirks flag to
  5286. * on.
  5287. */
  5288. // done by bogusDoctype();
  5289. /*
  5290. * Switch to the bogus DOCTYPE state.
  5291. */
  5292. state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  5293. continue stateloop;
  5294. }
  5295. }
  5296. // FALLTHRU DON'T REORDER
  5297. case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
  5298. beforedoctypesystemidentifierloop: for (;;) {
  5299. if (++pos == endPos) {
  5300. break stateloop;
  5301. }
  5302. c = checkChar(buf, pos);
  5303. /*
  5304. * Consume the next input character:
  5305. */
  5306. switch (c) {
  5307. case '\r':
  5308. silentCarriageReturn();
  5309. break stateloop;
  5310. case '\n':
  5311. silentLineFeed();
  5312. // fall thru
  5313. case ' ':
  5314. case '\t':
  5315. case '\u000C':
  5316. /*
  5317. * U+0009 CHARACTER TABULATION U+000A LINE FEED
  5318. * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
  5319. * in the before DOCTYPE system identifier
  5320. * state.
  5321. */
  5322. continue;
  5323. case '"':
  5324. /*
  5325. * U+0022 QUOTATION MARK (") Set the DOCTYPE
  5326. * token's system identifier to the empty string
  5327. * (not missing),
  5328. */
  5329. clearLongStrBuf();
  5330. /*
  5331. * then switch to the DOCTYPE system identifier
  5332. * (double-quoted) state.
  5333. */
  5334. state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
  5335. continue stateloop;
  5336. case '\'':
  5337. /*
  5338. * U+0027 APOSTROPHE (') Set the DOCTYPE token's
  5339. * system identifier to the empty string (not
  5340. * missing),
  5341. */
  5342. clearLongStrBuf();
  5343. /*
  5344. * then switch to the DOCTYPE system identifier
  5345. * (single-quoted) state.
  5346. */
  5347. state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
  5348. break beforedoctypesystemidentifierloop;
  5349. // continue stateloop;
  5350. case '>':
  5351. /* U+003E GREATER-THAN SIGN (>) Parse error. */
  5352. errExpectedSystemId();
  5353. /*
  5354. * Set the DOCTYPE token's force-quirks flag to
  5355. * on.
  5356. */
  5357. forceQuirks = true;
  5358. /*
  5359. * Emit that DOCTYPE token.
  5360. */
  5361. emitDoctypeToken(pos);
  5362. /*
  5363. * Switch to the data state.
  5364. */
  5365. state = transition(state, Tokenizer.DATA, reconsume, pos);
  5366. continue stateloop;
  5367. default:
  5368. bogusDoctype();
  5369. /*
  5370. * Set the DOCTYPE token's force-quirks flag to
  5371. * on.
  5372. */
  5373. // done by bogusDoctype();
  5374. /*
  5375. * Switch to the bogus DOCTYPE state.
  5376. */
  5377. state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
  5378. continue stateloop;
  5379. }
  5380. }
  5381. // FALLTHRU DON'T REORDER
  5382. case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
  5383. for (;;) {
  5384. if (++pos == endPos) {
  5385. break stateloop;
  5386. }
  5387. c = checkChar(buf, pos);
  5388. /*
  5389. * Consume the next input character:
  5390. */
  5391. switch (c) {
  5392. case '\'':
  5393. /*
  5394. * U+0027 APOSTROPHE (') Switch to the after
  5395. * DOCTYPE system identifier state.
  5396. */
  5397. systemIdentifier = longStrBufToString();
  5398. state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
  5399. continue stateloop;
  5400. case '>':
  5401. errGtInSystemId();
  5402. /*
  5403. * Set the DOCTYPE token's force-quirks flag to
  5404. * on.
  5405. */
  5406. forceQuirks = true;
  5407. /*
  5408. * Emit that DOCTYPE token.
  5409. */
  5410. systemIdentifier = longStrBufToString();
  5411. emitDoctypeToken(pos);
  5412. /*
  5413. * Switch to the data state.
  5414. */
  5415. state = transition(state, Tokenizer.DATA, reconsume, pos);
  5416. continue stateloop;
  5417. case '\r':
  5418. appendLongStrBufCarriageReturn();
  5419. break stateloop;
  5420. case '\n':
  5421. appendLongStrBufLineFeed();
  5422. continue;
  5423. case '\u0000':
  5424. c = '\uFFFD';
  5425. // fall thru
  5426. default:
  5427. /*
  5428. * Anything else Append the current input
  5429. * character to the current DOCTYPE token's
  5430. * system identifier.
  5431. */
  5432. appendLongStrBuf(c);
  5433. /*
  5434. * Stay in the DOCTYPE system identifier
  5435. * (double-quoted) state.
  5436. */
  5437. continue;
  5438. }
  5439. }
  5440. // XXX reorder point
  5441. case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
  5442. for (;;) {
  5443. if (++pos == endPos) {
  5444. break stateloop;
  5445. }
  5446. c = checkChar(buf, pos);
  5447. /*
  5448. * Consume the next input character:
  5449. */
  5450. switch (c) {
  5451. case '\'':
  5452. /*
  5453. * U+0027 APOSTROPHE (') Switch to the after
  5454. * DOCTYPE public identifier state.
  5455. */
  5456. publicIdentifier = longStrBufToString();
  5457. state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
  5458. continue stateloop;
  5459. case '>':
  5460. errGtInPublicId();
  5461. /*
  5462. * Set the DOCTYPE token's force-quirks flag to
  5463. * on.
  5464. */
  5465. forceQuirks = true;
  5466. /*
  5467. * Emit that DOCTYPE token.
  5468. */
  5469. publicIdentifier = longStrBufToString();
  5470. emitDoctypeToken(pos);
  5471. /*
  5472. * Switch to the data state.
  5473. */
  5474. state = transition(state, Tokenizer.DATA, reconsume, pos);
  5475. continue stateloop;
  5476. case '\r':
  5477. appendLongStrBufCarriageReturn();
  5478. break stateloop;
  5479. case '\n':
  5480. appendLongStrBufLineFeed();
  5481. continue;
  5482. case '\u0000':
  5483. c = '\uFFFD';
  5484. // fall thru
  5485. default:
  5486. /*
  5487. * Anything else Append the current input
  5488. * character to the current DOCTYPE token's
  5489. * public identifier.
  5490. */
  5491. appendLongStrBuf(c);
  5492. /*
  5493. * Stay in the DOCTYPE public identifier
  5494. * (single-quoted) state.
  5495. */
  5496. continue;
  5497. }
  5498. }
  5499. // XXX reorder point
  5500. case PROCESSING_INSTRUCTION:
  5501. processinginstructionloop: for (;;) {
  5502. if (++pos == endPos) {
  5503. break stateloop;
  5504. }
  5505. c = checkChar(buf, pos);
  5506. switch (c) {
  5507. case '?':
  5508. state = transition(
  5509. state,
  5510. Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
  5511. reconsume, pos);
  5512. break processinginstructionloop;
  5513. // continue stateloop;
  5514. default:
  5515. continue;
  5516. }
  5517. }
  5518. case PROCESSING_INSTRUCTION_QUESTION_MARK:
  5519. if (++pos == endPos) {
  5520. break stateloop;
  5521. }
  5522. c = checkChar(buf, pos);
  5523. switch (c) {
  5524. case '>':
  5525. state = transition(state, Tokenizer.DATA,
  5526. reconsume, pos);
  5527. continue stateloop;
  5528. default:
  5529. state = transition(state,
  5530. Tokenizer.PROCESSING_INSTRUCTION,
  5531. reconsume, pos);
  5532. continue stateloop;
  5533. }
  5534. // END HOTSPOT WORKAROUND
  5535. }
  5536. }
  5537. flushChars(buf, pos);
  5538. /*
  5539. * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
  5540. */
  5541. // Save locals
  5542. stateSave = state;
  5543. returnStateSave = returnState;
  5544. return pos;
  5545. }
  5546. // HOTSPOT WORKAROUND INSERTION POINT
  5547. // [NOCPP[
  5548. protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
  5549. return to;
  5550. }
  5551. // ]NOCPP]
  5552. private void initDoctypeFields() {
  5553. doctypeName = "";
  5554. if (systemIdentifier != null) {
  5555. Portability.releaseString(systemIdentifier);
  5556. systemIdentifier = null;
  5557. }
  5558. if (publicIdentifier != null) {
  5559. Portability.releaseString(publicIdentifier);
  5560. publicIdentifier = null;
  5561. }
  5562. forceQuirks = false;
  5563. }
  5564. @Inline private void adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn()
  5565. throws SAXException {
  5566. silentCarriageReturn();
  5567. adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
  5568. }
  5569. @Inline private void adjustDoubleHyphenAndAppendToLongStrBufLineFeed()
  5570. throws SAXException {
  5571. silentLineFeed();
  5572. adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
  5573. }
  5574. @Inline private void appendLongStrBufLineFeed() {
  5575. silentLineFeed();
  5576. appendLongStrBuf('\n');
  5577. }
  5578. @Inline private void appendLongStrBufCarriageReturn() {
  5579. silentCarriageReturn();
  5580. appendLongStrBuf('\n');
  5581. }
  5582. @Inline protected void silentCarriageReturn() {
  5583. ++line;
  5584. lastCR = true;
  5585. }
  5586. @Inline protected void silentLineFeed() {
  5587. ++line;
  5588. }
  5589. private void emitCarriageReturn(@NoLength char[] buf, int pos)
  5590. throws SAXException {
  5591. silentCarriageReturn();
  5592. flushChars(buf, pos);
  5593. tokenHandler.characters(Tokenizer.LF, 0, 1);
  5594. cstart = Integer.MAX_VALUE;
  5595. }
  5596. private void emitReplacementCharacter(@NoLength char[] buf, int pos)
  5597. throws SAXException {
  5598. flushChars(buf, pos);
  5599. tokenHandler.zeroOriginatingReplacementCharacter();
  5600. cstart = pos + 1;
  5601. }
  5602. private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
  5603. throws SAXException {
  5604. flushChars(buf, pos);
  5605. tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
  5606. cstart = pos + 1;
  5607. }
  5608. private void setAdditionalAndRememberAmpersandLocation(char add) {
  5609. additional = add;
  5610. // [NOCPP[
  5611. ampersandLocation = new LocatorImpl(this);
  5612. // ]NOCPP]
  5613. }
  5614. private void bogusDoctype() throws SAXException {
  5615. errBogusDoctype();
  5616. forceQuirks = true;
  5617. }
  5618. private void bogusDoctypeWithoutQuirks() throws SAXException {
  5619. errBogusDoctype();
  5620. forceQuirks = false;
  5621. }
  5622. private void emitOrAppendStrBuf(int returnState) throws SAXException {
  5623. if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  5624. appendStrBufToLongStrBuf();
  5625. } else {
  5626. emitStrBuf();
  5627. }
  5628. }
  5629. private void handleNcrValue(int returnState) throws SAXException {
  5630. /*
  5631. * If one or more characters match the range, then take them all and
  5632. * interpret the string of characters as a number (either hexadecimal or
  5633. * decimal as appropriate).
  5634. */
  5635. if (value <= 0xFFFF) {
  5636. if (value >= 0x80 && value <= 0x9f) {
  5637. /*
  5638. * If that number is one of the numbers in the first column of
  5639. * the following table, then this is a parse error.
  5640. */
  5641. errNcrInC1Range();
  5642. /*
  5643. * Find the row with that number in the first column, and return
  5644. * a character token for the Unicode character given in the
  5645. * second column of that row.
  5646. */
  5647. @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
  5648. emitOrAppendOne(val, returnState);
  5649. // [NOCPP[
  5650. } else if (value == 0xC
  5651. && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
  5652. if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
  5653. emitOrAppendOne(Tokenizer.SPACE, returnState);
  5654. } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
  5655. fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
  5656. }
  5657. // ]NOCPP]
  5658. } else if (value == 0x0) {
  5659. errNcrZero();
  5660. emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
  5661. } else if ((value & 0xF800) == 0xD800) {
  5662. errNcrSurrogate();
  5663. emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
  5664. } else {
  5665. /*
  5666. * Otherwise, return a character token for the Unicode character
  5667. * whose code point is that number.
  5668. */
  5669. char ch = (char) value;
  5670. // [NOCPP[
  5671. if (value == 0x0D) {
  5672. errNcrCr();
  5673. } else if ((value <= 0x0008) || (value == 0x000B)
  5674. || (value >= 0x000E && value <= 0x001F)) {
  5675. ch = errNcrControlChar(ch);
  5676. } else if (value >= 0xFDD0 && value <= 0xFDEF) {
  5677. errNcrUnassigned();
  5678. } else if ((value & 0xFFFE) == 0xFFFE) {
  5679. ch = errNcrNonCharacter(ch);
  5680. } else if (value >= 0x007F && value <= 0x009F) {
  5681. errNcrControlChar();
  5682. } else {
  5683. maybeWarnPrivateUse(ch);
  5684. }
  5685. // ]NOCPP]
  5686. bmpChar[0] = ch;
  5687. emitOrAppendOne(bmpChar, returnState);
  5688. }
  5689. } else if (value <= 0x10FFFF) {
  5690. // [NOCPP[
  5691. maybeWarnPrivateUseAstral();
  5692. if ((value & 0xFFFE) == 0xFFFE) {
  5693. errAstralNonCharacter(value);
  5694. }
  5695. // ]NOCPP]
  5696. astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
  5697. astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
  5698. emitOrAppendTwo(astralChar, returnState);
  5699. } else {
  5700. errNcrOutOfRange();
  5701. emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
  5702. }
  5703. }
  5704. public void eof() throws SAXException {
  5705. int state = stateSave;
  5706. int returnState = returnStateSave;
  5707. eofloop: for (;;) {
  5708. switch (state) {
  5709. case SCRIPT_DATA_LESS_THAN_SIGN:
  5710. case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
  5711. /*
  5712. * Otherwise, emit a U+003C LESS-THAN SIGN character token
  5713. */
  5714. tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  5715. /*
  5716. * and reconsume the current input character in the data
  5717. * state.
  5718. */
  5719. break eofloop;
  5720. case TAG_OPEN:
  5721. /*
  5722. * The behavior of this state depends on the content model
  5723. * flag.
  5724. */
  5725. /*
  5726. * Anything else Parse error.
  5727. */
  5728. errEofAfterLt();
  5729. /*
  5730. * Emit a U+003C LESS-THAN SIGN character token
  5731. */
  5732. tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  5733. /*
  5734. * and reconsume the current input character in the data
  5735. * state.
  5736. */
  5737. break eofloop;
  5738. case RAWTEXT_RCDATA_LESS_THAN_SIGN:
  5739. /*
  5740. * Emit a U+003C LESS-THAN SIGN character token
  5741. */
  5742. tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
  5743. /*
  5744. * and reconsume the current input character in the RCDATA
  5745. * state.
  5746. */
  5747. break eofloop;
  5748. case NON_DATA_END_TAG_NAME:
  5749. /*
  5750. * Emit a U+003C LESS-THAN SIGN character token, a U+002F
  5751. * SOLIDUS character token,
  5752. */
  5753. tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
  5754. /*
  5755. * a character token for each of the characters in the
  5756. * temporary buffer (in the order they were added to the
  5757. * buffer),
  5758. */
  5759. emitStrBuf();
  5760. /*
  5761. * and reconsume the current input character in the RCDATA
  5762. * state.
  5763. */
  5764. break eofloop;
  5765. case CLOSE_TAG_OPEN:
  5766. /* EOF Parse error. */
  5767. errEofAfterLt();
  5768. /*
  5769. * Emit a U+003C LESS-THAN SIGN character token and a U+002F
  5770. * SOLIDUS character token.
  5771. */
  5772. tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
  5773. /*
  5774. * Reconsume the EOF character in the data state.
  5775. */
  5776. break eofloop;
  5777. case TAG_NAME:
  5778. /*
  5779. * EOF Parse error.
  5780. */
  5781. errEofInTagName();
  5782. /*
  5783. * Reconsume the EOF character in the data state.
  5784. */
  5785. break eofloop;
  5786. case BEFORE_ATTRIBUTE_NAME:
  5787. case AFTER_ATTRIBUTE_VALUE_QUOTED:
  5788. case SELF_CLOSING_START_TAG:
  5789. /* EOF Parse error. */
  5790. errEofWithoutGt();
  5791. /*
  5792. * Reconsume the EOF character in the data state.
  5793. */
  5794. break eofloop;
  5795. case ATTRIBUTE_NAME:
  5796. /*
  5797. * EOF Parse error.
  5798. */
  5799. errEofInAttributeName();
  5800. /*
  5801. * Reconsume the EOF character in the data state.
  5802. */
  5803. break eofloop;
  5804. case AFTER_ATTRIBUTE_NAME:
  5805. case BEFORE_ATTRIBUTE_VALUE:
  5806. /* EOF Parse error. */
  5807. errEofWithoutGt();
  5808. /*
  5809. * Reconsume the EOF character in the data state.
  5810. */
  5811. break eofloop;
  5812. case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
  5813. case ATTRIBUTE_VALUE_SINGLE_QUOTED:
  5814. case ATTRIBUTE_VALUE_UNQUOTED:
  5815. /* EOF Parse error. */
  5816. errEofInAttributeValue();
  5817. /*
  5818. * Reconsume the EOF character in the data state.
  5819. */
  5820. break eofloop;
  5821. case BOGUS_COMMENT:
  5822. emitComment(0, 0);
  5823. break eofloop;
  5824. case BOGUS_COMMENT_HYPHEN:
  5825. // [NOCPP[
  5826. maybeAppendSpaceToBogusComment();
  5827. // ]NOCPP]
  5828. emitComment(0, 0);
  5829. break eofloop;
  5830. case MARKUP_DECLARATION_OPEN:
  5831. errBogusComment();
  5832. clearLongStrBuf();
  5833. emitComment(0, 0);
  5834. break eofloop;
  5835. case MARKUP_DECLARATION_HYPHEN:
  5836. errBogusComment();
  5837. emitComment(0, 0);
  5838. break eofloop;
  5839. case MARKUP_DECLARATION_OCTYPE:
  5840. if (index < 6) {
  5841. errBogusComment();
  5842. emitComment(0, 0);
  5843. } else {
  5844. /* EOF Parse error. */
  5845. errEofInDoctype();
  5846. /*
  5847. * Create a new DOCTYPE token. Set its force-quirks flag
  5848. * to on.
  5849. */
  5850. doctypeName = "";
  5851. if (systemIdentifier != null) {
  5852. Portability.releaseString(systemIdentifier);
  5853. systemIdentifier = null;
  5854. }
  5855. if (publicIdentifier != null) {
  5856. Portability.releaseString(publicIdentifier);
  5857. publicIdentifier = null;
  5858. }
  5859. forceQuirks = true;
  5860. /*
  5861. * Emit the token.
  5862. */
  5863. emitDoctypeToken(0);
  5864. /*
  5865. * Reconsume the EOF character in the data state.
  5866. */
  5867. break eofloop;
  5868. }
  5869. break eofloop;
  5870. case COMMENT_START:
  5871. case COMMENT:
  5872. /*
  5873. * EOF Parse error.
  5874. */
  5875. errEofInComment();
  5876. /* Emit the comment token. */
  5877. emitComment(0, 0);
  5878. /*
  5879. * Reconsume the EOF character in the data state.
  5880. */
  5881. break eofloop;
  5882. case COMMENT_END:
  5883. errEofInComment();
  5884. /* Emit the comment token. */
  5885. emitComment(2, 0);
  5886. /*
  5887. * Reconsume the EOF character in the data state.
  5888. */
  5889. break eofloop;
  5890. case COMMENT_END_DASH:
  5891. case COMMENT_START_DASH:
  5892. errEofInComment();
  5893. /* Emit the comment token. */
  5894. emitComment(1, 0);
  5895. /*
  5896. * Reconsume the EOF character in the data state.
  5897. */
  5898. break eofloop;
  5899. case COMMENT_END_BANG:
  5900. errEofInComment();
  5901. /* Emit the comment token. */
  5902. emitComment(3, 0);
  5903. /*
  5904. * Reconsume the EOF character in the data state.
  5905. */
  5906. break eofloop;
  5907. case DOCTYPE:
  5908. case BEFORE_DOCTYPE_NAME:
  5909. errEofInDoctype();
  5910. /*
  5911. * Create a new DOCTYPE token. Set its force-quirks flag to
  5912. * on.
  5913. */
  5914. forceQuirks = true;
  5915. /*
  5916. * Emit the token.
  5917. */
  5918. emitDoctypeToken(0);
  5919. /*
  5920. * Reconsume the EOF character in the data state.
  5921. */
  5922. break eofloop;
  5923. case DOCTYPE_NAME:
  5924. errEofInDoctype();
  5925. strBufToDoctypeName();
  5926. /*
  5927. * Set the DOCTYPE token's force-quirks flag to on.
  5928. */
  5929. forceQuirks = true;
  5930. /*
  5931. * Emit that DOCTYPE token.
  5932. */
  5933. emitDoctypeToken(0);
  5934. /*
  5935. * Reconsume the EOF character in the data state.
  5936. */
  5937. break eofloop;
  5938. case DOCTYPE_UBLIC:
  5939. case DOCTYPE_YSTEM:
  5940. case AFTER_DOCTYPE_NAME:
  5941. case AFTER_DOCTYPE_PUBLIC_KEYWORD:
  5942. case AFTER_DOCTYPE_SYSTEM_KEYWORD:
  5943. case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
  5944. errEofInDoctype();
  5945. /*
  5946. * Set the DOCTYPE token's force-quirks flag to on.
  5947. */
  5948. forceQuirks = true;
  5949. /*
  5950. * Emit that DOCTYPE token.
  5951. */
  5952. emitDoctypeToken(0);
  5953. /*
  5954. * Reconsume the EOF character in the data state.
  5955. */
  5956. break eofloop;
  5957. case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
  5958. case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
  5959. /* EOF Parse error. */
  5960. errEofInPublicId();
  5961. /*
  5962. * Set the DOCTYPE token's force-quirks flag to on.
  5963. */
  5964. forceQuirks = true;
  5965. /*
  5966. * Emit that DOCTYPE token.
  5967. */
  5968. publicIdentifier = longStrBufToString();
  5969. emitDoctypeToken(0);
  5970. /*
  5971. * Reconsume the EOF character in the data state.
  5972. */
  5973. break eofloop;
  5974. case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
  5975. case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
  5976. case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
  5977. errEofInDoctype();
  5978. /*
  5979. * Set the DOCTYPE token's force-quirks flag to on.
  5980. */
  5981. forceQuirks = true;
  5982. /*
  5983. * Emit that DOCTYPE token.
  5984. */
  5985. emitDoctypeToken(0);
  5986. /*
  5987. * Reconsume the EOF character in the data state.
  5988. */
  5989. break eofloop;
  5990. case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
  5991. case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
  5992. /* EOF Parse error. */
  5993. errEofInSystemId();
  5994. /*
  5995. * Set the DOCTYPE token's force-quirks flag to on.
  5996. */
  5997. forceQuirks = true;
  5998. /*
  5999. * Emit that DOCTYPE token.
  6000. */
  6001. systemIdentifier = longStrBufToString();
  6002. emitDoctypeToken(0);
  6003. /*
  6004. * Reconsume the EOF character in the data state.
  6005. */
  6006. break eofloop;
  6007. case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
  6008. errEofInDoctype();
  6009. /*
  6010. * Set the DOCTYPE token's force-quirks flag to on.
  6011. */
  6012. forceQuirks = true;
  6013. /*
  6014. * Emit that DOCTYPE token.
  6015. */
  6016. emitDoctypeToken(0);
  6017. /*
  6018. * Reconsume the EOF character in the data state.
  6019. */
  6020. break eofloop;
  6021. case BOGUS_DOCTYPE:
  6022. /*
  6023. * Emit that DOCTYPE token.
  6024. */
  6025. emitDoctypeToken(0);
  6026. /*
  6027. * Reconsume the EOF character in the data state.
  6028. */
  6029. break eofloop;
  6030. case CONSUME_CHARACTER_REFERENCE:
  6031. /*
  6032. * Unlike the definition is the spec, this state does not
  6033. * return a value and never requires the caller to
  6034. * backtrack. This state takes care of emitting characters
  6035. * or appending to the current attribute value. It also
  6036. * takes care of that in the case when consuming the entity
  6037. * fails.
  6038. */
  6039. /*
  6040. * This section defines how to consume an entity. This
  6041. * definition is used when parsing entities in text and in
  6042. * attributes.
  6043. *
  6044. * The behavior depends on the identity of the next
  6045. * character (the one immediately after the U+0026 AMPERSAND
  6046. * character):
  6047. */
  6048. emitOrAppendStrBuf(returnState);
  6049. state = returnState;
  6050. continue;
  6051. case CHARACTER_REFERENCE_HILO_LOOKUP:
  6052. errNoNamedCharacterMatch();
  6053. emitOrAppendStrBuf(returnState);
  6054. state = returnState;
  6055. continue;
  6056. case CHARACTER_REFERENCE_TAIL:
  6057. outer: for (;;) {
  6058. char c = '\u0000';
  6059. entCol++;
  6060. /*
  6061. * Consume the maximum number of characters possible,
  6062. * with the consumed characters matching one of the
  6063. * identifiers in the first column of the named
  6064. * character references table (in a case-sensitive
  6065. * manner).
  6066. */
  6067. hiloop: for (;;) {
  6068. if (hi == -1) {
  6069. break hiloop;
  6070. }
  6071. if (entCol == NamedCharacters.NAMES[hi].length()) {
  6072. break hiloop;
  6073. }
  6074. if (entCol > NamedCharacters.NAMES[hi].length()) {
  6075. break outer;
  6076. } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
  6077. hi--;
  6078. } else {
  6079. break hiloop;
  6080. }
  6081. }
  6082. loloop: for (;;) {
  6083. if (hi < lo) {
  6084. break outer;
  6085. }
  6086. if (entCol == NamedCharacters.NAMES[lo].length()) {
  6087. candidate = lo;
  6088. strBufMark = strBufLen;
  6089. lo++;
  6090. } else if (entCol > NamedCharacters.NAMES[lo].length()) {
  6091. break outer;
  6092. } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
  6093. lo++;
  6094. } else {
  6095. break loloop;
  6096. }
  6097. }
  6098. if (hi < lo) {
  6099. break outer;
  6100. }
  6101. continue;
  6102. }
  6103. if (candidate == -1) {
  6104. /*
  6105. * If no match can be made, then this is a parse error.
  6106. */
  6107. errNoNamedCharacterMatch();
  6108. emitOrAppendStrBuf(returnState);
  6109. state = returnState;
  6110. continue eofloop;
  6111. } else {
  6112. @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
  6113. if (candidateName.length() == 0
  6114. || candidateName.charAt(candidateName.length() - 1) != ';') {
  6115. /*
  6116. * If the last character matched is not a U+003B
  6117. * SEMICOLON (;), there is a parse error.
  6118. */
  6119. if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  6120. /*
  6121. * If the entity is being consumed as part of an
  6122. * attribute, and the last character matched is
  6123. * not a U+003B SEMICOLON (;),
  6124. */
  6125. char ch;
  6126. if (strBufMark == strBufLen) {
  6127. ch = '\u0000';
  6128. } else {
  6129. ch = strBuf[strBufMark];
  6130. }
  6131. if ((ch >= '0' && ch <= '9')
  6132. || (ch >= 'A' && ch <= 'Z')
  6133. || (ch >= 'a' && ch <= 'z')) {
  6134. /*
  6135. * and the next character is in the range
  6136. * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
  6137. * U+0041 LATIN CAPITAL LETTER A to U+005A
  6138. * LATIN CAPITAL LETTER Z, or U+0061 LATIN
  6139. * SMALL LETTER A to U+007A LATIN SMALL
  6140. * LETTER Z, then, for historical reasons,
  6141. * all the characters that were matched
  6142. * after the U+0026 AMPERSAND (&) must be
  6143. * unconsumed, and nothing is returned.
  6144. */
  6145. errNoNamedCharacterMatch();
  6146. appendStrBufToLongStrBuf();
  6147. state = returnState;
  6148. continue eofloop;
  6149. }
  6150. }
  6151. if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  6152. errUnescapedAmpersandInterpretedAsCharacterReference();
  6153. } else {
  6154. errNotSemicolonTerminated();
  6155. }
  6156. }
  6157. /*
  6158. * Otherwise, return a character token for the character
  6159. * corresponding to the entity name (as given by the
  6160. * second column of the named character references
  6161. * table).
  6162. */
  6163. @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
  6164. if (
  6165. // [NOCPP[
  6166. val.length == 1
  6167. // ]NOCPP]
  6168. // CPPONLY: val[1] == 0
  6169. ) {
  6170. emitOrAppendOne(val, returnState);
  6171. } else {
  6172. emitOrAppendTwo(val, returnState);
  6173. }
  6174. // this is so complicated!
  6175. if (strBufMark < strBufLen) {
  6176. if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  6177. for (int i = strBufMark; i < strBufLen; i++) {
  6178. appendLongStrBuf(strBuf[i]);
  6179. }
  6180. } else {
  6181. tokenHandler.characters(strBuf, strBufMark,
  6182. strBufLen - strBufMark);
  6183. }
  6184. }
  6185. state = returnState;
  6186. continue eofloop;
  6187. /*
  6188. * If the markup contains I'm &notit; I tell you, the
  6189. * entity is parsed as "not", as in, I'm ¬it; I tell
  6190. * you. But if the markup was I'm &notin; I tell you,
  6191. * the entity would be parsed as "notin;", resulting in
  6192. * I'm ∉ I tell you.
  6193. */
  6194. }
  6195. case CONSUME_NCR:
  6196. case DECIMAL_NRC_LOOP:
  6197. case HEX_NCR_LOOP:
  6198. /*
  6199. * If no characters match the range, then don't consume any
  6200. * characters (and unconsume the U+0023 NUMBER SIGN
  6201. * character and, if appropriate, the X character). This is
  6202. * a parse error; nothing is returned.
  6203. *
  6204. * Otherwise, if the next character is a U+003B SEMICOLON,
  6205. * consume that too. If it isn't, there is a parse error.
  6206. */
  6207. if (!seenDigits) {
  6208. errNoDigitsInNCR();
  6209. emitOrAppendStrBuf(returnState);
  6210. state = returnState;
  6211. continue;
  6212. } else {
  6213. errCharRefLacksSemicolon();
  6214. }
  6215. // WARNING previous state sets reconsume
  6216. handleNcrValue(returnState);
  6217. state = returnState;
  6218. continue;
  6219. case CDATA_RSQB:
  6220. tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
  6221. break eofloop;
  6222. case CDATA_RSQB_RSQB:
  6223. tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
  6224. break eofloop;
  6225. case DATA:
  6226. default:
  6227. break eofloop;
  6228. }
  6229. }
  6230. // case DATA:
  6231. /*
  6232. * EOF Emit an end-of-file token.
  6233. */
  6234. tokenHandler.eof();
  6235. return;
  6236. }
  6237. private void emitDoctypeToken(int pos) throws SAXException {
  6238. cstart = pos + 1;
  6239. tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
  6240. forceQuirks);
  6241. // It is OK and sufficient to release these here, since
  6242. // there's no way out of the doctype states than through paths
  6243. // that call this method.
  6244. doctypeName = null;
  6245. Portability.releaseString(publicIdentifier);
  6246. publicIdentifier = null;
  6247. Portability.releaseString(systemIdentifier);
  6248. systemIdentifier = null;
  6249. }
  6250. @Inline protected char checkChar(@NoLength char[] buf, int pos)
  6251. throws SAXException {
  6252. return buf[pos];
  6253. }
  6254. // [NOCPP[
  6255. /**
  6256. * Returns the alreadyComplainedAboutNonAscii.
  6257. *
  6258. * @return the alreadyComplainedAboutNonAscii
  6259. */
  6260. public boolean isAlreadyComplainedAboutNonAscii() {
  6261. return true;
  6262. }
  6263. // ]NOCPP]
  6264. public boolean internalEncodingDeclaration(String internalCharset)
  6265. throws SAXException {
  6266. if (encodingDeclarationHandler != null) {
  6267. return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
  6268. }
  6269. return false;
  6270. }
  6271. /**
  6272. * @param val
  6273. * @throws SAXException
  6274. */
  6275. private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
  6276. throws SAXException {
  6277. if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  6278. appendLongStrBuf(val[0]);
  6279. appendLongStrBuf(val[1]);
  6280. } else {
  6281. tokenHandler.characters(val, 0, 2);
  6282. }
  6283. }
  6284. private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
  6285. throws SAXException {
  6286. if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
  6287. appendLongStrBuf(val[0]);
  6288. } else {
  6289. tokenHandler.characters(val, 0, 1);
  6290. }
  6291. }
  6292. public void end() throws SAXException {
  6293. strBuf = null;
  6294. longStrBuf = null;
  6295. doctypeName = null;
  6296. if (systemIdentifier != null) {
  6297. Portability.releaseString(systemIdentifier);
  6298. systemIdentifier = null;
  6299. }
  6300. if (publicIdentifier != null) {
  6301. Portability.releaseString(publicIdentifier);
  6302. publicIdentifier = null;
  6303. }
  6304. if (tagName != null) {
  6305. tagName.release();
  6306. tagName = null;
  6307. }
  6308. if (attributeName != null) {
  6309. attributeName.release();
  6310. attributeName = null;
  6311. }
  6312. tokenHandler.endTokenization();
  6313. if (attributes != null) {
  6314. attributes.clear(mappingLangToXmlLang);
  6315. Portability.delete(attributes);
  6316. attributes = null;
  6317. }
  6318. }
  6319. public void requestSuspension() {
  6320. shouldSuspend = true;
  6321. }
  6322. // [NOCPP[
  6323. public void becomeConfident() {
  6324. confident = true;
  6325. }
  6326. /**
  6327. * Returns the nextCharOnNewLine.
  6328. *
  6329. * @return the nextCharOnNewLine
  6330. */
  6331. public boolean isNextCharOnNewLine() {
  6332. return false;
  6333. }
  6334. public boolean isPrevCR() {
  6335. return lastCR;
  6336. }
  6337. /**
  6338. * Returns the line.
  6339. *
  6340. * @return the line
  6341. */
  6342. public int getLine() {
  6343. return -1;
  6344. }
  6345. /**
  6346. * Returns the col.
  6347. *
  6348. * @return the col
  6349. */
  6350. public int getCol() {
  6351. return -1;
  6352. }
  6353. // ]NOCPP]
  6354. public boolean isInDataState() {
  6355. return (stateSave == DATA);
  6356. }
  6357. public void resetToDataState() {
  6358. strBufLen = 0;
  6359. longStrBufLen = 0;
  6360. stateSave = Tokenizer.DATA;
  6361. // line = 1; XXX line numbers
  6362. lastCR = false;
  6363. index = 0;
  6364. forceQuirks = false;
  6365. additional = '\u0000';
  6366. entCol = -1;
  6367. firstCharKey = -1;
  6368. lo = 0;
  6369. hi = 0; // will always be overwritten before use anyway
  6370. candidate = -1;
  6371. strBufMark = 0;
  6372. prevValue = -1;
  6373. value = 0;
  6374. seenDigits = false;
  6375. endTag = false;
  6376. shouldSuspend = false;
  6377. initDoctypeFields();
  6378. if (tagName != null) {
  6379. tagName.release();
  6380. tagName = null;
  6381. }
  6382. if (attributeName != null) {
  6383. attributeName.release();
  6384. attributeName = null;
  6385. }
  6386. // [NOCPP[
  6387. if (newAttributesEachTime) {
  6388. // ]NOCPP]
  6389. if (attributes != null) {
  6390. Portability.delete(attributes);
  6391. attributes = null;
  6392. }
  6393. // [NOCPP[
  6394. }
  6395. // ]NOCPP]
  6396. }
  6397. public void loadState(Tokenizer other) throws SAXException {
  6398. strBufLen = other.strBufLen;
  6399. if (strBufLen > strBuf.length) {
  6400. strBuf = new char[strBufLen];
  6401. }
  6402. System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);
  6403. longStrBufLen = other.longStrBufLen;
  6404. if (longStrBufLen > longStrBuf.length) {
  6405. longStrBuf = new char[longStrBufLen];
  6406. }
  6407. System.arraycopy(other.longStrBuf, 0, longStrBuf, 0, longStrBufLen);
  6408. stateSave = other.stateSave;
  6409. returnStateSave = other.returnStateSave;
  6410. endTagExpectation = other.endTagExpectation;
  6411. endTagExpectationAsArray = other.endTagExpectationAsArray;
  6412. // line = 1; XXX line numbers
  6413. lastCR = other.lastCR;
  6414. index = other.index;
  6415. forceQuirks = other.forceQuirks;
  6416. additional = other.additional;
  6417. entCol = other.entCol;
  6418. firstCharKey = other.firstCharKey;
  6419. lo = other.lo;
  6420. hi = other.hi;
  6421. candidate = other.candidate;
  6422. strBufMark = other.strBufMark;
  6423. prevValue = other.prevValue;
  6424. value = other.value;
  6425. seenDigits = other.seenDigits;
  6426. endTag = other.endTag;
  6427. shouldSuspend = false;
  6428. if (other.doctypeName == null) {
  6429. doctypeName = null;
  6430. } else {
  6431. doctypeName = Portability.newLocalFromLocal(other.doctypeName,
  6432. interner);
  6433. }
  6434. Portability.releaseString(systemIdentifier);
  6435. if (other.systemIdentifier == null) {
  6436. systemIdentifier = null;
  6437. } else {
  6438. systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
  6439. }
  6440. Portability.releaseString(publicIdentifier);
  6441. if (other.publicIdentifier == null) {
  6442. publicIdentifier = null;
  6443. } else {
  6444. publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
  6445. }
  6446. if (tagName != null) {
  6447. tagName.release();
  6448. }
  6449. if (other.tagName == null) {
  6450. tagName = null;
  6451. } else {
  6452. tagName = other.tagName.cloneElementName(interner);
  6453. }
  6454. if (attributeName != null) {
  6455. attributeName.release();
  6456. }
  6457. if (other.attributeName == null) {
  6458. attributeName = null;
  6459. } else {
  6460. attributeName = other.attributeName.cloneAttributeName(interner);
  6461. }
  6462. Portability.delete(attributes);
  6463. if (other.attributes == null) {
  6464. attributes = null;
  6465. } else {
  6466. attributes = other.attributes.cloneAttributes(interner);
  6467. }
  6468. }
  6469. public void initializeWithoutStarting() throws SAXException {
  6470. confident = false;
  6471. strBuf = new char[64];
  6472. longStrBuf = new char[1024];
  6473. line = 1;
  6474. // [NOCPP[
  6475. html4 = false;
  6476. metaBoundaryPassed = false;
  6477. wantsComments = tokenHandler.wantsComments();
  6478. if (!newAttributesEachTime) {
  6479. attributes = new HtmlAttributes(mappingLangToXmlLang);
  6480. }
  6481. // ]NOCPP]
  6482. resetToDataState();
  6483. }
  6484. protected void errGarbageAfterLtSlash() throws SAXException {
  6485. }
  6486. protected void errLtSlashGt() throws SAXException {
  6487. }
  6488. protected void errWarnLtSlashInRcdata() throws SAXException {
  6489. }
  6490. protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
  6491. }
  6492. protected void errCharRefLacksSemicolon() throws SAXException {
  6493. }
  6494. protected void errNoDigitsInNCR() throws SAXException {
  6495. }
  6496. protected void errGtInSystemId() throws SAXException {
  6497. }
  6498. protected void errGtInPublicId() throws SAXException {
  6499. }
  6500. protected void errNamelessDoctype() throws SAXException {
  6501. }
  6502. protected void errConsecutiveHyphens() throws SAXException {
  6503. }
  6504. protected void errPrematureEndOfComment() throws SAXException {
  6505. }
  6506. protected void errBogusComment() throws SAXException {
  6507. }
  6508. protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
  6509. }
  6510. protected void errSlashNotFollowedByGt() throws SAXException {
  6511. }
  6512. protected void errHtml4XmlVoidSyntax() throws SAXException {
  6513. }
  6514. protected void errNoSpaceBetweenAttributes() throws SAXException {
  6515. }
  6516. protected void errHtml4NonNameInUnquotedAttribute(char c)
  6517. throws SAXException {
  6518. }
  6519. protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
  6520. throws SAXException {
  6521. }
  6522. protected void errAttributeValueMissing() throws SAXException {
  6523. }
  6524. protected void errBadCharBeforeAttributeNameOrNull(char c)
  6525. throws SAXException {
  6526. }
  6527. protected void errEqualsSignBeforeAttributeName() throws SAXException {
  6528. }
  6529. protected void errBadCharAfterLt(char c) throws SAXException {
  6530. }
  6531. protected void errLtGt() throws SAXException {
  6532. }
  6533. protected void errProcessingInstruction() throws SAXException {
  6534. }
  6535. protected void errUnescapedAmpersandInterpretedAsCharacterReference()
  6536. throws SAXException {
  6537. }
  6538. protected void errNotSemicolonTerminated() throws SAXException {
  6539. }
  6540. protected void errNoNamedCharacterMatch() throws SAXException {
  6541. }
  6542. protected void errQuoteBeforeAttributeName(char c) throws SAXException {
  6543. }
  6544. protected void errQuoteOrLtInAttributeNameOrNull(char c)
  6545. throws SAXException {
  6546. }
  6547. protected void errExpectedPublicId() throws SAXException {
  6548. }
  6549. protected void errBogusDoctype() throws SAXException {
  6550. }
  6551. protected void maybeWarnPrivateUseAstral() throws SAXException {
  6552. }
  6553. protected void maybeWarnPrivateUse(char ch) throws SAXException {
  6554. }
  6555. protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
  6556. throws SAXException {
  6557. }
  6558. protected void maybeErrSlashInEndTag(boolean selfClosing)
  6559. throws SAXException {
  6560. }
  6561. protected char errNcrNonCharacter(char ch) throws SAXException {
  6562. return ch;
  6563. }
  6564. protected void errAstralNonCharacter(int ch) throws SAXException {
  6565. }
  6566. protected void errNcrSurrogate() throws SAXException {
  6567. }
  6568. protected char errNcrControlChar(char ch) throws SAXException {
  6569. return ch;
  6570. }
  6571. protected void errNcrCr() throws SAXException {
  6572. }
  6573. protected void errNcrInC1Range() throws SAXException {
  6574. }
  6575. protected void errEofInPublicId() throws SAXException {
  6576. }
  6577. protected void errEofInComment() throws SAXException {
  6578. }
  6579. protected void errEofInDoctype() throws SAXException {
  6580. }
  6581. protected void errEofInAttributeValue() throws SAXException {
  6582. }
  6583. protected void errEofInAttributeName() throws SAXException {
  6584. }
  6585. protected void errEofWithoutGt() throws SAXException {
  6586. }
  6587. protected void errEofInTagName() throws SAXException {
  6588. }
  6589. protected void errEofInEndTag() throws SAXException {
  6590. }
  6591. protected void errEofAfterLt() throws SAXException {
  6592. }
  6593. protected void errNcrOutOfRange() throws SAXException {
  6594. }
  6595. protected void errNcrUnassigned() throws SAXException {
  6596. }
  6597. protected void errDuplicateAttribute() throws SAXException {
  6598. }
  6599. protected void errEofInSystemId() throws SAXException {
  6600. }
  6601. protected void errExpectedSystemId() throws SAXException {
  6602. }
  6603. protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
  6604. }
  6605. protected void errHyphenHyphenBang() throws SAXException {
  6606. }
  6607. protected void errNcrControlChar() throws SAXException {
  6608. }
  6609. protected void errNcrZero() throws SAXException {
  6610. }
  6611. protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
  6612. throws SAXException {
  6613. }
  6614. protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
  6615. }
  6616. protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
  6617. throws SAXException {
  6618. }
  6619. protected void noteAttributeWithoutValue() throws SAXException {
  6620. }
  6621. protected void noteUnquotedAttributeValue() throws SAXException {
  6622. }
  6623. /**
  6624. * Sets the encodingDeclarationHandler.
  6625. *
  6626. * @param encodingDeclarationHandler
  6627. * the encodingDeclarationHandler to set
  6628. */
  6629. public void setEncodingDeclarationHandler(
  6630. EncodingDeclarationHandler encodingDeclarationHandler) {
  6631. this.encodingDeclarationHandler = encodingDeclarationHandler;
  6632. }
  6633. void destructor() {
  6634. // The translator will write refcount tracing stuff here
  6635. }
  6636. // [NOCPP[
  6637. /**
  6638. * Sets an offset to be added to the position reported to
  6639. * <code>TransitionHandler</code>.
  6640. *
  6641. * @param offset the offset
  6642. */
  6643. public void setTransitionBaseOffset(int offset) {
  6644. }
  6645. // ]NOCPP]
  6646. }