/parser/html/javasrc/Tokenizer.java

http://github.com/zpao/v8monkey · Java · 7027 lines · 3908 code · 381 blank · 2738 comment · 760 complexity · fb65ba7ba7f99f659d89e51498e7a0f4 MD5 · raw file

Large files are truncated click here to view the full file

  1. /*
  2. * Copyright (c) 2005-2007 Henri Sivonen
  3. * Copyright (c) 2007-2010 Mozilla Foundation
  4. * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
  5. * Foundation, and Opera Software ASA.
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining a
  8. * copy of this software and associated documentation files (the "Software"),
  9. * to deal in the Software without restriction, including without limitation
  10. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11. * and/or sell copies of the Software, and to permit persons to whom the
  12. * Software is furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included in
  15. * all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  20. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  22. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  23. * DEALINGS IN THE SOFTWARE.
  24. */
  25. /*
  26. * The comments following this one that use the same comment syntax as this
  27. * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
  28. * amended as of June 18 2008 and May 31 2010.
  29. * That document came with this statement:
  30. * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
  31. * Opera Software ASA. You are granted a license to use, reproduce and
  32. * create derivative works of this document."
  33. */
  34. package nu.validator.htmlparser.impl;
  35. import nu.validator.htmlparser.annotation.Auto;
  36. import nu.validator.htmlparser.annotation.CharacterName;
  37. import nu.validator.htmlparser.annotation.Const;
  38. import nu.validator.htmlparser.annotation.Inline;
  39. import nu.validator.htmlparser.annotation.Local;
  40. import nu.validator.htmlparser.annotation.NoLength;
  41. import nu.validator.htmlparser.common.EncodingDeclarationHandler;
  42. import nu.validator.htmlparser.common.Interner;
  43. import nu.validator.htmlparser.common.TokenHandler;
  44. import nu.validator.htmlparser.common.XmlViolationPolicy;
  45. import org.xml.sax.ErrorHandler;
  46. import org.xml.sax.Locator;
  47. import org.xml.sax.SAXException;
  48. import org.xml.sax.SAXParseException;
  49. /**
  50. * An implementation of
  51. * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
  52. *
  53. * This class implements the <code>Locator</code> interface. This is not an
  54. * incidental implementation detail: Users of this class are encouraged to make
  55. * use of the <code>Locator</code> nature.
  56. *
  57. * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
  58. * can be configured to treat these conditions as fatal or to coerce the infoset
  59. * to something that XML 1.0 allows.
  60. *
  61. * @version $Id$
  62. * @author hsivonen
  63. */
  64. public class Tokenizer implements Locator {
  65. private static final int DATA_AND_RCDATA_MASK = ~1;
  66. public static final int DATA = 0;
  67. public static final int RCDATA = 1;
  68. public static final int SCRIPT_DATA = 2;
  69. public static final int RAWTEXT = 3;
  70. public static final int SCRIPT_DATA_ESCAPED = 4;
  71. public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
  72. public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
  73. public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
  74. public static final int PLAINTEXT = 8;
  75. public static final int TAG_OPEN = 9;
  76. public static final int CLOSE_TAG_OPEN = 10;
  77. public static final int TAG_NAME = 11;
  78. public static final int BEFORE_ATTRIBUTE_NAME = 12;
  79. public static final int ATTRIBUTE_NAME = 13;
  80. public static final int AFTER_ATTRIBUTE_NAME = 14;
  81. public static final int BEFORE_ATTRIBUTE_VALUE = 15;
  82. public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
  83. public static final int BOGUS_COMMENT = 17;
  84. public static final int MARKUP_DECLARATION_OPEN = 18;
  85. public static final int DOCTYPE = 19;
  86. public static final int BEFORE_DOCTYPE_NAME = 20;
  87. public static final int DOCTYPE_NAME = 21;
  88. public static final int AFTER_DOCTYPE_NAME = 22;
  89. public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
  90. public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
  91. public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
  92. public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
  93. public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
  94. public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
  95. public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
  96. public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
  97. public static final int BOGUS_DOCTYPE = 31;
  98. public static final int COMMENT_START = 32;
  99. public static final int COMMENT_START_DASH = 33;
  100. public static final int COMMENT = 34;
  101. public static final int COMMENT_END_DASH = 35;
  102. public static final int COMMENT_END = 36;
  103. public static final int COMMENT_END_BANG = 37;
  104. public static final int NON_DATA_END_TAG_NAME = 38;
  105. public static final int MARKUP_DECLARATION_HYPHEN = 39;
  106. public static final int MARKUP_DECLARATION_OCTYPE = 40;
  107. public static final int DOCTYPE_UBLIC = 41;
  108. public static final int DOCTYPE_YSTEM = 42;
  109. public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
  110. public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
  111. public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
  112. public static final int CONSUME_CHARACTER_REFERENCE = 46;
  113. public static final int CONSUME_NCR = 47;
  114. public static final int CHARACTER_REFERENCE_TAIL = 48;
  115. public static final int HEX_NCR_LOOP = 49;
  116. public static final int DECIMAL_NRC_LOOP = 50;
  117. public static final int HANDLE_NCR_VALUE = 51;
  118. public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
  119. public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
  120. public static final int SELF_CLOSING_START_TAG = 54;
  121. public static final int CDATA_START = 55;
  122. public static final int CDATA_SECTION = 56;
  123. public static final int CDATA_RSQB = 57;
  124. public static final int CDATA_RSQB_RSQB = 58;
  125. public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
  126. public static final int SCRIPT_DATA_ESCAPE_START = 60;
  127. public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
  128. public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
  129. public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
  130. public static final int BOGUS_COMMENT_HYPHEN = 64;
  131. public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
  132. public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
  133. public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
  134. public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
  135. public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
  136. public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
  137. public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
  138. public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
  139. public static final int PROCESSING_INSTRUCTION = 73;
  140. public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
  141. /**
  142. * Magic value for UTF-16 operations.
  143. */
  144. private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
  145. /**
  146. * UTF-16 code unit array containing less than and greater than for emitting
  147. * those characters on certain parse errors.
  148. */
  149. private static final @NoLength char[] LT_GT = { '<', '>' };
  150. /**
  151. * UTF-16 code unit array containing less than and solidus for emitting
  152. * those characters on certain parse errors.
  153. */
  154. private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
  155. /**
  156. * UTF-16 code unit array containing ]] for emitting those characters on
  157. * state transitions.
  158. */
  159. private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
  160. /**
  161. * Array version of U+FFFD.
  162. */
  163. private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
  164. // [NOCPP[
  165. /**
  166. * Array version of space.
  167. */
  168. private static final @NoLength char[] SPACE = { ' ' };
  169. // ]NOCPP]
  170. /**
  171. * Array version of line feed.
  172. */
  173. private static final @NoLength char[] LF = { '\n' };
  174. /**
  175. * Buffer growth parameter.
  176. */
  177. private static final int BUFFER_GROW_BY = 1024;
  178. /**
  179. * "CDATA[" as <code>char[]</code>
  180. */
  181. private static final @NoLength char[] CDATA_LSQB = "CDATA[".toCharArray();
  182. /**
  183. * "octype" as <code>char[]</code>
  184. */
  185. private static final @NoLength char[] OCTYPE = "octype".toCharArray();
  186. /**
  187. * "ublic" as <code>char[]</code>
  188. */
  189. private static final @NoLength char[] UBLIC = "ublic".toCharArray();
  190. /**
  191. * "ystem" as <code>char[]</code>
  192. */
  193. private static final @NoLength char[] YSTEM = "ystem".toCharArray();
  194. private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
  195. private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
  196. private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
  197. private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
  198. 'e', 'x', 't' };
  199. private static final char[] XMP_ARR = { 'x', 'm', 'p' };
  200. private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
  201. 'e', 'a' };
  202. private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
  203. private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
  204. 'd' };
  205. private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
  206. 'p', 't' };
  207. private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
  208. 'e', 's' };
  209. /**
  210. * The token handler.
  211. */
  212. protected final TokenHandler tokenHandler;
  213. protected EncodingDeclarationHandler encodingDeclarationHandler;
  214. // [NOCPP[
  215. /**
  216. * The error handler.
  217. */
  218. protected ErrorHandler errorHandler;
  219. // ]NOCPP]
  220. /**
  221. * Whether the previous char read was CR.
  222. */
  223. protected boolean lastCR;
  224. protected int stateSave;
  225. private int returnStateSave;
  226. protected int index;
  227. private boolean forceQuirks;
  228. private char additional;
  229. private int entCol;
  230. private int firstCharKey;
  231. private int lo;
  232. private int hi;
  233. private int candidate;
  234. private int strBufMark;
  235. private int prevValue;
  236. protected int value;
  237. private boolean seenDigits;
  238. protected int cstart;
  239. /**
  240. * The SAX public id for the resource being tokenized. (Only passed back
  241. * as part of locator data.)
  242. */
  243. private String publicId;
  244. /**
  245. * The SAX system id for the resource being tokenized. (Only passed back
  246. * as part of locator data.)
  247. */
  248. private String systemId;
  249. /**
  250. * Buffer for short identifiers.
  251. */
  252. private @Auto char[] strBuf;
  253. /**
  254. * Number of significant <code>char</code>s in <code>strBuf</code>.
  255. */
  256. private int strBufLen;
  257. /**
  258. * <code>-1</code> to indicate that <code>strBuf</code> is used or otherwise
  259. * an offset to the main buffer.
  260. */
  261. // private int strBufOffset = -1;
  262. /**
  263. * Buffer for long strings.
  264. */
  265. private @Auto char[] longStrBuf;
  266. /**
  267. * Number of significant <code>char</code>s in <code>longStrBuf</code>.
  268. */
  269. private int longStrBufLen;
  270. /**
  271. * <code>-1</code> to indicate that <code>longStrBuf</code> is used or
  272. * otherwise an offset to the main buffer.
  273. */
  274. // private int longStrBufOffset = -1;
  275. /**
  276. * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
  277. */
  278. private final @Auto char[] bmpChar;
  279. /**
  280. * Buffer for expanding astral NCRs.
  281. */
  282. private final @Auto char[] astralChar;
  283. /**
  284. * The element whose end tag closes the current CDATA or RCDATA element.
  285. */
  286. protected ElementName endTagExpectation = null;
  287. private char[] endTagExpectationAsArray; // not @Auto!
  288. /**
  289. * <code>true</code> if tokenizing an end tag
  290. */
  291. protected boolean endTag;
  292. /**
  293. * The current tag token name.
  294. */
  295. private ElementName tagName = null;
  296. /**
  297. * The current attribute name.
  298. */
  299. protected AttributeName attributeName = null;
  300. // [NOCPP[
  301. /**
  302. * Whether comment tokens are emitted.
  303. */
  304. private boolean wantsComments = false;
  305. /**
  306. * <code>true</code> when HTML4-specific additional errors are requested.
  307. */
  308. protected boolean html4;
  309. /**
  310. * Whether the stream is past the first 512 bytes.
  311. */
  312. private boolean metaBoundaryPassed;
  313. // ]NOCPP]
  314. /**
  315. * The name of the current doctype token.
  316. */
  317. private @Local String doctypeName;
  318. /**
  319. * The public id of the current doctype token.
  320. */
  321. private String publicIdentifier;
  322. /**
  323. * The system id of the current doctype token.
  324. */
  325. private String systemIdentifier;
  326. /**
  327. * The attribute holder.
  328. */
  329. private HtmlAttributes attributes;
  330. // [NOCPP[
  331. /**
  332. * The policy for vertical tab and form feed.
  333. */
  334. private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
  335. /**
  336. * The policy for comments.
  337. */
  338. private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
  339. private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
  340. private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
  341. private boolean html4ModeCompatibleWithXhtml1Schemata;
  342. private final boolean newAttributesEachTime;
  343. // ]NOCPP]
  344. private int mappingLangToXmlLang;
  345. private boolean shouldSuspend;
  346. protected boolean confident;
  347. private int line;
  348. private Interner interner;
  349. // CPPONLY: private boolean viewingXmlSource;
  350. // [NOCPP[
  351. protected LocatorImpl ampersandLocation;
  /**
   * Creates a tokenizer that reports tokens to the given handler and lets the
   * caller choose how attribute holders are recycled.
   *
   * @param tokenHandler
   *            the handler for receiving tokens
   * @param newAttributesEachTime
   *            when <code>true</code>, <code>emptyAttributes()</code> hands
   *            out a fresh holder and <code>resetAttributes()</code> drops
   *            the current holder instead of clearing it
   */
  public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
    // Per-token state starts out empty.
    this.attributes = null;
    this.systemIdentifier = null;
    this.publicIdentifier = null;
    this.doctypeName = null;
    this.attributeName = null;
    this.tagName = null;
    // Scratch arrays for one BMP code unit and one surrogate pair.
    this.astralChar = new char[2];
    this.bmpChar = new char[1];
    // Collaborators and configuration.
    this.newAttributesEachTime = newAttributesEachTime;
    this.encodingDeclarationHandler = null;
    this.tokenHandler = tokenHandler;
  }
  365. // ]NOCPP]
  366. /**
  367. * The constructor.
  368. *
  369. * @param tokenHandler
  370. * the handler for receiving tokens
  371. */
  372. public Tokenizer(TokenHandler tokenHandler
  373. // CPPONLY: , boolean viewingXmlSource
  374. ) {
  // NOTE(review): the CPPONLY / NOCPP marker comments are presumably consumed
  // by the Java-to-C++ translation step; keep them exactly where they are.
  375. this.tokenHandler = tokenHandler;
  376. this.encodingDeclarationHandler = null;
  377. // [NOCPP[
  // This overload never allocates a fresh attribute holder per tag.
  378. this.newAttributesEachTime = false;
  379. // ]NOCPP]
  // Scratch arrays for one BMP code unit and one surrogate pair.
  380. this.bmpChar = new char[1];
  381. this.astralChar = new char[2];
  // Per-token state starts out empty.
  382. this.tagName = null;
  383. this.attributeName = null;
  384. this.doctypeName = null;
  385. this.publicIdentifier = null;
  386. this.systemIdentifier = null;
  387. this.attributes = null;
  388. // CPPONLY: this.viewingXmlSource = viewingXmlSource;
  389. }
  390. public void setInterner(Interner interner) {
  391. this.interner = interner;
  392. }
  393. public void initLocation(String newPublicId, String newSystemId) {
  394. this.systemId = newSystemId;
  395. this.publicId = newPublicId;
  396. }
  397. // CPPONLY: boolean isViewingXmlSource() {
  398. // CPPONLY: return viewingXmlSource;
  399. // CPPONLY: }
  400. // [NOCPP[
  401. /**
  402. * Returns the mappingLangToXmlLang.
  403. *
  404. * @return the mappingLangToXmlLang
  405. */
  406. public boolean isMappingLangToXmlLang() {
  407. return mappingLangToXmlLang == AttributeName.HTML_LANG;
  408. }
  409. /**
  410. * Sets the mappingLangToXmlLang.
  411. *
  412. * @param mappingLangToXmlLang
  413. * the mappingLangToXmlLang to set
  414. */
  415. public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
  416. this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
  417. : AttributeName.HTML;
  418. }
  419. /**
  420. * Sets the error handler.
  421. *
  422. * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
  423. */
  424. public void setErrorHandler(ErrorHandler eh) {
  425. this.errorHandler = eh;
  426. }
  427. public ErrorHandler getErrorHandler() {
  428. return this.errorHandler;
  429. }
  430. /**
  431. * Sets the commentPolicy.
  432. *
  433. * @param commentPolicy
  434. * the commentPolicy to set
  435. */
  436. public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
  437. this.commentPolicy = commentPolicy;
  438. }
  439. /**
  440. * Sets the contentNonXmlCharPolicy.
  441. *
  442. * @param contentNonXmlCharPolicy
  443. * the contentNonXmlCharPolicy to set
  444. */
  445. public void setContentNonXmlCharPolicy(
  446. XmlViolationPolicy contentNonXmlCharPolicy) {
  447. if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
  448. throw new IllegalArgumentException(
  449. "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
  450. }
  451. }
  452. /**
  453. * Sets the contentSpacePolicy.
  454. *
  455. * @param contentSpacePolicy
  456. * the contentSpacePolicy to set
  457. */
  458. public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
  459. this.contentSpacePolicy = contentSpacePolicy;
  460. }
  461. /**
  462. * Sets the xmlnsPolicy.
  463. *
  464. * @param xmlnsPolicy
  465. * the xmlnsPolicy to set
  466. */
  467. public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
  468. if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
  469. throw new IllegalArgumentException("Can't use FATAL here.");
  470. }
  471. this.xmlnsPolicy = xmlnsPolicy;
  472. }
  473. public void setNamePolicy(XmlViolationPolicy namePolicy) {
  474. this.namePolicy = namePolicy;
  475. }
  476. /**
  477. * Sets the html4ModeCompatibleWithXhtml1Schemata.
  478. *
  479. * @param html4ModeCompatibleWithXhtml1Schemata
  480. * the html4ModeCompatibleWithXhtml1Schemata to set
  481. */
  482. public void setHtml4ModeCompatibleWithXhtml1Schemata(
  483. boolean html4ModeCompatibleWithXhtml1Schemata) {
  484. this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
  485. }
  486. // ]NOCPP]
  487. // For the token handler to call
  488. /**
  489. * Sets the tokenizer state and the associated element name. This should
  490. * only ever used to put the tokenizer into one of the states that have
  491. * a special end tag expectation.
  492. *
  493. * @param specialTokenizerState
  494. * the tokenizer state to set
  495. * @param endTagExpectation
  496. * the expected end tag for transitioning back to normal
  497. */
  498. public void setStateAndEndTagExpectation(int specialTokenizerState,
  499. @Local String endTagExpectation) {
  500. this.stateSave = specialTokenizerState;
  501. if (specialTokenizerState == Tokenizer.DATA) {
  502. return;
  503. }
  504. @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
  505. this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
  506. asArray.length, interner);
  507. endTagExpectationToArray();
  508. }
  509. /**
  510. * Sets the tokenizer state and the associated element name. This should
  511. * only ever used to put the tokenizer into one of the states that have
  512. * a special end tag expectation.
  513. *
  514. * @param specialTokenizerState
  515. * the tokenizer state to set
  516. * @param endTagExpectation
  517. * the expected end tag for transitioning back to normal
  518. */
  519. public void setStateAndEndTagExpectation(int specialTokenizerState,
  520. ElementName endTagExpectation) {
  521. this.stateSave = specialTokenizerState;
  522. this.endTagExpectation = endTagExpectation;
  523. endTagExpectationToArray();
  524. }
  525. private void endTagExpectationToArray() {
  526. switch (endTagExpectation.getGroup()) {
  527. case TreeBuilder.TITLE:
  528. endTagExpectationAsArray = TITLE_ARR;
  529. return;
  530. case TreeBuilder.SCRIPT:
  531. endTagExpectationAsArray = SCRIPT_ARR;
  532. return;
  533. case TreeBuilder.STYLE:
  534. endTagExpectationAsArray = STYLE_ARR;
  535. return;
  536. case TreeBuilder.PLAINTEXT:
  537. endTagExpectationAsArray = PLAINTEXT_ARR;
  538. return;
  539. case TreeBuilder.XMP:
  540. endTagExpectationAsArray = XMP_ARR;
  541. return;
  542. case TreeBuilder.TEXTAREA:
  543. endTagExpectationAsArray = TEXTAREA_ARR;
  544. return;
  545. case TreeBuilder.IFRAME:
  546. endTagExpectationAsArray = IFRAME_ARR;
  547. return;
  548. case TreeBuilder.NOEMBED:
  549. endTagExpectationAsArray = NOEMBED_ARR;
  550. return;
  551. case TreeBuilder.NOSCRIPT:
  552. endTagExpectationAsArray = NOSCRIPT_ARR;
  553. return;
  554. case TreeBuilder.NOFRAMES:
  555. endTagExpectationAsArray = NOFRAMES_ARR;
  556. return;
  557. default:
  558. assert false: "Bad end tag expectation.";
  559. return;
  560. }
  561. }
  562. /**
  563. * For C++ use only.
  564. */
  565. public void setLineNumber(int line) {
  566. this.line = line;
  567. }
  568. // start Locator impl
  569. /**
  570. * @see org.xml.sax.Locator#getLineNumber()
  571. */
  572. @Inline public int getLineNumber() {
  573. return line;
  574. }
  575. // [NOCPP[
  576. /**
  577. * @see org.xml.sax.Locator#getColumnNumber()
  578. */
  579. @Inline public int getColumnNumber() {
  580. return -1;
  581. }
  582. /**
  583. * @see org.xml.sax.Locator#getPublicId()
  584. */
  585. public String getPublicId() {
  586. return publicId;
  587. }
  588. /**
  589. * @see org.xml.sax.Locator#getSystemId()
  590. */
  591. public String getSystemId() {
  592. return systemId;
  593. }
  594. // end Locator impl
  595. // end public API
  596. public void notifyAboutMetaBoundary() {
  597. metaBoundaryPassed = true;
  598. }
  599. void turnOnAdditionalHtml4Errors() {
  600. html4 = true;
  601. }
  602. // ]NOCPP]
  603. HtmlAttributes emptyAttributes() {
  // Returns an attribute holder with no attributes: a newly allocated one when
  // newAttributesEachTime is set, otherwise the shared EMPTY_ATTRIBUTES.
  // NOTE(review): with the NOCPP regions removed only the shared-instance
  // return remains, presumably for the C++ translation; do not reshape.
  604. // [NOCPP[
  605. if (newAttributesEachTime) {
  606. return new HtmlAttributes(mappingLangToXmlLang);
  607. } else {
  608. // ]NOCPP]
  609. return HtmlAttributes.EMPTY_ATTRIBUTES;
  610. // [NOCPP[
  611. }
  612. // ]NOCPP]
  613. }
  614. @Inline private void clearStrBufAndAppend(char c) {
  615. strBuf[0] = c;
  616. strBufLen = 1;
  617. }
  618. @Inline private void clearStrBuf() {
  619. strBufLen = 0;
  620. }
  621. /**
  622. * Appends to the smaller buffer.
  623. *
  624. * @param c
  625. * the UTF-16 code unit to append
  626. */
  627. private void appendStrBuf(char c) {
  628. if (strBufLen == strBuf.length) {
  629. char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY];
  630. System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
  631. strBuf = newBuf;
  632. }
  633. strBuf[strBufLen++] = c;
  634. }
  635. /**
  636. * The smaller buffer as a String. Currently only used for error reporting.
  637. *
  638. * <p>
  639. * C++ memory note: The return value must be released.
  640. *
  641. * @return the smaller buffer as a string
  642. */
  643. protected String strBufToString() {
  644. return Portability.newStringFromBuffer(strBuf, 0, strBufLen);
  645. }
  646. /**
  647. * Returns the short buffer as a local name. The return value is released in
  648. * emitDoctypeToken().
  649. *
  650. * @return the smaller buffer as local name
  651. */
  652. private void strBufToDoctypeName() {
  653. doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
  654. interner);
  655. }
  656. /**
  657. * Emits the smaller buffer as character tokens.
  658. *
  659. * @throws SAXException
  660. * if the token handler threw
  661. */
  662. private void emitStrBuf() throws SAXException {
  663. if (strBufLen > 0) {
  664. tokenHandler.characters(strBuf, 0, strBufLen);
  665. }
  666. }
  667. @Inline private void clearLongStrBuf() {
  668. longStrBufLen = 0;
  669. }
  670. @Inline private void clearLongStrBufAndAppend(char c) {
  671. longStrBuf[0] = c;
  672. longStrBufLen = 1;
  673. }
  674. /**
  675. * Appends to the larger buffer.
  676. *
  677. * @param c
  678. * the UTF-16 code unit to append
  679. */
  680. private void appendLongStrBuf(char c) {
  681. if (longStrBufLen == longStrBuf.length) {
  682. char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)];
  683. System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
  684. longStrBuf = newBuf;
  685. }
  686. longStrBuf[longStrBufLen++] = c;
  687. }
  688. @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
  // Appends '-' to the long buffer according to commentPolicy:
  //   ALTER_INFOSET: append ' ' first, then fall through to ALLOW;
  //   ALLOW:         warn, then append '-';
  //   FATAL:         report a fatal error; nothing is appended.
  // NOTE(review): with the NOCPP regions removed only appendLongStrBuf('-')
  // remains, presumably for the C++ translation; do not reshape.
  689. // [NOCPP[
  690. switch (commentPolicy) {
  691. case ALTER_INFOSET:
  692. // detachLongStrBuf();
  693. appendLongStrBuf(' ');
  694. // FALLTHROUGH
  695. case ALLOW:
  696. warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
  697. // ]NOCPP]
  698. appendLongStrBuf('-');
  699. // [NOCPP[
  700. break;
  701. case FATAL:
  702. fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
  703. break;
  704. }
  705. // ]NOCPP]
  706. }
  707. // [NOCPP[
  708. private void maybeAppendSpaceToBogusComment() throws SAXException {
  709. switch (commentPolicy) {
  710. case ALTER_INFOSET:
  711. // detachLongStrBuf();
  712. appendLongStrBuf(' ');
  713. // FALLTHROUGH
  714. case ALLOW:
  715. warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
  716. break;
  717. case FATAL:
  718. fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
  719. break;
  720. }
  721. }
  722. // ]NOCPP]
  723. @Inline private void adjustDoubleHyphenAndAppendToLongStrBufAndErr(char c)
  724. throws SAXException {
  // Reports consecutive hyphens, then appends c according to commentPolicy:
  //   ALTER_INFOSET: drop the last buffered char, append ' ' and '-', then
  //                  fall through to ALLOW (presumably the dropped char is
  //                  the second hyphen -- confirm against the callers);
  //   ALLOW:         warn, then append c;
  //   FATAL:         report a fatal error; c is not appended.
  // NOTE(review): with the NOCPP regions removed only errConsecutiveHyphens()
  // and appendLongStrBuf(c) remain, presumably for the C++ translation.
  725. errConsecutiveHyphens();
  726. // [NOCPP[
  727. switch (commentPolicy) {
  728. case ALTER_INFOSET:
  729. // detachLongStrBuf();
  730. longStrBufLen--;
  731. appendLongStrBuf(' ');
  732. appendLongStrBuf('-');
  733. // FALLTHROUGH
  734. case ALLOW:
  735. warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
  736. // ]NOCPP]
  737. appendLongStrBuf(c);
  738. // [NOCPP[
  739. break;
  740. case FATAL:
  741. fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
  742. break;
  743. }
  744. // ]NOCPP]
  745. }
  746. private void appendLongStrBuf(@NoLength char[] buffer, int offset, int length) {
  747. int reqLen = longStrBufLen + length;
  748. if (longStrBuf.length < reqLen) {
  749. char[] newBuf = new char[reqLen + (reqLen >> 1)];
  750. System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
  751. longStrBuf = newBuf;
  752. }
  753. System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length);
  754. longStrBufLen = reqLen;
  755. }
  756. /**
  757. * Append the contents of the smaller buffer to the larger one.
  758. */
  759. @Inline private void appendStrBufToLongStrBuf() {
  760. appendLongStrBuf(strBuf, 0, strBufLen);
  761. }
  762. /**
  763. * The larger buffer as a string.
  764. *
  765. * <p>
  766. * C++ memory note: The return value must be released.
  767. *
  768. * @return the larger buffer as a string
  769. */
  770. private String longStrBufToString() {
  771. return Portability.newStringFromBuffer(longStrBuf, 0, longStrBufLen);
  772. }
  773. /**
  774. * Emits the current comment token.
  775. *
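  776. * @param provisionalHyphens number of trailing code units of the long buffer to leave out of the comment
  777. * @param pos position after which character coalescing resumes (cstart is set to pos + 1)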
  778. *
  779. * @throws SAXException
  780. */
  781. private void emitComment(int provisionalHyphens, int pos)
  782. throws SAXException {
  // Hands the long buffer, minus its last provisionalHyphens code units, to
  // the token handler as a comment, but only when wantsComments is set.
  // NOTE(review): the wantsComments guard sits inside NOCPP regions, so the
  // C++ translation presumably emits the comment unconditionally.
  783. // [NOCPP[
  784. if (wantsComments) {
  785. // ]NOCPP]
  786. // if (longStrBufOffset != -1) {
  787. // tokenHandler.comment(buf, longStrBufOffset, longStrBufLen
  788. // - provisionalHyphens);
  789. // } else {
  790. tokenHandler.comment(longStrBuf, 0, longStrBufLen
  791. - provisionalHyphens);
  792. // }
  793. // [NOCPP[
  794. }
  795. // ]NOCPP]
  // Character data resumes right after pos, whether or not a comment was sent.
  796. cstart = pos + 1;
  797. }
  798. /**
  799. * Flushes coalesced character tokens.
  800. *
  801. * @param buf
  802. * TODO
  803. * @param pos
  804. * TODO
  805. *
  806. * @throws SAXException
  807. */
  808. protected void flushChars(@NoLength char[] buf, int pos)
  809. throws SAXException {
  810. if (pos > cstart) {
  811. tokenHandler.characters(buf, cstart, pos - cstart);
  812. }
  813. cstart = Integer.MAX_VALUE;
  814. }
  815. /**
  816. * Reports an condition that would make the infoset incompatible with XML
  817. * 1.0 as fatal.
  818. *
  819. * @param message
  820. * the message
  821. * @throws SAXException
  822. * @throws SAXParseException
  823. */
  824. public void fatal(String message) throws SAXException {
  825. SAXParseException spe = new SAXParseException(message, this);
  826. if (errorHandler != null) {
  827. errorHandler.fatalError(spe);
  828. }
  829. throw spe;
  830. }
  831. /**
  832. * Reports a Parse Error.
  833. *
  834. * @param message
  835. * the message
  836. * @throws SAXException
  837. */
  838. public void err(String message) throws SAXException {
  839. if (errorHandler == null) {
  840. return;
  841. }
  842. SAXParseException spe = new SAXParseException(message, this);
  843. errorHandler.error(spe);
  844. }
  845. public void errTreeBuilder(String message) throws SAXException {
  846. ErrorHandler eh = null;
  847. if (tokenHandler instanceof TreeBuilder<?>) {
  848. TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
  849. eh = treeBuilder.getErrorHandler();
  850. }
  851. if (eh == null) {
  852. eh = errorHandler;
  853. }
  854. if (eh == null) {
  855. return;
  856. }
  857. SAXParseException spe = new SAXParseException(message, this);
  858. eh.error(spe);
  859. }
  860. /**
  861. * Reports a warning
  862. *
  863. * @param message
  864. * the message
  865. * @throws SAXException
  866. */
  867. public void warn(String message) throws SAXException {
  868. if (errorHandler == null) {
  869. return;
  870. }
  871. SAXParseException spe = new SAXParseException(message, this);
  872. errorHandler.warning(spe);
  873. }
  874. /**
  875. *
  876. */
  877. private void resetAttributes() {
  // Prepares the attribute holder for the next tag: drops the reference when
  // a new holder is wanted per tag, otherwise clears the holder for reuse.
  // NOTE(review): the else branch dereferences attributes without a null
  // check, so it relies on a holder having been allocated elsewhere when
  // newAttributesEachTime is false -- confirm against the start-up code.
  // With the NOCPP regions removed only "attributes = null" remains.
  878. // [NOCPP[
  879. if (newAttributesEachTime) {
  880. // ]NOCPP]
  881. attributes = null;
  882. // [NOCPP[
  883. } else {
  884. attributes.clear(mappingLangToXmlLang);
  885. }
  886. // ]NOCPP]
  887. }
  888. private void strBufToElementNameString() {
  889. // if (strBufOffset != -1) {
  890. // return ElementName.elementNameByBuffer(buf, strBufOffset, strBufLen);
  891. // } else {
  892. tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
  893. interner);
  894. // }
  895. }
  896. private int emitCurrentTagToken(boolean selfClosing, int pos)
  897. throws SAXException {
  898. cstart = pos + 1;
  899. maybeErrSlashInEndTag(selfClosing);
  900. stateSave = Tokenizer.DATA;
  901. HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
  902. : attributes);
  903. if (endTag) {
  904. /*
  905. * When an end tag token is emitted, the content model flag must be
  906. * switched to the PCDATA state.
  907. */
  908. maybeErrAttributesOnEndTag(attrs);
  909. // CPPONLY: if (!viewingXmlSource) {
  910. tokenHandler.endTag(tagName);
  911. // CPPONLY: }
  912. Portability.delete(attributes);
  913. } else {
  914. // CPPONLY: if (viewingXmlSource) {
  915. // CPPONLY: Portability.delete(attributes);
  916. // CPPONLY: } else {
  917. tokenHandler.startTag(tagName, attrs, selfClosing);
  918. // CPPONLY: }
  919. }
  920. tagName.release();
  921. tagName = null;
  922. resetAttributes();
  923. /*
  924. * The token handler may have called setStateAndEndTagExpectation
  925. * and changed stateSave since the start of this method.
  926. */
  927. return stateSave;
  928. }
  929. private void attributeNameComplete() throws SAXException {
  930. // if (strBufOffset != -1) {
  931. // attributeName = AttributeName.nameByBuffer(buf, strBufOffset,
  932. // strBufLen, namePolicy != XmlViolationPolicy.ALLOW);
  933. // } else {
  934. attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
  935. // [NOCPP[
  936. , namePolicy != XmlViolationPolicy.ALLOW
  937. // ]NOCPP]
  938. , interner);
  939. // }
  940. if (attributes == null) {
  941. attributes = new HtmlAttributes(mappingLangToXmlLang);
  942. }
  943. /*
  944. * When the user agent leaves the attribute name state (and before
  945. * emitting the tag token, if appropriate), the complete attribute's
  946. * name must be compared to the other attributes on the same token; if
  947. * there is already an attribute on the token with the exact same name,
  948. * then this is a parse error and the new attribute must be dropped,
  949. * along with the value that gets associated with it (if any).
  950. */
  951. if (attributes.contains(attributeName)) {
  952. errDuplicateAttribute();
  953. attributeName.release();
  954. attributeName = null;
  955. }
  956. }
  957. private void addAttributeWithoutValue() throws SAXException {
  958. noteAttributeWithoutValue();
  959. // [NOCPP[
  960. if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
  961. && ElementName.META == tagName) {
  962. err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
  963. }
  964. // ]NOCPP]
  965. if (attributeName != null) {
  966. // [NOCPP[
  967. if (html4) {
  968. if (attributeName.isBoolean()) {
  969. if (html4ModeCompatibleWithXhtml1Schemata) {
  970. attributes.addAttribute(attributeName,
  971. attributeName.getLocal(AttributeName.HTML),
  972. xmlnsPolicy);
  973. } else {
  974. attributes.addAttribute(attributeName, "", xmlnsPolicy);
  975. }
  976. } else {
  977. if (AttributeName.BORDER != attributeName) {
  978. err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
  979. attributes.addAttribute(attributeName, "", xmlnsPolicy);
  980. }
  981. }
  982. } else {
  983. if (AttributeName.SRC == attributeName
  984. || AttributeName.HREF == attributeName) {
  985. warn("Attribute \u201C"
  986. + attributeName.getLocal(AttributeName.HTML)
  987. + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
  988. }
  989. // ]NOCPP]
  990. attributes.addAttribute(attributeName,
  991. Portability.newEmptyString()
  992. // [NOCPP[
  993. , xmlnsPolicy
  994. // ]NOCPP]
  995. );
  996. // [NOCPP[
  997. }
  998. // ]NOCPP]
  999. attributeName = null; // attributeName has been adopted by the
  1000. // |attributes| object
  1001. }
  1002. }
  1003. private void addAttributeWithValue() throws SAXException {
  1004. // [NOCPP[
  1005. if (metaBoundaryPassed && ElementName.META == tagName
  1006. && AttributeName.CHARSET == attributeName) {
  1007. err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
  1008. }
  1009. // ]NOCPP]
  1010. if (attributeName != null) {
  1011. String val = longStrBufToString(); // Ownership transferred to
  1012. // HtmlAttributes
  1013. // CPPONLY: if (mViewSource) {
  1014. // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
  1015. // CPPONLY: }
  1016. // [NOCPP[
  1017. if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
  1018. && attributeName.isCaseFolded()) {
  1019. val = newAsciiLowerCaseStringFromString(val);
  1020. }
  1021. // ]NOCPP]
  1022. attributes.addAttribute(attributeName, val
  1023. // [NOCPP[
  1024. , xmlnsPolicy
  1025. // ]NOCPP]
  1026. );
  1027. attributeName = null; // attributeName has been adopted by the
  1028. // |attributes| object
  1029. }
  1030. }
  1031. // [NOCPP[
  1032. private static String newAsciiLowerCaseStringFromString(String str) {
  1033. if (str == null) {
  1034. return null;
  1035. }
  1036. char[] buf = new char[str.length()];
  1037. for (int i = 0; i < str.length(); i++) {
  1038. char c = str.charAt(i);
  1039. if (c >= 'A' && c <= 'Z') {
  1040. c += 0x20;
  1041. }
  1042. buf[i] = c;
  1043. }
  1044. return new String(buf);
  1045. }
  1046. protected void startErrorReporting() throws SAXException {
  1047. }
  1048. // ]NOCPP]
  1049. public void start() throws SAXException {
  1050. initializeWithoutStarting();
  1051. tokenHandler.startTokenization(this);
  1052. // [NOCPP[
  1053. startErrorReporting();
  1054. // ]NOCPP]
  1055. }
  1056. public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
  1057. int state = stateSave;
  1058. int returnState = returnStateSave;
  1059. char c = '\u0000';
  1060. shouldSuspend = false;
  1061. lastCR = false;
  1062. int start = buffer.getStart();
  1063. /**
  1064. * The index of the last <code>char</code> read from <code>buf</code>.
  1065. */
  1066. int pos = start - 1;
  1067. /**
  1068. * The index of the first <code>char</code> in <code>buf</code> that is
  1069. * part of a coalesced run of character tokens or
  1070. * <code>Integer.MAX_VALUE</code> if there is not a current run being
  1071. * coalesced.
  1072. */
  1073. switch (state) {
  1074. case DATA:
  1075. case RCDATA:
  1076. case SCRIPT_DATA:
  1077. case PLAINTEXT:
  1078. case RAWTEXT:
  1079. case CDATA_SECTION:
  1080. case SCRIPT_DATA_ESCAPED:
  1081. case SCRIPT_DATA_ESCAPE_START:
  1082. case SCRIPT_DATA_ESCAPE_START_DASH:
  1083. case SCRIPT_DATA_ESCAPED_DASH:
  1084. case SCRIPT_DATA_ESCAPED_DASH_DASH:
  1085. case SCRIPT_DATA_DOUBLE_ESCAPE_START:
  1086. case SCRIPT_DATA_DOUBLE_ESCAPED:
  1087. case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
  1088. case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
  1089. case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
  1090. case SCRIPT_DATA_DOUBLE_ESCAPE_END:
  1091. cstart = start;
  1092. break;
  1093. default:
  1094. cstart = Integer.MAX_VALUE;
  1095. break;
  1096. }
  1097. /**
  1098. * The number of <code>char</code>s in <code>buf</code> that have
  1099. * meaning. (The rest of the array is garbage and should not be
  1100. * examined.)
  1101. */
  1102. // CPPONLY: if (mViewSource) {
  1103. // CPPONLY: mViewSource.SetBuffer(buffer);
  1104. // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
  1105. // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
  1106. // CPPONLY: } else {
  1107. // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
  1108. // CPPONLY: }
  1109. // [NOCPP[
  1110. pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
  1111. buffer.getEnd());
  1112. // ]NOCPP]
  1113. if (pos == buffer.getEnd()) {
  1114. // exiting due to end of buffer
  1115. buffer.setStart(pos);
  1116. } else {
  1117. buffer.setStart(pos + 1);
  1118. }
  1119. return lastCR;
  1120. }
  1121. @SuppressWarnings("unused") private int stateLoop(int state, char c,
  1122. int pos, @NoLength char[] buf, boolean reconsume, int returnState,
  1123. int endPos) throws SAXException {
  1124. /*
  1125. * Idioms used in this code:
  1126. *
  1127. *
  1128. * Consuming the next input character
  1129. *
  1130. * To consume the next input character, the code does this: if (++pos ==
  1131. * endPos) { break stateloop; } c = checkChar(buf, pos);
  1132. *
  1133. *
  1134. * Staying in a state
  1135. *
  1136. * When there's a state that the tokenizer may stay in over multiple
  1137. * input characters, the state has a wrapper |for(;;)| loop and staying
  1138. * in the state continues the loop.
  1139. *
  1140. *
  1141. * Switching to another state
  1142. *
  1143. * To switch to another state, the code sets the state variable to the
  1144. * magic number of the new state. Then it either continues stateloop or
  1145. * breaks out of the state's own wrapper loop if the target state is
  1146. * right after the current state in source order. (This is a partial
  1147. * workaround for Java's lack of goto.)
  1148. *
  1149. *
  1150. * Reconsume support
  1151. *
  1152. * The spec sometimes says that an input character is reconsumed in
  1153. * another state. If a state can ever be entered so that an input
  1154. * character can be reconsumed in it, the state's code starts with an
  1155. * |if (reconsume)| that sets reconsume to false and skips over the
  1156. * normal code for consuming a new character.
  1157. *
  1158. * To reconsume the current character in another state, the code sets
  1159. * |reconsume| to true and then switches to the other state.
  1160. *
  1161. *
  1162. * Emitting character tokens
  1163. *
  1164. * This method emits character tokens lazily. Whenever a new range of
  1165. * character tokens starts, the field cstart must be set to the start
  1166. * index of the range. The flushChars() method must be called at the end
  1167. * of a range to flush it.
  1168. *
  1169. *
  1170. * U+0000 handling
  1171. *
  1172. * The various states have to handle the replacement of U+0000 with
  1173. * U+FFFD. However, if U+0000 would be reconsumed in another state, the
  1174. * replacement doesn't need to happen, because it's handled by the
  1175. * reconsuming state.
  1176. *
  1177. *
  1178. * LF handling
  1179. *
  1180. * Every state needs to increment the line number upon LF unless the LF
  1181. * gets reconsumed by another state which increments the line number.
  1182. *
  1183. *
  1184. * CR handling
  1185. *
  1186. * Every state needs to handle CR unless the CR gets reconsumed and is
  1187. * handled by the reconsuming state. The CR needs to be handled as if it
  1188. * were an LF, the lastCR field must be set to true and then this
  1189. * method must return. The IO driver will then swallow the next
  1190. * character if it is an LF to coalesce CRLF.
  1191. */
  1192. stateloop: for (;;) {
  1193. switch (state) {
  1194. case DATA:
  1195. dataloop: for (;;) {
  1196. if (reconsume) {
  1197. reconsume = false;
  1198. } else {
  1199. if (++pos == endPos) {
  1200. break stateloop;
  1201. }
  1202. c = checkChar(buf, pos);
  1203. }
  1204. switch (c) {
  1205. case '&':
  1206. /*
  1207. * U+0026 AMPERSAND (&) Switch to the character
  1208. * reference in data state.
  1209. */
  1210. flushChars(buf, pos);
  1211. clearStrBufAndAppend(c);
  1212. setAdditionalAndRememberAmpersandLocation('\u0000');
  1213. returnState = state;
  1214. state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
  1215. continue stateloop;
  1216. case '<':
  1217. /*
  1218. * U+003C LESS-THAN SIGN (<) Switch to the tag
  1219. * open state.
  1220. */
  1221. flushChars(buf, pos);
  1222. state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
  1223. break dataloop; // FALL THROUGH continue
  1224. // stateloop;
  1225. case '\u0000':
  1226. emitReplacementCharacter(buf, pos);
  1227. continue;
  1228. case '\r':
  1229. emitCarriageReturn(buf, pos);
  1230. break stateloop;
  1231. case '\n':
  1232. silentLineFeed();
  1233. default:
  1234. /*
  1235. * Anything else Emit the input character as a
  1236. * character token.
  1237. *
  1238. * Stay in the data state.
  1239. */
  1240. continue;
  1241. }
  1242. }
  1243. // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
  1244. case TAG_OPEN:
  1245. tagopenloop: for (;;) {
  1246. /*
  1247. * The behavior of this state depends on the content
  1248. * model flag.
  1249. */
  1250. if (++pos == endPos) {
  1251. break stateloop;
  1252. }
  1253. c = checkChar(buf, pos);
  1254. /*
  1255. * If the content model flag is set to the PCDATA state
  1256. * Consume the next input character:
  1257. */
  1258. if (c >= 'A' && c <= 'Z') {
  1259. /*
  1260. * U+0041 LATIN CAPITAL LETTER A through to U+005A
  1261. * LATIN CAPITAL LETTER Z Create a new start tag
  1262. * token,
  1263. */
  1264. endTag = false;
  1265. /*
  1266. * set its tag name to the lowercase version of the
  1267. * input character (add 0x0020 to the character's
  1268. * code point),
  1269. */
  1270. clearStrBufAndAppend((char) (c + 0x20));
  1271. /* then switch to the tag name state. */
  1272. state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
  1273. /*
  1274. * (Don't emit the token yet; further details will
  1275. * be filled in before it is emitted.)
  1276. */
  1277. break tagopenloop;
  1278. // continue stateloop;
  1279. } else if (c >= 'a' && c <= 'z') {
  1280. /*
  1281. * U+0061 LATIN SMALL LETTER A through to U+007A
  1282. * LATIN SMALL LETTER Z Create a new start tag
  1283. * token,
  1284. */
  1285. endTag = false;
  1286. /*
  1287. * set its tag name to the input character,
  1288. */
  1289. clearStrBufAndAppend(c);
  1290. /* then switch to the tag name state. */
  1291. state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
  1292. /*
  1293. * (Don't emit the token yet; further details will
  1294. * be filled in before it is emitted.)
  1295. */
  1296. break tagopenloop;
  1297. // continue stateloop;
  1298. }
  1299. switch (c) {
  1300. case '!':
  1301. /*
  1302. * U+0021 EXCLAMATION MARK (!) Switch to the
  1303. * markup declaration open state.
  1304. */
  1305. state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
  1306. continue stateloop;
  1307. case '/':
  1308. /*
  1309. * U+002F SOLIDUS (/) Switch to the close tag
  1310. * open state.
  1311. */
  1312. state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);