PageRenderTime 262ms CodeModel.GetById 22ms app.highlight 204ms RepoModel.GetById 2ms app.codeStats 1ms

/parser/html/javasrc/Tokenizer.java

http://github.com/zpao/v8monkey
Java | 7027 lines | 3908 code | 381 blank | 2738 comment | 760 complexity | fb65ba7ba7f99f659d89e51498e7a0f4 MD5 | raw file
   1/*
   2 * Copyright (c) 2005-2007 Henri Sivonen
   3 * Copyright (c) 2007-2010 Mozilla Foundation
   4 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 
   5 * Foundation, and Opera Software ASA.
   6 *
   7 * Permission is hereby granted, free of charge, to any person obtaining a 
   8 * copy of this software and associated documentation files (the "Software"), 
   9 * to deal in the Software without restriction, including without limitation 
  10 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
  11 * and/or sell copies of the Software, and to permit persons to whom the 
  12 * Software is furnished to do so, subject to the following conditions:
  13 *
  14 * The above copyright notice and this permission notice shall be included in 
  15 * all copies or substantial portions of the Software.
  16 *
  17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
  18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
  19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
  20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
  21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
  22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
  23 * DEALINGS IN THE SOFTWARE.
  24 */
  25
  26/*
  27 * The comments following this one that use the same comment syntax as this 
  28 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 
  29 * amended as of June 18 2008 and May 31 2010.
  30 * That document came with this statement:
  31 * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and 
  32 * Opera Software ASA. You are granted a license to use, reproduce and 
  33 * create derivative works of this document."
  34 */
  35
  36package nu.validator.htmlparser.impl;
  37
  38import nu.validator.htmlparser.annotation.Auto;
  39import nu.validator.htmlparser.annotation.CharacterName;
  40import nu.validator.htmlparser.annotation.Const;
  41import nu.validator.htmlparser.annotation.Inline;
  42import nu.validator.htmlparser.annotation.Local;
  43import nu.validator.htmlparser.annotation.NoLength;
  44import nu.validator.htmlparser.common.EncodingDeclarationHandler;
  45import nu.validator.htmlparser.common.Interner;
  46import nu.validator.htmlparser.common.TokenHandler;
  47import nu.validator.htmlparser.common.XmlViolationPolicy;
  48
  49import org.xml.sax.ErrorHandler;
  50import org.xml.sax.Locator;
  51import org.xml.sax.SAXException;
  52import org.xml.sax.SAXParseException;
  53
  54/**
  55 * An implementation of
  56 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
  57 * 
  58 * This class implements the <code>Locator</code> interface. This is not an
  59 * incidental implementation detail: Users of this class are encouraged to make
  60 * use of the <code>Locator</code> nature.
  61 * 
  62 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
  63 * can be configured to treat these conditions as fatal or to coerce the infoset
  64 * to something that XML 1.0 allows.
  65 * 
  66 * @version $Id$
  67 * @author hsivonen
  68 */
  69public class Tokenizer implements Locator {
  70
  71    private static final int DATA_AND_RCDATA_MASK = ~1;
  72
  73    public static final int DATA = 0;
  74
  75    public static final int RCDATA = 1;
  76
  77    public static final int SCRIPT_DATA = 2;
  78
  79    public static final int RAWTEXT = 3;
  80
  81    public static final int SCRIPT_DATA_ESCAPED = 4;
  82
  83    public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
  84
  85    public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
  86
  87    public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
  88
  89    public static final int PLAINTEXT = 8;
  90
  91    public static final int TAG_OPEN = 9;
  92
  93    public static final int CLOSE_TAG_OPEN = 10;
  94
  95    public static final int TAG_NAME = 11;
  96
  97    public static final int BEFORE_ATTRIBUTE_NAME = 12;
  98
  99    public static final int ATTRIBUTE_NAME = 13;
 100
 101    public static final int AFTER_ATTRIBUTE_NAME = 14;
 102
 103    public static final int BEFORE_ATTRIBUTE_VALUE = 15;
 104
 105    public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
 106
 107    public static final int BOGUS_COMMENT = 17;
 108
 109    public static final int MARKUP_DECLARATION_OPEN = 18;
 110
 111    public static final int DOCTYPE = 19;
 112
 113    public static final int BEFORE_DOCTYPE_NAME = 20;
 114
 115    public static final int DOCTYPE_NAME = 21;
 116
 117    public static final int AFTER_DOCTYPE_NAME = 22;
 118
 119    public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
 120
 121    public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
 122
 123    public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
 124
 125    public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
 126
 127    public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
 128
 129    public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
 130
 131    public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
 132
 133    public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
 134
 135    public static final int BOGUS_DOCTYPE = 31;
 136
 137    public static final int COMMENT_START = 32;
 138
 139    public static final int COMMENT_START_DASH = 33;
 140
 141    public static final int COMMENT = 34;
 142
 143    public static final int COMMENT_END_DASH = 35;
 144
 145    public static final int COMMENT_END = 36;
 146
 147    public static final int COMMENT_END_BANG = 37;
 148
 149    public static final int NON_DATA_END_TAG_NAME = 38;
 150
 151    public static final int MARKUP_DECLARATION_HYPHEN = 39;
 152
 153    public static final int MARKUP_DECLARATION_OCTYPE = 40;
 154
 155    public static final int DOCTYPE_UBLIC = 41;
 156
 157    public static final int DOCTYPE_YSTEM = 42;
 158
 159    public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
 160
 161    public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
 162
 163    public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
 164
 165    public static final int CONSUME_CHARACTER_REFERENCE = 46;
 166
 167    public static final int CONSUME_NCR = 47;
 168
 169    public static final int CHARACTER_REFERENCE_TAIL = 48;
 170
 171    public static final int HEX_NCR_LOOP = 49;
 172
 173    public static final int DECIMAL_NRC_LOOP = 50;
 174
 175    public static final int HANDLE_NCR_VALUE = 51;
 176
 177    public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
 178
 179    public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
 180
 181    public static final int SELF_CLOSING_START_TAG = 54;
 182
 183    public static final int CDATA_START = 55;
 184
 185    public static final int CDATA_SECTION = 56;
 186
 187    public static final int CDATA_RSQB = 57;
 188
 189    public static final int CDATA_RSQB_RSQB = 58;
 190
 191    public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
 192
 193    public static final int SCRIPT_DATA_ESCAPE_START = 60;
 194
 195    public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
 196
 197    public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
 198
 199    public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
 200
 201    public static final int BOGUS_COMMENT_HYPHEN = 64;
 202
 203    public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
 204
 205    public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
 206
 207    public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
 208
 209    public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
 210
 211    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
 212
 213    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
 214
 215    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
 216
 217    public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
 218
 219    public static final int PROCESSING_INSTRUCTION = 73;
 220
 221    public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
 222
 223    /**
 224     * Magic value for UTF-16 operations.
 225     */
 226    private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
 227
 228    /**
 229     * UTF-16 code unit array containing less than and greater than for emitting
 230     * those characters on certain parse errors.
 231     */
 232    private static final @NoLength char[] LT_GT = { '<', '>' };
 233
 234    /**
 235     * UTF-16 code unit array containing less than and solidus for emitting
 236     * those characters on certain parse errors.
 237     */
 238    private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
 239
 240    /**
 241     * UTF-16 code unit array containing ]] for emitting those characters on
 242     * state transitions.
 243     */
 244    private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
 245
 246    /**
 247     * Array version of U+FFFD.
 248     */
 249    private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
 250
 251    // [NOCPP[
 252
 253    /**
 254     * Array version of space.
 255     */
 256    private static final @NoLength char[] SPACE = { ' ' };
 257
 258    // ]NOCPP]
 259
 260    /**
 261     * Array version of line feed.
 262     */
 263    private static final @NoLength char[] LF = { '\n' };
 264
 265    /**
 266     * Buffer growth parameter.
 267     */
 268    private static final int BUFFER_GROW_BY = 1024;
 269
 270    /**
 271     * "CDATA[" as <code>char[]</code>
 272     */
 273    private static final @NoLength char[] CDATA_LSQB = "CDATA[".toCharArray();
 274
 275    /**
 276     * "octype" as <code>char[]</code>
 277     */
 278    private static final @NoLength char[] OCTYPE = "octype".toCharArray();
 279
 280    /**
 281     * "ublic" as <code>char[]</code>
 282     */
 283    private static final @NoLength char[] UBLIC = "ublic".toCharArray();
 284
 285    /**
 286     * "ystem" as <code>char[]</code>
 287     */
 288    private static final @NoLength char[] YSTEM = "ystem".toCharArray();
 289
 290    private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
 291
 292    private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
 293
 294    private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
 295
 296    private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
 297            'e', 'x', 't' };
 298
 299    private static final char[] XMP_ARR = { 'x', 'm', 'p' };
 300
 301    private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
 302            'e', 'a' };
 303
 304    private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
 305
 306    private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
 307            'd' };
 308
 309    private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
 310            'p', 't' };
 311
 312    private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
 313            'e', 's' };
 314
 315    /**
 316     * The token handler.
 317     */
 318    protected final TokenHandler tokenHandler;
 319
 320    protected EncodingDeclarationHandler encodingDeclarationHandler;
 321
 322    // [NOCPP[
 323
 324    /**
 325     * The error handler.
 326     */
 327    protected ErrorHandler errorHandler;
 328
 329    // ]NOCPP]
 330
 331    /**
 332     * Whether the previous char read was CR.
 333     */
 334    protected boolean lastCR;
 335
 336    protected int stateSave;
 337
 338    private int returnStateSave;
 339
 340    protected int index;
 341
 342    private boolean forceQuirks;
 343
 344    private char additional;
 345
 346    private int entCol;
 347
 348    private int firstCharKey;
 349
 350    private int lo;
 351
 352    private int hi;
 353
 354    private int candidate;
 355
 356    private int strBufMark;
 357
 358    private int prevValue;
 359
 360    protected int value;
 361
 362    private boolean seenDigits;
 363
 364    protected int cstart;
 365
 366    /**
  367     * The SAX public id for the resource being tokenized. (Only passed back
  368     * as part of locator data.)
 369     */
 370    private String publicId;
 371
 372    /**
  373     * The SAX system id for the resource being tokenized. (Only passed back
  374     * as part of locator data.)
 375     */
 376    private String systemId;
 377
 378    /**
 379     * Buffer for short identifiers.
 380     */
 381    private @Auto char[] strBuf;
 382
 383    /**
 384     * Number of significant <code>char</code>s in <code>strBuf</code>.
 385     */
 386    private int strBufLen;
 387
 388    /**
 389     * <code>-1</code> to indicate that <code>strBuf</code> is used or otherwise
 390     * an offset to the main buffer.
 391     */
 392    // private int strBufOffset = -1;
 393    /**
 394     * Buffer for long strings.
 395     */
 396    private @Auto char[] longStrBuf;
 397
 398    /**
 399     * Number of significant <code>char</code>s in <code>longStrBuf</code>.
 400     */
 401    private int longStrBufLen;
 402
 403    /**
 404     * <code>-1</code> to indicate that <code>longStrBuf</code> is used or
 405     * otherwise an offset to the main buffer.
 406     */
 407    // private int longStrBufOffset = -1;
 408
 409    /**
 410     * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
 411     */
 412    private final @Auto char[] bmpChar;
 413
 414    /**
 415     * Buffer for expanding astral NCRs.
 416     */
 417    private final @Auto char[] astralChar;
 418
 419    /**
 420     * The element whose end tag closes the current CDATA or RCDATA element.
 421     */
 422    protected ElementName endTagExpectation = null;
 423
 424    private char[] endTagExpectationAsArray; // not @Auto!
 425
 426    /**
 427     * <code>true</code> if tokenizing an end tag
 428     */
 429    protected boolean endTag;
 430
 431    /**
 432     * The current tag token name.
 433     */
 434    private ElementName tagName = null;
 435
 436    /**
 437     * The current attribute name.
 438     */
 439    protected AttributeName attributeName = null;
 440
 441    // [NOCPP[
 442
 443    /**
 444     * Whether comment tokens are emitted.
 445     */
 446    private boolean wantsComments = false;
 447
 448    /**
 449     * <code>true</code> when HTML4-specific additional errors are requested.
 450     */
 451    protected boolean html4;
 452
 453    /**
 454     * Whether the stream is past the first 512 bytes.
 455     */
 456    private boolean metaBoundaryPassed;
 457
 458    // ]NOCPP]
 459
 460    /**
 461     * The name of the current doctype token.
 462     */
 463    private @Local String doctypeName;
 464
 465    /**
 466     * The public id of the current doctype token.
 467     */
 468    private String publicIdentifier;
 469
 470    /**
 471     * The system id of the current doctype token.
 472     */
 473    private String systemIdentifier;
 474
 475    /**
 476     * The attribute holder.
 477     */
 478    private HtmlAttributes attributes;
 479
 480    // [NOCPP[
 481
 482    /**
 483     * The policy for vertical tab and form feed.
 484     */
 485    private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
 486
 487    /**
 488     * The policy for comments.
 489     */
 490    private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
 491
 492    private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
 493
 494    private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
 495
 496    private boolean html4ModeCompatibleWithXhtml1Schemata;
 497
 498    private final boolean newAttributesEachTime;
 499
 500    // ]NOCPP]
 501
 502    private int mappingLangToXmlLang;
 503
 504    private boolean shouldSuspend;
 505
 506    protected boolean confident;
 507
 508    private int line;
 509
 510    private Interner interner;
 511
 512    // CPPONLY: private boolean viewingXmlSource;
 513
 514    // [NOCPP[
 515
 516    protected LocatorImpl ampersandLocation;
 517
 518    public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
 519        this.tokenHandler = tokenHandler;
 520        this.encodingDeclarationHandler = null;
 521        this.newAttributesEachTime = newAttributesEachTime;
 522        this.bmpChar = new char[1];
 523        this.astralChar = new char[2];
 524        this.tagName = null;
 525        this.attributeName = null;
 526        this.doctypeName = null;
 527        this.publicIdentifier = null;
 528        this.systemIdentifier = null;
 529        this.attributes = null;
 530    }
 531
 532    // ]NOCPP]
 533
 534    /**
 535     * The constructor.
 536     * 
 537     * @param tokenHandler
 538     *            the handler for receiving tokens
 539     */
 540    public Tokenizer(TokenHandler tokenHandler
 541    // CPPONLY: , boolean viewingXmlSource        
 542    ) {
 543        this.tokenHandler = tokenHandler;
 544        this.encodingDeclarationHandler = null;
 545        // [NOCPP[
 546        this.newAttributesEachTime = false;
 547        // ]NOCPP]
 548        this.bmpChar = new char[1];
 549        this.astralChar = new char[2];
 550        this.tagName = null;
 551        this.attributeName = null;
 552        this.doctypeName = null;
 553        this.publicIdentifier = null;
 554        this.systemIdentifier = null;
 555        this.attributes = null;
 556    // CPPONLY: this.viewingXmlSource = viewingXmlSource;
 557    }
 558
 559    public void setInterner(Interner interner) {
 560        this.interner = interner;
 561    }
 562
 563    public void initLocation(String newPublicId, String newSystemId) {
 564        this.systemId = newSystemId;
 565        this.publicId = newPublicId;
 566
 567    }
 568
 569    // CPPONLY: boolean isViewingXmlSource() {
 570    // CPPONLY: return viewingXmlSource;
 571    // CPPONLY: }
 572
 573    // [NOCPP[
 574
 575    /**
 576     * Returns the mappingLangToXmlLang.
 577     * 
 578     * @return the mappingLangToXmlLang
 579     */
 580    public boolean isMappingLangToXmlLang() {
 581        return mappingLangToXmlLang == AttributeName.HTML_LANG;
 582    }
 583
 584    /**
 585     * Sets the mappingLangToXmlLang.
 586     * 
 587     * @param mappingLangToXmlLang
 588     *            the mappingLangToXmlLang to set
 589     */
 590    public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
 591        this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
 592                : AttributeName.HTML;
 593    }
 594
 595    /**
 596     * Sets the error handler.
 597     * 
 598     * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
 599     */
 600    public void setErrorHandler(ErrorHandler eh) {
 601        this.errorHandler = eh;
 602    }
 603
 604    public ErrorHandler getErrorHandler() {
 605        return this.errorHandler;
 606    }
 607
 608    /**
 609     * Sets the commentPolicy.
 610     * 
 611     * @param commentPolicy
 612     *            the commentPolicy to set
 613     */
 614    public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
 615        this.commentPolicy = commentPolicy;
 616    }
 617
 618    /**
 619     * Sets the contentNonXmlCharPolicy.
 620     * 
 621     * @param contentNonXmlCharPolicy
 622     *            the contentNonXmlCharPolicy to set
 623     */
 624    public void setContentNonXmlCharPolicy(
 625            XmlViolationPolicy contentNonXmlCharPolicy) {
 626        if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
 627            throw new IllegalArgumentException(
 628                    "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
 629        }
 630    }
 631
 632    /**
 633     * Sets the contentSpacePolicy.
 634     * 
 635     * @param contentSpacePolicy
 636     *            the contentSpacePolicy to set
 637     */
 638    public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
 639        this.contentSpacePolicy = contentSpacePolicy;
 640    }
 641
 642    /**
 643     * Sets the xmlnsPolicy.
 644     * 
 645     * @param xmlnsPolicy
 646     *            the xmlnsPolicy to set
 647     */
 648    public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
 649        if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
 650            throw new IllegalArgumentException("Can't use FATAL here.");
 651        }
 652        this.xmlnsPolicy = xmlnsPolicy;
 653    }
 654
 655    public void setNamePolicy(XmlViolationPolicy namePolicy) {
 656        this.namePolicy = namePolicy;
 657    }
 658
 659    /**
 660     * Sets the html4ModeCompatibleWithXhtml1Schemata.
 661     * 
 662     * @param html4ModeCompatibleWithXhtml1Schemata
 663     *            the html4ModeCompatibleWithXhtml1Schemata to set
 664     */
 665    public void setHtml4ModeCompatibleWithXhtml1Schemata(
 666            boolean html4ModeCompatibleWithXhtml1Schemata) {
 667        this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
 668    }
 669
 670    // ]NOCPP]
 671
 672    // For the token handler to call
 673    /**
 674     * Sets the tokenizer state and the associated element name. This should 
  675     * only ever be used to put the tokenizer into one of the states that have
 676     * a special end tag expectation.
 677     * 
 678     * @param specialTokenizerState
 679     *            the tokenizer state to set
 680     * @param endTagExpectation
 681     *            the expected end tag for transitioning back to normal
 682     */
 683    public void setStateAndEndTagExpectation(int specialTokenizerState,
 684            @Local String endTagExpectation) {
 685        this.stateSave = specialTokenizerState;
 686        if (specialTokenizerState == Tokenizer.DATA) {
 687            return;
 688        }
 689        @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
 690        this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
 691                asArray.length, interner);
 692        endTagExpectationToArray();
 693    }
 694
 695    /**
 696     * Sets the tokenizer state and the associated element name. This should 
  697     * only ever be used to put the tokenizer into one of the states that have
 698     * a special end tag expectation.
 699     * 
 700     * @param specialTokenizerState
 701     *            the tokenizer state to set
 702     * @param endTagExpectation
 703     *            the expected end tag for transitioning back to normal
 704     */
 705    public void setStateAndEndTagExpectation(int specialTokenizerState,
 706            ElementName endTagExpectation) {
 707        this.stateSave = specialTokenizerState;
 708        this.endTagExpectation = endTagExpectation;
 709        endTagExpectationToArray();
 710    }
 711
 712    private void endTagExpectationToArray() {
 713        switch (endTagExpectation.getGroup()) {
 714            case TreeBuilder.TITLE:
 715                endTagExpectationAsArray = TITLE_ARR;
 716                return;
 717            case TreeBuilder.SCRIPT:
 718                endTagExpectationAsArray = SCRIPT_ARR;
 719                return;
 720            case TreeBuilder.STYLE:
 721                endTagExpectationAsArray = STYLE_ARR;
 722                return;
 723            case TreeBuilder.PLAINTEXT:
 724                endTagExpectationAsArray = PLAINTEXT_ARR;
 725                return;
 726            case TreeBuilder.XMP:
 727                endTagExpectationAsArray = XMP_ARR;
 728                return;
 729            case TreeBuilder.TEXTAREA:
 730                endTagExpectationAsArray = TEXTAREA_ARR;
 731                return;
 732            case TreeBuilder.IFRAME:
 733                endTagExpectationAsArray = IFRAME_ARR;
 734                return;
 735            case TreeBuilder.NOEMBED:
 736                endTagExpectationAsArray = NOEMBED_ARR;
 737                return;
 738            case TreeBuilder.NOSCRIPT:
 739                endTagExpectationAsArray = NOSCRIPT_ARR;
 740                return;
 741            case TreeBuilder.NOFRAMES:
 742                endTagExpectationAsArray = NOFRAMES_ARR;
 743                return;
 744            default:
 745                assert false: "Bad end tag expectation.";
 746                return;
 747        }
 748    }
 749
 750    /**
 751     * For C++ use only.
 752     */
 753    public void setLineNumber(int line) {
 754        this.line = line;
 755    }
 756
 757    // start Locator impl
 758
 759    /**
 760     * @see org.xml.sax.Locator#getLineNumber()
 761     */
 762    @Inline public int getLineNumber() {
 763        return line;
 764    }
 765
 766    // [NOCPP[
 767
 768    /**
 769     * @see org.xml.sax.Locator#getColumnNumber()
 770     */
 771    @Inline public int getColumnNumber() {
 772        return -1;
 773    }
 774
 775    /**
 776     * @see org.xml.sax.Locator#getPublicId()
 777     */
 778    public String getPublicId() {
 779        return publicId;
 780    }
 781
 782    /**
 783     * @see org.xml.sax.Locator#getSystemId()
 784     */
 785    public String getSystemId() {
 786        return systemId;
 787    }
 788
 789    // end Locator impl
 790
 791    // end public API
 792
 793    public void notifyAboutMetaBoundary() {
 794        metaBoundaryPassed = true;
 795    }
 796
 797    void turnOnAdditionalHtml4Errors() {
 798        html4 = true;
 799    }
 800
 801    // ]NOCPP]
 802
 803    HtmlAttributes emptyAttributes() {
 804        // [NOCPP[
 805        if (newAttributesEachTime) {
 806            return new HtmlAttributes(mappingLangToXmlLang);
 807        } else {
 808            // ]NOCPP]
 809            return HtmlAttributes.EMPTY_ATTRIBUTES;
 810            // [NOCPP[
 811        }
 812        // ]NOCPP]
 813    }
 814
 815    @Inline private void clearStrBufAndAppend(char c) {
 816        strBuf[0] = c;
 817        strBufLen = 1;
 818    }
 819
 820    @Inline private void clearStrBuf() {
 821        strBufLen = 0;
 822    }
 823
 824    /**
 825     * Appends to the smaller buffer.
 826     * 
 827     * @param c
 828     *            the UTF-16 code unit to append
 829     */
 830    private void appendStrBuf(char c) {
 831        if (strBufLen == strBuf.length) {
 832            char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY];
 833            System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
 834            strBuf = newBuf;
 835        }
 836        strBuf[strBufLen++] = c;
 837    }
 838
 839    /**
 840     * The smaller buffer as a String. Currently only used for error reporting.
 841     * 
 842     * <p>
 843     * C++ memory note: The return value must be released.
 844     * 
 845     * @return the smaller buffer as a string
 846     */
 847    protected String strBufToString() {
 848        return Portability.newStringFromBuffer(strBuf, 0, strBufLen);
 849    }
 850
 851    /**
  852     * Stores the short buffer in <code>doctypeName</code> as a local name. The
  853     * stored value is released in emitDoctypeToken().
  854     * 
  855     * This method returns nothing; the result is the <code>doctypeName</code> field.
 856     */
 857    private void strBufToDoctypeName() {
 858        doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
 859                interner);
 860    }
 861
 862    /**
 863     * Emits the smaller buffer as character tokens.
 864     * 
 865     * @throws SAXException
 866     *             if the token handler threw
 867     */
 868    private void emitStrBuf() throws SAXException {
 869        if (strBufLen > 0) {
 870            tokenHandler.characters(strBuf, 0, strBufLen);
 871        }
 872    }
 873
 874    @Inline private void clearLongStrBuf() {
 875        longStrBufLen = 0;
 876    }
 877
 878    @Inline private void clearLongStrBufAndAppend(char c) {
 879        longStrBuf[0] = c;
 880        longStrBufLen = 1;
 881    }
 882
 883    /**
 884     * Appends to the larger buffer.
 885     * 
 886     * @param c
 887     *            the UTF-16 code unit to append
 888     */
 889    private void appendLongStrBuf(char c) {
 890        if (longStrBufLen == longStrBuf.length) {
 891            char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)];
 892            System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
 893            longStrBuf = newBuf;
 894        }
 895        longStrBuf[longStrBufLen++] = c;
 896    }
 897
 898    @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
 899        // [NOCPP[
 900        switch (commentPolicy) {
 901            case ALTER_INFOSET:
 902                // detachLongStrBuf();
 903                appendLongStrBuf(' ');
 904                // FALLTHROUGH
 905            case ALLOW:
 906                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
 907                // ]NOCPP]
 908                appendLongStrBuf('-');
 909                // [NOCPP[
 910                break;
 911            case FATAL:
 912                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
 913                break;
 914        }
 915        // ]NOCPP]
 916    }
 917
 918    // [NOCPP[
 919    private void maybeAppendSpaceToBogusComment() throws SAXException {
 920        switch (commentPolicy) {
 921            case ALTER_INFOSET:
 922                // detachLongStrBuf();
 923                appendLongStrBuf(' ');
 924                // FALLTHROUGH
 925            case ALLOW:
 926                warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
 927                break;
 928            case FATAL:
 929                fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
 930                break;
 931        }
 932    }
 933
 934    // ]NOCPP]
 935
 936    @Inline private void adjustDoubleHyphenAndAppendToLongStrBufAndErr(char c)
 937            throws SAXException {
 938        errConsecutiveHyphens();
 939        // [NOCPP[
 940        switch (commentPolicy) {
 941            case ALTER_INFOSET:
 942                // detachLongStrBuf();
 943                longStrBufLen--;
 944                appendLongStrBuf(' ');
 945                appendLongStrBuf('-');
 946                // FALLTHROUGH
 947            case ALLOW:
 948                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
 949                // ]NOCPP]
 950                appendLongStrBuf(c);
 951                // [NOCPP[
 952                break;
 953            case FATAL:
 954                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
 955                break;
 956        }
 957        // ]NOCPP]
 958    }
 959
 960    private void appendLongStrBuf(@NoLength char[] buffer, int offset, int length) {
 961        int reqLen = longStrBufLen + length;
 962        if (longStrBuf.length < reqLen) {
 963            char[] newBuf = new char[reqLen + (reqLen >> 1)];
 964            System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
 965            longStrBuf = newBuf;
 966        }
 967        System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length);
 968        longStrBufLen = reqLen;
 969    }
 970
 971    /**
 972     * Appends the contents of the smaller buffer to the larger one.
         * 
         * <p>
         * The first <code>strBufLen</code> chars of <code>strBuf</code> are
         * handed to <code>appendLongStrBuf(char[], int, int)</code>, which copies
         * them to the end of the larger buffer; the smaller buffer and its length
         * are not modified here.
 973     */
 974    @Inline private void appendStrBufToLongStrBuf() {
 975        appendLongStrBuf(strBuf, 0, strBufLen);
 976    }
 977
 978    /**
 979     * Returns the larger buffer as a string.
         * 
         * <p>
         * The string is built by <code>Portability.newStringFromBuffer</code> from
         * the first <code>longStrBufLen</code> chars of <code>longStrBuf</code>;
         * neither the buffer nor its length is changed by this method.
 980     * 
 981     * <p>
 982     * C++ memory note: The return value must be released.
 983     * 
 984     * @return the larger buffer as a string
 985     */
 986    private String longStrBufToString() {
 987        return Portability.newStringFromBuffer(longStrBuf, 0, longStrBufLen);
 988    }
 989
 990    /**
 991     * Emits the current comment token.
 992     * 
         * <p>
         * The comment text is the content of the larger buffer without its last
         * <code>provisionalHyphens</code> chars. In the Java version the token
         * handler is only called when <code>wantsComments</code> is set (that
         * check sits between NOCPP markers). In every case <code>cstart</code> is
         * then moved to the index right after <code>pos</code>.
         * 
         * @param provisionalHyphens
         *            the number of chars at the end of the larger buffer that
         *            are left out of the emitted comment
 993     * @param pos
 994     *            the index after which <code>cstart</code> is placed
         *            (<code>cstart</code> becomes <code>pos + 1</code>)
 995     * 
 996     * @throws SAXException
         *             if the token handler's <code>comment</code> call throws
 997     */
 998    private void emitComment(int provisionalHyphens, int pos)
 999            throws SAXException {
1000        // [NOCPP[
1001        if (wantsComments) {
1002            // ]NOCPP]
1003            // if (longStrBufOffset != -1) {
1004            // tokenHandler.comment(buf, longStrBufOffset, longStrBufLen
1005            // - provisionalHyphens);
1006            // } else {
1007            tokenHandler.comment(longStrBuf, 0, longStrBufLen
1008                    - provisionalHyphens);
1009            // }
1010            // [NOCPP[
1011        }
1012        // ]NOCPP]
1013        cstart = pos + 1;
1014    }
1015
1016    /**
1017     * Flushes coalesced character tokens.
1018     * 
1019     * @param buf
1020     *            TODO
1021     * @param pos
1022     *            TODO
1023     * 
1024     * @throws SAXException
1025     */
1026    protected void flushChars(@NoLength char[] buf, int pos)
1027            throws SAXException {
1028        if (pos > cstart) {
1029            tokenHandler.characters(buf, cstart, pos - cstart);
1030        }
1031        cstart = Integer.MAX_VALUE;
1032    }
1033
1034    /**
1035     * Reports an condition that would make the infoset incompatible with XML
1036     * 1.0 as fatal.
1037     * 
1038     * @param message
1039     *            the message
1040     * @throws SAXException
1041     * @throws SAXParseException
1042     */
1043    public void fatal(String message) throws SAXException {
1044        SAXParseException spe = new SAXParseException(message, this);
1045        if (errorHandler != null) {
1046            errorHandler.fatalError(spe);
1047        }
1048        throw spe;
1049    }
1050
1051    /**
1052     * Reports a Parse Error.
1053     * 
1054     * @param message
1055     *            the message
1056     * @throws SAXException
1057     */
1058    public void err(String message) throws SAXException {
1059        if (errorHandler == null) {
1060            return;
1061        }
1062        SAXParseException spe = new SAXParseException(message, this);
1063        errorHandler.error(spe);
1064    }
1065
1066    public void errTreeBuilder(String message) throws SAXException {
1067        ErrorHandler eh = null;
1068        if (tokenHandler instanceof TreeBuilder<?>) {
1069            TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
1070            eh = treeBuilder.getErrorHandler();
1071        }
1072        if (eh == null) {
1073            eh = errorHandler;
1074        }
1075        if (eh == null) {
1076            return;
1077        }
1078        SAXParseException spe = new SAXParseException(message, this);
1079        eh.error(spe);
1080    }
1081
1082    /**
1083     * Reports a warning
1084     * 
1085     * @param message
1086     *            the message
1087     * @throws SAXException
1088     */
1089    public void warn(String message) throws SAXException {
1090        if (errorHandler == null) {
1091            return;
1092        }
1093        SAXParseException spe = new SAXParseException(message, this);
1094        errorHandler.warning(spe);
1095    }
1096
1097    /**
1098     * Gets the attribute holder ready for the next tag token.
         * 
         * <p>
         * In the Java version, the reference is dropped when
         * <code>newAttributesEachTime</code> is set and the existing holder is
         * cleared for reuse otherwise. Only the <code>attributes = null</code>
         * statement lies outside the NOCPP markers.
         * 
         * <p>
         * NOTE(review): the <code>else</code> branch dereferences
         * <code>attributes</code> without a null check; presumably the field is
         * always non-null when <code>newAttributesEachTime</code> is false --
         * confirm against the initialization code.
1099     */
1100    private void resetAttributes() {
1101        // [NOCPP[
1102        if (newAttributesEachTime) {
1103            // ]NOCPP]
1104            attributes = null;
1105            // [NOCPP[
1106        } else {
1107            attributes.clear(mappingLangToXmlLang);
1108        }
1109        // ]NOCPP]
1110    }
1111
1112    private void strBufToElementNameString() {
            // Looks up the ElementName for the first strBufLen chars of strBuf
            // (handing the interner along) and stores the result in tagName.
            // The commented-out lines are remnants of a variant that read the
            // name directly from the input buffer.
1113        // if (strBufOffset != -1) {
1114        // return ElementName.elementNameByBuffer(buf, strBufOffset, strBufLen);
1115        // } else {
1116        tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
1117                interner);
1118        // }
1119    }
1120
1121    private int emitCurrentTagToken(boolean selfClosing, int pos)
1122            throws SAXException {
            // Emits the tag token built up so far (tagName plus the collected
            // attributes) to the token handler as an end tag or a start tag,
            // depending on endTag, then releases the per-tag state.
            //
            // selfClosing: passed to maybeErrSlashInEndTag() and to startTag().
            // pos: cstart is set to pos + 1.
            // Returns the value of stateSave after the handler has been called.
1123        cstart = pos + 1;
1124        maybeErrSlashInEndTag(selfClosing);
            // Default follow-up state; the token handler may replace it (see the
            // comment before the return statement).
1125        stateSave = Tokenizer.DATA;
            // Never hand null to the handler or to the end tag check.
1126        HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
1127                : attributes);
1128        if (endTag) {
1129            /*
1130             * When an end tag token is emitted, the content model flag must be
1131             * switched to the PCDATA state.
1132             */
1133            maybeErrAttributesOnEndTag(attrs);
1134            // CPPONLY: if (!viewingXmlSource) {
1135            tokenHandler.endTag(tagName);
1136            // CPPONLY: }
                // The attributes are not passed to endTag();
                // Portability.delete presumably frees them in the C++ port.
1137            Portability.delete(attributes);
1138        } else {
1139            // CPPONLY: if (viewingXmlSource) {
1140            // CPPONLY: Portability.delete(attributes);
1141            // CPPONLY: } else {
1142            tokenHandler.startTag(tagName, attrs, selfClosing);
1143            // CPPONLY: }
1144        }
1145        tagName.release();
1146        tagName = null;
1147        resetAttributes();
1148        /*
1149         * The token handler may have called setStateAndEndTagExpectation
1150         * and changed stateSave since the start of this method.
1151         */
1152        return stateSave;
1153    }
1154
1155    private void attributeNameComplete() throws SAXException {
            // Turns the first strBufLen chars of strBuf into attributeName,
            // creates the attribute holder if there is none yet and discards the
            // name again when the holder already contains it.
1156        // if (strBufOffset != -1) {
1157        // attributeName = AttributeName.nameByBuffer(buf, strBufOffset,
1158        // strBufLen, namePolicy != XmlViolationPolicy.ALLOW);
1159        // } else {
1160        attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
1161        // [NOCPP[
1162                , namePolicy != XmlViolationPolicy.ALLOW
1163                // ]NOCPP]
1164                , interner);
1165        // }
1166
1167        if (attributes == null) {
1168            attributes = new HtmlAttributes(mappingLangToXmlLang);
1169        }
1170
1171        /*
1172         * When the user agent leaves the attribute name state (and before
1173         * emitting the tag token, if appropriate), the complete attribute's
1174         * name must be compared to the other attributes on the same token; if
1175         * there is already an attribute on the token with the exact same name,
1176         * then this is a parse error and the new attribute must be dropped,
1177         * along with the value that gets associated with it (if any).
1178         */
1179        if (attributes.contains(attributeName)) {
1180            errDuplicateAttribute();
1181            attributeName.release();
                // A null attributeName makes addAttributeWithoutValue() and
                // addAttributeWithValue() skip adding the attribute.
1182            attributeName = null;
1183        }
1184    }
1185
1186    private void addAttributeWithoutValue() throws SAXException {
            // Adds the current attribute, which was written without a value, to
            // the attribute holder. Nothing is added when attributeName is null
            // (attributeNameComplete() nulls it for a duplicate attribute).
1187        noteAttributeWithoutValue();
1188
1189        // [NOCPP[
1190        if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
1191                && ElementName.META == tagName) {
1192            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
1193        }
1194        // ]NOCPP]
1195        if (attributeName != null) {
1196            // [NOCPP[
1197            if (html4) {
1198                if (attributeName.isBoolean()) {
                        // Boolean attribute: the value is either the attribute's
                        // own HTML local name or the empty string.
1199                    if (html4ModeCompatibleWithXhtml1Schemata) {
1200                        attributes.addAttribute(attributeName,
1201                                attributeName.getLocal(AttributeName.HTML),
1202                                xmlnsPolicy);
1203                    } else {
1204                        attributes.addAttribute(attributeName, "", xmlnsPolicy);
1205                    }
1206                } else {
                        // NOTE(review): a value-less "border" attribute is neither
                        // reported nor added in HTML4 mode, because addAttribute
                        // sits inside this if -- confirm that dropping it is
                        // intended.
1207                    if (AttributeName.BORDER != attributeName) {
1208                        err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
1209                        attributes.addAttribute(attributeName, "", xmlnsPolicy);
1210                    }
1211                }
1212            } else {
1213                if (AttributeName.SRC == attributeName
1214                        || AttributeName.HREF == attributeName) {
1215                    warn("Attribute \u201C"
1216                            + attributeName.getLocal(AttributeName.HTML)
1217                            + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
1218                }
1219                // ]NOCPP]
1220                attributes.addAttribute(attributeName,
1221                        Portability.newEmptyString()
1222                        // [NOCPP[
1223                        , xmlnsPolicy
1224                // ]NOCPP]
1225                );
1226                // [NOCPP[
1227            }
1228            // ]NOCPP]
1229            attributeName = null; // attributeName has been adopted by the
1230            // |attributes| object
1231        }
1232    }
1233
1234    private void addAttributeWithValue() throws SAXException {
            // Adds the current attribute to the attribute holder, using the
            // content of the larger buffer as its value. Nothing is added when
            // attributeName is null (attributeNameComplete() nulls it for a
            // duplicate attribute).
1235        // [NOCPP[
1236        if (metaBoundaryPassed && ElementName.META == tagName
1237                && AttributeName.CHARSET == attributeName) {
1238            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
1239        }
1240        // ]NOCPP]
1241        if (attributeName != null) {
1242            String val = longStrBufToString(); // Ownership transferred to
1243            // HtmlAttributes
1244            // CPPONLY: if (mViewSource) {
1245            // CPPONLY:   mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
1246            // CPPONLY: }
1247            // [NOCPP[
                // For start tags in the XHTML 1 schema compatible HTML4 mode, the
                // values of attributes flagged as case-folded are lower-cased
                // (ASCII letters only).
1248            if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
1249                    && attributeName.isCaseFolded()) {
1250                val = newAsciiLowerCaseStringFromString(val);
1251            }
1252            // ]NOCPP]
1253            attributes.addAttribute(attributeName, val
1254            // [NOCPP[
1255                    , xmlnsPolicy
1256            // ]NOCPP]
1257            );
1258            attributeName = null; // attributeName has been adopted by the
1259            // |attributes| object
1260        }
1261    }
1262
1263    // [NOCPP[
1264
1265    private static String newAsciiLowerCaseStringFromString(String str) {
1266        if (str == null) {
1267            return null;
1268        }
1269        char[] buf = new char[str.length()];
1270        for (int i = 0; i < str.length(); i++) {
1271            char c = str.charAt(i);
1272            if (c >= 'A' && c <= 'Z') {
1273                c += 0x20;
1274            }
1275            buf[i] = c;
1276        }
1277        return new String(buf);
1278    }
1279
1280    protected void startErrorReporting() throws SAXException {
            // Does nothing in this class. start() calls it after the token
            // handler has been told that tokenization starts; being protected,
            // it is presumably meant to be overridden by subclasses that need
            // to set up error reporting.
1281
1282    }
1283
1284    // ]NOCPP]
1285    
1286    public void start() throws SAXException {
            // Starts a tokenization run: calls initializeWithoutStarting(), tells
            // the token handler that tokenization starts and, in the Java
            // version only, calls the startErrorReporting() hook.
1287        initializeWithoutStarting();
1288        tokenHandler.startTokenization(this);
1289        // [NOCPP[
1290        startErrorReporting();
1291        // ]NOCPP]
1292    }
1293
1294    public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
            // Runs the state loop over the unread part of buffer, resuming from
            // the states saved in stateSave and returnStateSave, and afterwards
            // moves the start of the buffer past the consumed chars.
            //
            // Returns the value of lastCR. It is reset to false here; according
            // to the notes at the top of stateLoop(), it is set to true when the
            // loop returned right after a CR, so that the IO driver can swallow
            // an LF that follows.
1295        int state = stateSave;
1296        int returnState = returnStateSave;
1297        char c = '\u0000';
1298        shouldSuspend = false;
1299        lastCR = false;
1300
1301        int start = buffer.getStart();
1302        /**
1303         * The index of the last <code>char</code> read from <code>buf</code>.
             * Starts one before the first unread char because the state loop
             * increments it before reading.
1304         */
1305        int pos = start - 1;
1306
1307        /**
1308         * The index of the first <code>char</code> in <code>buf</code> that is
1309         * part of a coalesced run of character tokens or
1310         * <code>Integer.MAX_VALUE</code> if there is not a current run being
1311         * coalesced.
             * 
             * (This describes the <code>cstart</code> field, which the switch
             * below initializes: for the listed states a run begins at the start
             * of the buffer, for all other states there is no run.)
1312         */
1313        switch (state) {
1314            case DATA:
1315            case RCDATA:
1316            case SCRIPT_DATA:
1317            case PLAINTEXT:
1318            case RAWTEXT:
1319            case CDATA_SECTION:
1320            case SCRIPT_DATA_ESCAPED:
1321            case SCRIPT_DATA_ESCAPE_START:
1322            case SCRIPT_DATA_ESCAPE_START_DASH:
1323            case SCRIPT_DATA_ESCAPED_DASH:
1324            case SCRIPT_DATA_ESCAPED_DASH_DASH:
1325            case SCRIPT_DATA_DOUBLE_ESCAPE_START:
1326            case SCRIPT_DATA_DOUBLE_ESCAPED:
1327            case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
1328            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
1329            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
1330            case SCRIPT_DATA_DOUBLE_ESCAPE_END:
1331                cstart = start;
1332                break;
1333            default:
1334                cstart = Integer.MAX_VALUE;
1335                break;
1336        }
1337
1338        /**
1339         * The number of <code>char</code>s in <code>buf</code> that have
1340         * meaning. (The rest of the array is garbage and should not be
1341         * examined.)
             * 
             * NOTE(review): no such count is declared here any more; the end of
             * the meaningful data is handed to stateLoop() as
             * <code>buffer.getEnd()</code>.
1342         */
1343        // CPPONLY: if (mViewSource) {
1344        // CPPONLY:   mViewSource.SetBuffer(buffer);
1345        // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
1346        // CPPONLY:   mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
1347        // CPPONLY: } else {
1348        // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
1349        // CPPONLY: }
1350        // [NOCPP[
1351        pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
1352                buffer.getEnd());
1353        // ]NOCPP]
1354        if (pos == buffer.getEnd()) {
1355            // exiting due to end of buffer
1356            buffer.setStart(pos);
1357        } else {
                // The loop stopped early; pos is the index of the last char that
                // was read, so the unread part begins right after it.
1358            buffer.setStart(pos + 1);
1359        }
1360        return lastCR;
1361    }
1362
1363    @SuppressWarnings("unused") private int stateLoop(int state, char c,
1364            int pos, @NoLength char[] buf, boolean reconsume, int returnState,
1365            int endPos) throws SAXException {
1366        /*
1367         * Idioms used in this code:
1368         * 
1369         * 
1370         * Consuming the next input character
1371         * 
1372         * To consume the next input character, the code does this: if (++pos ==
1373         * endPos) { break stateloop; } c = checkChar(buf, pos);
1374         * 
1375         * 
1376         * Staying in a state
1377         * 
1378         * When there's a state that the tokenizer may stay in over multiple
1379         * input characters, the state has a wrapper |for(;;)| loop and staying
1380         * in the state continues the loop.
1381         * 
1382         * 
1383         * Switching to another state
1384         * 
1385         * To switch to another state, the code sets the state variable to the
1386         * magic number of the new state. Then it either continues stateloop or
1387         * breaks out of the state's own wrapper loop if the target state is
1388         * right after the current state in source order. (This is a partial
1389         * workaround for Java's lack of goto.)
1390         * 
1391         * 
1392         * Reconsume support
1393         * 
1394         * The spec sometimes says that an input character is reconsumed in
1395         * another state. If a state can ever be entered so that an input
1396         * character can be reconsumed in it, the state's code starts with an
1397         * |if (reconsume)| that sets reconsume to false and skips over the
1398         * normal code for consuming a new character.
1399         * 
1400         * To reconsume the current character in another state, the code sets
1401         * |reconsume| to true and then switches to the other state.
1402         * 
1403         * 
1404         * Emitting character tokens
1405         * 
1406         * This method emits character tokens lazily. Whenever a new range of
1407         * character tokens starts, the field cstart must be set to the start
1408         * index of the range. The flushChars() method must be called at the end
1409         * of a range to flush it.
1410         * 
1411         * 
1412         * U+0000 handling
1413         * 
1414         * The various states have to handle the replacement of U+0000 with
1415         * U+FFFD. However, if U+0000 would be reconsumed in another state, the
1416         * replacement doesn't need to happen, because it's handled by the
1417         * reconsuming state.
1418         * 
1419         * 
1420         * LF handling
1421         * 
1422         * Every state needs to increment the line number upon LF unless the LF
1423         * gets reconsumed by another state which increments the line number.
1424         * 
1425         * 
1426         * CR handling
1427         * 
1428         * Every state needs to handle CR unless the CR gets reconsumed and is
1429         * handled by the reconsuming state. The CR needs to be handled as if it
1430         * were an LF, the lastCR field must be set to true and then this
1431         * method must return. The IO driver will then swallow the next
1432         * character if it is an LF to coalesce CRLF.
1433         */
1434        stateloop: for (;;) {
1435            switch (state) {
1436                case DATA:
1437                    dataloop: for (;;) {
1438                        if (reconsume) {
1439                            reconsume = false;
1440                        } else {
1441                            if (++pos == endPos) {
1442                                break stateloop;
1443                            }
1444                            c = checkChar(buf, pos);
1445                        }
1446                        switch (c) {
1447                            case '&':
1448                                /*
1449                                 * U+0026 AMPERSAND (&) Switch to the character
1450                                 * reference in data state.
1451                                 */
1452                                flushChars(buf, pos);
1453                                clearStrBufAndAppend(c);
1454                                setAdditionalAndRememberAmpersandLocation('\u0000');
1455                                returnState = state;
1456                                state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
1457                                continue stateloop;
1458                            case '<':
1459                                /*
1460                                 * U+003C LESS-THAN SIGN (<) Switch to the tag
1461                                 * open state.
1462                                 */
1463                                flushChars(buf, pos);
1464
1465                                state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
1466                                break dataloop; // FALL THROUGH continue
1467                            // stateloop;
1468                            case '\u0000':
1469                                emitReplacementCharacter(buf, pos);
1470                                continue;
1471                            case '\r':
1472                                emitCarriageReturn(buf, pos);
1473                                break stateloop;
1474                            case '\n':
1475                                silentLineFeed();
1476                            default:
1477                                /*
1478                                 * Anything else Emit the input character as a
1479                                 * character token.
1480                                 * 
1481                                 * Stay in the data state.
1482                                 */
1483                                continue;
1484                        }
1485                    }
1486                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
1487                case TAG_OPEN:
1488                    tagopenloop: for (;;) {
1489                        /*
1490                         * The behavior of this state depends on the content
1491                         * model flag.
1492                         */
1493                        if (++pos == endPos) {
1494                            break stateloop;
1495                        }
1496                        c = checkChar(buf, pos);
1497                        /*
1498                         * If the content model flag is set to the PCDATA state
1499                         * Consume the next input character:
1500                         */
1501                        if (c >= 'A' && c <= 'Z') {
1502                            /*
1503                             * U+0041 LATIN CAPITAL LETTER A through to U+005A
1504                             * LATIN CAPITAL LETTER Z Create a new start tag
1505                             * token,
1506                             */
1507                            endTag = false;
1508                            /*
1509                             * set its tag name to the lowercase version of the
1510                             * input character (add 0x0020 to the character's
1511                             * code point),
1512                             */
1513                            clearStrBufAndAppend((char) (c + 0x20));
1514                            /* then switch to the tag name state. */
1515                            state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1516                            /*
1517                             * (Don't emit the token yet; further details will
1518                             * be filled in before it is emitted.)
1519                             */
1520                            break tagopenloop;
1521                            // continue stateloop;
1522                        } else if (c >= 'a' && c <= 'z') {
1523                            /*
1524                             * U+0061 LATIN SMALL LETTER A through to U+007A
1525                             * LATIN SMALL LETTER Z Create a new start tag
1526                             * token,
1527                             */
1528                            endTag = false;
1529                            /*
1530                             * set its tag name to the input character,
1531                             */
1532                            clearStrBufAndAppend(c);
1533                            /* then switch to the tag name state. */
1534                            state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1535                            /*
1536                             * (Don't emit the token yet; further details will
1537                             * be filled in before it is emitted.)
1538                             */
1539                            break tagopenloop;
1540                            // continue stateloop;
1541                        }
1542                        switch (c) {
1543                            case '!':
1544                                /*
1545                                 * U+0021 EXCLAMATION MARK (!) Switch to the
1546                                 * markup declaration open state.
1547                                 */
1548                                state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
1549                                continue stateloop;
1550                            case '/':
1551                                /*
1552                                 * U+002F SOLIDUS (/) Switch to the close tag
1553                                 * open state.
1554                                 */
1555                                state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
1556                                continue stateloop;
1557                            case '?':
1558                                // CPPONLY: if (viewingXmlSource) {
1559                                // CPPONLY: state = transition(state,
1560                                // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,
1561                                // CPPONLY: reconsume,
1562                                // CPPONLY: pos);
1563                                // CPPONLY: continue stateloop;
1564                                // CPPONLY: }
1565                                /*
1566                                 * U+003F QUESTION MARK (?) Parse error.
1567                                 */
1568                                errProcessingInstruction();
1569                                /*
1570                                 * Switch to the bogus comment state.
1571                                 */
1572                                clearLongStrBufAndAppend(c);
1573                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
1574                                continue stateloop;
1575                            case '>':
1576                                /*
1577                                 * U+003E GREATER-THAN SIGN (>) Parse error.
1578                                 */
1579                                errLtGt();
1580                                /*
1581                                 * Emit a U+003C LESS-THAN SIGN character token
1582                                 * and a U+003E GREATER-THAN SIGN character
1583                                 * token.
1584                                 */
1585                                tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
1586                                /* Switch to the data state. */
1587                                cstart = pos + 1;
1588                                state = transition(state, Tokenizer.DATA, reconsume, pos);
1589                                continue stateloop;
1590                            default:
1591                                /*
1592                                 * Anything else Parse error.
1593                                 */
1594                                errBadCharAfterLt(c);
1595                                /*
1596                                 * Emit a U+003C LESS-THAN SIGN character token
1597                                 */
1598                                tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
1599                                /*
1600                                 * and reconsume the current input character in
1601                                 * the data state.
1602                                 */
1603                                cstart = pos;
1604                                reconsume = true;
1605                                state = transition(state, Tokenizer.DATA, reconsume, pos);
1606                                continue stateloop;
1607                        }
1608                    }
1609                    // FALL THROUGH DON'T REORDER
1610                case TAG_NAME:
1611                    tagnameloop: for (;;) {
1612                        if (++pos == endPos) {
1613                            break stateloop;
1614                        }
1615                        c = checkChar(buf, pos);
1616                        /*
1617                         * Consume the next input character:
1618                         */
1619                        switch (c) {
1620                            case '\r':
1621                                silentCarriageReturn();
1622                                strBufToElementNameString();
1623                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
1624                                break stateloop;
1625                            case '\n':
1626                                silentLineFeed();
1627                            case ' ':
1628                            case '\t':
1629                            case '\u000C':
1630                                /*
1631                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
1632                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
1633                                 * Switch to the before attribute name state.
1634                                 */
1635                                strBufToElementNameString();
1636                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
1637                                break tagnameloop;
1638                            // continue stateloop;
1639                            case '/':
1640                                /*
1641                                 * U+002F SOLIDUS (/) Switch to the self-closing
1642                                 * start tag state.
1643                                 */
1644                                strBufToElementNameString();
1645                                state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1646                                continue stateloop;
1647                            case '>':
1648                                /*
1649                                 * U+003E GREATER-THAN SIGN (>) Emit the current
1650                                 * tag token.
1651                                 */
1652                                strBufToElementNameString();
1653                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1654                                if (shouldSuspend) {
1655                                    break stateloop;
1656                                }
1657                                /*
1658                                 * Switch to the data state.
1659                                 */
1660                                continue stateloop;
1661                            case '\u0000':
1662                                c = '\uFFFD';
1663                                // fall thru
1664                            default:
1665                                if (c >= 'A' && c <= 'Z') {
1666                                    /*
1667                                     * U+0041 LATIN CAPITAL LETTER A through to
1668                                     * U+005A LATIN CAPITAL LETTER Z Append the
1669                                     * lowercase version of the current input
1670                                     * character (add 0x0020 to the character's
1671                                     * code point) to the current tag token's
1672                                     * tag name.
1673                                     */
1674                                    c += 0x20;
1675                                }
1676                                /*
1677                                 * Anything else Append the current input
1678                                 * character to the current tag token's tag
1679                                 * name.
1680                                 */
1681                                appendStrBuf(c);
1682                                /*
1683                                 * Stay in the tag name state.
1684                                 */
1685                                continue;
1686                        }
1687                    }
1688                    // FALLTHRU DON'T REORDER
1689                case BEFORE_ATTRIBUTE_NAME:
1690                    beforeattributenameloop: for (;;) {
1691                        if (reconsume) {
1692                            reconsume = false;
1693                        } else {
1694                            if (++pos == endPos) {
1695                                break stateloop;
1696                            }
1697                            c = checkChar(buf, pos);
1698                        }
1699                        /*
1700                         * Consume the next input character:
1701                         */
1702                        switch (c) {
1703                            case '\r':
1704                                silentCarriageReturn();
1705                                break stateloop;
1706                            case '\n':
1707                                silentLineFeed();
1708                                // fall thru
1709                            case ' ':
1710                            case '\t':
1711                            case '\u000C':
1712                                /*
1713                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
1714                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
1715                                 * in the before attribute name state.
1716                                 */
1717                                continue;
1718                            case '/':
1719                                /*
1720                                 * U+002F SOLIDUS (/) Switch to the self-closing
1721                                 * start tag state.
1722                                 */
1723                                state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1724                                continue stateloop;
1725                            case '>':
1726                                /*
1727                                 * U+003E GREATER-THAN SIGN (>) Emit the current
1728                                 * tag token.
1729                                 */
1730                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1731                                if (shouldSuspend) {
1732                                    break stateloop;
1733                                }
1734                                /*
1735                                 * Switch to the data state.
1736                                 */
1737                                continue stateloop;
1738                            case '\u0000':
1739                                c = '\uFFFD';
1740                                // fall thru
1741                            case '\"':
1742                            case '\'':
1743                            case '<':
1744                            case '=':
1745                                /*
1746                                 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
1747                                 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
1748                                 * SIGN (=) Parse error.
1749                                 */
1750                                errBadCharBeforeAttributeNameOrNull(c);
1751                                /*
1752                                 * Treat it as per the "anything else" entry
1753                                 * below.
1754                                 */
1755                            default:
1756                                /*
1757                                 * Anything else Start a new attribute in the
1758                                 * current tag token.
1759                                 */
1760                                if (c >= 'A' && c <= 'Z') {
1761                                    /*
1762                                     * U+0041 LATIN CAPITAL LETTER A through to
1763                                     * U+005A LATIN CAPITAL LETTER Z Set that
1764                                     * attribute's name to the lowercase version
1765                                     * of the current input character (add
1766                                     * 0x0020 to the character's code point)
1767                                     */
1768                                    c += 0x20;
1769                                }
1770                                /*
1771                                 * Set that attribute's name to the current
1772                                 * input character,
1773                                 */
1774                                clearStrBufAndAppend(c);
1775                                /*
1776                                 * and its value to the empty string.
1777                                 */
1778                                // Will do later.
1779                                /*
1780                                 * Switch to the attribute name state.
1781                                 */
1782                                state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
1783                                break beforeattributenameloop;
1784                            // continue stateloop;
1785                        }
1786                    }
1787                    // FALLTHRU DON'T REORDER
1788                case ATTRIBUTE_NAME:
1789                    attributenameloop: for (;;) {
1790                        if (++pos == endPos) {
1791                            break stateloop;
1792                        }
1793                        c = checkChar(buf, pos);
1794                        /*
1795                         * Consume the next input character:
1796                         */
1797                        switch (c) {
1798                            case '\r':
1799                                silentCarriageReturn();
1800                                attributeNameComplete();
1801                                state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
1802                                break stateloop;
1803                            case '\n':
1804                                silentLineFeed();
1805                                // fall thru
1806                            case ' ':
1807                            case '\t':
1808                            case '\u000C':
1809                                /*
1810                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
1811                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
1812                                 * Switch to the after attribute name state.
1813                                 */
1814                                attributeNameComplete();
1815                                state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
1816                                continue stateloop;
1817                            case '/':
1818                                /*
1819                                 * U+002F SOLIDUS (/) Switch to the self-closing
1820                                 * start tag state.
1821                                 */
1822                                attributeNameComplete();
1823                                addAttributeWithoutValue();
1824                                state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
1825                                continue stateloop;
1826                            case '=':
1827                                /*
1828                                 * U+003D EQUALS SIGN (=) Switch to the before
1829                                 * attribute value state.
1830                                 */
1831                                attributeNameComplete();
1832                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
1833                                break attributenameloop;
1834                            // continue stateloop;
1835                            case '>':
1836                                /*
1837                                 * U+003E GREATER-THAN SIGN (>) Emit the current
1838                                 * tag token.
1839                                 */
1840                                attributeNameComplete();
1841                                addAttributeWithoutValue();
1842                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1843                                if (shouldSuspend) {
1844                                    break stateloop;
1845                                }
1846                                /*
1847                                 * Switch to the data state.
1848                                 */
1849                                continue stateloop;
1850                            case '\u0000':
1851                                c = '\uFFFD';
1852                                // fall thru
1853                            case '\"':
1854                            case '\'':
1855                            case '<':
1856                                /*
1857                                 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
1858                                 * (') U+003C LESS-THAN SIGN (<) Parse error.
1859                                 */
1860                                errQuoteOrLtInAttributeNameOrNull(c);
1861                                /*
1862                                 * Treat it as per the "anything else" entry
1863                                 * below.
1864                                 */
1865                            default:
1866                                if (c >= 'A' && c <= 'Z') {
1867                                    /*
1868                                     * U+0041 LATIN CAPITAL LETTER A through to
1869                                     * U+005A LATIN CAPITAL LETTER Z Append the
1870                                     * lowercase version of the current input
1871                                     * character (add 0x0020 to the character's
1872                                     * code point) to the current attribute's
1873                                     * name.
1874                                     */
1875                                    c += 0x20;
1876                                }
1877                                /*
1878                                 * Anything else Append the current input
1879                                 * character to the current attribute's name.
1880                                 */
1881                                appendStrBuf(c);
1882                                /*
1883                                 * Stay in the attribute name state.
1884                                 */
1885                                continue;
1886                        }
1887                    }
1888                    // FALLTHRU DON'T REORDER
1889                case BEFORE_ATTRIBUTE_VALUE:
1890                    beforeattributevalueloop: for (;;) {
1891                        if (++pos == endPos) {
1892                            break stateloop;
1893                        }
1894                        c = checkChar(buf, pos);
1895                        /*
1896                         * Consume the next input character:
1897                         */
1898                        switch (c) {
1899                            case '\r':
1900                                silentCarriageReturn();
1901                                break stateloop;
1902                            case '\n':
1903                                silentLineFeed();
1904                                // fall thru
1905                            case ' ':
1906                            case '\t':
1907                            case '\u000C':
1908                                /*
1909                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
1910                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
1911                                 * in the before attribute value state.
1912                                 */
1913                                continue;
1914                            case '"':
1915                                /*
1916                                 * U+0022 QUOTATION MARK (") Switch to the
1917                                 * attribute value (double-quoted) state.
1918                                 */
1919                                clearLongStrBuf();
1920                                state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
1921                                break beforeattributevalueloop;
1922                            // continue stateloop;
1923                            case '&':
1924                                /*
1925                                 * U+0026 AMPERSAND (&) Switch to the attribute
1926                                 * value (unquoted) state and reconsume this
1927                                 * input character.
1928                                 */
1929                                clearLongStrBuf();
1930                                reconsume = true;
1931                                state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
1932                                noteUnquotedAttributeValue();
1933                                continue stateloop;
1934                            case '\'':
1935                                /*
1936                                 * U+0027 APOSTROPHE (') Switch to the attribute
1937                                 * value (single-quoted) state.
1938                                 */
1939                                clearLongStrBuf();
1940                                state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
1941                                continue stateloop;
1942                            case '>':
1943                                /*
1944                                 * U+003E GREATER-THAN SIGN (>) Parse error.
1945                                 */
1946                                errAttributeValueMissing();
1947                                /*
1948                                 * Emit the current tag token.
1949                                 */
1950                                addAttributeWithoutValue();
1951                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
1952                                if (shouldSuspend) {
1953                                    break stateloop;
1954                                }
1955                                /*
1956                                 * Switch to the data state.
1957                                 */
1958                                continue stateloop;
1959                            case '\u0000':
1960                                c = '\uFFFD';
1961                                // fall thru
1962                            case '<':
1963                            case '=':
1964                            case '`':
1965                                /*
1966                                 * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
1967                                 * (=) U+0060 GRAVE ACCENT (`)
1968                                 */
1969                                errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
1970                                /*
1971                                 * Treat it as per the "anything else" entry
1972                                 * below.
1973                                 */
1974                            default:
1975                                // [NOCPP[
1976                                errHtml4NonNameInUnquotedAttribute(c);
1977                                // ]NOCPP]
1978                                /*
1979                                 * Anything else Append the current input
1980                                 * character to the current attribute's value.
1981                                 */
1982                                clearLongStrBufAndAppend(c);
1983                                /*
1984                                 * Switch to the attribute value (unquoted)
1985                                 * state.
1986                                 */
1987
1988                                state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
1989                                noteUnquotedAttributeValue();
1990                                continue stateloop;
1991                        }
1992                    }
1993                    // FALLTHRU DON'T REORDER
1994                case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
1995                    attributevaluedoublequotedloop: for (;;) {
1996                        if (reconsume) {
1997                            reconsume = false;
1998                        } else {
1999                            if (++pos == endPos) {
2000                                break stateloop;
2001                            }
2002                            c = checkChar(buf, pos);
2003                        }
2004                        /*
2005                         * Consume the next input character:
2006                         */
2007                        switch (c) {
2008                            case '"':
2009                                /*
2010                                 * U+0022 QUOTATION MARK (") Switch to the after
2011                                 * attribute value (quoted) state.
2012                                 */
2013                                addAttributeWithValue();
2014
2015                                state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
2016                                break attributevaluedoublequotedloop;
2017                            // continue stateloop;
2018                            case '&':
2019                                /*
2020                                 * U+0026 AMPERSAND (&) Switch to the character
2021                                 * reference in attribute value state, with the
2022                                 * additional allowed character being U+0022
2023                                 * QUOTATION MARK (").
2024                                 */
2025                                clearStrBufAndAppend(c);
2026                                setAdditionalAndRememberAmpersandLocation('\"');
2027                                returnState = state;
2028                                state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2029                                continue stateloop;
2030                            case '\r':
2031                                appendLongStrBufCarriageReturn();
2032                                break stateloop;
2033                            case '\n':
2034                                appendLongStrBufLineFeed();
2035                                continue;
2036                            case '\u0000':
2037                                c = '\uFFFD';
2038                                // fall thru
2039                            default:
2040                                /*
2041                                 * Anything else Append the current input
2042                                 * character to the current attribute's value.
2043                                 */
2044                                appendLongStrBuf(c);
2045                                /*
2046                                 * Stay in the attribute value (double-quoted)
2047                                 * state.
2048                                 */
2049                                continue;
2050                        }
2051                    }
2052                    // FALLTHRU DON'T REORDER
2053                case AFTER_ATTRIBUTE_VALUE_QUOTED:
2054                    afterattributevaluequotedloop: for (;;) {
2055                        if (++pos == endPos) {
2056                            break stateloop;
2057                        }
2058                        c = checkChar(buf, pos);
2059                        /*
2060                         * Consume the next input character:
2061                         */
2062                        switch (c) {
2063                            case '\r':
2064                                silentCarriageReturn();
2065                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2066                                break stateloop;
2067                            case '\n':
2068                                silentLineFeed();
2069                                // fall thru
2070                            case ' ':
2071                            case '\t':
2072                            case '\u000C':
2073                                /*
2074                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
2075                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
2076                                 * Switch to the before attribute name state.
2077                                 */
2078                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2079                                continue stateloop;
2080                            case '/':
2081                                /*
2082                                 * U+002F SOLIDUS (/) Switch to the self-closing
2083                                 * start tag state.
2084                                 */
2085                                state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
2086                                break afterattributevaluequotedloop;
2087                            // continue stateloop;
2088                            case '>':
2089                                /*
2090                                 * U+003E GREATER-THAN SIGN (>) Emit the current
2091                                 * tag token.
2092                                 */
2093                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2094                                if (shouldSuspend) {
2095                                    break stateloop;
2096                                }
2097                                /*
2098                                 * Switch to the data state.
2099                                 */
2100                                continue stateloop;
2101                            default:
2102                                /*
2103                                 * Anything else Parse error.
2104                                 */
2105                                errNoSpaceBetweenAttributes();
2106                                /*
2107                                 * Reconsume the character in the before
2108                                 * attribute name state.
2109                                 */
2110                                reconsume = true;
2111                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2112                                continue stateloop;
2113                        }
2114                    }
2115                    // FALLTHRU DON'T REORDER
2116                case SELF_CLOSING_START_TAG:
2117                    if (++pos == endPos) {
2118                        break stateloop;
2119                    }
2120                    c = checkChar(buf, pos);
2121                    /*
2122                     * Consume the next input character:
2123                     */
2124                    switch (c) {
2125                        case '>':
2126                            /*
2127                             * U+003E GREATER-THAN SIGN (>) Set the self-closing
2128                             * flag of the current tag token. Emit the current
2129                             * tag token.
2130                             */
2131                            // [NOCPP[
2132                            errHtml4XmlVoidSyntax();
2133                            // ]NOCPP]
2134                            state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
2135                            if (shouldSuspend) {
2136                                break stateloop;
2137                            }
2138                            /*
2139                             * Switch to the data state.
2140                             */
2141                            continue stateloop;
2142                        default:
2143                            /* Anything else Parse error. */
2144                            errSlashNotFollowedByGt();
2145                            /*
2146                             * Reconsume the character in the before attribute
2147                             * name state.
2148                             */
2149                            reconsume = true;
2150                            state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2151                            continue stateloop;
2152                    }
2153                    // XXX reorder point
2154                case ATTRIBUTE_VALUE_UNQUOTED:
2155                    for (;;) {
2156                        if (reconsume) {
2157                            reconsume = false;
2158                        } else {
2159                            if (++pos == endPos) {
2160                                break stateloop;
2161                            }
2162                            c = checkChar(buf, pos);
2163                        }
2164                        /*
2165                         * Consume the next input character:
2166                         */
2167                        switch (c) {
2168                            case '\r':
2169                                silentCarriageReturn();
2170                                addAttributeWithValue();
2171                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2172                                break stateloop;
2173                            case '\n':
2174                                silentLineFeed();
2175                                // fall thru
2176                            case ' ':
2177                            case '\t':
2178                            case '\u000C':
2179                                /*
2180                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
2181                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
2182                                 * Switch to the before attribute name state.
2183                                 */
2184                                addAttributeWithValue();
2185                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
2186                                continue stateloop;
2187                            case '&':
2188                                /*
2189                                 * U+0026 AMPERSAND (&) Switch to the character
2190                                 * reference in attribute value state, with the
2191                                 * additional allowed character being U+003E
2192                                 * GREATER-THAN SIGN (>)
2193                                 */
2194                                clearStrBufAndAppend(c);
2195                                setAdditionalAndRememberAmpersandLocation('>');
2196                                returnState = state;
2197                                state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2198                                continue stateloop;
2199                            case '>':
2200                                /*
2201                                 * U+003E GREATER-THAN SIGN (>) Emit the current
2202                                 * tag token.
2203                                 */
2204                                addAttributeWithValue();
2205                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2206                                if (shouldSuspend) {
2207                                    break stateloop;
2208                                }
2209                                /*
2210                                 * Switch to the data state.
2211                                 */
2212                                continue stateloop;
2213                            case '\u0000':
2214                                c = '\uFFFD';
2215                                // fall thru
2216                            case '<':
2217                            case '\"':
2218                            case '\'':
2219                            case '=':
2220                            case '`':
2221                                /*
2222                                 * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
2223                                 * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
2224                                 * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
2225                                 */
2226                                errUnquotedAttributeValOrNull(c);
2227                                /*
2228                                 * Treat it as per the "anything else" entry
2229                                 * below.
2230                                 */
2231                                // fall through
2232                            default:
2233                                // [NOCPP[
2234                                errHtml4NonNameInUnquotedAttribute(c);
2235                                // ]NOCPP]
2236                                /*
2237                                 * Anything else Append the current input
2238                                 * character to the current attribute's value.
2239                                 */
2240                                appendLongStrBuf(c);
2241                                /*
2242                                 * Stay in the attribute value (unquoted) state.
2243                                 */
2244                                continue;
2245                        }
2246                    }
2247                    // XXX reorder point
2248                case AFTER_ATTRIBUTE_NAME:
2249                    for (;;) {
2250                        if (++pos == endPos) {
2251                            break stateloop;
2252                        }
2253                        c = checkChar(buf, pos);
2254                        /*
2255                         * Consume the next input character:
2256                         */
2257                        switch (c) {
2258                            case '\r':
2259                                silentCarriageReturn();
2260                                break stateloop;
2261                            case '\n':
2262                                silentLineFeed();
2263                                // fall thru
2264                            case ' ':
2265                            case '\t':
2266                            case '\u000C':
2267                                /*
2268                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
2269                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
2270                                 * in the after attribute name state.
2271                                 */
2272                                continue;
2273                            case '/':
2274                                /*
2275                                 * U+002F SOLIDUS (/) Switch to the self-closing
2276                                 * start tag state.
2277                                 */
2278                                addAttributeWithoutValue();
2279                                state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
2280                                continue stateloop;
2281                            case '=':
2282                                /*
2283                                 * U+003D EQUALS SIGN (=) Switch to the before
2284                                 * attribute value state.
2285                                 */
2286                                state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
2287                                continue stateloop;
2288                            case '>':
2289                                /*
2290                                 * U+003E GREATER-THAN SIGN (>) Emit the current
2291                                 * tag token.
2292                                 */
2293                                addAttributeWithoutValue();
2294                                state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
2295                                if (shouldSuspend) {
2296                                    break stateloop;
2297                                }
2298                                /*
2299                                 * Switch to the data state.
2300                                 */
2301                                continue stateloop;
2302                            case '\u0000':
2303                                c = '\uFFFD';
2304                                // fall thru
2305                            case '\"':
2306                            case '\'':
2307                            case '<':
2308                                errQuoteOrLtInAttributeNameOrNull(c);
2309                                /*
2310                                 * Treat it as per the "anything else" entry
2311                                 * below.
2312                                 */
2313                            default:
2314                                addAttributeWithoutValue();
2315                                /*
2316                                 * Anything else Start a new attribute in the
2317                                 * current tag token.
2318                                 */
2319                                if (c >= 'A' && c <= 'Z') {
2320                                    /*
2321                                     * U+0041 LATIN CAPITAL LETTER A through to
2322                                     * U+005A LATIN CAPITAL LETTER Z Set that
2323                                     * attribute's name to the lowercase version
2324                                     * of the current input character (add
2325                                     * 0x0020 to the character's code point)
2326                                     */
2327                                    c += 0x20;
2328                                }
2329                                /*
2330                                 * Set that attribute's name to the current
2331                                 * input character,
2332                                 */
2333                                clearStrBufAndAppend(c);
2334                                /*
2335                                 * and its value to the empty string.
2336                                 */
2337                                // Will do later.
2338                                /*
2339                                 * Switch to the attribute name state.
2340                                 */
2341                                state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
2342                                continue stateloop;
2343                        }
2344                    }
2345                    // XXX reorder point
2346                case MARKUP_DECLARATION_OPEN:
2347                    markupdeclarationopenloop: for (;;) {
2348                        if (++pos == endPos) {
2349                            break stateloop;
2350                        }
2351                        c = checkChar(buf, pos);
2352                        /*
2353                         * If the next two characters are both U+002D
2354                         * HYPHEN-MINUS characters (-), consume those two
2355                         * characters, create a comment token whose data is the
2356                         * empty string, and switch to the comment start state.
2357                         * 
2358                         * Otherwise, if the next seven characters are an ASCII
2359                         * case-insensitive match for the word "DOCTYPE", then
2360                         * consume those characters and switch to the DOCTYPE
2361                         * state.
2362                         * 
2363                         * Otherwise, if the insertion mode is
2364                         * "in foreign content" and the current node is not an
2365                         * element in the HTML namespace and the next seven
2366                         * characters are a case-sensitive match for the string
2367                         * "[CDATA[" (the five uppercase letters "CDATA" with a
2368                         * U+005B LEFT SQUARE BRACKET character before and
2369                         * after), then consume those characters and switch to
2370                         * the CDATA section state.
2371                         * 
2372                         * Otherwise, this is a parse error. Switch to the bogus
2373                         * comment state. The next character that is consumed,
2374                         * if any, is the first character that will be in the
2375                         * comment.
2376                         */
2377                        switch (c) {
2378                            case '-':
2379                                clearLongStrBufAndAppend(c);
2380                                state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
2381                                break markupdeclarationopenloop;
2382                            // continue stateloop;
2383                            case 'd':
2384                            case 'D':
2385                                clearLongStrBufAndAppend(c);
2386                                index = 0;
2387                                state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
2388                                continue stateloop;
2389                            case '[':
2390                                if (tokenHandler.cdataSectionAllowed()) {
2391                                    clearLongStrBufAndAppend(c);
2392                                    index = 0;
2393                                    state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
2394                                    continue stateloop;
2395                                }
2396                                // else fall through
2397                            default:
2398                                errBogusComment();
2399                                clearLongStrBuf();
2400                                reconsume = true;
2401                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2402                                continue stateloop;
2403                        }
2404                    }
2405                    // FALLTHRU DON'T REORDER
2406                case MARKUP_DECLARATION_HYPHEN:
2407                    markupdeclarationhyphenloop: for (;;) {
2408                        if (++pos == endPos) {
2409                            break stateloop;
2410                        }
2411                        c = checkChar(buf, pos);
2412                        switch (c) {
2413                            case '\u0000':
2414                                break stateloop;
2415                            case '-':
2416                                clearLongStrBuf();
2417                                state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
2418                                break markupdeclarationhyphenloop;
2419                            // continue stateloop;
2420                            default:
2421                                errBogusComment();
2422                                reconsume = true;
2423                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2424                                continue stateloop;
2425                        }
2426                    }
2427                    // FALLTHRU DON'T REORDER
2428                case COMMENT_START:
2429                    commentstartloop: for (;;) {
2430                        if (++pos == endPos) {
2431                            break stateloop;
2432                        }
2433                        c = checkChar(buf, pos);
2434                        /*
2435                         * Comment start state
2436                         * 
2437                         * 
2438                         * Consume the next input character:
2439                         */
2440                        switch (c) {
2441                            case '-':
2442                                /*
2443                                 * U+002D HYPHEN-MINUS (-) Switch to the comment
2444                                 * start dash state.
2445                                 */
2446                                appendLongStrBuf(c);
2447                                state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
2448                                continue stateloop;
2449                            case '>':
2450                                /*
2451                                 * U+003E GREATER-THAN SIGN (>) Parse error.
2452                                 */
2453                                errPrematureEndOfComment();
2454                                /* Emit the comment token. */
2455                                emitComment(0, pos);
2456                                /*
2457                                 * Switch to the data state.
2458                                 */
2459                                state = transition(state, Tokenizer.DATA, reconsume, pos);
2460                                continue stateloop;
2461                            case '\r':
2462                                appendLongStrBufCarriageReturn();
2463                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2464                                break stateloop;
2465                            case '\n':
2466                                appendLongStrBufLineFeed();
2467                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2468                                break commentstartloop;
2469                            case '\u0000':
2470                                c = '\uFFFD';
2471                                // fall thru
2472                            default:
2473                                /*
2474                                 * Anything else Append the input character to
2475                                 * the comment token's data.
2476                                 */
2477                                appendLongStrBuf(c);
2478                                /*
2479                                 * Switch to the comment state.
2480                                 */
2481                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2482                                break commentstartloop;
2483                            // continue stateloop;
2484                        }
2485                    }
2486                    // FALLTHRU DON'T REORDER
2487                case COMMENT:
2488                    commentloop: for (;;) {
2489                        if (++pos == endPos) {
2490                            break stateloop;
2491                        }
2492                        c = checkChar(buf, pos);
2493                        /*
2494                         * Comment state Consume the next input character:
2495                         */
2496                        switch (c) {
2497                            case '-':
2498                                /*
2499                                 * U+002D HYPHEN-MINUS (-) Switch to the comment
2500                                 * end dash state
2501                                 */
2502                                appendLongStrBuf(c);
2503                                state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2504                                break commentloop;
2505                            // continue stateloop;
2506                            case '\r':
2507                                appendLongStrBufCarriageReturn();
2508                                break stateloop;
2509                            case '\n':
2510                                appendLongStrBufLineFeed();
2511                                continue;
2512                            case '\u0000':
2513                                c = '\uFFFD';
2514                                // fall thru
2515                            default:
2516                                /*
2517                                 * Anything else Append the input character to
2518                                 * the comment token's data.
2519                                 */
2520                                appendLongStrBuf(c);
2521                                /*
2522                                 * Stay in the comment state.
2523                                 */
2524                                continue;
2525                        }
2526                    }
2527                    // FALLTHRU DON'T REORDER
2528                case COMMENT_END_DASH:
2529                    commentenddashloop: for (;;) {
2530                        if (++pos == endPos) {
2531                            break stateloop;
2532                        }
2533                        c = checkChar(buf, pos);
2534                        /*
2535                         * Comment end dash state Consume the next input
2536                         * character:
2537                         */
2538                        switch (c) {
2539                            case '-':
2540                                /*
2541                                 * U+002D HYPHEN-MINUS (-) Switch to the comment
2542                                 * end state
2543                                 */
2544                                appendLongStrBuf(c);
2545                                state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2546                                break commentenddashloop;
2547                            // continue stateloop;
2548                            case '\r':
2549                                appendLongStrBufCarriageReturn();
2550                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2551                                break stateloop;
2552                            case '\n':
2553                                appendLongStrBufLineFeed();
2554                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2555                                continue stateloop;
2556                            case '\u0000':
2557                                c = '\uFFFD';
2558                                // fall thru
2559                            default:
2560                                /*
2561                                 * Anything else Append a U+002D HYPHEN-MINUS
2562                                 * (-) character and the input character to the
2563                                 * comment token's data.
2564                                 */
2565                                appendLongStrBuf(c);
2566                                /*
2567                                 * Switch to the comment state.
2568                                 */
2569                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2570                                continue stateloop;
2571                        }
2572                    }
2573                    // FALLTHRU DON'T REORDER
2574                case COMMENT_END:
2575                    commentendloop: for (;;) {
2576                        if (++pos == endPos) {
2577                            break stateloop;
2578                        }
2579                        c = checkChar(buf, pos);
2580                        /*
2581                         * Comment end state Consume the next input
2582                         * character:
2583                         */
2584                        switch (c) {
2585                            case '>':
2586                                /*
2587                                 * U+003E GREATER-THAN SIGN (>) Emit the comment
2588                                 * token.
2589                                 */
2590                                emitComment(2, pos);
2591                                /*
2592                                 * Switch to the data state.
2593                                 */
2594                                state = transition(state, Tokenizer.DATA, reconsume, pos);
2595                                continue stateloop;
2596                            case '-':
2597                                /* U+002D HYPHEN-MINUS (-) Parse error. */
2598                                /*
2599                                 * Append a U+002D HYPHEN-MINUS (-) character to
2600                                 * the comment token's data.
2601                                 */
2602                                adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
2603                                /*
2604                                 * Stay in the comment end state.
2605                                 */
2606                                continue;
2607                            case '\r':
2608                                adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn();
2609                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2610                                break stateloop;
2611                            case '\n':
2612                                adjustDoubleHyphenAndAppendToLongStrBufLineFeed();
2613                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2614                                continue stateloop;
2615                            case '!':
2616                                errHyphenHyphenBang();
2617                                appendLongStrBuf(c);
2618                                state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
2619                                continue stateloop;
2620                            case '\u0000':
2621                                c = '\uFFFD';
2622                                // fall thru
2623                            default:
2624                                /*
2625                                 * Append two U+002D HYPHEN-MINUS (-) characters
2626                                 * and the input character to the comment
2627                                 * token's data.
2628                                 */
2629                                adjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
2630                                /*
2631                                 * Switch to the comment state.
2632                                 */
2633                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2634                                continue stateloop;
2635                        }
2636                    }
2637                    // XXX reorder point
2638                case COMMENT_END_BANG:
2639                    for (;;) {
2640                        if (++pos == endPos) {
2641                            break stateloop;
2642                        }
2643                        c = checkChar(buf, pos);
2644                        /*
2645                         * Comment end bang state
2646                         * 
2647                         * Consume the next input character:
2648                         */
2649                        switch (c) {
2650                            case '>':
2651                                /*
2652                                 * U+003E GREATER-THAN SIGN (>) Emit the comment
2653                                 * token.
2654                                 */
2655                                emitComment(3, pos);
2656                                /*
2657                                 * Switch to the data state.
2658                                 */
2659                                state = transition(state, Tokenizer.DATA, reconsume, pos);
2660                                continue stateloop;
2661                            case '-':
2662                                /*
2663                                 * Append two U+002D HYPHEN-MINUS (-) characters
2664                                 * and a U+0021 EXCLAMATION MARK (!) character
2665                                 * to the comment token's data.
2666                                 */
2667                                appendLongStrBuf(c);
2668                                /*
2669                                 * Switch to the comment end dash state.
2670                                 */
2671                                state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2672                                continue stateloop;
2673                            case '\r':
2674                                appendLongStrBufCarriageReturn();
2675                                break stateloop;
2676                            case '\n':
2677                                appendLongStrBufLineFeed();
2678                                continue;
2679                            case '\u0000':
2680                                c = '\uFFFD';
2681                                // fall thru
2682                            default:
2683                                /*
2684                                 * Anything else Append two U+002D HYPHEN-MINUS
2685                                 * (-) characters, a U+0021 EXCLAMATION MARK (!)
2686                                 * character, and the input character to the
2687                                 * comment token's data. Switch to the comment
2688                                 * state.
2689                                 */
2690                                appendLongStrBuf(c);
2691                                /*
2692                                 * Switch to the comment state.
2693                                 */
2694                                state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2695                                continue stateloop;
2696                        }
2697                    }
2698                    // XXX reorder point
2699                case COMMENT_START_DASH:
2700                    if (++pos == endPos) {
2701                        break stateloop;
2702                    }
2703                    c = checkChar(buf, pos);
2704                    /*
2705                     * Comment start dash state
2706                     * 
2707                     * Consume the next input character:
2708                     */
2709                    switch (c) {
2710                        case '-':
2711                            /*
2712                             * U+002D HYPHEN-MINUS (-) Switch to the comment end
2713                             * state
2714                             */
2715                            appendLongStrBuf(c);
2716                            state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2717                            continue stateloop;
2718                        case '>':
2719                            errPrematureEndOfComment();
2720                            /* Emit the comment token. */
2721                            emitComment(1, pos);
2722                            /*
2723                             * Switch to the data state.
2724                             */
2725                            state = transition(state, Tokenizer.DATA, reconsume, pos);
2726                            continue stateloop;
2727                        case '\r':
2728                            appendLongStrBufCarriageReturn();
2729                            state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2730                            break stateloop;
2731                        case '\n':
2732                            appendLongStrBufLineFeed();
2733                            state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2734                            continue stateloop;
2735                        case '\u0000':
2736                            c = '\uFFFD';
2737                            // fall thru
2738                        default:
2739                            /*
2740                             * Append a U+002D HYPHEN-MINUS character (-) and
2741                             * the current input character to the comment
2742                             * token's data.
2743                             */
2744                            appendLongStrBuf(c);
2745                            /*
2746                             * Switch to the comment state.
2747                             */
2748                            state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2749                            continue stateloop;
2750                    }
2751                    // XXX reorder point
2752                case CDATA_START:
2753                    for (;;) {
2754                        if (++pos == endPos) {
2755                            break stateloop;
2756                        }
2757                        c = checkChar(buf, pos);
2758                        if (index < 6) { // CDATA_LSQB.length
2759                            if (c == Tokenizer.CDATA_LSQB[index]) {
2760                                appendLongStrBuf(c);
2761                            } else {
2762                                errBogusComment();
2763                                reconsume = true;
2764                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
2765                                continue stateloop;
2766                            }
2767                            index++;
2768                            continue;
2769                        } else {
2770                            cstart = pos; // start coalescing
2771                            reconsume = true;
2772                            state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2773                            break; // FALL THROUGH continue stateloop;
2774                        }
2775                    }
2776                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2777                case CDATA_SECTION:
2778                    cdatasectionloop: for (;;) {
2779                        if (reconsume) {
2780                            reconsume = false;
2781                        } else {
2782                            if (++pos == endPos) {
2783                                break stateloop;
2784                            }
2785                            c = checkChar(buf, pos);
2786                        }
2787                        switch (c) {
2788                            case ']':
2789                                flushChars(buf, pos);
2790                                state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
2791                                break cdatasectionloop; // FALL THROUGH
2792                            case '\u0000':
2793                                emitReplacementCharacter(buf, pos);
2794                                continue;
2795                            case '\r':
2796                                emitCarriageReturn(buf, pos);
2797                                break stateloop;
2798                            case '\n':
2799                                silentLineFeed();
2800                                // fall thru
2801                            default:
2802                                continue;
2803                        }
2804                    }
2805                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2806                case CDATA_RSQB:
2807                    cdatarsqb: for (;;) {
2808                        if (++pos == endPos) {
2809                            break stateloop;
2810                        }
2811                        c = checkChar(buf, pos);
2812                        switch (c) {
2813                            case ']':
2814                                state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
2815                                break cdatarsqb;
2816                            default:
2817                                tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,
2818                                        1);
2819                                cstart = pos;
2820                                reconsume = true;
2821                                state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2822                                continue stateloop;
2823                        }
2824                    }
2825                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2826                case CDATA_RSQB_RSQB:
2827                    if (++pos == endPos) {
2828                        break stateloop;
2829                    }
2830                    c = checkChar(buf, pos);
2831                    switch (c) {
2832                        case '>':
2833                            cstart = pos + 1;
2834                            state = transition(state, Tokenizer.DATA, reconsume, pos);
2835                            continue stateloop;
2836                        default:
2837                            tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
2838                            cstart = pos;
2839                            reconsume = true;
2840                            state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
2841                            continue stateloop;
2842
2843                    }
2844                    // XXX reorder point
2845                case ATTRIBUTE_VALUE_SINGLE_QUOTED:
2846                    attributevaluesinglequotedloop: for (;;) {
2847                        if (reconsume) {
2848                            reconsume = false;
2849                        } else {
2850                            if (++pos == endPos) {
2851                                break stateloop;
2852                            }
2853                            c = checkChar(buf, pos);
2854                        }
2855                        /*
2856                         * Consume the next input character:
2857                         */
2858                        switch (c) {
2859                            case '\'':
2860                                /*
2861                                 * U+0027 APOSTROPHE (') Switch to the after
2862                                 * attribute value (quoted) state.
2863                                 */
2864                                addAttributeWithValue();
2865
2866                                state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
2867                                continue stateloop;
2868                            case '&':
2869                                /*
2870                                 * U+0026 AMPERSAND (&) Switch to the character
2871                                 * reference in attribute value state, with the
2872                                 * additional allowed character being U+0027
2873                                 * APOSTROPHE (').
2874                                 */
2875                                clearStrBufAndAppend(c);
2876                                setAdditionalAndRememberAmpersandLocation('\'');
2877                                returnState = state;
2878                                state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
2879                                break attributevaluesinglequotedloop;
2880                            // continue stateloop;
2881                            case '\r':
2882                                appendLongStrBufCarriageReturn();
2883                                break stateloop;
2884                            case '\n':
2885                                appendLongStrBufLineFeed();
2886                                continue;
2887                            case '\u0000':
2888                                c = '\uFFFD';
2889                                // fall thru
2890                            default:
2891                                /*
2892                                 * Anything else Append the current input
2893                                 * character to the current attribute's value.
2894                                 */
2895                                appendLongStrBuf(c);
2896                                /*
2897                                 * Stay in the attribute value (single-quoted)
2898                                 * state.
2899                                 */
2900                                continue;
2901                        }
2902                    }
2903                    // FALLTHRU DON'T REORDER
2904                case CONSUME_CHARACTER_REFERENCE:
2905                    if (++pos == endPos) {
2906                        break stateloop;
2907                    }
2908                    c = checkChar(buf, pos);
2909                    if (c == '\u0000') {
2910                        break stateloop;
2911                    }
2912                    /*
2913                     * Unlike the definition in the spec, this state does not
2914                     * return a value and never requires the caller to
2915                     * backtrack. This state takes care of emitting characters
2916                     * or appending to the current attribute value. It also
2917                     * takes care of that in the case when consuming the
2918                     * character reference fails.
2919                     */
2920                    /*
2921                     * This section defines how to consume a character
2922                     * reference. This definition is used when parsing character
2923                     * references in text and in attributes.
2924                     * 
2925                     * The behavior depends on the identity of the next
2926                     * character (the one immediately after the U+0026 AMPERSAND
2927                     * character):
2928                     */
2929                    switch (c) {
2930                        case ' ':
2931                        case '\t':
2932                        case '\n':
2933                        case '\r': // we'll reconsume!
2934                        case '\u000C':
2935                        case '<':
2936                        case '&':
2937                            emitOrAppendStrBuf(returnState);
2938                            if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
2939                                cstart = pos;
2940                            }
2941                            reconsume = true;
2942                            state = transition(state, returnState, reconsume, pos);
2943                            continue stateloop;
2944                        case '#':
2945                            /*
2946                             * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
2947                             * SIGN.
2948                             */
2949                            appendStrBuf('#');
2950                            state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
2951                            continue stateloop;
2952                        default:
2953                            if (c == additional) {
2954                                emitOrAppendStrBuf(returnState);
2955                                reconsume = true;
2956                                state = transition(state, returnState, reconsume, pos);
2957                                continue stateloop;
2958                            }
2959                            if (c >= 'a' && c <= 'z') {
2960                                firstCharKey = c - 'a' + 26;
2961                            } else if (c >= 'A' && c <= 'Z') {
2962                                firstCharKey = c - 'A';
2963                            } else {
2964                                // No match
2965                                /*
2966                                 * If no match can be made, then this is a parse
2967                                 * error.
2968                                 */
2969                                errNoNamedCharacterMatch();
2970                                emitOrAppendStrBuf(returnState);
2971                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
2972                                    cstart = pos;
2973                                }
2974                                reconsume = true;
2975                                state = transition(state, returnState, reconsume, pos);
2976                                continue stateloop;
2977                            }
2978                            // Didn't fail yet
2979                            appendStrBuf(c);
2980                            state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
2981                            // FALL THROUGH continue stateloop;
2982                    }
2983                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
2984                case CHARACTER_REFERENCE_HILO_LOOKUP:
2985                    {
2986                        if (++pos == endPos) {
2987                            break stateloop;
2988                        }
2989                        c = checkChar(buf, pos);
2990                        if (c == '\u0000') {
2991                            break stateloop;
2992                        }
2993                        /*
2994                         * The data structure is as follows:
2995                         * 
2996                         * HILO_ACCEL is a two-dimensional int array whose major
2997                         * index corresponds to the second character of the
2998                         * character reference (code point as index) and the
2999                         * minor index corresponds to the first character of the
3000                         * character reference (packed so that A-Z runs from 0
3001                         * to 25 and a-z runs from 26 to 51). This layout makes
3002                         * it easier to use the sparseness of the data structure
3003                         * to omit parts of it: The second dimension of the
3004                         * table is null when no character reference starts with
3005                         * the character corresponding to that row.
3006                         * 
3007                         * The int value HILO_ACCEL (by these indices) is zero
3008                         * if there exists no character reference starting with
3009                         * that two-letter prefix. Otherwise, the value is an
3010                         * int that packs two shorts so that the higher short is
3011                         * the index of the highest character reference name
3012                         * with that prefix in NAMES and the lower short
3013                         * corresponds to the index of the lowest character
3014                         * reference name with that prefix. (It happens that the
3015                         * first two character reference names share their
3016                         * prefix so the packed int cannot be 0 by packing the
3017                         * two shorts.)
3018                         * 
3019                         * NAMES is an array of byte arrays where each byte
3020                         * array encodes the name of a character reference as
3021                         * ASCII. The names omit the first two letters of the
3022                         * name. (Since storing the first two letters would be
3023                         * redundant with the data contained in HILO_ACCEL.) The
3024                         * entries are lexically sorted.
3025                         * 
3026                         * For a given index in NAMES, the same index in VALUES
3027                         * contains the corresponding expansion as an array of
3028                         * two UTF-16 code units (either the character and
3029                         * U+0000 or a surrogate pair).
3030                         */
3031                        int hilo = 0;
3032                        if (c <= 'z') {
3033                            @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
3034                            if (row != null) {
3035                                hilo = row[firstCharKey];
3036                            }
3037                        }
3038                        if (hilo == 0) {
3039                            /*
3040                             * If no match can be made, then this is a parse
3041                             * error.
3042                             */
3043                            errNoNamedCharacterMatch();
3044                            emitOrAppendStrBuf(returnState);
3045                            if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3046                                cstart = pos;
3047                            }
3048                            reconsume = true;
3049                            state = transition(state, returnState, reconsume, pos);
3050                            continue stateloop;
3051                        }
3052                        // Didn't fail yet
3053                        appendStrBuf(c);
3054                        lo = hilo & 0xFFFF;
3055                        hi = hilo >> 16;
3056                        entCol = -1;
3057                        candidate = -1;
3058                        strBufMark = 0;
3059                        state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
3060                        // FALL THROUGH continue stateloop;
3061                    }
3062                case CHARACTER_REFERENCE_TAIL:
3063                    outer: for (;;) {
3064                        if (++pos == endPos) {
3065                            break stateloop;
3066                        }
3067                        c = checkChar(buf, pos);
3068                        if (c == '\u0000') {
3069                            break stateloop;
3070                        }
3071                        entCol++;
3072                        /*
3073                         * Consume the maximum number of characters possible,
3074                         * with the consumed characters matching one of the
3075                         * identifiers in the first column of the named
3076                         * character references table (in a case-sensitive
3077                         * manner).
3078                         */
3079                        loloop: for (;;) {
3080                            if (hi < lo) {
3081                                break outer;
3082                            }
3083                            if (entCol == NamedCharacters.NAMES[lo].length()) {
3084                                candidate = lo;
3085                                strBufMark = strBufLen;
3086                                lo++;
3087                            } else if (entCol > NamedCharacters.NAMES[lo].length()) {
3088                                break outer;
3089                            } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
3090                                lo++;
3091                            } else {
3092                                break loloop;
3093                            }
3094                        }
3095
3096                        hiloop: for (;;) {
3097                            if (hi < lo) {
3098                                break outer;
3099                            }
3100                            if (entCol == NamedCharacters.NAMES[hi].length()) {
3101                                break hiloop;
3102                            }
3103                            if (entCol > NamedCharacters.NAMES[hi].length()) {
3104                                break outer;
3105                            } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
3106                                hi--;
3107                            } else {
3108                                break hiloop;
3109                            }
3110                        }
3111
3112                        if (c == ';') {
3113                            // If we see a semicolon, there cannot be a 
3114                            // longer match. Break the loop. However, before
3115                            // breaking, take the longest match so far as the 
3116                            // candidate, if we are just about to complete a 
3117                            // match.
3118                            if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
3119                                candidate = lo;
3120                                strBufMark = strBufLen;
3121                            }                            
3122                            break outer;
3123                        }
3124                        
3125                        if (hi < lo) {
3126                            break outer;
3127                        }
3128                        appendStrBuf(c);
3129                        continue;
3130                    }
3131
3132                    if (candidate == -1) {
3133                        // reconsume deals with CR, LF or nul
3134                        /*
3135                         * If no match can be made, then this is a parse error.
3136                         */
3137                        errNoNamedCharacterMatch();
3138                        emitOrAppendStrBuf(returnState);
3139                        if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3140                            cstart = pos;
3141                        }
3142                        reconsume = true;
3143                        state = transition(state, returnState, reconsume, pos);
3144                        continue stateloop;
3145                    } else {
3146                        // c can't be CR, LF or nul if we got here
3147                        @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
3148                        if (candidateName.length() == 0
3149                                || candidateName.charAt(candidateName.length() - 1) != ';') {
3150                            /*
3151                             * If the last character matched is not a U+003B
3152                             * SEMICOLON (;), there is a parse error.
3153                             */
3154                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3155                                /*
3156                                 * If the entity is being consumed as part of an
3157                                 * attribute, and the last character matched is
3158                                 * not a U+003B SEMICOLON (;),
3159                                 */
3160                                char ch;
3161                                if (strBufMark == strBufLen) {
3162                                    ch = c;
3163                                } else {
3164                                    // if (strBufOffset != -1) {
3165                                    // ch = buf[strBufOffset + strBufMark];
3166                                    // } else {
3167                                    ch = strBuf[strBufMark];
3168                                    // }
3169                                }
3170                                if (ch == '=' || (ch >= '0' && ch <= '9')
3171                                        || (ch >= 'A' && ch <= 'Z')
3172                                        || (ch >= 'a' && ch <= 'z')) {
3173                                    /*
3174                                     * and the next character is either a U+003D
3175                                     * EQUALS SIGN character (=) or in the range
3176                                     * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
3177                                     * U+0041 LATIN CAPITAL LETTER A to U+005A
3178                                     * LATIN CAPITAL LETTER Z, or U+0061 LATIN
3179                                     * SMALL LETTER A to U+007A LATIN SMALL
3180                                     * LETTER Z, then, for historical reasons,
3181                                     * all the characters that were matched
3182                                     * after the U+0026 AMPERSAND (&) must be
3183                                     * unconsumed, and nothing is returned.
3184                                     */
3185                                    errNoNamedCharacterMatch();
3186                                    appendStrBufToLongStrBuf();
3187                                    reconsume = true;
3188                                    state = transition(state, returnState, reconsume, pos);
3189                                    continue stateloop;
3190                                }
3191                            }
3192                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3193                                errUnescapedAmpersandInterpretedAsCharacterReference();
3194                            } else {
3195                                errNotSemicolonTerminated();
3196                            }
3197                        }
3198
3199                        /*
3200                         * Otherwise, return a character token for the character
3201                         * corresponding to the entity name (as given by the
3202                         * second column of the named character references
3203                         * table).
3204                         */
3205                        // CPPONLY: completedNamedCharacterReference();
3206                        @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
3207                        if (
3208                        // [NOCPP[
3209                        val.length == 1
3210                        // ]NOCPP]
3211                        // CPPONLY: val[1] == 0
3212                        ) {
3213                            emitOrAppendOne(val, returnState);
3214                        } else {
3215                            emitOrAppendTwo(val, returnState);
3216                        }
3217                        // this is so complicated!
3218                        if (strBufMark < strBufLen) {
3219                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
3220                                for (int i = strBufMark; i < strBufLen; i++) {
3221                                    appendLongStrBuf(strBuf[i]);
3222                                }
3223                            } else {
3224                                tokenHandler.characters(strBuf, strBufMark,
3225                                        strBufLen - strBufMark);
3226                            }
3227                        }
3228                        // Check if we broke out early with c being the last
3229                        // character that matched as opposed to being the
3230                        // first one that didn't match. In the case of an 
3231                        // early break, the next run on text should start
3232                        // *after* the current character and the current 
3233                        // character shouldn't be reconsumed.
3234                        boolean earlyBreak = (c == ';' && strBufMark == strBufLen);
3235                        if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3236                            cstart = earlyBreak ? pos + 1 : pos;
3237                        }
3238                        reconsume = !earlyBreak;
3239                        state = transition(state, returnState, reconsume, pos);
3240                        continue stateloop;
3241                        /*
3242                         * If the markup contains I'm &notit; I tell you, the
3243                         * entity is parsed as "not", as in, I'm ¬it; I tell
3244                         * you. But if the markup was I'm &notin; I tell you,
3245                         * the entity would be parsed as "notin;", resulting in
3246                         * I'm ∉ I tell you.
3247                         */
3248                    }
3249                    // XXX reorder point
3250                case CONSUME_NCR:
3251                    if (++pos == endPos) {
3252                        break stateloop;
3253                    }
3254                    c = checkChar(buf, pos);
3255                    prevValue = -1;
3256                    value = 0;
3257                    seenDigits = false;
3258                    /*
3259                     * The behavior further depends on the character after the
3260                     * U+0023 NUMBER SIGN:
3261                     */
3262                    switch (c) {
3263                        case 'x':
3264                        case 'X':
3265
3266                            /*
3267                             * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
3268                             * LETTER X Consume the X.
3269                             * 
3270                             * Follow the steps below, but using the range of
3271                             * characters U+0030 DIGIT ZERO through to U+0039
3272                             * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
3273                             * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
3274                             * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
3275                             * LETTER F (in other words, 0-9, A-F, a-f).
3276                             * 
3277                             * When it comes to interpreting the number,
3278                             * interpret it as a hexadecimal number.
3279                             */
3280                            appendStrBuf(c);
3281                            state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
3282                            continue stateloop;
3283                        default:
3284                            /*
3285                             * Anything else Follow the steps below, but using
3286                             * the range of characters U+0030 DIGIT ZERO through
3287                             * to U+0039 DIGIT NINE (i.e. just 0-9).
3288                             * 
3289                             * When it comes to interpreting the number,
3290                             * interpret it as a decimal number.
3291                             */
3292                            reconsume = true;
3293                            state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
3294                            // FALL THROUGH continue stateloop;
3295                    }
3296                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3297                case DECIMAL_NRC_LOOP:
3298                    decimalloop: for (;;) {
3299                        if (reconsume) {
3300                            reconsume = false;
3301                        } else {
3302                            if (++pos == endPos) {
3303                                break stateloop;
3304                            }
3305                            c = checkChar(buf, pos);
3306                        }
3307                        // Deal with overflow gracefully
3308                        if (value < prevValue) {
3309                            value = 0x110000; // Value above Unicode range but
3310                            // within int
3311                            // range
3312                        }
3313                        prevValue = value;
3314                        /*
3315                         * Consume as many characters as match the range of
3316                         * characters given above.
3317                         */
3318                        if (c >= '0' && c <= '9') {
3319                            seenDigits = true;
3320                            value *= 10;
3321                            value += c - '0';
3322                            continue;
3323                        } else if (c == ';') {
3324                            if (seenDigits) {
3325                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3326                                    cstart = pos + 1;
3327                                }
3328                                state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3329                                // FALL THROUGH continue stateloop;
3330                                break decimalloop;
3331                            } else {
3332                                errNoDigitsInNCR();
3333                                appendStrBuf(';');
3334                                emitOrAppendStrBuf(returnState);
3335                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3336                                    cstart = pos + 1;
3337                                }
3338                                state = transition(state, returnState, reconsume, pos);
3339                                continue stateloop;
3340                            }
3341                        } else {
3342                            /*
3343                             * If no characters match the range, then don't
3344                             * consume any characters (and unconsume the U+0023
3345                             * NUMBER SIGN character and, if appropriate, the X
3346                             * character). This is a parse error; nothing is
3347                             * returned.
3348                             * 
3349                             * Otherwise, if the next character is a U+003B
3350                             * SEMICOLON, consume that too. If it isn't, there
3351                             * is a parse error.
3352                             */
3353                            if (!seenDigits) {
3354                                errNoDigitsInNCR();
3355                                emitOrAppendStrBuf(returnState);
3356                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3357                                    cstart = pos;
3358                                }
3359                                reconsume = true;
3360                                state = transition(state, returnState, reconsume, pos);
3361                                continue stateloop;
3362                            } else {
3363                                errCharRefLacksSemicolon();
3364                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3365                                    cstart = pos;
3366                                }
3367                                reconsume = true;
3368                                state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3369                                // FALL THROUGH continue stateloop;
3370                                break decimalloop;
3371                            }
3372                        }
3373                    }
3374                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3375                case HANDLE_NCR_VALUE:
3376                    // WARNING previous state sets reconsume
3377                    // XXX inline this case if the method size can take it
3378                    handleNcrValue(returnState);
3379                    state = transition(state, returnState, reconsume, pos);
3380                    continue stateloop;
3381                    // XXX reorder point
3382                case HEX_NCR_LOOP:
3383                    for (;;) {
3384                        if (++pos == endPos) {
3385                            break stateloop;
3386                        }
3387                        c = checkChar(buf, pos);
3388                        // Deal with overflow gracefully
3389                        if (value < prevValue) {
3390                            value = 0x110000; // Value above Unicode range but
3391                            // within int
3392                            // range
3393                        }
3394                        prevValue = value;
3395                        /*
3396                         * Consume as many characters as match the range of
3397                         * characters given above.
3398                         */
3399                        if (c >= '0' && c <= '9') {
3400                            seenDigits = true;
3401                            value *= 16;
3402                            value += c - '0';
3403                            continue;
3404                        } else if (c >= 'A' && c <= 'F') {
3405                            seenDigits = true;
3406                            value *= 16;
3407                            value += c - 'A' + 10;
3408                            continue;
3409                        } else if (c >= 'a' && c <= 'f') {
3410                            seenDigits = true;
3411                            value *= 16;
3412                            value += c - 'a' + 10;
3413                            continue;
3414                        } else if (c == ';') {
3415                            if (seenDigits) {
3416                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3417                                    cstart = pos + 1;
3418                                }
3419                                state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3420                                continue stateloop;
3421                            } else {
3422                                errNoDigitsInNCR();
3423                                appendStrBuf(';');
3424                                emitOrAppendStrBuf(returnState);
3425                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3426                                    cstart = pos + 1;
3427                                }
3428                                state = transition(state, returnState, reconsume, pos);
3429                                continue stateloop;
3430                            }
3431                        } else {
3432                            /*
3433                             * If no characters match the range, then don't
3434                             * consume any characters (and unconsume the U+0023
3435                             * NUMBER SIGN character and, if appropriate, the X
3436                             * character). This is a parse error; nothing is
3437                             * returned.
3438                             * 
3439                             * Otherwise, if the next character is a U+003B
3440                             * SEMICOLON, consume that too. If it isn't, there
3441                             * is a parse error.
3442                             */
3443                            if (!seenDigits) {
3444                                errNoDigitsInNCR();
3445                                emitOrAppendStrBuf(returnState);
3446                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3447                                    cstart = pos;
3448                                }
3449                                reconsume = true;
3450                                state = transition(state, returnState, reconsume, pos);
3451                                continue stateloop;
3452                            } else {
3453                                errCharRefLacksSemicolon();
3454                                if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
3455                                    cstart = pos;
3456                                }
3457                                reconsume = true;
3458                                state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
3459                                continue stateloop;
3460                            }
3461                        }
3462                    }
3463                    // XXX reorder point
3464                case PLAINTEXT:
3465                    plaintextloop: for (;;) {
3466                        if (reconsume) {
3467                            reconsume = false;
3468                        } else {
3469                            if (++pos == endPos) {
3470                                break stateloop;
3471                            }
3472                            c = checkChar(buf, pos);
3473                        }
3474                        switch (c) {
3475                            case '\u0000':
3476                                emitPlaintextReplacementCharacter(buf, pos);
3477                                continue;
3478                            case '\r':
3479                                emitCarriageReturn(buf, pos);
3480                                break stateloop;
3481                            case '\n':
3482                                silentLineFeed();
3483                            default:
3484                                /*
3485                                 * Anything else Emit the current input
3486                                 * character as a character token. Stay in the
3487                                 * PLAINTEXT state.
3488                                 */
3489                                continue;
3490                        }
3491                    }
3492                    // XXX reorder point
3493                case CLOSE_TAG_OPEN:
3494                    if (++pos == endPos) {
3495                        break stateloop;
3496                    }
3497                    c = checkChar(buf, pos);
3498                    /*
3499                     * Otherwise, if the content model flag is set to the PCDATA
3500                     * state, or if the next few characters do match that tag
3501                     * name, consume the next input character:
3502                     */
3503                    switch (c) {
3504                        case '>':
3505                            /* U+003E GREATER-THAN SIGN (>) Parse error. */
3506                            errLtSlashGt();
3507                            /*
3508                             * Switch to the data state.
3509                             */
3510                            cstart = pos + 1;
3511                            state = transition(state, Tokenizer.DATA, reconsume, pos);
3512                            continue stateloop;
3513                        case '\r':
3514                            silentCarriageReturn();
3515                            /* Anything else Parse error. */
3516                            errGarbageAfterLtSlash();
3517                            /*
3518                             * Switch to the bogus comment state.
3519                             */
3520                            clearLongStrBufAndAppend('\n');
3521                            state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3522                            break stateloop;
3523                        case '\n':
3524                            silentLineFeed();
3525                            /* Anything else Parse error. */
3526                            errGarbageAfterLtSlash();
3527                            /*
3528                             * Switch to the bogus comment state.
3529                             */
3530                            clearLongStrBufAndAppend('\n');
3531                            state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3532                            continue stateloop;
3533                        case '\u0000':
3534                            c = '\uFFFD';
3535                            // fall thru
3536                        default:
3537                            if (c >= 'A' && c <= 'Z') {
3538                                c += 0x20;
3539                            }
3540                            if (c >= 'a' && c <= 'z') {
3541                                /*
3542                                 * U+0061 LATIN SMALL LETTER A through to U+007A
3543                                 * LATIN SMALL LETTER Z Create a new end tag
3544                                 * token,
3545                                 */
3546                                endTag = true;
3547                                /*
3548                                 * set its tag name to the input character,
3549                                 */
3550                                clearStrBufAndAppend(c);
3551                                /*
3552                                 * then switch to the tag name state. (Don't
3553                                 * emit the token yet; further details will be
3554                                 * filled in before it is emitted.)
3555                                 */
3556                                state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
3557                                continue stateloop;
3558                            } else {
3559                                /* Anything else Parse error. */
3560                                errGarbageAfterLtSlash();
3561                                /*
3562                                 * Switch to the bogus comment state.
3563                                 */
3564                                clearLongStrBufAndAppend(c);
3565                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3566                                continue stateloop;
3567                            }
3568                    }
3569                    // XXX reorder point
3570                case RCDATA:
3571                    rcdataloop: for (;;) {
3572                        if (reconsume) {
3573                            reconsume = false;
3574                        } else {
3575                            if (++pos == endPos) {
3576                                break stateloop;
3577                            }
3578                            c = checkChar(buf, pos);
3579                        }
3580                        switch (c) {
3581                            case '&':
3582                                /*
3583                                 * U+0026 AMPERSAND (&) Switch to the character
3584                                 * reference in RCDATA state.
3585                                 */
3586                                flushChars(buf, pos);
3587                                clearStrBufAndAppend(c);
3588                                additional = '\u0000';
3589                                returnState = state;
3590                                state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
3591                                continue stateloop;
3592                            case '<':
3593                                /*
3594                                 * U+003C LESS-THAN SIGN (<) Switch to the
3595                                 * RCDATA less-than sign state.
3596                                 */
3597                                flushChars(buf, pos);
3598
3599                                returnState = state;
3600                                state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
3601                                continue stateloop;
3602                            case '\u0000':
3603                                emitReplacementCharacter(buf, pos);
3604                                continue;
3605                            case '\r':
3606                                emitCarriageReturn(buf, pos);
3607                                break stateloop;
3608                            case '\n':
3609                                silentLineFeed();
3610                            default:
3611                                /*
3612                                 * Emit the current input character as a
3613                                 * character token. Stay in the RCDATA state.
3614                                 */
3615                                continue;
3616                        }
3617                    }
3618                    // XXX reorder point
3619                case RAWTEXT:
3620                    rawtextloop: for (;;) {
3621                        if (reconsume) {
3622                            reconsume = false;
3623                        } else {
3624                            if (++pos == endPos) {
3625                                break stateloop;
3626                            }
3627                            c = checkChar(buf, pos);
3628                        }
3629                        switch (c) {
3630                            case '<':
3631                                /*
3632                                 * U+003C LESS-THAN SIGN (<) Switch to the
3633                                 * RAWTEXT less-than sign state.
3634                                 */
3635                                flushChars(buf, pos);
3636
3637                                returnState = state;
3638                                state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
3639                                break rawtextloop;
3640                            // FALL THRU continue stateloop;
3641                            case '\u0000':
3642                                emitReplacementCharacter(buf, pos);
3643                                continue;
3644                            case '\r':
3645                                emitCarriageReturn(buf, pos);
3646                                break stateloop;
3647                            case '\n':
3648                                silentLineFeed();
3649                            default:
3650                                /*
3651                                 * Emit the current input character as a
3652                                 * character token. Stay in the RAWTEXT state.
3653                                 */
3654                                continue;
3655                        }
3656                    }
3657                    // XXX fallthru don't reorder
3658                case RAWTEXT_RCDATA_LESS_THAN_SIGN:
3659                    rawtextrcdatalessthansignloop: for (;;) {
3660                        if (++pos == endPos) {
3661                            break stateloop;
3662                        }
3663                        c = checkChar(buf, pos);
3664                        switch (c) {
3665                            case '/':
3666                                /*
3667                                 * U+002F SOLIDUS (/) Set the temporary buffer
3668                                 * to the empty string. Switch to the RCDATA or
3669                                 * RAWTEXT end tag open state.
3670                                 */
3671                                index = 0;
3672                                clearStrBuf();
3673                                state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
3674                                break rawtextrcdatalessthansignloop;
3675                            // FALL THRU continue stateloop;
3676                            default:
3677                                /*
3678                                 * Otherwise, emit a U+003C LESS-THAN SIGN
3679                                 * character token
3680                                 */
3681                                tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
3682                                /*
3683                                 * and reconsume the current input character in
3684                                 * the RCDATA or RAWTEXT state (returnState).
3685                                 */
3686                                cstart = pos;
3687                                reconsume = true;
3688                                state = transition(state, returnState, reconsume, pos);
3689                                continue stateloop;
3690                        }
3691                    }
3692                    // XXX fall thru. don't reorder.
3693                case NON_DATA_END_TAG_NAME:
3694                    for (;;) {
3695                        if (++pos == endPos) {
3696                            break stateloop;
3697                        }
3698                        c = checkChar(buf, pos);
3699                        /*
3700                         * ASSERT! when entering this state, set index to 0 and
3701                         * call clearStrBuf() assert (contentModelElement !=
3702                         * null); Let's implement the above without lookahead.
3703                         * strBuf is the 'temporary buffer'.
3704                         */
3705                        if (index < endTagExpectationAsArray.length) {
3706                            char e = endTagExpectationAsArray[index];
3707                            char folded = c;
3708                            if (c >= 'A' && c <= 'Z') {
3709                                folded += 0x20;
3710                            }
3711                            if (folded != e) {
3712                                // [NOCPP[
3713                                errHtml4LtSlashInRcdata(folded);
3714                                // ]NOCPP]
3715                                tokenHandler.characters(Tokenizer.LT_SOLIDUS,
3716                                        0, 2);
3717                                emitStrBuf();
3718                                cstart = pos;
3719                                reconsume = true;
3720                                state = transition(state, returnState, reconsume, pos);
3721                                continue stateloop;
3722                            }
3723                            appendStrBuf(c);
3724                            index++;
3725                            continue;
3726                        } else {
3727                            endTag = true;
3728                            // XXX replace contentModelElement with different
3729                            // type
3730                            tagName = endTagExpectation;
3731                            switch (c) {
3732                                case '\r':
3733                                    silentCarriageReturn();
3734                                    state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
3735                                    break stateloop;
3736                                case '\n':
3737                                    silentLineFeed();
3738                                    // fall thru
3739                                case ' ':
3740                                case '\t':
3741                                case '\u000C':
3742                                    /*
3743                                     * U+0009 CHARACTER TABULATION U+000A LINE
3744                                     * FEED (LF) U+000C FORM FEED (FF) U+0020
3745                                     * SPACE If the current end tag token is an
3746                                     * appropriate end tag token, then switch to
3747                                     * the before attribute name state.
3748                                     */
3749                                    state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
3750                                    continue stateloop;
3751                                case '/':
3752                                    /*
3753                                     * U+002F SOLIDUS (/) If the current end tag
3754                                     * token is an appropriate end tag token,
3755                                     * then switch to the self-closing start tag
3756                                     * state.
3757                                     */
3758                                    state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
3759                                    continue stateloop;
3760                                case '>':
3761                                    /*
3762                                     * U+003E GREATER-THAN SIGN (>) If the
3763                                     * current end tag token is an appropriate
3764                                     * end tag token, then emit the current tag
3765                                     * token and switch to the data state.
3766                                     */
3767                                    state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
3768                                    if (shouldSuspend) {
3769                                        break stateloop;
3770                                    }
3771                                    continue stateloop;
3772                                default:
3773                                    /*
3774                                     * Emit a U+003C LESS-THAN SIGN character
3775                                     * token, a U+002F SOLIDUS character token,
3776                                     * a character token for each of the
3777                                     * characters in the temporary buffer (in
3778                                     * the order they were added to the buffer),
3779                                     * and reconsume the current input character
3780                                     * in the RAWTEXT state.
3781                                     */
3782                                    // [NOCPP[
3783                                    errWarnLtSlashInRcdata();
3784                                    // ]NOCPP]
3785                                    tokenHandler.characters(
3786                                            Tokenizer.LT_SOLIDUS, 0, 2);
3787                                    emitStrBuf();
3788                                    if (c == '\u0000') {
3789                                        emitReplacementCharacter(buf, pos);
3790                                    } else {
3791                                        cstart = pos; // don't drop the
3792                                        // character
3793                                    }
3794                                    state = transition(state, returnState, reconsume, pos);
3795                                    continue stateloop;
3796                            }
3797                        }
3798                    }
3799                    // XXX reorder point
3800                    // BEGIN HOTSPOT WORKAROUND
3801                case BOGUS_COMMENT:
3802                    boguscommentloop: for (;;) {
3803                        if (reconsume) {
3804                            reconsume = false;
3805                        } else {
3806                            if (++pos == endPos) {
3807                                break stateloop;
3808                            }
3809                            c = checkChar(buf, pos);
3810                        }
3811                        /*
3812                         * Consume every character up to and including the first
3813                         * U+003E GREATER-THAN SIGN character (>) or the end of
3814                         * the file (EOF), whichever comes first. Emit a comment
3815                         * token whose data is the concatenation of all the
3816                         * characters starting from and including the character
3817                         * that caused the state machine to switch into the
3818                         * bogus comment state, up to and including the
3819                         * character immediately before the last consumed
3820                         * character (i.e. up to the character just before the
3821                         * U+003E or EOF character). (If the comment was started
3822                         * by the end of the file (EOF), the token is empty.)
3823                         * 
3824                         * Switch to the data state.
3825                         * 
3826                         * If the end of the file was reached, reconsume the EOF
3827                         * character.
3828                         */
3829                        switch (c) {
3830                            case '>':
3831                                emitComment(0, pos);
3832                                state = transition(state, Tokenizer.DATA, reconsume, pos);
3833                                continue stateloop;
3834                            case '-':
3835                                appendLongStrBuf(c);
3836                                state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
3837                                break boguscommentloop;
3838                            case '\r':
3839                                appendLongStrBufCarriageReturn();
3840                                break stateloop;
3841                            case '\n':
3842                                appendLongStrBufLineFeed();
3843                                continue;
3844                            case '\u0000':
3845                                c = '\uFFFD';
3846                                // fall thru
3847                            default:
3848                                appendLongStrBuf(c);
3849                                continue;
3850                        }
3851                    }
3852                    // FALLTHRU DON'T REORDER
3853                case BOGUS_COMMENT_HYPHEN:
3854                    boguscommenthyphenloop: for (;;) {
3855                        if (++pos == endPos) {
3856                            break stateloop;
3857                        }
3858                        c = checkChar(buf, pos);
3859                        switch (c) {
3860                            case '>':
3861                                // [NOCPP[
3862                                maybeAppendSpaceToBogusComment();
3863                                // ]NOCPP]
3864                                emitComment(0, pos);
3865                                state = transition(state, Tokenizer.DATA, reconsume, pos);
3866                                continue stateloop;
3867                            case '-':
3868                                appendSecondHyphenToBogusComment();
3869                                continue boguscommenthyphenloop;
3870                            case '\r':
3871                                appendLongStrBufCarriageReturn();
3872                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3873                                break stateloop;
3874                            case '\n':
3875                                appendLongStrBufLineFeed();
3876                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3877                                continue stateloop;
3878                            case '\u0000':
3879                                c = '\uFFFD';
3880                                // fall thru
3881                            default:
3882                                appendLongStrBuf(c);
3883                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
3884                                continue stateloop;
3885                        }
3886                    }
3887                    // XXX reorder point
3888                case SCRIPT_DATA:
3889                    scriptdataloop: for (;;) {
3890                        if (reconsume) {
3891                            reconsume = false;
3892                        } else {
3893                            if (++pos == endPos) {
3894                                break stateloop;
3895                            }
3896                            c = checkChar(buf, pos);
3897                        }
3898                        switch (c) {
3899                            case '<':
3900                                /*
3901                                 * U+003C LESS-THAN SIGN (<) Switch to the
3902                                 * script data less-than sign state.
3903                                 */
3904                                flushChars(buf, pos);
3905                                returnState = state;
3906                                state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
3907                                break scriptdataloop; // FALL THRU continue
3908                            // stateloop;
3909                            case '\u0000':
3910                                emitReplacementCharacter(buf, pos);
3911                                continue;
3912                            case '\r':
3913                                emitCarriageReturn(buf, pos);
3914                                break stateloop;
3915                            case '\n':
3916                                silentLineFeed();
3917                            default:
3918                                /*
3919                                 * Anything else Emit the current input
3920                                 * character as a character token. Stay in the
3921                                 * script data state.
3922                                 */
3923                                continue;
3924                        }
3925                    }
3926                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3927                case SCRIPT_DATA_LESS_THAN_SIGN:
3928                    scriptdatalessthansignloop: for (;;) {
3929                        if (++pos == endPos) {
3930                            break stateloop;
3931                        }
3932                        c = checkChar(buf, pos);
3933                        switch (c) {
3934                            case '/':
3935                                /*
3936                                 * U+002F SOLIDUS (/) Set the temporary buffer
3937                                 * to the empty string. Switch to the script
3938                                 * data end tag open state.
3939                                 */
3940                                index = 0;
3941                                clearStrBuf();
3942                                state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
3943                                continue stateloop;
3944                            case '!':
3945                                tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
3946                                cstart = pos;
3947                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
3948                                break scriptdatalessthansignloop; // FALL THRU
3949                            // continue
3950                            // stateloop;
3951                            default:
3952                                /*
3953                                 * Otherwise, emit a U+003C LESS-THAN SIGN
3954                                 * character token
3955                                 */
3956                                tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
3957                                /*
3958                                 * and reconsume the current input character in
3959                                 * the script data state.
3960                                 */
3961                                cstart = pos;
3962                                reconsume = true;
3963                                state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
3964                                continue stateloop;
3965                        }
3966                    }
3967                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3968                case SCRIPT_DATA_ESCAPE_START:
3969                    scriptdataescapestartloop: for (;;) {
3970                        if (++pos == endPos) {
3971                            break stateloop;
3972                        }
3973                        c = checkChar(buf, pos);
3974                        /*
3975                         * Consume the next input character:
3976                         */
3977                        switch (c) {
3978                            case '-':
3979                                /*
3980                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
3981                                 * HYPHEN-MINUS character token. Switch to the
3982                                 * script data escape start dash state.
3983                                 */
3984                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);
3985                                break scriptdataescapestartloop; // FALL THRU
3986                            // continue
3987                            // stateloop;
3988                            default:
3989                                /*
3990                                 * Anything else Reconsume the current input
3991                                 * character in the script data state.
3992                                 */
3993                                reconsume = true;
3994                                state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
3995                                continue stateloop;
3996                        }
3997                    }
3998                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
3999                case SCRIPT_DATA_ESCAPE_START_DASH:
4000                    scriptdataescapestartdashloop: for (;;) {
4001                        if (++pos == endPos) {
4002                            break stateloop;
4003                        }
4004                        c = checkChar(buf, pos);
4005                        /*
4006                         * Consume the next input character:
4007                         */
4008                        switch (c) {
4009                            case '-':
4010                                /*
4011                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4012                                 * HYPHEN-MINUS character token. Switch to the
4013                                 * script data escaped dash dash state.
4014                                 */
4015                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
4016                                break scriptdataescapestartdashloop;
4017                            // continue stateloop;
4018                            default:
4019                                /*
4020                                 * Anything else Reconsume the current input
4021                                 * character in the script data state.
4022                                 */
4023                                reconsume = true;
4024                                state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4025                                continue stateloop;
4026                        }
4027                    }
4028                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4029                case SCRIPT_DATA_ESCAPED_DASH_DASH:
4030                    scriptdataescapeddashdashloop: for (;;) {
4031                        if (++pos == endPos) {
4032                            break stateloop;
4033                        }
4034                        c = checkChar(buf, pos);
4035                        /*
4036                         * Consume the next input character:
4037                         */
4038                        switch (c) {
4039                            case '-':
4040                                /*
4041                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4042                                 * HYPHEN-MINUS character token. Stay in the
4043                                 * script data escaped dash dash state.
4044                                 */
4045                                continue;
4046                            case '<':
4047                                /*
4048                                 * U+003C LESS-THAN SIGN (<) Switch to the
4049                                 * script data escaped less-than sign state.
4050                                 */
4051                                flushChars(buf, pos);
4052                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4053                                continue stateloop;
4054                            case '>':
4055                                /*
4056                                 * U+003E GREATER-THAN SIGN (>) Emit a U+003E
4057                                 * GREATER-THAN SIGN character token. Switch to
4058                                 * the script data state.
4059                                 */
4060                                state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4061                                continue stateloop;
4062                            case '\u0000':
4063                                emitReplacementCharacter(buf, pos);
4064                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4065                                break scriptdataescapeddashdashloop;
4066                            case '\r':
4067                                emitCarriageReturn(buf, pos);
4068                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4069                                break stateloop;
4070                            case '\n':
4071                                silentLineFeed();
4072                            default:
4073                                /*
4074                                 * Anything else Emit the current input
4075                                 * character as a character token. Switch to the
4076                                 * script data escaped state.
4077                                 */
4078                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4079                                break scriptdataescapeddashdashloop;
4080                            // continue stateloop;
4081                        }
4082                    }
4083                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4084                case SCRIPT_DATA_ESCAPED:
4085                    scriptdataescapedloop: for (;;) {
4086                        if (reconsume) {
4087                            reconsume = false;
4088                        } else {
4089                            if (++pos == endPos) {
4090                                break stateloop;
4091                            }
4092                            c = checkChar(buf, pos);
4093                        }
4094                        /*
4095                         * Consume the next input character:
4096                         */
4097                        switch (c) {
4098                            case '-':
4099                                /*
4100                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4101                                 * HYPHEN-MINUS character token. Switch to the
4102                                 * script data escaped dash state.
4103                                 */
4104                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
4105                                break scriptdataescapedloop; // FALL THRU
4106                            // (was: continue stateloop;) the labeled break leaves the loop
4107                            // so control falls into SCRIPT_DATA_ESCAPED_DASH below.
4108                            case '<':
4109                                /*
4110                                 * U+003C LESS-THAN SIGN (<) Switch to the
4111                                 * script data escaped less-than sign state.
4112                                 */
4113                                flushChars(buf, pos);
4114                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4115                                continue stateloop;
4116                            case '\u0000':
4117                                emitReplacementCharacter(buf, pos);
4118                                continue;
4119                            case '\r':
4120                                emitCarriageReturn(buf, pos);
4121                                break stateloop;
4122                            case '\n':
4123                                silentLineFeed();
4124                            default:
4125                                /*
4126                                 * Anything else Emit the current input
4127                                 * character as a character token. Stay in the
4128                                 * script data escaped state.
4129                                 */
4130                                continue;
4131                        }
4132                    }
4133                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4134                case SCRIPT_DATA_ESCAPED_DASH:
4135                    scriptdataescapeddashloop: for (;;) {
4136                        if (++pos == endPos) {
4137                            break stateloop;
4138                        }
4139                        c = checkChar(buf, pos);
4140                        /*
4141                         * Consume the next input character:
4142                         */
4143                        switch (c) {
4144                            case '-':
4145                                /*
4146                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4147                                 * HYPHEN-MINUS character token. Switch to the
4148                                 * script data escaped dash dash state.
4149                                 */
4150                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
4151                                continue stateloop;
4152                            case '<':
4153                                /*
4154                                 * U+003C LESS-THAN SIGN (<) Switch to the
4155                                 * script data escaped less-than sign state.
4156                                 */
4157                                flushChars(buf, pos);
4158                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4159                                break scriptdataescapeddashloop;
4160                            // continue stateloop;
4161                            case '\u0000':
4162                                emitReplacementCharacter(buf, pos);
4163                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4164                                continue stateloop;
4165                            case '\r':
4166                                emitCarriageReturn(buf, pos);
4167                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4168                                break stateloop;
4169                            case '\n':
4170                                silentLineFeed();
4171                            default:
4172                                /*
4173                                 * Anything else Emit the current input
4174                                 * character as a character token. Switch to the
4175                                 * script data escaped state.
4176                                 */
4177                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4178                                continue stateloop;
4179                        }
4180                    }
4181                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4182                case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
4183                    scriptdataescapedlessthanloop: for (;;) {
4184                        if (++pos == endPos) {
4185                            break stateloop;
4186                        }
4187                        c = checkChar(buf, pos);
4188                        /*
4189                         * Consume the next input character:
4190                         */
4191                        switch (c) {
4192                            case '/':
4193                                /*
4194                                 * U+002F SOLIDUS (/) Set the temporary buffer
4195                                 * to the empty string. Switch to the script
4196                                 * data escaped end tag open state.
4197                                 */
4198                                index = 0;
4199                                clearStrBuf();
4200                                returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
4201                                state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
4202                                continue stateloop;
4203                            case 'S':
4204                            case 's':
4205                                /*
4206                                 * U+0041 LATIN CAPITAL LETTER A through to
4207                                 * U+005A LATIN CAPITAL LETTER Z Emit a U+003C
4208                                 * LESS-THAN SIGN character token and the
4209                                 * current input character as a character token.
4210                                 */
4211                                tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4212                                cstart = pos;
4213                                index = 1;
4214                                /*
4215                                 * Set the temporary buffer to the empty string.
4216                                 * Append the lowercase version of the current
4217                                 * input character (add 0x0020 to the
4218                                 * character's code point) to the temporary
4219                                 * buffer. Switch to the script data double
4220                                 * escape start state.
4221                                 */
4222                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
4223                                break scriptdataescapedlessthanloop;
4224                            // continue stateloop;
4225                            default:
4226                                /*
4227                                 * Anything else Emit a U+003C LESS-THAN SIGN
4228                                 * character token and reconsume the current
4229                                 * input character in the script data escaped
4230                                 * state.
4231                                 */
4232                                tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
4233                                cstart = pos;
4234                                reconsume = true;
4235                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4236                                continue stateloop;
4237                        }
4238                    }
4239                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4240                case SCRIPT_DATA_DOUBLE_ESCAPE_START:
4241                    scriptdatadoubleescapestartloop: for (;;) {
4242                        if (++pos == endPos) {
4243                            break stateloop;
4244                        }
4245                        c = checkChar(buf, pos);
4246                        assert (index > 0);
4247                        if (index < 6) { // SCRIPT_ARR.length
4248                            char folded = c;
4249                            if (c >= 'A' && c <= 'Z') {
4250                                folded += 0x20;
4251                            }
4252                            if (folded != Tokenizer.SCRIPT_ARR[index]) {
4253                                reconsume = true;
4254                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4255                                continue stateloop;
4256                            }
4257                            index++;
4258                            continue;
4259                        }
4260                        switch (c) {
4261                            case '\r':
4262                                emitCarriageReturn(buf, pos);
4263                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4264                                break stateloop;
4265                            case '\n':
4266                                silentLineFeed();
4267                            case ' ':
4268                            case '\t':
4269                            case '\u000C':
4270                            case '/':
4271                            case '>':
4272                                /*
4273                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4274                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4275                                 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
4276                                 * (>) Emit the current input character as a
4277                                 * character token. If the temporary buffer is
4278                                 * the string "script", then switch to the
4279                                 * script data double escaped state.
4280                                 */
4281                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4282                                break scriptdatadoubleescapestartloop;
4283                            // continue stateloop;
4284                            default:
4285                                /*
4286                                 * Anything else Reconsume the current input
4287                                 * character in the script data escaped state.
4288                                 */
4289                                reconsume = true;
4290                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4291                                continue stateloop;
4292                        }
4293                    }
4294                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4295                case SCRIPT_DATA_DOUBLE_ESCAPED:
4296                    scriptdatadoubleescapedloop: for (;;) {
4297                        if (reconsume) {
4298                            reconsume = false;
4299                        } else {
4300                            if (++pos == endPos) {
4301                                break stateloop;
4302                            }
4303                            c = checkChar(buf, pos);
4304                        }
4305                        /*
4306                         * Consume the next input character:
4307                         */
4308                        switch (c) {
4309                            case '-':
4310                                /*
4311                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4312                                 * HYPHEN-MINUS character token. Switch to the
4313                                 * script data double escaped dash state.
4314                                 */
4315                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
4316                                break scriptdatadoubleescapedloop; // FALL THRU
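4317                            // (was: continue stateloop;) the labeled break leaves the loop
4318                            // so control falls into SCRIPT_DATA_DOUBLE_ESCAPED_DASH below.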
4319                            case '<':
4320                                /*
4321                                 * U+003C LESS-THAN SIGN (<) Emit a U+003C
4322                                 * LESS-THAN SIGN character token. Switch to the
4323                                 * script data double escaped less-than sign
4324                                 * state.
4325                                 */
4326                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4327                                continue stateloop;
4328                            case '\u0000':
4329                                emitReplacementCharacter(buf, pos);
4330                                continue;
4331                            case '\r':
4332                                emitCarriageReturn(buf, pos);
4333                                break stateloop;
4334                            case '\n':
4335                                silentLineFeed();
4336                            default:
4337                                /*
4338                                 * Anything else Emit the current input
4339                                 * character as a character token. Stay in the
4340                                 * script data double escaped state.
4341                                 */
4342                                continue;
4343                        }
4344                    }
4345                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4346                case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
4347                    scriptdatadoubleescapeddashloop: for (;;) {
4348                        if (++pos == endPos) {
4349                            break stateloop;
4350                        }
4351                        c = checkChar(buf, pos);
4352                        /*
4353                         * Consume the next input character:
4354                         */
4355                        switch (c) {
4356                            case '-':
4357                                /*
4358                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4359                                 * HYPHEN-MINUS character token. Switch to the
4360                                 * script data double escaped dash dash state.
4361                                 */
4362                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
4363                                break scriptdatadoubleescapeddashloop;
4364                            // continue stateloop;
4365                            case '<':
4366                                /*
4367                                 * U+003C LESS-THAN SIGN (<) Emit a U+003C
4368                                 * LESS-THAN SIGN character token. Switch to the
4369                                 * script data double escaped less-than sign
4370                                 * state.
4371                                 */
4372                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4373                                continue stateloop;
4374                            case '\u0000':
4375                                emitReplacementCharacter(buf, pos);
4376                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4377                                continue stateloop;
4378                            case '\r':
4379                                emitCarriageReturn(buf, pos);
4380                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4381                                break stateloop;
4382                            case '\n':
4383                                silentLineFeed();
4384                            default:
4385                                /*
4386                                 * Anything else Emit the current input
4387                                 * character as a character token. Switch to the
4388                                 * script data double escaped state.
4389                                 */
4390                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4391                                continue stateloop;
4392                        }
4393                    }
4394                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4395                case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
4396                    scriptdatadoubleescapeddashdashloop: for (;;) {
4397                        if (++pos == endPos) {
4398                            break stateloop;
4399                        }
4400                        c = checkChar(buf, pos);
4401                        /*
4402                         * Consume the next input character:
4403                         */
4404                        switch (c) {
4405                            case '-':
4406                                /*
4407                                 * U+002D HYPHEN-MINUS (-) Emit a U+002D
4408                                 * HYPHEN-MINUS character token. Stay in the
4409                                 * script data double escaped dash dash state.
4410                                 */
4411                                continue;
4412                            case '<':
4413                                /*
4414                                 * U+003C LESS-THAN SIGN (<) Emit a U+003C
4415                                 * LESS-THAN SIGN character token. Switch to the
4416                                 * script data double escaped less-than sign
4417                                 * state.
4418                                 */
4419                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
4420                                break scriptdatadoubleescapeddashdashloop;
4421                            case '>':
4422                                /*
4423                                 * U+003E GREATER-THAN SIGN (>) Emit a U+003E
4424                                 * GREATER-THAN SIGN character token. Switch to
4425                                 * the script data state.
4426                                 */
4427                                state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
4428                                continue stateloop;
4429                            case '\u0000':
4430                                emitReplacementCharacter(buf, pos);
4431                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4432                                continue stateloop;
4433                            case '\r':
4434                                emitCarriageReturn(buf, pos);
4435                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4436                                break stateloop;
4437                            case '\n':
4438                                silentLineFeed();
4439                            default:
4440                                /*
4441                                 * Anything else Emit the current input
4442                                 * character as a character token. Switch to the
4443                                 * script data double escaped state.
4444                                 */
4445                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4446                                continue stateloop;
4447                        }
4448                    }
4449                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4450                case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
4451                    scriptdatadoubleescapedlessthanloop: for (;;) {
4452                        if (++pos == endPos) {
4453                            break stateloop;
4454                        }
4455                        c = checkChar(buf, pos);
4456                        /*
4457                         * Consume the next input character:
4458                         */
4459                        switch (c) {
4460                            case '/':
4461                                /*
4462                                 * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
4463                                 * character token. Set the temporary buffer to
4464                                 * the empty string. Switch to the script data
4465                                 * double escape end state.
4466                                 */
4467                                index = 0;
4468                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
4469                                break scriptdatadoubleescapedlessthanloop;
4470                            default:
4471                                /*
4472                                 * Anything else Reconsume the current input
4473                                 * character in the script data double escaped
4474                                 * state.
4475                                 */
4476                                reconsume = true;
4477                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4478                                continue stateloop;
4479                        }
4480                    }
4481                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
4482                case SCRIPT_DATA_DOUBLE_ESCAPE_END:
4483                    scriptdatadoubleescapeendloop: for (;;) {
4484                        if (++pos == endPos) {
4485                            break stateloop;
4486                        }
4487                        c = checkChar(buf, pos);
4488                        if (index < 6) { // SCRIPT_ARR.length
4489                            char folded = c;
4490                            if (c >= 'A' && c <= 'Z') {
4491                                folded += 0x20;
4492                            }
4493                            if (folded != Tokenizer.SCRIPT_ARR[index]) {
4494                                reconsume = true;
4495                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4496                                continue stateloop;
4497                            }
4498                            index++;
4499                            continue;
4500                        }
4501                        switch (c) {
4502                            case '\r':
4503                                emitCarriageReturn(buf, pos);
4504                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4505                                break stateloop;
4506                            case '\n':
4507                                silentLineFeed();
4508                            case ' ':
4509                            case '\t':
4510                            case '\u000C':
4511                            case '/':
4512                            case '>':
4513                                /*
4514                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4515                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4516                                 * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
4517                                 * (>) Emit the current input character as a
4518                                 * character token. If the temporary buffer is
4519                                 * the string "script", then switch to the
4520                                 * script data escaped state.
4521                                 */
4522                                state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
4523                                continue stateloop;
4524                            default:
4525                                /*
4526                                 * Reconsume the current input character in the
4527                                 * script data double escaped state.
4528                                 */
4529                                reconsume = true;
4530                                state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
4531                                continue stateloop;
4532                        }
4533                    }
4534                    // XXX reorder point: every branch of the case above exits via stateloop, so nothing falls through here
4535                case MARKUP_DECLARATION_OCTYPE:
4536                    markupdeclarationdoctypeloop: for (;;) {
4537                        if (++pos == endPos) {
4538                            break stateloop;
4539                        }
4540                        c = checkChar(buf, pos);
4541                        if (index < 6) { // OCTYPE.length
4542                            char folded = c;
4543                            if (c >= 'A' && c <= 'Z') {
4544                                folded += 0x20;
4545                            }
4546                            if (folded == Tokenizer.OCTYPE[index]) {
4547                                appendLongStrBuf(c);
4548                            } else {
4549                                errBogusComment();
4550                                reconsume = true;
4551                                state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
4552                                continue stateloop;
4553                            }
4554                            index++;
4555                            continue;
4556                        } else {
4557                            reconsume = true;
4558                            state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
4559                            break markupdeclarationdoctypeloop;
4560                            // continue stateloop;
4561                        }
4562                    }
4563                    // FALLTHRU DON'T REORDER
4564                case DOCTYPE:
4565                    doctypeloop: for (;;) {
4566                        if (reconsume) {
4567                            reconsume = false;
4568                        } else {
4569                            if (++pos == endPos) {
4570                                break stateloop;
4571                            }
4572                            c = checkChar(buf, pos);
4573                        }
4574                        initDoctypeFields();
4575                        /*
4576                         * Consume the next input character:
4577                         */
4578                        switch (c) {
4579                            case '\r':
4580                                silentCarriageReturn();
4581                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4582                                break stateloop;
4583                            case '\n':
4584                                silentLineFeed();
4585                                // fall thru
4586                            case ' ':
4587                            case '\t':
4588                            case '\u000C':
4589                                /*
4590                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4591                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4592                                 * Switch to the before DOCTYPE name state.
4593                                 */
4594                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4595                                break doctypeloop;
4596                            // continue stateloop;
4597                            default:
4598                                /*
4599                                 * Anything else Parse error.
4600                                 */
4601                                errMissingSpaceBeforeDoctypeName();
4602                                /*
4603                                 * Reconsume the current character in the before
4604                                 * DOCTYPE name state.
4605                                 */
4606                                reconsume = true;
4607                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
4608                                break doctypeloop;
4609                            // continue stateloop;
4610                        }
4611                    }
4612                    // FALLTHRU DON'T REORDER
4613                case BEFORE_DOCTYPE_NAME:
4614                    beforedoctypenameloop: for (;;) {
4615                        if (reconsume) {
4616                            reconsume = false;
4617                        } else {
4618                            if (++pos == endPos) {
4619                                break stateloop;
4620                            }
4621                            c = checkChar(buf, pos);
4622                        }
4623                        /*
4624                         * Consume the next input character:
4625                         */
4626                        switch (c) {
4627                            case '\r':
4628                                silentCarriageReturn();
4629                                break stateloop;
4630                            case '\n':
4631                                silentLineFeed();
4632                                // fall thru
4633                            case ' ':
4634                            case '\t':
4635                            case '\u000C':
4636                                /*
4637                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4638                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4639                                 * in the before DOCTYPE name state.
4640                                 */
4641                                continue;
4642                            case '>':
4643                                /*
4644                                 * U+003E GREATER-THAN SIGN (>) Parse error.
4645                                 */
4646                                errNamelessDoctype();
4647                                /*
4648                                 * Create a new DOCTYPE token. Set its
4649                                 * force-quirks flag to on.
4650                                 */
4651                                forceQuirks = true;
4652                                /*
4653                                 * Emit the token.
4654                                 */
4655                                emitDoctypeToken(pos);
4656                                /*
4657                                 * Switch to the data state.
4658                                 */
4659                                state = transition(state, Tokenizer.DATA, reconsume, pos);
4660                                continue stateloop;
4661                            case '\u0000':
4662                                c = '\uFFFD';
4663                                // fall thru
4664                            default:
4665                                if (c >= 'A' && c <= 'Z') {
4666                                    /*
4667                                     * U+0041 LATIN CAPITAL LETTER A through to
4668                                     * U+005A LATIN CAPITAL LETTER Z Create a
4669                                     * new DOCTYPE token. Set the token's name
4670                                     * to the lowercase version of the input
4671                                     * character (add 0x0020 to the character's
4672                                     * code point).
4673                                     */
4674                                    c += 0x20;
4675                                }
4676                                /* Anything else Create a new DOCTYPE token. */
4677                                /*
4678                                 * Set the token's name to the current
4679                                 * input character.
4680                                 */
4681                                clearStrBufAndAppend(c);
4682                                /*
4683                                 * Switch to the DOCTYPE name state.
4684                                 */
4685                                state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
4686                                break beforedoctypenameloop;
4687                            // continue stateloop;
4688                        }
4689                    }
4690                    // FALLTHRU DON'T REORDER
4691                case DOCTYPE_NAME:
4692                    doctypenameloop: for (;;) {
4693                        if (++pos == endPos) {
4694                            break stateloop;
4695                        }
4696                        c = checkChar(buf, pos);
4697                        /*
4698                         * Consume the next input character:
4699                         */
4700                        switch (c) {
4701                            case '\r':
4702                                silentCarriageReturn();
4703                                strBufToDoctypeName();
4704                                state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
4705                                break stateloop;
4706                            case '\n':
4707                                silentLineFeed();
4708                                // fall thru
4709                            case ' ':
4710                            case '\t':
4711                            case '\u000C':
4712                                /*
4713                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4714                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4715                                 * Switch to the after DOCTYPE name state.
4716                                 */
4717                                strBufToDoctypeName();
4718                                state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
4719                                break doctypenameloop;
4720                            // continue stateloop;
4721                            case '>':
4722                                /*
4723                                 * U+003E GREATER-THAN SIGN (>) Emit the current
4724                                 * DOCTYPE token.
4725                                 */
4726                                strBufToDoctypeName();
4727                                emitDoctypeToken(pos);
4728                                /*
4729                                 * Switch to the data state.
4730                                 */
4731                                state = transition(state, Tokenizer.DATA, reconsume, pos);
4732                                continue stateloop;
4733                            case '\u0000':
4734                                c = '\uFFFD';
4735                                // fall thru
4736                            default:
4737                                /*
4738                                 * U+0041 LATIN CAPITAL LETTER A through to
4739                                 * U+005A LATIN CAPITAL LETTER Z Append the
4740                                 * lowercase version of the input character (add
4741                                 * 0x0020 to the character's code point) to the
4742                                 * current DOCTYPE token's name.
4743                                 */
4744                                if (c >= 'A' && c <= 'Z') {
4745                                    c += 0x0020;
4746                                }
4747                                /*
4748                                 * Anything else Append the current input
4749                                 * character to the current DOCTYPE token's
4750                                 * name.
4751                                 */
4752                                appendStrBuf(c);
4753                                /*
4754                                 * Stay in the DOCTYPE name state.
4755                                 */
4756                                continue;
4757                        }
4758                    }
4759                    // FALLTHRU DON'T REORDER
4760                case AFTER_DOCTYPE_NAME:
4761                    afterdoctypenameloop: for (;;) {
4762                        if (++pos == endPos) {
4763                            break stateloop;
4764                        }
4765                        c = checkChar(buf, pos);
4766                        /*
4767                         * Consume the next input character:
4768                         */
4769                        switch (c) {
4770                            case '\r':
4771                                silentCarriageReturn();
4772                                break stateloop;
4773                            case '\n':
4774                                silentLineFeed();
4775                                // fall thru
4776                            case ' ':
4777                            case '\t':
4778                            case '\u000C':
4779                                /*
4780                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4781                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4782                                 * in the after DOCTYPE name state.
4783                                 */
4784                                continue;
4785                            case '>':
4786                                /*
4787                                 * U+003E GREATER-THAN SIGN (>) Emit the current
4788                                 * DOCTYPE token.
4789                                 */
4790                                emitDoctypeToken(pos);
4791                                /*
4792                                 * Switch to the data state.
4793                                 */
4794                                state = transition(state, Tokenizer.DATA, reconsume, pos);
4795                                continue stateloop;
4796                            case 'p':
4797                            case 'P':
4798                                index = 0;
4799                                state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
4800                                break afterdoctypenameloop;
4801                            // continue stateloop;
4802                            case 's':
4803                            case 'S':
4804                                index = 0;
4805                                state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
4806                                continue stateloop;
4807                            default:
4808                                /*
4809                                 * Otherwise, this is the parse error.
4810                                 */
4811                                bogusDoctype();
4812
4813                                /*
4814                                 * Set the DOCTYPE token's force-quirks flag to
4815                                 * on.
4816                                 */
4817                                // done by bogusDoctype();
4818                                /*
4819                                 * Switch to the bogus DOCTYPE state.
4820                                 */
4821                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4822                                continue stateloop;
4823                        }
4824                    }
4825                    // FALLTHRU DON'T REORDER
4826                case DOCTYPE_UBLIC:
4827                    doctypeublicloop: for (;;) {
4828                        if (++pos == endPos) {
4829                            break stateloop;
4830                        }
4831                        c = checkChar(buf, pos);
4832                        /*
4833                         * If the six characters starting from the current input
4834                         * character are an ASCII case-insensitive match for the
4835                         * word "PUBLIC", then consume those characters and
4836                         * switch to the before DOCTYPE public identifier state.
4837                         */
4838                        if (index < 5) { // UBLIC.length
4839                            char folded = c;
4840                            if (c >= 'A' && c <= 'Z') {
4841                                folded += 0x20;
4842                            }
4843                            if (folded != Tokenizer.UBLIC[index]) {
4844                                bogusDoctype();
4845                                // forceQuirks = true;
4846                                reconsume = true;
4847                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4848                                continue stateloop;
4849                            }
4850                            index++;
4851                            continue;
4852                        } else {
4853                            reconsume = true;
4854                            state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
4855                            break doctypeublicloop;
4856                            // continue stateloop;
4857                        }
4858                    }
4859                    // FALLTHRU DON'T REORDER
4860                case AFTER_DOCTYPE_PUBLIC_KEYWORD:
4861                    afterdoctypepublickeywordloop: for (;;) {
4862                        if (reconsume) {
4863                            reconsume = false;
4864                        } else {
4865                            if (++pos == endPos) {
4866                                break stateloop;
4867                            }
4868                            c = checkChar(buf, pos);
4869                        }
4870                        /*
4871                         * Consume the next input character:
4872                         */
4873                        switch (c) {
4874                            case '\r':
4875                                silentCarriageReturn();
4876                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
4877                                break stateloop;
4878                            case '\n':
4879                                silentLineFeed();
4880                                // fall thru
4881                            case ' ':
4882                            case '\t':
4883                            case '\u000C':
4884                                /*
4885                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4886                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
4887                                 * Switch to the before DOCTYPE public
4888                                 * identifier state.
4889                                 */
4890                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
4891                                break afterdoctypepublickeywordloop;
4892                            // FALL THROUGH continue stateloop
4893                            case '"':
4894                                /*
4895                                 * U+0022 QUOTATION MARK (") Parse Error.
4896                                 */
4897                                errNoSpaceBetweenDoctypePublicKeywordAndQuote();
4898                                /*
4899                                 * Set the DOCTYPE token's public identifier to
4900                                 * the empty string (not missing),
4901                                 */
4902                                clearLongStrBuf();
4903                                /*
4904                                 * then switch to the DOCTYPE public identifier
4905                                 * (double-quoted) state.
4906                                 */
4907                                state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
4908                                continue stateloop;
4909                            case '\'':
4910                                /*
4911                                 * U+0027 APOSTROPHE (') Parse Error.
4912                                 */
4913                                errNoSpaceBetweenDoctypePublicKeywordAndQuote();
4914                                /*
4915                                 * Set the DOCTYPE token's public identifier to
4916                                 * the empty string (not missing),
4917                                 */
4918                                clearLongStrBuf();
4919                                /*
4920                                 * then switch to the DOCTYPE public identifier
4921                                 * (single-quoted) state.
4922                                 */
4923                                state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
4924                                continue stateloop;
4925                            case '>':
4926                                /* U+003E GREATER-THAN SIGN (>) Parse error. */
4927                                errExpectedPublicId();
4928                                /*
4929                                 * Set the DOCTYPE token's force-quirks flag to
4930                                 * on.
4931                                 */
4932                                forceQuirks = true;
4933                                /*
4934                                 * Emit that DOCTYPE token.
4935                                 */
4936                                emitDoctypeToken(pos);
4937                                /*
4938                                 * Switch to the data state.
4939                                 */
4940                                state = transition(state, Tokenizer.DATA, reconsume, pos);
4941                                continue stateloop;
4942                            default:
4943                                bogusDoctype();
4944                                /*
4945                                 * Set the DOCTYPE token's force-quirks flag to
4946                                 * on.
4947                                 */
4948                                // done by bogusDoctype();
4949                                /*
4950                                 * Switch to the bogus DOCTYPE state.
4951                                 */
4952                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
4953                                continue stateloop;
4954                        }
4955                    }
4956                    // FALLTHRU DON'T REORDER
4957                case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
4958                    beforedoctypepublicidentifierloop: for (;;) {
4959                        if (++pos == endPos) {
4960                            break stateloop;
4961                        }
4962                        c = checkChar(buf, pos);
4963                        /*
4964                         * Consume the next input character:
4965                         */
4966                        switch (c) {
4967                            case '\r':
4968                                silentCarriageReturn();
4969                                break stateloop;
4970                            case '\n':
4971                                silentLineFeed();
4972                                // fall thru
4973                            case ' ':
4974                            case '\t':
4975                            case '\u000C':
4976                                /*
4977                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
4978                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
4979                                 * in the before DOCTYPE public identifier
4980                                 * state.
4981                                 */
4982                                continue;
4983                            case '"':
4984                                /*
4985                                 * U+0022 QUOTATION MARK (") Set the DOCTYPE
4986                                 * token's public identifier to the empty string
4987                                 * (not missing),
4988                                 */
4989                                clearLongStrBuf();
4990                                /*
4991                                 * then switch to the DOCTYPE public identifier
4992                                 * (double-quoted) state.
4993                                 */
4994                                state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
4995                                break beforedoctypepublicidentifierloop;
4996                            // continue stateloop;
4997                            case '\'':
4998                                /*
4999                                 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5000                                 * public identifier to the empty string (not
5001                                 * missing),
5002                                 */
5003                                clearLongStrBuf();
5004                                /*
5005                                 * then switch to the DOCTYPE public identifier
5006                                 * (single-quoted) state.
5007                                 */
5008                                state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5009                                continue stateloop;
5010                            case '>':
5011                                /* U+003E GREATER-THAN SIGN (>) Parse error. */
5012                                errExpectedPublicId();
5013                                /*
5014                                 * Set the DOCTYPE token's force-quirks flag to
5015                                 * on.
5016                                 */
5017                                forceQuirks = true;
5018                                /*
5019                                 * Emit that DOCTYPE token.
5020                                 */
5021                                emitDoctypeToken(pos);
5022                                /*
5023                                 * Switch to the data state.
5024                                 */
5025                                state = transition(state, Tokenizer.DATA, reconsume, pos);
5026                                continue stateloop;
5027                            default:
5028                                bogusDoctype();
5029                                /*
5030                                 * Set the DOCTYPE token's force-quirks flag to
5031                                 * on.
5032                                 */
5033                                // done by bogusDoctype();
5034                                /*
5035                                 * Switch to the bogus DOCTYPE state.
5036                                 */
5037                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5038                                continue stateloop;
5039                        }
5040                    }
5041                    // FALLTHRU DON'T REORDER
5042                case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
5043                    doctypepublicidentifierdoublequotedloop: for (;;) {
5044                        if (++pos == endPos) {
5045                            break stateloop;
5046                        }
5047                        c = checkChar(buf, pos);
5048                        /*
5049                         * Consume the next input character:
5050                         */
5051                        switch (c) {
5052                            case '"':
5053                                /*
5054                                 * U+0022 QUOTATION MARK (") Switch to the after
5055                                 * DOCTYPE public identifier state.
5056                                 */
5057                                publicIdentifier = longStrBufToString();
5058                                state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5059                                break doctypepublicidentifierdoublequotedloop;
5060                            // continue stateloop;
5061                            case '>':
5062                                /*
5063                                 * U+003E GREATER-THAN SIGN (>) Parse error.
5064                                 */
5065                                errGtInPublicId();
5066                                /*
5067                                 * Set the DOCTYPE token's force-quirks flag to
5068                                 * on.
5069                                 */
5070                                forceQuirks = true;
5071                                /*
5072                                 * Emit that DOCTYPE token.
5073                                 */
5074                                publicIdentifier = longStrBufToString();
5075                                emitDoctypeToken(pos);
5076                                /*
5077                                 * Switch to the data state.
5078                                 */
5079                                state = transition(state, Tokenizer.DATA, reconsume, pos);
5080                                continue stateloop;
5081                            case '\r':
5082                                appendLongStrBufCarriageReturn();
5083                                break stateloop;
5084                            case '\n':
5085                                appendLongStrBufLineFeed();
5086                                continue;
5087                            case '\u0000':
5088                                c = '\uFFFD';
5089                                // fall thru
5090                            default:
5091                                /*
5092                                 * Anything else Append the current input
5093                                 * character to the current DOCTYPE token's
5094                                 * public identifier.
5095                                 */
5096                                appendLongStrBuf(c);
5097                                /*
5098                                 * Stay in the DOCTYPE public identifier
5099                                 * (double-quoted) state.
5100                                 */
5101                                continue;
5102                        }
5103                    }
5104                    // FALLTHRU DON'T REORDER
5105                case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
5106                    afterdoctypepublicidentifierloop: for (;;) {
5107                        if (++pos == endPos) {
5108                            break stateloop;
5109                        }
5110                        c = checkChar(buf, pos);
5111                        /*
5112                         * Consume the next input character:
5113                         */
5114                        switch (c) {
5115                            case '\r':
5116                                silentCarriageReturn();
5117                                state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
5118                                break stateloop;
5119                            case '\n':
5120                                silentLineFeed();
5121                                // fall thru
5122                            case ' ':
5123                            case '\t':
5124                            case '\u000C':
5125                                /*
5126                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5127                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
5128                                 * Switch to the between DOCTYPE public and
5129                                 * system identifiers state.
5130                                 */
5131                                state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
5132                                break afterdoctypepublicidentifierloop;
5133                            // continue stateloop;
5134                            case '>':
5135                                /*
5136                                 * U+003E GREATER-THAN SIGN (>) Emit the current
5137                                 * DOCTYPE token.
5138                                 */
5139                                emitDoctypeToken(pos);
5140                                /*
5141                                 * Switch to the data state.
5142                                 */
5143                                state = transition(state, Tokenizer.DATA, reconsume, pos);
5144                                continue stateloop;
5145                            case '"':
5146                                /*
5147                                 * U+0022 QUOTATION MARK (") Parse error.
5148                                 */
5149                                errNoSpaceBetweenPublicAndSystemIds();
5150                                /*
5151                                 * Set the DOCTYPE token's system identifier to
5152                                 * the empty string (not missing),
5153                                 */
5154                                clearLongStrBuf();
5155                                /*
5156                                 * then switch to the DOCTYPE system identifier
5157                                 * (double-quoted) state.
5158                                 */
5159                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5160                                continue stateloop;
5161                            case '\'':
5162                                /*
5163                                 * U+0027 APOSTROPHE (') Parse error.
5164                                 */
5165                                errNoSpaceBetweenPublicAndSystemIds();
5166                                /*
5167                                 * Set the DOCTYPE token's system identifier to
5168                                 * the empty string (not missing),
5169                                 */
5170                                clearLongStrBuf();
5171                                /*
5172                                 * then switch to the DOCTYPE system identifier
5173                                 * (single-quoted) state.
5174                                 */
5175                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5176                                continue stateloop;
5177                            default:
5178                                bogusDoctype();
5179                                /*
5180                                 * Set the DOCTYPE token's force-quirks flag to
5181                                 * on.
5182                                 */
5183                                // done by bogusDoctype();
5184                                /*
5185                                 * Switch to the bogus DOCTYPE state.
5186                                 */
5187                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5188                                continue stateloop;
5189                        }
5190                    }
5191                    // FALLTHRU DON'T REORDER
5192                case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
5193                    betweendoctypepublicandsystemidentifiersloop: for (;;) {
5194                        if (++pos == endPos) {
5195                            break stateloop;
5196                        }
5197                        c = checkChar(buf, pos);
5198                        /*
5199                         * Consume the next input character:
5200                         */
5201                        switch (c) {
5202                            case '\r':
5203                                silentCarriageReturn();
5204                                break stateloop;
5205                            case '\n':
5206                                silentLineFeed();
5207                                // fall thru
5208                            case ' ':
5209                            case '\t':
5210                            case '\u000C':
5211                                /*
5212                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5213                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5214                                 * in the between DOCTYPE public and system
5215                                 * identifiers state.
5216                                 */
5217                                continue;
5218                            case '>':
5219                                /*
5220                                 * U+003E GREATER-THAN SIGN (>) Emit the current
5221                                 * DOCTYPE token.
5222                                 */
5223                                emitDoctypeToken(pos);
5224                                /*
5225                                 * Switch to the data state.
5226                                 */
5227                                state = transition(state, Tokenizer.DATA, reconsume, pos);
5228                                continue stateloop;
5229                            case '"':
5230                                /*
5231                                 * U+0022 QUOTATION MARK (") Set the DOCTYPE
5232                                 * token's system identifier to the empty string
5233                                 * (not missing),
5234                                 */
5235                                clearLongStrBuf();
5236                                /*
5237                                 * then switch to the DOCTYPE system identifier
5238                                 * (double-quoted) state.
5239                                 */
5240                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5241                                break betweendoctypepublicandsystemidentifiersloop;
5242                            // continue stateloop;
5243                            case '\'':
5244                                /*
5245                                 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5246                                 * system identifier to the empty string (not
5247                                 * missing),
5248                                 */
5249                                clearLongStrBuf();
5250                                /*
5251                                 * then switch to the DOCTYPE system identifier
5252                                 * (single-quoted) state.
5253                                 */
5254                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5255                                continue stateloop;
5256                            default:
5257                                bogusDoctype();
5258                                /*
5259                                 * Set the DOCTYPE token's force-quirks flag to
5260                                 * on.
5261                                 */
5262                                // done by bogusDoctype();
5263                                /*
5264                                 * Switch to the bogus DOCTYPE state.
5265                                 */
5266                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5267                                continue stateloop;
5268                        }
5269                    }
5270                    // FALLTHRU DON'T REORDER
5271                case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
5272                    doctypesystemidentifierdoublequotedloop: for (;;) {
5273                        if (++pos == endPos) {
5274                            break stateloop;
5275                        }
5276                        c = checkChar(buf, pos);
5277                        /*
5278                         * Consume the next input character:
5279                         */
5280                        switch (c) {
5281                            case '"':
5282                                /*
5283                                 * U+0022 QUOTATION MARK (") Switch to the after
5284                                 * DOCTYPE system identifier state.
5285                                 */
5286                                systemIdentifier = longStrBufToString();
5287                                state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5288                                continue stateloop;
5289                            case '>':
5290                                /*
5291                                 * U+003E GREATER-THAN SIGN (>) Parse error.
5292                                 */
5293                                errGtInSystemId();
5294                                /*
5295                                 * Set the DOCTYPE token's force-quirks flag to
5296                                 * on.
5297                                 */
5298                                forceQuirks = true;
5299                                /*
5300                                 * Emit that DOCTYPE token.
5301                                 */
5302                                systemIdentifier = longStrBufToString();
5303                                emitDoctypeToken(pos);
5304                                /*
5305                                 * Switch to the data state.
5306                                 */
5307                                state = transition(state, Tokenizer.DATA, reconsume, pos);
5308                                continue stateloop;
5309                            case '\r':
5310                                appendLongStrBufCarriageReturn();
5311                                break stateloop;
5312                            case '\n':
5313                                appendLongStrBufLineFeed();
5314                                continue;
5315                            case '\u0000':
5316                                c = '\uFFFD';
5317                                // fall thru
5318                            default:
5319                                /*
5320                                 * Anything else Append the current input
5321                                 * character to the current DOCTYPE token's
5322                                 * system identifier.
5323                                 */
5324                                appendLongStrBuf(c);
5325                                /*
5326                                 * Stay in the DOCTYPE system identifier
5327                                 * (double-quoted) state.
5328                                 */
5329                                continue;
5330                        }
5331                    }
5332                    // FALLTHRU DON'T REORDER
5333                case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
5334                    afterdoctypesystemidentifierloop: for (;;) {
5335                        if (++pos == endPos) {
5336                            break stateloop;
5337                        }
5338                        c = checkChar(buf, pos);
5339                        /*
5340                         * Consume the next input character:
5341                         */
5342                        switch (c) {
5343                            case '\r':
5344                                silentCarriageReturn();
5345                                break stateloop;
5346                            case '\n':
5347                                silentLineFeed();
5348                                // fall thru
5349                            case ' ':
5350                            case '\t':
5351                            case '\u000C':
5352                                /*
5353                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5354                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5355                                 * in the after DOCTYPE system identifier state.
5356                                 */
5357                                continue;
5358                            case '>':
5359                                /*
5360                                 * U+003E GREATER-THAN SIGN (>) Emit the current
5361                                 * DOCTYPE token.
5362                                 */
5363                                emitDoctypeToken(pos);
5364                                /*
5365                                 * Switch to the data state.
5366                                 */
5367                                state = transition(state, Tokenizer.DATA, reconsume, pos);
5368                                continue stateloop;
5369                            default:
5370                                /*
5371                                 * Switch to the bogus DOCTYPE state. (This does
5372                                 * not set the DOCTYPE token's force-quirks flag
5373                                 * to on.)
5374                                 */
5375                                bogusDoctypeWithoutQuirks();
5376                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5377                                break afterdoctypesystemidentifierloop;
5378                            // continue stateloop;
5379                        }
5380                    }
5381                    // FALLTHRU DON'T REORDER
5382                case BOGUS_DOCTYPE:
5383                    for (;;) {
5384                        if (reconsume) {
5385                            reconsume = false;
5386                        } else {
5387                            if (++pos == endPos) {
5388                                break stateloop;
5389                            }
5390                            c = checkChar(buf, pos);
5391                        }
5392                        /*
5393                         * Consume the next input character:
5394                         */
5395                        switch (c) {
5396                            case '>':
5397                                /*
5398                                 * U+003E GREATER-THAN SIGN (>) Emit that
5399                                 * DOCTYPE token.
5400                                 */
5401                                emitDoctypeToken(pos);
5402                                /*
5403                                 * Switch to the data state.
5404                                 */
5405                                state = transition(state, Tokenizer.DATA, reconsume, pos);
5406                                continue stateloop;
5407                            case '\r':
5408                                silentCarriageReturn();
5409                                break stateloop;
5410                            case '\n':
5411                                silentLineFeed();
5412                                // fall thru
5413                            default:
5414                                /*
5415                                 * Anything else Stay in the bogus DOCTYPE
5416                                 * state.
5417                                 */
5418                                continue;
5419                        }
5420                    }
5421                    // XXX reorder point
5422                case DOCTYPE_YSTEM:
5423                    doctypeystemloop: for (;;) {
5424                        if (++pos == endPos) {
5425                            break stateloop;
5426                        }
5427                        c = checkChar(buf, pos);
5428                        /*
5429                         * Otherwise, if the six characters starting from the
5430                         * current input character are an ASCII case-insensitive
5431                         * match for the word "SYSTEM", then consume those
5432                         * characters and switch to the before DOCTYPE system
5433                         * identifier state.
5434                         */
5435                        if (index < 5) { // YSTEM.length
5436                            char folded = c;
5437                            if (c >= 'A' && c <= 'Z') {
5438                                folded += 0x20;
5439                            }
5440                            if (folded != Tokenizer.YSTEM[index]) {
5441                                bogusDoctype();
5442                                reconsume = true;
5443                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5444                                continue stateloop;
5445                            }
5446                            index++;
5447                            continue stateloop;
5448                        } else {
5449                            reconsume = true;
5450                            state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
5451                            break doctypeystemloop;
5452                            // continue stateloop;
5453                        }
5454                    }
5455                    // FALLTHRU DON'T REORDER
5456                case AFTER_DOCTYPE_SYSTEM_KEYWORD:
5457                    afterdoctypesystemkeywordloop: for (;;) {
5458                        if (reconsume) {
5459                            reconsume = false;
5460                        } else {
5461                            if (++pos == endPos) {
5462                                break stateloop;
5463                            }
5464                            c = checkChar(buf, pos);
5465                        }
5466                        /*
5467                         * Consume the next input character:
5468                         */
5469                        switch (c) {
5470                            case '\r':
5471                                silentCarriageReturn();
5472                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5473                                break stateloop;
5474                            case '\n':
5475                                silentLineFeed();
5476                                // fall thru
5477                            case ' ':
5478                            case '\t':
5479                            case '\u000C':
5480                                /*
5481                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5482                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE
5483                                 * Switch to the before DOCTYPE system
5484                                 * identifier state.
5485                                 */
5486                                state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5487                                break afterdoctypesystemkeywordloop;
5488                            // FALL THROUGH continue stateloop
5489                            case '"':
5490                                /*
5491                                 * U+0022 QUOTATION MARK (") Parse Error.
5492                                 */
5493                                errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
5494                                /*
5495                                 * Set the DOCTYPE token's system identifier to
5496                                 * the empty string (not missing),
5497                                 */
5498                                clearLongStrBuf();
5499                                /*
5500                                 * then switch to the DOCTYPE system identifier
5501                                 * (double-quoted) state.
5502                                 */
5503                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5504                                continue stateloop;
5505                            case '\'':
5506                                /*
5507                                 * U+0027 APOSTROPHE (') Parse Error.
5508                                 */
5509                                errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
5510                                /*
5511                                 * Set the DOCTYPE token's system identifier to
5512                                 * the empty string (not missing),
5513                                 */
5514                                clearLongStrBuf();
5515                                /*
5516                                 * then switch to the DOCTYPE system identifier
5517                                 * (single-quoted) state.
5518                                 */
5519                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5520                                continue stateloop;
5521                            case '>':
5522                                /* U+003E GREATER-THAN SIGN (>) Parse error. */
5523                                errExpectedPublicId();
5524                                /*
5525                                 * Set the DOCTYPE token's force-quirks flag to
5526                                 * on.
5527                                 */
5528                                forceQuirks = true;
5529                                /*
5530                                 * Emit that DOCTYPE token.
5531                                 */
5532                                emitDoctypeToken(pos);
5533                                /*
5534                                 * Switch to the data state.
5535                                 */
5536                                state = transition(state, Tokenizer.DATA, reconsume, pos);
5537                                continue stateloop;
5538                            default:
5539                                bogusDoctype();
5540                                /*
5541                                 * Set the DOCTYPE token's force-quirks flag to
5542                                 * on.
5543                                 */
5544                                // done by bogusDoctype();
5545                                /*
5546                                 * Switch to the bogus DOCTYPE state.
5547                                 */
5548                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5549                                continue stateloop;
5550                        }
5551                    }
5552                    // FALLTHRU DON'T REORDER
5553                case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
5554                    beforedoctypesystemidentifierloop: for (;;) {
5555                        if (++pos == endPos) {
5556                            break stateloop;
5557                        }
5558                        c = checkChar(buf, pos);
5559                        /*
5560                         * Consume the next input character:
5561                         */
5562                        switch (c) {
5563                            case '\r':
5564                                silentCarriageReturn();
5565                                break stateloop;
5566                            case '\n':
5567                                silentLineFeed();
5568                                // fall thru
5569                            case ' ':
5570                            case '\t':
5571                            case '\u000C':
5572                                /*
5573                                 * U+0009 CHARACTER TABULATION U+000A LINE FEED
5574                                 * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
5575                                 * in the before DOCTYPE system identifier
5576                                 * state.
5577                                 */
5578                                continue;
5579                            case '"':
5580                                /*
5581                                 * U+0022 QUOTATION MARK (") Set the DOCTYPE
5582                                 * token's system identifier to the empty string
5583                                 * (not missing),
5584                                 */
5585                                clearLongStrBuf();
5586                                /*
5587                                 * then switch to the DOCTYPE system identifier
5588                                 * (double-quoted) state.
5589                                 */
5590                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
5591                                continue stateloop;
5592                            case '\'':
5593                                /*
5594                                 * U+0027 APOSTROPHE (') Set the DOCTYPE token's
5595                                 * system identifier to the empty string (not
5596                                 * missing),
5597                                 */
5598                                clearLongStrBuf();
5599                                /*
5600                                 * then switch to the DOCTYPE system identifier
5601                                 * (single-quoted) state.
5602                                 */
5603                                state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
5604                                break beforedoctypesystemidentifierloop;
5605                            // continue stateloop;
5606                            case '>':
5607                                /* U+003E GREATER-THAN SIGN (>) Parse error. */
5608                                errExpectedSystemId();
5609                                /*
5610                                 * Set the DOCTYPE token's force-quirks flag to
5611                                 * on.
5612                                 */
5613                                forceQuirks = true;
5614                                /*
5615                                 * Emit that DOCTYPE token.
5616                                 */
5617                                emitDoctypeToken(pos);
5618                                /*
5619                                 * Switch to the data state.
5620                                 */
5621                                state = transition(state, Tokenizer.DATA, reconsume, pos);
5622                                continue stateloop;
5623                            default:
5624                                bogusDoctype();
5625                                /*
5626                                 * Set the DOCTYPE token's force-quirks flag to
5627                                 * on.
5628                                 */
5629                                // done by bogusDoctype();
5630                                /*
5631                                 * Switch to the bogus DOCTYPE state.
5632                                 */
5633                                state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
5634                                continue stateloop;
5635                        }
5636                    }
5637                    // FALLTHRU DON'T REORDER
5638                case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
5639                    for (;;) {
5640                        if (++pos == endPos) {
5641                            break stateloop;
5642                        }
5643                        c = checkChar(buf, pos);
5644                        /*
5645                         * Consume the next input character:
5646                         */
5647                        switch (c) {
5648                            case '\'':
5649                                /*
5650                                 * U+0027 APOSTROPHE (') Switch to the after
5651                                 * DOCTYPE system identifier state.
5652                                 */
5653                                systemIdentifier = longStrBufToString();
5654                                state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
5655                                continue stateloop;
5656                            case '>':
5657                                errGtInSystemId();
5658                                /*
5659                                 * Set the DOCTYPE token's force-quirks flag to
5660                                 * on.
5661                                 */
5662                                forceQuirks = true;
5663                                /*
5664                                 * Emit that DOCTYPE token.
5665                                 */
5666                                systemIdentifier = longStrBufToString();
5667                                emitDoctypeToken(pos);
5668                                /*
5669                                 * Switch to the data state.
5670                                 */
5671                                state = transition(state, Tokenizer.DATA, reconsume, pos);
5672                                continue stateloop;
5673                            case '\r':
5674                                appendLongStrBufCarriageReturn();
5675                                break stateloop;
5676                            case '\n':
5677                                appendLongStrBufLineFeed();
5678                                continue;
5679                            case '\u0000':
5680                                c = '\uFFFD';
5681                                // fall thru
5682                            default:
5683                                /*
5684                                 * Anything else Append the current input
5685                                 * character to the current DOCTYPE token's
5686                                 * system identifier.
5687                                 */
5688                                appendLongStrBuf(c);
5689                                /*
5690                                 * Stay in the DOCTYPE system identifier
5691                                 * (single-quoted) state.
5692                                 */
5693                                continue;
5694                        }
5695                    }
5696                    // XXX reorder point
5697                case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
5698                    for (;;) {
5699                        if (++pos == endPos) {
5700                            break stateloop;
5701                        }
5702                        c = checkChar(buf, pos);
5703                        /*
5704                         * Consume the next input character:
5705                         */
5706                        switch (c) {
5707                            case '\'':
5708                                /*
5709                                 * U+0027 APOSTROPHE (') Switch to the after
5710                                 * DOCTYPE public identifier state.
5711                                 */
5712                                publicIdentifier = longStrBufToString();
5713                                state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
5714                                continue stateloop;
5715                            case '>':
5716                                errGtInPublicId();
5717                                /*
5718                                 * Set the DOCTYPE token's force-quirks flag to
5719                                 * on.
5720                                 */
5721                                forceQuirks = true;
5722                                /*
5723                                 * Emit that DOCTYPE token.
5724                                 */
5725                                publicIdentifier = longStrBufToString();
5726                                emitDoctypeToken(pos);
5727                                /*
5728                                 * Switch to the data state.
5729                                 */
5730                                state = transition(state, Tokenizer.DATA, reconsume, pos);
5731                                continue stateloop;
5732                            case '\r':
5733                                appendLongStrBufCarriageReturn();
5734                                break stateloop;
5735                            case '\n':
5736                                appendLongStrBufLineFeed();
5737                                continue;
5738                            case '\u0000':
5739                                c = '\uFFFD';
5740                                // fall thru
5741                            default:
5742                                /*
5743                                 * Anything else Append the current input
5744                                 * character to the current DOCTYPE token's
5745                                 * public identifier.
5746                                 */
5747                                appendLongStrBuf(c);
5748                                /*
5749                                 * Stay in the DOCTYPE public identifier
5750                                 * (single-quoted) state.
5751                                 */
5752                                continue;
5753                        }
5754                    }
5755                    // XXX reorder point
5756                case PROCESSING_INSTRUCTION:
5757                    processinginstructionloop: for (;;) {
5758                        if (++pos == endPos) {
5759                            break stateloop;
5760                        }
5761                        c = checkChar(buf, pos);
5762                        switch (c) {
5763                            case '?':
5764                                state = transition(
5765                                        state,
5766                                        Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
5767                                        reconsume, pos);
5768                                break processinginstructionloop;
5769                            // continue stateloop;
5770                            default:
5771                                continue;
5772                        }
5773                    }
5774                case PROCESSING_INSTRUCTION_QUESTION_MARK:
5775                    if (++pos == endPos) {
5776                        break stateloop;
5777                    }
5778                    c = checkChar(buf, pos);
5779                    switch (c) {
5780                        case '>':
5781                            state = transition(state, Tokenizer.DATA,
5782                                    reconsume, pos);
5783                            continue stateloop;
5784                        default:
5785                            state = transition(state,
5786                                    Tokenizer.PROCESSING_INSTRUCTION,
5787                                    reconsume, pos);
5788                            continue stateloop;
5789                    }
5790                    // END HOTSPOT WORKAROUND
5791            }
5792        }
5793        flushChars(buf, pos);
5794        /*
5795         * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
5796         */
5797        // Save locals
5798        stateSave = state;
5799        returnStateSave = returnState;
5800        return pos;
5801    }
5802    
5803    // HOTSPOT WORKAROUND INSERTION POINT
5804    
5805    // [NOCPP[
5806    
5807    protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
5808        return to;
5809    }
5810
5811    // ]NOCPP]
5812    
5813    private void initDoctypeFields() {
5814        doctypeName = "";
5815        if (systemIdentifier != null) {
5816            Portability.releaseString(systemIdentifier);
5817            systemIdentifier = null;
5818        }
5819        if (publicIdentifier != null) {
5820            Portability.releaseString(publicIdentifier);
5821            publicIdentifier = null;
5822        }
5823        forceQuirks = false;
5824    }
5825
5826    @Inline private void adjustDoubleHyphenAndAppendToLongStrBufCarriageReturn()
5827            throws SAXException {
5828        silentCarriageReturn();
5829        adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
5830    }
5831
5832    @Inline private void adjustDoubleHyphenAndAppendToLongStrBufLineFeed()
5833            throws SAXException {
5834        silentLineFeed();
5835        adjustDoubleHyphenAndAppendToLongStrBufAndErr('\n');
5836    }
5837
5838    @Inline private void appendLongStrBufLineFeed() {
5839        silentLineFeed();
5840        appendLongStrBuf('\n');
5841    }
5842
5843    @Inline private void appendLongStrBufCarriageReturn() {
5844        silentCarriageReturn();
5845        appendLongStrBuf('\n');
5846    }
5847
5848    @Inline protected void silentCarriageReturn() {
5849        ++line;
5850        lastCR = true;
5851    }
5852
5853    @Inline protected void silentLineFeed() {
5854        ++line;
5855    }
5856
5857    private void emitCarriageReturn(@NoLength char[] buf, int pos)
5858            throws SAXException {
5859        silentCarriageReturn();
5860        flushChars(buf, pos);
5861        tokenHandler.characters(Tokenizer.LF, 0, 1);
5862        cstart = Integer.MAX_VALUE;
5863    }
5864
5865    private void emitReplacementCharacter(@NoLength char[] buf, int pos)
5866            throws SAXException {
5867        flushChars(buf, pos);
5868        tokenHandler.zeroOriginatingReplacementCharacter();
5869        cstart = pos + 1;
5870    }
5871
5872    private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
5873            throws SAXException {
5874        flushChars(buf, pos);
5875        tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
5876        cstart = pos + 1;
5877    }
5878
5879    private void setAdditionalAndRememberAmpersandLocation(char add) {
5880        additional = add;
5881        // [NOCPP[
5882        ampersandLocation = new LocatorImpl(this);
5883        // ]NOCPP]
5884    }
5885
5886    private void bogusDoctype() throws SAXException {
5887        errBogusDoctype();
5888        forceQuirks = true;
5889    }
5890
5891    private void bogusDoctypeWithoutQuirks() throws SAXException {
5892        errBogusDoctype();
5893        forceQuirks = false;
5894    }
5895
5896    private void emitOrAppendStrBuf(int returnState) throws SAXException {
5897        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
5898            appendStrBufToLongStrBuf();
5899        } else {
5900            emitStrBuf();
5901        }
5902    }
5903
5904    private void handleNcrValue(int returnState) throws SAXException {
5905        /*
5906         * If one or more characters match the range, then take them all and
5907         * interpret the string of characters as a number (either hexadecimal or
5908         * decimal as appropriate).
5909         */
5910        if (value <= 0xFFFF) {
5911            if (value >= 0x80 && value <= 0x9f) {
5912                /*
5913                 * If that number is one of the numbers in the first column of
5914                 * the following table, then this is a parse error.
5915                 */
5916                errNcrInC1Range();
5917                /*
5918                 * Find the row with that number in the first column, and return
5919                 * a character token for the Unicode character given in the
5920                 * second column of that row.
5921                 */
5922                @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
5923                emitOrAppendOne(val, returnState);
5924                // [NOCPP[
5925            } else if (value == 0xC
5926                    && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
5927                if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
5928                    emitOrAppendOne(Tokenizer.SPACE, returnState);
5929                } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
5930                    fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
5931                }
5932                // ]NOCPP]
5933            } else if (value == 0x0) {
5934                errNcrZero();
5935                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
5936            } else if ((value & 0xF800) == 0xD800) {
5937                errNcrSurrogate();
5938                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
5939            } else {
5940                /*
5941                 * Otherwise, return a character token for the Unicode character
5942                 * whose code point is that number.
5943                 */
5944                char ch = (char) value;
5945                // [NOCPP[
5946                if (value == 0x0D) {
5947                    errNcrCr();
5948                } else if ((value <= 0x0008) || (value == 0x000B)
5949                        || (value >= 0x000E && value <= 0x001F)) {
5950                    ch = errNcrControlChar(ch);
5951                } else if (value >= 0xFDD0 && value <= 0xFDEF) {
5952                    errNcrUnassigned();
5953                } else if ((value & 0xFFFE) == 0xFFFE) {
5954                    ch = errNcrNonCharacter(ch);
5955                } else if (value >= 0x007F && value <= 0x009F) {
5956                    errNcrControlChar();
5957                } else {
5958                    maybeWarnPrivateUse(ch);
5959                }
5960                // ]NOCPP]
5961                bmpChar[0] = ch;
5962                emitOrAppendOne(bmpChar, returnState);
5963            }
5964        } else if (value <= 0x10FFFF) {
5965            // [NOCPP[
5966            maybeWarnPrivateUseAstral();
5967            if ((value & 0xFFFE) == 0xFFFE) {
5968                errAstralNonCharacter(value);
5969            }
5970            // ]NOCPP]
5971            astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
5972            astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
5973            emitOrAppendTwo(astralChar, returnState);
5974        } else {
5975            errNcrOutOfRange();
5976            emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
5977        }
5978    }
5979
5980    public void eof() throws SAXException {
5981        int state = stateSave;
5982        int returnState = returnStateSave;
5983
5984        eofloop: for (;;) {
5985            switch (state) {
5986                case SCRIPT_DATA_LESS_THAN_SIGN:
5987                case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
5988                    /*
5989                     * Otherwise, emit a U+003C LESS-THAN SIGN character token
5990                     */
5991                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
5992                    /*
5993                     * and reconsume the current input character in the data
5994                     * state.
5995                     */
5996                    break eofloop;
5997                case TAG_OPEN:
5998                    /*
5999                     * The behavior of this state depends on the content model
6000                     * flag.
6001                     */
6002                    /*
6003                     * Anything else Parse error.
6004                     */
6005                    errEofAfterLt();
6006                    /*
6007                     * Emit a U+003C LESS-THAN SIGN character token
6008                     */
6009                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6010                    /*
6011                     * and reconsume the current input character in the data
6012                     * state.
6013                     */
6014                    break eofloop;
6015                case RAWTEXT_RCDATA_LESS_THAN_SIGN:
6016                    /*
6017                     * Emit a U+003C LESS-THAN SIGN character token
6018                     */
6019                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
6020                    /*
6021                     * and reconsume the current input character in the RCDATA
6022                     * state.
6023                     */
6024                    break eofloop;
6025                case NON_DATA_END_TAG_NAME:
6026                    /*
6027                     * Emit a U+003C LESS-THAN SIGN character token, a U+002F
6028                     * SOLIDUS character token,
6029                     */
6030                    tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
6031                    /*
6032                     * a character token for each of the characters in the
6033                     * temporary buffer (in the order they were added to the
6034                     * buffer),
6035                     */
6036                    emitStrBuf();
6037                    /*
6038                     * and reconsume the current input character in the RCDATA
6039                     * state.
6040                     */
6041                    break eofloop;
6042                case CLOSE_TAG_OPEN:
6043                    /* EOF Parse error. */
6044                    errEofAfterLt();
6045                    /*
6046                     * Emit a U+003C LESS-THAN SIGN character token and a U+002F
6047                     * SOLIDUS character token.
6048                     */
6049                    tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
6050                    /*
6051                     * Reconsume the EOF character in the data state.
6052                     */
6053                    break eofloop;
6054                case TAG_NAME:
6055                    /*
6056                     * EOF Parse error.
6057                     */
6058                    errEofInTagName();
6059                    /*
6060                     * Reconsume the EOF character in the data state.
6061                     */
6062                    break eofloop;
6063                case BEFORE_ATTRIBUTE_NAME:
6064                case AFTER_ATTRIBUTE_VALUE_QUOTED:
6065                case SELF_CLOSING_START_TAG:
6066                    /* EOF Parse error. */
6067                    errEofWithoutGt();
6068                    /*
6069                     * Reconsume the EOF character in the data state.
6070                     */
6071                    break eofloop;
6072                case ATTRIBUTE_NAME:
6073                    /*
6074                     * EOF Parse error.
6075                     */
6076                    errEofInAttributeName();
6077                    /*
6078                     * Reconsume the EOF character in the data state.
6079                     */
6080                    break eofloop;
6081                case AFTER_ATTRIBUTE_NAME:
6082                case BEFORE_ATTRIBUTE_VALUE:
6083                    /* EOF Parse error. */
6084                    errEofWithoutGt();
6085                    /*
6086                     * Reconsume the EOF character in the data state.
6087                     */
6088                    break eofloop;
6089                case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
6090                case ATTRIBUTE_VALUE_SINGLE_QUOTED:
6091                case ATTRIBUTE_VALUE_UNQUOTED:
6092                    /* EOF Parse error. */
6093                    errEofInAttributeValue();
6094                    /*
6095                     * Reconsume the EOF character in the data state.
6096                     */
6097                    break eofloop;
6098                case BOGUS_COMMENT:
6099                    emitComment(0, 0);
6100                    break eofloop;
6101                case BOGUS_COMMENT_HYPHEN:
6102                    // [NOCPP[
6103                    maybeAppendSpaceToBogusComment();
6104                    // ]NOCPP]
6105                    emitComment(0, 0);
6106                    break eofloop;
6107                case MARKUP_DECLARATION_OPEN:
6108                    errBogusComment();
6109                    clearLongStrBuf();
6110                    emitComment(0, 0);
6111                    break eofloop;
6112                case MARKUP_DECLARATION_HYPHEN:
6113                    errBogusComment();
6114                    emitComment(0, 0);
6115                    break eofloop;
6116                case MARKUP_DECLARATION_OCTYPE:
6117                    if (index < 6) {
6118                        errBogusComment();
6119                        emitComment(0, 0);
6120                    } else {
6121                        /* EOF Parse error. */
6122                        errEofInDoctype();
6123                        /*
6124                         * Create a new DOCTYPE token. Set its force-quirks flag
6125                         * to on.
6126                         */
6127                        doctypeName = "";
6128                        if (systemIdentifier != null) {
6129                            Portability.releaseString(systemIdentifier);
6130                            systemIdentifier = null;
6131                        }
6132                        if (publicIdentifier != null) {
6133                            Portability.releaseString(publicIdentifier);
6134                            publicIdentifier = null;
6135                        }
6136                        forceQuirks = true;
6137                        /*
6138                         * Emit the token.
6139                         */
6140                        emitDoctypeToken(0);
6141                        /*
6142                         * Reconsume the EOF character in the data state.
6143                         */
6144                        break eofloop;
6145                    }
6146                    break eofloop;
6147                case COMMENT_START:
6148                case COMMENT:
6149                    /*
6150                     * EOF Parse error.
6151                     */
6152                    errEofInComment();
6153                    /* Emit the comment token. */
6154                    emitComment(0, 0);
6155                    /*
6156                     * Reconsume the EOF character in the data state.
6157                     */
6158                    break eofloop;
6159                case COMMENT_END:
6160                    errEofInComment();
6161                    /* Emit the comment token. */
6162                    emitComment(2, 0);
6163                    /*
6164                     * Reconsume the EOF character in the data state.
6165                     */
6166                    break eofloop;
6167                case COMMENT_END_DASH:
6168                case COMMENT_START_DASH:
6169                    errEofInComment();
6170                    /* Emit the comment token. */
6171                    emitComment(1, 0);
6172                    /*
6173                     * Reconsume the EOF character in the data state.
6174                     */
6175                    break eofloop;
6176                case COMMENT_END_BANG:
6177                    errEofInComment();
6178                    /* Emit the comment token. */
6179                    emitComment(3, 0);
6180                    /*
6181                     * Reconsume the EOF character in the data state.
6182                     */
6183                    break eofloop;
6184                case DOCTYPE:
6185                case BEFORE_DOCTYPE_NAME:
6186                    errEofInDoctype();
6187                    /*
6188                     * Create a new DOCTYPE token. Set its force-quirks flag to
6189                     * on.
6190                     */
6191                    forceQuirks = true;
6192                    /*
6193                     * Emit the token.
6194                     */
6195                    emitDoctypeToken(0);
6196                    /*
6197                     * Reconsume the EOF character in the data state.
6198                     */
6199                    break eofloop;
6200                case DOCTYPE_NAME:
6201                    errEofInDoctype();
6202                    strBufToDoctypeName();
6203                    /*
6204                     * Set the DOCTYPE token's force-quirks flag to on.
6205                     */
6206                    forceQuirks = true;
6207                    /*
6208                     * Emit that DOCTYPE token.
6209                     */
6210                    emitDoctypeToken(0);
6211                    /*
6212                     * Reconsume the EOF character in the data state.
6213                     */
6214                    break eofloop;
6215                case DOCTYPE_UBLIC:
6216                case DOCTYPE_YSTEM:
6217                case AFTER_DOCTYPE_NAME:
6218                case AFTER_DOCTYPE_PUBLIC_KEYWORD:
6219                case AFTER_DOCTYPE_SYSTEM_KEYWORD:
6220                case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
6221                    errEofInDoctype();
6222                    /*
6223                     * Set the DOCTYPE token's force-quirks flag to on.
6224                     */
6225                    forceQuirks = true;
6226                    /*
6227                     * Emit that DOCTYPE token.
6228                     */
6229                    emitDoctypeToken(0);
6230                    /*
6231                     * Reconsume the EOF character in the data state.
6232                     */
6233                    break eofloop;
6234                case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
6235                case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
6236                    /* EOF Parse error. */
6237                    errEofInPublicId();
6238                    /*
6239                     * Set the DOCTYPE token's force-quirks flag to on.
6240                     */
6241                    forceQuirks = true;
6242                    /*
6243                     * Emit that DOCTYPE token.
6244                     */
6245                    publicIdentifier = longStrBufToString();
6246                    emitDoctypeToken(0);
6247                    /*
6248                     * Reconsume the EOF character in the data state.
6249                     */
6250                    break eofloop;
6251                case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
6252                case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
6253                case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
6254                    errEofInDoctype();
6255                    /*
6256                     * Set the DOCTYPE token's force-quirks flag to on.
6257                     */
6258                    forceQuirks = true;
6259                    /*
6260                     * Emit that DOCTYPE token.
6261                     */
6262                    emitDoctypeToken(0);
6263                    /*
6264                     * Reconsume the EOF character in the data state.
6265                     */
6266                    break eofloop;
6267                case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
6268                case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
6269                    /* EOF Parse error. */
6270                    errEofInSystemId();
6271                    /*
6272                     * Set the DOCTYPE token's force-quirks flag to on.
6273                     */
6274                    forceQuirks = true;
6275                    /*
6276                     * Emit that DOCTYPE token.
6277                     */
6278                    systemIdentifier = longStrBufToString();
6279                    emitDoctypeToken(0);
6280                    /*
6281                     * Reconsume the EOF character in the data state.
6282                     */
6283                    break eofloop;
6284                case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
6285                    errEofInDoctype();
6286                    /*
6287                     * Set the DOCTYPE token's force-quirks flag to on.
6288                     */
6289                    forceQuirks = true;
6290                    /*
6291                     * Emit that DOCTYPE token.
6292                     */
6293                    emitDoctypeToken(0);
6294                    /*
6295                     * Reconsume the EOF character in the data state.
6296                     */
6297                    break eofloop;
6298                case BOGUS_DOCTYPE:
6299                    /*
6300                     * Emit that DOCTYPE token.
6301                     */
6302                    emitDoctypeToken(0);
6303                    /*
6304                     * Reconsume the EOF character in the data state.
6305                     */
6306                    break eofloop;
6307                case CONSUME_CHARACTER_REFERENCE:
6308                    /*
6309                     * Unlike the definition is the spec, this state does not
6310                     * return a value and never requires the caller to
6311                     * backtrack. This state takes care of emitting characters
6312                     * or appending to the current attribute value. It also
6313                     * takes care of that in the case when consuming the entity
6314                     * fails.
6315                     */
6316                    /*
6317                     * This section defines how to consume an entity. This
6318                     * definition is used when parsing entities in text and in
6319                     * attributes.
6320                     * 
6321                     * The behavior depends on the identity of the next
6322                     * character (the one immediately after the U+0026 AMPERSAND
6323                     * character):
6324                     */
6325
6326                    emitOrAppendStrBuf(returnState);
6327                    state = returnState;
6328                    continue;
6329                case CHARACTER_REFERENCE_HILO_LOOKUP:
6330                    errNoNamedCharacterMatch();
6331                    emitOrAppendStrBuf(returnState);
6332                    state = returnState;
6333                    continue;
6334                case CHARACTER_REFERENCE_TAIL:
6335                    outer: for (;;) {
6336                        char c = '\u0000';
6337                        entCol++;
6338                        /*
6339                         * Consume the maximum number of characters possible,
6340                         * with the consumed characters matching one of the
6341                         * identifiers in the first column of the named
6342                         * character references table (in a case-sensitive
6343                         * manner).
6344                         */
6345                        hiloop: for (;;) {
6346                            if (hi == -1) {
6347                                break hiloop;
6348                            }
6349                            if (entCol == NamedCharacters.NAMES[hi].length()) {
6350                                break hiloop;
6351                            }
6352                            if (entCol > NamedCharacters.NAMES[hi].length()) {
6353                                break outer;
6354                            } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
6355                                hi--;
6356                            } else {
6357                                break hiloop;
6358                            }
6359                        }
6360
6361                        loloop: for (;;) {
6362                            if (hi < lo) {
6363                                break outer;
6364                            }
6365                            if (entCol == NamedCharacters.NAMES[lo].length()) {
6366                                candidate = lo;
6367                                strBufMark = strBufLen;
6368                                lo++;
6369                            } else if (entCol > NamedCharacters.NAMES[lo].length()) {
6370                                break outer;
6371                            } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
6372                                lo++;
6373                            } else {
6374                                break loloop;
6375                            }
6376                        }
6377                        if (hi < lo) {
6378                            break outer;
6379                        }
6380                        continue;
6381                    }
6382
6383                    if (candidate == -1) {
6384                        /*
6385                         * If no match can be made, then this is a parse error.
6386                         */
6387                        errNoNamedCharacterMatch();
6388                        emitOrAppendStrBuf(returnState);
6389                        state = returnState;
6390                        continue eofloop;
6391                    } else {
6392                        @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
6393                        if (candidateName.length() == 0
6394                                || candidateName.charAt(candidateName.length() - 1) != ';') {
6395                            /*
6396                             * If the last character matched is not a U+003B
6397                             * SEMICOLON (;), there is a parse error.
6398                             */
6399                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6400                                /*
6401                                 * If the entity is being consumed as part of an
6402                                 * attribute, and the last character matched is
6403                                 * not a U+003B SEMICOLON (;),
6404                                 */
6405                                char ch;
6406                                if (strBufMark == strBufLen) {
6407                                    ch = '\u0000';
6408                                } else {
6409                                    ch = strBuf[strBufMark];
6410                                }
6411                                if ((ch >= '0' && ch <= '9')
6412                                        || (ch >= 'A' && ch <= 'Z')
6413                                        || (ch >= 'a' && ch <= 'z')) {
6414                                    /*
6415                                     * and the next character is in the range
6416                                     * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
6417                                     * U+0041 LATIN CAPITAL LETTER A to U+005A
6418                                     * LATIN CAPITAL LETTER Z, or U+0061 LATIN
6419                                     * SMALL LETTER A to U+007A LATIN SMALL
6420                                     * LETTER Z, then, for historical reasons,
6421                                     * all the characters that were matched
6422                                     * after the U+0026 AMPERSAND (&) must be
6423                                     * unconsumed, and nothing is returned.
6424                                     */
6425                                    errNoNamedCharacterMatch();
6426                                    appendStrBufToLongStrBuf();
6427                                    state = returnState;
6428                                    continue eofloop;
6429                                }
6430                            }
6431                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6432                                errUnescapedAmpersandInterpretedAsCharacterReference();
6433                            } else {
6434                                errNotSemicolonTerminated();
6435                            }
6436                        }
6437
6438                        /*
6439                         * Otherwise, return a character token for the character
6440                         * corresponding to the entity name (as given by the
6441                         * second column of the named character references
6442                         * table).
6443                         */
6444                        @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
6445                        if (
6446                        // [NOCPP[
6447                        val.length == 1
6448                        // ]NOCPP]
6449                        // CPPONLY: val[1] == 0
6450                        ) {
6451                            emitOrAppendOne(val, returnState);
6452                        } else {
6453                            emitOrAppendTwo(val, returnState);
6454                        }
6455                        // this is so complicated!
6456                        if (strBufMark < strBufLen) {
6457                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6458                                for (int i = strBufMark; i < strBufLen; i++) {
6459                                    appendLongStrBuf(strBuf[i]);
6460                                }
6461                            } else {
6462                                tokenHandler.characters(strBuf, strBufMark,
6463                                        strBufLen - strBufMark);
6464                            }
6465                        }
6466                        state = returnState;
6467                        continue eofloop;
6468                        /*
6469                         * If the markup contains I'm &notit; I tell you, the
6470                         * entity is parsed as "not", as in, I'm ¬it; I tell
6471                         * you. But if the markup was I'm &notin; I tell you,
6472                         * the entity would be parsed as "notin;", resulting in
6473                         * I'm ∉ I tell you.
6474                         */
6475                    }
6476                case CONSUME_NCR:
6477                case DECIMAL_NRC_LOOP:
6478                case HEX_NCR_LOOP:
6479                    /*
6480                     * If no characters match the range, then don't consume any
6481                     * characters (and unconsume the U+0023 NUMBER SIGN
6482                     * character and, if appropriate, the X character). This is
6483                     * a parse error; nothing is returned.
6484                     * 
6485                     * Otherwise, if the next character is a U+003B SEMICOLON,
6486                     * consume that too. If it isn't, there is a parse error.
6487                     */
6488                    if (!seenDigits) {
6489                        errNoDigitsInNCR();
6490                        emitOrAppendStrBuf(returnState);
6491                        state = returnState;
6492                        continue;
6493                    } else {
6494                        errCharRefLacksSemicolon();
6495                    }
6496                    // WARNING previous state sets reconsume
6497                    handleNcrValue(returnState);
6498                    state = returnState;
6499                    continue;
6500                case CDATA_RSQB:
6501                    tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
6502                    break eofloop;
6503                case CDATA_RSQB_RSQB:
6504                    tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
6505                    break eofloop;
6506                case DATA:
6507                default:
6508                    break eofloop;
6509            }
6510        }
6511        // case DATA:
6512        /*
6513         * EOF Emit an end-of-file token.
6514         */
6515        tokenHandler.eof();
6516        return;
6517    }
6518
6519    private void emitDoctypeToken(int pos) throws SAXException {
6520        cstart = pos + 1;
6521        tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
6522                forceQuirks);
6523        // It is OK and sufficient to release these here, since
6524        // there's no way out of the doctype states than through paths
6525        // that call this method.
6526        doctypeName = null;
6527        Portability.releaseString(publicIdentifier);
6528        publicIdentifier = null;
6529        Portability.releaseString(systemIdentifier);
6530        systemIdentifier = null;
6531    }
6532
    /**
     * Reads the character at index <code>pos</code> of <code>buf</code>.
     * This implementation performs the array read and nothing else.
     * NOTE(review): the method is protected and declares
     * <code>SAXException</code> although this body cannot throw it --
     * presumably so that a subclass can override it with per-character
     * checks; confirm against subclasses.
     * 
     * @param buf
     *            the buffer to read from
     * @param pos
     *            the index to read
     * @return <code>buf[pos]</code>
     * @throws SAXException
     *             never thrown by this implementation
     */
    @Inline protected char checkChar(@NoLength char[] buf, int pos)
            throws SAXException {
        return buf[pos];
    }
6537
6538    // [NOCPP[
6539
    /**
     * Tells whether a complaint about non-ASCII input has already been made.
     * This implementation consults no state and always answers
     * <code>true</code>.
     * NOTE(review): presumably a subclass that actually tracks the complaint
     * overrides this -- confirm.
     * 
     * @return <code>true</code>, unconditionally, in this class
     */
    public boolean isAlreadyComplainedAboutNonAscii() {
        return true;
    }
6548
6549    // ]NOCPP]
6550
6551    public boolean internalEncodingDeclaration(String internalCharset)
6552            throws SAXException {
6553        if (encodingDeclarationHandler != null) {
6554            return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
6555        }
6556        return false;
6557    }
6558
6559    /**
6560     * @param val
6561     * @throws SAXException
6562     */
6563    private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
6564            throws SAXException {
6565        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6566            appendLongStrBuf(val[0]);
6567            appendLongStrBuf(val[1]);
6568        } else {
6569            tokenHandler.characters(val, 0, 2);
6570        }
6571    }
6572
6573    private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
6574            throws SAXException {
6575        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
6576            appendLongStrBuf(val[0]);
6577        } else {
6578            tokenHandler.characters(val, 0, 1);
6579        }
6580    }
6581
6582    public void end() throws SAXException {
6583        strBuf = null;
6584        longStrBuf = null;
6585        doctypeName = null;
6586        if (systemIdentifier != null) {
6587            Portability.releaseString(systemIdentifier);
6588            systemIdentifier = null;
6589        }
6590        if (publicIdentifier != null) {
6591            Portability.releaseString(publicIdentifier);
6592            publicIdentifier = null;
6593        }
6594        if (tagName != null) {
6595            tagName.release();
6596            tagName = null;
6597        }
6598        if (attributeName != null) {
6599            attributeName.release();
6600            attributeName = null;
6601        }
6602        tokenHandler.endTokenization();
6603        if (attributes != null) {
6604            attributes.clear(mappingLangToXmlLang);
6605            Portability.delete(attributes);
6606            attributes = null;
6607        }
6608    }
6609
    /**
     * Sets the <code>shouldSuspend</code> flag. This method only records the
     * request; the code that checks the flag is elsewhere in the class.
     */
    public void requestSuspension() {
        shouldSuspend = true;
    }
6613
6614    // [NOCPP[
6615    
    /**
     * Sets the <code>confident</code> flag to <code>true</code>.
     * NOTE(review): presumably this is confidence in the character encoding
     * -- confirm against the field's declaration and its readers.
     */
    public void becomeConfident() {
        confident = true;
    }
6619
    /**
     * Tells whether the next character is on a new line. This implementation
     * consults no state and always answers <code>false</code>.
     * NOTE(review): presumably a location-tracking subclass overrides this
     * -- confirm.
     * 
     * @return <code>false</code>, unconditionally, in this class
     */
    public boolean isNextCharOnNewLine() {
        return false;
    }
6628
    /**
     * Returns the current value of the <code>lastCR</code> flag.
     * NOTE(review): by its name the flag records that the previously seen
     * character was a carriage return -- confirm where it is set.
     * 
     * @return the <code>lastCR</code> field
     */
    public boolean isPrevCR() {
        return lastCR;
    }
6632
    /**
     * Line number accessor. This implementation does not read the
     * <code>line</code> field and always returns <code>-1</code>.
     * NOTE(review): presumably a location-tracking subclass overrides this
     * -- confirm.
     * 
     * @return <code>-1</code>, unconditionally, in this class
     */
    public int getLine() {
        return -1;
    }
6641
    /**
     * Column number accessor. This implementation consults no state and
     * always returns <code>-1</code>.
     * NOTE(review): presumably a location-tracking subclass overrides this
     * -- confirm.
     * 
     * @return <code>-1</code>, unconditionally, in this class
     */
    public int getCol() {
        return -1;
    }
6650
6651    // ]NOCPP]
6652    
    /**
     * Tells whether the saved tokenizer state is the data state.
     * 
     * @return <code>true</code> if <code>stateSave</code> equals
     *         <code>DATA</code>
     */
    public boolean isInDataState() {
        return (stateSave == DATA);
    }
6656
    /**
     * Puts the tokenizer back into the data state with cleared per-token
     * bookkeeping.
     * 
     * Both string buffers are emptied by zeroing their lengths (the arrays
     * themselves are kept), the saved state becomes <code>DATA</code>, the
     * character reference lookup and numeric reference fields are set back
     * to their start values, the suspension request is cleared, the doctype
     * fields are reinitialized through <code>initDoctypeFields()</code>, and
     * any pending tag name and attribute name are released.
     * 
     * The attribute holder is deleted only inside the
     * <code>newAttributesEachTime</code> test. That test is fenced by NOCPP
     * markers, so presumably the deletion is unconditional in the C++
     * translation -- confirm against the translator.
     */
    public void resetToDataState() {
        // Empty both buffers without reallocating them.
        strBufLen = 0;
        longStrBufLen = 0;
        stateSave = Tokenizer.DATA;
        // line = 1; XXX line numbers
        lastCR = false;
        index = 0;
        forceQuirks = false;
        additional = '\u0000';
        // Named character reference lookup state.
        entCol = -1;
        firstCharKey = -1;
        lo = 0;
        hi = 0; // will always be overwritten before use anyway
        candidate = -1;
        strBufMark = 0;
        // Numeric character reference state.
        prevValue = -1;
        value = 0;
        seenDigits = false;
        endTag = false;
        shouldSuspend = false;
        initDoctypeFields();
        if (tagName != null) {
            tagName.release();
            tagName = null;
        }
        if (attributeName != null) {
            attributeName.release();
            attributeName = null;
        }
        // [NOCPP[
        if (newAttributesEachTime) {
            // ]NOCPP]
            if (attributes != null) {
                Portability.delete(attributes);
                attributes = null;
            }
            // [NOCPP[
        }
        // ]NOCPP]
    }
6697
6698    public void loadState(Tokenizer other) throws SAXException {
6699        strBufLen = other.strBufLen;
6700        if (strBufLen > strBuf.length) {
6701            strBuf = new char[strBufLen];
6702        }
6703        System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);
6704
6705        longStrBufLen = other.longStrBufLen;
6706        if (longStrBufLen > longStrBuf.length) {
6707            longStrBuf = new char[longStrBufLen];
6708        }
6709        System.arraycopy(other.longStrBuf, 0, longStrBuf, 0, longStrBufLen);
6710
6711        stateSave = other.stateSave;
6712        returnStateSave = other.returnStateSave;
6713        endTagExpectation = other.endTagExpectation;
6714        endTagExpectationAsArray = other.endTagExpectationAsArray;
6715        // line = 1; XXX line numbers
6716        lastCR = other.lastCR;
6717        index = other.index;
6718        forceQuirks = other.forceQuirks;
6719        additional = other.additional;
6720        entCol = other.entCol;
6721        firstCharKey = other.firstCharKey;
6722        lo = other.lo;
6723        hi = other.hi;
6724        candidate = other.candidate;
6725        strBufMark = other.strBufMark;
6726        prevValue = other.prevValue;
6727        value = other.value;
6728        seenDigits = other.seenDigits;
6729        endTag = other.endTag;
6730        shouldSuspend = false;
6731
6732        if (other.doctypeName == null) {
6733            doctypeName = null;
6734        } else {
6735            doctypeName = Portability.newLocalFromLocal(other.doctypeName,
6736                    interner);
6737        }
6738
6739        Portability.releaseString(systemIdentifier);
6740        if (other.systemIdentifier == null) {
6741            systemIdentifier = null;
6742        } else {
6743            systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
6744        }
6745
6746        Portability.releaseString(publicIdentifier);
6747        if (other.publicIdentifier == null) {
6748            publicIdentifier = null;
6749        } else {
6750            publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
6751        }
6752
6753        if (tagName != null) {
6754            tagName.release();
6755        }
6756        if (other.tagName == null) {
6757            tagName = null;
6758        } else {
6759            tagName = other.tagName.cloneElementName(interner);
6760        }
6761
6762        if (attributeName != null) {
6763            attributeName.release();
6764        }
6765        if (other.attributeName == null) {
6766            attributeName = null;
6767        } else {
6768            attributeName = other.attributeName.cloneAttributeName(interner);
6769        }
6770
6771        Portability.delete(attributes);
6772        if (other.attributes == null) {
6773            attributes = null;
6774        } else {
6775            attributes = other.attributes.cloneAttributes(interner);
6776        }
6777    }
6778
    /**
     * Prepares the tokenizer for a new run: clears the
     * <code>confident</code> flag, allocates fresh string buffers (64 and
     * 1024 chars), sets <code>line</code> to 1 and finishes with
     * {@link #resetToDataState()}.
     * 
     * The statements between the NOCPP markers additionally clear
     * <code>html4</code> and <code>metaBoundaryPassed</code>, ask the token
     * handler whether it wants comments, and create the attribute holder up
     * front when <code>newAttributesEachTime</code> is not set.
     * 
     * @throws SAXException
     *             declared; this body does not throw it directly
     */
    public void initializeWithoutStarting() throws SAXException {
        confident = false;
        strBuf = new char[64];
        longStrBuf = new char[1024];
        line = 1;
        // [NOCPP[
        html4 = false;
        metaBoundaryPassed = false;
        wantsComments = tokenHandler.wantsComments();
        if (!newAttributesEachTime) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }
        // ]NOCPP]
        resetToDataState();
    }
6794
    // ------------------------------------------------------------------
    // Error hooks.
    //
    // Every err* method below has an empty body in this class, so calling
    // one has no effect here. They are protected and declare SAXException;
    // NOTE(review): presumably an error-reporting subclass overrides them
    // to turn each condition into a reported parse error -- confirm
    // against subclasses. Where a hook takes a char, that char is supplied
    // by the call site.
    // ------------------------------------------------------------------

    protected void errGarbageAfterLtSlash() throws SAXException {
    }

    protected void errLtSlashGt() throws SAXException {
    }

    protected void errWarnLtSlashInRcdata() throws SAXException {
    }

    protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
    }

    // Called (at least) from the end-of-file handling of the numeric
    // character reference states when digits were seen.
    protected void errCharRefLacksSemicolon() throws SAXException {
    }

    // Called (at least) from the end-of-file handling of the numeric
    // character reference states when no digits were seen.
    protected void errNoDigitsInNCR() throws SAXException {
    }

    protected void errGtInSystemId() throws SAXException {
    }

    protected void errGtInPublicId() throws SAXException {
    }

    protected void errNamelessDoctype() throws SAXException {
    }

    protected void errConsecutiveHyphens() throws SAXException {
    }

    protected void errPrematureEndOfComment() throws SAXException {
    }

    protected void errBogusComment() throws SAXException {
    }

    protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
    }

    protected void errSlashNotFollowedByGt() throws SAXException {
    }

    protected void errHtml4XmlVoidSyntax() throws SAXException {
    }

    protected void errNoSpaceBetweenAttributes() throws SAXException {
    }

    protected void errHtml4NonNameInUnquotedAttribute(char c)
            throws SAXException {
    }

    protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
            throws SAXException {
    }

    protected void errAttributeValueMissing() throws SAXException {
    }

    protected void errBadCharBeforeAttributeNameOrNull(char c)
            throws SAXException {
    }

    protected void errEqualsSignBeforeAttributeName() throws SAXException {
    }

    protected void errBadCharAfterLt(char c) throws SAXException {
    }

    protected void errLtGt() throws SAXException {
    }

    protected void errProcessingInstruction() throws SAXException {
    }

    // Called (at least) from the end-of-file handling of named character
    // references when the matched name lacks a trailing semicolon and
    // (returnState & DATA_AND_RCDATA_MASK) != 0.
    protected void errUnescapedAmpersandInterpretedAsCharacterReference()
            throws SAXException {
    }

    // Called (at least) from the end-of-file handling of named character
    // references when the matched name lacks a trailing semicolon and
    // (returnState & DATA_AND_RCDATA_MASK) == 0.
    protected void errNotSemicolonTerminated() throws SAXException {
    }

    // Called (at least) from the end-of-file handling of named character
    // references on the path where the matched characters are given back
    // unconsumed instead of being treated as a reference.
    protected void errNoNamedCharacterMatch() throws SAXException {
    }

    protected void errQuoteBeforeAttributeName(char c) throws SAXException {
    }

    protected void errQuoteOrLtInAttributeNameOrNull(char c)
            throws SAXException {
    }

    protected void errExpectedPublicId() throws SAXException {
    }

    protected void errBogusDoctype() throws SAXException {
    }
6892
6893