PageRenderTime 149ms CodeModel.GetById 59ms app.highlight 71ms RepoModel.GetById 1ms app.codeStats 2ms

/parser/html/javasrc/Tokenizer.java

http://github.com/zpao/v8monkey
Java | 7027 lines | 3908 code | 381 blank | 2738 comment | 760 complexity | fb65ba7ba7f99f659d89e51498e7a0f4 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/*
   2 * Copyright (c) 2005-2007 Henri Sivonen
   3 * Copyright (c) 2007-2010 Mozilla Foundation
   4 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla 
   5 * Foundation, and Opera Software ASA.
   6 *
   7 * Permission is hereby granted, free of charge, to any person obtaining a 
   8 * copy of this software and associated documentation files (the "Software"), 
   9 * to deal in the Software without restriction, including without limitation 
  10 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
  11 * and/or sell copies of the Software, and to permit persons to whom the 
  12 * Software is furnished to do so, subject to the following conditions:
  13 *
  14 * The above copyright notice and this permission notice shall be included in 
  15 * all copies or substantial portions of the Software.
  16 *
  17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
  18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
  19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
  20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
  21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
  22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
  23 * DEALINGS IN THE SOFTWARE.
  24 */
  25
  26/*
  27 * The comments following this one that use the same comment syntax as this 
  28 * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 
  29 * amended as of June 18 2008 and May 31 2010.
  30 * That document came with this statement:
  31 * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and 
  32 * Opera Software ASA. You are granted a license to use, reproduce and 
  33 * create derivative works of this document."
  34 */
  35
  36package nu.validator.htmlparser.impl;
  37
  38import nu.validator.htmlparser.annotation.Auto;
  39import nu.validator.htmlparser.annotation.CharacterName;
  40import nu.validator.htmlparser.annotation.Const;
  41import nu.validator.htmlparser.annotation.Inline;
  42import nu.validator.htmlparser.annotation.Local;
  43import nu.validator.htmlparser.annotation.NoLength;
  44import nu.validator.htmlparser.common.EncodingDeclarationHandler;
  45import nu.validator.htmlparser.common.Interner;
  46import nu.validator.htmlparser.common.TokenHandler;
  47import nu.validator.htmlparser.common.XmlViolationPolicy;
  48
  49import org.xml.sax.ErrorHandler;
  50import org.xml.sax.Locator;
  51import org.xml.sax.SAXException;
  52import org.xml.sax.SAXParseException;
  53
  54/**
  55 * An implementation of
  56 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
  57 * 
  58 * This class implements the <code>Locator</code> interface. This is not an
  59 * incidental implementation detail: Users of this class are encouraged to make
  60 * use of the <code>Locator</code> nature.
  61 * 
  62 * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
  63 * can be configured to treat these conditions as fatal or to coerce the infoset
  64 * to something that XML 1.0 allows.
  65 * 
  66 * @version $Id$
  67 * @author hsivonen
  68 */
  69public class Tokenizer implements Locator {
  70
  71    private static final int DATA_AND_RCDATA_MASK = ~1;
  72
  73    public static final int DATA = 0;
  74
  75    public static final int RCDATA = 1;
  76
  77    public static final int SCRIPT_DATA = 2;
  78
  79    public static final int RAWTEXT = 3;
  80
  81    public static final int SCRIPT_DATA_ESCAPED = 4;
  82
  83    public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
  84
  85    public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
  86
  87    public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
  88
  89    public static final int PLAINTEXT = 8;
  90
  91    public static final int TAG_OPEN = 9;
  92
  93    public static final int CLOSE_TAG_OPEN = 10;
  94
  95    public static final int TAG_NAME = 11;
  96
  97    public static final int BEFORE_ATTRIBUTE_NAME = 12;
  98
  99    public static final int ATTRIBUTE_NAME = 13;
 100
 101    public static final int AFTER_ATTRIBUTE_NAME = 14;
 102
 103    public static final int BEFORE_ATTRIBUTE_VALUE = 15;
 104
 105    public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
 106
 107    public static final int BOGUS_COMMENT = 17;
 108
 109    public static final int MARKUP_DECLARATION_OPEN = 18;
 110
 111    public static final int DOCTYPE = 19;
 112
 113    public static final int BEFORE_DOCTYPE_NAME = 20;
 114
 115    public static final int DOCTYPE_NAME = 21;
 116
 117    public static final int AFTER_DOCTYPE_NAME = 22;
 118
 119    public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
 120
 121    public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
 122
 123    public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
 124
 125    public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
 126
 127    public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
 128
 129    public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
 130
 131    public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
 132
 133    public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
 134
 135    public static final int BOGUS_DOCTYPE = 31;
 136
 137    public static final int COMMENT_START = 32;
 138
 139    public static final int COMMENT_START_DASH = 33;
 140
 141    public static final int COMMENT = 34;
 142
 143    public static final int COMMENT_END_DASH = 35;
 144
 145    public static final int COMMENT_END = 36;
 146
 147    public static final int COMMENT_END_BANG = 37;
 148
 149    public static final int NON_DATA_END_TAG_NAME = 38;
 150
 151    public static final int MARKUP_DECLARATION_HYPHEN = 39;
 152
 153    public static final int MARKUP_DECLARATION_OCTYPE = 40;
 154
 155    public static final int DOCTYPE_UBLIC = 41;
 156
 157    public static final int DOCTYPE_YSTEM = 42;
 158
 159    public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
 160
 161    public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
 162
 163    public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
 164
 165    public static final int CONSUME_CHARACTER_REFERENCE = 46;
 166
 167    public static final int CONSUME_NCR = 47;
 168
 169    public static final int CHARACTER_REFERENCE_TAIL = 48;
 170
 171    public static final int HEX_NCR_LOOP = 49;
 172
 173    public static final int DECIMAL_NRC_LOOP = 50;
 174
 175    public static final int HANDLE_NCR_VALUE = 51;
 176
 177    public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
 178
 179    public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
 180
 181    public static final int SELF_CLOSING_START_TAG = 54;
 182
 183    public static final int CDATA_START = 55;
 184
 185    public static final int CDATA_SECTION = 56;
 186
 187    public static final int CDATA_RSQB = 57;
 188
 189    public static final int CDATA_RSQB_RSQB = 58;
 190
 191    public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
 192
 193    public static final int SCRIPT_DATA_ESCAPE_START = 60;
 194
 195    public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
 196
 197    public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
 198
 199    public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
 200
 201    public static final int BOGUS_COMMENT_HYPHEN = 64;
 202
 203    public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
 204
 205    public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
 206
 207    public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
 208
 209    public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
 210
 211    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
 212
 213    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
 214
 215    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
 216
 217    public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
 218
 219    public static final int PROCESSING_INSTRUCTION = 73;
 220
 221    public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
 222
 223    /**
 224     * Magic value for UTF-16 operations.
 225     */
 226    private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
 227
 228    /**
 229     * UTF-16 code unit array containing less than and greater than for emitting
 230     * those characters on certain parse errors.
 231     */
 232    private static final @NoLength char[] LT_GT = { '<', '>' };
 233
 234    /**
 235     * UTF-16 code unit array containing less than and solidus for emitting
 236     * those characters on certain parse errors.
 237     */
 238    private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
 239
 240    /**
 241     * UTF-16 code unit array containing ]] for emitting those characters on
 242     * state transitions.
 243     */
 244    private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
 245
 246    /**
 247     * Array version of U+FFFD.
 248     */
 249    private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
 250
 251    // [NOCPP[
 252
 253    /**
 254     * Array version of space.
 255     */
 256    private static final @NoLength char[] SPACE = { ' ' };
 257
 258    // ]NOCPP]
 259
 260    /**
 261     * Array version of line feed.
 262     */
 263    private static final @NoLength char[] LF = { '\n' };
 264
 265    /**
 266     * Buffer growth parameter.
 267     */
 268    private static final int BUFFER_GROW_BY = 1024;
 269
 270    /**
 271     * "CDATA[" as <code>char[]</code>
 272     */
 273    private static final @NoLength char[] CDATA_LSQB = "CDATA[".toCharArray();
 274
 275    /**
 276     * "octype" as <code>char[]</code>
 277     */
 278    private static final @NoLength char[] OCTYPE = "octype".toCharArray();
 279
 280    /**
 281     * "ublic" as <code>char[]</code>
 282     */
 283    private static final @NoLength char[] UBLIC = "ublic".toCharArray();
 284
 285    /**
 286     * "ystem" as <code>char[]</code>
 287     */
 288    private static final @NoLength char[] YSTEM = "ystem".toCharArray();
 289
 290    private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
 291
 292    private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
 293
 294    private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
 295
 296    private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
 297            'e', 'x', 't' };
 298
 299    private static final char[] XMP_ARR = { 'x', 'm', 'p' };
 300
 301    private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
 302            'e', 'a' };
 303
 304    private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
 305
 306    private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
 307            'd' };
 308
 309    private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
 310            'p', 't' };
 311
 312    private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
 313            'e', 's' };
 314
 315    /**
 316     * The token handler.
 317     */
 318    protected final TokenHandler tokenHandler;
 319
 320    protected EncodingDeclarationHandler encodingDeclarationHandler;
 321
 322    // [NOCPP[
 323
 324    /**
 325     * The error handler.
 326     */
 327    protected ErrorHandler errorHandler;
 328
 329    // ]NOCPP]
 330
 331    /**
 332     * Whether the previous char read was CR.
 333     */
 334    protected boolean lastCR;
 335
 336    protected int stateSave;
 337
 338    private int returnStateSave;
 339
 340    protected int index;
 341
 342    private boolean forceQuirks;
 343
 344    private char additional;
 345
 346    private int entCol;
 347
 348    private int firstCharKey;
 349
 350    private int lo;
 351
 352    private int hi;
 353
 354    private int candidate;
 355
 356    private int strBufMark;
 357
 358    private int prevValue;
 359
 360    protected int value;
 361
 362    private boolean seenDigits;
 363
 364    protected int cstart;
 365
 366    /**
  367     * The SAX public id for the resource being tokenized. (Only passed back
 368     * as part of locator data.)
 369     */
 370    private String publicId;
 371
 372    /**
  373     * The SAX system id for the resource being tokenized. (Only passed back
 374     * as part of locator data.)
 375     */
 376    private String systemId;
 377
 378    /**
 379     * Buffer for short identifiers.
 380     */
 381    private @Auto char[] strBuf;
 382
 383    /**
 384     * Number of significant <code>char</code>s in <code>strBuf</code>.
 385     */
 386    private int strBufLen;
 387
 388    /**
 389     * <code>-1</code> to indicate that <code>strBuf</code> is used or otherwise
 390     * an offset to the main buffer.
 391     */
 392    // private int strBufOffset = -1;
 393    /**
 394     * Buffer for long strings.
 395     */
 396    private @Auto char[] longStrBuf;
 397
 398    /**
 399     * Number of significant <code>char</code>s in <code>longStrBuf</code>.
 400     */
 401    private int longStrBufLen;
 402
 403    /**
 404     * <code>-1</code> to indicate that <code>longStrBuf</code> is used or
 405     * otherwise an offset to the main buffer.
 406     */
 407    // private int longStrBufOffset = -1;
 408
 409    /**
 410     * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
 411     */
 412    private final @Auto char[] bmpChar;
 413
 414    /**
 415     * Buffer for expanding astral NCRs.
 416     */
 417    private final @Auto char[] astralChar;
 418
 419    /**
 420     * The element whose end tag closes the current CDATA or RCDATA element.
 421     */
 422    protected ElementName endTagExpectation = null;
 423
 424    private char[] endTagExpectationAsArray; // not @Auto!
 425
 426    /**
 427     * <code>true</code> if tokenizing an end tag
 428     */
 429    protected boolean endTag;
 430
 431    /**
 432     * The current tag token name.
 433     */
 434    private ElementName tagName = null;
 435
 436    /**
 437     * The current attribute name.
 438     */
 439    protected AttributeName attributeName = null;
 440
 441    // [NOCPP[
 442
 443    /**
 444     * Whether comment tokens are emitted.
 445     */
 446    private boolean wantsComments = false;
 447
 448    /**
 449     * <code>true</code> when HTML4-specific additional errors are requested.
 450     */
 451    protected boolean html4;
 452
 453    /**
 454     * Whether the stream is past the first 512 bytes.
 455     */
 456    private boolean metaBoundaryPassed;
 457
 458    // ]NOCPP]
 459
 460    /**
 461     * The name of the current doctype token.
 462     */
 463    private @Local String doctypeName;
 464
 465    /**
 466     * The public id of the current doctype token.
 467     */
 468    private String publicIdentifier;
 469
 470    /**
 471     * The system id of the current doctype token.
 472     */
 473    private String systemIdentifier;
 474
 475    /**
 476     * The attribute holder.
 477     */
 478    private HtmlAttributes attributes;
 479
 480    // [NOCPP[
 481
 482    /**
 483     * The policy for vertical tab and form feed.
 484     */
 485    private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
 486
 487    /**
 488     * The policy for comments.
 489     */
 490    private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
 491
 492    private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
 493
 494    private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
 495
 496    private boolean html4ModeCompatibleWithXhtml1Schemata;
 497
 498    private final boolean newAttributesEachTime;
 499
 500    // ]NOCPP]
 501
 502    private int mappingLangToXmlLang;
 503
 504    private boolean shouldSuspend;
 505
 506    protected boolean confident;
 507
 508    private int line;
 509
 510    private Interner interner;
 511
 512    // CPPONLY: private boolean viewingXmlSource;
 513
 514    // [NOCPP[
 515
 516    protected LocatorImpl ampersandLocation;
 517
 518    public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
 519        this.tokenHandler = tokenHandler;
 520        this.encodingDeclarationHandler = null;
 521        this.newAttributesEachTime = newAttributesEachTime;
 522        this.bmpChar = new char[1];
 523        this.astralChar = new char[2];
 524        this.tagName = null;
 525        this.attributeName = null;
 526        this.doctypeName = null;
 527        this.publicIdentifier = null;
 528        this.systemIdentifier = null;
 529        this.attributes = null;
 530    }
 531
 532    // ]NOCPP]
 533
 534    /**
 535     * The constructor.
 536     * 
 537     * @param tokenHandler
 538     *            the handler for receiving tokens
 539     */
 540    public Tokenizer(TokenHandler tokenHandler
 541    // CPPONLY: , boolean viewingXmlSource        
 542    ) {
 543        this.tokenHandler = tokenHandler;
 544        this.encodingDeclarationHandler = null;
 545        // [NOCPP[
 546        this.newAttributesEachTime = false;
 547        // ]NOCPP]
 548        this.bmpChar = new char[1];
 549        this.astralChar = new char[2];
 550        this.tagName = null;
 551        this.attributeName = null;
 552        this.doctypeName = null;
 553        this.publicIdentifier = null;
 554        this.systemIdentifier = null;
 555        this.attributes = null;
 556    // CPPONLY: this.viewingXmlSource = viewingXmlSource;
 557    }
 558
 559    public void setInterner(Interner interner) {
 560        this.interner = interner;
 561    }
 562
 563    public void initLocation(String newPublicId, String newSystemId) {
 564        this.systemId = newSystemId;
 565        this.publicId = newPublicId;
 566
 567    }
 568
 569    // CPPONLY: boolean isViewingXmlSource() {
 570    // CPPONLY: return viewingXmlSource;
 571    // CPPONLY: }
 572
 573    // [NOCPP[
 574
 575    /**
 576     * Returns the mappingLangToXmlLang.
 577     * 
 578     * @return the mappingLangToXmlLang
 579     */
 580    public boolean isMappingLangToXmlLang() {
 581        return mappingLangToXmlLang == AttributeName.HTML_LANG;
 582    }
 583
 584    /**
 585     * Sets the mappingLangToXmlLang.
 586     * 
 587     * @param mappingLangToXmlLang
 588     *            the mappingLangToXmlLang to set
 589     */
 590    public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
 591        this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
 592                : AttributeName.HTML;
 593    }
 594
 595    /**
 596     * Sets the error handler.
 597     * 
 598     * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
 599     */
 600    public void setErrorHandler(ErrorHandler eh) {
 601        this.errorHandler = eh;
 602    }
 603
 604    public ErrorHandler getErrorHandler() {
 605        return this.errorHandler;
 606    }
 607
 608    /**
 609     * Sets the commentPolicy.
 610     * 
 611     * @param commentPolicy
 612     *            the commentPolicy to set
 613     */
 614    public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
 615        this.commentPolicy = commentPolicy;
 616    }
 617
 618    /**
 619     * Sets the contentNonXmlCharPolicy.
 620     * 
 621     * @param contentNonXmlCharPolicy
 622     *            the contentNonXmlCharPolicy to set
 623     */
 624    public void setContentNonXmlCharPolicy(
 625            XmlViolationPolicy contentNonXmlCharPolicy) {
 626        if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
 627            throw new IllegalArgumentException(
 628                    "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
 629        }
 630    }
 631
 632    /**
 633     * Sets the contentSpacePolicy.
 634     * 
 635     * @param contentSpacePolicy
 636     *            the contentSpacePolicy to set
 637     */
 638    public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
 639        this.contentSpacePolicy = contentSpacePolicy;
 640    }
 641
 642    /**
 643     * Sets the xmlnsPolicy.
 644     * 
 645     * @param xmlnsPolicy
 646     *            the xmlnsPolicy to set
 647     */
 648    public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
 649        if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
 650            throw new IllegalArgumentException("Can't use FATAL here.");
 651        }
 652        this.xmlnsPolicy = xmlnsPolicy;
 653    }
 654
 655    public void setNamePolicy(XmlViolationPolicy namePolicy) {
 656        this.namePolicy = namePolicy;
 657    }
 658
 659    /**
 660     * Sets the html4ModeCompatibleWithXhtml1Schemata.
 661     * 
 662     * @param html4ModeCompatibleWithXhtml1Schemata
 663     *            the html4ModeCompatibleWithXhtml1Schemata to set
 664     */
 665    public void setHtml4ModeCompatibleWithXhtml1Schemata(
 666            boolean html4ModeCompatibleWithXhtml1Schemata) {
 667        this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
 668    }
 669
 670    // ]NOCPP]
 671
 672    // For the token handler to call
 673    /**
 674     * Sets the tokenizer state and the associated element name. This should 
 675     * only ever used to put the tokenizer into one of the states that have
 676     * a special end tag expectation.
 677     * 
 678     * @param specialTokenizerState
 679     *            the tokenizer state to set
 680     * @param endTagExpectation
 681     *            the expected end tag for transitioning back to normal
 682     */
 683    public void setStateAndEndTagExpectation(int specialTokenizerState,
 684            @Local String endTagExpectation) {
 685        this.stateSave = specialTokenizerState;
 686        if (specialTokenizerState == Tokenizer.DATA) {
 687            return;
 688        }
 689        @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
 690        this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
 691                asArray.length, interner);
 692        endTagExpectationToArray();
 693    }
 694
 695    /**
 696     * Sets the tokenizer state and the associated element name. This should 
 697     * only ever used to put the tokenizer into one of the states that have
 698     * a special end tag expectation.
 699     * 
 700     * @param specialTokenizerState
 701     *            the tokenizer state to set
 702     * @param endTagExpectation
 703     *            the expected end tag for transitioning back to normal
 704     */
 705    public void setStateAndEndTagExpectation(int specialTokenizerState,
 706            ElementName endTagExpectation) {
 707        this.stateSave = specialTokenizerState;
 708        this.endTagExpectation = endTagExpectation;
 709        endTagExpectationToArray();
 710    }
 711
 712    private void endTagExpectationToArray() {
 713        switch (endTagExpectation.getGroup()) {
 714            case TreeBuilder.TITLE:
 715                endTagExpectationAsArray = TITLE_ARR;
 716                return;
 717            case TreeBuilder.SCRIPT:
 718                endTagExpectationAsArray = SCRIPT_ARR;
 719                return;
 720            case TreeBuilder.STYLE:
 721                endTagExpectationAsArray = STYLE_ARR;
 722                return;
 723            case TreeBuilder.PLAINTEXT:
 724                endTagExpectationAsArray = PLAINTEXT_ARR;
 725                return;
 726            case TreeBuilder.XMP:
 727                endTagExpectationAsArray = XMP_ARR;
 728                return;
 729            case TreeBuilder.TEXTAREA:
 730                endTagExpectationAsArray = TEXTAREA_ARR;
 731                return;
 732            case TreeBuilder.IFRAME:
 733                endTagExpectationAsArray = IFRAME_ARR;
 734                return;
 735            case TreeBuilder.NOEMBED:
 736                endTagExpectationAsArray = NOEMBED_ARR;
 737                return;
 738            case TreeBuilder.NOSCRIPT:
 739                endTagExpectationAsArray = NOSCRIPT_ARR;
 740                return;
 741            case TreeBuilder.NOFRAMES:
 742                endTagExpectationAsArray = NOFRAMES_ARR;
 743                return;
 744            default:
 745                assert false: "Bad end tag expectation.";
 746                return;
 747        }
 748    }
 749
 750    /**
 751     * For C++ use only.
 752     */
 753    public void setLineNumber(int line) {
 754        this.line = line;
 755    }
 756
 757    // start Locator impl
 758
 759    /**
 760     * @see org.xml.sax.Locator#getLineNumber()
 761     */
 762    @Inline public int getLineNumber() {
 763        return line;
 764    }
 765
 766    // [NOCPP[
 767
 768    /**
 769     * @see org.xml.sax.Locator#getColumnNumber()
 770     */
 771    @Inline public int getColumnNumber() {
 772        return -1;
 773    }
 774
 775    /**
 776     * @see org.xml.sax.Locator#getPublicId()
 777     */
 778    public String getPublicId() {
 779        return publicId;
 780    }
 781
 782    /**
 783     * @see org.xml.sax.Locator#getSystemId()
 784     */
 785    public String getSystemId() {
 786        return systemId;
 787    }
 788
 789    // end Locator impl
 790
 791    // end public API
 792
 793    public void notifyAboutMetaBoundary() {
 794        metaBoundaryPassed = true;
 795    }
 796
 797    void turnOnAdditionalHtml4Errors() {
 798        html4 = true;
 799    }
 800
 801    // ]NOCPP]
 802
 803    HtmlAttributes emptyAttributes() {
 804        // [NOCPP[
 805        if (newAttributesEachTime) {
 806            return new HtmlAttributes(mappingLangToXmlLang);
 807        } else {
 808            // ]NOCPP]
 809            return HtmlAttributes.EMPTY_ATTRIBUTES;
 810            // [NOCPP[
 811        }
 812        // ]NOCPP]
 813    }
 814
 815    @Inline private void clearStrBufAndAppend(char c) {
 816        strBuf[0] = c;
 817        strBufLen = 1;
 818    }
 819
 820    @Inline private void clearStrBuf() {
 821        strBufLen = 0;
 822    }
 823
 824    /**
 825     * Appends to the smaller buffer.
 826     * 
 827     * @param c
 828     *            the UTF-16 code unit to append
 829     */
 830    private void appendStrBuf(char c) {
 831        if (strBufLen == strBuf.length) {
 832            char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY];
 833            System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
 834            strBuf = newBuf;
 835        }
 836        strBuf[strBufLen++] = c;
 837    }
 838
 839    /**
 840     * The smaller buffer as a String. Currently only used for error reporting.
 841     * 
 842     * <p>
 843     * C++ memory note: The return value must be released.
 844     * 
 845     * @return the smaller buffer as a string
 846     */
 847    protected String strBufToString() {
 848        return Portability.newStringFromBuffer(strBuf, 0, strBufLen);
 849    }
 850
 851    /**
 852     * Returns the short buffer as a local name. The return value is released in
 853     * emitDoctypeToken().
 854     * 
 855     * @return the smaller buffer as local name
 856     */
 857    private void strBufToDoctypeName() {
 858        doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
 859                interner);
 860    }
 861
 862    /**
 863     * Emits the smaller buffer as character tokens.
 864     * 
 865     * @throws SAXException
 866     *             if the token handler threw
 867     */
 868    private void emitStrBuf() throws SAXException {
 869        if (strBufLen > 0) {
 870            tokenHandler.characters(strBuf, 0, strBufLen);
 871        }
 872    }
 873
 874    @Inline private void clearLongStrBuf() {
 875        longStrBufLen = 0;
 876    }
 877
 878    @Inline private void clearLongStrBufAndAppend(char c) {
 879        longStrBuf[0] = c;
 880        longStrBufLen = 1;
 881    }
 882
 883    /**
 884     * Appends to the larger buffer.
 885     * 
 886     * @param c
 887     *            the UTF-16 code unit to append
 888     */
 889    private void appendLongStrBuf(char c) {
 890        if (longStrBufLen == longStrBuf.length) {
 891            char[] newBuf = new char[longStrBufLen + (longStrBufLen >> 1)];
 892            System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
 893            longStrBuf = newBuf;
 894        }
 895        longStrBuf[longStrBufLen++] = c;
 896    }
 897
 898    @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
 899        // [NOCPP[
 900        switch (commentPolicy) {
 901            case ALTER_INFOSET:
 902                // detachLongStrBuf();
 903                appendLongStrBuf(' ');
 904                // FALLTHROUGH
 905            case ALLOW:
 906                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
 907                // ]NOCPP]
 908                appendLongStrBuf('-');
 909                // [NOCPP[
 910                break;
 911            case FATAL:
 912                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
 913                break;
 914        }
 915        // ]NOCPP]
 916    }
 917
 918    // [NOCPP[
 919    private void maybeAppendSpaceToBogusComment() throws SAXException {
 920        switch (commentPolicy) {
 921            case ALTER_INFOSET:
 922                // detachLongStrBuf();
 923                appendLongStrBuf(' ');
 924                // FALLTHROUGH
 925            case ALLOW:
 926                warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
 927                break;
 928            case FATAL:
 929                fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
 930                break;
 931        }
 932    }
 933
 934    // ]NOCPP]
 935
 936    @Inline private void adjustDoubleHyphenAndAppendToLongStrBufAndErr(char c)
 937            throws SAXException {
 938        errConsecutiveHyphens();
 939        // [NOCPP[
 940        switch (commentPolicy) {
 941            case ALTER_INFOSET:
 942                // detachLongStrBuf();
 943                longStrBufLen--;
 944                appendLongStrBuf(' ');
 945                appendLongStrBuf('-');
 946                // FALLTHROUGH
 947            case ALLOW:
 948                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
 949                // ]NOCPP]
 950                appendLongStrBuf(c);
 951                // [NOCPP[
 952                break;
 953            case FATAL:
 954                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
 955                break;
 956        }
 957        // ]NOCPP]
 958    }
 959
 960    private void appendLongStrBuf(@NoLength char[] buffer, int offset, int length) {
 961        int reqLen = longStrBufLen + length;
 962        if (longStrBuf.length < reqLen) {
 963            char[] newBuf = new char[reqLen + (reqLen >> 1)];
 964            System.arraycopy(longStrBuf, 0, newBuf, 0, longStrBuf.length);
 965            longStrBuf = newBuf;
 966        }
 967        System.arraycopy(buffer, offset, longStrBuf, longStrBufLen, length);
 968        longStrBufLen = reqLen;
 969    }
 970
    /**
     * Appends the contents of the smaller buffer to the larger one: the first
     * <code>strBufLen</code> chars of <code>strBuf</code> are passed to
     * <code>appendLongStrBuf</code>. The smaller buffer itself is not
     * modified here.
     */
    @Inline private void appendStrBufToLongStrBuf() {
        appendLongStrBuf(strBuf, 0, strBufLen);
    }
 977
    /**
     * Returns the larger buffer as a string, built by
     * <code>Portability.newStringFromBuffer</code> from the first
     * <code>longStrBufLen</code> chars of <code>longStrBuf</code>. The buffer
     * length field is not reset by this method.
     * 
     * <p>
     * C++ memory note: The return value must be released.
     * 
     * @return the larger buffer as a string
     */
    private String longStrBufToString() {
        return Portability.newStringFromBuffer(longStrBuf, 0, longStrBufLen);
    }
 989
    /**
     * Emits the current comment token from the long buffer and restarts the
     * character run bookkeeping after <code>pos</code>.
     * 
     * <p>
     * Between the <code>NOCPP</code> markers the emission is conditional on
     * <code>wantsComments</code>; with the marked lines removed the call to
     * the token handler is unconditional (presumably that is how the C++
     * translation sees it -- confirm against the translator).
     * 
     * @param provisionalHyphens
     *            the number of chars at the end of the long buffer that are
     *            left out of the emitted comment
     * @param pos
     *            the current position in the input buffer;
     *            <code>cstart</code> is set to <code>pos + 1</code>
     * 
     * @throws SAXException
     *             propagated from the token handler
     */
    private void emitComment(int provisionalHyphens, int pos)
            throws SAXException {
        // [NOCPP[
        if (wantsComments) {
            // ]NOCPP]
            // if (longStrBufOffset != -1) {
            // tokenHandler.comment(buf, longStrBufOffset, longStrBufLen
            // - provisionalHyphens);
            // } else {
            tokenHandler.comment(longStrBuf, 0, longStrBufLen
                    - provisionalHyphens);
            // }
            // [NOCPP[
        }
        // ]NOCPP]
        // Set even when the comment itself was not reported.
        cstart = pos + 1;
    }
1015
1016    /**
1017     * Flushes coalesced character tokens.
1018     * 
1019     * @param buf
1020     *            TODO
1021     * @param pos
1022     *            TODO
1023     * 
1024     * @throws SAXException
1025     */
1026    protected void flushChars(@NoLength char[] buf, int pos)
1027            throws SAXException {
1028        if (pos > cstart) {
1029            tokenHandler.characters(buf, cstart, pos - cstart);
1030        }
1031        cstart = Integer.MAX_VALUE;
1032    }
1033
1034    /**
1035     * Reports an condition that would make the infoset incompatible with XML
1036     * 1.0 as fatal.
1037     * 
1038     * @param message
1039     *            the message
1040     * @throws SAXException
1041     * @throws SAXParseException
1042     */
1043    public void fatal(String message) throws SAXException {
1044        SAXParseException spe = new SAXParseException(message, this);
1045        if (errorHandler != null) {
1046            errorHandler.fatalError(spe);
1047        }
1048        throw spe;
1049    }
1050
1051    /**
1052     * Reports a Parse Error.
1053     * 
1054     * @param message
1055     *            the message
1056     * @throws SAXException
1057     */
1058    public void err(String message) throws SAXException {
1059        if (errorHandler == null) {
1060            return;
1061        }
1062        SAXParseException spe = new SAXParseException(message, this);
1063        errorHandler.error(spe);
1064    }
1065
1066    public void errTreeBuilder(String message) throws SAXException {
1067        ErrorHandler eh = null;
1068        if (tokenHandler instanceof TreeBuilder<?>) {
1069            TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
1070            eh = treeBuilder.getErrorHandler();
1071        }
1072        if (eh == null) {
1073            eh = errorHandler;
1074        }
1075        if (eh == null) {
1076            return;
1077        }
1078        SAXParseException spe = new SAXParseException(message, this);
1079        eh.error(spe);
1080    }
1081
1082    /**
1083     * Reports a warning
1084     * 
1085     * @param message
1086     *            the message
1087     * @throws SAXException
1088     */
1089    public void warn(String message) throws SAXException {
1090        if (errorHandler == null) {
1091            return;
1092        }
1093        SAXParseException spe = new SAXParseException(message, this);
1094        errorHandler.warning(spe);
1095    }
1096
    /**
     * Prepares the <code>attributes</code> field for the next tag token.
     * 
     * <p>
     * When <code>newAttributesEachTime</code> is set, the reference is simply
     * dropped (<code>attributeNameComplete()</code> creates a new holder when
     * it finds <code>attributes</code> to be <code>null</code>). Otherwise
     * the existing holder is cleared for reuse.
     * 
     * <p>
     * The <code>NOCPP</code> markers are significant: with the marked lines
     * removed, only <code>attributes = null;</code> remains (presumably the
     * form used by the C++ translation -- confirm against the translator).
     */
    private void resetAttributes() {
        // [NOCPP[
        if (newAttributesEachTime) {
            // ]NOCPP]
            attributes = null;
            // [NOCPP[
        } else {
            attributes.clear(mappingLangToXmlLang);
        }
        // ]NOCPP]
    }
1111
    /**
     * Resolves the first <code>strBufLen</code> chars of <code>strBuf</code>
     * to an <code>ElementName</code> via
     * <code>ElementName.elementNameByBuffer</code> and stores the result in
     * the <code>tagName</code> field. Despite the method name, nothing is
     * returned.
     */
    private void strBufToElementNameString() {
        // if (strBufOffset != -1) {
        // return ElementName.elementNameByBuffer(buf, strBufOffset, strBufLen);
        // } else {
        tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
                interner);
        // }
    }
1120
    /**
     * Emits the current start or end tag token to the token handler and
     * resets the per-tag state (<code>tagName</code> and the attributes).
     * 
     * <p>
     * Statement order matters here: <code>stateSave</code> is set to
     * <code>DATA</code> <em>before</em> the token handler is called, so that
     * a handler that changes <code>stateSave</code> during the callback
     * overrides that default, and the final value is what gets returned.
     * 
     * @param selfClosing
     *            whether the tag was written with a trailing solidus; passed
     *            to <code>maybeErrSlashInEndTag</code> and to
     *            <code>startTag</code>
     * @param pos
     *            the current position in the input buffer;
     *            <code>cstart</code> is set to <code>pos + 1</code>
     * @return the value of <code>stateSave</code> after the token handler
     *         has run, i.e. the state the tokenizer should continue in
     * @throws SAXException
     *             propagated from error reporting or the token handler
     */
    private int emitCurrentTagToken(boolean selfClosing, int pos)
            throws SAXException {
        cstart = pos + 1;
        maybeErrSlashInEndTag(selfClosing);
        stateSave = Tokenizer.DATA;
        // Hand the token handler a shared empty holder instead of null when
        // the tag had no attributes.
        HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
                : attributes);
        if (endTag) {
            /*
             * When an end tag token is emitted, the content model flag must be
             * switched to the PCDATA state.
             */
            maybeErrAttributesOnEndTag(attrs);
            // CPPONLY: if (!viewingXmlSource) {
            tokenHandler.endTag(tagName);
            // CPPONLY: }
            // End tags do not pass attributes on, so the holder is deleted
            // here (note that |attributes|, not |attrs|, is deleted).
            Portability.delete(attributes);
        } else {
            // CPPONLY: if (viewingXmlSource) {
            // CPPONLY: Portability.delete(attributes);
            // CPPONLY: } else {
            tokenHandler.startTag(tagName, attrs, selfClosing);
            // CPPONLY: }
        }
        tagName.release();
        tagName = null;
        resetAttributes();
        /*
         * The token handler may have called setStateAndEndTagExpectation
         * and changed stateSave since the start of this method.
         */
        return stateSave;
    }
1154
    /**
     * Finishes the attribute name accumulated in <code>strBuf</code>: the
     * name is resolved into the <code>attributeName</code> field, the
     * <code>attributes</code> holder is created if it does not exist yet,
     * and a duplicate name is reported and dropped.
     * 
     * <p>
     * After a duplicate, <code>attributeName</code> is <code>null</code>;
     * <code>addAttributeWithValue()</code> and
     * <code>addAttributeWithoutValue()</code> check for that and add nothing.
     * 
     * <p>
     * The <code>NOCPP</code> markers sit inside the argument list: the
     * <code>namePolicy</code> argument is inside the marked region
     * (presumably omitted from the C++ translation -- confirm), so the
     * line breaks and commas around it must be kept as they are.
     * 
     * @throws SAXException
     *             propagated from <code>errDuplicateAttribute()</code>
     */
    private void attributeNameComplete() throws SAXException {
        // if (strBufOffset != -1) {
        // attributeName = AttributeName.nameByBuffer(buf, strBufOffset,
        // strBufLen, namePolicy != XmlViolationPolicy.ALLOW);
        // } else {
        attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
        // [NOCPP[
                , namePolicy != XmlViolationPolicy.ALLOW
                // ]NOCPP]
                , interner);
        // }

        // The holder is created lazily, on the first attribute of a tag.
        if (attributes == null) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }

        /*
         * When the user agent leaves the attribute name state (and before
         * emitting the tag token, if appropriate), the complete attribute's
         * name must be compared to the other attributes on the same token; if
         * there is already an attribute on the token with the exact same name,
         * then this is a parse error and the new attribute must be dropped,
         * along with the value that gets associated with it (if any).
         */
        if (attributes.contains(attributeName)) {
            errDuplicateAttribute();
            attributeName.release();
            attributeName = null;
        }
    }
1185
    /**
     * Adds the current attribute, which was written without a value, to the
     * <code>attributes</code> holder and clears <code>attributeName</code>.
     * Nothing is added when <code>attributeName</code> is <code>null</code>
     * (<code>attributeNameComplete()</code> sets it to <code>null</code> for
     * a duplicate).
     * 
     * <p>
     * Inside the <code>NOCPP</code> regions, the value depends on the mode.
     * In <code>html4</code> mode a boolean attribute gets either its own
     * local name or the empty string, depending on
     * <code>html4ModeCompatibleWithXhtml1Schemata</code>; a non-boolean
     * attribute is reported as an error and added with the empty string.
     * Outside <code>html4</code> mode the attribute is added with
     * <code>Portability.newEmptyString()</code>, after a warning for
     * <code>src</code> and <code>href</code>. With the marked lines removed,
     * only that last <code>addAttribute</code> call (without
     * <code>xmlnsPolicy</code>) remains, so the markers must not be moved.
     * 
     * @throws SAXException
     *             propagated from <code>err</code> or <code>warn</code>
     */
    private void addAttributeWithoutValue() throws SAXException {
        noteAttributeWithoutValue();

        // [NOCPP[
        // This check runs before the null check below; the identity
        // comparison is simply false when attributeName is null.
        if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
                && ElementName.META == tagName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            // [NOCPP[
            if (html4) {
                if (attributeName.isBoolean()) {
                    if (html4ModeCompatibleWithXhtml1Schemata) {
                        attributes.addAttribute(attributeName,
                                attributeName.getLocal(AttributeName.HTML),
                                xmlnsPolicy);
                    } else {
                        attributes.addAttribute(attributeName, "", xmlnsPolicy);
                    }
                } else {
                    // NOTE(review): a valueless "border" is neither reported
                    // nor added to |attributes| here, so it is dropped in
                    // html4 mode -- confirm that this is intended.
                    if (AttributeName.BORDER != attributeName) {
                        err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
                        attributes.addAttribute(attributeName, "", xmlnsPolicy);
                    }
                }
            } else {
                if (AttributeName.SRC == attributeName
                        || AttributeName.HREF == attributeName) {
                    warn("Attribute \u201C"
                            + attributeName.getLocal(AttributeName.HTML)
                            + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
                }
                // ]NOCPP]
                attributes.addAttribute(attributeName,
                        Portability.newEmptyString()
                        // [NOCPP[
                        , xmlnsPolicy
                // ]NOCPP]
                );
                // [NOCPP[
            }
            // ]NOCPP]
            attributeName = null; // attributeName has been adopted by the
            // |attributes| object
        }
    }
1233
    /**
     * Adds the current attribute, with the contents of the long buffer as
     * its value, to the <code>attributes</code> holder and clears
     * <code>attributeName</code>. Nothing is added when
     * <code>attributeName</code> is <code>null</code>
     * (<code>attributeNameComplete()</code> sets it to <code>null</code> for
     * a duplicate).
     * 
     * <p>
     * Inside the <code>NOCPP</code> regions, the value of a case-folded
     * attribute on a start tag is ASCII-lowercased when both
     * <code>html4</code> and
     * <code>html4ModeCompatibleWithXhtml1Schemata</code> are set. The
     * <code>NOCPP</code> and <code>CPPONLY</code> markers are significant
     * (presumably to the C++ translation -- confirm) and must not be moved.
     * 
     * @throws SAXException
     *             propagated from <code>err</code>
     */
    private void addAttributeWithValue() throws SAXException {
        // [NOCPP[
        // This check runs before the null check below; the identity
        // comparison is simply false when attributeName is null.
        if (metaBoundaryPassed && ElementName.META == tagName
                && AttributeName.CHARSET == attributeName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            String val = longStrBufToString(); // Ownership transferred to
            // HtmlAttributes
            // CPPONLY: if (mViewSource) {
            // CPPONLY:   mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
            // CPPONLY: }
            // [NOCPP[
            if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
                    && attributeName.isCaseFolded()) {
                val = newAsciiLowerCaseStringFromString(val);
            }
            // ]NOCPP]
            attributes.addAttribute(attributeName, val
            // [NOCPP[
                    , xmlnsPolicy
            // ]NOCPP]
            );
            attributeName = null; // attributeName has been adopted by the
            // |attributes| object
        }
    }
1262
1263    // [NOCPP[
1264
1265    private static String newAsciiLowerCaseStringFromString(String str) {
1266        if (str == null) {
1267            return null;
1268        }
1269        char[] buf = new char[str.length()];
1270        for (int i = 0; i < str.length(); i++) {
1271            char c = str.charAt(i);
1272            if (c >= 'A' && c <= 'Z') {
1273                c += 0x20;
1274            }
1275            buf[i] = c;
1276        }
1277        return new String(buf);
1278    }
1279
    /**
     * Hook that <code>start()</code> calls after the token handler has been
     * told that tokenization is starting. This implementation does nothing;
     * being <code>protected</code>, it can be overridden by subclasses.
     * 
     * @throws SAXException
     *             never thrown by this implementation
     */
    protected void startErrorReporting() throws SAXException {

    }
1283
1284    // ]NOCPP]
1285    
    /**
     * Starts tokenization: runs <code>initializeWithoutStarting()</code>,
     * notifies the token handler via <code>startTokenization(this)</code>
     * and then, inside the <code>NOCPP</code> region, calls the
     * <code>startErrorReporting()</code> hook.
     * 
     * @throws SAXException
     *             propagated from the token handler or the hook
     */
    public void start() throws SAXException {
        initializeWithoutStarting();
        tokenHandler.startTokenization(this);
        // [NOCPP[
        startErrorReporting();
        // ]NOCPP]
    }
1293
    /**
     * Tokenizes the unread part of the given buffer by running
     * <code>stateLoop</code> from the saved state, then advances the start
     * of the buffer past what was consumed.
     * 
     * <p>
     * The <code>CPPONLY</code> and <code>NOCPP</code> markers below are
     * significant (presumably to the C++ translation -- confirm) and must be
     * kept as they are.
     * 
     * @param buffer
     *            the buffer to read from; its start index is updated before
     *            this method returns
     * @return the value of <code>lastCR</code> after the run. It is reset to
     *         <code>false</code> on entry; according to the notes in
     *         <code>stateLoop</code>, it is set to <code>true</code> when a
     *         CR was handled, so that the IO driver can swallow a directly
     *         following LF
     * @throws SAXException
     *             propagated from <code>stateLoop</code>
     */
    public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
        int state = stateSave;
        int returnState = returnStateSave;
        char c = '\u0000';
        shouldSuspend = false;
        lastCR = false;

        int start = buffer.getStart();
        /**
         * The index of the last <code>char</code> read from <code>buf</code>.
         */
        int pos = start - 1;

        /**
         * This comment describes the <code>cstart</code> field that the
         * switch below initializes: the index of the first <code>char</code>
         * in <code>buf</code> that is part of a coalesced run of character
         * tokens or <code>Integer.MAX_VALUE</code> if there is not a current
         * run being coalesced.
         */
        // In the states listed here a character run starts at the beginning
        // of this buffer; in every other state no run is in progress.
        switch (state) {
            case DATA:
            case RCDATA:
            case SCRIPT_DATA:
            case PLAINTEXT:
            case RAWTEXT:
            case CDATA_SECTION:
            case SCRIPT_DATA_ESCAPED:
            case SCRIPT_DATA_ESCAPE_START:
            case SCRIPT_DATA_ESCAPE_START_DASH:
            case SCRIPT_DATA_ESCAPED_DASH:
            case SCRIPT_DATA_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_START:
            case SCRIPT_DATA_DOUBLE_ESCAPED:
            case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_END:
                cstart = start;
                break;
            default:
                cstart = Integer.MAX_VALUE;
                break;
        }

        /**
         * The number of <code>char</code>s in <code>buf</code> that have
         * meaning. (The rest of the array is garbage and should not be
         * examined.) No local variable follows this comment; presumably it
         * refers to <code>buffer.getEnd()</code>, which is passed to
         * <code>stateLoop</code> as the end position.
         */
        // CPPONLY: if (mViewSource) {
        // CPPONLY:   mViewSource.SetBuffer(buffer);
        // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY:   mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
        // CPPONLY: } else {
        // CPPONLY:   pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: }
        // [NOCPP[
        pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
                buffer.getEnd());
        // ]NOCPP]
        if (pos == buffer.getEnd()) {
            // exiting due to end of buffer
            buffer.setStart(pos);
        } else {
            // The loop stopped early; pos is the last char read, so the next
            // unread char is at pos + 1.
            buffer.setStart(pos + 1);
        }
        return lastCR;
    }
1362
1363    @SuppressWarnings("unused") private int stateLoop(int state, char c,
1364            int pos, @NoLength char[] buf, boolean reconsume, int returnState,
1365            int endPos) throws SAXException {
1366        /*
1367         * Idioms used in this code:
1368         * 
1369         * 
1370         * Consuming the next input character
1371         * 
1372         * To consume the next input character, the code does this: if (++pos ==
1373         * endPos) { break stateloop; } c = checkChar(buf, pos);
1374         * 
1375         * 
1376         * Staying in a state
1377         * 
1378         * When there's a state that the tokenizer may stay in over multiple
1379         * input characters, the state has a wrapper |for(;;)| loop and staying
1380         * in the state continues the loop.
1381         * 
1382         * 
1383         * Switching to another state
1384         * 
1385         * To switch to another state, the code sets the state variable to the
1386         * magic number of the new state. Then it either continues stateloop or
1387         * breaks out of the state's own wrapper loop if the target state is
1388         * right after the current state in source order. (This is a partial
1389         * workaround for Java's lack of goto.)
1390         * 
1391         * 
1392         * Reconsume support
1393         * 
1394         * The spec sometimes says that an input character is reconsumed in
1395         * another state. If a state can ever be entered so that an input
1396         * character can be reconsumed in it, the state's code starts with an
1397         * |if (reconsume)| that sets reconsume to false and skips over the
1398         * normal code for consuming a new character.
1399         * 
1400         * To reconsume the current character in another state, the code sets
1401         * |reconsume| to true and then switches to the other state.
1402         * 
1403         * 
1404         * Emitting character tokens
1405         * 
1406         * This method emits character tokens lazily. Whenever a new range of
1407         * character tokens starts, the field cstart must be set to the start
1408         * index of the range. The flushChars() method must be called at the end
1409         * of a range to flush it.
1410         * 
1411         * 
1412         * U+0000 handling
1413         * 
1414         * The various states have to handle the replacement of U+0000 with
1415         * U+FFFD. However, if U+0000 would be reconsumed in another state, the
1416         * replacement doesn't need to happen, because it's handled by the
1417         * reconsuming state.
1418         * 
1419         * 
1420         * LF handling
1421         * 
1422         * Every state needs to increment the line number upon LF unless the LF
1423         * gets reconsumed by another state which increments the line number.
1424         * 
1425         * 
1426         * CR handling
1427         * 
1428         * Every state needs to handle CR unless the CR gets reconsumed and is
1429         * handled by the reconsuming state. The CR needs to be handled as if it
1430         * were and LF, the lastCR field must be set to true and then this
1431         * method must return. The IO driver will then swallow the next
1432         * character if it is an LF to coalesce CRLF.
1433         */
1434        stateloop: for (;;) {
1435            switch (state) {
1436                case DATA:
1437                    dataloop: for (;;) {
1438                        if (reconsume) {
1439                            reconsume = false;
1440                        } else {
1441                            if (++pos == endPos) {
1442                                break stateloop;
1443                            }
1444                            c = checkChar(buf, pos);
1445                        }
1446                        switch (c) {
1447                            case '&':
1448                                /*
1449                                 * U+0026 AMPERSAND (&) Switch to the character
1450                                 * reference in data state.
1451                                 */
1452                                flushChars(buf, pos);
1453                                clearStrBufAndAppend(c);
1454                                setAdditionalAndRememberAmpersandLocation('\u0000');
1455                                returnState = state;
1456                                state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
1457                                continue stateloop;
1458                            case '<':
1459                                /*
1460                                 * U+003C LESS-THAN SIGN (<) Switch to the tag
1461                                 * open state.
1462                                 */
1463                                flushChars(buf, pos);
1464
1465                                state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
1466                                break dataloop; // FALL THROUGH continue
1467                            // stateloop;
1468                            case '\u0000':
1469                                emitReplacementCharacter(buf, pos);
1470                                continue;
1471                            case '\r':
1472                                emitCarriageReturn(buf, pos);
1473                                break stateloop;
1474                            case '\n':
1475                                silentLineFeed();
1476                            default:
1477                                /*
1478                                 * Anything else Emit the input character as a
1479                                 * character token.
1480                                 * 
1481                                 * Stay in the data state.
1482                                 */
1483                                continue;
1484                        }
1485                    }
1486                    // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
1487                case TAG_OPEN:
1488                    tagopenloop: for (;;) {
1489                        /*
1490                         * The behavior of this state depends on the content
1491                         * model flag.
1492                         */
1493                        if (++pos == endPos) {
1494                            break stateloop;
1495                        }
1496                        c = checkChar(buf, pos);
1497                        /*
1498                         * If the content model flag is set to the PCDATA state
1499                         * Consume the next input character:
1500                         */
1501                        if (c >= 'A' && c <= 'Z') {
1502                            /*
1503                             * U+0041 LATIN CAPITAL LETTER A through to U+005A
1504                             * LATIN CAPITAL LETTER Z Create a new start tag
1505                             * token,
1506                             */
1507                            endTag = false;
1508                            /*
1509                             * set its tag name to the lowercase version of the
1510                             * input character (add 0x0020 to the character's
1511                             * code point),
1512                             */
1513                            clearStrBufAndAppend((char) (c + 0x20));
1514                            /* then switch to the tag name state. */
1515                            state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1516                            /*
1517                             * (Don't emit the token yet; further details will
1518                             * be filled in before it is emitted.)
1519                             */
1520                            break tagopenloop;
1521                            // continue stateloop;
1522                        } else if (c >= 'a' && c <= 'z') {
1523                            /*
1524                             * U+0061 LATIN SMALL LETTER A through to U+007A
1525                             * LATIN SMALL LETTER Z Create a new start tag
1526                             * token,
1527                             */
1528                            endTag = false;
1529                            /*
1530                             * set its tag name to the input character,
1531                             */
1532                            clearStrBufAndAppend(c);
1533                            /* then switch to the tag name state. */
1534                            state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
1535                            /*
1536                             * (Don't emit the token yet; further details will
1537                             * be filled in before it is emitted.)
1538                             */
1539                            break tagopenloop;
1540                            // continue stateloop;
1541                        }
1542                        switch (c) {
1543                            case '!':
1544                                /*
1545                                 * U+0021 EXCLAMATION MARK (!) Switch to the
1546                                 * markup declaration open state.
1547                                 */
1548                                state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
1549                                continue stateloop;
1550                            case '/':
1551                                /*
1552                                 * U+002F SOLIDUS (/) Switch to the close tag
1553                                 * open state.
1554                                 */
1555                                state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
1556                        

Large files are truncated, but you can click here to view the full file