PageRenderTime 175ms CodeModel.GetById 106ms app.highlight 58ms RepoModel.GetById 1ms app.codeStats 0ms

/jEdit/tags/jedit-4-1-pre5/com/microstar/xml/XmlParser.java

#
Java | 2673 lines | 1863 code | 223 blank | 587 comment | 194 complexity | 7f055fa7eeb0030f70c8ed18ce0e0d2a MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1// XmlParser.java: the main parser class.
   2// NO WARRANTY! See README, and copyright below.
   3// $Id: XmlParser.java 3792 2001-09-02 05:37:43Z spestov $
   4
   5package com.microstar.xml;
   6
   7import java.io.BufferedInputStream;
   8import java.io.EOFException;
   9import java.io.InputStream;
  10import java.io.Reader;
  11import java.net.URL;
  12import java.net.URLConnection;
  13import java.util.Enumeration;
  14import java.util.Hashtable;
  15import java.util.Stack;
  16
  17
  18/**
  19  * Parse XML documents and return parse events through call-backs.
  20  * <p>You need to define a class implementing the <code>XmlHandler</code>
  21  * interface: an object belonging to this class will receive the
  22  * callbacks for the events.  (As an alternative to implementing
  23  * the full XmlHandler interface, you can simply extend the 
  24  * <code>HandlerBase</code> convenience class.)
  25  * <p>Usage (assuming that <code>MyHandler</code> is your implementation
  26  * of the <code>XmlHandler</code> interface):
  27  * <pre>
  28  * XmlHandler handler = new MyHandler();
  29  * XmlParser parser = new XmlParser();
  30  * parser.setHandler(handler);
  31  * try {
  32  *   parser.parse("http://www.host.com/doc.xml", null);
  33  * } catch (Exception e) {
  34  *   [do something interesting]
  35  * }
  36  * </pre>
  37  * <p>Alternatively, you can use the standard SAX interfaces
  38  * with the <code>SAXDriver</code> class as your entry point.
  39  * @author Copyright (c) 1997, 1998 by Microstar Software Ltd.
  40  * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
  41  * @version 1.1
  42  * @see XmlHandler
  43  * @see HandlerBase
  44  * @see SAXDriver
  45  */
  46public class XmlParser {
  47
  48  //
  49  // Use special cheats that speed up the code (currently about 50%),
  50  // but may cause problems with future maintenance and add to the
  51  // class file size (about 500 bytes).
  52  //
  53  private final static boolean USE_CHEATS = true;
  54
  55
  56
  57  //////////////////////////////////////////////////////////////////////
  58  // Constructors.
  59  ////////////////////////////////////////////////////////////////////////
  60
  61
  62  /**
  63    * Construct a new parser with no associated handler.
  64    * @see #setHandler
  65    * @see #parse
  66    */
  67  public XmlParser ()
  68  {
  69  }
  70
  71
  72  /**
  73    * Set the handler that will receive parsing events.
  74    * @param handler The handler to receive callback events.
  75    * @see #parse
  76    * @see XmlHandler
  77    */
  78  public void setHandler (XmlHandler handler)
  79  {
  80    this.handler = handler;
  81  }
  82
  83
  84  /**
  85    * Parse an XML document from a URI.
  86    * <p>You may parse a document more than once, but only one thread
  87    * may call this method for an object at one time.
  88    * @param systemId The URI of the document.
  89    * @param publicId The public identifier of the document, or null.
  90    * @param encoding The suggested encoding, or null if unknown.
  91    * @exception java.lang.Exception Any exception thrown by your
  92    *            own handlers, or any derivation of java.io.IOException
  93    *            thrown by the parser itself.
  94    */
  95  public void parse (String systemId, String publicId, String encoding)
  96    throws java.lang.Exception
  97  {
  98    doParse(systemId, publicId, null, null, encoding);
  99  }
 100
 101
 102  /**
 103    * Parse an XML document from a byte stream.
 104    * <p>The URI that you supply will become the base URI for
 105    * resolving relative links, but &AElig;lfred will actually read
 106    * the document from the supplied input stream.
 107    * <p>You may parse a document more than once, but only one thread
 108    * may call this method for an object at one time.
 109    * @param systemId The base URI of the document, or null if not
 110    *                 known.
 111    * @param publicId The public identifier of the document, or null
 112    *                 if not known.
 113    * @param stream A byte input stream.
 114    * @param encoding The suggested encoding, or null if unknown.
 115    * @exception java.lang.Exception Any exception thrown by your
 116    *            own handlers, or any derivation of java.io.IOException
 117    *            thrown by the parser itself.
 118    */
 119  public void parse (String systemId, String publicId,
 120		     InputStream stream, String encoding)
 121    throws java.lang.Exception
 122  {
 123    doParse(systemId, publicId, null, stream, encoding);
 124  }
 125
 126
 127  /**
 128    * Parse an XML document from a character stream.
 129    * <p>The URI that you supply will become the base URI for
 130    * resolving relative links, but &AElig;lfred will actually read
 131    * the document from the supplied input stream.
 132    * <p>You may parse a document more than once, but only one thread
 133    * may call this method for an object at one time.
 134    * @param systemId The base URI of the document, or null if not
 135    *                 known.
 136    * @param publicId The public identifier of the document, or null
 137    *                 if not known.
 138    * @param reader A character stream.
 139    * @exception java.lang.Exception Any exception thrown by your
 140    *            own handlers, or any derivation of java.io.IOException
 141    *            thrown by the parser itself.
 142    */
 143  public void parse (String systemId, String publicId, Reader reader)
 144    throws java.lang.Exception
 145  {
 146    doParse(systemId, publicId, reader, null, null);
 147  }
 148
 149
 150  private synchronized void doParse (String systemId, String publicId,
 151				     Reader reader, InputStream stream,
 152				     String encoding)
 153    throws java.lang.Exception
 154  {
 155    basePublicId = publicId;
 156    baseURI = systemId;
 157    baseReader = reader;
 158    baseInputStream = stream;
 159
 160    initializeVariables();
 161
 162				// Set the default entities here.
 163    setInternalEntity(intern("amp"), "&#38;");
 164    setInternalEntity(intern("lt"), "&#60;");
 165    setInternalEntity(intern("gt"), "&#62;");
 166    setInternalEntity(intern("apos"), "&#39;");
 167    setInternalEntity(intern("quot"), "&#34;");
 168
 169    if (handler != null) {
 170      handler.startDocument();
 171    }
 172
 173    pushURL("[document]", basePublicId, baseURI, baseReader, baseInputStream,
 174	    encoding);
 175
 176    parseDocument();
 177
 178    if (handler != null) {
 179      handler.endDocument();
 180    }
 181    cleanupVariables();
 182  }
 183
 184
 185
 186  ////////////////////////////////////////////////////////////////////////
 187  // Constants.
 188  ////////////////////////////////////////////////////////////////////////
 189
 190  //
 191  // Constants for element content type.
 192  //
 193
 194  /**
 195    * Constant: an element has not been declared.
 196    * @see #getElementContentType
 197    */
 198  public final static int CONTENT_UNDECLARED = 0;
 199
 200  /**
 201    * Constant: the element has a content model of ANY.
 202    * @see #getElementContentType
 203    */
 204  public final static int CONTENT_ANY = 1;
 205
 206  /**
 207    * Constant: the element has declared content of EMPTY.
 208    * @see #getElementContentType
 209    */
 210  public final static int CONTENT_EMPTY = 2;
 211
 212  /**
 213    * Constant: the element has mixed content.
 214    * @see #getElementContentType
 215    */
 216  public final static int CONTENT_MIXED = 3;
 217
 218  /**
 219    * Constant: the element has element content.
 220    * @see #getElementContentType
 221    */
 222  public final static int CONTENT_ELEMENTS = 4;
 223
 224
 225  //
 226  // Constants for the entity type.
 227  //
 228
 229  /**
 230    * Constant: the entity has not been declared.
 231    * @see #getEntityType
 232    */
 233  public final static int ENTITY_UNDECLARED = 0;
 234
 235  /**
 236    * Constant: the entity is internal.
 237    * @see #getEntityType
 238    */
 239  public final static int ENTITY_INTERNAL = 1;
 240
 241  /**
 242    * Constant: the entity is external, non-XML data.
 243    * @see #getEntityType
 244    */
 245  public final static int ENTITY_NDATA = 2;
 246
 247  /**
 248    * Constant: the entity is external XML data.
 249    * @see #getEntityType
 250    */
 251  public final static int ENTITY_TEXT = 3;
 252
 253
 254  //
 255  // Constants for attribute type.
 256  //
 257
 258  /**
 259    * Constant: the attribute has not been declared for this element type.
 260    * @see #getAttributeType
 261    */
 262  public final static int ATTRIBUTE_UNDECLARED = 0;
 263
 264  /**
 265    * Constant: the attribute value is a string value.
 266    * @see #getAttributeType
 267    */
 268  public final static int ATTRIBUTE_CDATA = 1;
 269
 270  /**
 271    * Constant: the attribute value is a unique identifier.
 272    * @see #getAttributeType
 273    */
 274  public final static int ATTRIBUTE_ID = 2;
 275
 276  /**
 277    * Constant: the attribute value is a reference to a unique identifier.
 278    * @see #getAttributeType
 279    */
 280  public final static int ATTRIBUTE_IDREF = 3;
 281
 282  /**
 283    * Constant: the attribute value is a list of ID references.
 284    * @see #getAttributeType
 285    */
 286  public final static int ATTRIBUTE_IDREFS = 4;
 287
 288  /**
 289    * Constant: the attribute value is the name of an entity.
 290    * @see #getAttributeType
 291    */
 292  public final static int ATTRIBUTE_ENTITY = 5;
 293
 294  /**
 295    * Constant: the attribute value is a list of entity names.
 296    * @see #getAttributeType
 297    */
 298  public final static int ATTRIBUTE_ENTITIES = 6;
 299
 300  /**
 301    * Constant: the attribute value is a name token.
 302    * @see #getAttributeType
 303    */
 304  public final static int ATTRIBUTE_NMTOKEN = 7;
 305
 306  /**
 307    * Constant: the attribute value is a list of name tokens.
 308    * @see #getAttributeType
 309    */
 310  public final static int ATTRIBUTE_NMTOKENS = 8;
 311
 312  /**
 313    * Constant: the attribute value is a token from an enumeration.
 314    * @see #getAttributeType
 315    */
 316  public final static int ATTRIBUTE_ENUMERATED = 9;
 317
 318  /**
 319    * Constant: the attribute is the name of a notation.
 320    * @see #getAttributeType
 321    */
 322  public final static int ATTRIBUTE_NOTATION = 10;
 323
 324
 325  //
 326  // When the class is loaded, populate the hash table of
 327  // attribute types.
 328  //
 329
 330  /**
 331    * Hash table of attribute types.
 332    */
 333  private static Hashtable attributeTypeHash;
 334  static {
 335    attributeTypeHash = new Hashtable();
 336    attributeTypeHash.put("CDATA", new Integer(ATTRIBUTE_CDATA));
 337    attributeTypeHash.put("ID", new Integer(ATTRIBUTE_ID));
 338    attributeTypeHash.put("IDREF", new Integer(ATTRIBUTE_IDREF));
 339    attributeTypeHash.put("IDREFS", new Integer(ATTRIBUTE_IDREFS));
 340    attributeTypeHash.put("ENTITY", new Integer(ATTRIBUTE_ENTITY));
 341    attributeTypeHash.put("ENTITIES", new Integer(ATTRIBUTE_ENTITIES));
 342    attributeTypeHash.put("NMTOKEN", new Integer(ATTRIBUTE_NMTOKEN));
 343    attributeTypeHash.put("NMTOKENS", new Integer(ATTRIBUTE_NMTOKENS));
 344    attributeTypeHash.put("NOTATION", new Integer(ATTRIBUTE_NOTATION));
 345  }
 346
 347
 348  //
 349  // Constants for supported encodings.
 350  //
 351  private final static int ENCODING_UTF_8 = 1;
 352  private final static int ENCODING_ISO_8859_1 = 2;
 353  private final static int ENCODING_UCS_2_12 = 3;
 354  private final static int ENCODING_UCS_2_21 = 4;
 355  private final static int ENCODING_UCS_4_1234 = 5;
 356  private final static int ENCODING_UCS_4_4321 = 6;
 357  private final static int ENCODING_UCS_4_2143 = 7;
 358  private final static int ENCODING_UCS_4_3412 = 8;
 359
 360
 361  //
 362  // Constants for attribute default value.
 363  //
 364
 365  /**
 366    * Constant: the attribute is not declared.
 367    * @see #getAttributeDefaultValueType
 368    */
 369  public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 0;
 370
 371  /**
 372    * Constant: the attribute has a literal default value specified.
 373    * @see #getAttributeDefaultValueType
 374    * @see #getAttributeDefaultValue
 375    */
 376  public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 1;
 377
 378  /**
 379    * Constant: the attribute was declared #IMPLIED.
 380    * @see #getAttributeDefaultValueType
 381    */
 382  public final static int ATTRIBUTE_DEFAULT_IMPLIED = 2;
 383
 384  /**
 385    * Constant: the attribute was declared #REQUIRED.
 386    * @see #getAttributeDefaultValueType
 387    */
 388  public final static int ATTRIBUTE_DEFAULT_REQUIRED = 3;
 389
 390  /**
 391    * Constant: the attribute was declared #FIXED.
 392    * @see #getAttributeDefaultValueType
 393    * @see #getAttributeDefaultValue
 394    */
 395  public final static int ATTRIBUTE_DEFAULT_FIXED = 4;
 396
 397
 398  //
 399  // Constants for input.
 400  //
 401  private final static int INPUT_NONE = 0;
 402  private final static int INPUT_INTERNAL = 1;
 403  private final static int INPUT_EXTERNAL = 2;
 404  private final static int INPUT_STREAM = 3;
 405  private final static int INPUT_BUFFER = 4;
 406  private final static int INPUT_READER = 5;
 407
 408
 409  //
 410  // Flags for reading literals.
 411  //
 412  private final static int LIT_CHAR_REF = 1;
 413  private final static int LIT_ENTITY_REF = 2;
 414  private final static int LIT_PE_REF = 4;
 415  private final static int LIT_NORMALIZE = 8;
 416
 417
 418  //
 419  // Flags for parsing context.
 420  //
 421  private final static int CONTEXT_NONE = 0;
 422  private final static int CONTEXT_DTD = 1;
 423  private final static int CONTEXT_ENTITYVALUE = 2;
 424  private final static int CONTEXT_ATTRIBUTEVALUE = 3;
 425
 426
 427
 428  //////////////////////////////////////////////////////////////////////
 429  // Error reporting.
 430  //////////////////////////////////////////////////////////////////////
 431
 432
 433  /**
 434    * Report an error.
 435    * @param message The error message.
 436    * @param textFound The text that caused the error (or null).
 437    * @see XmlHandler#error
 438    * @see #line
 439    */
 440  void error (String message, String textFound, String textExpected)
 441    throws java.lang.Exception
 442  {
 443    errorCount++;
 444    if (textFound != null) {
 445      message = message + " (found \"" + textFound + "\")";
 446    }
 447    if (textExpected != null) {
 448      message = message + " (expected \"" + textExpected + "\")";
 449    }
 450    if (handler != null) {
 451      String uri = null;
 452
 453      if (externalEntity != null) {
 454	uri = externalEntity.getURL().toString();
 455      }
 456      handler.error(message, uri, line, column);
 457    }
 458  }
 459
 460
 461  /**
 462    * Report a serious error.
 463    * @param message The error message.
 464    * @param textFound The text that caused the error (or null).
 465    */
 466  void error (String message, char textFound, String textExpected)
 467    throws java.lang.Exception
 468  {
 469    error(message, new Character(textFound).toString(), textExpected);
 470  }
 471
 472
 473
 474  //////////////////////////////////////////////////////////////////////
 475  // Major syntactic productions.
 476  //////////////////////////////////////////////////////////////////////
 477
 478
 479  /**
 480    * Parse an XML document.
 481    * <pre>
 482    * [1] document ::= prolog element Misc*
 483    * </pre>
 484    * <p>This is the top-level parsing function for a single XML
 485    * document.  As a minimum, a well-formed document must have
 486    * a document element, and a valid document must have a prolog
 487    * as well.
 488    */
 489  void parseDocument ()
 490    throws java.lang.Exception
 491    {
 492    char c;
 493
 494    parseProlog();
 495    require('<');
 496    parseElement();
 497    try
 498      {
 499      parseMisc();  //skip all white, PIs, and comments
 500      c=readCh();   //if this doesn't throw an exception...
 501      error("unexpected characters after document end",c,null);
 502      }
 503    catch (EOFException e)
 504      {return;}
 505    }
 506
 507
 508  /**
 509    * Skip a comment.
 510    * <pre>
 511    * [18] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
 512    * </pre>
 513    * <p>(The <code>&lt;!--</code> has already been read.)
 514    */
 515  void parseComment ()
 516    throws java.lang.Exception
 517  {
 518    skipUntil("-->");
 519  }
 520
 521
 522  /**
 523    * Parse a processing instruction and do a call-back.
 524    * <pre>
 525    * [19] PI ::= '&lt;?' Name (S (Char* - (Char* '?&gt;' Char*)))? '?&gt;'
 526    * </pre>
 527    * <p>(The <code>&lt;?</code> has already been read.)
 528    * <p>An XML processing instruction <em>must</em> begin with
 529    * a Name, which is the instruction's target.
 530    */
 531  void parsePI ()
 532    throws java.lang.Exception
 533  {
 534    String name;
 535
 536    name = readNmtoken(true);
 537    if (!tryRead("?>")) {
 538      requireWhitespace();
 539      parseUntil("?>");
 540    }
 541    if (handler != null) {
 542      handler.processingInstruction(name, dataBufferToString());
 543    }
 544  }
 545
 546
 547  /**
 548    * Parse a CDATA marked section.
 549    * <pre>
 550    * [20] CDSect ::= CDStart CData CDEnd
 551    * [21] CDStart ::= '&lt;![CDATA['
 552    * [22] CData ::= (Char* - (Char* ']]&gt;' Char*))
 553    * [23] CDEnd ::= ']]&gt;'
 554    * </pre>
 555    * <p>(The '&lt;![CDATA[' has already been read.)
 556    * <p>Note that this just appends characters to the dataBuffer,
 557    * without actually generating an event.
 558    */
 559  void parseCDSect ()
 560    throws java.lang.Exception
 561  {
 562    parseUntil("]]>");
 563  }
 564
 565
 566  /**
 567    * Parse the prolog of an XML document.
 568    * <pre>
 569    * [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
 570    * </pre>
 571    * <p>There are a couple of tricks here.  First, it is necessary to
 572    * declare the XML default attributes after the DTD (if present)
 573    * has been read.  Second, it is not possible to expand general
 574    * references in attribute value literals until after the entire
 575    * DTD (if present) has been parsed.
 576    * <p>We do not look for the XML declaration here, because it is
 577    * handled by pushURL().
 578    * @see pushURL
 579    */
 580  void parseProlog ()
 581    throws java.lang.Exception
 582  {
 583    parseMisc();
 584
 585    if (tryRead("<!DOCTYPE")) {
 586      parseDoctypedecl();
 587      parseMisc();
 588    }
 589  }
 590
 591
 592  /**
 593    * Parse the XML declaration.
 594    * <pre>
 595    * [25] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
 596    * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'")
 597    * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'"
 598    *               | S 'standalone' Eq '"' ("yes" | "no") '"'
 599    * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
 600    * </pre>
 601    * <p>([80] to [82] are also significant.)
 602    * <p>(The <code>&lt;?xml</code> and whitespace have already been read.)
 603    * <p>TODO: validate value of standalone.
 604    * @see #parseTextDecl
 605    * @see #checkEncoding
 606    */
 607  void parseXMLDecl (boolean ignoreEncoding)
 608    throws java.lang.Exception
 609  {
 610    String version;
 611    String encodingName = null;
 612    String standalone = null;
 613
 614				// Read the version.
 615    require("version");
 616    parseEq();
 617    version = readLiteral(0);
 618    if (!version.equals("1.0")) {
 619      error("unsupported XML version", version, "1.0");
 620    }
 621
 622				// Try reading an encoding declaration.
 623    skipWhitespace();
 624    if (tryRead("encoding")) {
 625      parseEq();
 626      encodingName = readLiteral(0);
 627      checkEncoding(encodingName, ignoreEncoding);
 628    }
 629
 630				// Try reading a standalone declaration
 631    skipWhitespace();
 632    if (tryRead("standalone")) {
 633      parseEq();
 634      standalone = readLiteral(0);
 635    }
 636
 637    skipWhitespace();
 638    require("?>");
 639  }
 640
 641
 642  /**
 643    * Parse the Encoding PI.
 644    * <pre>
 645    * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
 646    * [79] EncodingPI ::= '&lt;?xml' S 'encoding' Eq QEncoding S? '?&gt;'
 647    * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'"
 648    * [81] Encoding ::= LatinName
 649    * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
 650    * </pre>
 651    * <p>(The <code>&lt;?xml</code>' and whitespace have already been read.)
 652    * @see #parseXMLDecl
 653    * @see #checkEncoding
 654    */
 655  void parseTextDecl (boolean ignoreEncoding)
 656    throws java.lang.Exception
 657  {
 658    String encodingName = null;
 659    
 660				// Read an optional version.
 661    if (tryRead("version")) {
 662      String version;
 663      parseEq();
 664      version = readLiteral(0);
 665      if (!version.equals("1.0")) {
 666	error("unsupported XML version", version, "1.0");
 667      }
 668      requireWhitespace();
 669    }
 670      
 671
 672				// Read the encoding.
 673    require("encoding");
 674    parseEq();
 675    encodingName = readLiteral(0);
 676    checkEncoding(encodingName, ignoreEncoding);
 677
 678    skipWhitespace();
 679    require("?>");
 680  }
 681
 682
 683  /**
 684    * Check that the encoding specified makes sense.
 685    * <p>Compare what the author has specified in the XML declaration
 686    * or encoding PI with what we have detected.
 687    * <p>This is also important for distinguishing among the various
 688    * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect
 689    * those).
 690    * @param encodingName The name of the encoding specified by the user.
 691    * @see #parseXMLDecl
 692    * @see #parseTextDecl
 693    */
 694  void checkEncoding (String encodingName, boolean ignoreEncoding)
 695    throws java.lang.Exception
 696  {
 697    encodingName = encodingName.toUpperCase();
 698
 699    if (ignoreEncoding) {
 700      return;
 701    }
 702
 703    switch (encoding) {
 704				// 8-bit encodings
 705    case ENCODING_UTF_8:
 706      if (encodingName.equals("ISO-8859-1")) {
 707	encoding = ENCODING_ISO_8859_1;
 708      } else if (!encodingName.equals("UTF-8")) {
 709	error("unsupported 8-bit encoding",
 710	      encodingName,
 711	      "UTF-8 or ISO-8859-1");
 712      }
 713      break;
 714				// 16-bit encodings
 715    case ENCODING_UCS_2_12:
 716    case ENCODING_UCS_2_21:
 717      if (!encodingName.equals("ISO-10646-UCS-2") &&
 718	  !encodingName.equals("UTF-16")) {
 719	error("unsupported 16-bit encoding",
 720	      encodingName,
 721	      "ISO-10646-UCS-2");
 722      }
 723      break;
 724				// 32-bit encodings
 725    case ENCODING_UCS_4_1234:
 726    case ENCODING_UCS_4_4321:
 727    case ENCODING_UCS_4_2143:
 728    case ENCODING_UCS_4_3412:
 729      if (!encodingName.equals("ISO-10646-UCS-4")) {
 730	error("unsupported 32-bit encoding",
 731	      encodingName,
 732	      "ISO-10646-UCS-4");
 733      }
 734    }
 735  }
 736
 737
 738  /**
 739    * Parse miscellaneous markup outside the document element and DOCTYPE
 740    * declaration.
 741    * <pre>
 742    * [27] Misc ::= Comment | PI | S
 743    * </pre>
 744    */
 745  void parseMisc ()
 746    throws java.lang.Exception
 747    {
 748    while (true)
 749      {
 750      skipWhitespace();
 751      if (tryRead("<?"))
 752        {parsePI();}
 753      else if (tryRead("<!--"))
 754        {parseComment();}
 755      else
 756        {return;}
 757      }
 758    }
 759
 760
 761  /**
 762    * Parse a document type declaration.
 763    * <pre>
 764    * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
 765    *                      ('[' %markupdecl* ']' S?)? '&gt;'
 766    * </pre>
 767    * <p>(The <code>&lt;!DOCTYPE</code> has already been read.)
 768    */
 769  void parseDoctypedecl ()
 770    throws java.lang.Exception
 771  {
 772    char c;
 773    String doctypeName, ids[];
 774
 775				// Read the document type name.
 776    requireWhitespace();
 777    doctypeName = readNmtoken(true);
 778
 779				// Read the ExternalIDs.
 780    skipWhitespace();
 781    ids = readExternalIds(false);
 782
 783				// Look for a declaration subset.
 784    skipWhitespace();
 785    if (tryRead('[')) {
 786
 787				// loop until the subset ends
 788      while (true) {
 789	context = CONTEXT_DTD;
 790	skipWhitespace();
 791	context = CONTEXT_NONE;
 792	if (tryRead(']')) {
 793	  break;		// end of subset
 794	} else {
 795	  context = CONTEXT_DTD;
 796	  parseMarkupdecl();
 797	  context = CONTEXT_NONE;
 798	}
 799      }
 800    }
 801
 802				// Read the external subset, if any
 803    if (ids[1] != null) {
 804      pushURL("[external subset]", ids[0], ids[1], null, null, null);
 805
 806				// Loop until we end up back at '>'
 807      while (true) {
 808	context = CONTEXT_DTD;
 809	skipWhitespace();
 810	context = CONTEXT_NONE;
 811	if (tryRead('>')) {
 812	  break;
 813	} else {
 814	  context = CONTEXT_DTD;
 815	  parseMarkupdecl();
 816	  context = CONTEXT_NONE;
 817	}
 818      }
 819    } else {
 820				// No external subset.
 821      skipWhitespace();
 822      require('>');
 823    }
 824
 825    if (handler != null) {
 826      handler.doctypeDecl(doctypeName, ids[0], ids[1]);
 827    }
 828
 829				// Expand general entities in
 830				// default values of attributes.
 831				// (Do this after the doctypeDecl
 832				// event!).
 833    // expandAttributeDefaultValues();
 834  }
 835
 836
 837  /**
 838    * Parse a markup declaration in the internal or external DTD subset.
 839    * <pre>
 840    * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl |
 841    *                       %NotationDecl | %PI | %S | %Comment |
 842    *                       InternalPERef )
 843    * [30] InternalPERef ::= PEReference
 844    * [31] extSubset ::= (%markupdecl | %conditionalSect)*
 845    * </pre>
 846    */
 847  void parseMarkupdecl ()
 848    throws java.lang.Exception
 849  {
 850    if (tryRead("<!ELEMENT")) {
 851      parseElementdecl();
 852    } else if (tryRead("<!ATTLIST")) {
 853      parseAttlistDecl();
 854    } else if (tryRead("<!ENTITY")) {
 855      parseEntityDecl();
 856    } else if (tryRead("<!NOTATION")) {
 857      parseNotationDecl();
 858    } else if (tryRead("<?")) {
 859      parsePI();
 860    } else if (tryRead("<!--")) {
 861      parseComment();
 862    } else if (tryRead("<![")) {
 863      parseConditionalSect();
 864    } else {
 865      error("expected markup declaration", null, null);
 866    }
 867  }
 868
 869
 870  /**
 871    * Parse an element, with its tags.
 872    * <pre>
 873    * [33] STag ::= '&lt;' Name (S Attribute)* S? '&gt;' [WFC: unique Att spec]
 874    * [38] element ::= EmptyElement | STag content ETag
 875    * [39] EmptyElement ::= '&lt;' Name (S Attribute)* S? '/&gt;'
 876    *                       [WFC: unique Att spec]
 877    * </pre>
 878    * <p>(The '&lt;' has already been read.)
 879    * <p>NOTE: this method actually chains onto parseContent(), if necessary,
 880    * and parseContent() will take care of calling parseETag().
 881    */
 882  void parseElement ()
 883    throws java.lang.Exception
 884  {
 885    String gi;
 886    char c;
 887    int oldElementContent = currentElementContent;
 888    String oldElement = currentElement;
 889
 890				// This is the (global) counter for the
 891				// array of specified attributes.
 892    tagAttributePos = 0;
 893
 894				// Read the element type name.
 895    gi = readNmtoken(true);
 896
 897				// Determine the current content type.
 898    currentElement = gi;
 899    currentElementContent = getElementContentType(gi);
 900    if (currentElementContent == CONTENT_UNDECLARED) {
 901      currentElementContent = CONTENT_ANY;
 902    }
 903
 904				// Read the attributes, if any.
 905				// After this loop, we should be just
 906				// in front of the closing delimiter.
 907    skipWhitespace();
 908    c = readCh();
 909    while (c != '/' && c != '>') {
 910      unread(c);
 911      parseAttribute(gi);
 912      skipWhitespace();
 913      c = readCh();
 914    }
 915    unread(c);
 916
 917				// Supply any defaulted attributes.
 918    Enumeration atts = declaredAttributes(gi);
 919    if (atts != null) {
 920      String aname;
 921    loop: while (atts.hasMoreElements()) {
 922      aname = (String)atts.nextElement();
 923				// See if it was specified.
 924      for (int i = 0; i < tagAttributePos; i++) {
 925	if (tagAttributes[i] == aname) {
 926	  continue loop;
 927	}
 928      }
 929				// I guess not...
 930      if (handler != null) {
 931	handler.attribute(aname,
 932			  getAttributeExpandedValue(gi, aname),
 933			  false);
 934      }
 935    }
 936    }
 937
 938				// Figure out if this is a start tag
 939				// or an empty element, and dispatch an
 940				// event accordingly.
 941    c = readCh();
 942    switch (c) {
 943    case '>':
 944      if (handler != null) {
 945	handler.startElement(gi);
 946      }
 947      parseContent();
 948      break;
 949    case '/':
 950      require('>');
 951      if (handler != null) {
 952	handler.startElement(gi);
 953	handler.endElement(gi);
 954      }
 955      break;
 956    }
 957
 958				// Restore the previous state.
 959    currentElement = oldElement;
 960    currentElementContent = oldElementContent;
 961  }
 962
 963
 964  /**
 965    * Parse an attribute assignment.
 966    * <pre>
 967    * [34] Attribute ::= Name Eq AttValue
 968    * </pre>
 969    * @param name The name of the attribute's element.
 970    * @see XmlHandler#attribute
 971    */
 972  void parseAttribute (String name)
 973    throws java.lang.Exception
 974  {
 975    String aname;
 976    int type;
 977    String value;
 978
 979				// Read the attribute name.
 980    aname = readNmtoken(true).intern();
 981    type = getAttributeDefaultValueType(name, aname);
 982
 983				// Parse '='
 984    parseEq();
 985
 986				// Read the value, normalizing whitespace
 987				// if it is not CDATA.
 988    if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) {
 989      value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF);
 990    } else {
 991      value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF | LIT_NORMALIZE);
 992    }
 993
 994				// Inform the handler about the
 995				// attribute.
 996    if (handler != null) {
 997      handler.attribute(aname, value, true);
 998    }
 999    dataBufferPos = 0;
1000
1001				// Note that the attribute has been
1002				// specified.
1003    if (tagAttributePos == tagAttributes.length) {
1004      String newAttrib[] = new String[tagAttributes.length * 2];
1005      System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
1006      tagAttributes = newAttrib;
1007    }
1008    tagAttributes[tagAttributePos++] = aname;
1009  }
1010
1011
1012  /**
1013    * Parse an equals sign surrounded by optional whitespace.
1014    * [35] Eq ::= S? '=' S?
1015    */
1016  void parseEq ()
1017    throws java.lang.Exception
1018  {
1019    skipWhitespace();
1020    require('=');
1021    skipWhitespace();
1022  }
1023
1024
1025  /**
1026    * Parse an end tag.
1027    * [36] ETag ::= '</' Name S? '>'
1028    * *NOTE: parseContent() chains to here.
1029    */
1030  void parseETag ()
1031    throws java.lang.Exception
1032  {
1033    String name;
1034    name = readNmtoken(true);
1035    if (name != currentElement) {
1036      error("mismatched end tag", name, currentElement);
1037    }
1038    skipWhitespace();
1039    require('>');
1040    if (handler != null) {
1041      handler.endElement(name);
1042    }
1043  }
1044
1045
1046  /**
1047    * Parse the content of an element.
1048    * [37] content ::= (element | PCData | Reference | CDSect | PI | Comment)*
1049    * [68] Reference ::= EntityRef | CharRef
1050    */
1051  void parseContent ()
1052    throws java.lang.Exception
1053  {
1054    String data;
1055    char c;
1056
1057    while (true) {
1058
1059      switch (currentElementContent) {
1060      case CONTENT_ANY:
1061      case CONTENT_MIXED:
1062	parsePCData();
1063	break;
1064      case CONTENT_ELEMENTS:
1065	parseWhitespace();
1066	break;
1067      }
1068
1069				// Handle delimiters
1070      c = readCh();
1071      switch (c) {
1072
1073      case '&':			// Found "&"
1074	c = readCh();
1075	if (c == '#') {
1076	  parseCharRef();
1077	} else {
1078	  unread(c);
1079	  parseEntityRef(true);
1080	}
1081	break;
1082
1083      case '<':			// Found "<"
1084
1085	c = readCh();
1086	switch (c) {
1087
1088	case '!':		// Found "<!"
1089	  c = readCh();
1090	  switch (c) {
1091	  case '-':		// Found "<!-"
1092	    require('-');
1093	    parseComment();
1094	    break;
1095	  case '[':		// Found "<!["
1096	    require("CDATA[");
1097	    parseCDSect();
1098	    break;
1099	  default:
1100	    error("expected comment or CDATA section", c, null);
1101	    break;
1102	  }
1103	  break;
1104
1105	case '?':		// Found "<?"
1106	  dataBufferFlush();
1107	  parsePI();
1108	  break;
1109
1110	case '/':		// Found "</"
1111	  dataBufferFlush();
1112	  parseETag();
1113	  return;
1114
1115	default:		// Found "<" followed by something else
1116	  dataBufferFlush();
1117	  unread(c);
1118	  parseElement();
1119	  break;
1120	}
1121      }
1122    }
1123  }
1124
1125
1126  /**
1127    * Parse an element type declaration.
1128    * [40] elementdecl ::= '<!ELEMENT' S %Name S (%S S)? %contentspec S? '>'
1129    *                      [VC: Unique Element Declaration]
1130    * *NOTE: the '<!ELEMENT' has already been read.
1131    */
1132  void parseElementdecl ()
1133    throws java.lang.Exception
1134  {
1135    String name;
1136
1137    requireWhitespace();
1138				// Read the element type name.
1139    name = readNmtoken(true);
1140
1141    requireWhitespace();
1142				// Read the content model.
1143    parseContentspec(name);
1144
1145    skipWhitespace();
1146    require('>');
1147  }
1148
1149
1150  /**
1151    * Content specification.
1152    * [41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1153    */
1154  void parseContentspec (String name)
1155    throws java.lang.Exception
1156  {
1157    if (tryRead("EMPTY")) {
1158      setElement(name, CONTENT_EMPTY, null, null);
1159      return;
1160    } else if (tryRead("ANY")) {
1161      setElement(name, CONTENT_ANY, null, null);
1162      return;
1163    } else {
1164      require('(');
1165      dataBufferAppend('(');
1166      skipWhitespace();
1167      if (tryRead("#PCDATA")) {
1168	dataBufferAppend("#PCDATA");
1169	parseMixed();
1170	setElement(name, CONTENT_MIXED, dataBufferToString(), null);
1171      } else {
1172	parseElements();
1173	setElement(name, CONTENT_ELEMENTS, dataBufferToString(), null);
1174      }
1175    }
1176  }
1177
1178
1179  /**
1180    * Parse an element-content model.
1181    * [42] elements ::= (choice | seq) ('?' | '*' | '+')?
1182    * [44] cps ::= S? %cp S?
1183    * [45] choice ::= '(' S? %ctokplus (S? '|' S? %ctoks)* S? ')'
1184    * [46] ctokplus ::= cps ('|' cps)+
1185    * [47] ctoks ::= cps ('|' cps)*
1186    * [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')'
1187    * [49] stoks ::= cps (',' cps)*
1188    * *NOTE: the opening '(' and S have already been read.
1189    * *TODO: go over parameter entity boundaries more carefully.
1190    */
1191  void parseElements ()
1192    throws java.lang.Exception
1193  {
1194    char c;
1195    char sep;
1196
1197				// Parse the first content particle
1198    skipWhitespace();
1199    parseCp();
1200
1201				// Check for end or for a separator.
1202    skipWhitespace();
1203    c = readCh();
1204    switch (c) {
1205    case ')':
1206      dataBufferAppend(')');
1207      c = readCh();
1208      switch (c) {
1209      case '*':
1210      case '+':
1211      case '?':
1212	dataBufferAppend(c);
1213	break;
1214      default:
1215	unread(c);
1216      }
1217      return;
1218    case ',':			// Register the separator.
1219    case '|':
1220      sep = c;
1221      dataBufferAppend(c);
1222      break;
1223    default:
1224      error("bad separator in content model", c, null);
1225      return;
1226    }
1227
1228				// Parse the rest of the content model.
1229    while (true) {
1230      skipWhitespace();
1231      parseCp();
1232      skipWhitespace();
1233      c = readCh();
1234      if (c == ')') {
1235	dataBufferAppend(')');
1236	break;
1237      } else if (c != sep) {
1238	error("bad separator in content model", c, null);
1239	return;
1240      } else {
1241	dataBufferAppend(c);
1242      }
1243    }
1244
1245				// Check for the occurrence indicator.
1246    c = readCh();
1247    switch (c) {
1248    case '?':
1249    case '*':
1250    case '+':
1251      dataBufferAppend(c);
1252      return;
1253    default:
1254      unread(c);
1255      return;
1256    }
1257  }
1258
1259
1260  /**
1261    * Parse a content particle.
1262    * [43] cp ::= (Name | choice | seq) ('?' | '*' | '+')
1263    * *NOTE: I actually use a slightly different production here:
1264    *        cp ::= (elements | (Name ('?' | '*' | '+')?))
1265    */
1266  void parseCp ()
1267    throws java.lang.Exception
1268  {
1269    char c;
1270
1271    if (tryRead('(')) {
1272      dataBufferAppend('(');
1273      parseElements();
1274    } else {
1275      dataBufferAppend(readNmtoken(true));
1276      c = readCh();
1277      switch (c) {
1278      case '?':
1279      case '*':
1280      case '+':
1281	dataBufferAppend(c);
1282	break;
1283      default:
1284	unread(c);
1285	break;
1286      }
1287    }
1288  }
1289
1290
1291  /**
1292    * Parse mixed content.
1293    * [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? %Mtoks)* ) S? ')*'
1294    *              | '(' S? %('#PCDATA') S? ')'
1295    * [51] Mtoks ::= %Name (S? '|' S? %Name)*
1296    * *NOTE: the S and '#PCDATA' have already been read.
1297    */
1298  void parseMixed ()
1299    throws java.lang.Exception
1300  {
1301    char c;
1302
1303				// Check for PCDATA alone.
1304    skipWhitespace();
1305    if (tryRead(')')) {
1306      dataBufferAppend(")*");
1307      tryRead('*');
1308      return;
1309    }
1310
1311				// Parse mixed content.
1312    skipWhitespace();
1313    while (!tryRead(")*")) {
1314      require('|');
1315      dataBufferAppend('|');
1316      skipWhitespace();
1317      dataBufferAppend(readNmtoken(true));
1318      skipWhitespace();
1319    }
1320    dataBufferAppend(")*");
1321  }
1322
1323
1324  /**
1325    * Parse an attribute list declaration.
1326    * [52] AttlistDecl ::= '<!ATTLIST' S %Name S? %AttDef+ S? '>'
1327    * *NOTE: the '<!ATTLIST' has already been read.
1328    */
1329  void parseAttlistDecl ()
1330    throws java.lang.Exception
1331  {
1332    String elementName;
1333
1334    requireWhitespace();
1335    elementName = readNmtoken(true);
1336    requireWhitespace();
1337    while (!tryRead('>')) {
1338      parseAttDef(elementName);
1339      skipWhitespace();
1340    }
1341  }
1342
1343
1344  /**
1345    * Parse a single attribute definition.
1346    * [53] AttDef ::= S %Name S %AttType S %Default
1347    */
1348  void parseAttDef (String elementName)
1349    throws java.lang.Exception
1350  {
1351    String name;
1352    int type;
1353    String enum = null;
1354
1355				// Read the attribute name.
1356    name = readNmtoken(true);
1357
1358				// Read the attribute type.
1359    requireWhitespace();
1360    type = readAttType();
1361
1362				// Get the string of enumerated values
1363				// if necessary.
1364    if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) {
1365      enum = dataBufferToString();
1366    }
1367
1368				// Read the default value.
1369    requireWhitespace();
1370    parseDefault(elementName, name, type, enum);
1371  }
1372
1373
1374  /**
1375    * Parse the attribute type.
1376    * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1377    * [55] StringType ::= 'CDATA'
1378    * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' |
1379    *                        'NMTOKEN' | 'NMTOKENS'
1380    * [57] EnumeratedType ::= NotationType | Enumeration
1381    * *TODO: validate the type!!
1382    */
1383  int readAttType ()
1384    throws java.lang.Exception
1385  {
1386    String typeString;
1387    Integer type;
1388
1389    if (tryRead('(')) {
1390      parseEnumeration();
1391      return ATTRIBUTE_ENUMERATED;
1392    } else {
1393      typeString = readNmtoken(true);
1394      if (typeString.equals("NOTATION")) {
1395	parseNotationType();
1396      }
1397      type = (Integer)attributeTypeHash.get(typeString);
1398      if (type == null) {
1399	error("illegal attribute type", typeString, null);
1400	return ATTRIBUTE_UNDECLARED;
1401      } else {
1402	return type.intValue();
1403      }
1404    }
1405  }
1406
1407
1408  /**
1409    * Parse an enumeration.
1410    * [60] Enumeration ::= '(' S? %Etoks (S? '|' S? %Etoks)* S? ')'
1411    * [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)*
1412    * *NOTE: the '(' has already been read.
1413    */
1414  void parseEnumeration ()
1415    throws java.lang.Exception
1416  {
1417    char c;
1418
1419    dataBufferAppend('(');
1420
1421				// Read the first token.
1422    skipWhitespace();
1423    dataBufferAppend(readNmtoken(true));
1424				// Read the remaining tokens.
1425    skipWhitespace();
1426    while (!tryRead(')')) {
1427      require('|');
1428      dataBufferAppend('|');
1429      skipWhitespace();
1430      dataBufferAppend(readNmtoken(true));
1431      skipWhitespace();
1432    }
1433    dataBufferAppend(')');
1434  }
1435
1436
1437  /**
1438    * Parse a notation type for an attribute.
1439    * [58] NotationType ::= %'NOTATION' S '(' S? %Ntoks (S? '|' S? %Ntoks)*
1440    *                       S? ')'
1441    * [59] Ntoks ::= %Name (S? '|' S? %Name)
1442    * *NOTE: the 'NOTATION' has already been read
1443    */
1444  void parseNotationType ()
1445    throws java.lang.Exception
1446  {
1447    requireWhitespace();
1448    require('(');
1449
1450    parseEnumeration();
1451  }
1452
1453
1454  /**
1455    * Parse the default value for an attribute.
1456    * [62] Default ::= '#REQUIRED' | '#IMPLIED' | ((%'#FIXED' S)? %AttValue
1457    */
1458  void parseDefault (String elementName, String name, int type, String enum)
1459    throws java.lang.Exception
1460  {
1461    int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1462    String value = null;
1463    boolean normalizeWSFlag;
1464
1465    if (tryRead('#')) {
1466      if (tryRead("FIXED")) {
1467	valueType = ATTRIBUTE_DEFAULT_FIXED;
1468	requireWhitespace();
1469	context = CONTEXT_ATTRIBUTEVALUE;
1470	value = readLiteral(LIT_CHAR_REF);
1471	context = CONTEXT_DTD;
1472      } else if (tryRead("REQUIRED")) {
1473	valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1474      } else if (tryRead("IMPLIED")) {
1475	valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1476      } else {
1477	error("illegal keyword for attribute default value", null, null);
1478      }
1479    } else {
1480      context = CONTEXT_ATTRIBUTEVALUE;
1481      value = readLiteral(LIT_CHAR_REF);
1482      context = CONTEXT_DTD;
1483    }
1484    setAttribute(elementName, name, type, enum, value, valueType);
1485  }
1486
1487
1488  /**
1489    * Parse a conditional section.
1490    * [63] conditionalSect ::= includeSect || ignoreSect
1491    * [64] includeSect ::= '<![' %'INCLUDE' '[' (%markupdecl*)* ']]>'
1492    * [65] ignoreSect ::= '<![' %'IGNORE' '[' ignoreSectContents* ']]>'
1493    * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]>'))
1494    *                           | ('<![' ignoreSectContents* ']]>')
1495    *                           | (Char - (']' | [<'"]))
1496    *                           | ('<!' (Char - ('-' | '[')))
1497    * *NOTE: the '<![' has already been read.
1498    * *TODO: verify that I am handling ignoreSectContents right.
1499    */
1500  void parseConditionalSect ()
1501    throws java.lang.Exception
1502  {
1503    skipWhitespace();
1504    if (tryRead("INCLUDE")) {
1505      skipWhitespace();
1506      require('[');
1507      skipWhitespace();
1508      while (!tryRead("]]>")) {
1509	parseMarkupdecl();
1510	skipWhitespace();
1511      }
1512    } else if (tryRead("IGNORE")) {
1513      skipWhitespace();
1514      require('[');
1515      int nesting = 1;
1516      char c;
1517      for (int nest = 1; nest > 0; ) {
1518	c = readCh();
1519	switch (c) {
1520	case '<':
1521	  if (tryRead("![")) {
1522	    nest++;
1523	  }
1524	case ']':
1525	  if (tryRead("]>")) {
1526	    nest--;
1527	  }
1528	}
1529      }
1530    } else {
1531      error("conditional section must begin with INCLUDE or IGNORE",
1532	    null, null);
1533    }
1534  }
1535
1536
1537  /**
1538    * Read a character reference.
1539    * [67] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1540    * *NOTE: the '&#' has already been read.
1541    */
1542  void parseCharRef ()
1543    throws java.lang.Exception
1544  {
1545    int value = 0;
1546    char c;
1547
1548    if (tryRead('x')) {
1549      loop1: while (true) {
1550	c = readCh();
1551	switch (c) {
1552	case '0':
1553	case '1':
1554	case '2':
1555	case '3':
1556	case '4':
1557	case '5':
1558	case '6':
1559	case '7':
1560	case '8':
1561	case '9':
1562	case 'a':
1563	case 'A':
1564	case 'b':
1565	case 'B':
1566	case 'c':
1567	case 'C':
1568	case 'd':
1569	case 'D':
1570	case 'e':
1571	case 'E':
1572	case 'f':
1573	case 'F':
1574	  value *= 16;
1575	  value += Integer.parseInt(new Character(c).toString(), 16);
1576	  break;
1577	case ';':
1578	  break loop1;
1579	default:
1580	  error("illegal character in character reference", c, null);
1581	  break loop1;
1582	}
1583      }
1584    } else {
1585      loop2: while (true) {
1586	c = readCh();
1587	switch (c) {
1588	case '0':
1589	case '1':
1590	case '2':
1591	case '3':
1592	case '4':
1593	case '5':
1594	case '6':
1595	case '7':
1596	case '8':
1597	case '9':
1598	  value *= 10;
1599	  value += Integer.parseInt(new Character(c).toString(), 10);
1600	  break;
1601	case ';':
1602	  break loop2;
1603	default:
1604	  error("illegal character in character reference", c, null);
1605	  break loop2;
1606	}
1607      }
1608    }
1609
1610    // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
1611    //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: 
1612    if (value <= 0x0000ffff) {
1613				// no surrogates needed
1614      dataBufferAppend((char)value);
1615    } else if (value <= 0x000fffff) {
1616				// > 16 bits, surrogate needed
1617      dataBufferAppend((char)(0xd8 | ((value & 0x000ffc00) >> 10)));
1618      dataBufferAppend((char)(0xdc | (value & 0x0003ff)));
1619    } else {
1620				// too big for surrogate
1621      error("character reference " + value + " is too large for UTF-16",
1622	    new Integer(value).toString(), null);
1623    }
1624  }
1625
1626
1627  /**
1628    * Parse a reference.
1629    * [69] EntityRef ::= '&' Name ';'
1630    * *NOTE: the '&' has already been read.
1631    * @param externalAllowed External entities are allowed here.
1632    */
1633  void parseEntityRef (boolean externalAllowed)
1634    throws java.lang.Exception
1635  {
1636    String name;
1637
1638    name = readNmtoken(true);
1639    require(';');
1640    switch (getEntityType(name)) {
1641    case ENTITY_UNDECLARED:
1642      error("reference to undeclared entity", name, null);
1643      break;
1644    case ENTITY_INTERNAL:
1645      pushString(name, getEntityValue(name));
1646      break;
1647    case ENTITY_TEXT:
1648      if (externalAllowed) {
1649	pushURL(name, getEntityPublicId(name),
1650		getEntitySystemId(name),
1651		null, null, null);
1652      } else {
1653	error("reference to external entity in attribute value.", name, null);
1654      }
1655      break;
1656    case ENTITY_NDATA:
1657      if (externalAllowed) {
1658	error("data entity reference in content", name, null);
1659      } else {
1660	error("reference to external entity in attribute value.", name, null);
1661      }
1662      break;
1663    }
1664  }
1665
1666
1667  /**
1668    * Parse a parameter entity reference.
1669    * [70] PEReference ::= '%' Name ';'
1670    * *NOTE: the '%' has already been read.
1671    */
1672  void parsePEReference (boolean isEntityValue)
1673    throws java.lang.Exception
1674  {
1675    String name;
1676
1677    name = "%" + readNmtoken(true);
1678    require(';');
1679    switch (getEntityType(name)) {
1680    case ENTITY_UNDECLARED:
1681      error("reference to undeclared parameter entity", name, null);
1682      break;
1683    case ENTITY_INTERNAL:
1684      if (isEntityValue) {
1685	pushString(name, getEntityValue(name));
1686      } else {
1687	pushString(name, " " + getEntityValue(name) + ' ');
1688      }
1689      break;
1690    case ENTITY_TEXT:
1691      if (isEntityValue) {
1692	pushString(null, " ");
1693      }
1694      pushURL(name, getEntityPublicId(name),
1695	      getEntitySystemId(name),
1696	      null, null, null);
1697      if (isEntityValue) {
1698	pushString(null, " ");
1699      }
1700      break;
1701    }
1702  }
1703
1704
1705  /**
1706    * Parse an entity declaration.
1707    * [71] EntityDecl ::= '<!ENTITY' S %Name S %EntityDef S? '>'
1708    *                   | '<!ENTITY' S '%' S %Name S %EntityDef S? '>'
1709    * [72] EntityDef ::= EntityValue | ExternalDef
1710    * [73] ExternalDef ::= ExternalID %NDataDecl?
1711    * [74] ExternalID ::= 'SYSTEM' S SystemLiteral
1712    *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
1713    * [75] NDataDecl ::= S %'NDATA' S %Name
1714    * *NOTE: the '<!ENTITY' has already been read.
1715    */
1716  void parseEntityDecl ()
1717    throws java.lang.Exception
1718  {
1719    char c;
1720    boolean peFlag = false;
1721    String name, value, notationName, ids[];
1722
1723				// Check for a parameter entity.
1724    requireWhitespace();
1725    if (tryRead('%')) {
1726      peFlag = true;
1727      requireWhitespace();
1728    }
1729
1730				// Read the entity name, and prepend
1731				// '%' if necessary.
1732    name = readNmtoken(true);
1733    if (peFlag) {
1734      name = "%" + name;
1735    }
1736
1737				// Read the entity value.
1738    requireWhitespace();
1739    c = readCh();
1740    unread(c);
1741    if (c == '"' || c == '\'') {
1742				// Internal entity.
1743      context = CONTEXT_ENTITYVALUE;
1744      value = readLiteral(LIT_CHAR_REF|LIT_PE_REF);
1745      context = CONTEXT_DTD;
1746      setInternalEntity(name,value);
1747    } else {
1748				// Read the external IDs
1749      ids = readExternalIds(false);
1750      if (ids[1] == null) {
1751	error("system identifer missing", name, null);
1752      }
1753
1754				// Check for NDATA declaration.
1755      skipWhitespace();
1756      if (tryRead("NDATA")) {
1757	requireWhitespace();
1758	notationName = readNmtoken(true);
1759	setExternalDataEntity(name, ids[0], ids[1], notationName);
1760      } else {
1761	setExternalTextEntity(name, ids[0], ids[1]);
1762      }
1763    }
1764
1765				// Finish the declaration.
1766    skipWhitespace();
1767    require('>');
1768  }
1769
1770
1771  /**
1772    * Parse a notation declaration.
1773    * [81] NotationDecl ::= '<!NOTATION' S %Name S %ExternalID S? '>'
1774    * *NOTE: the '<!NOTATION' has already been read.
1775    */
1776  void parseNotationDecl ()
1777    throws java.lang.Exception
1778  {
1779    String nname, ids[];
1780    
1781
1782    requireWhitespace();
1783    nname = readNmtoken(true);
1784
1785    requireWhitespace();
1786
1787				// Read the external identifiers.
1788    ids = readExternalIds(true);
1789    if (ids[0] == null && ids[1] == null) {
1790      error("external identifer missing", nname, null);
1791    }
1792
1793				// Register the notation.
1794    setNotation(nname, ids[0], ids[1]);
1795
1796    skipWhitespace();
1797    require('>');
1798  }
1799
1800
1801  /**
1802    * Parse PCDATA.
1803    * <pre>
1804    * [16] PCData ::= [^&lt;&amp;]*
1805    * </pre>
1806    * <p>The trick here is that the data stays in the dataBuffer without
1807    * necessarily being converted to a string right away.
1808    */
1809  void parsePCData ()
1810    throws java.lang.Exception
1811  {
1812    char c;
1813
1814				// Start with a little cheat -- in most
1815				// cases, the entire sequence of
1816				// character data will already be in
1817				// the readBuffer; if not, fall through to
1818				// the normal approach.
1819    if (USE_CHEATS) {
1820      int lineAugment = 0;
1821      int columnAugment = 0;
1822
1823      loop: for (int i = readBufferPos; i < readBufferLength; i++) {
1824	switch (readBuffer[i]) {
1825	case '\n':
1826	  lineAugment++;
1827	  columnAugment = 0;
1828	  break;
1829	case '&':
1830	case '<':
1831	  int start = readBufferPos;
1832	  columnAugment++;
1833	  readBufferPos = i;
1834	  if (lineAugment > 0) {
1835	    line += lineAugment;
1836	    column = columnAugment;
1837	  } else {
1838	    column += columnAugment;
1839	  }
1840	  dataBufferAppend(readBuffer, start, i-start);
1841	  return;
1842	default:
1843	  columnAugment++;
1844	}
1845      }
1846    }
1847
1848				// OK, the cheat didn't work; start over
1849				// and do it by the book.
1850    while (true) {
1851      c = readCh();
1852      switch (c) {
1853      case '<':
1854      case '&':
1855	unread(c);
1856	return;
1857      default:
1858	dataBufferAppend(c);
1859	break;
1860      }
1861    }
1862  }
1863
1864
1865
1866  //////////////////////////////////////////////////////////////////////
1867  // High-level reading and scanning methods.
1868  //////////////////////////////////////////////////////////////////////
1869
1870  /**
1871    * Require whitespace characters.
1872    * [1] S ::= (#x20 | #x9 | #xd | #xa)+
1873    */
1874  void requireWhitespace ()
1875    throws java.lang.Exception
1876  {
1877    char c = readCh();
1878    if (isWhitespace(c)) {
1879      skipWhitespace();
1880    } else {
1881      error("whitespace expected", c, null);
1882    }
1883  }
1884
1885
1886  /**
1887    * Parse whitespace characters, and leave them in the data buffer.
1888    */
1889  void parseWhitespace ()
1890    throws java.lang.Exception
1891  {
1892    char c = readCh();
1893    while (isWhitespace(c)) {
1894      dataBufferAppend(c);
1895      c = readCh();
1896    }
1897    unread(c);
1898  }
1899
1900
1901  /**
1902    * Skip whitespace characters.
1903    * [1] S ::= (#x20 | #x9 | #xd | #xa)+
1904    */
1905  void skipWhitespace ()
1906    throws java.lang.Exception
1907  {
1908				// Start with a little cheat.  Most of
1909				// the time, the white space will fall
1910				// within the current read buffer; if
1911				// not, then fall through.
1912    if (USE_CHEATS) {
1913      int lineAugment = 0;
1914      int columnAugment = 0;
1915
1916      loop: for (int i = readBufferPos; i < readBufferLength; i++) {
1917	switch (readBuffer[i]) {
1918	case ' ':
1919	case '\t':
1920	case '\r':
1921	  columnAugment++;
1922	  break;
1923	case '\n':
1924	  lineAugment++;
1925	  columnAugment = 0;
1926	  break;
1927	case '%':
1928	  if (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE) {
1929	    break loop;
1930	  } // else fall through...
1931	default:
1932	  readBufferPos = i;
1933	  if (lineAugment > 0) {
1934	    line += lineAugment;
1935	    column = columnAugment;
1936	  } else {
1937	    column += columnAugment;
1938	  }
1939	  return;
1940	}
1941      }
1942    }
1943
1944				// OK, do it by the book.
1945    char c = readCh();
1946    while (isWhitespace(c)) {
1947      c = readCh();
1948    }
1949    unread(c);
1950  }
1951
1952
1953  /**
1954    * Read a name or name token.
1955    * [5] Name ::= (Letter | '_' | ':') (NameChar)*
1956    * [7] Nmtoken ::= (NameChar)+
1957    * *NOTE: [6] is implemented implicitly where required.
1958    */
1959  String readNmtoken (boolean isName)
1960    throws java.lang.Exception
1961  {
1962    char c;
1963
1964    if (USE_CHEATS) {
1965      loop: for (int i = readBufferPos; i < readBufferLength; i++) {
1966	switch (readBuffer[i]) {
1967	case '%':
1968	  if (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE) {
1969	    break loop;
1970	  } // else fall through...
1971	case '<':
1972	case '>':
1973	case '&':
1974	case ',':
1975	case '|':
1976	case '*':
1977	case '+':
1978	case '?':
1979	case ')':
1980	case '=':
1981	case '\'':
1982	case '"':
1983	case '[':
1984	case ' ':
1985	case '\t':
1986	case '\r':
1987	case '\n':
1988	case ';':
1989	case '/':
1990	case '#':
1991	  int start = readBufferPos;
1992	  if (i == start) {
1993	    error("name expected", readBuffer[i], null);
1994	  }
1995	  readBufferPos = i;
1996	  return intern(readBuffer, start, i - start);
1997	}
1998      }
1999    }
2000
2001    nameBufferPos = 0;
2002
2003				// Read the first character.
2004    loop: while (true) {
2005      c = readCh();
2006      switch (c) {
2007      case '%':
2008      case '<':
2009      case '>':
2010      case '&':
2011      case ',':
2012      case '|':
2013      case '*':
2014      case '+':
2015      case '?':
2016      case ')':
2017      case '=':
2018      case '\'':
2019      case '"':
2020      case '[':
2021      case ' ':
2022      case '\t':
2023      case '\n':
2024      case '\r':
2025      case ';':
2026      case '/':
2027	unread(c);
2028	if (nameBufferPos == 0) {
2029	  error("name expected", null, null);
2030	}
2031	String s = intern(nameBuffer,0,nameBufferPos);
2032	nameBufferPos = 0;
2033	return s;
2034      default:
2035	nameBuffer =
2036	  (char[])extendArray(nameBuffer, nameBuffer.length, nameBufferPos);
2037	nameBuffer[nameBufferPos++] = c;
2038      }
2039    }
2040  }
2041
2042
2043  /**
2044    * Read a literal.
2045    * [10] AttValue ::= '"' ([^<&"] | Referenc

Large files files are truncated, but you can click here to view the full file