PageRenderTime 5435ms CodeModel.GetById 157ms RepoModel.GetById 1ms app.codeStats 1ms

/projects/jedit-4.3.2/jEdit/com/microstar/xml/XmlParser.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 1942 lines | 1158 code | 213 blank | 571 comment | 172 complexity | 92c614be86777980dd15a16fec1c6c38 MD5 | raw file
  1. // XmlParser.java: the main parser class.
  2. // NO WARRANTY! See README, and copyright below.
  3. // $Id: XmlParser.java 14583 2009-02-06 05:38:53Z ezust $
  4. package com.microstar.xml;
  5. import java.io.BufferedInputStream;
  6. import java.io.EOFException;
  7. import java.io.InputStream;
  8. import java.io.Reader;
  9. import java.net.URL;
  10. import java.net.URLConnection;
  11. import java.util.Enumeration;
  12. import java.util.Hashtable;
  13. import java.util.Stack;
  14. /**
  15. * Parse XML documents and return parse events through call-backs.
  16. * <p>You need to define a class implementing the <code>XmlHandler</code>
  17. * interface: an object belonging to this class will receive the
  18. * callbacks for the events. (As an alternative to implementing
  19. * the full XmlHandler interface, you can simply extend the
  20. * <code>HandlerBase</code> convenience class.)
  21. * <p>Usage (assuming that <code>MyHandler</code> is your implementation
  22. * of the <code>XmlHandler</code> interface):
  23. * <pre>
  24. * XmlHandler handler = new MyHandler();
  25. * XmlParser parser = new XmlParser();
  26. * parser.setHandler(handler);
  27. * try {
  28. * parser.parse("http://www.host.com/doc.xml", null);
  29. * } catch (Exception e) {
  30. * [do something interesting]
  31. * }
  32. * </pre>
  33. * @author Copyright (c) 1997, 1998 by Microstar Software Ltd.
  34. * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
  35. * @version 1.1
  36. * @deprecated use org.xml.sax.XMLReader
  37. */
  38. public class XmlParser {
  39. //
  40. // Use special cheats that speed up the code (currently about 50%),
  41. // but may cause problems with future maintenance and add to the
  42. // class file size (about 500 bytes).
  43. //
  44. private final static boolean USE_CHEATS = true;
  45. //////////////////////////////////////////////////////////////////////
  46. // Constructors.
  47. ////////////////////////////////////////////////////////////////////////
  48. /**
  49. * Construct a new parser with no associated handler.
  50. * @see #setHandler
  51. * @see #parse
  52. */
  53. public XmlParser ()
  54. {
  55. }
  56. /**
  57. * Set the handler that will receive parsing events.
  58. * @param handler The handler to receive callback events.
  59. * @see #parse
  60. * @see XmlHandler
  61. */
  62. public void setHandler (XmlHandler handler)
  63. {
  64. this.handler = handler;
  65. }
  66. /**
  67. * Parse an XML document from a URI.
  68. * <p>You may parse a document more than once, but only one thread
  69. * may call this method for an object at one time.
  70. * @param systemId The URI of the document.
  71. * @param publicId The public identifier of the document, or null.
  72. * @param encoding The suggested encoding, or null if unknown.
  73. * @exception java.lang.Exception Any exception thrown by your
  74. * own handlers, or any derivation of java.io.IOException
  75. * thrown by the parser itself.
  76. */
  77. public void parse (String systemId, String publicId, String encoding)
  78. throws java.lang.Exception
  79. {
  80. doParse(systemId, publicId, null, null, encoding);
  81. }
  82. /**
  83. * Parse an XML document from a byte stream.
  84. * <p>The URI that you supply will become the base URI for
  85. * resolving relative links, but &AElig;lfred will actually read
  86. * the document from the supplied input stream.
  87. * <p>You may parse a document more than once, but only one thread
  88. * may call this method for an object at one time.
  89. * @param systemId The base URI of the document, or null if not
  90. * known.
  91. * @param publicId The public identifier of the document, or null
  92. * if not known.
  93. * @param stream A byte input stream.
  94. * @param encoding The suggested encoding, or null if unknown.
  95. * @exception java.lang.Exception Any exception thrown by your
  96. * own handlers, or any derivation of java.io.IOException
  97. * thrown by the parser itself.
  98. */
  99. public void parse (String systemId, String publicId,
  100. InputStream stream, String encoding)
  101. throws java.lang.Exception
  102. {
  103. doParse(systemId, publicId, null, stream, encoding);
  104. }
  105. /**
  106. * Parse an XML document from a character stream.
  107. * <p>The URI that you supply will become the base URI for
  108. * resolving relative links, but &AElig;lfred will actually read
  109. * the document from the supplied input stream.
  110. * <p>You may parse a document more than once, but only one thread
  111. * may call this method for an object at one time.
  112. * @param systemId The base URI of the document, or null if not
  113. * known.
  114. * @param publicId The public identifier of the document, or null
  115. * if not known.
  116. * @param reader A character stream.
  117. * @exception java.lang.Exception Any exception thrown by your
  118. * own handlers, or any derivation of java.io.IOException
  119. * thrown by the parser itself.
  120. */
  121. public void parse (String systemId, String publicId, Reader reader)
  122. throws java.lang.Exception
  123. {
  124. doParse(systemId, publicId, reader, null, null);
  125. }
  126. private synchronized void doParse (String systemId, String publicId,
  127. Reader reader, InputStream stream,
  128. String encoding)
  129. throws java.lang.Exception
  130. {
  131. basePublicId = publicId;
  132. baseURI = systemId;
  133. baseReader = reader;
  134. baseInputStream = stream;
  135. initializeVariables();
  136. // Set the default entities here.
  137. setInternalEntity(intern("amp"), "&#38;");
  138. setInternalEntity(intern("lt"), "&#60;");
  139. setInternalEntity(intern("gt"), "&#62;");
  140. setInternalEntity(intern("apos"), "&#39;");
  141. setInternalEntity(intern("quot"), "&#34;");
  142. if (handler != null) {
  143. handler.startDocument();
  144. }
  145. pushURL("[document]", basePublicId, baseURI, baseReader, baseInputStream,
  146. encoding);
  147. parseDocument();
  148. if (handler != null) {
  149. handler.endDocument();
  150. }
  151. cleanupVariables();
  152. }
  153. ////////////////////////////////////////////////////////////////////////
  154. // Constants.
  155. ////////////////////////////////////////////////////////////////////////
  156. //
  157. // Constants for element content type.
  158. //
  159. /**
  160. * Constant: an element has not been declared.
  161. * @see #getElementContentType
  162. */
  163. public final static int CONTENT_UNDECLARED = 0;
  164. /**
  165. * Constant: the element has a content model of ANY.
  166. * @see #getElementContentType
  167. */
  168. public final static int CONTENT_ANY = 1;
  169. /**
  170. * Constant: the element has declared content of EMPTY.
  171. * @see #getElementContentType
  172. */
  173. public final static int CONTENT_EMPTY = 2;
  174. /**
  175. * Constant: the element has mixed content.
  176. * @see #getElementContentType
  177. */
  178. public final static int CONTENT_MIXED = 3;
  179. /**
  180. * Constant: the element has element content.
  181. * @see #getElementContentType
  182. */
  183. public final static int CONTENT_ELEMENTS = 4;
  184. //
  185. // Constants for the entity type.
  186. //
  187. /**
  188. * Constant: the entity has not been declared.
  189. * @see #getEntityType
  190. */
  191. public final static int ENTITY_UNDECLARED = 0;
  192. /**
  193. * Constant: the entity is internal.
  194. * @see #getEntityType
  195. */
  196. public final static int ENTITY_INTERNAL = 1;
  197. /**
  198. * Constant: the entity is external, non-XML data.
  199. * @see #getEntityType
  200. */
  201. public final static int ENTITY_NDATA = 2;
  202. /**
  203. * Constant: the entity is external XML data.
  204. * @see #getEntityType
  205. */
  206. public final static int ENTITY_TEXT = 3;
  207. //
  208. // Constants for attribute type.
  209. //
  210. /**
  211. * Constant: the attribute has not been declared for this element type.
  212. * @see #getAttributeType
  213. */
  214. public final static int ATTRIBUTE_UNDECLARED = 0;
  215. /**
  216. * Constant: the attribute value is a string value.
  217. * @see #getAttributeType
  218. */
  219. public final static int ATTRIBUTE_CDATA = 1;
  220. /**
  221. * Constant: the attribute value is a unique identifier.
  222. * @see #getAttributeType
  223. */
  224. public final static int ATTRIBUTE_ID = 2;
  225. /**
  226. * Constant: the attribute value is a reference to a unique identifier.
  227. * @see #getAttributeType
  228. */
  229. public final static int ATTRIBUTE_IDREF = 3;
  230. /**
  231. * Constant: the attribute value is a list of ID references.
  232. * @see #getAttributeType
  233. */
  234. public final static int ATTRIBUTE_IDREFS = 4;
  235. /**
  236. * Constant: the attribute value is the name of an entity.
  237. * @see #getAttributeType
  238. */
  239. public final static int ATTRIBUTE_ENTITY = 5;
  240. /**
  241. * Constant: the attribute value is a list of entity names.
  242. * @see #getAttributeType
  243. */
  244. public final static int ATTRIBUTE_ENTITIES = 6;
  245. /**
  246. * Constant: the attribute value is a name token.
  247. * @see #getAttributeType
  248. */
  249. public final static int ATTRIBUTE_NMTOKEN = 7;
  250. /**
  251. * Constant: the attribute value is a list of name tokens.
  252. * @see #getAttributeType
  253. */
  254. public final static int ATTRIBUTE_NMTOKENS = 8;
  255. /**
  256. * Constant: the attribute value is a token from an enumeration.
  257. * @see #getAttributeType
  258. */
  259. public final static int ATTRIBUTE_ENUMERATED = 9;
  260. /**
  261. * Constant: the attribute is the name of a notation.
  262. * @see #getAttributeType
  263. */
  264. public final static int ATTRIBUTE_NOTATION = 10;
  265. //
  266. // When the class is loaded, populate the hash table of
  267. // attribute types.
  268. //
  269. /**
  270. * Hash table of attribute types.
  271. */
  272. private static Hashtable attributeTypeHash;
  273. static {
  274. attributeTypeHash = new Hashtable();
  275. attributeTypeHash.put("CDATA", new Integer(ATTRIBUTE_CDATA));
  276. attributeTypeHash.put("ID", new Integer(ATTRIBUTE_ID));
  277. attributeTypeHash.put("IDREF", new Integer(ATTRIBUTE_IDREF));
  278. attributeTypeHash.put("IDREFS", new Integer(ATTRIBUTE_IDREFS));
  279. attributeTypeHash.put("ENTITY", new Integer(ATTRIBUTE_ENTITY));
  280. attributeTypeHash.put("ENTITIES", new Integer(ATTRIBUTE_ENTITIES));
  281. attributeTypeHash.put("NMTOKEN", new Integer(ATTRIBUTE_NMTOKEN));
  282. attributeTypeHash.put("NMTOKENS", new Integer(ATTRIBUTE_NMTOKENS));
  283. attributeTypeHash.put("NOTATION", new Integer(ATTRIBUTE_NOTATION));
  284. }
  285. //
  286. // Constants for supported encodings.
  287. //
  288. private final static int ENCODING_UTF_8 = 1;
  289. private final static int ENCODING_ISO_8859_1 = 2;
  290. private final static int ENCODING_UCS_2_12 = 3;
  291. private final static int ENCODING_UCS_2_21 = 4;
  292. private final static int ENCODING_UCS_4_1234 = 5;
  293. private final static int ENCODING_UCS_4_4321 = 6;
  294. private final static int ENCODING_UCS_4_2143 = 7;
  295. private final static int ENCODING_UCS_4_3412 = 8;
  296. //
  297. // Constants for attribute default value.
  298. //
  299. /**
  300. * Constant: the attribute is not declared.
  301. * @see #getAttributeDefaultValueType
  302. */
  303. public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 0;
  304. /**
  305. * Constant: the attribute has a literal default value specified.
  306. * @see #getAttributeDefaultValueType
  307. * @see #getAttributeDefaultValue
  308. */
  309. public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 1;
  310. /**
  311. * Constant: the attribute was declared #IMPLIED.
  312. * @see #getAttributeDefaultValueType
  313. */
  314. public final static int ATTRIBUTE_DEFAULT_IMPLIED = 2;
  315. /**
  316. * Constant: the attribute was declared #REQUIRED.
  317. * @see #getAttributeDefaultValueType
  318. */
  319. public final static int ATTRIBUTE_DEFAULT_REQUIRED = 3;
  320. /**
  321. * Constant: the attribute was declared #FIXED.
  322. * @see #getAttributeDefaultValueType
  323. * @see #getAttributeDefaultValue
  324. */
  325. public final static int ATTRIBUTE_DEFAULT_FIXED = 4;
  326. //
  327. // Constants for input.
  328. //
  329. private final static int INPUT_NONE = 0;
  330. private final static int INPUT_INTERNAL = 1;
  331. private final static int INPUT_EXTERNAL = 2;
  332. private final static int INPUT_STREAM = 3;
  333. private final static int INPUT_BUFFER = 4;
  334. private final static int INPUT_READER = 5;
  335. //
  336. // Flags for reading literals.
  337. //
  338. private final static int LIT_CHAR_REF = 1;
  339. private final static int LIT_ENTITY_REF = 2;
  340. private final static int LIT_PE_REF = 4;
  341. private final static int LIT_NORMALIZE = 8;
  342. //
  343. // Flags for parsing context.
  344. //
  345. private final static int CONTEXT_NONE = 0;
  346. private final static int CONTEXT_DTD = 1;
  347. private final static int CONTEXT_ENTITYVALUE = 2;
  348. private final static int CONTEXT_ATTRIBUTEVALUE = 3;
  349. //////////////////////////////////////////////////////////////////////
  350. // Error reporting.
  351. //////////////////////////////////////////////////////////////////////
  352. /**
  353. * Report an error.
  354. * @param message The error message.
  355. * @param textFound The text that caused the error (or null).
  356. * @see XmlHandler#error
  357. * @see #line
  358. */
  359. void error (String message, String textFound, String textExpected)
  360. throws java.lang.Exception
  361. {
  362. errorCount++;
  363. if (textFound != null) {
  364. message = message + " (found \"" + textFound + "\")";
  365. }
  366. if (textExpected != null) {
  367. message = message + " (expected \"" + textExpected + "\")";
  368. }
  369. if (handler != null) {
  370. String uri = null;
  371. if (externalEntity != null) {
  372. uri = externalEntity.getURL().toString();
  373. }
  374. handler.error(message, uri, line, column);
  375. }
  376. }
  377. /**
  378. * Report a serious error.
  379. * @param message The error message.
  380. * @param textFound The text that caused the error (or null).
  381. */
  382. void error (String message, char textFound, String textExpected)
  383. throws java.lang.Exception
  384. {
  385. error(message, new Character(textFound).toString(), textExpected);
  386. }
  387. //////////////////////////////////////////////////////////////////////
  388. // Major syntactic productions.
  389. //////////////////////////////////////////////////////////////////////
  390. /**
  391. * Parse an XML document.
  392. * <pre>
  393. * [1] document ::= prolog element Misc*
  394. * </pre>
  395. * <p>This is the top-level parsing function for a single XML
  396. * document. As a minimum, a well-formed document must have
  397. * a document element, and a valid document must have a prolog
  398. * as well.
  399. */
  400. void parseDocument ()
  401. throws java.lang.Exception
  402. {
  403. char c;
  404. parseProlog();
  405. require('<');
  406. parseElement();
  407. try
  408. {
  409. parseMisc(); //skip all white, PIs, and comments
  410. c=readCh(); //if this doesn't throw an exception...
  411. error("unexpected characters after document end",c,null);
  412. }
  413. catch (EOFException e)
  414. {return;}
  415. }
  416. /**
  417. * Skip a comment.
  418. * <pre>
  419. * [18] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
  420. * </pre>
  421. * <p>(The <code>&lt;!--</code> has already been read.)
  422. */
  423. void parseComment ()
  424. throws java.lang.Exception
  425. {
  426. skipUntil("-->");
  427. }
  428. /**
  429. * Parse a processing instruction and do a call-back.
  430. * <pre>
  431. * [19] PI ::= '&lt;?' Name (S (Char* - (Char* '?&gt;' Char*)))? '?&gt;'
  432. * </pre>
  433. * <p>(The <code>&lt;?</code> has already been read.)
  434. * <p>An XML processing instruction <em>must</em> begin with
  435. * a Name, which is the instruction's target.
  436. */
  437. void parsePI ()
  438. throws java.lang.Exception
  439. {
  440. String name;
  441. name = readNmtoken(true);
  442. if (!tryRead("?>")) {
  443. requireWhitespace();
  444. parseUntil("?>");
  445. }
  446. if (handler != null) {
  447. handler.processingInstruction(name, dataBufferToString());
  448. }
  449. }
  450. /**
  451. * Parse a CDATA marked section.
  452. * <pre>
  453. * [20] CDSect ::= CDStart CData CDEnd
  454. * [21] CDStart ::= '&lt;![CDATA['
  455. * [22] CData ::= (Char* - (Char* ']]&gt;' Char*))
  456. * [23] CDEnd ::= ']]&gt;'
  457. * </pre>
  458. * <p>(The '&lt;![CDATA[' has already been read.)
  459. * <p>Note that this just appends characters to the dataBuffer,
  460. * without actually generating an event.
  461. */
  462. void parseCDSect ()
  463. throws java.lang.Exception
  464. {
  465. parseUntil("]]>");
  466. }
  467. /**
  468. * Parse the prolog of an XML document.
  469. * <pre>
  470. * [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
  471. * </pre>
  472. * <p>There are a couple of tricks here. First, it is necessary to
  473. * declare the XML default attributes after the DTD (if present)
  474. * has been read. Second, it is not possible to expand general
  475. * references in attribute value literals until after the entire
  476. * DTD (if present) has been parsed.
  477. * <p>We do not look for the XML declaration here, because it is
  478. * handled by pushURL().
  479. * @see pushURL
  480. */
  481. void parseProlog ()
  482. throws java.lang.Exception
  483. {
  484. parseMisc();
  485. if (tryRead("<!DOCTYPE")) {
  486. parseDoctypedecl();
  487. parseMisc();
  488. }
  489. }
  490. /**
  491. * Parse the XML declaration.
  492. * <pre>
  493. * [25] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
  494. * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'")
  495. * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'"
  496. * | S 'standalone' Eq '"' ("yes" | "no") '"'
  497. * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
  498. * </pre>
  499. * <p>([80] to [82] are also significant.)
  500. * <p>(The <code>&lt;?xml</code> and whitespace have already been read.)
  501. * <p>TODO: validate value of standalone.
  502. * @see #parseTextDecl
  503. * @see #checkEncoding
  504. */
  505. void parseXMLDecl (boolean ignoreEncoding)
  506. throws java.lang.Exception
  507. {
  508. String version;
  509. String encodingName = null;
  510. String standalone = null;
  511. // Read the version.
  512. require("version");
  513. parseEq();
  514. version = readLiteral(0);
  515. if (!version.equals("1.0")) {
  516. error("unsupported XML version", version, "1.0");
  517. }
  518. // Try reading an encoding declaration.
  519. skipWhitespace();
  520. if (tryRead("encoding")) {
  521. parseEq();
  522. encodingName = readLiteral(0);
  523. checkEncoding(encodingName, ignoreEncoding);
  524. }
  525. // Try reading a standalone declaration
  526. skipWhitespace();
  527. if (tryRead("standalone")) {
  528. parseEq();
  529. standalone = readLiteral(0);
  530. }
  531. skipWhitespace();
  532. require("?>");
  533. }
  534. /**
  535. * Parse the Encoding PI.
  536. * <pre>
  537. * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
  538. * [79] EncodingPI ::= '&lt;?xml' S 'encoding' Eq QEncoding S? '?&gt;'
  539. * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'"
  540. * [81] Encoding ::= LatinName
  541. * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
  542. * </pre>
  543. * <p>(The <code>&lt;?xml</code>' and whitespace have already been read.)
  544. * @see #parseXMLDecl
  545. * @see #checkEncoding
  546. */
  547. void parseTextDecl (boolean ignoreEncoding)
  548. throws java.lang.Exception
  549. {
  550. String encodingName = null;
  551. // Read an optional version.
  552. if (tryRead("version")) {
  553. String version;
  554. parseEq();
  555. version = readLiteral(0);
  556. if (!version.equals("1.0")) {
  557. error("unsupported XML version", version, "1.0");
  558. }
  559. requireWhitespace();
  560. }
  561. // Read the encoding.
  562. require("encoding");
  563. parseEq();
  564. encodingName = readLiteral(0);
  565. checkEncoding(encodingName, ignoreEncoding);
  566. skipWhitespace();
  567. require("?>");
  568. }
  569. /**
  570. * Check that the encoding specified makes sense.
  571. * <p>Compare what the author has specified in the XML declaration
  572. * or encoding PI with what we have detected.
  573. * <p>This is also important for distinguishing among the various
  574. * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect
  575. * those).
  576. * @param encodingName The name of the encoding specified by the user.
  577. * @see #parseXMLDecl
  578. * @see #parseTextDecl
  579. */
  580. void checkEncoding (String encodingName, boolean ignoreEncoding)
  581. throws java.lang.Exception
  582. {
  583. encodingName = encodingName.toUpperCase();
  584. if (ignoreEncoding) {
  585. return;
  586. }
  587. switch (encoding) {
  588. // 8-bit encodings
  589. case ENCODING_UTF_8:
  590. if (encodingName.equals("ISO-8859-1")) {
  591. encoding = ENCODING_ISO_8859_1;
  592. } else if (!encodingName.equals("UTF-8")) {
  593. error("unsupported 8-bit encoding",
  594. encodingName,
  595. "UTF-8 or ISO-8859-1");
  596. }
  597. break;
  598. // 16-bit encodings
  599. case ENCODING_UCS_2_12:
  600. case ENCODING_UCS_2_21:
  601. if (!encodingName.equals("ISO-10646-UCS-2") &&
  602. !encodingName.equals("UTF-16")) {
  603. error("unsupported 16-bit encoding",
  604. encodingName,
  605. "ISO-10646-UCS-2");
  606. }
  607. break;
  608. // 32-bit encodings
  609. case ENCODING_UCS_4_1234:
  610. case ENCODING_UCS_4_4321:
  611. case ENCODING_UCS_4_2143:
  612. case ENCODING_UCS_4_3412:
  613. if (!encodingName.equals("ISO-10646-UCS-4")) {
  614. error("unsupported 32-bit encoding",
  615. encodingName,
  616. "ISO-10646-UCS-4");
  617. }
  618. }
  619. }
  620. /**
  621. * Parse miscellaneous markup outside the document element and DOCTYPE
  622. * declaration.
  623. * <pre>
  624. * [27] Misc ::= Comment | PI | S
  625. * </pre>
  626. */
  627. void parseMisc ()
  628. throws java.lang.Exception
  629. {
  630. while (true)
  631. {
  632. skipWhitespace();
  633. if (tryRead("<?"))
  634. {parsePI();}
  635. else if (tryRead("<!--"))
  636. {parseComment();}
  637. else
  638. {return;}
  639. }
  640. }
  641. /**
  642. * Parse a document type declaration.
  643. * <pre>
  644. * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
  645. * ('[' %markupdecl* ']' S?)? '&gt;'
  646. * </pre>
  647. * <p>(The <code>&lt;!DOCTYPE</code> has already been read.)
  648. */
  649. void parseDoctypedecl ()
  650. throws java.lang.Exception
  651. {
  652. char c;
  653. String doctypeName, ids[];
  654. // Read the document type name.
  655. requireWhitespace();
  656. doctypeName = readNmtoken(true);
  657. // Read the ExternalIDs.
  658. skipWhitespace();
  659. ids = readExternalIds(false);
  660. // Look for a declaration subset.
  661. skipWhitespace();
  662. if (tryRead('[')) {
  663. // loop until the subset ends
  664. while (true) {
  665. context = CONTEXT_DTD;
  666. skipWhitespace();
  667. context = CONTEXT_NONE;
  668. if (tryRead(']')) {
  669. break; // end of subset
  670. } else {
  671. context = CONTEXT_DTD;
  672. parseMarkupdecl();
  673. context = CONTEXT_NONE;
  674. }
  675. }
  676. }
  677. // Read the external subset, if any
  678. if (ids[1] != null) {
  679. pushURL("[external subset]", ids[0], ids[1], null, null, null);
  680. // Loop until we end up back at '>'
  681. while (true) {
  682. context = CONTEXT_DTD;
  683. skipWhitespace();
  684. context = CONTEXT_NONE;
  685. if (tryRead('>')) {
  686. break;
  687. } else {
  688. context = CONTEXT_DTD;
  689. parseMarkupdecl();
  690. context = CONTEXT_NONE;
  691. }
  692. }
  693. } else {
  694. // No external subset.
  695. skipWhitespace();
  696. require('>');
  697. }
  698. if (handler != null) {
  699. handler.doctypeDecl(doctypeName, ids[0], ids[1]);
  700. }
  701. // Expand general entities in
  702. // default values of attributes.
  703. // (Do this after the doctypeDecl
  704. // event!).
  705. // expandAttributeDefaultValues();
  706. }
  707. /**
  708. * Parse a markup declaration in the internal or external DTD subset.
  709. * <pre>
  710. * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl |
  711. * %NotationDecl | %PI | %S | %Comment |
  712. * InternalPERef )
  713. * [30] InternalPERef ::= PEReference
  714. * [31] extSubset ::= (%markupdecl | %conditionalSect)*
  715. * </pre>
  716. */
  717. void parseMarkupdecl ()
  718. throws java.lang.Exception
  719. {
  720. if (tryRead("<!ELEMENT")) {
  721. parseElementdecl();
  722. } else if (tryRead("<!ATTLIST")) {
  723. parseAttlistDecl();
  724. } else if (tryRead("<!ENTITY")) {
  725. parseEntityDecl();
  726. } else if (tryRead("<!NOTATION")) {
  727. parseNotationDecl();
  728. } else if (tryRead("<?")) {
  729. parsePI();
  730. } else if (tryRead("<!--")) {
  731. parseComment();
  732. } else if (tryRead("<![")) {
  733. parseConditionalSect();
  734. } else {
  735. error("expected markup declaration", null, null);
  736. }
  737. }
  738. /**
  739. * Parse an element, with its tags.
  740. * <pre>
  741. * [33] STag ::= '&lt;' Name (S Attribute)* S? '&gt;' [WFC: unique Att spec]
  742. * [38] element ::= EmptyElement | STag content ETag
  743. * [39] EmptyElement ::= '&lt;' Name (S Attribute)* S? '/&gt;'
  744. * [WFC: unique Att spec]
  745. * </pre>
  746. * <p>(The '&lt;' has already been read.)
  747. * <p>NOTE: this method actually chains onto parseContent(), if necessary,
  748. * and parseContent() will take care of calling parseETag().
  749. */
  750. void parseElement ()
  751. throws java.lang.Exception
  752. {
  753. String gi;
  754. char c;
  755. int oldElementContent = currentElementContent;
  756. String oldElement = currentElement;
  757. // This is the (global) counter for the
  758. // array of specified attributes.
  759. tagAttributePos = 0;
  760. // Read the element type name.
  761. gi = readNmtoken(true);
  762. // Determine the current content type.
  763. currentElement = gi;
  764. currentElementContent = getElementContentType(gi);
  765. if (currentElementContent == CONTENT_UNDECLARED) {
  766. currentElementContent = CONTENT_ANY;
  767. }
  768. // Read the attributes, if any.
  769. // After this loop, we should be just
  770. // in front of the closing delimiter.
  771. skipWhitespace();
  772. c = readCh();
  773. while (c != '/' && c != '>') {
  774. unread(c);
  775. parseAttribute(gi);
  776. skipWhitespace();
  777. c = readCh();
  778. }
  779. unread(c);
  780. // Supply any defaulted attributes.
  781. Enumeration atts = declaredAttributes(gi);
  782. if (atts != null) {
  783. String aname;
  784. loop: while (atts.hasMoreElements()) {
  785. aname = (String)atts.nextElement();
  786. // See if it was specified.
  787. for (int i = 0; i < tagAttributePos; i++) {
  788. if (tagAttributes[i] == aname) {
  789. continue loop;
  790. }
  791. }
  792. // I guess not...
  793. if (handler != null) {
  794. handler.attribute(aname,
  795. getAttributeExpandedValue(gi, aname),
  796. false);
  797. }
  798. }
  799. }
  800. // Figure out if this is a start tag
  801. // or an empty element, and dispatch an
  802. // event accordingly.
  803. c = readCh();
  804. switch (c) {
  805. case '>':
  806. if (handler != null) {
  807. handler.startElement(gi);
  808. }
  809. parseContent();
  810. break;
  811. case '/':
  812. require('>');
  813. if (handler != null) {
  814. handler.startElement(gi);
  815. handler.endElement(gi);
  816. }
  817. break;
  818. }
  819. // Restore the previous state.
  820. currentElement = oldElement;
  821. currentElementContent = oldElementContent;
  822. }
  823. /**
  824. * Parse an attribute assignment.
  825. * <pre>
  826. * [34] Attribute ::= Name Eq AttValue
  827. * </pre>
  828. * @param name The name of the attribute's element.
  829. * @see XmlHandler#attribute
  830. */
  831. void parseAttribute (String name)
  832. throws java.lang.Exception
  833. {
  834. String aname;
  835. int type;
  836. String value;
  837. // Read the attribute name.
  838. aname = readNmtoken(true).intern();
  839. type = getAttributeDefaultValueType(name, aname);
  840. // Parse '='
  841. parseEq();
  842. // Read the value, normalizing whitespace
  843. // if it is not CDATA.
  844. if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) {
  845. value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF);
  846. } else {
  847. value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF | LIT_NORMALIZE);
  848. }
  849. // Inform the handler about the
  850. // attribute.
  851. if (handler != null) {
  852. handler.attribute(aname, value, true);
  853. }
  854. dataBufferPos = 0;
  855. // Note that the attribute has been
  856. // specified.
  857. if (tagAttributePos == tagAttributes.length) {
  858. String newAttrib[] = new String[tagAttributes.length * 2];
  859. System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
  860. tagAttributes = newAttrib;
  861. }
  862. tagAttributes[tagAttributePos++] = aname;
  863. }
  864. /**
  865. * Parse an equals sign surrounded by optional whitespace.
  866. * [35] Eq ::= S? '=' S?
  867. */
  868. void parseEq ()
  869. throws java.lang.Exception
  870. {
  871. skipWhitespace();
  872. require('=');
  873. skipWhitespace();
  874. }
  875. /**
  876. * Parse an end tag.
  877. * [36] ETag ::= '</' Name S? '>'
  878. * *NOTE: parseContent() chains to here.
  879. */
  880. void parseETag ()
  881. throws java.lang.Exception
  882. {
  883. String name;
  884. name = readNmtoken(true);
  885. if (name != currentElement) {
  886. error("mismatched end tag", name, currentElement);
  887. }
  888. skipWhitespace();
  889. require('>');
  890. if (handler != null) {
  891. handler.endElement(name);
  892. }
  893. }
  894. /**
  895. * Parse the content of an element.
  896. * [37] content ::= (element | PCData | Reference | CDSect | PI | Comment)*
  897. * [68] Reference ::= EntityRef | CharRef
  898. */
  899. void parseContent ()
  900. throws java.lang.Exception
  901. {
  902. String data;
  903. char c;
  904. while (true) {
  905. switch (currentElementContent) {
  906. case CONTENT_ANY:
  907. case CONTENT_MIXED:
  908. parsePCData();
  909. break;
  910. case CONTENT_ELEMENTS:
  911. parseWhitespace();
  912. break;
  913. }
  914. // Handle delimiters
  915. c = readCh();
  916. switch (c) {
  917. case '&': // Found "&"
  918. c = readCh();
  919. if (c == '#') {
  920. parseCharRef();
  921. } else {
  922. unread(c);
  923. parseEntityRef(true);
  924. }
  925. break;
  926. case '<': // Found "<"
  927. c = readCh();
  928. switch (c) {
  929. case '!': // Found "<!"
  930. c = readCh();
  931. switch (c) {
  932. case '-': // Found "<!-"
  933. require('-');
  934. parseComment();
  935. break;
  936. case '[': // Found "<!["
  937. require("CDATA[");
  938. parseCDSect();
  939. break;
  940. default:
  941. error("expected comment or CDATA section", c, null);
  942. break;
  943. }
  944. break;
  945. case '?': // Found "<?"
  946. dataBufferFlush();
  947. parsePI();
  948. break;
  949. case '/': // Found "</"
  950. dataBufferFlush();
  951. parseETag();
  952. return;
  953. default: // Found "<" followed by something else
  954. dataBufferFlush();
  955. unread(c);
  956. parseElement();
  957. break;
  958. }
  959. }
  960. }
  961. }
  962. /**
  963. * Parse an element type declaration.
  964. * [40] elementdecl ::= '<!ELEMENT' S %Name S (%S S)? %contentspec S? '>'
  965. * [VC: Unique Element Declaration]
  966. * *NOTE: the '<!ELEMENT' has already been read.
  967. */
  968. void parseElementdecl ()
  969. throws java.lang.Exception
  970. {
  971. String name;
  972. requireWhitespace();
  973. // Read the element type name.
  974. name = readNmtoken(true);
  975. requireWhitespace();
  976. // Read the content model.
  977. parseContentspec(name);
  978. skipWhitespace();
  979. require('>');
  980. }
  981. /**
  982. * Content specification.
  983. * [41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
  984. */
  985. void parseContentspec (String name)
  986. throws java.lang.Exception
  987. {
  988. if (tryRead("EMPTY")) {
  989. setElement(name, CONTENT_EMPTY, null, null);
  990. return;
  991. } else if (tryRead("ANY")) {
  992. setElement(name, CONTENT_ANY, null, null);
  993. return;
  994. } else {
  995. require('(');
  996. dataBufferAppend('(');
  997. skipWhitespace();
  998. if (tryRead("#PCDATA")) {
  999. dataBufferAppend("#PCDATA");
  1000. parseMixed();
  1001. setElement(name, CONTENT_MIXED, dataBufferToString(), null);
  1002. } else {
  1003. parseElements();
  1004. setElement(name, CONTENT_ELEMENTS, dataBufferToString(), null);
  1005. }
  1006. }
  1007. }
  1008. /**
  1009. * Parse an element-content model.
  1010. * [42] elements ::= (choice | seq) ('?' | '*' | '+')?
  1011. * [44] cps ::= S? %cp S?
  1012. * [45] choice ::= '(' S? %ctokplus (S? '|' S? %ctoks)* S? ')'
  1013. * [46] ctokplus ::= cps ('|' cps)+
  1014. * [47] ctoks ::= cps ('|' cps)*
  1015. * [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')'
  1016. * [49] stoks ::= cps (',' cps)*
  1017. * *NOTE: the opening '(' and S have already been read.
  1018. * *TODO: go over parameter entity boundaries more carefully.
  1019. */
  1020. void parseElements ()
  1021. throws java.lang.Exception
  1022. {
  1023. char c;
  1024. char sep;
  1025. // Parse the first content particle
  1026. skipWhitespace();
  1027. parseCp();
  1028. // Check for end or for a separator.
  1029. skipWhitespace();
  1030. c = readCh();
  1031. switch (c) {
  1032. case ')':
  1033. dataBufferAppend(')');
  1034. c = readCh();
  1035. switch (c) {
  1036. case '*':
  1037. case '+':
  1038. case '?':
  1039. dataBufferAppend(c);
  1040. break;
  1041. default:
  1042. unread(c);
  1043. }
  1044. return;
  1045. case ',': // Register the separator.
  1046. case '|':
  1047. sep = c;
  1048. dataBufferAppend(c);
  1049. break;
  1050. default:
  1051. error("bad separator in content model", c, null);
  1052. return;
  1053. }
  1054. // Parse the rest of the content model.
  1055. while (true) {
  1056. skipWhitespace();
  1057. parseCp();
  1058. skipWhitespace();
  1059. c = readCh();
  1060. if (c == ')') {
  1061. dataBufferAppend(')');
  1062. break;
  1063. } else if (c != sep) {
  1064. error("bad separator in content model", c, null);
  1065. return;
  1066. } else {
  1067. dataBufferAppend(c);
  1068. }
  1069. }
  1070. // Check for the occurrence indicator.
  1071. c = readCh();
  1072. switch (c) {
  1073. case '?':
  1074. case '*':
  1075. case '+':
  1076. dataBufferAppend(c);
  1077. return;
  1078. default:
  1079. unread(c);
  1080. return;
  1081. }
  1082. }
  1083. /**
  1084. * Parse a content particle.
  1085. * [43] cp ::= (Name | choice | seq) ('?' | '*' | '+')
  1086. * *NOTE: I actually use a slightly different production here:
  1087. * cp ::= (elements | (Name ('?' | '*' | '+')?))
  1088. */
  1089. void parseCp ()
  1090. throws java.lang.Exception
  1091. {
  1092. char c;
  1093. if (tryRead('(')) {
  1094. dataBufferAppend('(');
  1095. parseElements();
  1096. } else {
  1097. dataBufferAppend(readNmtoken(true));
  1098. c = readCh();
  1099. switch (c) {
  1100. case '?':
  1101. case '*':
  1102. case '+':
  1103. dataBufferAppend(c);
  1104. break;
  1105. default:
  1106. unread(c);
  1107. break;
  1108. }
  1109. }
  1110. }
  1111. /**
  1112. * Parse mixed content.
  1113. * [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? %Mtoks)* ) S? ')*'
  1114. * | '(' S? %('#PCDATA') S? ')'
  1115. * [51] Mtoks ::= %Name (S? '|' S? %Name)*
  1116. * *NOTE: the S and '#PCDATA' have already been read.
  1117. */
  1118. void parseMixed ()
  1119. throws java.lang.Exception
  1120. {
  1121. char c;
  1122. // Check for PCDATA alone.
  1123. skipWhitespace();
  1124. if (tryRead(')')) {
  1125. dataBufferAppend(")*");
  1126. tryRead('*');
  1127. return;
  1128. }
  1129. // Parse mixed content.
  1130. skipWhitespace();
  1131. while (!tryRead(")*")) {
  1132. require('|');
  1133. dataBufferAppend('|');
  1134. skipWhitespace();
  1135. dataBufferAppend(readNmtoken(true));
  1136. skipWhitespace();
  1137. }
  1138. dataBufferAppend(")*");
  1139. }
  1140. /**
  1141. * Parse an attribute list declaration.
  1142. * [52] AttlistDecl ::= '<!ATTLIST' S %Name S? %AttDef+ S? '>'
  1143. * *NOTE: the '<!ATTLIST' has already been read.
  1144. */
  1145. void parseAttlistDecl ()
  1146. throws java.lang.Exception
  1147. {
  1148. String elementName;
  1149. requireWhitespace();
  1150. elementName = readNmtoken(true);
  1151. requireWhitespace();
  1152. while (!tryRead('>')) {
  1153. parseAttDef(elementName);
  1154. skipWhitespace();
  1155. }
  1156. }
  1157. /**
  1158. * Parse a single attribute definition.
  1159. * [53] AttDef ::= S %Name S %AttType S %Default
  1160. */
  1161. void parseAttDef (String elementName)
  1162. throws java.lang.Exception
  1163. {
  1164. String name;
  1165. int type;
  1166. String enumeration = null;
  1167. // Read the attribute name.
  1168. name = readNmtoken(true);
  1169. // Read the attribute type.
  1170. requireWhitespace();
  1171. type = readAttType();
  1172. // Get the string of enumerated values
  1173. // if necessary.
  1174. if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) {
  1175. enumeration = dataBufferToString();
  1176. }
  1177. // Read the default value.
  1178. requireWhitespace();
  1179. parseDefault(elementName, name, type, enumeration);
  1180. }
  1181. /**
  1182. * Parse the attribute type.
  1183. * [54] AttType ::= StringType | TokenizedType | EnumeratedType
  1184. * [55] StringType ::= 'CDATA'
  1185. * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' |
  1186. * 'NMTOKEN' | 'NMTOKENS'
  1187. * [57] EnumeratedType ::= NotationType | Enumeration
  1188. * *TODO: validate the type!!
  1189. */
  1190. int readAttType ()
  1191. throws java.lang.Exception
  1192. {
  1193. String typeString;
  1194. Integer type;
  1195. if (tryRead('(')) {
  1196. parseEnumeration();
  1197. return ATTRIBUTE_ENUMERATED;
  1198. } else {
  1199. typeString = readNmtoken(true);
  1200. if (typeString.equals("NOTATION")) {
  1201. parseNotationType();
  1202. }
  1203. type = (Integer)attributeTypeHash.get(typeString);
  1204. if (type == null) {
  1205. error("illegal attribute type", typeString, null);
  1206. return ATTRIBUTE_UNDECLARED;
  1207. } else {
  1208. return type.intValue();
  1209. }
  1210. }
  1211. }
  1212. /**
  1213. * Parse an enumeration.
  1214. * [60] Enumeration ::= '(' S? %Etoks (S? '|' S? %Etoks)* S? ')'
  1215. * [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)*
  1216. * *NOTE: the '(' has already been read.
  1217. */
  1218. void parseEnumeration ()
  1219. throws java.lang.Exception
  1220. {
  1221. char c;
  1222. dataBufferAppend('(');
  1223. // Read the first token.
  1224. skipWhitespace();
  1225. dataBufferAppend(readNmtoken(true));
  1226. // Read the remaining tokens.
  1227. skipWhitespace();
  1228. while (!tryRead(')')) {
  1229. require('|');
  1230. dataBufferAppend('|');
  1231. skipWhitespace();
  1232. dataBufferAppend(readNmtoken(true));
  1233. skipWhitespace();
  1234. }
  1235. dataBufferAppend(')');
  1236. }
  1237. /**
  1238. * Parse a notation type for an attribute.
  1239. * [58] NotationType ::= %'NOTATION' S '(' S? %Ntoks (S? '|' S? %Ntoks)*
  1240. * S? ')'
  1241. * [59] Ntoks ::= %Name (S? '|' S? %Name)
  1242. * *NOTE: the 'NOTATION' has already been read
  1243. */
  1244. void parseNotationType ()
  1245. throws java.lang.Exception
  1246. {
  1247. requireWhitespace();
  1248. require('(');
  1249. parseEnumeration();
  1250. }
  1251. /**
  1252. * Parse the default value for an attribute.
  1253. * [62] Default ::= '#REQUIRED' | '#IMPLIED' | ((%'#FIXED' S)? %AttValue
  1254. */
  1255. void parseDefault (String elementName, String name, int type, String enumeration)
  1256. throws java.lang.Exception
  1257. {
  1258. int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
  1259. String value = null;
  1260. boolean normalizeWSFlag;
  1261. if (tryRead('#')) {
  1262. if (tryRead("FIXED")) {
  1263. valueType = ATTRIBUTE_DEFAULT_FIXED;
  1264. requireWhitespace();
  1265. context = CONTEXT_ATTRIBUTEVALUE;
  1266. value = readLiteral(LIT_CHAR_REF);
  1267. context = CONTEXT_DTD;
  1268. } else if (tryRead("REQUIRED")) {
  1269. valueType = ATTRIBUTE_DEFAULT_REQUIRED;
  1270. } else if (tryRead("IMPLIED")) {
  1271. valueType = ATTRIBUTE_DEFAULT_IMPLIED;
  1272. } else {
  1273. error("illegal keyword for attribute default value", null, null);
  1274. }
  1275. } else {
  1276. context = CONTEXT_ATTRIBUTEVALUE;
  1277. value = readLiteral(LIT_CHAR_REF);
  1278. context = CONTEXT_DTD;
  1279. }
  1280. setAttribute(elementName, name, type, enumeration, value, valueType);
  1281. }
  1282. /**
  1283. * Parse a conditional section.
  1284. * [63] conditionalSect ::= includeSect || ignoreSect
  1285. * [64] includeSect ::= '<![' %'INCLUDE' '[' (%markupdecl*)* ']]>'
  1286. * [65] ignoreSect ::= '<![' %'IGNORE' '[' ignoreSectContents* ']]>'
  1287. * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]>'))
  1288. * | ('<![' ignoreSectContents* ']]>')
  1289. * | (Char - (']' | [<'"]))
  1290. * | ('<!' (Char - ('-' | '[')))
  1291. * *NOTE: the '<![' has already been read.
  1292. * *TODO: verify that I am handling ignoreSectContents right.
  1293. */
  1294. void parseConditionalSect ()
  1295. throws java.lang.Exception
  1296. {
  1297. skipWhitespace();
  1298. if (tryRead("INCLUDE")) {
  1299. skipWhitespace();
  1300. require('[');
  1301. skipWhitespace();
  1302. while (!tryRead("]]>")) {
  1303. parseMarkupdecl();
  1304. skipWhitespace();
  1305. }
  1306. } else if (tryRead("IGNORE")) {
  1307. skipWhitespace();
  1308. require('[');
  1309. int nesting = 1;
  1310. char c;
  1311. for (int nest = 1; nest > 0; ) {
  1312. c = readCh();
  1313. switch (c) {
  1314. case '<':
  1315. if (tryRead("![")) {
  1316. nest++;
  1317. }
  1318. case ']':
  1319. if (tryRead("]>")) {
  1320. nest--;
  1321. }
  1322. }
  1323. }
  1324. } else {
  1325. error("conditional section must begin with INCLUDE or IGNORE",
  1326. null, null);
  1327. }
  1328. }
  1329. /**
  1330. * Read a character reference.
  1331. * [67] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
  1332. * *NOTE: the '&#' has already been read.
  1333. */
  1334. void parseCharRef ()
  1335. throws java.lang.Exception
  1336. {
  1337. int value = 0;
  1338. char c;
  1339. if (tryRead('x')) {
  1340. loop1: while (true) {
  1341. c = readCh();
  1342. switch (c) {
  1343. case '0':
  1344. case '1':
  1345. case '2':
  1346. case '3':
  1347. case '4':
  1348. case '5':
  1349. case '6':
  1350. case '7':
  1351. case '8':
  1352. case '9':
  1353. case 'a':
  1354. case 'A':
  1355. case 'b':
  1356. case 'B':
  1357. case 'c':
  1358. case 'C':
  1359. case 'd':
  1360. case 'D':
  1361. case 'e':
  1362. case 'E':
  1363. case 'f':
  1364. case 'F':
  1365. value *= 16;
  1366. value += Integer.parseInt(new Character(c).toString(), 16);
  1367. break;
  1368. case ';':
  1369. break loop1;
  1370. default:
  1371. error("illegal character in character reference", c, null);
  1372. break loop1;
  1373. }
  1374. }
  1375. } else {
  1376. loop2: while (true) {
  1377. c = readCh();
  1378. switch (c) {
  1379. case '0':
  1380. case '1':
  1381. case '2':
  1382. case '3':
  1383. case '4':
  1384. case '5':
  1385. case '6':
  1386. case '7':
  1387. case '8':
  1388. case '9':
  1389. value *= 10;
  1390. value += Integer.parseInt(new Character(c).toString(), 10);
  1391. break;
  1392. case ';':
  1393. break loop2;
  1394. default:
  1395. error("illegal character in character reference", c, null);
  1396. break loop2;
  1397. }
  1398. }
  1399. }
  1400. // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
  1401. // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
  1402. if (value <= 0x0000ffff) {
  1403. // no surrogates needed
  1404. dataBufferAppend((char)value);
  1405. } else if (value <= 0x000fffff) {
  1406. // > 16 bits, surrogate needed
  1407. dataBufferAppend((char)(0xd8 | ((value & 0x000ffc00) >> 10)));
  1408. dataBufferAppend((char)(0xdc | (value & 0x0003ff)));
  1409. } else {
  1410. // too big for surrogate
  1411. error("character reference " + value + " is too large for UTF-16",
  1412. new Integer(value).toString(), null);
  1413. }
  1414. }
  1415. /**
  1416. * Parse a reference.
  1417. * [69] EntityRef ::= '&' Name ';'
  1418. * *NOTE: the '&' has already been read.
  1419. * @param externalAllowed External entities are allowed here.
  1420. */
  1421. void parseEntityRef (boolean externalAllowed)
  1422. throws java.lang.Exception
  1423. {
  1424. String name;
  1425. name = readNmtoken(true);
  1426. require(';');
  1427. switch (getEntityType(name)) {
  1428. case ENTITY_UNDECLARED:
  1429. error("reference to undeclared entity", name, null);
  1430. break;
  1431. case ENTITY_INTERNAL:
  1432. pushString(name, getEntityValue(name));
  1433. break;
  1434. case ENTITY_TEXT:
  1435. if (externalAllowed) {
  1436. pushURL(name, getEntityPublicId(name),
  1437. getEntitySystemId(name),
  1438. null, null, null);
  1439. } else {
  1440. error("reference to external entity in attribute value.", name, null);
  1441. }
  1442. break;
  1443. case ENTITY_NDATA:
  1444. if (externalAllowed) {
  1445. error("data entity reference in content", name, null);
  1446. } else {
  1447. error("reference to external entity in attribute value.", name, null);
  1448. }
  1449. break;
  1450. }
  1451. }
  1452. /**
  1453. * Parse a parameter entity reference.
  1454. * [70] PEReference ::= '%' Name ';'
  1455. * *NOTE: the '%' has already been read.
  1456. */
  1457. void parsePEReference (boolean isEntityValue)
  1458. throws java.lang.Exception
  1459. {
  1460. String name;
  1461. name = "%" + readNmtoken(true);
  1462. require(';');
  1463. switch (getEntityType(name)) {
  1464. case ENTITY_UNDECLARED:
  1465. error("reference to undeclared parameter entity", name, null);
  1466. break;
  1467. case ENTITY_INTERNAL:
  1468. if (isEntityValue) {
  1469. pushString(name, getEntityValue(name));
  1470. } else {
  1471. pushString(name, " " + getEntityValue(name) + ' ');
  1472. }
  1473. break;
  1474. case ENTITY_TEXT:
  1475. if (isEntityValue) {
  1476. pushString(null, " ");
  1477. }
  1478. pushURL(name, getEntityPublicId(name),
  1479. getEntitySystemId(name),
  1480. null, null, null);
  1481. if (isEntityValue) {
  1482. pushString(null, " ");
  1483. }
  1484. break;
  1485. }
  1486. }
  1487. /**
  1488. * Parse an entity declaration.
  1489. * [71] EntityDecl ::= '<!ENTITY' S %Name S %EntityDef S? '>'
  1490. * | '<!ENTITY' S '%' S %Name S %EntityDef S? '>'
  1491. * [72] EntityDef ::= EntityValue | ExternalDef
  1492. * [73] ExternalDef ::= ExternalID %NDataDecl?
  1493. * [74] ExternalID ::= 'SYSTEM' S SystemLiteral
  1494. * | 'PUBLIC' S PubidLiteral S SystemLiteral
  1495. * [75] NDataDecl ::= S %'NDATA' S %Name
  1496. * *NOTE: the '<!ENTITY' has already been read.
  1497. */
  1498. void parseEntityDecl ()
  1499. throws java.lang.Exception
  1500. {
  1501. char c;
  1502. boolean peFlag = false;
  1503. String name, value, notationName, ids[];
  1504. // Check for a parameter entity.
  1505. requireWhitespace();
  1506. if (tryRead('%')) {
  1507. peFlag = true;
  1508. requireWhitespace();
  1509. }
  1510. // Read the entity name, and prepend
  1511. // '%' if necessary.
  1512. name = readNmtoken(true);
  1513. if (peFlag) {
  1514. name = "%" + name;
  1515. }
  1516. // Read the entity value.
  1517. requireWhitespace();
  1518. c = readCh();
  1519. unread(c);
  1520. if (c == '"' || c == '\'') {
  1521. // Internal entity.
  1522. context = CONTEXT_ENTITYVALUE;
  1523. value = readLiteral(LIT_CHAR_REF|LIT_PE_REF);
  1524. context = CONTEXT_DTD;
  1525. setInternalEntity(name,value);
  1526. } else {
  1527. // Read the external IDs
  1528. ids = readExternalIds(false);
  1529. if (ids[1] == null) {
  1530. error("system identifer missing", name, null);
  1531. }
  1532. // Check for NDATA declaration.
  1533. skipWhitespace();
  1534. if (tryRead("NDATA")) {
  1535. requireWhitespace();
  1536. notationName = readNmtoken(true);
  1537. setExternalDataEntity(name, ids[0], ids[1], notationName);
  1538. } else {
  1539. setExternalTextEntity(name, ids[0], ids[1]);
  1540. }
  1541. }
  1542. // Finish the declaration.
  1543. skipWhitespace();
  1544. require('>');
  1545. }
  1546. /**
  1547. * Parse a notation declaration.
  1548. * [81] NotationDecl ::= '<!NOTATION' S %Name S %ExternalID S? '>'
  1549. * *NOTE: the '<!NOTATION' has already been read.
  1550. */
  1551. void parseNotationDecl ()
  1552. throws java.lang.Exception
  1553. {
  1554. String nname, ids[];
  1555. requireWhitespace();
  1556. nname = readNmtoken(true);
  1557. requireWhitespace();
  1558. // Read the external identifiers.
  1559. ids = readExternalIds(true);
  1560. if (ids[0] == null && ids[1] == null) {
  1561. error("external identifer missing", nname, null);
  1562. }
  1563. // Register the notation.
  1564. setNotation(nname, ids[0], ids[1]);
  1565. skipWhitespace();
  1566. require('>');
  1567. }
  1568. /**
  1569. * Parse PCDATA.
  1570. * <pre>
  1571. * [16] PCData ::= [^&lt;&amp;]*
  1572. * </pre>
  1573. * <p>The trick here is that the data stays in the dataBuffer without
  1574. * necessarily being converted to a string right away.
  1575. */
  1576. void parsePCData ()
  1577. throws java.lang.Exception
  1578. {
  1579. char c;
  1580. // Start with a little cheat -- in most
  1581. // cases, the entire sequence of
  1582. // character data will already be in
  1583. // the readBuffer; if not, fall through to
  1584. // the normal approach.
  1585. if (USE_CHEATS) {
  1586. int lineAugment = 0;
  1587. int columnAugment = 0;
  1588. loop: for (int i = readBufferPos; i < readBufferLength; i++) {
  1589. switch (readBuffer[i]) {
  1590. case '\n':
  1591. lineAugment++;
  1592. columnAugment = 0;
  1593. break;
  1594. case '&':
  1595. case '<':
  1596. int start = readBufferPos;
  1597. columnAugment++;
  1598. readBufferPos = i;
  1599. if (lineAugment > 0) {
  1600. line += lineAugment;
  1601. column = columnAugment;
  1602. } else {
  1603. column += columnAugment;
  1604. }
  1605. dataBufferAppend(readBuffer, start, i-start);
  1606. return;
  1607. default:
  1608. columnAugment++;
  1609. }
  1610. }
  1611. }
  1612. // OK, the cheat didn't work; start over
  1613. // and do it by the book.
  1614. while (true) {
  1615. c = readCh();
  1616. switch (c) {
  1617. case '<':
  1618. case '&':
  1619. unread(c);
  1620. return;
  1621. default:
  1622. dataBufferAppend(c);
  1623. break;
  1624. }
  1625. }
  1626. }
  1627. //////////////////////////////////////////////////////////////////////
  1628. // High-level reading and scanning methods.
  1629. //////////////////////////////////////////////////////////////////////
  1630. /**
  1631. * Require whitespace characters.
  1632. * [1] S ::= (#x20 | #x9 | #xd | #xa)+
  1633. */
  1634. void requireWhitespace ()
  1635. throws java.lang.Exception
  1636. {
  1637. char c = readCh();
  1638. if (isWhitespace(c)) {
  1639. skipWhitespace();
  1640. } else {
  1641. error("whitespace expected", c, null);
  1642. }
  1643. }
  1644. /**
  1645. * Parse whitespace characters, and leave them in the data buffer.
  1646. */
  1647. void parseWhitespace ()
  1648. throws java.lang.Exception
  1649. {
  1650. char c = readCh();
  1651. while (isWhitespace(c)) {
  1652. dataBufferAppend(c);
  1653. c = readCh();
  1654. }
  1655. unread(c);
  1656. }
  1657. /**
  1658. * Skip whitespace characters.
  1659. * [1] S ::= (#x20 | #x9 | #xd | #xa)+
  1660. */
  1661. void skipWhitespace ()
  1662. throws java.lang.Exception
  1663. {
  1664. // Start with a little cheat. Most of
  1665. // the time, the white space will fall
  1666. // within the current read buffer; if
  1667. // not, then fall through.
  1668. if (USE_CHEATS) {
  1669. int lineAugment = 0;
  1670. int columnAugment = 0;
  1671. loop: for (int i = readBufferPos; i < readBufferLength; i++) {
  1672. switch (readBuffer[i]) {
  1673. case ' ':
  1674. case '\t':
  1675. case '\r':
  1676. columnAugment++;
  1677. break;
  1678. case '\n':
  1679. lineAugment++;
  1680. columnAugment = 0;
  1681. break;
  1682. case '%':
  1683. if (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE) {
  1684. break loop;
  1685. } // else fall through...
  1686. default:
  1687. readBufferPos = i;
  1688. if (lineAugment > 0) {
  1689. line += lineAugment;
  1690. column = columnAugment;
  1691. } else {
  1692. column += columnAugment;
  1693. }
  1694. return;
  1695. }
  1696. }
  1697. }
  1698. // OK, do it by the book.
  1699. char c = readCh();
  1700. while (is