PageRenderTime 53ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/jEdit/tags/jedit-4-0-pre5/com/microstar/xml/XmlParser.java

#
Java | 2673 lines | 1863 code | 223 blank | 587 comment | 194 complexity | 7f055fa7eeb0030f70c8ed18ce0e0d2a MD5 | raw file
Possible License(s): BSD-3-Clause, AGPL-1.0, Apache-2.0, LGPL-2.0, LGPL-3.0, GPL-2.0, CC-BY-SA-3.0, LGPL-2.1, GPL-3.0, MPL-2.0-no-copyleft-exception, IPL-1.0

Large files are truncated, but you can click here to view the full file

  1. // XmlParser.java: the main parser class.
  2. // NO WARRANTY! See README, and copyright below.
  3. // $Id: XmlParser.java 3792 2001-09-02 05:37:43Z spestov $
  4. package com.microstar.xml;
  5. import java.io.BufferedInputStream;
  6. import java.io.EOFException;
  7. import java.io.InputStream;
  8. import java.io.Reader;
  9. import java.net.URL;
  10. import java.net.URLConnection;
  11. import java.util.Enumeration;
  12. import java.util.Hashtable;
  13. import java.util.Stack;
  14. /**
  15. * Parse XML documents and return parse events through call-backs.
  16. * <p>You need to define a class implementing the <code>XmlHandler</code>
  17. * interface: an object belonging to this class will receive the
  18. * callbacks for the events. (As an alternative to implementing
  19. * the full XmlHandler interface, you can simply extend the
  20. * <code>HandlerBase</code> convenience class.)
  21. * <p>Usage (assuming that <code>MyHandler</code> is your implementation
  22. * of the <code>XmlHandler</code> interface):
  23. * <pre>
  24. * XmlHandler handler = new MyHandler();
  25. * XmlParser parser = new XmlParser();
  26. * parser.setHandler(handler);
  27. * try {
  28. * parser.parse("http://www.host.com/doc.xml", null);
  29. * } catch (Exception e) {
  30. * [do something interesting]
  31. * }
  32. * </pre>
  33. * <p>Alternatively, you can use the standard SAX interfaces
  34. * with the <code>SAXDriver</code> class as your entry point.
  35. * @author Copyright (c) 1997, 1998 by Microstar Software Ltd.
  36. * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
  37. * @version 1.1
  38. * @see XmlHandler
  39. * @see HandlerBase
  40. * @see SAXDriver
  41. */
  42. public class XmlParser {
  43. //
  44. // Use special cheats that speed up the code (currently about 50%),
  45. // but may cause problems with future maintenance and add to the
  46. // class file size (about 500 bytes).
  47. //
  48. private final static boolean USE_CHEATS = true;
  49. //////////////////////////////////////////////////////////////////////
  50. // Constructors.
  51. ////////////////////////////////////////////////////////////////////////
  52. /**
  53. * Construct a new parser with no associated handler.
  54. * @see #setHandler
  55. * @see #parse
  56. */
  57. public XmlParser ()
  58. {
  59. }
  60. /**
  61. * Set the handler that will receive parsing events.
  62. * @param handler The handler to receive callback events.
  63. * @see #parse
  64. * @see XmlHandler
  65. */
  66. public void setHandler (XmlHandler handler)
  67. {
  68. this.handler = handler;
  69. }
  70. /**
  71. * Parse an XML document from a URI.
  72. * <p>You may parse a document more than once, but only one thread
  73. * may call this method for an object at one time.
  74. * @param systemId The URI of the document.
  75. * @param publicId The public identifier of the document, or null.
  76. * @param encoding The suggested encoding, or null if unknown.
  77. * @exception java.lang.Exception Any exception thrown by your
  78. * own handlers, or any derivation of java.io.IOException
  79. * thrown by the parser itself.
  80. */
  81. public void parse (String systemId, String publicId, String encoding)
  82. throws java.lang.Exception
  83. {
  84. doParse(systemId, publicId, null, null, encoding);
  85. }
  86. /**
  87. * Parse an XML document from a byte stream.
  88. * <p>The URI that you supply will become the base URI for
  89. * resolving relative links, but &AElig;lfred will actually read
  90. * the document from the supplied input stream.
  91. * <p>You may parse a document more than once, but only one thread
  92. * may call this method for an object at one time.
  93. * @param systemId The base URI of the document, or null if not
  94. * known.
  95. * @param publicId The public identifier of the document, or null
  96. * if not known.
  97. * @param stream A byte input stream.
  98. * @param encoding The suggested encoding, or null if unknown.
  99. * @exception java.lang.Exception Any exception thrown by your
  100. * own handlers, or any derivation of java.io.IOException
  101. * thrown by the parser itself.
  102. */
  103. public void parse (String systemId, String publicId,
  104. InputStream stream, String encoding)
  105. throws java.lang.Exception
  106. {
  107. doParse(systemId, publicId, null, stream, encoding);
  108. }
  109. /**
  110. * Parse an XML document from a character stream.
  111. * <p>The URI that you supply will become the base URI for
  112. * resolving relative links, but &AElig;lfred will actually read
  113. * the document from the supplied input stream.
  114. * <p>You may parse a document more than once, but only one thread
  115. * may call this method for an object at one time.
  116. * @param systemId The base URI of the document, or null if not
  117. * known.
  118. * @param publicId The public identifier of the document, or null
  119. * if not known.
  120. * @param reader A character stream.
  121. * @exception java.lang.Exception Any exception thrown by your
  122. * own handlers, or any derivation of java.io.IOException
  123. * thrown by the parser itself.
  124. */
  125. public void parse (String systemId, String publicId, Reader reader)
  126. throws java.lang.Exception
  127. {
  128. doParse(systemId, publicId, reader, null, null);
  129. }
  130. private synchronized void doParse (String systemId, String publicId,
  131. Reader reader, InputStream stream,
  132. String encoding)
  133. throws java.lang.Exception
  134. {
  135. basePublicId = publicId;
  136. baseURI = systemId;
  137. baseReader = reader;
  138. baseInputStream = stream;
  139. initializeVariables();
  140. // Set the default entities here.
  141. setInternalEntity(intern("amp"), "&#38;");
  142. setInternalEntity(intern("lt"), "&#60;");
  143. setInternalEntity(intern("gt"), "&#62;");
  144. setInternalEntity(intern("apos"), "&#39;");
  145. setInternalEntity(intern("quot"), "&#34;");
  146. if (handler != null) {
  147. handler.startDocument();
  148. }
  149. pushURL("[document]", basePublicId, baseURI, baseReader, baseInputStream,
  150. encoding);
  151. parseDocument();
  152. if (handler != null) {
  153. handler.endDocument();
  154. }
  155. cleanupVariables();
  156. }
  157. ////////////////////////////////////////////////////////////////////////
  158. // Constants.
  159. ////////////////////////////////////////////////////////////////////////
  160. //
  161. // Constants for element content type.
  162. //
  163. /**
  164. * Constant: an element has not been declared.
  165. * @see #getElementContentType
  166. */
  167. public final static int CONTENT_UNDECLARED = 0;
  168. /**
  169. * Constant: the element has a content model of ANY.
  170. * @see #getElementContentType
  171. */
  172. public final static int CONTENT_ANY = 1;
  173. /**
  174. * Constant: the element has declared content of EMPTY.
  175. * @see #getElementContentType
  176. */
  177. public final static int CONTENT_EMPTY = 2;
  178. /**
  179. * Constant: the element has mixed content.
  180. * @see #getElementContentType
  181. */
  182. public final static int CONTENT_MIXED = 3;
  183. /**
  184. * Constant: the element has element content.
  185. * @see #getElementContentType
  186. */
  187. public final static int CONTENT_ELEMENTS = 4;
  188. //
  189. // Constants for the entity type.
  190. //
  191. /**
  192. * Constant: the entity has not been declared.
  193. * @see #getEntityType
  194. */
  195. public final static int ENTITY_UNDECLARED = 0;
  196. /**
  197. * Constant: the entity is internal.
  198. * @see #getEntityType
  199. */
  200. public final static int ENTITY_INTERNAL = 1;
  201. /**
  202. * Constant: the entity is external, non-XML data.
  203. * @see #getEntityType
  204. */
  205. public final static int ENTITY_NDATA = 2;
  206. /**
  207. * Constant: the entity is external XML data.
  208. * @see #getEntityType
  209. */
  210. public final static int ENTITY_TEXT = 3;
  211. //
  212. // Constants for attribute type.
  213. //
  214. /**
  215. * Constant: the attribute has not been declared for this element type.
  216. * @see #getAttributeType
  217. */
  218. public final static int ATTRIBUTE_UNDECLARED = 0;
  219. /**
  220. * Constant: the attribute value is a string value.
  221. * @see #getAttributeType
  222. */
  223. public final static int ATTRIBUTE_CDATA = 1;
  224. /**
  225. * Constant: the attribute value is a unique identifier.
  226. * @see #getAttributeType
  227. */
  228. public final static int ATTRIBUTE_ID = 2;
  229. /**
  230. * Constant: the attribute value is a reference to a unique identifier.
  231. * @see #getAttributeType
  232. */
  233. public final static int ATTRIBUTE_IDREF = 3;
  234. /**
  235. * Constant: the attribute value is a list of ID references.
  236. * @see #getAttributeType
  237. */
  238. public final static int ATTRIBUTE_IDREFS = 4;
  239. /**
  240. * Constant: the attribute value is the name of an entity.
  241. * @see #getAttributeType
  242. */
  243. public final static int ATTRIBUTE_ENTITY = 5;
  244. /**
  245. * Constant: the attribute value is a list of entity names.
  246. * @see #getAttributeType
  247. */
  248. public final static int ATTRIBUTE_ENTITIES = 6;
  249. /**
  250. * Constant: the attribute value is a name token.
  251. * @see #getAttributeType
  252. */
  253. public final static int ATTRIBUTE_NMTOKEN = 7;
  254. /**
  255. * Constant: the attribute value is a list of name tokens.
  256. * @see #getAttributeType
  257. */
  258. public final static int ATTRIBUTE_NMTOKENS = 8;
  259. /**
  260. * Constant: the attribute value is a token from an enumeration.
  261. * @see #getAttributeType
  262. */
  263. public final static int ATTRIBUTE_ENUMERATED = 9;
  264. /**
  265. * Constant: the attribute is the name of a notation.
  266. * @see #getAttributeType
  267. */
  268. public final static int ATTRIBUTE_NOTATION = 10;
  269. //
  270. // When the class is loaded, populate the hash table of
  271. // attribute types.
  272. //
  273. /**
  274. * Hash table of attribute types.
  275. */
  276. private static Hashtable attributeTypeHash;
  277. static {
  278. attributeTypeHash = new Hashtable();
  279. attributeTypeHash.put("CDATA", new Integer(ATTRIBUTE_CDATA));
  280. attributeTypeHash.put("ID", new Integer(ATTRIBUTE_ID));
  281. attributeTypeHash.put("IDREF", new Integer(ATTRIBUTE_IDREF));
  282. attributeTypeHash.put("IDREFS", new Integer(ATTRIBUTE_IDREFS));
  283. attributeTypeHash.put("ENTITY", new Integer(ATTRIBUTE_ENTITY));
  284. attributeTypeHash.put("ENTITIES", new Integer(ATTRIBUTE_ENTITIES));
  285. attributeTypeHash.put("NMTOKEN", new Integer(ATTRIBUTE_NMTOKEN));
  286. attributeTypeHash.put("NMTOKENS", new Integer(ATTRIBUTE_NMTOKENS));
  287. attributeTypeHash.put("NOTATION", new Integer(ATTRIBUTE_NOTATION));
  288. }
  289. //
  290. // Constants for supported encodings.
  291. //
  292. private final static int ENCODING_UTF_8 = 1;
  293. private final static int ENCODING_ISO_8859_1 = 2;
  294. private final static int ENCODING_UCS_2_12 = 3;
  295. private final static int ENCODING_UCS_2_21 = 4;
  296. private final static int ENCODING_UCS_4_1234 = 5;
  297. private final static int ENCODING_UCS_4_4321 = 6;
  298. private final static int ENCODING_UCS_4_2143 = 7;
  299. private final static int ENCODING_UCS_4_3412 = 8;
  300. //
  301. // Constants for attribute default value.
  302. //
  303. /**
  304. * Constant: the attribute is not declared.
  305. * @see #getAttributeDefaultValueType
  306. */
  307. public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 0;
  308. /**
  309. * Constant: the attribute has a literal default value specified.
  310. * @see #getAttributeDefaultValueType
  311. * @see #getAttributeDefaultValue
  312. */
  313. public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 1;
  314. /**
  315. * Constant: the attribute was declared #IMPLIED.
  316. * @see #getAttributeDefaultValueType
  317. */
  318. public final static int ATTRIBUTE_DEFAULT_IMPLIED = 2;
  319. /**
  320. * Constant: the attribute was declared #REQUIRED.
  321. * @see #getAttributeDefaultValueType
  322. */
  323. public final static int ATTRIBUTE_DEFAULT_REQUIRED = 3;
  324. /**
  325. * Constant: the attribute was declared #FIXED.
  326. * @see #getAttributeDefaultValueType
  327. * @see #getAttributeDefaultValue
  328. */
  329. public final static int ATTRIBUTE_DEFAULT_FIXED = 4;
  330. //
  331. // Constants for input.
  332. //
  333. private final static int INPUT_NONE = 0;
  334. private final static int INPUT_INTERNAL = 1;
  335. private final static int INPUT_EXTERNAL = 2;
  336. private final static int INPUT_STREAM = 3;
  337. private final static int INPUT_BUFFER = 4;
  338. private final static int INPUT_READER = 5;
  339. //
  340. // Flags for reading literals.
  341. //
  342. private final static int LIT_CHAR_REF = 1;
  343. private final static int LIT_ENTITY_REF = 2;
  344. private final static int LIT_PE_REF = 4;
  345. private final static int LIT_NORMALIZE = 8;
  346. //
  347. // Flags for parsing context.
  348. //
  349. private final static int CONTEXT_NONE = 0;
  350. private final static int CONTEXT_DTD = 1;
  351. private final static int CONTEXT_ENTITYVALUE = 2;
  352. private final static int CONTEXT_ATTRIBUTEVALUE = 3;
  353. //////////////////////////////////////////////////////////////////////
  354. // Error reporting.
  355. //////////////////////////////////////////////////////////////////////
  356. /**
  357. * Report an error.
  358. * @param message The error message.
  359. * @param textFound The text that caused the error (or null).
  360. * @see XmlHandler#error
  361. * @see #line
  362. */
  363. void error (String message, String textFound, String textExpected)
  364. throws java.lang.Exception
  365. {
  366. errorCount++;
  367. if (textFound != null) {
  368. message = message + " (found \"" + textFound + "\")";
  369. }
  370. if (textExpected != null) {
  371. message = message + " (expected \"" + textExpected + "\")";
  372. }
  373. if (handler != null) {
  374. String uri = null;
  375. if (externalEntity != null) {
  376. uri = externalEntity.getURL().toString();
  377. }
  378. handler.error(message, uri, line, column);
  379. }
  380. }
  381. /**
  382. * Report a serious error.
  383. * @param message The error message.
  384. * @param textFound The text that caused the error (or null).
  385. */
  386. void error (String message, char textFound, String textExpected)
  387. throws java.lang.Exception
  388. {
  389. error(message, new Character(textFound).toString(), textExpected);
  390. }
  391. //////////////////////////////////////////////////////////////////////
  392. // Major syntactic productions.
  393. //////////////////////////////////////////////////////////////////////
  394. /**
  395. * Parse an XML document.
  396. * <pre>
  397. * [1] document ::= prolog element Misc*
  398. * </pre>
  399. * <p>This is the top-level parsing function for a single XML
  400. * document. As a minimum, a well-formed document must have
  401. * a document element, and a valid document must have a prolog
  402. * as well.
  403. */
  404. void parseDocument ()
  405. throws java.lang.Exception
  406. {
  407. char c;
  408. parseProlog();
  409. require('<');
  410. parseElement();
  411. try
  412. {
  413. parseMisc(); //skip all white, PIs, and comments
  414. c=readCh(); //if this doesn't throw an exception...
  415. error("unexpected characters after document end",c,null);
  416. }
  417. catch (EOFException e)
  418. {return;}
  419. }
  420. /**
  421. * Skip a comment.
  422. * <pre>
  423. * [18] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
  424. * </pre>
  425. * <p>(The <code>&lt;!--</code> has already been read.)
  426. */
  427. void parseComment ()
  428. throws java.lang.Exception
  429. {
  430. skipUntil("-->");
  431. }
  432. /**
  433. * Parse a processing instruction and do a call-back.
  434. * <pre>
  435. * [19] PI ::= '&lt;?' Name (S (Char* - (Char* '?&gt;' Char*)))? '?&gt;'
  436. * </pre>
  437. * <p>(The <code>&lt;?</code> has already been read.)
  438. * <p>An XML processing instruction <em>must</em> begin with
  439. * a Name, which is the instruction's target.
  440. */
  441. void parsePI ()
  442. throws java.lang.Exception
  443. {
  444. String name;
  445. name = readNmtoken(true);
  446. if (!tryRead("?>")) {
  447. requireWhitespace();
  448. parseUntil("?>");
  449. }
  450. if (handler != null) {
  451. handler.processingInstruction(name, dataBufferToString());
  452. }
  453. }
  454. /**
  455. * Parse a CDATA marked section.
  456. * <pre>
  457. * [20] CDSect ::= CDStart CData CDEnd
  458. * [21] CDStart ::= '&lt;![CDATA['
  459. * [22] CData ::= (Char* - (Char* ']]&gt;' Char*))
  460. * [23] CDEnd ::= ']]&gt;'
  461. * </pre>
  462. * <p>(The '&lt;![CDATA[' has already been read.)
  463. * <p>Note that this just appends characters to the dataBuffer,
  464. * without actually generating an event.
  465. */
  466. void parseCDSect ()
  467. throws java.lang.Exception
  468. {
  469. parseUntil("]]>");
  470. }
  471. /**
  472. * Parse the prolog of an XML document.
  473. * <pre>
  474. * [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
  475. * </pre>
  476. * <p>There are a couple of tricks here. First, it is necessary to
  477. * declare the XML default attributes after the DTD (if present)
  478. * has been read. Second, it is not possible to expand general
  479. * references in attribute value literals until after the entire
  480. * DTD (if present) has been parsed.
  481. * <p>We do not look for the XML declaration here, because it is
  482. * handled by pushURL().
  483. * @see pushURL
  484. */
  485. void parseProlog ()
  486. throws java.lang.Exception
  487. {
  488. parseMisc();
  489. if (tryRead("<!DOCTYPE")) {
  490. parseDoctypedecl();
  491. parseMisc();
  492. }
  493. }
  494. /**
  495. * Parse the XML declaration.
  496. * <pre>
  497. * [25] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
  498. * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'")
  499. * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'"
  500. * | S 'standalone' Eq '"' ("yes" | "no") '"'
  501. * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
  502. * </pre>
  503. * <p>([80] to [82] are also significant.)
  504. * <p>(The <code>&lt;?xml</code> and whitespace have already been read.)
  505. * <p>TODO: validate value of standalone.
  506. * @see #parseTextDecl
  507. * @see #checkEncoding
  508. */
  509. void parseXMLDecl (boolean ignoreEncoding)
  510. throws java.lang.Exception
  511. {
  512. String version;
  513. String encodingName = null;
  514. String standalone = null;
  515. // Read the version.
  516. require("version");
  517. parseEq();
  518. version = readLiteral(0);
  519. if (!version.equals("1.0")) {
  520. error("unsupported XML version", version, "1.0");
  521. }
  522. // Try reading an encoding declaration.
  523. skipWhitespace();
  524. if (tryRead("encoding")) {
  525. parseEq();
  526. encodingName = readLiteral(0);
  527. checkEncoding(encodingName, ignoreEncoding);
  528. }
  529. // Try reading a standalone declaration
  530. skipWhitespace();
  531. if (tryRead("standalone")) {
  532. parseEq();
  533. standalone = readLiteral(0);
  534. }
  535. skipWhitespace();
  536. require("?>");
  537. }
  538. /**
  539. * Parse the Encoding PI.
  540. * <pre>
  541. * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
  542. * [79] EncodingPI ::= '&lt;?xml' S 'encoding' Eq QEncoding S? '?&gt;'
  543. * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'"
  544. * [81] Encoding ::= LatinName
  545. * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
  546. * </pre>
  547. * <p>(The <code>&lt;?xml</code>' and whitespace have already been read.)
  548. * @see #parseXMLDecl
  549. * @see #checkEncoding
  550. */
  551. void parseTextDecl (boolean ignoreEncoding)
  552. throws java.lang.Exception
  553. {
  554. String encodingName = null;
  555. // Read an optional version.
  556. if (tryRead("version")) {
  557. String version;
  558. parseEq();
  559. version = readLiteral(0);
  560. if (!version.equals("1.0")) {
  561. error("unsupported XML version", version, "1.0");
  562. }
  563. requireWhitespace();
  564. }
  565. // Read the encoding.
  566. require("encoding");
  567. parseEq();
  568. encodingName = readLiteral(0);
  569. checkEncoding(encodingName, ignoreEncoding);
  570. skipWhitespace();
  571. require("?>");
  572. }
  573. /**
  574. * Check that the encoding specified makes sense.
  575. * <p>Compare what the author has specified in the XML declaration
  576. * or encoding PI with what we have detected.
  577. * <p>This is also important for distinguishing among the various
  578. * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect
  579. * those).
  580. * @param encodingName The name of the encoding specified by the user.
  581. * @see #parseXMLDecl
  582. * @see #parseTextDecl
  583. */
  584. void checkEncoding (String encodingName, boolean ignoreEncoding)
  585. throws java.lang.Exception
  586. {
  587. encodingName = encodingName.toUpperCase();
  588. if (ignoreEncoding) {
  589. return;
  590. }
  591. switch (encoding) {
  592. // 8-bit encodings
  593. case ENCODING_UTF_8:
  594. if (encodingName.equals("ISO-8859-1")) {
  595. encoding = ENCODING_ISO_8859_1;
  596. } else if (!encodingName.equals("UTF-8")) {
  597. error("unsupported 8-bit encoding",
  598. encodingName,
  599. "UTF-8 or ISO-8859-1");
  600. }
  601. break;
  602. // 16-bit encodings
  603. case ENCODING_UCS_2_12:
  604. case ENCODING_UCS_2_21:
  605. if (!encodingName.equals("ISO-10646-UCS-2") &&
  606. !encodingName.equals("UTF-16")) {
  607. error("unsupported 16-bit encoding",
  608. encodingName,
  609. "ISO-10646-UCS-2");
  610. }
  611. break;
  612. // 32-bit encodings
  613. case ENCODING_UCS_4_1234:
  614. case ENCODING_UCS_4_4321:
  615. case ENCODING_UCS_4_2143:
  616. case ENCODING_UCS_4_3412:
  617. if (!encodingName.equals("ISO-10646-UCS-4")) {
  618. error("unsupported 32-bit encoding",
  619. encodingName,
  620. "ISO-10646-UCS-4");
  621. }
  622. }
  623. }
  624. /**
  625. * Parse miscellaneous markup outside the document element and DOCTYPE
  626. * declaration.
  627. * <pre>
  628. * [27] Misc ::= Comment | PI | S
  629. * </pre>
  630. */
  631. void parseMisc ()
  632. throws java.lang.Exception
  633. {
  634. while (true)
  635. {
  636. skipWhitespace();
  637. if (tryRead("<?"))
  638. {parsePI();}
  639. else if (tryRead("<!--"))
  640. {parseComment();}
  641. else
  642. {return;}
  643. }
  644. }
  645. /**
  646. * Parse a document type declaration.
  647. * <pre>
  648. * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
  649. * ('[' %markupdecl* ']' S?)? '&gt;'
  650. * </pre>
  651. * <p>(The <code>&lt;!DOCTYPE</code> has already been read.)
  652. */
  653. void parseDoctypedecl ()
  654. throws java.lang.Exception
  655. {
  656. char c;
  657. String doctypeName, ids[];
  658. // Read the document type name.
  659. requireWhitespace();
  660. doctypeName = readNmtoken(true);
  661. // Read the ExternalIDs.
  662. skipWhitespace();
  663. ids = readExternalIds(false);
  664. // Look for a declaration subset.
  665. skipWhitespace();
  666. if (tryRead('[')) {
  667. // loop until the subset ends
  668. while (true) {
  669. context = CONTEXT_DTD;
  670. skipWhitespace();
  671. context = CONTEXT_NONE;
  672. if (tryRead(']')) {
  673. break; // end of subset
  674. } else {
  675. context = CONTEXT_DTD;
  676. parseMarkupdecl();
  677. context = CONTEXT_NONE;
  678. }
  679. }
  680. }
  681. // Read the external subset, if any
  682. if (ids[1] != null) {
  683. pushURL("[external subset]", ids[0], ids[1], null, null, null);
  684. // Loop until we end up back at '>'
  685. while (true) {
  686. context = CONTEXT_DTD;
  687. skipWhitespace();
  688. context = CONTEXT_NONE;
  689. if (tryRead('>')) {
  690. break;
  691. } else {
  692. context = CONTEXT_DTD;
  693. parseMarkupdecl();
  694. context = CONTEXT_NONE;
  695. }
  696. }
  697. } else {
  698. // No external subset.
  699. skipWhitespace();
  700. require('>');
  701. }
  702. if (handler != null) {
  703. handler.doctypeDecl(doctypeName, ids[0], ids[1]);
  704. }
  705. // Expand general entities in
  706. // default values of attributes.
  707. // (Do this after the doctypeDecl
  708. // event!).
  709. // expandAttributeDefaultValues();
  710. }
  711. /**
  712. * Parse a markup declaration in the internal or external DTD subset.
  713. * <pre>
  714. * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl |
  715. * %NotationDecl | %PI | %S | %Comment |
  716. * InternalPERef )
  717. * [30] InternalPERef ::= PEReference
  718. * [31] extSubset ::= (%markupdecl | %conditionalSect)*
  719. * </pre>
  720. */
  721. void parseMarkupdecl ()
  722. throws java.lang.Exception
  723. {
  724. if (tryRead("<!ELEMENT")) {
  725. parseElementdecl();
  726. } else if (tryRead("<!ATTLIST")) {
  727. parseAttlistDecl();
  728. } else if (tryRead("<!ENTITY")) {
  729. parseEntityDecl();
  730. } else if (tryRead("<!NOTATION")) {
  731. parseNotationDecl();
  732. } else if (tryRead("<?")) {
  733. parsePI();
  734. } else if (tryRead("<!--")) {
  735. parseComment();
  736. } else if (tryRead("<![")) {
  737. parseConditionalSect();
  738. } else {
  739. error("expected markup declaration", null, null);
  740. }
  741. }
  742. /**
  743. * Parse an element, with its tags.
  744. * <pre>
  745. * [33] STag ::= '&lt;' Name (S Attribute)* S? '&gt;' [WFC: unique Att spec]
  746. * [38] element ::= EmptyElement | STag content ETag
  747. * [39] EmptyElement ::= '&lt;' Name (S Attribute)* S? '/&gt;'
  748. * [WFC: unique Att spec]
  749. * </pre>
  750. * <p>(The '&lt;' has already been read.)
  751. * <p>NOTE: this method actually chains onto parseContent(), if necessary,
  752. * and parseContent() will take care of calling parseETag().
  753. */
  754. void parseElement ()
  755. throws java.lang.Exception
  756. {
  757. String gi;
  758. char c;
  759. int oldElementContent = currentElementContent;
  760. String oldElement = currentElement;
  761. // This is the (global) counter for the
  762. // array of specified attributes.
  763. tagAttributePos = 0;
  764. // Read the element type name.
  765. gi = readNmtoken(true);
  766. // Determine the current content type.
  767. currentElement = gi;
  768. currentElementContent = getElementContentType(gi);
  769. if (currentElementContent == CONTENT_UNDECLARED) {
  770. currentElementContent = CONTENT_ANY;
  771. }
  772. // Read the attributes, if any.
  773. // After this loop, we should be just
  774. // in front of the closing delimiter.
  775. skipWhitespace();
  776. c = readCh();
  777. while (c != '/' && c != '>') {
  778. unread(c);
  779. parseAttribute(gi);
  780. skipWhitespace();
  781. c = readCh();
  782. }
  783. unread(c);
  784. // Supply any defaulted attributes.
  785. Enumeration atts = declaredAttributes(gi);
  786. if (atts != null) {
  787. String aname;
  788. loop: while (atts.hasMoreElements()) {
  789. aname = (String)atts.nextElement();
  790. // See if it was specified.
  791. for (int i = 0; i < tagAttributePos; i++) {
  792. if (tagAttributes[i] == aname) {
  793. continue loop;
  794. }
  795. }
  796. // I guess not...
  797. if (handler != null) {
  798. handler.attribute(aname,
  799. getAttributeExpandedValue(gi, aname),
  800. false);
  801. }
  802. }
  803. }
  804. // Figure out if this is a start tag
  805. // or an empty element, and dispatch an
  806. // event accordingly.
  807. c = readCh();
  808. switch (c) {
  809. case '>':
  810. if (handler != null) {
  811. handler.startElement(gi);
  812. }
  813. parseContent();
  814. break;
  815. case '/':
  816. require('>');
  817. if (handler != null) {
  818. handler.startElement(gi);
  819. handler.endElement(gi);
  820. }
  821. break;
  822. }
  823. // Restore the previous state.
  824. currentElement = oldElement;
  825. currentElementContent = oldElementContent;
  826. }
  827. /**
  828. * Parse an attribute assignment.
  829. * <pre>
  830. * [34] Attribute ::= Name Eq AttValue
  831. * </pre>
  832. * @param name The name of the attribute's element.
  833. * @see XmlHandler#attribute
  834. */
  835. void parseAttribute (String name)
  836. throws java.lang.Exception
  837. {
  838. String aname;
  839. int type;
  840. String value;
  841. // Read the attribute name.
  842. aname = readNmtoken(true).intern();
  843. type = getAttributeDefaultValueType(name, aname);
  844. // Parse '='
  845. parseEq();
  846. // Read the value, normalizing whitespace
  847. // if it is not CDATA.
  848. if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) {
  849. value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF);
  850. } else {
  851. value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF | LIT_NORMALIZE);
  852. }
  853. // Inform the handler about the
  854. // attribute.
  855. if (handler != null) {
  856. handler.attribute(aname, value, true);
  857. }
  858. dataBufferPos = 0;
  859. // Note that the attribute has been
  860. // specified.
  861. if (tagAttributePos == tagAttributes.length) {
  862. String newAttrib[] = new String[tagAttributes.length * 2];
  863. System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
  864. tagAttributes = newAttrib;
  865. }
  866. tagAttributes[tagAttributePos++] = aname;
  867. }
  868. /**
  869. * Parse an equals sign surrounded by optional whitespace.
  870. * [35] Eq ::= S? '=' S?
  871. */
  872. void parseEq ()
  873. throws java.lang.Exception
  874. {
  875. skipWhitespace();
  876. require('=');
  877. skipWhitespace();
  878. }
  879. /**
  880. * Parse an end tag.
  881. * [36] ETag ::= '</' Name S? '>'
  882. * *NOTE: parseContent() chains to here.
  883. */
  884. void parseETag ()
  885. throws java.lang.Exception
  886. {
  887. String name;
  888. name = readNmtoken(true);
  889. if (name != currentElement) {
  890. error("mismatched end tag", name, currentElement);
  891. }
  892. skipWhitespace();
  893. require('>');
  894. if (handler != null) {
  895. handler.endElement(name);
  896. }
  897. }
  898. /**
  899. * Parse the content of an element.
  900. * [37] content ::= (element | PCData | Reference | CDSect | PI | Comment)*
  901. * [68] Reference ::= EntityRef | CharRef
  902. */
  903. void parseContent ()
  904. throws java.lang.Exception
  905. {
  906. String data;
  907. char c;
  908. while (true) {
  909. switch (currentElementContent) {
  910. case CONTENT_ANY:
  911. case CONTENT_MIXED:
  912. parsePCData();
  913. break;
  914. case CONTENT_ELEMENTS:
  915. parseWhitespace();
  916. break;
  917. }
  918. // Handle delimiters
  919. c = readCh();
  920. switch (c) {
  921. case '&': // Found "&"
  922. c = readCh();
  923. if (c == '#') {
  924. parseCharRef();
  925. } else {
  926. unread(c);
  927. parseEntityRef(true);
  928. }
  929. break;
  930. case '<': // Found "<"
  931. c = readCh();
  932. switch (c) {
  933. case '!': // Found "<!"
  934. c = readCh();
  935. switch (c) {
  936. case '-': // Found "<!-"
  937. require('-');
  938. parseComment();
  939. break;
  940. case '[': // Found "<!["
  941. require("CDATA[");
  942. parseCDSect();
  943. break;
  944. default:
  945. error("expected comment or CDATA section", c, null);
  946. break;
  947. }
  948. break;
  949. case '?': // Found "<?"
  950. dataBufferFlush();
  951. parsePI();
  952. break;
  953. case '/': // Found "</"
  954. dataBufferFlush();
  955. parseETag();
  956. return;
  957. default: // Found "<" followed by something else
  958. dataBufferFlush();
  959. unread(c);
  960. parseElement();
  961. break;
  962. }
  963. }
  964. }
  965. }
  966. /**
  967. * Parse an element type declaration.
  968. * [40] elementdecl ::= '<!ELEMENT' S %Name S (%S S)? %contentspec S? '>'
  969. * [VC: Unique Element Declaration]
  970. * *NOTE: the '<!ELEMENT' has already been read.
  971. */
  972. void parseElementdecl ()
  973. throws java.lang.Exception
  974. {
  975. String name;
  976. requireWhitespace();
  977. // Read the element type name.
  978. name = readNmtoken(true);
  979. requireWhitespace();
  980. // Read the content model.
  981. parseContentspec(name);
  982. skipWhitespace();
  983. require('>');
  984. }
  985. /**
  986. * Content specification.
  987. * [41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
  988. */
  989. void parseContentspec (String name)
  990. throws java.lang.Exception
  991. {
  992. if (tryRead("EMPTY")) {
  993. setElement(name, CONTENT_EMPTY, null, null);
  994. return;
  995. } else if (tryRead("ANY")) {
  996. setElement(name, CONTENT_ANY, null, null);
  997. return;
  998. } else {
  999. require('(');
  1000. dataBufferAppend('(');
  1001. skipWhitespace();
  1002. if (tryRead("#PCDATA")) {
  1003. dataBufferAppend("#PCDATA");
  1004. parseMixed();
  1005. setElement(name, CONTENT_MIXED, dataBufferToString(), null);
  1006. } else {
  1007. parseElements();
  1008. setElement(name, CONTENT_ELEMENTS, dataBufferToString(), null);
  1009. }
  1010. }
  1011. }
  1012. /**
  1013. * Parse an element-content model.
  1014. * [42] elements ::= (choice | seq) ('?' | '*' | '+')?
  1015. * [44] cps ::= S? %cp S?
  1016. * [45] choice ::= '(' S? %ctokplus (S? '|' S? %ctoks)* S? ')'
  1017. * [46] ctokplus ::= cps ('|' cps)+
  1018. * [47] ctoks ::= cps ('|' cps)*
  1019. * [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')'
  1020. * [49] stoks ::= cps (',' cps)*
  1021. * *NOTE: the opening '(' and S have already been read.
  1022. * *TODO: go over parameter entity boundaries more carefully.
  1023. */
  1024. void parseElements ()
  1025. throws java.lang.Exception
  1026. {
  1027. char c;
  1028. char sep;
  1029. // Parse the first content particle
  1030. skipWhitespace();
  1031. parseCp();
  1032. // Check for end or for a separator.
  1033. skipWhitespace();
  1034. c = readCh();
  1035. switch (c) {
  1036. case ')':
  1037. dataBufferAppend(')');
  1038. c = readCh();
  1039. switch (c) {
  1040. case '*':
  1041. case '+':
  1042. case '?':
  1043. dataBufferAppend(c);
  1044. break;
  1045. default:
  1046. unread(c);
  1047. }
  1048. return;
  1049. case ',': // Register the separator.
  1050. case '|':
  1051. sep = c;
  1052. dataBufferAppend(c);
  1053. break;
  1054. default:
  1055. error("bad separator in content model", c, null);
  1056. return;
  1057. }
  1058. // Parse the rest of the content model.
  1059. while (true) {
  1060. skipWhitespace();
  1061. parseCp();
  1062. skipWhitespace();
  1063. c = readCh();
  1064. if (c == ')') {
  1065. dataBufferAppend(')');
  1066. break;
  1067. } else if (c != sep) {
  1068. error("bad separator in content model", c, null);
  1069. return;
  1070. } else {
  1071. dataBufferAppend(c);
  1072. }
  1073. }
  1074. // Check for the occurrence indicator.
  1075. c = readCh();
  1076. switch (c) {
  1077. case '?':
  1078. case '*':
  1079. case '+':
  1080. dataBufferAppend(c);
  1081. return;
  1082. default:
  1083. unread(c);
  1084. return;
  1085. }
  1086. }
  1087. /**
  1088. * Parse a content particle.
  1089. * [43] cp ::= (Name | choice | seq) ('?' | '*' | '+')
  1090. * *NOTE: I actually use a slightly different production here:
  1091. * cp ::= (elements | (Name ('?' | '*' | '+')?))
  1092. */
  1093. void parseCp ()
  1094. throws java.lang.Exception
  1095. {
  1096. char c;
  1097. if (tryRead('(')) {
  1098. dataBufferAppend('(');
  1099. parseElements();
  1100. } else {
  1101. dataBufferAppend(readNmtoken(true));
  1102. c = readCh();
  1103. switch (c) {
  1104. case '?':
  1105. case '*':
  1106. case '+':
  1107. dataBufferAppend(c);
  1108. break;
  1109. default:
  1110. unread(c);
  1111. break;
  1112. }
  1113. }
  1114. }
  1115. /**
  1116. * Parse mixed content.
  1117. * [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? %Mtoks)* ) S? ')*'
  1118. * | '(' S? %('#PCDATA') S? ')'
  1119. * [51] Mtoks ::= %Name (S? '|' S? %Name)*
  1120. * *NOTE: the S and '#PCDATA' have already been read.
  1121. */
  1122. void parseMixed ()
  1123. throws java.lang.Exception
  1124. {
  1125. char c;
  1126. // Check for PCDATA alone.
  1127. skipWhitespace();
  1128. if (tryRead(')')) {
  1129. dataBufferAppend(")*");
  1130. tryRead('*');
  1131. return;
  1132. }
  1133. // Parse mixed content.
  1134. skipWhitespace();
  1135. while (!tryRead(")*")) {
  1136. require('|');
  1137. dataBufferAppend('|');
  1138. skipWhitespace();
  1139. dataBufferAppend(readNmtoken(true));
  1140. skipWhitespace();
  1141. }
  1142. dataBufferAppend(")*");
  1143. }
  1144. /**
  1145. * Parse an attribute list declaration.
  1146. * [52] AttlistDecl ::= '<!ATTLIST' S %Name S? %AttDef+ S? '>'
  1147. * *NOTE: the '<!ATTLIST' has already been read.
  1148. */
  1149. void parseAttlistDecl ()
  1150. throws java.lang.Exception
  1151. {
  1152. String elementName;
  1153. requireWhitespace();
  1154. elementName = readNmtoken(true);
  1155. requireWhitespace();
  1156. while (!tryRead('>')) {
  1157. parseAttDef(elementName);
  1158. skipWhitespace();
  1159. }
  1160. }
  1161. /**
  1162. * Parse a single attribute definition.
  1163. * [53] AttDef ::= S %Name S %AttType S %Default
  1164. */
  1165. void parseAttDef (String elementName)
  1166. throws java.lang.Exception
  1167. {
  1168. String name;
  1169. int type;
  1170. String enum = null;
  1171. // Read the attribute name.
  1172. name = readNmtoken(true);
  1173. // Read the attribute type.
  1174. requireWhitespace();
  1175. type = readAttType();
  1176. // Get the string of enumerated values
  1177. // if necessary.
  1178. if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) {
  1179. enum = dataBufferToString();
  1180. }
  1181. // Read the default value.
  1182. requireWhitespace();
  1183. parseDefault(elementName, name, type, enum);
  1184. }
  1185. /**
  1186. * Parse the attribute type.
  1187. * [54] AttType ::= StringType | TokenizedType | EnumeratedType
  1188. * [55] StringType ::= 'CDATA'
  1189. * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' |
  1190. * 'NMTOKEN' | 'NMTOKENS'
  1191. * [57] EnumeratedType ::= NotationType | Enumeration
  1192. * *TODO: validate the type!!
  1193. */
  1194. int readAttType ()
  1195. throws java.lang.Exception
  1196. {
  1197. String typeString;
  1198. Integer type;
  1199. if (tryRead('(')) {
  1200. parseEnumeration();
  1201. return ATTRIBUTE_ENUMERATED;
  1202. } else {
  1203. typeString = readNmtoken(true);
  1204. if (typeString.equals("NOTATION")) {
  1205. parseNotationType();
  1206. }
  1207. type = (Integer)attributeTypeHash.get(typeString);
  1208. if (type == null) {
  1209. error("illegal attribute type", typeString, null);
  1210. return ATTRIBUTE_UNDECLARED;
  1211. } else {
  1212. return type.intValue();
  1213. }
  1214. }
  1215. }
  1216. /**
  1217. * Parse an enumeration.
  1218. * [60] Enumeration ::= '(' S? %Etoks (S? '|' S? %Etoks)* S? ')'
  1219. * [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)*
  1220. * *NOTE: the '(' has already been read.
  1221. */
  1222. void parseEnumeration ()
  1223. throws java.lang.Exception
  1224. {
  1225. char c;
  1226. dataBufferAppend('(');
  1227. // Read the first token.
  1228. skipWhitespace();
  1229. dataBufferAppend(readNmtoken(true));
  1230. // Read the remaining tokens.
  1231. skipWhitespace();
  1232. while (!tryRead(')')) {
  1233. require('|');
  1234. dataBufferAppend('|');
  1235. skipWhitespace();
  1236. dataBufferAppend(readNmtoken(true));
  1237. skipWhitespace();
  1238. }
  1239. dataBufferAppend(')');
  1240. }
  1241. /**
  1242. * Parse a notation type for an attribute.
  1243. * [58] NotationType ::= %'NOTATION' S '(' S? %Ntoks (S? '|' S? %Ntoks)*
  1244. * S? ')'
  1245. * [59] Ntoks ::= %Name (S? '|' S? %Name)
  1246. * *NOTE: the 'NOTATION' has already been read
  1247. */
  1248. void parseNotationType ()
  1249. throws java.lang.Exception
  1250. {
  1251. requireWhitespace();
  1252. require('(');
  1253. parseEnumeration();
  1254. }
  1255. /**
  1256. * Parse the default value for an attribute.
  1257. * [62] Default ::= '#REQUIRED' | '#IMPLIED' | ((%'#FIXED' S)? %AttValue
  1258. */
  1259. void parseDefault (String elementName, String name, int type, String enum)
  1260. throws java.lang.Exception
  1261. {
  1262. int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
  1263. String value = null;
  1264. boolean normalizeWSFlag;
  1265. if (tryRead('#')) {
  1266. if (tryRead("FIXED")) {
  1267. valueType = ATTRIBUTE_DEFAULT_FIXED;
  1268. requireWhitespace();
  1269. context = CONTEXT_ATTRIBUTEVALUE;
  1270. value = readLiteral(LIT_CHAR_REF);
  1271. context = CONTEXT_DTD;
  1272. } else if (tryRead("REQUIRED")) {
  1273. valueType = ATTRIBUTE_DEFAULT_REQUIRED;
  1274. } else if (tryRead("IMPLIED")) {
  1275. valueType = ATTRIBUTE_DEFAULT_IMPLIED;
  1276. } else {
  1277. error("illegal keyword for attribute default value", null, null);
  1278. }
  1279. } else {
  1280. context = CONTEXT_ATTRIBUTEVALUE;
  1281. value = readLiteral(LIT_CHAR_REF);
  1282. context = CONTEXT_DTD;
  1283. }
  1284. setAttribute(elementName, name, type, enum, value, valueType);
  1285. }
  1286. /**
  1287. * Parse a conditional section.
  1288. * [63] conditionalSect ::= includeSect || ignoreSect
  1289. * [64] includeSect ::= '<![' %'INCLUDE' '[' (%markupdecl*)* ']]>'
  1290. * [65] ignoreSect ::= '<![' %'IGNORE' '[' ignoreSectContents* ']]>'
  1291. * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]>'))
  1292. * | ('<![' ignoreSectContents* ']]>')
  1293. * | (Char - (']' | [<'"]))
  1294. * | ('<!' (Char - ('-' | '[')))
  1295. * *NOTE: the '<![' has already been read.
  1296. * *TODO: verify that I am handling ignoreSectContents right.
  1297. */
  1298. void parseConditionalSect ()
  1299. throws java.lang.Exception
  1300. {
  1301. skipWhitespace();
  1302. if (tryRead("INCLUDE")) {
  1303. skipWhitespace();
  1304. require('[');
  1305. skipWhitespace();
  1306. while (!tryRead("]]>")) {
  1307. parseMarkupdecl();
  1308. skipWhitespace();
  1309. }
  1310. } else if (tryRead("IGNORE")) {
  1311. skipWhitespace();
  1312. require('[');
  1313. int nesting = 1;
  1314. char c;
  1315. for (int nest = 1; nest > 0; ) {
  1316. c = readCh();
  1317. switch (c) {
  1318. case '<':
  1319. if (tryRead("![")) {
  1320. nest++;
  1321. }
  1322. case ']':
  1323. if (tryRead("]>")) {
  1324. nest--;
  1325. }
  1326. }
  1327. }
  1328. } else {
  1329. error("conditional section must begin with INCLUDE or IGNORE",
  1330. null, null);
  1331. }
  1332. }
  1333. /**
  1334. * Read a character reference.
  1335. * [67] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
  1336. * *NOTE: the '&#' has already been read.
  1337. */
  1338. void parseCharRef ()
  1339. throws java.lang.Exception
  1340. {
  1341. int value = 0;
  1342. char c;
  1343. if (tryRead('x')) {
  1344. loop1: while (true) {
  1345. c = readCh();
  1346. switch (c) {
  1347. case '0':
  1348. case '1':
  1349. case '2':
  1350. case '3':
  1351. case '4':
  1352. case '5':
  1353. case '6':
  1354. case '7':
  1355. case '8':
  1356. case '9':
  1357. case 'a':
  1358. case 'A':
  1359. case 'b':
  1360. case 'B':
  1361. case 'c':
  1362. case 'C':
  1363. case 'd':
  1364. case 'D':
  1365. case 'e':
  1366. case 'E':
  1367. case 'f':
  1368. case 'F':
  1369. value *= 16;
  1370. value += Integer.parseInt(new Character(c).toString(), 16);
  1371. break;
  1372. case ';':
  1373. break loop1;
  1374. default:
  1375. error("illegal character in character reference", c, null);
  1376. break loop1;
  1377. }
  1378. }
  1379. } else {
  1380. loop2: while (true) {
  1381. c = readCh();
  1382. switch (c) {
  1383. case '0':
  1384. case '1':
  1385. case '2':
  1386. case '3':
  1387. case '4':
  1388. case '5':
  1389. case '6':
  1390. case '7':
  1391. case '8':
  1392. case '9':
  1393. value *= 10;
  1394. value += Integer.parseInt(new Character(c).toString(), 10);
  1395. break;
  1396. case ';':
  1397. break loop2;
  1398. default:
  1399. error("illegal character in character reference", c, null);
  1400. break loop2;
  1401. }
  1402. }
  1403. }
  1404. // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
  1405. // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
  1406. if (value <= 0x0000ffff) {
  1407. // no surrogates needed
  1408. dataBufferAppend((char)value);
  1409. } else if (value <= 0x000fffff) {
  1410. // > 16 bits, surrogate needed
  1411. dataBufferAppend((char)(0xd8 | ((value & 0x000ffc00) >> 10)));
  1412. dataBufferAppend((char)(0xdc | (value & 0x0003ff)));
  1413. } else {
  1414. // too big for surrogate
  1415. error("character reference " + value + " is too large for UTF-16",
  1416. new Integer(value).toString(), null);
  1417. }
  1418. }
  1419. /**
  1420. * Parse a reference.
  1421. * [69] EntityRef ::= '&' Name ';'
  1422. * *NOTE: the '&' has already been read.
  1423. * @param externalAllowed External entities are allowed here.
  1424. */
  1425. void parseEntityRef (boolean externalAllowed)
  1426. throws java.lang.Exception
  1427. {
  1428. String name;
  1429. name = readNmtoken(true);
  1430. require(';');
  1431. switch (getEntityType(name)) {
  1432. case ENTITY_UNDECLARED:
  1433. error("reference to undeclared entity", name, null);
  1434. break;
  1435. case ENTITY_INTERNAL:
  1436. pushString(name, getEntityValue(name));
  1437. break;
  1438. case ENTITY_TEXT:
  1439. if (externalAllowed) {
  1440. pushURL(name, getEntityPublicId(name),
  1441. getEntitySystemId(name),
  1442. null, null, null);
  1443. } else {
  1444. error("reference to external entity in attribute value.", name, null);
  1445. }
  1446. break;
  1447. case ENTITY_NDATA:
  1448. if (externalAllowed) {
  1449. error("data entity reference in content", name, null);
  1450. } else {
  1451. error("reference to external entity in attribute value.", name, null);
  1452. }
  1453. break;
  1454. }
  1455. }
  1456. /**
  1457. * Parse a parameter entity reference.
  1458. * [70] PEReference ::= '%' Name ';'
  1459. * *NOTE: the '%' has already been read.
  1460. */
  1461. void parsePEReference (boolean isEntityValue)
  1462. throws java.lang.Exception
  1463. {
  1464. String name;
  1465. name = "%" + readNmtoken(true);
  1466. require(';');
  1467. switch (getEntityType(name)) {
  1468. case ENTITY_UNDECLARED:
  1469. error("reference to undeclared parameter entity", name, null);
  1470. break;
  1471. case ENTITY_INTERNAL:
  1472. if (isEntityValue) {
  1473. pushString(name, getEntityValue(name));
  1474. } else {
  1475. pushString(name, " " + getEntityValue(name) + ' ');
  1476. }
  1477. break;
  1478. case ENTITY_TEXT:
  1479. if (isEntityValue) {
  1480. pushString(null, " ");
  1481. }
  1482. pushURL(name, getEntityPublicId(name),
  1483. getEntitySystemId(name),
  1484. null, null, null);
  1485. if (isEntityValue) {
  1486. pushString(null, " ");
  1487. }
  1488. break;
  1489. }
  1490. }
  1491. /**
  1492. * Parse an entity declaration.
  1493. * [71] EntityDecl ::= '<!ENTITY' S %Name S %EntityDef S? '>'
  1494. * | '<!ENTITY' S '%' S %Name S %EntityDef S? '>'
  1495. * [72] EntityDef ::= EntityValue | ExternalDef
  1496. * [73] ExternalDef ::= ExternalID %NDataDecl?
  1497. * [74] ExternalID ::= 'SYSTEM' S SystemLiteral
  1498. * | 'PUBLIC' S PubidLiteral S SystemLiteral
  1499. * [75] NDataDecl ::= S %'NDATA' S %Name
  1500. * *NOTE: the '<!ENTITY' has already been read.
  1501. */
  1502. void parseEntityDecl ()
  1503. throws java.lang.Exception
  1504. {
  1505. char c;
  1506. boolean peFlag = false;
  1507. String name, value, notationName, ids[];
  1508. // Check for a parameter entity.
  1509. requireWhitespace();
  1510. if (tryRead('%')) {
  1511. peFlag = true;
  1512. requireWhitespace();
  1513. }
  1514. // Read the entity name, and prepend
  1515. // '%' if necessary.
  1516. name = readNmtoken(true);
  1517. if (peFlag) {
  1518. name = "%" + name;
  1519. }
  1520. // Read the entity value.
  1521. requireWhitespace();
  1522. c = readCh();
  1523. unread(c);
  1524. if (c == '"' || c == '\'') {
  1525. // Internal entity.
  1526. context = CONTEXT_ENTITYVALUE;
  1527. value = readLiteral(LIT_CHAR_REF|LIT_PE_REF);
  1528. context = CONTEXT_DTD;
  1529. setInternalEntity(name,value);
  1530. } else {
  1531. // Read the external IDs
  1532. ids = readExternalIds(false);
  1533. if (ids[1] == null) {
  1534. error("system identifer missing", name, null);
  1535. }
  1536. // Check for NDATA declaration.
  1537. skipWhitespace();
  1538. if (tryRead("NDATA")) {
  1539. requireWhitespace();
  1540. notationName = readNmtoken(true);
  1541. setExternalDataEntity(name, ids[0], ids[1], notationName);
  1542. } else {
  1543. setExternalTextEntity(name, ids[0], ids[1]);
  1544. }
  1545. }
  1546. // Finish the declaration.
  1547. skipWhitespace();
  1548. require('>');
  1549. }
  1550. /**
  1551. * Parse a notation declaration.
  1552. * [81] NotationDecl ::= '<!NOTATION' S %Name S %ExternalID S? '>'
  1553. * *NOTE: the '<!NOTATION' has already been read.
  1554. */
  1555. void parseNotationDecl ()
  1556. throws java.lang.Exception
  1557. {
  1558. String nname, ids[];
  1559. requireWhitespace();
  1560. nname = readNmtoken(true);
  1561. requireWhitespace();
  1562. // Read the external identifiers.
  1563. ids = readExternalIds(true);
  1564. if (ids[0] == null && ids[1] == null) {
  1565. error("external identifer missing", nname, null);
  1566. }
  1567. // Register the notation.
  1568. setNotation(nname, ids[0], ids[1]);
  1569. skipWhitespace();
  1570. require('>');
  1571. }
  1572. /**
  1573. * Parse PCDATA.
  1574. * <pre>
  1575. * [16] PCData ::= [^&lt;&amp;]*
  1576. * </pre>
  1577. * <p>The trick here is that the data stays in the dataBuffer without
  1578. * necessarily being converted to a string right away.
  1579. */
  1580. void parsePCData ()
  1581. throws java.lang.Exception
  1582. {
  1583. char c;
  1584. // Start with a little cheat -- in most
  1585. // cases, the entire sequence of
  1586. // character data will already be in
  1587. // the readBuffer; if not, fall through to
  1588. // the normal approach.
  1589. if (USE_CHEATS) {
  1590. int lineAugment = 0;
  1591. int columnAugment = 0;
  1592. loop: for (int i = readBufferPos; i < readBufferLength; i++) {
  1593. switch (readBuffer[i]) {
  1594. case '\n':
  1595. lineAugment++;
  1596. columnAugment = 0;
  1597. break;
  1598. case '&':
  1599. case '<':
  1600. int start = readBufferPos;
  1601. columnAugment++;
  1602. readBufferPos = i;
  1603. if (lineAugment > 0) {
  1604. line += lineAugment;
  1605. column = columnAugment;
  1606. } else {
  1607. column += columnAugment;
  1608. }
  1609. dataBufferAppend(readBuffer, start, i-start);
  1610. return;
  1611. default:
  1612. columnAugment++;
  1613. }
  1614. }
  1615. }
  1616. // OK, the cheat didn't work; start over
  1617. // and do it by the book.
  1618. while (true) {
  1619. c = readCh();
  1620. switch (c) {
  1621. case '<':
  1622. case '&':
  1623. unread(c);
  1624. return;
  1625. default:
  1626. dataBufferAppend(c);
  1627. break;
  1628. }
  1629. }
  1630. }
  1631. //////////////////////////////////////////////////////////////////////
  1632. // High-level reading and scanning methods.
  1633. //////////////////////////////////////////////////////////////////////
  1634. /**
  1635. * Require whitespace characters.
  1636. * [1] S ::= (#x20 | #x9 | #xd | #xa)+
  1637. */
  1638. void requireWhitespace ()
  1639. throws java.lang.Exception
  1640. {
  1641. char c = readCh();
  1642. if (isWhitespace(c)) {
  1643. skipWhitespace();
  1644. } else {
  1645. error("whitespace expected", c, null);
  1646. }
  1647. }
  1648. /**
  1649. * Parse whitespace characters, and leave them in the data buffer.
  1650. */
  1651. void parseWhitespace ()
  1652. throws java.lang.Exception
  1653. {
  1654. char c = readCh();
  1655. while (isWhitespace(c)) {
  1656. dataBufferAppend(c);
  1657. c = readCh();
  1658. }
  1659. unread(c);
  1660. }
  1661. /**
  1662. * Skip whitespace characters.
  1663. * [1] S ::= (#x20 | #x9 | #xd | #xa)+
  1664. */
  1665. void skipWhitespace ()
  1666. throws java.lang.Exception
  1667. {
  1668. // Start with a little cheat. Most of
  1669. // the time, the white space will fall
  1670. // within the current read buffer; if
  1671. // not, then fall through.
  1672. if (USE_CHEATS) {
  1673. int lineAugment = 0;
  1674. int columnAugment = 0;
  1675. loop: for (int i = readBufferPos; i < readBufferLength; i++) {
  1676. switch (readBuffer[i]) {
  1677. case ' ':
  1678. case '\t':
  1679. case '\r':
  1680. columnAugment++;
  1681. break;
  1682. case '\n':
  1683. lineAugment++;
  1684. columnAugment = 0;
  1685. break;
  1686. case '%':
  1687. if (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE) {
  1688. break loop;
  1689. } // else fall through...
  1690. default:
  1691. readBufferPos = i;
  1692. if (lineAugment > 0) {
  1693. line += lineAugment;
  1694. column = columnAugment;
  1695. } else {
  1696. column += columnAugment;
  1697. }
  1698. return;
  1699. }
  1700. }
  1701. }
  1702. // OK, do it by the book.
  1703. char c = readCh();
  1704. while (isWhitespace(c)) {
  1705. c = readCh();
  1706. }
  1707. unread(c);
  1708. }
  1709. /**
  1710. * Read a name or name token.
  1711. * [5] Name ::= (Letter | '_' | ':') (NameChar)*
  1712. * [7] Nmtoken ::= (NameChar)+
  1713. * *NOTE: [6] is implemented implicitly where required.
  1714. */
  1715. String readNmtoken (boolean isName)
  1716. throws java.lang.Exception
  1717. {
  1718. char c;
  1719. if (USE_CHEATS) {
  1720. loop: for (int i = readBufferPos; i < readBufferLength; i++) {
  1721. switch (readBuffer[i]) {
  1722. case '%':
  1723. if (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE) {
  1724. break loop;
  1725. } // else fall through...
  1726. case '<':
  1727. case '>':
  1728. case '&':
  1729. case ',':
  1730. case '|':
  1731. case '*':
  1732. case '+':
  1733. case '?':
  1734. case ')':
  1735. case '=':
  1736. case '\'':
  1737. case '"':
  1738. case '[':
  1739. case ' ':
  1740. case '\t':
  1741. case '\r':
  1742. case '\n':
  1743. case ';':
  1744. case '/':
  1745. case '#':
  1746. int start = readBufferPos;
  1747. if (i == start) {
  1748. error("name expected", readBuffer[i], null);
  1749. }
  1750. readBufferPos = i;
  1751. return intern(readBuffer, start, i - start);
  1752. }
  1753. }
  1754. }
  1755. nameBufferPos = 0;
  1756. // Read the first character.
  1757. loop: while (true) {
  1758. c = readCh();
  1759. switch (c) {
  1760. case '%':
  1761. case '<':
  1762. case '>':
  1763. case '&':
  1764. case ',':
  1765. case '|':
  1766. case '*':
  1767. case '+':
  1768. case '?':
  1769. case ')':
  1770. case '=':
  1771. case '\'':
  1772. case '"':
  1773. case '[':
  1774. case ' ':
  1775. case '\t':
  1776. case '\n':
  1777. case '\r':
  1778. case ';':
  1779. case '/':
  1780. unread(c);
  1781. if (nameBufferPos == 0) {
  1782. error("name expected", null, null);
  1783. }
  1784. String s = intern(nameBuffer,0,nameBufferPos);
  1785. nameBufferPos = 0;
  1786. return s;
  1787. default:
  1788. nameBuffer =
  1789. (char[])extendArray(nameBuffer, nameBuffer.length, nameBufferPos);
  1790. nameBuffer[nameBufferPos++] = c;
  1791. }
  1792. }
  1793. }
  1794. /**
  1795. * Read a literal.
  1796. * [10] AttValue ::= '"' ([^<&"] | Referenc…

Large files files are truncated, but you can click here to view the full file