PageRenderTime 48ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/bundles/plugins-trunk/XML/xml/DTDGenerator.java

#
Java | 638 lines | 351 code | 99 blank | 188 comment | 88 complexity | 0b462c9eb4d6eb83063139c9dac7eb4a MD5 | raw file
Possible License(s): BSD-3-Clause, AGPL-1.0, Apache-2.0, LGPL-2.0, LGPL-3.0, GPL-2.0, CC-BY-SA-3.0, LGPL-2.1, GPL-3.0, MPL-2.0-no-copyleft-exception, IPL-1.0
  1. /*
  2. * DTDGenerator.java
  3. *
  4. * Copyright (C) 2001 Michael H. Kay
  5. * Portions copyright (C) 2007 Martin Raspe
  6. *
  7. * The XML plugin is licensed under the GNU General Public License, with
  8. * the following exception:
  9. *
  10. * "Permission is granted to link this code with software released under
  11. * the Apache license version 1.1, for example used by the Xerces XML
  12. * parser package."
  13. *
  14. * The code of this module is licensed under the terms of the
  15. * Mozilla Public License Version 1.0. (http://www.mozilla.org/MPL/)
  16. */
  17. package xml;
  18. import java.util.*;
  19. import java.io.StringReader;
  20. import java.lang.reflect.Array;
  21. import javax.xml.parsers.SAXParserFactory;
  22. import org.xml.sax.*;
  23. import org.xml.sax.helpers.DefaultHandler;
  24. import org.gjt.sp.jedit.*;
  25. /**
  26. * DTDGenerator<BR>
  27. * Generates a possible DTD from an XML document instance.
  28. * Pure SAX version of the Saxon DTDGenerator
  29. * The program has no remaining dependencies on Saxon; all it needs is:
  30. * JAXP 1.1
  31. * SAX2
  32. * A JAXP 1.1 conformant XML parser
  33. * Java 5 or later (this adaptation uses generics)
  34. * @author M.H.Kay
  35. * @version 7.0: separated from Saxon source, now works with any JAXP 1.1 XML parser
  36. *
  37. * Adapted for the jEdit XML Plugin by Martin Raspe (hertzhaft@biblhertz.it)
  38. */
  39. public class DTDGenerator extends DefaultHandler {
  40. // DTDGenerator is a ContentHandler, created for convenience
  41. // by extending the default handler that comes with SAX2
  42. protected static int MIN_ENUMERATION_INSTANCES = 10;
  43. // minimum number of appearances of an attribute for
  44. // it to be considered a candidate for an enumeration type
  45. protected static int MAX_ENUMERATION_VALUES = 20;
  46. // maximum number of distinct attribute values to be
  47. // included in an enumeration
  48. protected static int MIN_ENUMERATION_RATIO = 3;
  49. // an attribute will be regarded as an enumeration attribute
  50. // only if the number of instances divided by the number of
  51. // distinct values is >= this ratio
  52. protected static int MIN_FIXED = 5;
  53. // minimum number of attributes that must appear, with
  54. // the same value each time, for the value to be regarded
  55. // as FIXED
  56. protected static int MIN_ID_VALUES = 10;
  57. // minumum number of attribute values that must appear
  58. // for the attribute to be regarded as an ID value
  59. protected static int MAX_ID_VALUES = 100000;
  60. // maximum number of attribute values to be saved
  61. // while checking for uniqueness
  62. TreeMap<String,ElementDetails> elementList; // alphabetical list of element types appearing in the document;
  63. // each has the element name as a key and an ElementDetails object
  64. // as the value
  65. Stack<StackEntry> elementStack; // stack of elements currently open; each entry is a StackEntry
  66. // object
  67. public DTDGenerator ()
  68. {
  69. elementList = new TreeMap<String,ElementDetails>();
  70. elementStack = new Stack<StackEntry>();
  71. }
  72. /**
  73. * Write a DTD for the given XML document
  74. */
  75. public static String write (View view, String xml)
  76. {
  77. DTDGenerator generator = new DTDGenerator();
  78. generator.parse(view, xml);
  79. return generator.printDTD();
  80. }
  81. /**
  82. * (planned) Write an XML Schema for the given XML document
  83. */
  84. public static String writeXSD (View view, String xml)
  85. {
  86. DTDGenerator generator = new DTDGenerator();
  87. generator.parse(view, xml);
  88. // return generator.printXSD();
  89. return generator.printDTD();
  90. }
  91. /**
  92. * (planned) Write a Relax NG Schema for the given XML document
  93. */
  94. public static String writeRNG (View view, String xml)
  95. {
  96. DTDGenerator generator = new DTDGenerator();
  97. generator.parse(view, xml);
  98. // return generator.printRNG();
  99. return generator.printDTD();
  100. }
  101. private void parse(View view, String text) {
  102. try {
  103. InputSource is = new InputSource(new StringReader(text));
  104. XMLReader parser = SAXParserFactory.newInstance().newSAXParser().getXMLReader();
  105. parser.setContentHandler(this);
  106. parser.setEntityResolver(this);
  107. parser.parse(is);
  108. } catch (Exception err) {
  109. StringBuffer s = new StringBuffer("Failed while parsing text:\n");
  110. s.append(err.getMessage() + "\n");
  111. // StackTraceElement[] st = err.getStackTrace();
  112. // for (int i = 0; i < Array.getLength(st); i++) {
  113. // s.append(st[i].toString() + "\n");
  114. // }
  115. Macros.error(view, s.toString());
  116. }
  117. }
  118. /**
  119. * Test whether a string is an XML name.
  120. * TODO: This is currently an incomplete test, it treats all non-ASCII characters
  121. * as being valid in names.
  122. */
  123. private boolean isValidName(String s) {
  124. if (!isValidNMTOKEN(s)) return false;
  125. int c = s.charAt(0);
  126. return ! ((c>=0x30 && c<=0x39) || c=='.' || c=='-' );
  127. }
  128. /**
  129. * Test whether a string is an XML NMTOKEN.
  130. * TODO: This is currently an incomplete test, it treats all non-ASCII characters
  131. * as being valid in NMTOKENs.
  132. */
  133. private boolean isValidNMTOKEN(String s) {
  134. if (s.length()==0) return false;
  135. for (int i=0; i<s.length(); i++) {
  136. int c = s.charAt(i);
  137. if (!( (c>=0x41 && c<=0x5a) ||
  138. (c>=0x61 && c<=0x7a) ||
  139. (c>=0x30 && c<=0x39) ||
  140. c=='.' ||
  141. c=='_' ||
  142. c=='-' ||
  143. c==':' ||
  144. c>128 ))
  145. return false;
  146. }
  147. return true;
  148. }
  149. /**
  150. * When the whole document has been analysed, construct the DTD
  151. */
  152. private String printDTD ()
  153. {
  154. // process the element types encountered, in turn
  155. StringBuffer s = new StringBuffer();
  156. Iterator<String> e=elementList.keySet().iterator();
  157. while ( e.hasNext() )
  158. {
  159. String elementname = e.next();
  160. ElementDetails ed = elementList.get(elementname);
  161. TreeMap children = ed.children;
  162. Set childKeys = children.keySet();
  163. //EMPTY content
  164. if (childKeys.size()==0 && !ed.hasCharacterContent)
  165. s.append("<!ELEMENT " + elementname + " EMPTY >\n");
  166. //CHARACTER content
  167. if (childKeys.size()==0 && ed.hasCharacterContent)
  168. s.append("<!ELEMENT " + elementname + " ( #PCDATA ) >\n");
  169. //ELEMENT content
  170. if (childKeys.size()>0 && !ed.hasCharacterContent) {
  171. s.append("<!ELEMENT " + elementname + " ( ");
  172. if (ed.sequenced) {
  173. // all elements of this type have the same child elements
  174. // in the same sequence, retained in the childseq vector
  175. Enumeration c = ed.childseq.elements();
  176. while (true) {
  177. ChildDetails ch = (ChildDetails)c.nextElement();
  178. s.append(ch.name);
  179. if (ch.repeatable && !ch.optional)
  180. s.append("+");
  181. if (ch.repeatable && ch.optional)
  182. s.append("*");
  183. if (ch.optional && !ch.repeatable)
  184. s.append("?");
  185. if (c.hasMoreElements())
  186. s.append(", ");
  187. else
  188. break;
  189. }
  190. s.append(" ) >\n");
  191. }
  192. else {
  193. // the children don't always appear in the same sequence; so
  194. // list them alphabetically and allow them to be in any order
  195. Iterator c1 = childKeys.iterator();
  196. while (c1.hasNext()) {
  197. s.append((String)c1.next());
  198. if (c1.hasNext()) s.append(" | ");
  199. }
  200. s.append(" )* >\n");
  201. }
  202. };
  203. //MIXED content
  204. if (childKeys.size()>0 && ed.hasCharacterContent) {
  205. s.append("<!ELEMENT " + elementname + " ( #PCDATA");
  206. Iterator c2 = childKeys.iterator();
  207. while (c2.hasNext()) {
  208. s.append(" | " + (String)c2.next());
  209. }
  210. s.append(" )* >\n");
  211. };
  212. // Now examine the attributes encountered for this element type
  213. TreeMap attlist = ed.attributes;
  214. boolean doneID = false; // to ensure we have at most one ID attribute per element
  215. Iterator a=attlist.keySet().iterator();
  216. while ( a.hasNext() )
  217. {
  218. String attname = (String) a.next();
  219. AttributeDetails ad = (AttributeDetails) attlist.get(attname);
  220. // If the attribute is present on every instance of the element, treat it as required
  221. boolean required = (ad.occurrences==ed.occurrences);
  222. // If every value of the attribute is distinct,
  223. // and there are at least MIN_ID_VALUES, treat it as an ID
  224. // TODO: this may give the wrong answer, we should check whether the value sets of two
  225. // candidate-ID attributes overlap, in which case they can't both be IDs !!)
  226. boolean isid = ad.allNames && // ID values must be Names
  227. (!doneID) && // Only allowed one ID attribute per element type
  228. (ad.unique) &&
  229. (ad.occurrences>=MIN_ID_VALUES);
  230. // if there is only one attribute value, and at least MIN_FIXED occurrences of it,
  231. // treat it as FIXED
  232. boolean isfixed = required && ad.values.size()==1 && ad.occurrences >= MIN_FIXED;
  233. // if the number of distinct values is small compared with the number of occurrences,
  234. // treat it as an enumeration
  235. boolean isenum = ad.allNMTOKENs && // Enumeration values must be NMTOKENs
  236. (ad.occurrences>=MIN_ENUMERATION_INSTANCES) &&
  237. (ad.values.size()<=ad.occurrences/MIN_ENUMERATION_RATIO) &&
  238. (ad.values.size()<=MAX_ENUMERATION_VALUES);
  239. s.append("<!ATTLIST " + elementname + " " + attname + " ");
  240. String tokentype = (ad.allNMTOKENs ? "NMTOKEN" : "CDATA");
  241. if (isid) {
  242. s.append("ID");
  243. doneID = true;
  244. }
  245. else if (isfixed) {
  246. String val = (String) ad.values.first();
  247. s.append(tokentype + " #FIXED \"" + escape(val) + "\" >\n");
  248. }
  249. else if (isenum) {
  250. s.append("( ");
  251. Iterator v = ad.values.iterator();
  252. while (v.hasNext()) {
  253. s.append((String) v.next());
  254. if (!v.hasNext()) break;
  255. s.append(" | ");
  256. };
  257. s.append(" )");
  258. }
  259. else
  260. s.append(tokentype);
  261. if (!isfixed) {
  262. if (required)
  263. s.append(" #REQUIRED >\n");
  264. else
  265. s.append(" #IMPLIED >\n");
  266. }
  267. };
  268. s.append("\n");
  269. };
  270. return s.toString();
  271. }
  272. /**
  273. * Escape special characters for display.
  274. * @param ch The character array containing the string
  275. * @param start The start position of the input string within the character array
  276. * @param length The length of the input string within the character array
  277. * @return The XML/HTML representation of the string<br>
  278. * This static method converts a Unicode string to a string containing
  279. * only ASCII characters, in which non-ASCII characters are represented
  280. * by the usual XML/HTML escape conventions (for example, "&lt;" becomes "&amp;lt;").
  281. * Note: if the input consists solely of ASCII or Latin-1 characters,
  282. * the output will be equally valid in XML and HTML. Otherwise it will be valid
  283. * only in XML.
  284. * The escaped characters are written to the dest array starting at position 0; the
  285. * number of positions used is returned as the result
  286. * For jEdit we leave all special characters alone and escape just the five classics.
  287. * The corresponding method generateDTD() in XMLActions.java copies
  288. * the encoding of the current buffer over to the DTD
  289. */
  290. private static int escape(char ch[], int start, int length, char[] out)
  291. {
  292. int o = 0;
  293. for (int i = start; i < start+length; i++) {
  294. if (ch[i]=='<') {("&lt;").getChars(0,4,out,o); o+=4;}
  295. else if (ch[i]=='>') {("&gt;").getChars(0,4,out,o); o+=4;}
  296. else if (ch[i]=='&') {("&amp;").getChars(0,5,out,o); o+=5;}
  297. else if (ch[i]=='\"') {("&#34;").getChars(0,5,out,o); o+=5;}
  298. else if (ch[i]=='\'') {("&#39;").getChars(0,5,out,o); o+=5;}
  299. else {out[o++]=ch[i]; }
  300. // else if (ch[i]<=0x7f) {out[o++]=ch[i];}
  301. // else {
  302. // String dec = "&#" + Integer.toString((int)ch[i]) + ';';
  303. // dec.getChars(0, dec.length(), out, o);
  304. // o+=dec.length();
  305. // }
  306. }
  307. return o;
  308. }
  309. /**
  310. * Escape special characters in a String value.
  311. * @param in The input string
  312. * @return The XML representation of the string<br>
  313. * This static method converts a Unicode string to a string containing
  314. * only ASCII characters, in which non-ASCII characters are represented
  315. * by the usual XML/HTML escape conventions (for example, "&lt;" becomes
  316. * "&amp;lt;").<br>
  317. * Note: if the input consists solely of ASCII or Latin-1 characters,
  318. * the output will be equally valid in XML and HTML. Otherwise it will be valid
  319. * only in XML.
  320. */
  321. private static String escape(String in)
  322. {
  323. char[] dest = new char[in.length()*8];
  324. int newlen = escape( in.toCharArray(), 0, in.length(), dest);
  325. return new String(dest, 0, newlen);
  326. }
  327. /**
  328. * We ignore all references to external entities such as DOCTYPE declarations.
  329. * Method borrowed from xml.parser.SAXParserImpl
  330. */
  331. public InputSource resolveEntity(String publicId, String systemId)
  332. throws SAXException
  333. {
  334. InputSource dummy = new InputSource(new StringReader("<!-- -->"));
  335. dummy.setSystemId(systemId);
  336. dummy.setPublicId(publicId);
  337. return dummy;
  338. }
  339. /**
  340. * Handle the start of an element. Record information about the position of this
  341. * element relative to its parent, and about the attributes of the element.
  342. */
  343. public void startElement (String uri, String localName, String name, Attributes attributes)
  344. throws SAXException
  345. {
  346. StackEntry se = new StackEntry();
  347. // create an entry in the Element List, or locate the existing entry
  348. ElementDetails ed = (ElementDetails) elementList.get(name);
  349. if (ed==null) {
  350. ed = new ElementDetails(name);
  351. elementList.put(name,ed);
  352. };
  353. // retain the associated element details object
  354. se.elementDetails = ed;
  355. // initialise sequence numbering of child element types
  356. se.sequenceNumber = -1;
  357. // count occurrences of this element type
  358. ed.occurrences++;
  359. // Handle the attributes accumulated for this element.
  360. // Merge the new attribute list into the existing list for the element
  361. for (int a=0; a<attributes.getLength(); a++) {
  362. String attName = attributes.getQName(a);
  363. String val = attributes.getValue(a);
  364. AttributeDetails ad = (AttributeDetails) ed.attributes.get(attName);
  365. if (ad==null) {
  366. ad=new AttributeDetails(attName);
  367. ed.attributes.put(attName, ad);
  368. };
  369. if (!ad.values.contains(val)) {
  370. // We haven't seen this attribute value before
  371. ad.values.add(val);
  372. // Check if attribute value is a valid name
  373. if (ad.allNames && !isValidName(val)) {
  374. ad.allNames = false;
  375. }
  376. // Check if attribute value is a valid NMTOKEN
  377. if (ad.allNMTOKENs && !isValidNMTOKEN(val)) {
  378. ad.allNMTOKENs = false;
  379. }
  380. // For economy, don't save the new value unless it's needed;
  381. // it's needed only if we're looking for ID values or enumerated values
  382. if (ad.unique && ad.allNames && ad.occurrences <= MAX_ID_VALUES) {
  383. ad.values.add(val);
  384. } else if (ad.values.size() <= MAX_ENUMERATION_VALUES) {
  385. ad.values.add(val);
  386. }
  387. } else {
  388. // We've seen this attribute value before
  389. ad.unique = false;
  390. }
  391. ad.occurrences++;
  392. };
  393. // now keep track of the nesting and sequencing of child elements
  394. if (!elementStack.isEmpty()) {
  395. StackEntry parent = (StackEntry)elementStack.peek();
  396. ElementDetails parentDetails = parent.elementDetails;
  397. int seq = parent.sequenceNumber;
  398. // for sequencing, we're interested in consecutive groups of the same child element type
  399. boolean isFirstInGroup = (parent.latestChild==null || (!parent.latestChild.equals(name)));
  400. if (isFirstInGroup) {
  401. seq++;
  402. parent.sequenceNumber++;
  403. }
  404. parent.latestChild = name;
  405. // if we've seen this child of this parent before, get the details
  406. TreeMap<String,ChildDetails> children = parentDetails.children;
  407. ChildDetails c = children.get(name);
  408. if (c==null) {
  409. // this is the first time we've seen this child belonging to this parent
  410. c = new ChildDetails();
  411. c.name = name;
  412. c.position = seq;
  413. c.repeatable = false;
  414. c.optional = false;
  415. children.put(name, c);
  416. parentDetails.childseq.addElement(c);
  417. // if the first time we see this child is not on the first instance of the parent,
  418. // then we allow it as an optional element
  419. if (parentDetails.occurrences!=1) {
  420. c.optional = true;
  421. }
  422. } else {
  423. // if it's the first occurrence of the parent element, and we've seen this
  424. // child before, and it's the first of a new group, then the child occurrences are
  425. // not consecutive
  426. if (parentDetails.occurrences==1 && isFirstInGroup) {
  427. parentDetails.sequenced = false;
  428. }
  429. // check whether the position of this group of children in this parent element is
  430. // the same as its position in previous instances of the parent.
  431. if (parentDetails.childseq.size()<=seq ||
  432. !((ChildDetails)parentDetails.childseq.elementAt(seq)).name.equals(name))
  433. {
  434. parentDetails.sequenced = false;
  435. }
  436. }
  437. // if there's more than one child element, mark it as repeatable
  438. if (!isFirstInGroup) {
  439. c.repeatable = true;
  440. }
  441. }
  442. elementStack.push(se);
  443. }
  444. /**
  445. * End of element. If sequenced, check that all expected children are accounted for.
  446. */
  447. public void endElement (String uri, String localName, String name)
  448. throws SAXException
  449. {
  450. // If the number of child element groups in this parent element is less than the
  451. // number in previous elements, then the absent children are marked as optional
  452. ElementDetails ed = (ElementDetails) elementList.get(name);
  453. if (ed.sequenced) {
  454. StackEntry se = (StackEntry)elementStack.peek();
  455. int seq = se.sequenceNumber;
  456. for (int i=seq+1; i<ed.childseq.size(); i++) {
  457. ((ChildDetails)ed.childseq.elementAt(i)).optional = true;
  458. }
  459. }
  460. elementStack.pop();
  461. }
  462. /**
  463. * Handle character data.
  464. * Make a note whether significant character data is found in the element
  465. */
  466. public void characters (char ch[], int start, int length)
  467. throws SAXException
  468. {
  469. ElementDetails ed = ((StackEntry)elementStack.peek()).elementDetails;
  470. if (!ed.hasCharacterContent) {
  471. for (int i=start; i<start+length; i++) {
  472. if ((int)ch[i] > 0x20) {
  473. ed.hasCharacterContent = true;
  474. break;
  475. }
  476. }
  477. }
  478. }
  479. /**
  480. * ElementDetails is a data structure to keep information about element types
  481. */
  482. private class ElementDetails {
  483. String name;
  484. int occurrences;
  485. boolean hasCharacterContent;
  486. boolean sequenced;
  487. TreeMap<String,ChildDetails> children;
  488. Vector<ChildDetails> childseq;
  489. TreeMap<String,AttributeDetails> attributes;
  490. public ElementDetails ( String name ) {
  491. this.name = name;
  492. this.occurrences = 0;
  493. this.hasCharacterContent = false;
  494. this.sequenced = true;
  495. this.children = new TreeMap<String,ChildDetails>();
  496. this.childseq = new Vector<ChildDetails>();
  497. this.attributes = new TreeMap<String,AttributeDetails>();
  498. }
  499. }
  500. /**
  501. * ChildDetails records information about the presence of a child element within its
  502. * parent element. If the parent element is sequenced, then the child elements always
  503. * occur in sequence with the given frequency.
  504. */
  505. private class ChildDetails {
  506. String name;
  507. int position;
  508. boolean repeatable;
  509. boolean optional;
  510. }
  511. /**
  512. * AttributeDetails is a data structure to keep information about attribute types
  513. */
  514. private class AttributeDetails {
  515. String name; // name of the attribute
  516. int occurrences; // number of occurrences of the attribute
  517. boolean unique; // true if no duplicate values encountered
  518. TreeSet<String> values; // set of all distinct values encountered for this attribute
  519. boolean allNames; // true if all the attribute values are valid names
  520. boolean allNMTOKENs; // true if all the attribute values are valid NMTOKENs
  521. public AttributeDetails ( String name ) {
  522. this.name = name;
  523. this.occurrences = 0;
  524. this.unique = true;
  525. this.values = new TreeSet<String>();
  526. this.allNames = true;
  527. this.allNMTOKENs = true;
  528. }
  529. }
  530. /**
  531. * StackEntry is a data structure we put on the stack for each nested element
  532. */
  533. private class StackEntry {
  534. ElementDetails elementDetails;
  535. int sequenceNumber;
  536. String latestChild;
  537. }
  538. } // end of outer class DTDGenerator