/bundles/plugins-trunk/XML/xml/DTDGenerator.java
Java | 638 lines | 351 code | 99 blank | 188 comment | 88 complexity | 0b462c9eb4d6eb83063139c9dac7eb4a MD5 | raw file
Possible License(s): BSD-3-Clause, AGPL-1.0, Apache-2.0, LGPL-2.0, LGPL-3.0, GPL-2.0, CC-BY-SA-3.0, LGPL-2.1, GPL-3.0, MPL-2.0-no-copyleft-exception, IPL-1.0
- /*
- * DTDGenerator.java
- *
- * Copyright (C) 2001 Michael H. Kay
- * Portions copyright (C) 2007 Martin Raspe
- *
- * The XML plugin is licensed under the GNU General Public License, with
- * the following exception:
- *
- * "Permission is granted to link this code with software released under
- * the Apache license version 1.1, for example used by the Xerces XML
- * parser package."
- *
- * The code of this module is licensed under the terms of the
- * Mozilla Public License Version 1.0. (http://www.mozilla.org/MPL/)
- */
-
- package xml;
-
- import java.util.*;
- import java.io.StringReader;
- import java.lang.reflect.Array;
- import javax.xml.parsers.SAXParserFactory;
-
- import org.xml.sax.*;
- import org.xml.sax.helpers.DefaultHandler;
-
- import org.gjt.sp.jedit.*;
-
- /**
- * DTDGenerator<BR>
- * Generates a possible DTD from an XML document instance.
- * Pure SAX version of the Saxon DTDGenerator
- * The program has no remaining dependencies on Saxon; all it needs is:
- * JAXP 1.1
- * SAX2
- * A JAXP 1.1 conformant XML parser
- * Java 1.2
- * @author M.H.Kay
- * @version 7.0: separated from Saxon source, now works with any JAXP 1.1 XML parser
- *
- * Adapted for the jEdit XML Plugin by Martin Raspe (hertzhaft@biblhertz.it)
- */
-
- public class DTDGenerator extends DefaultHandler {
- // DTDGenerator is a ContentHandler, created for convenience
- // by extending the default handler that comes with SAX2
-
- protected static int MIN_ENUMERATION_INSTANCES = 10;
- // minimum number of appearances of an attribute for
- // it to be considered a candidate for an enumeration type
-
- protected static int MAX_ENUMERATION_VALUES = 20;
- // maximum number of distinct attribute values to be
- // included in an enumeration
-
- protected static int MIN_ENUMERATION_RATIO = 3;
- // an attribute will be regarded as an enumeration attribute
- // only if the number of instances divided by the number of
- // distinct values is >= this ratio
-
- protected static int MIN_FIXED = 5;
- // minimum number of attributes that must appear, with
- // the same value each time, for the value to be regarded
- // as FIXED
-
- protected static int MIN_ID_VALUES = 10;
- // minumum number of attribute values that must appear
- // for the attribute to be regarded as an ID value
-
- protected static int MAX_ID_VALUES = 100000;
- // maximum number of attribute values to be saved
- // while checking for uniqueness
-
- TreeMap<String,ElementDetails> elementList; // alphabetical list of element types appearing in the document;
- // each has the element name as a key and an ElementDetails object
- // as the value
-
- Stack<StackEntry> elementStack; // stack of elements currently open; each entry is a StackEntry
- // object
-
/**
 * Creates a generator whose element list and open-element stack
 * are both empty.
 */
public DTDGenerator ()
{
    this.elementStack = new Stack<StackEntry>();
    this.elementList = new TreeMap<String,ElementDetails>();
}
-
- /**
- * Write a DTD for the given XML document
- */
-
- public static String write (View view, String xml)
- {
- DTDGenerator generator = new DTDGenerator();
- generator.parse(view, xml);
- return generator.printDTD();
- }
-
- /**
- * (planned) Write an XML Schema for the given XML document
- */
-
- public static String writeXSD (View view, String xml)
- {
- DTDGenerator generator = new DTDGenerator();
- generator.parse(view, xml);
- // return generator.printXSD();
- return generator.printDTD();
- }
-
- /**
- * (planned) Write a Relax NG Schema for the given XML document
- */
-
- public static String writeRNG (View view, String xml)
- {
- DTDGenerator generator = new DTDGenerator();
- generator.parse(view, xml);
- // return generator.printRNG();
- return generator.printDTD();
- }
-
- private void parse(View view, String text) {
- try {
- InputSource is = new InputSource(new StringReader(text));
- XMLReader parser = SAXParserFactory.newInstance().newSAXParser().getXMLReader();
- parser.setContentHandler(this);
- parser.setEntityResolver(this);
- parser.parse(is);
- } catch (Exception err) {
- StringBuffer s = new StringBuffer("Failed while parsing text:\n");
- s.append(err.getMessage() + "\n");
- // StackTraceElement[] st = err.getStackTrace();
- // for (int i = 0; i < Array.getLength(st); i++) {
- // s.append(st[i].toString() + "\n");
- // }
- Macros.error(view, s.toString());
- }
- }
-
- /**
- * Test whether a string is an XML name.
- * TODO: This is currently an incomplete test, it treats all non-ASCII characters
- * as being valid in names.
- */
-
- private boolean isValidName(String s) {
- if (!isValidNMTOKEN(s)) return false;
- int c = s.charAt(0);
- return ! ((c>=0x30 && c<=0x39) || c=='.' || c=='-' );
- }
-
- /**
- * Test whether a string is an XML NMTOKEN.
- * TODO: This is currently an incomplete test, it treats all non-ASCII characters
- * as being valid in NMTOKENs.
- */
-
- private boolean isValidNMTOKEN(String s) {
- if (s.length()==0) return false;
- for (int i=0; i<s.length(); i++) {
- int c = s.charAt(i);
- if (!( (c>=0x41 && c<=0x5a) ||
- (c>=0x61 && c<=0x7a) ||
- (c>=0x30 && c<=0x39) ||
- c=='.' ||
- c=='_' ||
- c=='-' ||
- c==':' ||
- c>128 ))
- return false;
- }
- return true;
- }
-
- /**
- * When the whole document has been analysed, construct the DTD
- */
-
- private String printDTD ()
- {
- // process the element types encountered, in turn
-
- StringBuffer s = new StringBuffer();
- Iterator<String> e=elementList.keySet().iterator();
- while ( e.hasNext() )
- {
- String elementname = e.next();
- ElementDetails ed = elementList.get(elementname);
- TreeMap children = ed.children;
- Set childKeys = children.keySet();
-
- //EMPTY content
- if (childKeys.size()==0 && !ed.hasCharacterContent)
- s.append("<!ELEMENT " + elementname + " EMPTY >\n");
-
- //CHARACTER content
- if (childKeys.size()==0 && ed.hasCharacterContent)
- s.append("<!ELEMENT " + elementname + " ( #PCDATA ) >\n");
-
- //ELEMENT content
- if (childKeys.size()>0 && !ed.hasCharacterContent) {
- s.append("<!ELEMENT " + elementname + " ( ");
-
- if (ed.sequenced) {
-
- // all elements of this type have the same child elements
- // in the same sequence, retained in the childseq vector
-
- Enumeration c = ed.childseq.elements();
- while (true) {
- ChildDetails ch = (ChildDetails)c.nextElement();
- s.append(ch.name);
- if (ch.repeatable && !ch.optional)
- s.append("+");
- if (ch.repeatable && ch.optional)
- s.append("*");
- if (ch.optional && !ch.repeatable)
- s.append("?");
- if (c.hasMoreElements())
- s.append(", ");
- else
- break;
- }
- s.append(" ) >\n");
- }
- else {
-
- // the children don't always appear in the same sequence; so
- // list them alphabetically and allow them to be in any order
-
- Iterator c1 = childKeys.iterator();
- while (c1.hasNext()) {
- s.append((String)c1.next());
- if (c1.hasNext()) s.append(" | ");
- }
- s.append(" )* >\n");
- }
- };
-
- //MIXED content
- if (childKeys.size()>0 && ed.hasCharacterContent) {
- s.append("<!ELEMENT " + elementname + " ( #PCDATA");
- Iterator c2 = childKeys.iterator();
- while (c2.hasNext()) {
- s.append(" | " + (String)c2.next());
- }
- s.append(" )* >\n");
- };
-
- // Now examine the attributes encountered for this element type
-
- TreeMap attlist = ed.attributes;
- boolean doneID = false; // to ensure we have at most one ID attribute per element
- Iterator a=attlist.keySet().iterator();
- while ( a.hasNext() )
- {
- String attname = (String) a.next();
- AttributeDetails ad = (AttributeDetails) attlist.get(attname);
-
- // If the attribute is present on every instance of the element, treat it as required
- boolean required = (ad.occurrences==ed.occurrences);
-
- // If every value of the attribute is distinct,
- // and there are at least MIN_ID_VALUES, treat it as an ID
- // TODO: this may give the wrong answer, we should check whether the value sets of two
- // candidate-ID attributes overlap, in which case they can't both be IDs !!)
- boolean isid = ad.allNames && // ID values must be Names
- (!doneID) && // Only allowed one ID attribute per element type
- (ad.unique) &&
- (ad.occurrences>=MIN_ID_VALUES);
-
- // if there is only one attribute value, and at least MIN_FIXED occurrences of it,
- // treat it as FIXED
- boolean isfixed = required && ad.values.size()==1 && ad.occurrences >= MIN_FIXED;
-
- // if the number of distinct values is small compared with the number of occurrences,
- // treat it as an enumeration
- boolean isenum = ad.allNMTOKENs && // Enumeration values must be NMTOKENs
- (ad.occurrences>=MIN_ENUMERATION_INSTANCES) &&
- (ad.values.size()<=ad.occurrences/MIN_ENUMERATION_RATIO) &&
- (ad.values.size()<=MAX_ENUMERATION_VALUES);
-
- s.append("<!ATTLIST " + elementname + " " + attname + " ");
- String tokentype = (ad.allNMTOKENs ? "NMTOKEN" : "CDATA");
-
- if (isid) {
- s.append("ID");
- doneID = true;
- }
- else if (isfixed) {
- String val = (String) ad.values.first();
- s.append(tokentype + " #FIXED \"" + escape(val) + "\" >\n");
- }
- else if (isenum) {
- s.append("( ");
- Iterator v = ad.values.iterator();
- while (v.hasNext()) {
- s.append((String) v.next());
- if (!v.hasNext()) break;
- s.append(" | ");
- };
- s.append(" )");
- }
- else
- s.append(tokentype);
-
- if (!isfixed) {
- if (required)
- s.append(" #REQUIRED >\n");
- else
- s.append(" #IMPLIED >\n");
- }
- };
- s.append("\n");
- };
- return s.toString();
-
- }
-
-
- /**
- * Escape special characters for display.
- * @param ch The character array containing the string
- * @param start The start position of the input string within the character array
- * @param length The length of the input string within the character array
- * @return The XML/HTML representation of the string<br>
- * This static method converts a Unicode string to a string containing
- * only ASCII characters, in which non-ASCII characters are represented
- * by the usual XML/HTML escape conventions (for example, "<" becomes "&lt;").
- * Note: if the input consists solely of ASCII or Latin-1 characters,
- * the output will be equally valid in XML and HTML. Otherwise it will be valid
- * only in XML.
- * The escaped characters are written to the dest array starting at position 0; the
- * number of positions used is returned as the result
-
- * For jEdit we leave all special characters alone and escape just the five classics.
- * The corresponding method generateDTD() in XMLActions.java copies
- * the encoding of the current buffer over to the DTD
- */
-
- private static int escape(char ch[], int start, int length, char[] out)
- {
- int o = 0;
- for (int i = start; i < start+length; i++) {
- if (ch[i]=='<') {("<").getChars(0,4,out,o); o+=4;}
- else if (ch[i]=='>') {(">").getChars(0,4,out,o); o+=4;}
- else if (ch[i]=='&') {("&").getChars(0,5,out,o); o+=5;}
- else if (ch[i]=='\"') {(""").getChars(0,5,out,o); o+=5;}
- else if (ch[i]=='\'') {("'").getChars(0,5,out,o); o+=5;}
- else {out[o++]=ch[i]; }
- // else if (ch[i]<=0x7f) {out[o++]=ch[i];}
- // else {
- // String dec = "&#" + Integer.toString((int)ch[i]) + ';';
- // dec.getChars(0, dec.length(), out, o);
- // o+=dec.length();
- // }
- }
- return o;
- }
-
- /**
- * Escape special characters in a String value.
- * @param in The input string
- * @return The XML representation of the string<br>
- * This static method converts a Unicode string to a string containing
- * only ASCII characters, in which non-ASCII characters are represented
- * by the usual XML/HTML escape conventions (for example, "<" becomes
- * "&lt;").<br>
- * Note: if the input consists solely of ASCII or Latin-1 characters,
- * the output will be equally valid in XML and HTML. Otherwise it will be valid
- * only in XML.
- */
-
- private static String escape(String in)
- {
- char[] dest = new char[in.length()*8];
- int newlen = escape( in.toCharArray(), 0, in.length(), dest);
- return new String(dest, 0, newlen);
- }
-
- /**
- * We ignore all references to external entities such as DOCTYPE declarations.
- * Method borrowed from xml.parser.SAXParserImpl
- */
-
- public InputSource resolveEntity(String publicId, String systemId)
- throws SAXException
- {
- InputSource dummy = new InputSource(new StringReader("<!-- -->"));
- dummy.setSystemId(systemId);
- dummy.setPublicId(publicId);
- return dummy;
- }
-
- /**
- * Handle the start of an element. Record information about the position of this
- * element relative to its parent, and about the attributes of the element.
- */
-
- public void startElement (String uri, String localName, String name, Attributes attributes)
- throws SAXException
- {
- StackEntry se = new StackEntry();
-
- // create an entry in the Element List, or locate the existing entry
- ElementDetails ed = (ElementDetails) elementList.get(name);
- if (ed==null) {
- ed = new ElementDetails(name);
- elementList.put(name,ed);
- };
-
- // retain the associated element details object
- se.elementDetails = ed;
-
- // initialise sequence numbering of child element types
- se.sequenceNumber = -1;
-
- // count occurrences of this element type
- ed.occurrences++;
-
- // Handle the attributes accumulated for this element.
- // Merge the new attribute list into the existing list for the element
-
- for (int a=0; a<attributes.getLength(); a++) {
- String attName = attributes.getQName(a);
- String val = attributes.getValue(a);
-
- AttributeDetails ad = (AttributeDetails) ed.attributes.get(attName);
- if (ad==null) {
- ad=new AttributeDetails(attName);
- ed.attributes.put(attName, ad);
- };
-
- if (!ad.values.contains(val)) {
-
- // We haven't seen this attribute value before
-
- ad.values.add(val);
-
- // Check if attribute value is a valid name
- if (ad.allNames && !isValidName(val)) {
- ad.allNames = false;
- }
-
- // Check if attribute value is a valid NMTOKEN
- if (ad.allNMTOKENs && !isValidNMTOKEN(val)) {
- ad.allNMTOKENs = false;
- }
-
- // For economy, don't save the new value unless it's needed;
- // it's needed only if we're looking for ID values or enumerated values
-
- if (ad.unique && ad.allNames && ad.occurrences <= MAX_ID_VALUES) {
- ad.values.add(val);
- } else if (ad.values.size() <= MAX_ENUMERATION_VALUES) {
- ad.values.add(val);
- }
-
- } else {
- // We've seen this attribute value before
- ad.unique = false;
- }
- ad.occurrences++;
- };
-
- // now keep track of the nesting and sequencing of child elements
- if (!elementStack.isEmpty()) {
- StackEntry parent = (StackEntry)elementStack.peek();
- ElementDetails parentDetails = parent.elementDetails;
- int seq = parent.sequenceNumber;
-
- // for sequencing, we're interested in consecutive groups of the same child element type
- boolean isFirstInGroup = (parent.latestChild==null || (!parent.latestChild.equals(name)));
- if (isFirstInGroup) {
- seq++;
- parent.sequenceNumber++;
- }
- parent.latestChild = name;
-
- // if we've seen this child of this parent before, get the details
- TreeMap<String,ChildDetails> children = parentDetails.children;
- ChildDetails c = children.get(name);
- if (c==null) {
- // this is the first time we've seen this child belonging to this parent
- c = new ChildDetails();
- c.name = name;
- c.position = seq;
- c.repeatable = false;
- c.optional = false;
- children.put(name, c);
- parentDetails.childseq.addElement(c);
-
- // if the first time we see this child is not on the first instance of the parent,
- // then we allow it as an optional element
- if (parentDetails.occurrences!=1) {
- c.optional = true;
- }
-
- } else {
-
- // if it's the first occurrence of the parent element, and we've seen this
- // child before, and it's the first of a new group, then the child occurrences are
- // not consecutive
- if (parentDetails.occurrences==1 && isFirstInGroup) {
- parentDetails.sequenced = false;
- }
-
- // check whether the position of this group of children in this parent element is
- // the same as its position in previous instances of the parent.
- if (parentDetails.childseq.size()<=seq ||
- !((ChildDetails)parentDetails.childseq.elementAt(seq)).name.equals(name))
- {
- parentDetails.sequenced = false;
- }
- }
-
- // if there's more than one child element, mark it as repeatable
- if (!isFirstInGroup) {
- c.repeatable = true;
- }
- }
- elementStack.push(se);
- }
-
- /**
- * End of element. If sequenced, check that all expected children are accounted for.
- */
-
- public void endElement (String uri, String localName, String name)
- throws SAXException
- {
-
- // If the number of child element groups in this parent element is less than the
- // number in previous elements, then the absent children are marked as optional
- ElementDetails ed = (ElementDetails) elementList.get(name);
- if (ed.sequenced) {
- StackEntry se = (StackEntry)elementStack.peek();
- int seq = se.sequenceNumber;
- for (int i=seq+1; i<ed.childseq.size(); i++) {
- ((ChildDetails)ed.childseq.elementAt(i)).optional = true;
- }
- }
- elementStack.pop();
- }
-
- /**
- * Handle character data.
- * Make a note whether significant character data is found in the element
- */
-
- public void characters (char ch[], int start, int length)
- throws SAXException
- {
- ElementDetails ed = ((StackEntry)elementStack.peek()).elementDetails;
- if (!ed.hasCharacterContent) {
- for (int i=start; i<start+length; i++) {
- if ((int)ch[i] > 0x20) {
- ed.hasCharacterContent = true;
- break;
- }
- }
- }
- }
-
- /**
- * ElementDetails is a data structure to keep information about element types
- */
-
- private class ElementDetails {
- String name;
- int occurrences;
- boolean hasCharacterContent;
- boolean sequenced;
- TreeMap<String,ChildDetails> children;
- Vector<ChildDetails> childseq;
- TreeMap<String,AttributeDetails> attributes;
-
- public ElementDetails ( String name ) {
- this.name = name;
- this.occurrences = 0;
- this.hasCharacterContent = false;
- this.sequenced = true;
- this.children = new TreeMap<String,ChildDetails>();
- this.childseq = new Vector<ChildDetails>();
- this.attributes = new TreeMap<String,AttributeDetails>();
- }
- }
-
- /**
- * ChildDetails records information about the presence of a child element within its
- * parent element. If the parent element is sequenced, then the child elements always
- * occur in sequence with the given frequency.
- */
-
- private class ChildDetails {
- String name;
- int position;
- boolean repeatable;
- boolean optional;
- }
-
-
- /**
- * AttributeDetails is a data structure to keep information about attribute types
- */
-
- private class AttributeDetails {
- String name; // name of the attribute
- int occurrences; // number of occurrences of the attribute
- boolean unique; // true if no duplicate values encountered
- TreeSet<String> values; // set of all distinct values encountered for this attribute
- boolean allNames; // true if all the attribute values are valid names
- boolean allNMTOKENs; // true if all the attribute values are valid NMTOKENs
-
- public AttributeDetails ( String name ) {
- this.name = name;
- this.occurrences = 0;
- this.unique = true;
- this.values = new TreeSet<String>();
- this.allNames = true;
- this.allNMTOKENs = true;
- }
- }
-
- /**
- * StackEntry is a data structure we put on the stack for each nested element
- */
-
- private class StackEntry {
- ElementDetails elementDetails;
- int sequenceNumber;
- String latestChild;
- }
-
-
- } // end of outer class DTDSAXGen
-