PageRenderTime 183ms CodeModel.GetById 149ms app.highlight 29ms RepoModel.GetById 1ms app.codeStats 0ms

/bundles/plugins-trunk/XML/xml/DTDGenerator.java

#
Java | 638 lines | 351 code | 99 blank | 188 comment | 88 complexity | 0b462c9eb4d6eb83063139c9dac7eb4a MD5 | raw file
  1/*
  2 * DTDGenerator.java
  3 *
  4 * Copyright (C) 2001 Michael H. Kay
  5 * Portions copyright (C) 2007 Martin Raspe
  6 *
  7 * The XML plugin is licensed under the GNU General Public License, with
  8 * the following exception:
  9 *
 10 * "Permission is granted to link this code with software released under
 11 * the Apache license version 1.1, for example used by the Xerces XML
 12 * parser package."
 13 *
 14 * The code of this module is licensed under the terms of the 
 15 * Mozilla Public License Version 1.0. (http://www.mozilla.org/MPL/) 
 16 */
 17
 18package xml;
 19
 20import java.util.*;
 21import java.io.StringReader;
 22import java.lang.reflect.Array;
 23import javax.xml.parsers.SAXParserFactory;
 24
 25import org.xml.sax.*;
 26import org.xml.sax.helpers.DefaultHandler;
 27
 28import org.gjt.sp.jedit.*;
 29
 30/**
 31* DTDGenerator<BR>
 32* Generates a possible DTD from an XML document instance.
 33* Pure SAX version of the Saxon DTDGenerator
 34* The program has no remaining dependencies on Saxon; all it needs is:
 35*    JAXP 1.1
 36*    SAX2
 37*    A JAXP 1.1 conformant XML parser
 38*    Java 1.2
 39* @author M.H.Kay
 40* @version 7.0: separated from Saxon source, now works with any JAXP 1.1 XML parser
 41*
 42* Adapted for the jEdit XML Plugin by Martin Raspe (hertzhaft@biblhertz.it) 
 43*/
 44
 45public class DTDGenerator extends DefaultHandler {
 46                                // DTDGenerator is a ContentHandler, created for convenience
 47                                // by extending the default handler that comes with SAX2
 48
    /**
    * Minimum number of appearances of an attribute for it to be
    * considered a candidate for an enumeration type.
    */
    protected static int MIN_ENUMERATION_INSTANCES = 10;

    /**
    * Maximum number of distinct attribute values to be included
    * in an enumeration.
    */
    protected static int MAX_ENUMERATION_VALUES = 20;

    /**
    * An attribute will be regarded as an enumeration attribute only if
    * the number of instances divided by the number of distinct values
    * is &gt;= this ratio.
    */
    protected static int MIN_ENUMERATION_RATIO = 3;

    /**
    * Minimum number of attributes that must appear, with the same value
    * each time, for the value to be regarded as FIXED.
    */
    protected static int MIN_FIXED = 5;

    /**
    * Minimum number of attribute values that must appear for the
    * attribute to be regarded as an ID value.
    */
    protected static int MIN_ID_VALUES = 10;

    /**
    * Maximum number of attribute values to be saved while checking
    * for uniqueness.
    */
    protected static int MAX_ID_VALUES = 100000;

    /**
    * Alphabetical list of element types appearing in the document;
    * each has the element name as a key and an ElementDetails object
    * as the value.
    */
    TreeMap<String,ElementDetails> elementList;

    /**
    * Stack of elements currently open; each entry is a StackEntry object.
    */
    Stack<StackEntry> elementStack;
 81
 82    public DTDGenerator () 
 83    {
 84        elementList = new TreeMap<String,ElementDetails>();
 85        elementStack = new Stack<StackEntry>();
 86    }
 87
 88    /**
 89    * Write a DTD for the given XML document
 90    */
 91
 92    public static String write (View view, String xml)
 93    {
 94        DTDGenerator generator = new DTDGenerator();
 95        generator.parse(view, xml);
 96        return generator.printDTD();
 97    }
 98
 99    /**
100    * (planned) Write an XML Schema for the given XML document
101    */
102
103    public static String writeXSD (View view, String xml)
104    {
105        DTDGenerator generator = new DTDGenerator();
106        generator.parse(view, xml);
107        // return generator.printXSD();
108        return generator.printDTD();
109    }
110
111    /**
112    * (planned) Write a Relax NG Schema for the given XML document
113    */
114
115    public static String writeRNG (View view, String xml)
116    {
117        DTDGenerator generator = new DTDGenerator();
118        generator.parse(view, xml);
119        // return generator.printRNG();
120        return generator.printDTD();
121    }
122
123    private void parse(View view, String text)  {
124        try {
125            InputSource is = new InputSource(new StringReader(text));
126            XMLReader parser = SAXParserFactory.newInstance().newSAXParser().getXMLReader();
127            parser.setContentHandler(this);
128            parser.setEntityResolver(this);
129            parser.parse(is);
130        } catch (Exception err) {
131        StringBuffer s = new StringBuffer("Failed while parsing text:\n");
132            s.append(err.getMessage() + "\n");
133            // StackTraceElement[] st = err.getStackTrace();
134            // for (int i = 0; i < Array.getLength(st); i++) {
135            //     s.append(st[i].toString() + "\n");
136            //     }
137            Macros.error(view, s.toString());
138        }
139    }
140
141    /**
142    * Test whether a string is an XML name.
143    * TODO: This is currently an incomplete test, it treats all non-ASCII characters
144    * as being valid in names.
145    */
146
147    private boolean isValidName(String s) {
148        if (!isValidNMTOKEN(s)) return false;
149        int c = s.charAt(0);
150        return ! ((c>=0x30 && c<=0x39) || c=='.' || c=='-' );
151    }
152
153    /**
154    * Test whether a string is an XML NMTOKEN.
155    * TODO: This is currently an incomplete test, it treats all non-ASCII characters
156    * as being valid in NMTOKENs.
157    */
158
159    private boolean isValidNMTOKEN(String s) {
160        if (s.length()==0) return false;
161        for (int i=0; i<s.length(); i++) {
162            int c = s.charAt(i);
163            if (!( (c>=0x41 && c<=0x5a) ||
164                   (c>=0x61 && c<=0x7a) ||
165                   (c>=0x30 && c<=0x39) ||
166                    c=='.' ||
167                    c=='_' ||
168                    c=='-' ||
169                    c==':' ||
170                    c>128 ))
171                return false;
172        }
173        return true;
174    }
175  
176    /**
177    * When the whole document has been analysed, construct the DTD
178    */
179    
180    private String printDTD ()
181    {
182        // process the element types encountered, in turn
183
184        StringBuffer s = new StringBuffer();
185        Iterator<String> e=elementList.keySet().iterator();
186        while ( e.hasNext() )
187        {
188            String elementname = e.next();
189            ElementDetails ed = elementList.get(elementname); 
190            TreeMap children = ed.children;
191            Set childKeys = children.keySet();
192
193            //EMPTY content
194            if (childKeys.size()==0 && !ed.hasCharacterContent) 
195                s.append("<!ELEMENT " + elementname + " EMPTY >\n");
196
197            //CHARACTER content
198            if (childKeys.size()==0 && ed.hasCharacterContent)
199                s.append("<!ELEMENT " + elementname + " ( #PCDATA ) >\n");
200
201            //ELEMENT content
202            if (childKeys.size()>0 && !ed.hasCharacterContent) {
203                s.append("<!ELEMENT " + elementname + " ( ");
204
205                if (ed.sequenced) {
206                    
207                    // all elements of this type have the same child elements
208                    // in the same sequence, retained in the childseq vector
209                    
210                    Enumeration c = ed.childseq.elements();
211                    while (true) {
212                        ChildDetails ch = (ChildDetails)c.nextElement();
213                        s.append(ch.name);
214                        if (ch.repeatable && !ch.optional) 
215                            s.append("+");
216                        if (ch.repeatable && ch.optional) 
217                            s.append("*");
218                        if (ch.optional && !ch.repeatable) 
219                            s.append("?");
220                        if (c.hasMoreElements())
221                            s.append(", ");
222                        else
223                            break;
224                    }
225                    s.append(" ) >\n");
226                }
227                else {
228                    
229                    // the children don't always appear in the same sequence; so
230                    // list them alphabetically and allow them to be in any order
231                    
232                    Iterator c1 = childKeys.iterator();
233                    while (c1.hasNext()) {
234                        s.append((String)c1.next());
235                        if (c1.hasNext()) s.append(" | ");
236                    }
237                    s.append(" )* >\n");
238                }
239            };
240
241            //MIXED content
242            if (childKeys.size()>0 && ed.hasCharacterContent) {
243                s.append("<!ELEMENT " + elementname + " ( #PCDATA");
244                Iterator c2 = childKeys.iterator();
245                while (c2.hasNext()) {
246                    s.append(" | " + (String)c2.next());
247                }
248                s.append(" )* >\n");
249            };
250
251            // Now examine the attributes encountered for this element type
252
253            TreeMap attlist = ed.attributes;
254            boolean doneID = false;       // to ensure we have at most one ID attribute per element
255            Iterator a=attlist.keySet().iterator();
256            while ( a.hasNext() )
257            {
258                String attname = (String) a.next();
259                AttributeDetails ad = (AttributeDetails) attlist.get(attname);
260
261                // If the attribute is present on every instance of the element, treat it as required
262                boolean required = (ad.occurrences==ed.occurrences);
263
264                // If every value of the attribute is distinct, 
265                // and there are at least MIN_ID_VALUES, treat it as an ID
266                // TODO: this may give the wrong answer, we should check whether the value sets of two
267                // candidate-ID attributes overlap, in which case they can't both be IDs !!)
268                boolean isid = ad.allNames &&           // ID values must be Names
269                                (!doneID) &&            // Only allowed one ID attribute per element type
270                                (ad.unique) &&
271                                (ad.occurrences>=MIN_ID_VALUES);
272
273                // if there is only one attribute value, and at least MIN_FIXED occurrences of it,
274                // treat it as FIXED 
275                boolean isfixed = required && ad.values.size()==1 && ad.occurrences >= MIN_FIXED;
276
277                // if the number of distinct values is small compared with the number of occurrences,
278                // treat it as an enumeration
279                boolean isenum = ad.allNMTOKENs &&      // Enumeration values must be NMTOKENs
280                                (ad.occurrences>=MIN_ENUMERATION_INSTANCES) && 
281                                (ad.values.size()<=ad.occurrences/MIN_ENUMERATION_RATIO) &&
282                                (ad.values.size()<=MAX_ENUMERATION_VALUES);
283
284                s.append("<!ATTLIST " + elementname + " " + attname + " ");
285                String tokentype = (ad.allNMTOKENs ? "NMTOKEN" : "CDATA");
286                
287                if (isid) { 
288                    s.append("ID");
289                    doneID = true;
290                }
291                else if (isfixed) {
292                    String val = (String) ad.values.first();
293                    s.append(tokentype + " #FIXED \"" + escape(val) + "\" >\n");
294                }
295                else if (isenum) {
296                    s.append("( ");
297                    Iterator v = ad.values.iterator();
298                    while (v.hasNext()) {
299                        s.append((String) v.next());
300                        if (!v.hasNext()) break;
301                        s.append(" | ");
302                    };
303                    s.append(" )");
304                }
305                else
306                    s.append(tokentype);
307
308                if (!isfixed) {
309                    if (required)
310                        s.append(" #REQUIRED >\n");
311                    else
312                        s.append(" #IMPLIED >\n");
313                }
314            };
315            s.append("\n");
316        };
317    return s.toString();
318   
319    }
320    
321
322    /**
323    * Escape special characters for display.
324    * @param ch The character array containing the string
325    * @param start The start position of the input string within the character array
326    * @param length The length of the input string within the character array
327    * @return The XML/HTML representation of the string<br>
328    * This static method converts a Unicode string to a string containing
329    * only ASCII characters, in which non-ASCII characters are represented
330    * by the usual XML/HTML escape conventions (for example, "&lt;" becomes "&amp;lt;").
331    * Note: if the input consists solely of ASCII or Latin-1 characters,
332    * the output will be equally valid in XML and HTML. Otherwise it will be valid
333    * only in XML.
334    * The escaped characters are written to the dest array starting at position 0; the
335    * number of positions used is returned as the result
336    
337    * For jEdit we leave all special characters alone and escape just the five classics.
338    * The corresponding method generateDTD() in XMLActions.java copies
339    * the encoding of the current buffer over to the DTD
340    */
341    
342    private static int escape(char ch[], int start, int length, char[] out)
343    {
344        int o = 0;
345        for (int i = start; i < start+length; i++) {
346            if (ch[i]=='<') {("&lt;").getChars(0,4,out,o); o+=4;}
347            else if (ch[i]=='>') {("&gt;").getChars(0,4,out,o); o+=4;}
348            else if (ch[i]=='&') {("&amp;").getChars(0,5,out,o); o+=5;}
349            else if (ch[i]=='\"') {("&#34;").getChars(0,5,out,o); o+=5;}
350            else if (ch[i]=='\'') {("&#39;").getChars(0,5,out,o); o+=5;}
351            else {out[o++]=ch[i]; }
352            // else if (ch[i]<=0x7f) {out[o++]=ch[i];}
353            // else {
354            //    String dec = "&#" + Integer.toString((int)ch[i]) + ';';
355            //    dec.getChars(0, dec.length(), out, o);
356            //    o+=dec.length();
357            // }
358        }
359        return o;
360    }
361
362    /**
363    * Escape special characters in a String value.
364    * @param in The input string
365    * @return The XML representation of the string<br>
366    * This static method converts a Unicode string to a string containing
367    * only ASCII characters, in which non-ASCII characters are represented
368    * by the usual XML/HTML escape conventions (for example, "&lt;" becomes
369    * "&amp;lt;").<br>
370    * Note: if the input consists solely of ASCII or Latin-1 characters,
371    * the output will be equally valid in XML and HTML. Otherwise it will be valid
372    * only in XML.
373    */
374    
375    private static String escape(String in)
376    {
377        char[] dest = new char[in.length()*8];
378        int newlen = escape( in.toCharArray(), 0, in.length(), dest);
379        return new String(dest, 0, newlen);
380    }
381       
382    /**
383    * We ignore all references to external entities such as DOCTYPE declarations.
384    * Method borrowed from xml.parser.SAXParserImpl
385    */
386    
387    public InputSource resolveEntity(String publicId, String systemId)
388        throws SAXException
389    {
390        InputSource dummy = new InputSource(new StringReader("<!-- -->"));
391        dummy.setSystemId(systemId);
392        dummy.setPublicId(publicId);
393        return dummy;
394    }
395
396    /**
397    * Handle the start of an element. Record information about the position of this
398    * element relative to its parent, and about the attributes of the element. 
399    */
400    
401    public void startElement (String uri, String localName, String name, Attributes attributes)
402    throws SAXException
403    {
404        StackEntry se = new StackEntry();
405
406        // create an entry in the Element List, or locate the existing entry        
407        ElementDetails ed = (ElementDetails) elementList.get(name);
408        if (ed==null)  { 
409            ed = new ElementDetails(name);
410            elementList.put(name,ed);
411        };
412
413        // retain the associated element details object
414        se.elementDetails = ed;
415
416        // initialise sequence numbering of child element types
417        se.sequenceNumber = -1;
418        
419        // count occurrences of this element type
420        ed.occurrences++;
421
422        // Handle the attributes accumulated for this element.
423        // Merge the new attribute list into the existing list for the element
424
425        for (int a=0; a<attributes.getLength(); a++) {
426            String attName = attributes.getQName(a);
427            String val = attributes.getValue(a);
428 
429            AttributeDetails ad = (AttributeDetails) ed.attributes.get(attName);
430            if (ad==null) {
431               ad=new AttributeDetails(attName);
432               ed.attributes.put(attName, ad);
433            };
434            
435            if (!ad.values.contains(val)) {
436                
437                // We haven't seen this attribute value before
438                                  
439                ad.values.add(val);
440                
441                // Check if attribute value is a valid name
442                if (ad.allNames && !isValidName(val)) {
443                    ad.allNames = false;     
444                }
445                
446                // Check if attribute value is a valid NMTOKEN
447                if (ad.allNMTOKENs && !isValidNMTOKEN(val)) {
448                    ad.allNMTOKENs = false;
449                }
450
451                // For economy, don't save the new value unless it's needed;
452                // it's needed only if we're looking for ID values or enumerated values
453
454                if (ad.unique && ad.allNames && ad.occurrences <= MAX_ID_VALUES) {
455                    ad.values.add(val);
456                } else if (ad.values.size() <= MAX_ENUMERATION_VALUES) {
457                    ad.values.add(val);
458                }
459                
460            } else {
461                // We've seen this attribute value before
462                ad.unique = false;
463            }
464            ad.occurrences++;
465        };
466
467        // now keep track of the nesting and sequencing of child elements
468        if (!elementStack.isEmpty()) {
469            StackEntry parent = (StackEntry)elementStack.peek();
470            ElementDetails parentDetails = parent.elementDetails;
471            int seq = parent.sequenceNumber;
472
473            // for sequencing, we're interested in consecutive groups of the same child element type
474            boolean isFirstInGroup = (parent.latestChild==null || (!parent.latestChild.equals(name)));
475            if (isFirstInGroup) {
476                seq++;
477                parent.sequenceNumber++;
478            }
479            parent.latestChild = name;
480
481            // if we've seen this child of this parent before, get the details
482            TreeMap<String,ChildDetails> children = parentDetails.children;
483            ChildDetails c = children.get(name);
484            if (c==null) {
485                // this is the first time we've seen this child belonging to this parent
486                c = new ChildDetails();
487                c.name = name;
488                c.position = seq;
489                c.repeatable = false;
490                c.optional = false;
491                children.put(name, c);
492                parentDetails.childseq.addElement(c);
493
494                // if the first time we see this child is not on the first instance of the parent,
495                // then we allow it as an optional element
496                if (parentDetails.occurrences!=1) {
497                    c.optional = true;
498                }
499
500            } else {
501
502                // if it's the first occurrence of the parent element, and we've seen this
503                // child before, and it's the first of a new group, then the child occurrences are
504                // not consecutive
505                if (parentDetails.occurrences==1 && isFirstInGroup) {
506                    parentDetails.sequenced = false;
507                }
508                
509                // check whether the position of this group of children in this parent element is
510                // the same as its position in previous instances of the parent.
511                if (parentDetails.childseq.size()<=seq ||
512                        !((ChildDetails)parentDetails.childseq.elementAt(seq)).name.equals(name))
513                {
514                    parentDetails.sequenced = false;
515                }
516            }
517
518            // if there's more than one child element, mark it as repeatable
519            if (!isFirstInGroup) {
520                c.repeatable = true;
521            }
522        }
523        elementStack.push(se);
524    }
525
526    /**
527    * End of element. If sequenced, check that all expected children are accounted for.
528    */
529
530    public void endElement (String uri, String localName, String name)
531    throws SAXException
532    {
533
534        // If the number of child element groups in this parent element is less than the
535        // number in previous elements, then the absent children are marked as optional
536        ElementDetails ed = (ElementDetails) elementList.get(name);
537        if (ed.sequenced) {
538            StackEntry se = (StackEntry)elementStack.peek();
539            int seq = se.sequenceNumber;
540            for (int i=seq+1; i<ed.childseq.size(); i++) {
541                ((ChildDetails)ed.childseq.elementAt(i)).optional = true;
542            }
543        }
544        elementStack.pop();
545    }
546    
547    /**
548    * Handle character data.
549    * Make a note whether significant character data is found in the element
550    */
551
552    public void characters (char ch[], int start, int length)
553    throws SAXException
554    {
555        ElementDetails ed = ((StackEntry)elementStack.peek()).elementDetails;
556        if (!ed.hasCharacterContent) {
557            for (int i=start; i<start+length; i++) {
558                if ((int)ch[i] > 0x20) {
559                    ed.hasCharacterContent = true;
560                    break;
561                }
562            }
563        }
564    }
565
566    /**
567    * ElementDetails is a data structure to keep information about element types
568    */
569
570    private class ElementDetails {
571        String name;
572        int occurrences;
573        boolean hasCharacterContent;
574        boolean sequenced;
575        TreeMap<String,ChildDetails> children;
576        Vector<ChildDetails> childseq;
577        TreeMap<String,AttributeDetails> attributes;
578
579        public ElementDetails ( String name ) {
580            this.name = name;
581            this.occurrences = 0;
582            this.hasCharacterContent = false;
583            this.sequenced = true;
584            this.children = new TreeMap<String,ChildDetails>();
585            this.childseq = new Vector<ChildDetails>();
586            this.attributes = new TreeMap<String,AttributeDetails>();
587        }
588    }
589
590    /**
591    * ChildDetails records information about the presence of a child element within its
592    * parent element. If the parent element is sequenced, then the child elements always
593    * occur in sequence with the given frequency.
594    */
595
596    private class ChildDetails {
597        String name;
598        int position;
599        boolean repeatable;
600        boolean optional;
601    }
602    
603
604    /**
605    * AttributeDetails is a data structure to keep information about attribute types
606    */
607
608    private class AttributeDetails {
609        String name;            // name of the attribute
610        int occurrences;        // number of occurrences of the attribute
611        boolean unique;         // true if no duplicate values encountered
612        TreeSet<String> values;         // set of all distinct values encountered for this attribute 
613        boolean allNames;       // true if all the attribute values are valid names
614        boolean allNMTOKENs;    // true if all the attribute values are valid NMTOKENs
615
616        public AttributeDetails ( String name ) {
617            this.name = name;
618            this.occurrences = 0;
619            this.unique = true;
620            this.values = new TreeSet<String>();
621            this.allNames = true;
622            this.allNMTOKENs = true;
623        }
624    }
625    
626    /**
627    * StackEntry is a data structure we put on the stack for each nested element
628    */
629    
630    private class StackEntry {
631        ElementDetails elementDetails;
632        int sequenceNumber;
633        String latestChild;
634    }
635
636
637} // end of outer class DTDSAXGen
638