PageRenderTime 81ms CodeModel.GetById 65ms app.highlight 13ms RepoModel.GetById 1ms app.codeStats 0ms

/bundles/plugins-trunk/XML/sidekick/html/parser/html/HtmlCollector.java

#
Java | 172 lines | 117 code | 26 blank | 29 comment | 20 complexity | 068db0a79a3f40c151d3f55973ef8d35 MD5 | raw file
  1/*
  2* HtmlCollector.java -- structures an HTML document tree.  
  3* Copyright (C) 1999 Quiotix Corporation.  
  4*
  5* This program is free software; you can redistribute it and/or modify
  6* it under the terms of the GNU General Public License, version 2, as 
  7* published by the Free Software Foundation.  
  8*
  9* This program is distributed in the hope that it will be useful,
 10* but WITHOUT ANY WARRANTY; without even the implied warranty of
 11* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12* GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
 13* for more details.
 14*/
 15
 16package sidekick.html.parser.html;
 17
 18import java.io.FileInputStream;
 19import java.io.InputStream;
 20import java.util.*;
 21
 22/**
 23 * An HtmlVisitor which modifies the structure of the document so that
 24 * begin tags are matched properly with end tags and placed in TagBlock
 25 * elements.  Typically, an HtmlDocument is created by the parser, which
 26 * simply returns a flat list of elements.  The HtmlCollector takes this
 27 * flat list and gives it the structure that is implied by the HTML content.
 28 *
 29 * @author Brian Goetz, Quiotix
 30 */
 31
 32public class HtmlCollector extends HtmlVisitor {
 33
 34    protected ElementStack tagStack = new ElementStack();
 35    protected ElementStack elements;
 36    protected boolean collected;
 37    protected static Set dontMatch = new HashSet(
 38    	Arrays.asList("AREA", "BASE", "BASEFONT", "BR", "COL", "HR", "IMG",
 39    				  "INPUT", "ISINDEX", "LINK", "META", "PARAM"));
 40
 41    private static class TagStackEntry {
 42        String tagName;
 43        int index;
 44    }
 45
 46    private static class ElementStack extends Vector {
 47        ElementStack() {
 48            super();
 49        }
 50
 51        ElementStack( int n ) {
 52            super( n );
 53        }
 54
 55        public void popN( int n ) {
 56            elementCount -= n;
 57        }
 58    }
 59
 60    protected int pushNode( HtmlDocument.HtmlElement e ) {
 61        if (e != null) {
 62            elements.addElement( e );
 63        }
 64        return elements.size() - 1;
 65    }
 66
 67    public void visit( HtmlDocument.Comment c ) {
 68            pushNode( c );
 69    }
 70
 71    public void visit( HtmlDocument.Text t ) {
 72        pushNode( t );
 73    }
 74
 75    public void visit( HtmlDocument.Newline n ) {
 76        pushNode( n );
 77    }
 78
 79    public void visit( HtmlDocument.Tag t ) {
 80        if (t == null) {
 81            return;   
 82        }
 83        TagStackEntry ts = new TagStackEntry();
 84        int index;
 85
 86        /* Push the tag onto the element stack, and push an entry on the tag
 87        stack if it's a tag we care about matching */
 88        index = pushNode( t );
 89        if ( !t.emptyTag && !dontMatch.contains( t.tagName.toUpperCase() ) ) {
 90            ts.tagName = t.tagName;
 91            ts.index = index;
 92            tagStack.addElement( ts );
 93        }
 94    }
 95
 96    public void visit( HtmlDocument.EndTag t ) {
 97        if (t == null)
 98            return;
 99        int i;
100        for ( i = tagStack.size() - 1; i >= 0; i-- ) {
101            TagStackEntry ts = ( TagStackEntry ) tagStack.elementAt( i );
102            if ( t.tagName.equalsIgnoreCase( ts.tagName ) ) {
103                HtmlDocument.TagBlock block;
104                HtmlDocument.ElementSequence blockElements;
105                HtmlDocument.Tag tag;
106
107                // Create a new ElementSequence and copy the elements to it
108                blockElements = new HtmlDocument.ElementSequence( elements.size() - ts.index - 1 );
109                for ( int j = ts.index + 1; j < elements.size(); j++ ) {
110                    blockElements.addElement( ( HtmlDocument.HtmlElement ) elements.elementAt( j ) );
111                }
112                tag = ( HtmlDocument.Tag ) elements.elementAt( ts.index );
113                block = new HtmlDocument.TagBlock( tag, blockElements, t );
114                block.setStartLocation( tag.getStartLocation() );
115                block.setEndLocation( t.getEndLocation() );
116
117                // Pop the elements off the stack, push the new block
118                elements.popN( elements.size() - ts.index );
119                elements.addElement( block );
120
121                // Pop the matched tag and intervening unmatched tags
122                tagStack.popN( tagStack.size() - i );
123
124                collected = true;
125                break;
126            }
127        }
128
129        // If we didn't find a match, just push the end tag
130        if ( i < 0 )
131            pushNode( t );
132    }
133
134    public void visit( HtmlDocument.TagBlock bl ) {
135        if (bl == null)
136            return;
137        HtmlCollector c = new HtmlCollector();
138        c.start();
139        c.visit( bl.body );
140        c.finish();
141        pushNode( bl );
142    }
143
144    public void visit( HtmlDocument.ElementSequence s ) {
145        if ( s == null )
146            return ;
147        elements = new ElementStack( s.size() );
148        collected = false;
149
150        for ( Iterator iterator = s.iterator(); iterator.hasNext(); ) {
151            HtmlDocument.HtmlElement htmlElement = ( HtmlDocument.HtmlElement ) iterator.next();
152            if ( htmlElement != null )
153                htmlElement.accept( this );
154        }
155        if ( collected )
156            s.setElements( elements );
157    }
158
159    public static void main( String[] args ) throws Exception {
160        InputStream r = new FileInputStream( args[ 0 ] );
161
162        try {
163            HtmlDocument document = new HtmlParser( r ).HtmlDocument();
164            document.accept( new HtmlScrubber() );
165            document.accept( new HtmlCollector() );
166            document.accept( new HtmlDumper( System.out ) );
167        }
168        finally {
169            r.close();
170        }
171    }
172}