/bundles/plugins-trunk/XML/sidekick/html/parser/html/HtmlCollector.java
Java | 172 lines | 117 code | 26 blank | 29 comment | 20 complexity | 068db0a79a3f40c151d3f55973ef8d35 MD5 | raw file
Possible License(s): BSD-3-Clause, AGPL-1.0, Apache-2.0, LGPL-2.0, LGPL-3.0, GPL-2.0, CC-BY-SA-3.0, LGPL-2.1, GPL-3.0, MPL-2.0-no-copyleft-exception, IPL-1.0
- /*
- * HtmlCollector.java -- structures an HTML document tree.
- * Copyright (C) 1999 Quiotix Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
- * for more details.
- */
- package sidekick.html.parser.html;
- import java.io.FileInputStream;
- import java.io.InputStream;
- import java.util.*;
- /**
- * An HtmlVisitor which modifies the structure of the document so that
- * begin tags are matched properly with end tags and placed in TagBlock
- * elements. Typically, an HtmlDocument is created by the parser, which
- * simply returns a flat list of elements. The HtmlCollector takes this
- * flat list and gives it the structure that is implied by the HTML content.
- *
- * @author Brian Goetz, Quiotix
- */
- public class HtmlCollector extends HtmlVisitor {
- protected ElementStack tagStack = new ElementStack();
- protected ElementStack elements;
- protected boolean collected;
- protected static Set dontMatch = new HashSet(
- Arrays.asList("AREA", "BASE", "BASEFONT", "BR", "COL", "HR", "IMG",
- "INPUT", "ISINDEX", "LINK", "META", "PARAM"));
- private static class TagStackEntry {
- String tagName;
- int index;
- }
- private static class ElementStack extends Vector {
- ElementStack() {
- super();
- }
- ElementStack( int n ) {
- super( n );
- }
- public void popN( int n ) {
- elementCount -= n;
- }
- }
- protected int pushNode( HtmlDocument.HtmlElement e ) {
- if (e != null) {
- elements.addElement( e );
- }
- return elements.size() - 1;
- }
- public void visit( HtmlDocument.Comment c ) {
- pushNode( c );
- }
- public void visit( HtmlDocument.Text t ) {
- pushNode( t );
- }
- public void visit( HtmlDocument.Newline n ) {
- pushNode( n );
- }
- public void visit( HtmlDocument.Tag t ) {
- if (t == null) {
- return;
- }
- TagStackEntry ts = new TagStackEntry();
- int index;
- /* Push the tag onto the element stack, and push an entry on the tag
- stack if it's a tag we care about matching */
- index = pushNode( t );
- if ( !t.emptyTag && !dontMatch.contains( t.tagName.toUpperCase() ) ) {
- ts.tagName = t.tagName;
- ts.index = index;
- tagStack.addElement( ts );
- }
- }
- public void visit( HtmlDocument.EndTag t ) {
- if (t == null)
- return;
- int i;
- for ( i = tagStack.size() - 1; i >= 0; i-- ) {
- TagStackEntry ts = ( TagStackEntry ) tagStack.elementAt( i );
- if ( t.tagName.equalsIgnoreCase( ts.tagName ) ) {
- HtmlDocument.TagBlock block;
- HtmlDocument.ElementSequence blockElements;
- HtmlDocument.Tag tag;
- // Create a new ElementSequence and copy the elements to it
- blockElements = new HtmlDocument.ElementSequence( elements.size() - ts.index - 1 );
- for ( int j = ts.index + 1; j < elements.size(); j++ ) {
- blockElements.addElement( ( HtmlDocument.HtmlElement ) elements.elementAt( j ) );
- }
- tag = ( HtmlDocument.Tag ) elements.elementAt( ts.index );
- block = new HtmlDocument.TagBlock( tag, blockElements, t );
- block.setStartLocation( tag.getStartLocation() );
- block.setEndLocation( t.getEndLocation() );
- // Pop the elements off the stack, push the new block
- elements.popN( elements.size() - ts.index );
- elements.addElement( block );
- // Pop the matched tag and intervening unmatched tags
- tagStack.popN( tagStack.size() - i );
- collected = true;
- break;
- }
- }
- // If we didn't find a match, just push the end tag
- if ( i < 0 )
- pushNode( t );
- }
- public void visit( HtmlDocument.TagBlock bl ) {
- if (bl == null)
- return;
- HtmlCollector c = new HtmlCollector();
- c.start();
- c.visit( bl.body );
- c.finish();
- pushNode( bl );
- }
- public void visit( HtmlDocument.ElementSequence s ) {
- if ( s == null )
- return ;
- elements = new ElementStack( s.size() );
- collected = false;
- for ( Iterator iterator = s.iterator(); iterator.hasNext(); ) {
- HtmlDocument.HtmlElement htmlElement = ( HtmlDocument.HtmlElement ) iterator.next();
- if ( htmlElement != null )
- htmlElement.accept( this );
- }
- if ( collected )
- s.setElements( elements );
- }
- public static void main( String[] args ) throws Exception {
- InputStream r = new FileInputStream( args[ 0 ] );
- try {
- HtmlDocument document = new HtmlParser( r ).HtmlDocument();
- document.accept( new HtmlScrubber() );
- document.accept( new HtmlCollector() );
- document.accept( new HtmlDumper( System.out ) );
- }
- finally {
- r.close();
- }
- }
- }