PageRenderTime 54ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/bundles/plugins-trunk/XML/sidekick/html/parser/html/HtmlCollector.java

#
Java | 172 lines | 117 code | 26 blank | 29 comment | 20 complexity | 068db0a79a3f40c151d3f55973ef8d35 MD5 | raw file
Possible License(s): BSD-3-Clause, AGPL-1.0, Apache-2.0, LGPL-2.0, LGPL-3.0, GPL-2.0, CC-BY-SA-3.0, LGPL-2.1, GPL-3.0, MPL-2.0-no-copyleft-exception, IPL-1.0
  1. /*
  2. * HtmlCollector.java -- structures an HTML document tree.
  3. * Copyright (C) 1999 Quiotix Corporation.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License, version 2, as
  7. * published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
  13. * for more details.
  14. */
  15. package sidekick.html.parser.html;
  16. import java.io.FileInputStream;
  17. import java.io.InputStream;
  18. import java.util.*;
  19. /**
  20. * An HtmlVisitor which modifies the structure of the document so that
  21. * begin tags are matched properly with end tags and placed in TagBlock
  22. * elements. Typically, an HtmlDocument is created by the parser, which
  23. * simply returns a flat list of elements. The HtmlCollector takes this
  24. * flat list and gives it the structure that is implied by the HTML content.
  25. *
  26. * @author Brian Goetz, Quiotix
  27. */
  28. public class HtmlCollector extends HtmlVisitor {
  29. protected ElementStack tagStack = new ElementStack();
  30. protected ElementStack elements;
  31. protected boolean collected;
  32. protected static Set dontMatch = new HashSet(
  33. Arrays.asList("AREA", "BASE", "BASEFONT", "BR", "COL", "HR", "IMG",
  34. "INPUT", "ISINDEX", "LINK", "META", "PARAM"));
  35. private static class TagStackEntry {
  36. String tagName;
  37. int index;
  38. }
  39. private static class ElementStack extends Vector {
  40. ElementStack() {
  41. super();
  42. }
  43. ElementStack( int n ) {
  44. super( n );
  45. }
  46. public void popN( int n ) {
  47. elementCount -= n;
  48. }
  49. }
  50. protected int pushNode( HtmlDocument.HtmlElement e ) {
  51. if (e != null) {
  52. elements.addElement( e );
  53. }
  54. return elements.size() - 1;
  55. }
  56. public void visit( HtmlDocument.Comment c ) {
  57. pushNode( c );
  58. }
  59. public void visit( HtmlDocument.Text t ) {
  60. pushNode( t );
  61. }
  62. public void visit( HtmlDocument.Newline n ) {
  63. pushNode( n );
  64. }
  65. public void visit( HtmlDocument.Tag t ) {
  66. if (t == null) {
  67. return;
  68. }
  69. TagStackEntry ts = new TagStackEntry();
  70. int index;
  71. /* Push the tag onto the element stack, and push an entry on the tag
  72. stack if it's a tag we care about matching */
  73. index = pushNode( t );
  74. if ( !t.emptyTag && !dontMatch.contains( t.tagName.toUpperCase() ) ) {
  75. ts.tagName = t.tagName;
  76. ts.index = index;
  77. tagStack.addElement( ts );
  78. }
  79. }
  80. public void visit( HtmlDocument.EndTag t ) {
  81. if (t == null)
  82. return;
  83. int i;
  84. for ( i = tagStack.size() - 1; i >= 0; i-- ) {
  85. TagStackEntry ts = ( TagStackEntry ) tagStack.elementAt( i );
  86. if ( t.tagName.equalsIgnoreCase( ts.tagName ) ) {
  87. HtmlDocument.TagBlock block;
  88. HtmlDocument.ElementSequence blockElements;
  89. HtmlDocument.Tag tag;
  90. // Create a new ElementSequence and copy the elements to it
  91. blockElements = new HtmlDocument.ElementSequence( elements.size() - ts.index - 1 );
  92. for ( int j = ts.index + 1; j < elements.size(); j++ ) {
  93. blockElements.addElement( ( HtmlDocument.HtmlElement ) elements.elementAt( j ) );
  94. }
  95. tag = ( HtmlDocument.Tag ) elements.elementAt( ts.index );
  96. block = new HtmlDocument.TagBlock( tag, blockElements, t );
  97. block.setStartLocation( tag.getStartLocation() );
  98. block.setEndLocation( t.getEndLocation() );
  99. // Pop the elements off the stack, push the new block
  100. elements.popN( elements.size() - ts.index );
  101. elements.addElement( block );
  102. // Pop the matched tag and intervening unmatched tags
  103. tagStack.popN( tagStack.size() - i );
  104. collected = true;
  105. break;
  106. }
  107. }
  108. // If we didn't find a match, just push the end tag
  109. if ( i < 0 )
  110. pushNode( t );
  111. }
  112. public void visit( HtmlDocument.TagBlock bl ) {
  113. if (bl == null)
  114. return;
  115. HtmlCollector c = new HtmlCollector();
  116. c.start();
  117. c.visit( bl.body );
  118. c.finish();
  119. pushNode( bl );
  120. }
  121. public void visit( HtmlDocument.ElementSequence s ) {
  122. if ( s == null )
  123. return ;
  124. elements = new ElementStack( s.size() );
  125. collected = false;
  126. for ( Iterator iterator = s.iterator(); iterator.hasNext(); ) {
  127. HtmlDocument.HtmlElement htmlElement = ( HtmlDocument.HtmlElement ) iterator.next();
  128. if ( htmlElement != null )
  129. htmlElement.accept( this );
  130. }
  131. if ( collected )
  132. s.setElements( elements );
  133. }
  134. public static void main( String[] args ) throws Exception {
  135. InputStream r = new FileInputStream( args[ 0 ] );
  136. try {
  137. HtmlDocument document = new HtmlParser( r ).HtmlDocument();
  138. document.accept( new HtmlScrubber() );
  139. document.accept( new HtmlCollector() );
  140. document.accept( new HtmlDumper( System.out ) );
  141. }
  142. finally {
  143. r.close();
  144. }
  145. }
  146. }