PageRenderTime 35ms CodeModel.GetById 21ms app.highlight 11ms RepoModel.GetById 1ms app.codeStats 0ms

/bundles/plugins-trunk/XML/sidekick/html/parser/html/HtmlScrubber.java

#
Java | 147 lines | 99 code | 19 blank | 29 comment | 55 complexity | 36dcbc54bcdbef34a1e7c2ad052319cc MD5 | raw file
  1/*
  2 * HtmlScrubber.java -- cleans up HTML document tree.  
  3 * Copyright (C) 1999 Quiotix Corporation.  
  4 *
  5 * This program is free software; you can redistribute it and/or modify
  6 * it under the terms of the GNU General Public License, version 2, as 
  7 * published by the Free Software Foundation.  
  8 *
  9 * This program is distributed in the hope that it will be useful,
 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
 13 * for more details.
 14 */
 15
 16package sidekick.html.parser.html;
import java.util.Iterator;
import java.util.Locale;
 18
 19/**
 20 * HtmlScrubber is a Visitor which walks an HtmlDocument and cleans it up.
 21 * It can change tags and tag attributes to uppercase or lowercase, strip
 22 * out unnecessary quotes from attribute values, and strip trailing spaces
 23 * before a newline.
 24 *
 25 * @author Brian Goetz, Quiotix
 26 * Additional contributions by: Thorsten Weber
 27 */
 28
 29public class HtmlScrubber extends HtmlVisitor {
 30
 31    public static final int TAGS_UPCASE     = 1;
 32    public static final int TAGS_DOWNCASE   = 2;
 33    public static final int ATTR_UPCASE     = 4;
 34    public static final int ATTR_DOWNCASE   = 8;
 35    public static final int STRIP_QUOTES    = 16;
 36    public static final int TRIM_SPACES     = 32;
 37    public static final int DEFAULT_OPTIONS =
 38            TAGS_DOWNCASE | ATTR_DOWNCASE | STRIP_QUOTES;
 39
 40    protected int flags;
 41    protected HtmlDocument.HtmlElement previousElement;
 42    protected boolean inPreBlock;
 43
 44    /** Create an HtmlScrubber with the default options (downcase tags and
 45     * tag attributes, strip out unnecessary quotes.)
 46     */
 47    public HtmlScrubber() {
 48        this(DEFAULT_OPTIONS);
 49    };
 50
 51    /** Create an HtmlScrubber with the desired set of options.
 52     * @param flags A bitmask representing the desired scrubbing options
 53     */
 54
 55    public HtmlScrubber(int flags) {
 56        this.flags = flags;
 57    };
 58
 59    private static boolean safeToUnquote(String qs) {
 60        int upperCount=0, lowerCount=0, idCount=0;
 61
 62        for (int i=1; i < qs.length()-1; i++) {
 63            char c = qs.charAt(i);
 64            if (Character.isUnicodeIdentifierPart(c))
 65                ++idCount;
 66            if (Character.isUpperCase(c))
 67                ++upperCount;
 68            else if (Character.isLowerCase(c))
 69                ++lowerCount;
 70        };
 71        return (qs.length()-2 > 0
 72                && (qs.length()-2 == idCount
 73                && (upperCount == 0 || lowerCount == 0)));
 74    };
 75
 76    public void start() {
 77        previousElement = null;
 78        inPreBlock = false;
 79    };
 80
 81    public void visit(HtmlDocument.Tag t) {
 82        if ((flags & TAGS_UPCASE) != 0)
 83            t.tagName = t.tagName.toUpperCase();
 84        else if ((flags & TAGS_DOWNCASE) != 0)
 85            t.tagName = t.tagName.toLowerCase();
 86        for (Iterator it=t.attributeList.attributes.iterator(); it.hasNext(); ) {
 87            HtmlDocument.Attribute a = (HtmlDocument.Attribute) it.next();
 88            if ((flags & ATTR_UPCASE) != 0)
 89                a.name = a.name.toUpperCase();
 90            else if ((flags & ATTR_DOWNCASE) != 0)
 91                a.name = a.name.toLowerCase();
 92            if (((flags & STRIP_QUOTES) != 0)
 93                    && a.hasValue
 94                    && ((a.value.charAt(0) == '\'' && a.value.charAt(a.value.length()-1) == '\'')
 95                        || (a.value.charAt(0) == '\"' && a.value.charAt(a.value.length()-1) == '\"'))
 96                    && safeToUnquote(a.value)) {
 97                a.value = a.value.substring(1, a.value.length()-1);
 98            };
 99        };
100
101        previousElement = t;
102    }
103
104    public void visit(HtmlDocument.EndTag t) {
105        if ((flags & TAGS_UPCASE) != 0)
106            t.tagName = t.tagName.toUpperCase();
107        else if ((flags & TAGS_DOWNCASE) != 0)
108            t.tagName = t.tagName.toLowerCase();
109
110        previousElement = t;
111    }
112
113    public void visit(HtmlDocument.Text t)        {
114        if (((flags & TRIM_SPACES) != 0)
115                && !inPreBlock
116                && (previousElement instanceof HtmlDocument.Newline
117                || previousElement instanceof HtmlDocument.Tag
118                || previousElement instanceof HtmlDocument.EndTag
119                || previousElement instanceof HtmlDocument.Comment)) {
120            int i;
121            for (i=0; i<t.text.length(); i++)
122                if (t.text.charAt(i) != ' '
123                        && t.text.charAt(i) != '\t')
124                    break;
125            if (i > 0)
126                t.text = t.text.substring(i);
127        };
128        previousElement = t;
129    }
130
131    public void visit(HtmlDocument.Comment c)     { previousElement = c; }
132    public void visit(HtmlDocument.Newline n)     { previousElement = n; }
133    public void visit(HtmlDocument.Annotation a)  { previousElement = a; }
134    public void visit(HtmlDocument.TagBlock bl) {
135        if (bl.startTag.tagName.equalsIgnoreCase("PRE")
136                || bl.startTag.tagName.equalsIgnoreCase("SCRIPT")
137                || bl.startTag.tagName.equalsIgnoreCase("STYLE")) {
138            inPreBlock = true;
139            super.visit(bl);
140            inPreBlock = false;
141        }
142        else
143            super.visit(bl);
144    }
145}
146
147