HtmlScrubber.java - This Java class, `HtmlScrubber`, is a V…

/bundles/plugins-trunk/XML/sidekick/html/parser/html/HtmlScrubber.java

# · Java · 147 lines · 99 code · 19 blank · 29 comment · 55 complexity · 36dcbc54bcdbef34a1e7c2ad052319cc MD5 · raw file


/*
 * HtmlScrubber.java -- cleans up HTML document tree.  
 * Copyright (C) 1999 Quiotix Corporation.  
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as 
 * published by the Free Software Foundation.  
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
 * for more details.
 */

package sidekick.html.parser.html;
import java.util.Iterator;
import java.util.Locale;

/**
 * HtmlScrubber is a Visitor which walks an HtmlDocument and cleans it up.
 * Depending on the option flags it can change tag names and attribute names
 * to uppercase or lowercase, strip unnecessary quotes from attribute values,
 * and strip leading spaces and tabs from text which directly follows a
 * newline, a tag, an end tag or a comment (except inside PRE, SCRIPT and
 * STYLE blocks).
 *
 * @author Brian Goetz, Quiotix
 * Additional contributions by: Thorsten Weber
 */
public class HtmlScrubber extends HtmlVisitor {

    /** Option flag: convert tag names to uppercase (wins over TAGS_DOWNCASE). */
    public static final int TAGS_UPCASE     = 1;
    /** Option flag: convert tag names to lowercase. */
    public static final int TAGS_DOWNCASE   = 2;
    /** Option flag: convert attribute names to uppercase (wins over ATTR_DOWNCASE). */
    public static final int ATTR_UPCASE     = 4;
    /** Option flag: convert attribute names to lowercase. */
    public static final int ATTR_DOWNCASE   = 8;
    /** Option flag: remove quotes around attribute values when safe to do so. */
    public static final int STRIP_QUOTES    = 16;
    /** Option flag: strip leading spaces and tabs from text elements. */
    public static final int TRIM_SPACES     = 32;
    /** Default option set: downcase tags and attribute names, strip quotes. */
    public static final int DEFAULT_OPTIONS =
            TAGS_DOWNCASE | ATTR_DOWNCASE | STRIP_QUOTES;

    /** Bitmask of the option flags in effect for this scrubber. */
    protected int flags;
    /** The element most recently visited, or null at the start of a walk. */
    protected HtmlDocument.HtmlElement previousElement;
    /** True while visiting the contents of a PRE, SCRIPT or STYLE block. */
    protected boolean inPreBlock;

    /** Create an HtmlScrubber with the default options (downcase tags and
     * tag attributes, strip out unnecessary quotes.)
     */
    public HtmlScrubber() {
        this(DEFAULT_OPTIONS);
    }

    /** Create an HtmlScrubber with the desired set of options.
     * @param flags A bitmask representing the desired scrubbing options
     */
    public HtmlScrubber(int flags) {
        this.flags = flags;
    }

    /**
     * Decides whether the quotes around an attribute value can be dropped.
     * Only the characters strictly between the first and the last character
     * of <code>qs</code> are examined.
     *
     * @param qs the attribute value including its surrounding quote characters
     * @return true if the quoted content is non-empty, consists solely of
     *         Unicode identifier-part characters, and does not mix uppercase
     *         and lowercase letters
     */
    private static boolean safeToUnquote(String qs) {
        int innerLength = qs.length() - 2;
        if (innerLength <= 0) {
            return false;
        }

        int upperCount = 0;
        int lowerCount = 0;
        int idCount = 0;
        for (int i = 1; i < qs.length() - 1; i++) {
            char c = qs.charAt(i);
            if (Character.isUnicodeIdentifierPart(c)) {
                ++idCount;
            }
            if (Character.isUpperCase(c)) {
                ++upperCount;
            } else if (Character.isLowerCase(c)) {
                ++lowerCount;
            }
        }
        return innerLength == idCount
                && (upperCount == 0 || lowerCount == 0);
    }

    /**
     * Tells whether a value is at least two characters long and begins and
     * ends with the same quote character (single or double).
     */
    private static boolean isQuoted(String value) {
        // Guard against null and too-short values; charAt(0) on an empty
        // string would otherwise throw StringIndexOutOfBoundsException.
        if (value == null || value.length() < 2) {
            return false;
        }
        char first = value.charAt(0);
        char last = value.charAt(value.length() - 1);
        return (first == '\'' || first == '\"') && first == last;
    }

    /**
     * Applies the TAGS_UPCASE / TAGS_DOWNCASE option to a tag name.
     * A fixed locale is used so that ASCII tag names are converted the same
     * way whatever the JVM default locale is (e.g. no dotted/dotless "i"
     * substitution under a Turkish locale).
     */
    private String applyTagCase(String tagName) {
        if ((flags & TAGS_UPCASE) != 0) {
            return tagName.toUpperCase(Locale.ENGLISH);
        }
        if ((flags & TAGS_DOWNCASE) != 0) {
            return tagName.toLowerCase(Locale.ENGLISH);
        }
        return tagName;
    }

    /** Resets the walk state; no previous element, not inside a PRE block. */
    public void start() {
        previousElement = null;
        inPreBlock = false;
    }

    /**
     * Scrubs a start tag: adjusts the case of the tag name and of every
     * attribute name, and removes the quotes around attribute values for
     * which {@link #safeToUnquote} allows it.
     */
    public void visit(HtmlDocument.Tag t) {
        t.tagName = applyTagCase(t.tagName);

        for (Iterator it = t.attributeList.attributes.iterator(); it.hasNext(); ) {
            HtmlDocument.Attribute a = (HtmlDocument.Attribute) it.next();

            // Locale-independent conversion, see applyTagCase.
            if ((flags & ATTR_UPCASE) != 0) {
                a.name = a.name.toUpperCase(Locale.ENGLISH);
            } else if ((flags & ATTR_DOWNCASE) != 0) {
                a.name = a.name.toLowerCase(Locale.ENGLISH);
            }

            if ((flags & STRIP_QUOTES) != 0
                    && a.hasValue
                    && isQuoted(a.value)
                    && safeToUnquote(a.value)) {
                a.value = a.value.substring(1, a.value.length() - 1);
            }
        }

        previousElement = t;
    }

    /** Scrubs an end tag: adjusts the case of the tag name. */
    public void visit(HtmlDocument.EndTag t) {
        t.tagName = applyTagCase(t.tagName);
        previousElement = t;
    }

    /**
     * When TRIM_SPACES is set and the text is not inside a PRE, SCRIPT or
     * STYLE block, removes leading spaces and tabs from a text element that
     * directly follows a newline, tag, end tag or comment.
     */
    public void visit(HtmlDocument.Text t) {
        boolean afterLineOrMarkup =
                previousElement instanceof HtmlDocument.Newline
                || previousElement instanceof HtmlDocument.Tag
                || previousElement instanceof HtmlDocument.EndTag
                || previousElement instanceof HtmlDocument.Comment;

        if ((flags & TRIM_SPACES) != 0 && !inPreBlock && afterLineOrMarkup) {
            int i = 0;
            while (i < t.text.length()
                    && (t.text.charAt(i) == ' ' || t.text.charAt(i) == '\t')) {
                i++;
            }
            if (i > 0) {
                t.text = t.text.substring(i);
            }
        }
        previousElement = t;
    }

    public void visit(HtmlDocument.Comment c)     { previousElement = c; }
    public void visit(HtmlDocument.Newline n)     { previousElement = n; }
    public void visit(HtmlDocument.Annotation a)  { previousElement = a; }

    /**
     * Visits a tag block, marking the contents of PRE, SCRIPT and STYLE
     * blocks so that their text is not trimmed.
     */
    public void visit(HtmlDocument.TagBlock bl) {
        String name = bl.startTag.tagName;
        boolean preformatted = name.equalsIgnoreCase("PRE")
                || name.equalsIgnoreCase("SCRIPT")
                || name.equalsIgnoreCase("STYLE");

        // Remember the state on entry and restore it afterwards, so that
        // leaving a nested block does not switch trimming back on while an
        // enclosing PRE/SCRIPT/STYLE block is still being visited.
        boolean wasInPreBlock = inPreBlock;
        if (preformatted) {
            inPreBlock = true;
        }
        try {
            super.visit(bl);
        } finally {
            inPreBlock = wasInPreBlock;
        }
    }
}

Summary ✨

This Java class, HtmlScrubber, is a Visitor that cleans up an HTML document tree by applying various transformations to its elements and attributes. It can convert tag names and attribute names to uppercase or lowercase, strip unnecessary quotes from attribute values, and trim leading spaces and tabs from text nodes that follow a newline, tag, end tag, or comment (outside PRE, SCRIPT, and STYLE blocks). The scrubbing options are customizable through a bitmask.

Tech Fingerprint

Standard Collections

Alerts (10)

'=' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
32 33 34 35 36
Complexity hotspot; lines 92 to 96 (total complexity: 12)
92 93 94 95 96