PageRenderTime 43ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/bundles/plugins-trunk/XML/sidekick/html/parser/html/HtmlScrubber.java

#
Java | 147 lines | 99 code | 19 blank | 29 comment | 55 complexity | 36dcbc54bcdbef34a1e7c2ad052319cc MD5 | raw file
Possible License(s): BSD-3-Clause, AGPL-1.0, Apache-2.0, LGPL-2.0, LGPL-3.0, GPL-2.0, CC-BY-SA-3.0, LGPL-2.1, GPL-3.0, MPL-2.0-no-copyleft-exception, IPL-1.0
  1. /*
  2. * HtmlScrubber.java -- cleans up HTML document tree.
  3. * Copyright (C) 1999 Quiotix Corporation.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License, version 2, as
  7. * published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
  13. * for more details.
  14. */
  15. package sidekick.html.parser.html;
  import java.util.Iterator;
  import java.util.Locale;
  17. /**
  18. * HtmlScrubber is a Visitor which walks an HtmlDocument and cleans it up.
  19. * It can change tags and tag attributes to uppercase or lowercase, strip
  20. * out unnecessary quotes from attribute values, and strip trailing spaces
  21. * before a newline.
  22. *
  23. * @author Brian Goetz, Quiotix
  24. * Additional contributions by: Thorsten Weber
  25. */
  26. public class HtmlScrubber extends HtmlVisitor {
  27. public static final int TAGS_UPCASE = 1;
  28. public static final int TAGS_DOWNCASE = 2;
  29. public static final int ATTR_UPCASE = 4;
  30. public static final int ATTR_DOWNCASE = 8;
  31. public static final int STRIP_QUOTES = 16;
  32. public static final int TRIM_SPACES = 32;
  33. public static final int DEFAULT_OPTIONS =
  34. TAGS_DOWNCASE | ATTR_DOWNCASE | STRIP_QUOTES;
  35. protected int flags;
  36. protected HtmlDocument.HtmlElement previousElement;
  37. protected boolean inPreBlock;
  38. /** Create an HtmlScrubber with the default options (downcase tags and
  39. * tag attributes, strip out unnecessary quotes.)
  40. */
  41. public HtmlScrubber() {
  42. this(DEFAULT_OPTIONS);
  43. };
  44. /** Create an HtmlScrubber with the desired set of options.
  45. * @param flags A bitmask representing the desired scrubbing options
  46. */
  47. public HtmlScrubber(int flags) {
  48. this.flags = flags;
  49. };
  50. private static boolean safeToUnquote(String qs) {
  51. int upperCount=0, lowerCount=0, idCount=0;
  52. for (int i=1; i < qs.length()-1; i++) {
  53. char c = qs.charAt(i);
  54. if (Character.isUnicodeIdentifierPart(c))
  55. ++idCount;
  56. if (Character.isUpperCase(c))
  57. ++upperCount;
  58. else if (Character.isLowerCase(c))
  59. ++lowerCount;
  60. };
  61. return (qs.length()-2 > 0
  62. && (qs.length()-2 == idCount
  63. && (upperCount == 0 || lowerCount == 0)));
  64. };
  65. public void start() {
  66. previousElement = null;
  67. inPreBlock = false;
  68. };
  69. public void visit(HtmlDocument.Tag t) {
  70. if ((flags & TAGS_UPCASE) != 0)
  71. t.tagName = t.tagName.toUpperCase();
  72. else if ((flags & TAGS_DOWNCASE) != 0)
  73. t.tagName = t.tagName.toLowerCase();
  74. for (Iterator it=t.attributeList.attributes.iterator(); it.hasNext(); ) {
  75. HtmlDocument.Attribute a = (HtmlDocument.Attribute) it.next();
  76. if ((flags & ATTR_UPCASE) != 0)
  77. a.name = a.name.toUpperCase();
  78. else if ((flags & ATTR_DOWNCASE) != 0)
  79. a.name = a.name.toLowerCase();
  80. if (((flags & STRIP_QUOTES) != 0)
  81. && a.hasValue
  82. && ((a.value.charAt(0) == '\'' && a.value.charAt(a.value.length()-1) == '\'')
  83. || (a.value.charAt(0) == '\"' && a.value.charAt(a.value.length()-1) == '\"'))
  84. && safeToUnquote(a.value)) {
  85. a.value = a.value.substring(1, a.value.length()-1);
  86. };
  87. };
  88. previousElement = t;
  89. }
  90. public void visit(HtmlDocument.EndTag t) {
  91. if ((flags & TAGS_UPCASE) != 0)
  92. t.tagName = t.tagName.toUpperCase();
  93. else if ((flags & TAGS_DOWNCASE) != 0)
  94. t.tagName = t.tagName.toLowerCase();
  95. previousElement = t;
  96. }
  97. public void visit(HtmlDocument.Text t) {
  98. if (((flags & TRIM_SPACES) != 0)
  99. && !inPreBlock
  100. && (previousElement instanceof HtmlDocument.Newline
  101. || previousElement instanceof HtmlDocument.Tag
  102. || previousElement instanceof HtmlDocument.EndTag
  103. || previousElement instanceof HtmlDocument.Comment)) {
  104. int i;
  105. for (i=0; i<t.text.length(); i++)
  106. if (t.text.charAt(i) != ' '
  107. && t.text.charAt(i) != '\t')
  108. break;
  109. if (i > 0)
  110. t.text = t.text.substring(i);
  111. };
  112. previousElement = t;
  113. }
  114. public void visit(HtmlDocument.Comment c) { previousElement = c; }
  115. public void visit(HtmlDocument.Newline n) { previousElement = n; }
  116. public void visit(HtmlDocument.Annotation a) { previousElement = a; }
  117. public void visit(HtmlDocument.TagBlock bl) {
  118. if (bl.startTag.tagName.equalsIgnoreCase("PRE")
  119. || bl.startTag.tagName.equalsIgnoreCase("SCRIPT")
  120. || bl.startTag.tagName.equalsIgnoreCase("STYLE")) {
  121. inPreBlock = true;
  122. super.visit(bl);
  123. inPreBlock = false;
  124. }
  125. else
  126. super.visit(bl);
  127. }
  128. }