/lib/saxonB/net/sf/saxon/event/HTMLEmitter.java
Java | 691 lines | 430 code | 87 blank | 174 comment | 92 complexity | 744c9d48f998c0dcd1c1713f73f308ba MD5 | raw file
- package net.sf.saxon.event;
- import net.sf.saxon.tinytree.CompressedWhitespace;
- import net.sf.saxon.trans.XPathException;
- import net.sf.saxon.value.Whitespace;
- import javax.xml.transform.OutputKeys;
- /**
- * This class generates HTML output
- * @author Michael H. Kay
- */
- public class HTMLEmitter extends XMLEmitter {
- /**
- * Preferred character representations
- */
- private static final int REP_NATIVE = 0;
- private static final int REP_ENTITY = 1;
- private static final int REP_DECIMAL = 2;
- private static final int REP_HEX = 3;
- private int nonASCIIRepresentation = REP_NATIVE;
- private int excludedRepresentation = REP_ENTITY;
- private int inScript;
- private boolean started = false;
- private String elementName;
- private short uriCode;
- /**
- * Decode preferred representation
- * @param rep string containing preferred representation (native, entity, decimal, or hex)
- * @return integer code for the preferred representation
- */
- private static int representationCode(String rep) {
- if (rep.equalsIgnoreCase("native")) return REP_NATIVE;
- if (rep.equalsIgnoreCase("entity")) return REP_ENTITY;
- if (rep.equalsIgnoreCase("decimal")) return REP_DECIMAL;
- if (rep.equalsIgnoreCase("hex")) return REP_HEX;
- return REP_ENTITY;
- }
- /**
- * Table of HTML tags that have no closing tag
- */
- static HTMLTagHashSet emptyTags = new HTMLTagHashSet(31);
- static {
- setEmptyTag("area");
- setEmptyTag("base");
- setEmptyTag("basefont");
- setEmptyTag("br");
- setEmptyTag("col");
- setEmptyTag("frame");
- setEmptyTag("hr");
- setEmptyTag("img");
- setEmptyTag("input");
- setEmptyTag("isindex");
- setEmptyTag("link");
- setEmptyTag("meta");
- setEmptyTag("param");
- }
- private static void setEmptyTag(String tag) {
- emptyTags.add(tag);
- }
- protected static boolean isEmptyTag(String tag) {
- return emptyTags.contains(tag);
- }
- /**
- * Table of boolean attributes
- */
- // we use two HashMaps to avoid unnecessary string concatenations
- private static HTMLTagHashSet booleanAttributes = new HTMLTagHashSet(31);
- private static HTMLTagHashSet booleanCombinations = new HTMLTagHashSet(53);
- static {
- setBooleanAttribute("area", "nohref");
- setBooleanAttribute("button", "disabled");
- setBooleanAttribute("dir", "compact");
- setBooleanAttribute("dl", "compact");
- setBooleanAttribute("frame", "noresize");
- setBooleanAttribute("hr", "noshade");
- setBooleanAttribute("img", "ismap");
- setBooleanAttribute("input", "checked");
- setBooleanAttribute("input", "disabled");
- setBooleanAttribute("input", "readonly");
- setBooleanAttribute("menu", "compact");
- setBooleanAttribute("object", "declare");
- setBooleanAttribute("ol", "compact");
- setBooleanAttribute("optgroup", "disabled");
- setBooleanAttribute("option", "selected");
- setBooleanAttribute("option", "disabled");
- setBooleanAttribute("script", "defer");
- setBooleanAttribute("select", "multiple");
- setBooleanAttribute("select", "disabled");
- setBooleanAttribute("td", "nowrap");
- setBooleanAttribute("textarea", "disabled");
- setBooleanAttribute("textarea", "readonly");
- setBooleanAttribute("th", "nowrap");
- setBooleanAttribute("ul", "compact");
- }
- private static void setBooleanAttribute(String element, String attribute) {
- booleanAttributes.add(attribute);
- booleanCombinations.add(element + '+' + attribute);
- }
- private static boolean isBooleanAttribute(String element, String attribute, String value) {
- return attribute.equalsIgnoreCase(value) &&
- booleanAttributes.contains(attribute) &&
- booleanCombinations.contains(element + '+' + attribute);
- }
- /**
- * Constructor
- */
- public HTMLEmitter() {
- }
- /**
- * Output start of document
- */
- public void open() throws XPathException {}
- protected void openDocument() throws XPathException {
- if (writer==null) {
- makeWriter();
- }
- if (started) return;
- started = true;
- // This method is sometimes called twice, especially during an identity transform
- // This check stops two DOCTYPE declarations being output.
- String version = outputProperties.getProperty(OutputKeys.VERSION);
- if (version != null && !(version.equals("4.0") || version.equals("4.01"))) {
- XPathException err = new XPathException("Unsupported HTML version: " + version);
- err.setErrorCode("SESU0013");
- throw err;
- }
- String byteOrderMark = outputProperties.getProperty(SaxonOutputKeys.BYTE_ORDER_MARK);
- if ("yes".equals(byteOrderMark) &&
- "UTF-8".equalsIgnoreCase(outputProperties.getProperty(OutputKeys.ENCODING))) {
- try {
- writer.write('\uFEFF');
- } catch (java.io.IOException err) {
- // Might be an encoding exception; just ignore it
- }
- }
- String systemId = outputProperties.getProperty(OutputKeys.DOCTYPE_SYSTEM);
- String publicId = outputProperties.getProperty(OutputKeys.DOCTYPE_PUBLIC);
- if (systemId!=null || publicId!=null) {
- writeDocType("html", systemId, publicId);
- }
- empty = false;
- inScript = -1000000;
- // Handle saxon:character-representation
- String representation = outputProperties.getProperty(
- SaxonOutputKeys.CHARACTER_REPRESENTATION);
- if (representation != null) {
- String nonASCIIrep;
- String excludedRep;
- int semi = representation.indexOf(';');
- if (semi < 0) {
- nonASCIIrep = Whitespace.trim(representation);
- excludedRep = nonASCIIrep;
- } else {
- nonASCIIrep = Whitespace.trim(representation.substring(0, semi));
- excludedRep = Whitespace.trim(representation.substring(semi+1));
- }
- nonASCIIRepresentation = representationCode(nonASCIIrep);
- excludedRepresentation = representationCode(excludedRep);
- if (excludedRepresentation == REP_NATIVE) {
- excludedRepresentation = REP_ENTITY;
- }
- }
- }
- /**
- * Output element start tag
- */
- public void startElement(int nameCode, int typeCode, int locationId, int properties) throws XPathException {
- super.startElement(nameCode, typeCode, locationId, properties);
- uriCode = namePool.getURICode(nameCode);
- elementName = (String)elementStack.peek();
- if (uriCode==0 &&
- ( elementName.equalsIgnoreCase("script") ||
- elementName.equalsIgnoreCase("style"))) {
- inScript = 0;
- }
- inScript++;
- }
- public void startContent() throws XPathException {
- closeStartTag(); // prevent <xxx/> syntax
- }
- /**
- * Write attribute name=value pair. Overrides the XML behaviour if the name and value
- * are the same (we assume this is a boolean attribute to be minimised), or if the value is
- * a URL.
- */
- protected void writeAttribute(int elCode, String attname, CharSequence value, int properties) throws XPathException {
- try {
- if (uriCode==0) {
- if (isBooleanAttribute(elementName, attname, value.toString())) {
- writer.write(attname);
- return;
- }
- }
- super.writeAttribute(elCode, attname, value, properties);
- } catch (java.io.IOException err) {
- throw new XPathException(err);
- }
- }
- /**
- * Escape characters. Overrides the XML behaviour
- */
- protected void writeEscape(final CharSequence chars, final boolean inAttribute)
- throws java.io.IOException, XPathException {
- int segstart = 0;
- final boolean[] specialChars = (inAttribute ? specialInAtt : specialInText);
- if (chars instanceof CompressedWhitespace) {
- ((CompressedWhitespace)chars).writeEscape(specialChars, writer);
- return;
- }
- boolean disabled = false;
- while (segstart < chars.length()) {
- int i = segstart;
- // find a maximal sequence of "ordinary" characters
- if (nonASCIIRepresentation == REP_NATIVE) {
- char c;
- while (i < chars.length() &&
- ((c = chars.charAt(i)) < 127 ? !specialChars[c] : (characterSet.inCharset(c) && c > 160)
- )
- ) {
- i++;
- }
- } else {
- char c;
- while (i < chars.length() && (c = chars.charAt(i)) < 127 && !specialChars[c]) {
- i++;
- }
- }
- // if this was the whole string, output the string and quit
- if (i == chars.length()) {
- if (segstart == 0) {
- writeCharSequence(chars);
- } else {
- writeCharSequence(chars.subSequence(segstart, i));
- }
- return;
- }
- // otherwise, output this sequence and continue
- if (i > segstart) {
- writeCharSequence(chars.subSequence(segstart, i));
- }
- final char c = chars.charAt(i);
- if (c==0) {
- // used to switch escaping on and off
- disabled = !disabled;
- } else if (disabled) {
- writer.write(c);
- } else if (c<=127) {
- // handle a special ASCII character
- if (inAttribute) {
- if (c=='<') {
- writer.write('<'); // not escaped
- } else if (c=='>') {
- writer.write(">"); // recommended for older browsers
- } else if (c=='&') {
- if (i+1<chars.length() && chars.charAt(i+1)=='{') {
- writer.write('&'); // not escaped if followed by '{'
- } else {
- writer.write("&");
- }
- } else if (c=='\"') {
- writer.write(""");
- } else if (c=='\n') {
- writer.write("
");
- } else if (c=='\t') {
- writer.write("	");
- } else if (c=='\r') {
- writer.write("
");
- }
- } else {
- if (c=='<') {
- writer.write("<");
- } else if (c=='>') {
- writer.write(">"); // changed to allow for "]]>"
- } else if (c=='&') {
- writer.write("&");
- } else if (c=='\r') {
- writer.write("
");
- }
- }
- } else if (c==160) {
- // always output NBSP as an entity reference
- writer.write(" ");
- } else if (c>=127 && c<160) {
- // these control characters are illegal in HTML
- XPathException err = new XPathException("Illegal HTML character: decimal " + (int)c);
- err.setErrorCode("SERE0014");
- throw err;
- } else if (c>=55296 && c<=56319) { //handle surrogate pair
- //A surrogate pair is two consecutive Unicode characters. The first
- //is in the range D800 to DBFF, the second is in the range DC00 to DFFF.
- //To compute the numeric value of the character corresponding to a surrogate
- //pair, use this formula (all numbers are hex):
- //(FirstChar - D800) * 400 + (SecondChar - DC00) + 10000
- // we'll trust the data to be sound
- int charval = (((int)c - 55296) * 1024) + ((int)chars.charAt(i+1) - 56320) + 65536;
- outputCharacterReference(charval);
- i++;
- } else if (characterSet.inCharset(c)) {
- switch(nonASCIIRepresentation) {
- case REP_NATIVE:
- writer.write(c);
- break;
- case REP_ENTITY:
- if (c>160 && c<=255) {
- // if chararacter in iso-8859-1, use an entity reference
- writer.write('&');
- writer.write(latin1Entities[(int)c-160]);
- writer.write(';');
- break;
- }
- // else fall through
- case REP_DECIMAL:
- preferHex = false;
- outputCharacterReference(c);
- break;
- case REP_HEX:
- preferHex = true;
- // fall through
- default:
- outputCharacterReference(c);
- break;
- }
- } else {
- // Character not present in encoding
- switch(excludedRepresentation) {
- case REP_ENTITY:
- if (c>160 && c<=255) {
- // if chararacter in iso-8859-1, use an entity reference
- writer.write('&');
- writer.write(latin1Entities[(int)c-160]);
- writer.write(';');
- break;
- }
- // else fall through
- case REP_NATIVE:
- case REP_DECIMAL:
- preferHex = false;
- outputCharacterReference(c);
- break;
- case REP_HEX:
- preferHex = true;
- // fall through
- default:
- outputCharacterReference(c);
- break;
- }
- }
- segstart = ++i;
- }
- }
- /**
- * Output an element end tag.
- */
- public void endElement() throws XPathException {
- String name = (String)elementStack.peek();
- inScript--;
- if (inScript==0) {
- inScript = -1000000;
- }
- if (isEmptyTag(name) && uriCode==0) {
- // no end tag required
- elementStack.pop();
- } else {
- super.endElement();
- }
- }
- /**
- * Character data.
- */
- public void characters (CharSequence chars, int locationId, int properties)
- throws XPathException {
- int options = properties;
- if (inScript>0) {
- options |= ReceiverOptions.DISABLE_ESCAPING;
- }
- super.characters(chars, locationId, options);
- }
- /**
- * Handle a processing instruction.
- */
- public void processingInstruction (String target, CharSequence data, int locationId, int properties)
- throws XPathException
- {
- if (empty) {
- openDocument();
- }
- for (int i=0; i<data.length(); i++) {
- if (data.charAt(i) == '>') {
- XPathException err = new XPathException("A processing instruction in HTML must not contain a > character");
- err.setErrorCode("SERE0015");
- throw err;
- }
- }
- try {
- writer.write("<?");
- writer.write(target);
- writer.write(' ');
- writeCharSequence(data);
- writer.write('>');
- } catch (java.io.IOException err) {
- throw new XPathException(err);
- }
- }
- private static final String[] latin1Entities = {
- "nbsp", // " " -- no-break space = non-breaking space,
- // U+00A0 ISOnum -->
- "iexcl", // "¡" -- inverted exclamation mark, U+00A1 ISOnum -->
- "cent", // "¢" -- cent sign, U+00A2 ISOnum -->
- "pound", // "£" -- pound sign, U+00A3 ISOnum -->
- "curren", // "¤" -- currency sign, U+00A4 ISOnum -->
- "yen", // "¥" -- yen sign = yuan sign, U+00A5 ISOnum -->
- "brvbar", // "¦" -- broken bar = broken vertical bar,
- // U+00A6 ISOnum -->
- "sect", // "§" -- section sign, U+00A7 ISOnum -->
- "uml", // "¨" -- diaeresis = spacing diaeresis,
- // U+00A8 ISOdia -->
- "copy", // "©" -- copyright sign, U+00A9 ISOnum -->
- "ordf", // "ª" -- feminine ordinal indicator, U+00AA ISOnum -->
- "laquo", // "«" -- left-pointing double angle quotation mark
- // = left pointing guillemet, U+00AB ISOnum -->
- "not", // "¬" -- not sign, U+00AC ISOnum -->
- "shy", // "­" -- soft hyphen = discretionary hyphen,
- // U+00AD ISOnum -->
- "reg", // "®" -- registered sign = registered trade mark sign,
- // U+00AE ISOnum -->
- "macr", // "¯" -- macron = spacing macron = overline
- // = APL overbar, U+00AF ISOdia -->
- "deg", // "°" -- degree sign, U+00B0 ISOnum -->
- "plusmn", // "±" -- plus-minus sign = plus-or-minus sign,
- // U+00B1 ISOnum -->
- "sup2", // "²" -- superscript two = superscript digit two
- // = squared, U+00B2 ISOnum -->
- "sup3", // "³" -- superscript three = superscript digit three
- // = cubed, U+00B3 ISOnum -->
- "acute", // "´" -- acute accent = spacing acute,
- // U+00B4 ISOdia -->
- "micro", // "µ" -- micro sign, U+00B5 ISOnum -->
- "para", // "¶" -- pilcrow sign = paragraph sign,
- // U+00B6 ISOnum -->
- "middot", // "·" -- middle dot = Georgian comma
- // = Greek middle dot, U+00B7 ISOnum -->
- "cedil", // "¸" -- cedilla = spacing cedilla, U+00B8 ISOdia -->
- "sup1", // "¹" -- superscript one = superscript digit one,
- // U+00B9 ISOnum -->
- "ordm", // "º" -- masculine ordinal indicator,
- // U+00BA ISOnum -->
- "raquo", // "»" -- right-pointing double angle quotation mark
- // = right pointing guillemet, U+00BB ISOnum -->
- "frac14", // "¼" -- vulgar fraction one quarter
- // = fraction one quarter, U+00BC ISOnum -->
- "frac12", // "½" -- vulgar fraction one half
- // = fraction one half, U+00BD ISOnum -->
- "frac34", // "¾" -- vulgar fraction three quarters
- // = fraction three quarters, U+00BE ISOnum -->
- "iquest", // "¿" -- inverted question mark
- // = turned question mark, U+00BF ISOnum -->
- "Agrave", // "À" -- latin capital letter A with grave
- // = latin capital letter A grave,
- // U+00C0 ISOlat1 -->
- "Aacute", // "Á" -- latin capital letter A with acute,
- // U+00C1 ISOlat1 -->
- "Acirc", // "Â" -- latin capital letter A with circumflex,
- // U+00C2 ISOlat1 -->
- "Atilde", // "Ã" -- latin capital letter A with tilde,
- // U+00C3 ISOlat1 -->
- "Auml", // "Ä" -- latin capital letter A with diaeresis,
- // U+00C4 ISOlat1 -->
- "Aring", // "Å" -- latin capital letter A with ring above
- // = latin capital letter A ring,
- // U+00C5 ISOlat1 -->
- "AElig", // "Æ" -- latin capital letter AE
- // = latin capital ligature AE,
- // U+00C6 ISOlat1 -->
- "Ccedil", // "Ç" -- latin capital letter C with cedilla,
- // U+00C7 ISOlat1 -->
- "Egrave", // "È" -- latin capital letter E with grave,
- // U+00C8 ISOlat1 -->
- "Eacute", // "É" -- latin capital letter E with acute,
- // U+00C9 ISOlat1 -->
- "Ecirc", // "Ê" -- latin capital letter E with circumflex,
- // U+00CA ISOlat1 -->
- "Euml", // "Ë" -- latin capital letter E with diaeresis,
- // U+00CB ISOlat1 -->
- "Igrave", // "Ì" -- latin capital letter I with grave,
- // U+00CC ISOlat1 -->
- "Iacute", // "Í" -- latin capital letter I with acute,
- // U+00CD ISOlat1 -->
- "Icirc", // "Î" -- latin capital letter I with circumflex,
- // U+00CE ISOlat1 -->
- "Iuml", // "Ï" -- latin capital letter I with diaeresis,
- // U+00CF ISOlat1 -->
- "ETH", // "Ð" -- latin capital letter ETH, U+00D0 ISOlat1 -->
- "Ntilde", // "Ñ" -- latin capital letter N with tilde,
- // U+00D1 ISOlat1 -->
- "Ograve", // "Ò" -- latin capital letter O with grave,
- // U+00D2 ISOlat1 -->
- "Oacute", // "Ó" -- latin capital letter O with acute,
- // U+00D3 ISOlat1 -->
- "Ocirc", // "Ô" -- latin capital letter O with circumflex,
- // U+00D4 ISOlat1 -->
- "Otilde", // "Õ" -- latin capital letter O with tilde,
- // U+00D5 ISOlat1 -->
- "Ouml", // "Ö" -- latin capital letter O with diaeresis,
- // U+00D6 ISOlat1 -->
- "times", // "×" -- multiplication sign, U+00D7 ISOnum -->
- "Oslash", // "Ø" -- latin capital letter O with stroke
- // = latin capital letter O slash,
- // U+00D8 ISOlat1 -->
- "Ugrave", // "Ù" -- latin capital letter U with grave,
- // U+00D9 ISOlat1 -->
- "Uacute", // "Ú" -- latin capital letter U with acute,
- // U+00DA ISOlat1 -->
- "Ucirc", // "Û" -- latin capital letter U with circumflex,
- // U+00DB ISOlat1 -->
- "Uuml", // "Ü" -- latin capital letter U with diaeresis,
- // U+00DC ISOlat1 -->
- "Yacute", // "Ý" -- latin capital letter Y with acute,
- // U+00DD ISOlat1 -->
- "THORN", // "Þ" -- latin capital letter THORN,
- // U+00DE ISOlat1 -->
- "szlig", // "ß" -- latin small letter sharp s = ess-zed,
- // U+00DF ISOlat1 -->
- "agrave", // "à" -- latin small letter a with grave
- // = latin small letter a grave,
- // U+00E0 ISOlat1 -->
- "aacute", // "á" -- latin small letter a with acute,
- // U+00E1 ISOlat1 -->
- "acirc", // "â" -- latin small letter a with circumflex,
- // U+00E2 ISOlat1 -->
- "atilde", // "ã" -- latin small letter a with tilde,
- // U+00E3 ISOlat1 -->
- "auml", // "ä" -- latin small letter a with diaeresis,
- // U+00E4 ISOlat1 -->
- "aring", // "å" -- latin small letter a with ring above
- // = latin small letter a ring,
- // U+00E5 ISOlat1 -->
- "aelig", // "æ" -- latin small letter ae
- // = latin small ligature ae, U+00E6 ISOlat1 -->
- "ccedil", // "ç" -- latin small letter c with cedilla,
- // U+00E7 ISOlat1 -->
- "egrave", // "è" -- latin small letter e with grave,
- // U+00E8 ISOlat1 -->
- "eacute", // "é" -- latin small letter e with acute,
- // U+00E9 ISOlat1 -->
- "ecirc", // "ê" -- latin small letter e with circumflex,
- // U+00EA ISOlat1 -->
- "euml", // "ë" -- latin small letter e with diaeresis,
- // U+00EB ISOlat1 -->
- "igrave", // "ì" -- latin small letter i with grave,
- // U+00EC ISOlat1 -->
- "iacute", // "í" -- latin small letter i with acute,
- // U+00ED ISOlat1 -->
- "icirc", // "î" -- latin small letter i with circumflex,
- // U+00EE ISOlat1 -->
- "iuml", // "ï" -- latin small letter i with diaeresis,
- // U+00EF ISOlat1 -->
- "eth", // "ð" -- latin small letter eth, U+00F0 ISOlat1 -->
- "ntilde", // "ñ" -- latin small letter n with tilde,
- // U+00F1 ISOlat1 -->
- "ograve", // "ò" -- latin small letter o with grave,
- // U+00F2 ISOlat1 -->
- "oacute", // "ó" -- latin small letter o with acute,
- // U+00F3 ISOlat1 -->
- "ocirc", // "ô" -- latin small letter o with circumflex,
- // U+00F4 ISOlat1 -->
- "otilde", // "õ" -- latin small letter o with tilde,
- // U+00F5 ISOlat1 -->
- "ouml", // "ö" -- latin small letter o with diaeresis,
- // U+00F6 ISOlat1 -->
- "divide", // "÷" -- division sign, U+00F7 ISOnum -->
- "oslash", // "ø" -- latin small letter o with stroke,
- // = latin small letter o slash,
- // U+00F8 ISOlat1 -->
- "ugrave", // "ù" -- latin small letter u with grave,
- // U+00F9 ISOlat1 -->
- "uacute", // "ú" -- latin small letter u with acute,
- // U+00FA ISOlat1 -->
- "ucirc", // "û" -- latin small letter u with circumflex,
- // U+00FB ISOlat1 -->
- "uuml", // "ü" -- latin small letter u with diaeresis,
- // U+00FC ISOlat1 -->
- "yacute", // "ý" -- latin small letter y with acute,
- // U+00FD ISOlat1 -->
- "thorn", // "þ" -- latin small letter thorn,
- // U+00FE ISOlat1 -->
- "yuml" // "ÿ" -- latin small letter y with diaeresis,
- // U+00FF ISOlat1 -->
- };
- }
- //
- // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
- // you may not use this file except in compliance with the License. You may obtain a copy of the
- // License at http://www.mozilla.org/MPL/
- //
- // Software distributed under the License is distributed on an "AS IS" basis,
- // WITHOUT WARRANTY OF ANY KIND, either express or implied.
- // See the License for the specific language governing rights and limitations under the License.
- //
- // The Original Code is: all this file.
- //
- // The Initial Developer of the Original Code is Michael H. Kay.
- //
- // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
- //
- // Contributor(s): none.
- //