HtmlStreamTokenizer.java

/eclipse-mylyn-R_3_8_0-fetched-src/org.eclipse.mylyn.commons/org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlStreamTokenizer.java

#
Java | 1141 lines | 1084 code | 17 blank | 40 comment | 50 complexity | 36609adde6ece6c6f1b280a257b12612 MD5 | raw file
Possible License(s): Apache-2.0

/*******************************************************************************
 * Copyright (c) 2004, 2008 Tasktop Technologies and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     Tasktop Technologies - initial API and implementation
 *******************************************************************************/

package org.eclipse.mylyn.commons.core;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.net.URL;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Locale;

/**
 * Parses HTML into tokens.
 * <p>
 * The tokenizer is a small, lenient state machine that reads characters from a reader and produces text, tag, comment
 * and end-of-file {@link Token tokens}. Every returned token owns copies of its text and whitespace, so it stays valid
 * after further calls to {@link #nextToken()}.
 * 
 * @author Shawn Minto
 * @since 3.7
 */
public class HtmlStreamTokenizer {

	/** value of {@link #pushbackChar} when no character is waiting to be read again */
	private static final int NO_CHAR = -1;

	/** names and values of HTML entity references */
	private static final HashMap<String, Character> ENTITIES = new HashMap<String, Character>();

	/** parser state */
	private State state;

	/** reader from which to parse the text */
	private final BufferedReader in;

	/** base URL for resolving relative URLs */
	private final URL base;

	/** reusable buffer holding the text of the token currently being read */
	private final StringBuffer textBuffer;

	/** reusable buffer holding whitespace preceding the token currently being read */
	private final StringBuffer whitespaceBuffer;

	/**
	 * holds a token that was read and then put back in the queue to be returned again on <code>nextToken</code> call
	 */
	private Token pushbackToken;

	/**
	 * holds a character that was read and then determined not to be part of the current token, or {@link #NO_CHAR}.
	 * A negative marker is used so that a NUL character in the input can be pushed back like any other character.
	 */
	private int pushbackChar;

	/** current quote delimiter (single or double) */
	private int quoteChar;

	/** Allow class client to choose if tag attributes are escaped or not */
	private boolean escapeTagValues;

	/**
	 * Constructor.
	 * 
	 * @param in
	 *            reader for the HTML document to tokenize
	 * @param base
	 *            URL for resolving relative URLs
	 */
	public HtmlStreamTokenizer(Reader in, URL base) {
		this.textBuffer = new StringBuffer();
		this.whitespaceBuffer = new StringBuffer();
		this.pushbackChar = NO_CHAR;
		this.state = State.TEXT;
		this.in = new BufferedReader(in);
		this.base = base;
		this.escapeTagValues = true;
	}

	/**
	 * Selects whether HTML escapes inside quoted tag attribute values are replaced by the characters they stand for.
	 * The default is <code>true</code>.
	 * 
	 * @param value
	 *            <code>true</code> to unescape quoted attribute values, <code>false</code> to keep them verbatim
	 */
	public void escapeTagAttributes(boolean value) {
		escapeTagValues = value;
	}

	/**
	 * Returns the next token from the stream. A token handed to {@link #pushback(Token)} is returned first. Once the
	 * end of the input is reached a token of type {@link Token#EOF} is returned.
	 * 
	 * @return the next token, never <code>null</code>
	 * @throws IOException
	 *             if reading from the underlying reader fails
	 * @throws ParseException
	 *             if a tag without any content (for example <code>&lt;&gt;</code>) is encountered
	 */
	public Token nextToken() throws IOException, ParseException {
		if (pushbackToken != null) {
			Token token = pushbackToken;
			pushbackToken = null;
			return token;
		}

		// number of '-' characters seen directly before the current position while inside a comment
		int trailingDashes = 0;

		textBuffer.setLength(0);
		whitespaceBuffer.setLength(0);
		while (true) {
			int ch;
			if (pushbackChar != NO_CHAR) {
				ch = pushbackChar;
				pushbackChar = NO_CHAR;
			} else {
				ch = in.read();
			}

			if (ch < 0) {
				// end of input: only pending plain text is reported, unfinished tags and comments are dropped
				State oldState = state;
				state = State.EOF;
				if (textBuffer.length() > 0 && oldState == State.TEXT) {
					return new Token(textBuffer, whitespaceBuffer, false);
				}
				return new Token();
			}

			switch (state) {
			case TEXT:
				if (ch == '<') {
					state = State.TAG;
					if (textBuffer.length() > 0) {
						return new Token(textBuffer, whitespaceBuffer, false);
					}
				} else if (Character.isWhitespace((char) ch)) {
					// whitespace ends the current word; it is re-read as leading whitespace of the next token
					pushbackChar = ch;
					state = State.WS;
					if (textBuffer.length() > 0) {
						return new Token(textBuffer, whitespaceBuffer, false);
					}
				} else {
					textBuffer.append((char) ch);
				}
				break;

			case WS:
				if (Character.isWhitespace((char) ch)) {
					whitespaceBuffer.append((char) ch);
				} else {
					pushbackChar = ch;
					state = State.TEXT;
				}
				break;

			case TAG:
				if (ch == '>') {
					state = State.TEXT;
					HtmlTag tag = new HtmlTag(base);
					parseTag(textBuffer.toString(), tag, escapeTagValues);
					return new Token(tag, whitespaceBuffer);
				}
				if (ch == '<' && textBuffer.length() == 0) {
					// "<<" is not markup; treat it as literal text
					textBuffer.append("<<"); //$NON-NLS-1$
					state = State.TEXT;
				} else if (ch == '-' && textBuffer.length() == 2 && textBuffer.charAt(1) == '-'
						&& textBuffer.charAt(0) == '!') {
					// the characters read so far are "!--": this is the start of a comment
					textBuffer.setLength(0);
					state = State.COMMENT;
				} else if (ch == '\'' || ch == '"') {
					quoteChar = ch;
					textBuffer.append((char) ch);
					state = State.TAG_QUOTE;
				} else {
					textBuffer.append((char) ch);
				}
				break;

			case TAG_QUOTE:
				if (ch == '>') {
					// be lenient with unterminated quotes: '>' always closes the tag
					pushbackChar = ch;
					state = State.TAG;
				} else {
					textBuffer.append((char) ch);
					if (ch == quoteChar) {
						state = State.TAG;
					}
				}
				break;

			case COMMENT:
				if (ch == '>' && trailingDashes >= 2) {
					// strip the "--" of the closing "-->" from the comment text
					textBuffer.setLength(textBuffer.length() - 2);
					state = State.TEXT;
					return new Token(textBuffer, whitespaceBuffer, true);
				}
				if (ch == '-') {
					trailingDashes++;
				} else {
					trailingDashes = 0;
				}
				textBuffer.append((char) ch);
				break;

			default:
				// State.EOF: nothing to collect, keep reading until the reader reports the end again
				break;
			}
		}
	}

	/**
	 * Pushes the token back into the queue, to be returned by the subsequent call to <code>nextToken</code>. Only one
	 * token is remembered; a second call replaces the first token.
	 */
	public void pushback(Token token) {
		pushbackToken = token;
	}

	/**
	 * Returns the index of the first non-whitespace character of <code>s</code> at or after <code>from</code>, or
	 * <code>s.length()</code> if there is none.
	 */
	private static int skipWhitespace(String s, int from) {
		int i = from;
		while (i < s.length() && Character.isWhitespace(s.charAt(i))) {
			i++;
		}
		return i;
	}

	/**
	 * Parses an HTML tag out of a string of characters.
	 * 
	 * @param s
	 *            the text between <code>&lt;</code> and <code>&gt;</code>
	 * @param tag
	 *            the tag that receives the name and the attributes
	 * @param escapeValues
	 *            whether quoted attribute values are unescaped
	 * @throws ParseException
	 *             if <code>s</code> is empty or consists of whitespace only
	 */
	private static void parseTag(String s, HtmlTag tag, boolean escapeValues) throws ParseException {
		int nameStart = skipWhitespace(s, 0);
		if (nameStart == s.length()) {
			throw new ParseException("parse empty tag", 0); //$NON-NLS-1$
		}

		int nameEnd = nameStart;
		while (nameEnd < s.length() && !Character.isWhitespace(s.charAt(nameEnd))) {
			nameEnd++;
		}
		tag.setTagName(s.substring(nameStart, nameEnd));

		int attributesStart = skipWhitespace(s, nameEnd);
		if (attributesStart < s.length()) {
			parseAttributes(tag, s, attributesStart, escapeValues);
		}
	}

	/**
	 * Parses HTML tag attributes from a buffer and sets them in an HtmlTag.
	 * <p>
	 * Attribute names are converted to lower case. A lone <code>/</code> marks the tag as self terminating. Values
	 * enclosed in single or double quotes are unescaped if <code>escapeValues</code> is set; unquoted values are
	 * always stored verbatim. Parsing stops silently at a value whose closing quote is missing.
	 */
	private static void parseAttributes(HtmlTag tag, String s, int i, boolean escapeValues) throws ParseException {
		final int length = s.length();
		while (i < length) {
			i = skipWhitespace(s, i);
			if (i == length) {
				return;
			}

			// read the attribute name -- the rule might be looser than the RFC specifies:
			// everything up to a space or an equal sign is included
			int start = i;
			while (i < length && !Character.isWhitespace(s.charAt(i)) && s.charAt(i) != '=') {
				i++;
			}
			String attributeName = s.substring(start, i).toLowerCase(Locale.ENGLISH);

			if (attributeName.equals("/")) { //$NON-NLS-1$
				tag.setSelfTerminating(true);
				continue;
			}

			i = skipWhitespace(s, i);
			if (i == length || s.charAt(i) != '=') {
				// no attribute value
				tag.setAttribute(attributeName, ""); //$NON-NLS-1$
				continue;
			}

			// skip the equal sign and any whitespace up to the start of the attribute value
			i = skipWhitespace(s, i + 1);
			if (i == length) {
				return;
			}

			String attributeValue;
			char opening = s.charAt(i);
			if (opening == '"' || opening == '\'') {
				start = i + 1;
				int closing = s.indexOf(opening, start);
				if (closing < 0) {
					// closing quote is missing (the tag was ended by '>' inside the quotes); give up
					return;
				}
				String raw = s.substring(start, closing);
				// both kinds of quotes honour the flag
				attributeValue = escapeValues ? unescape(raw) : raw;
				i = closing + 1;
			} else {
				// the rule for unquoted attribute values is looser than the one in Connolly's W3C 1996
				// lexical analyzer draft: everything is included up to the next space
				start = i;
				while (i < length && !Character.isWhitespace(s.charAt(i))) {
					i++;
				}
				attributeValue = s.substring(start, i);
			}
			tag.setAttribute(attributeName, attributeValue);
		}
	}

	/**
	 * Returns a string with HTML escapes changed into their corresponding characters.
	 * 
	 * @param s
	 *            the text to unescape, must not be <code>null</code>
	 * @return <code>s</code> itself if it contains no ampersand, otherwise a new string with all recognized escapes
	 *         replaced
	 * @deprecated use {@link StringEscapeUtils#unescapeHtml(String)} instead
	 */
	@Deprecated
	public static String unescape(String s) {
		if (s.indexOf('&') == -1) {
			return s;
		}
		return unescape(new StringBuffer(s)).toString();
	}

	/**
	 * Replaces (in-place) HTML escapes in a StringBuffer with their corresponding characters.
	 * <p>
	 * Named references known to this class and decimal or hexadecimal character references are replaced, provided
	 * that the referenced character is allowed in XML documents. Everything else is left untouched. A semicolon
	 * directly following a reference is consumed with it; any other character that ends a reference is kept.
	 * Characters outside the basic multilingual plane are written as surrogate pairs.
	 * 
	 * @param sb
	 *            the buffer to modify, must not be <code>null</code>
	 * @return <code>sb</code>
	 * @deprecated use {@link StringEscapeUtils#unescapeHtml(String)} instead
	 */
	@Deprecated
	public static StringBuffer unescape(StringBuffer sb) {
		final int length = sb.length();
		int read = 0; // index into the unprocessed section of the buffer
		int write = 0; // index into the processed section of the buffer, never ahead of read

		while (read < length) {
			char ch = sb.charAt(read);
			if (ch == '&') {
				// a reference consists of letters and digits, optionally introduced by '#'
				int end = read + 1;
				while (end < length
						&& (Character.isLetterOrDigit(sb.charAt(end)) || (sb.charAt(end) == '#' && end == read + 1))) {
					end++;
				}
				int codePoint = parseReference(sb.substring(read + 1, end));
				if (isValidXmlChar(codePoint)) {
					// a reference is always longer than the one or two chars replacing it, so
					// writing here cannot overtake the read position
					for (char decoded : Character.toChars(codePoint)) {
						sb.setCharAt(write++, decoded);
					}
					read = end;
					if (read < length && sb.charAt(read) == ';') {
						read++;
					}
					continue;
				}
				// not an HTML escape: fall through and copy the ampersand like any other character
			}
			sb.setCharAt(write++, ch);
			read++;
		}

		sb.setLength(write);
		return sb;
	}

	/**
	 * Tells whether a code point is a legal XML character, see http://www.w3.org/TR/REC-xml/#charsets. Negative
	 * values, as returned by {@link #parseReference(String)} for unknown references, are not valid.
	 */
	private static boolean isValidXmlChar(int codePoint) {
		return codePoint == 0x09 || codePoint == 0x0A || codePoint == 0x0D
				|| (codePoint >= 0x20 && codePoint <= 0xD7FF)
				|| (codePoint >= 0xE000 && codePoint <= 0xFFFD)
				|| (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
	}

	/**
	 * Parses HTML character and entity references and returns the code point of the corresponding character.
	 * 
	 * @param s
	 *            the reference without the leading ampersand and the trailing semicolon, for example
	 *            <code>amp</code>, <code>#38</code> or <code>#x26</code>
	 * @return the code point, or -1 if <code>s</code> is not a recognized reference
	 */
	private static int parseReference(String s) {
		if (s.length() == 0) {
			return -1;
		}

		if (s.charAt(0) != '#') {
			// entity reference
			Character entity = ENTITIES.get(s);
			return entity == null ? -1 : entity.charValue();
		}

		// character reference
		if (s.length() == 1) {
			return -1;
		}
		try {
			char marker = s.charAt(1);
			if (marker == 'x' || marker == 'X') {
				// hex reference
				return Integer.parseInt(s.substring(2), 16);
			}
			// decimal reference
			return Integer.parseInt(s.substring(1));
		} catch (NumberFormatException e) {
			// malformed or too large to be a code point
			return -1;
		}
	}

	/**
	 * Class for current token.
	 */
	public static class Token {
		public static final Type EOF = new Type();

		public static final Type TEXT = new Type();

		public static final Type TAG = new Type();

		public static final Type COMMENT = new Type();

		/** token's type */
		private final Type type;

		/** token's value */
		private final Object value;

		/** whitespace preceding the token */
		private final StringBuffer whitespace;

		/**
		 * Constructor for the EOF token.
		 */
		protected Token() {
			type = EOF;
			value = null;
			whitespace = null;
		}

		/**
		 * Constructor for the HTML tag tokens. The whitespace is copied.
		 */
		protected Token(HtmlTag tag, StringBuffer whitespace) {
			type = TAG;
			value = tag;
			this.whitespace = copyOf(whitespace);
		}

		/**
		 * Constructor for regular text and comments. Text and whitespace are copied so that the token is not affected
		 * when the caller reuses its buffers.
		 */
		protected Token(StringBuffer text, StringBuffer whitespace, boolean comment) {
			type = comment ? COMMENT : TEXT;
			value = copyOf(text);
			this.whitespace = copyOf(whitespace);
		}

		/**
		 * Returns an independent copy of the buffer, or <code>null</code> if the buffer is <code>null</code>.
		 */
		private static StringBuffer copyOf(StringBuffer buffer) {
			return buffer == null ? null : new StringBuffer(buffer.toString());
		}

		/**
		 * Returns the token's type.
		 */
		public Type getType() {
			return type;
		}

		/**
		 * Returns the whitespace preceding the token.
		 */
		public StringBuffer getWhitespace() {
			return whitespace;
		}

		/**
		 * Returns the token's value. This is an HtmlTag for tokens of type <code>TAG</code> and a StringBuffer for
		 * tokens of type <code>TEXT</code> and <code>COMMENT</code>. For tokens of type <code>EOF</code>, the value is
		 * <code>null</code>.
		 */
		public Object getValue() {
			return value;
		}

		/**
		 * Returns the string representation of the token, including the preceding whitespace. Comments are wrapped
		 * in their delimiters again; for all other tokens the string form of the value is appended as is.
		 */
		@Override
		public String toString() {
			StringBuffer sb = new StringBuffer();
			if (whitespace != null) {
				sb.append(whitespace);
			}
			if (value != null) {
				boolean comment = type == COMMENT;
				if (comment) {
					sb.append("<!--"); //$NON-NLS-1$
				}
				sb.append(value);
				if (comment) {
					sb.append("-->"); //$NON-NLS-1$
				}
			}
			return sb.toString();
		}

		/**
		 * Private enum class for token type.
		 */
		private static class Type {
			private Type() {
				// don't need to do anything
			}
		}
	}

	/**
	 * Parser states.
	 */
	private enum State {
		/** the end of the input has been reached */
		EOF,
		/** inside a comment */
		COMMENT,
		/** inside regular text */
		TEXT,
		/** inside a tag, outside of quotes */
		TAG,
		/** inside whitespace preceding a token */
		WS,
		/** inside a quoted section of a tag */
		TAG_QUOTE
	}

	/*
	 * Based on ISO 8879.
	 * 
	 * Portions (c) International Organization for Standardization 1986
	 * Permission to copy in any form is granted for use with conforming SGML
	 * systems and applications as defined in ISO 8879, provided this notice is
	 * included in all copies.
	 * 
	 */
	static {
		// Latin-1 characters
		ENTITIES.put("nbsp", Character.valueOf('\240')); // no-break space //$NON-NLS-1$
		ENTITIES.put("iexcl", Character.valueOf('\241')); // inverted exclamation mark //$NON-NLS-1$
		ENTITIES.put("cent", Character.valueOf('\242')); // cent sign //$NON-NLS-1$
		ENTITIES.put("pound", Character.valueOf('\243')); // pound sign //$NON-NLS-1$
		ENTITIES.put("curren", Character.valueOf('\244')); // currency sign //$NON-NLS-1$
		ENTITIES.put("yen", Character.valueOf('\245')); // yen sign = yuan sign //$NON-NLS-1$
		ENTITIES.put("brvbar", Character.valueOf('\246')); // broken vertical bar //$NON-NLS-1$
		ENTITIES.put("sect", Character.valueOf('\247')); // section sign //$NON-NLS-1$
		ENTITIES.put("uml", Character.valueOf('\250')); // diaeresis = spacing diaeresis //$NON-NLS-1$
		ENTITIES.put("copy", Character.valueOf('\251')); // copyright sign //$NON-NLS-1$
		ENTITIES.put("ordf", Character.valueOf('\252')); // feminine ordinal indicator //$NON-NLS-1$
		ENTITIES.put("laquo", Character.valueOf('\253')); // left-pointing double angle quotation mark //$NON-NLS-1$
		ENTITIES.put("not", Character.valueOf('\254')); // not sign //$NON-NLS-1$
		ENTITIES.put("shy", Character.valueOf('\255')); // soft hyphen = discretionary hyphen //$NON-NLS-1$
		ENTITIES.put("reg", Character.valueOf('\256')); // registered trade mark sign //$NON-NLS-1$
		ENTITIES.put("macr", Character.valueOf('\257')); // macron = overline //$NON-NLS-1$
		ENTITIES.put("deg", Character.valueOf('\260')); // degree sign //$NON-NLS-1$
		ENTITIES.put("plusmn", Character.valueOf('\261')); // plus-minus sign //$NON-NLS-1$
		ENTITIES.put("sup2", Character.valueOf('\262')); // superscript two = squared //$NON-NLS-1$
		ENTITIES.put("sup3", Character.valueOf('\263')); // superscript three = cubed //$NON-NLS-1$
		ENTITIES.put("acute", Character.valueOf('\264')); // acute accent = spacing acute //$NON-NLS-1$
		ENTITIES.put("micro", Character.valueOf('\265')); // micro sign //$NON-NLS-1$
		ENTITIES.put("para", Character.valueOf('\266')); // pilcrow sign = paragraph sign //$NON-NLS-1$
		ENTITIES.put("middot", Character.valueOf('\267')); // middle dot //$NON-NLS-1$
		ENTITIES.put("cedil", Character.valueOf('\270')); // cedilla = spacing cedilla //$NON-NLS-1$
		ENTITIES.put("sup1", Character.valueOf('\271')); // superscript one //$NON-NLS-1$
		ENTITIES.put("ordm", Character.valueOf('\272')); // masculine ordinal indicator //$NON-NLS-1$
		ENTITIES.put("raquo", Character.valueOf('\273')); // right-pointing double angle quotation mark //$NON-NLS-1$
		ENTITIES.put("frac14", Character.valueOf('\274')); // vulgar fraction one quarter //$NON-NLS-1$
		ENTITIES.put("frac12", Character.valueOf('\275')); // vulgar fraction one half //$NON-NLS-1$
		ENTITIES.put("frac34", Character.valueOf('\276')); // vulgar fraction three quarters //$NON-NLS-1$
		ENTITIES.put("iquest", Character.valueOf('\277')); // inverted question mark //$NON-NLS-1$
		ENTITIES.put("Agrave", Character.valueOf('\300')); // latin capital letter A with grave //$NON-NLS-1$
		ENTITIES.put("Aacute", Character.valueOf('\301')); // latin capital letter A with acute //$NON-NLS-1$
		ENTITIES.put("Acirc", Character.valueOf('\302')); // latin capital letter A with circumflex //$NON-NLS-1$
		ENTITIES.put("Atilde", Character.valueOf('\303')); // latin capital letter A with tilde //$NON-NLS-1$
		ENTITIES.put("Auml", Character.valueOf('\304')); // latin capital letter A with diaeresis //$NON-NLS-1$
		ENTITIES.put("Aring", Character.valueOf('\305')); // latin capital letter A with ring above //$NON-NLS-1$
		ENTITIES.put("AElig", Character.valueOf('\306')); // latin capital letter AE //$NON-NLS-1$
		ENTITIES.put("Ccedil", Character.valueOf('\307')); // latin capital letter C with cedilla //$NON-NLS-1$
		ENTITIES.put("Egrave", Character.valueOf('\310')); // latin capital letter E with grave //$NON-NLS-1$
		ENTITIES.put("Eacute", Character.valueOf('\311')); // latin capital letter E with acute //$NON-NLS-1$
		ENTITIES.put("Ecirc", Character.valueOf('\312')); // latin capital letter E with circumflex //$NON-NLS-1$
		ENTITIES.put("Euml", Character.valueOf('\313')); // latin capital letter E with diaeresis //$NON-NLS-1$
		ENTITIES.put("Igrave", Character.valueOf('\314')); // latin capital letter I with grave //$NON-NLS-1$
		ENTITIES.put("Iacute", Character.valueOf('\315')); // latin capital letter I with acute //$NON-NLS-1$
		ENTITIES.put("Icirc", Character.valueOf('\316')); // latin capital letter I with circumflex //$NON-NLS-1$
		ENTITIES.put("Iuml", Character.valueOf('\317')); // latin capital letter I with diaeresis //$NON-NLS-1$
		ENTITIES.put("ETH", Character.valueOf('\320')); // latin capital letter ETH //$NON-NLS-1$
		ENTITIES.put("Ntilde", Character.valueOf('\321')); // latin capital letter N with tilde //$NON-NLS-1$
		ENTITIES.put("Ograve", Character.valueOf('\322')); // latin capital letter O with grave //$NON-NLS-1$
		ENTITIES.put("Oacute", Character.valueOf('\323')); // latin capital letter O with acute //$NON-NLS-1$
		ENTITIES.put("Ocirc", Character.valueOf('\324')); // latin capital letter O with circumflex //$NON-NLS-1$
		ENTITIES.put("Otilde", Character.valueOf('\325')); // latin capital letter O with tilde //$NON-NLS-1$
		ENTITIES.put("Ouml", Character.valueOf('\326')); // latin capital letter O with diaeresis //$NON-NLS-1$
		ENTITIES.put("times", Character.valueOf('\327')); // multiplication sign //$NON-NLS-1$
		ENTITIES.put("Oslash", Character.valueOf('\330')); // latin capital letter O with stroke //$NON-NLS-1$
		ENTITIES.put("Ugrave", Character.valueOf('\331')); // latin capital letter U with grave //$NON-NLS-1$
		ENTITIES.put("Uacute", Character.valueOf('\332')); // latin capital letter U with acute //$NON-NLS-1$
		ENTITIES.put("Ucirc", Character.valueOf('\333')); // latin capital letter U with circumflex //$NON-NLS-1$
		ENTITIES.put("Uuml", Character.valueOf('\334')); // latin capital letter U with diaeresis //$NON-NLS-1$
		ENTITIES.put("Yacute", Character.valueOf('\335')); // latin capital letter Y with acute //$NON-NLS-1$
		ENTITIES.put("THORN", Character.valueOf('\336')); // latin capital letter THORN //$NON-NLS-1$
		ENTITIES.put("szlig", Character.valueOf('\337')); // latin small letter sharp s = ess-zed //$NON-NLS-1$
		ENTITIES.put("agrave", Character.valueOf('\340')); // latin small letter a with grave //$NON-NLS-1$
		ENTITIES.put("aacute", Character.valueOf('\341')); // latin small letter a with acute //$NON-NLS-1$
		ENTITIES.put("acirc", Character.valueOf('\342')); // latin small letter a with circumflex //$NON-NLS-1$
		ENTITIES.put("atilde", Character.valueOf('\343')); // latin small letter a with tilde //$NON-NLS-1$
		ENTITIES.put("auml", Character.valueOf('\344')); // latin small letter a with diaeresis //$NON-NLS-1$
		ENTITIES.put("aring", Character.valueOf('\345')); // latin small letter a with ring above //$NON-NLS-1$
		ENTITIES.put("aelig", Character.valueOf('\346')); // latin small letter ae //$NON-NLS-1$
		ENTITIES.put("ccedil", Character.valueOf('\347')); // latin small letter c with cedilla //$NON-NLS-1$
		ENTITIES.put("egrave", Character.valueOf('\350')); // latin small letter e with grave //$NON-NLS-1$
		ENTITIES.put("eacute", Character.valueOf('\351')); // latin small letter e with acute //$NON-NLS-1$
		ENTITIES.put("ecirc", Character.valueOf('\352')); // latin small letter e with circumflex //$NON-NLS-1$
		ENTITIES.put("euml", Character.valueOf('\353')); // latin small letter e with diaeresis //$NON-NLS-1$
		ENTITIES.put("igrave", Character.valueOf('\354')); // latin small letter i with grave //$NON-NLS-1$
		ENTITIES.put("iacute", Character.valueOf('\355')); // latin small letter i with acute //$NON-NLS-1$
		ENTITIES.put("icirc", Character.valueOf('\356')); // latin small letter i with circumflex //$NON-NLS-1$
		ENTITIES.put("iuml", Character.valueOf('\357')); // latin small letter i with diaeresis //$NON-NLS-1$
		ENTITIES.put("eth", Character.valueOf('\360')); // latin small letter eth //$NON-NLS-1$
		ENTITIES.put("ntilde", Character.valueOf('\361')); // latin small letter n with tilde //$NON-NLS-1$
		ENTITIES.put("ograve", Character.valueOf('\362')); // latin small letter o with grave //$NON-NLS-1$
		ENTITIES.put("oacute", Character.valueOf('\363')); // latin small letter o with acute //$NON-NLS-1$
		ENTITIES.put("ocirc", Character.valueOf('\364')); // latin small letter o with circumflex //$NON-NLS-1$
		ENTITIES.put("otilde", Character.valueOf('\365')); // latin small letter o with tilde //$NON-NLS-1$
		ENTITIES.put("ouml", Character.valueOf('\366')); // latin small letter o with diaeresis //$NON-NLS-1$
		ENTITIES.put("divide", Character.valueOf('\367')); // division sign //$NON-NLS-1$
		ENTITIES.put("oslash", Character.valueOf('\370')); // latin small letter o with stroke //$NON-NLS-1$
		ENTITIES.put("ugrave", Character.valueOf('\371')); // latin small letter u with grave //$NON-NLS-1$
		ENTITIES.put("uacute", Character.valueOf('\372')); // latin small letter u with acute //$NON-NLS-1$
		ENTITIES.put("ucirc", Character.valueOf('\373')); // latin small letter u with circumflex //$NON-NLS-1$
		ENTITIES.put("uuml", Character.valueOf('\374')); // latin small letter u with diaeresis //$NON-NLS-1$
		ENTITIES.put("yacute", Character.valueOf('\375')); // latin small letter y with acute //$NON-NLS-1$
		ENTITIES.put("thorn", Character.valueOf('\376')); // latin small letter thorn //$NON-NLS-1$
		ENTITIES.put("yuml", Character.valueOf('\377')); // latin small letter y with diaeresis //$NON-NLS-1$

		// Special characters
		ENTITIES.put("quot", Character.valueOf('\42')); // quotation mark = APL quote //$NON-NLS-1$
		ENTITIES.put("amp", Character.valueOf('\46')); // ampersand //$NON-NLS-1$
		ENTITIES.put("lt", Character.valueOf('\74')); // less-than sign //$NON-NLS-1$
		ENTITIES.put("gt", Character.valueOf('\76')); // greater-than sign //$NON-NLS-1$

		// Latin Extended-A
		ENTITIES.put("OElig", Character.valueOf('\u0152')); // latin capital ligature OE //$NON-NLS-1$
		ENTITIES.put("oelig", Character.valueOf('\u0153')); // latin small ligature oe //$NON-NLS-1$
		ENTITIES.put("Scaron", Character.valueOf('\u0160')); // latin capital letter S with caron //$NON-NLS-1$
		ENTITIES.put("scaron", Character.valueOf('\u0161')); // latin small letter s with caron //$NON-NLS-1$
		ENTITIES.put("Yuml", Character.valueOf('\u0178')); // latin capital letter Y with diaeresis //$NON-NLS-1$

		// Spacing Modifier Letters
		ENTITIES.put("circ", Character.valueOf('\u02c6')); // modifier letter circumflex accent //$NON-NLS-1$
		ENTITIES.put("tilde", Character.valueOf('\u02dc')); // small tilde //$NON-NLS-1$

		// General punctuation
		ENTITIES.put("ensp", Character.valueOf('\u2002')); // en space //$NON-NLS-1$
		ENTITIES.put("emsp", Character.valueOf('\u2003')); // em space //$NON-NLS-1$
		ENTITIES.put("thinsp", Character.valueOf('\u2009')); // thin space //$NON-NLS-1$
		ENTITIES.put("zwnj", Character.valueOf('\u200c')); // zero width non-joiner //$NON-NLS-1$
		ENTITIES.put("zwj", Character.valueOf('\u200d')); // zero width joiner //$NON-NLS-1$
		ENTITIES.put("lrm", Character.valueOf('\u200e')); // left-to-right mark //$NON-NLS-1$
		ENTITIES.put("rlm", Character.valueOf('\u200f')); // right-to-left mark //$NON-NLS-1$
		ENTITIES.put("ndash", Character.valueOf('\u2013')); // en dash //$NON-NLS-1$
		ENTITIES.put("mdash", Character.valueOf('\u2014')); // em dash //$NON-NLS-1$
		ENTITIES.put("lsquo", Character.valueOf('\u2018')); // left single quotation mark //$NON-NLS-1$
		ENTITIES.put("rsquo", Character.valueOf('\u2019')); // right single quotation mark //$NON-NLS-1$
		ENTITIES.put("sbquo", Character.valueOf('\u201a')); // single low-9 quotation mark //$NON-NLS-1$
		ENTITIES.put("ldquo", Character.valueOf('\u201c')); // left double quotation mark //$NON-NLS-1$
		ENTITIES.put("rdquo", Character.valueOf('\u201d')); // right double quotation mark //$NON-NLS-1$
		ENTITIES.put("bdquo", Character.valueOf('\u201e')); // double low-9 quotation mark //$NON-NLS-1$
		ENTITIES.put("dagger", Character.valueOf('\u2020')); // dagger //$NON-NLS-1$
		ENTITIES.put("Dagger", Character.valueOf('\u2021')); // double dagger //$NON-NLS-1$
		ENTITIES.put("permil", Character.valueOf('\u2030')); // per mille sign //$NON-NLS-1$
		ENTITIES.put("lsaquo", Character.valueOf('\u2039')); // single left-pointing angle quotation mark //$NON-NLS-1$
		ENTITIES.put("rsaquo", Character.valueOf('\u203a')); // single right-pointing angle quotation mark //$NON-NLS-1$
		ENTITIES.put("euro", Character.valueOf('\u20ac')); // euro sign //$NON-NLS-1$
	}
}