/eclipse-mylyn-R_3_8_0-fetched-src/org.eclipse.mylyn.commons/org.eclipse.mylyn.commons.net/src/org/eclipse/mylyn/commons/net/HtmlStreamTokenizer.java
Java | 1145 lines | 1086 code | 18 blank | 41 comment | 50 complexity | 7b3d225cf2991b3f0afaf38b1120ee3c MD5 | raw file
Possible License(s): Apache-2.0
- /*******************************************************************************
- * Copyright (c) 2004, 2008 Tasktop Technologies and others.
- * All rights reserved. This program and the accompanying materials
- * are made available under the terms of the Eclipse Public License v1.0
- * which accompanies this distribution, and is available at
- * http://www.eclipse.org/legal/epl-v10.html
- *
- * Contributors:
- * Tasktop Technologies - initial API and implementation
- *******************************************************************************/
- package org.eclipse.mylyn.commons.net;
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.Reader;
- import java.net.URL;
- import java.text.ParseException;
- import java.util.HashMap;
- import java.util.Locale;
- import org.apache.commons.lang.StringEscapeUtils;
- /**
- * Parses HTML into tokens.
- *
- * @author Shawn Minto
- * @since 2.0
- * @deprecated use org.eclipse.mylyn.commons.core.HtmlStreamTokenizer instead.
- */
- @Deprecated
- public class HtmlStreamTokenizer {
/** Current state of the tokenizer's state machine; tokenizing starts in plain text. */
private State state = State.TEXT;

/** Buffered view of the reader supplying the HTML to tokenize. */
private final BufferedReader in;

/** Base URL handed to every {@link HtmlTag} created by this tokenizer. */
private final URL base;

/** Accumulates the characters of the token currently being read. */
private final StringBuffer textBuffer = new StringBuffer();

/** Accumulates the whitespace found in front of the token currently being read. */
private final StringBuffer whitespaceBuffer = new StringBuffer();

/**
 * Token handed back through <code>pushback</code>; when set, it is what the next <code>nextToken</code> call
 * returns.
 */
private Token pushbackToken;

/**
 * Character that was read but turned out not to belong to the current token; <code>0</code> means that no
 * character is pending.
 */
private int pushbackChar = 0;

/** Quote character (single or double) that opened the quoted section currently being read inside a tag. */
private int quoteChar;

/** Whether tag attribute values are passed through <code>unescape</code>; clients may switch this off. */
private boolean escapeTagValues = true;

/**
 * Creates a tokenizer that starts in text mode and unescapes attribute values by default.
 * 
 * @param in
 *            reader for the HTML document to tokenize; it is wrapped in a {@link BufferedReader}
 * @param base
 *            URL for resolving relative URLs
 */
public HtmlStreamTokenizer(Reader in, URL base) {
    this.in = new BufferedReader(in);
    this.base = base;
}
- public void escapeTagAttributes(boolean value) {
- escapeTagValues = value;
- }
- /**
- * Returns the next token from the stream.
- */
- public Token nextToken() throws IOException, ParseException {
- if (pushbackToken != null) {
- Token token = pushbackToken;
- pushbackToken = null;
- return token;
- }
- int closingComment = 0;
- textBuffer.setLength(0);
- whitespaceBuffer.setLength(0);
- do {
- int ch;
- if (pushbackChar != 0) {
- ch = pushbackChar;
- pushbackChar = 0;
- } else {
- ch = in.read();
- }
- if (ch < 0) {
- State oldState = state;
- state = State.EOF;
- if (textBuffer.length() > 0 && oldState == State.TEXT) {
- return new Token(textBuffer, whitespaceBuffer, false);
- } else {
- return new Token();
- }
- }
- if (state == State.TEXT) {
- if (ch == '<') {
- state = State.TAG;
- if (textBuffer.length() > 0) {
- return new Token(textBuffer, whitespaceBuffer, false);
- }
- } else if (Character.isWhitespace((char) ch)) {
- pushbackChar = ch;
- state = State.WS;
- if (textBuffer.length() > 0) {
- return new Token(textBuffer, whitespaceBuffer, false);
- }
- } else {
- textBuffer.append((char) ch);
- }
- } else if (state == State.WS) {
- if (!Character.isWhitespace((char) ch)) {
- pushbackChar = ch;
- state = State.TEXT;
- } else {
- whitespaceBuffer.append((char) ch);
- }
- } else if (state == State.TAG) {
- if (ch == '>') {
- state = State.TEXT;
- HtmlTag tag = new HtmlTag(base);
- parseTag(textBuffer.toString(), tag, escapeTagValues);
- return new Token(tag, whitespaceBuffer);
- }
- if (ch == '<' && textBuffer.length() == 0) {
- textBuffer.append("<<"); //$NON-NLS-1$
- state = State.TEXT;
- } else if (ch == '-' && textBuffer.length() == 2 && textBuffer.charAt(1) == '-'
- && textBuffer.charAt(0) == '!') {
- textBuffer.setLength(0);
- state = State.COMMENT;
- } else if (ch == '\'' || ch == '"') {
- quoteChar = ch;
- textBuffer.append((char) ch);
- state = State.TAG_QUOTE;
- } else {
- textBuffer.append((char) ch);
- }
- } else if (state == State.TAG_QUOTE) {
- if (ch == '>') {
- pushbackChar = ch;
- state = State.TAG;
- } else {
- textBuffer.append((char) ch);
- if (ch == quoteChar) {
- state = State.TAG;
- }
- }
- } else if (state == State.COMMENT) {
- if (ch == '>' && closingComment >= 2) {
- textBuffer.setLength(textBuffer.length() - 2);
- closingComment = 0;
- state = State.TEXT;
- return new Token(textBuffer, whitespaceBuffer, true);
- }
- if (ch == '-') {
- closingComment++;
- } else {
- closingComment = 0;
- }
- textBuffer.append((char) ch);
- }
- } while (true);
- }
- /**
- * Pushes the token back into the queue, to be returned by the subsequent call to <code>nextToken</code>
- */
- public void pushback(Token token) {
- pushbackToken = token;
- }
- /**
- * Parses an HTML tag out of a string of characters.
- */
- private static void parseTag(String s, HtmlTag tag, boolean escapeValues) throws ParseException {
- int i = 0;
- for (; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
- // just move forward
- }
- if (i == s.length()) {
- throw new ParseException("parse empty tag", 0); //$NON-NLS-1$
- }
- int start = i;
- for (; i < s.length() && !Character.isWhitespace(s.charAt(i)); i++) {
- // just move forward
- }
- tag.setTagName(s.substring(start, i));
- for (; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
- // just move forward
- }
- if (i == s.length()) {
- return;
- } else {
- parseAttributes(tag, s, i, escapeValues);
- return;
- }
- }
- /**
- * parses HTML tag attributes from a buffer and sets them in an HtmlTag
- */
- private static void parseAttributes(HtmlTag tag, String s, int i, boolean escapeValues) throws ParseException {
- while (i < s.length()) {
- // skip whitespace
- while (i < s.length() && Character.isWhitespace(s.charAt(i))) {
- i++;
- }
- if (i == s.length()) {
- return;
- }
- // read the attribute name -- the rule might be looser than the RFC
- // specifies:
- // everything up to a space or an equal sign is included
- int start = i;
- for (; i < s.length() && !Character.isWhitespace(s.charAt(i)) && s.charAt(i) != '='; i++) {
- // just move forward
- }
- String attributeName = s.substring(start, i).toLowerCase(Locale.ENGLISH);
- if (attributeName.equals("/")) { //$NON-NLS-1$
- tag.setSelfTerminating(true);
- continue;
- }
- for (; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
- // just move forward
- }
- if (i == s.length() || s.charAt(i) != '=') {
- // no attribute value
- tag.setAttribute(attributeName, ""); //$NON-NLS-1$
- continue;
- }
- // skip whitespace to the start of attribute value
- for (i = i + 1; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
- // just move forward
- }
- if (i == s.length()) {
- return;
- }
- // read the attribute value -- the rule for unquoted attribute value
- // is
- // looser than the one in Conolly's W3C 1996 lexical analyzer draft:
- // everything
- // is included up to the next space
- String attributeValue;
- if (s.charAt(i) == '"') {
- start = ++i;
- for (; i < s.length() && s.charAt(i) != '"'; i++) {
- // just move forward
- }
- if (i == s.length()) {
- return; // shouldn't happen if input returned by nextToken
- }
- if (escapeValues) {
- attributeValue = unescape(s.substring(start, i));
- } else {
- attributeValue = s.substring(start, i);
- }
- i++;
- } else if (s.charAt(i) == '\'') {
- start = ++i;
- for (; i < s.length() && s.charAt(i) != '\''; i++) {
- // just move forward
- }
- if (i == s.length()) {
- return; // shouldn't happen if input returned by nextToken
- }
- attributeValue = unescape(s.substring(start, i));
- i++;
- } else {
- start = i;
- for (; i < s.length() && !Character.isWhitespace(s.charAt(i)); i++) {
- // just move forward
- }
- attributeValue = s.substring(start, i);
- }
- tag.setAttribute(attributeName, attributeValue);
- }
- }
- /**
- * Returns a string with HTML escapes changed into their corresponding characters.
- *
- * @deprecated use {@link StringEscapeUtils#unescapeHtml(String)} instead
- */
- @Deprecated
- public static String unescape(String s) {
- if (s.indexOf('&') == -1) {
- return s;
- } else {
- StringBuffer sb = new StringBuffer(s);
- unescape(sb);
- return sb.toString();
- }
- }
- /**
- * Replaces (in-place) HTML escapes in a StringBuffer with their corresponding characters.
- *
- * @deprecated use {@link StringEscapeUtils#unescapeHtml(String)} instead
- */
- @Deprecated
- public static StringBuffer unescape(StringBuffer sb) {
- int i = 0; // index into the unprocessed section of the buffer
- int j = 0; // index into the processed section of the buffer
- while (i < sb.length()) {
- char ch = sb.charAt(i);
- if (ch == '&') {
- int start = i;
- String escape = null;
- for (i = i + 1; i < sb.length(); i++) {
- ch = sb.charAt(i);
- if (!Character.isLetterOrDigit(ch) && !(ch == '#' && i == (start + 1))) {
- escape = sb.substring(start + 1, i);
- break;
- }
- }
- if (i == sb.length() && i != (start + 1)) {
- escape = sb.substring(start + 1);
- }
- if (escape != null) {
- Character character = parseReference(escape);
- if (character != null
- && !((0x0A == character || 0x0D == character || 0x09 == ch)
- || (character >= 0x20 && character <= 0xD7FF)
- || (character >= 0xE000 && character <= 0xFFFD) || (character >= 0x10000 && character <= 0x10FFFF))) {
- // Character is an invalid xml character
- // http://www.w3.org/TR/REC-xml/#charsets
- character = null;
- }
- if (character != null) {
- ch = character.charValue();
- } else {
- // not an HTML escape; rewind
- i = start;
- ch = '&';
- }
- }
- }
- sb.setCharAt(j, ch);
- i++;
- j++;
- }
- sb.setLength(j);
- return sb;
- }
- /**
- * Parses HTML character and entity references and returns the corresponding character.
- */
- private static Character parseReference(String s) {
- if (s.length() == 0) {
- return null;
- }
- if (s.charAt(0) == '#') {
- // character reference
- if (s.length() == 1) {
- return null;
- }
- try {
- int value;
- if (s.charAt(1) == 'x') {
- // Hex reference
- value = Integer.parseInt(s.substring(2), 16);
- } else {
- // Decimal reference
- value = Integer.parseInt(s.substring(1));
- }
- return new Character((char) value);
- } catch (NumberFormatException e) {
- return null;
- }
- } else {
- return entities.get(s);
- }
- }
- /**
- * Class for current token.
- */
- public static class Token {
- public static final Type EOF = new Type();
- public static final Type TEXT = new Type();
- public static final Type TAG = new Type();
- public static final Type COMMENT = new Type();
- /** token's type */
- private Type type;
- /** token's value */
- private final Object value;
- /** whitespace preceding the token */
- private final StringBuffer whitespace;
- /**
- * Constructor for the EOF token.
- */
- protected Token() {
- type = EOF;
- value = null;
- whitespace = null;
- }
- /**
- * Constructor for the HTML tag tokens.
- */
- protected Token(HtmlTag tag, StringBuffer whitespace) {
- type = TAG;
- value = tag;
- this.whitespace = whitespace;
- }
- /**
- * Constructor for regular text and comments.
- */
- protected Token(StringBuffer text, StringBuffer whitespace, boolean comment) {
- if (comment) {
- type = COMMENT;
- } else {
- type = TEXT;
- }
- this.value = text;
- this.whitespace = whitespace;
- }
- /**
- * Returns the token's type.
- */
- public Type getType() {
- return type;
- }
- /**
- * Returns the whitespace preceding the token.
- */
- public StringBuffer getWhitespace() {
- return whitespace;
- }
- /**
- * Returns the token's value. This is an HtmlTag for tokens of type <code>TAG</code> and a StringBuffer for
- * tokens of type <code>TEXT</code> and <code>COMMENT</code>. For tokens of type <code>EOF</code>, the value is
- * <code>null</code>.
- */
- public Object getValue() {
- return value;
- }
- /**
- * Returns the string representation of the token, including the preceding whitespace.
- */
- @Override
- public String toString() {
- StringBuffer sb = new StringBuffer();
- if (whitespace != null) {
- sb.append(whitespace);
- }
- if (value != null) {
- if (type == TAG) {
- // sb.append('<');
- } else if (type == COMMENT) {
- sb.append("<!--"); //$NON-NLS-1$
- }
- sb.append(value);
- if (type == TAG) {
- // if(value instanceof HtmlTag) {
- // HtmlTag htmlTag = (HtmlTag)value;
- // if(htmlTag.getTagName().startsWith("?xml")) {
- // sb.append("?>");
- // }
- // } else {
- // sb.append('>');
- } else if (type == COMMENT) {
- sb.append("-->"); //$NON-NLS-1$
- }
- }
- return sb.toString();
- }
- /**
- * Private enum class for token type.
- */
- private static class Type {
- private Type() {
- // don't need to do anything
- }
- }
- }
- /**
- * Enum class for parser state.
- */
- private static class State {
- static final State EOF = new State();
- static final State COMMENT = new State();
- static final State TEXT = new State();
- static final State TAG = new State();
- static final State WS = new State();
- static final State TAG_QUOTE = new State();
- private State() {
- // don't need to do anything
- }
- }
- /** names and values of HTML entity references */
- private static HashMap<String, Character> entities;
- /*
- * Based on ISO 8879.
- *
- * Portions (c) International Organization for Standardization 1986
- * Permission to copy in any form is granted for use with conforming SGML
- * systems and applications as defined in ISO 8879, provided this notice is
- * included in all copies.
- *
- */
- static {
- entities = new HashMap<String, Character>();
- entities.put("nbsp", Character.valueOf('\240')); // no-break //$NON-NLS-1$
- // space =
- // non-breaking
- // space
- entities.put("iexcl", Character.valueOf('\241')); // inverted //$NON-NLS-1$
- // exclamation
- // mark
- entities.put("cent", Character.valueOf('\242')); // cent sign //$NON-NLS-1$
- entities.put("pound", Character.valueOf('\243')); // pound //$NON-NLS-1$
- // sign
- entities.put("curren", Character.valueOf('\244')); // currency //$NON-NLS-1$
- // sign
- entities.put("yen", Character.valueOf('\245')); // yen sign = //$NON-NLS-1$
- // yuan sign
- entities.put("brvbar", Character.valueOf('\246')); // broken //$NON-NLS-1$
- // bar =
- // broken
- // vertical
- // bar
- entities.put("sect", Character.valueOf('\247')); // section //$NON-NLS-1$
- // sign
- entities.put("uml", Character.valueOf('\250')); // diaeresis = //$NON-NLS-1$
- // spacing
- // diaeresis
- entities.put("copy", Character.valueOf('\251')); // copyright //$NON-NLS-1$
- // sign
- entities.put("ordf", Character.valueOf('\252')); // feminine //$NON-NLS-1$
- // ordinal
- // indicator
- entities.put("laquo", Character.valueOf('\253')); // left-pointing //$NON-NLS-1$
- // double
- // angle
- // quotation
- // mark =
- // left
- // pointing
- // guillemet
- entities.put("not", Character.valueOf('\254')); // not sign //$NON-NLS-1$
- entities.put("shy", Character.valueOf('\255')); // soft hyphen = //$NON-NLS-1$
- // discretionary
- // hyphen
- entities.put("reg", Character.valueOf('\256')); // registered //$NON-NLS-1$
- // sign =
- // registered
- // trade mark
- // sign
- entities.put("macr", Character.valueOf('\257')); // macron = //$NON-NLS-1$
- // spacing
- // macron =
- // overline
- // = APL
- // overbar
- entities.put("deg", Character.valueOf('\260')); // degree sign //$NON-NLS-1$
- entities.put("plusmn", Character.valueOf('\261')); // plus-minus //$NON-NLS-1$
- // sign =
- // plus-or-minus
- // sign
- entities.put("sup2", Character.valueOf('\262')); // superscript //$NON-NLS-1$
- // two =
- // superscript
- // digit two
- // = squared
- entities.put("sup3", Character.valueOf('\263')); // superscript //$NON-NLS-1$
- // three =
- // superscript
- // digit
- // three =
- // cubed
- entities.put("acute", Character.valueOf('\264')); // acute //$NON-NLS-1$
- // accent =
- // spacing
- // acute
- entities.put("micro", Character.valueOf('\265')); // micro //$NON-NLS-1$
- // sign
- entities.put("para", Character.valueOf('\266')); // pilcrow //$NON-NLS-1$
- // sign =
- // paragraph
- // sign
- entities.put("middot", Character.valueOf('\267')); // middle //$NON-NLS-1$
- // dot =
- // Georgian
- // comma =
- // Greek
- // middle
- // dot
- entities.put("cedil", Character.valueOf('\270')); // cedilla = //$NON-NLS-1$
- // spacing
- // cedilla
- entities.put("sup1", Character.valueOf('\271')); // superscript //$NON-NLS-1$
- // one =
- // superscript
- // digit one
- entities.put("ordm", Character.valueOf('\272')); // masculine //$NON-NLS-1$
- // ordinal
- // indicator
- entities.put("raquo", Character.valueOf('\273')); // right-pointing //$NON-NLS-1$
- // double
- // angle
- // quotation
- // mark =
- // right
- // pointing
- // guillemet
- entities.put("frac14", Character.valueOf('\274')); // vulgar //$NON-NLS-1$
- // fraction
- // one
- // quarter =
- // fraction
- // one
- // quarter
- entities.put("frac12", Character.valueOf('\275')); // vulgar //$NON-NLS-1$
- // fraction
- // one half
- // =
- // fraction
- // one half
- entities.put("frac34", Character.valueOf('\276')); // vulgar //$NON-NLS-1$
- // fraction
- // three
- // quarters
- // =
- // fraction
- // three
- // quarters
- entities.put("iquest", Character.valueOf('\277')); // inverted //$NON-NLS-1$
- // question
- // mark =
- // turned
- // question
- // mark
- entities.put("Agrave", Character.valueOf('\300')); // latin //$NON-NLS-1$
- // capital
- // letter A
- // with
- // grave =
- // latin
- // capital
- // letter A
- // grave
- entities.put("Aacute", Character.valueOf('\301')); // latin //$NON-NLS-1$
- // capital
- // letter A
- // with
- // acute
- entities.put("Acirc", Character.valueOf('\302')); // latin //$NON-NLS-1$
- // capital
- // letter A
- // with
- // circumflex
- entities.put("Atilde", Character.valueOf('\303')); // latin //$NON-NLS-1$
- // capital
- // letter A
- // with
- // tilde
- entities.put("Auml", Character.valueOf('\304')); // latin //$NON-NLS-1$
- // capital
- // letter A
- // with
- // diaeresis
- entities.put("Aring", Character.valueOf('\305')); // latin //$NON-NLS-1$
- // capital
- // letter A
- // with ring
- // above =
- // latin
- // capital
- // letter A
- // ring
- entities.put("AElig", Character.valueOf('\306')); // latin //$NON-NLS-1$
- // capital
- // letter AE
- // = latin
- // capital
- // ligature
- // AE
- entities.put("Ccedil", Character.valueOf('\307')); // latin //$NON-NLS-1$
- // capital
- // letter C
- // with
- // cedilla
- entities.put("Egrave", Character.valueOf('\310')); // latin //$NON-NLS-1$
- // capital
- // letter E
- // with
- // grave
- entities.put("Eacute", Character.valueOf('\311')); // latin //$NON-NLS-1$
- // capital
- // letter E
- // with
- // acute
- entities.put("Ecirc", Character.valueOf('\312')); // latin //$NON-NLS-1$
- // capital
- // letter E
- // with
- // circumflex
- entities.put("Euml", Character.valueOf('\313')); // latin //$NON-NLS-1$
- // capital
- // letter E
- // with
- // diaeresis
- entities.put("Igrave", Character.valueOf('\314')); // latin //$NON-NLS-1$
- // capital
- // letter I
- // with
- // grave
- entities.put("Iacute", Character.valueOf('\315')); // latin //$NON-NLS-1$
- // capital
- // letter I
- // with
- // acute
- entities.put("Icirc", Character.valueOf('\316')); // latin //$NON-NLS-1$
- // capital
- // letter I
- // with
- // circumflex
- entities.put("Iuml", Character.valueOf('\317')); // latin //$NON-NLS-1$
- // capital
- // letter I
- // with
- // diaeresis
- entities.put("ETH", Character.valueOf('\320')); // latin capital //$NON-NLS-1$
- // letter ETH
- entities.put("Ntilde", Character.valueOf('\321')); // latin //$NON-NLS-1$
- // capital
- // letter N
- // with
- // tilde
- entities.put("Ograve", Character.valueOf('\322')); // latin //$NON-NLS-1$
- // capital
- // letter O
- // with
- // grave
- entities.put("Oacute", Character.valueOf('\323')); // latin //$NON-NLS-1$
- // capital
- // letter O
- // with
- // acute
- entities.put("Ocirc", Character.valueOf('\324')); // latin //$NON-NLS-1$
- // capital
- // letter O
- // with
- // circumflex
- entities.put("Otilde", Character.valueOf('\325')); // latin //$NON-NLS-1$
- // capital
- // letter O
- // with
- // tilde
- entities.put("Ouml", Character.valueOf('\326')); // latin //$NON-NLS-1$
- // capital
- // letter O
- // with
- // diaeresis
- entities.put("times", Character.valueOf('\327')); // multiplication //$NON-NLS-1$
- // sign
- entities.put("Oslash", Character.valueOf('\330')); // latin //$NON-NLS-1$
- // capital
- // letter O
- // with
- // stroke =
- // latin
- // capital
- // letter O
- // slash
- entities.put("Ugrave", Character.valueOf('\331')); // latin //$NON-NLS-1$
- // capital
- // letter U
- // with
- // grave
- entities.put("Uacute", Character.valueOf('\332')); // latin //$NON-NLS-1$
- // capital
- // letter U
- // with
- // acute
- entities.put("Ucirc", Character.valueOf('\333')); // latin //$NON-NLS-1$
- // capital
- // letter U
- // with
- // circumflex
- entities.put("Uuml", Character.valueOf('\334')); // latin //$NON-NLS-1$
- // capital
- // letter U
- // with
- // diaeresis
- entities.put("Yacute", Character.valueOf('\335')); // latin //$NON-NLS-1$
- // capital
- // letter Y
- // with
- // acute
- entities.put("THORN", Character.valueOf('\336')); // latin //$NON-NLS-1$
- // capital
- // letter
- // THORN
- entities.put("szlig", Character.valueOf('\337')); // latin //$NON-NLS-1$
- // small
- // letter
- // sharp s =
- // ess-zed
- entities.put("agrave", Character.valueOf('\340')); // latin //$NON-NLS-1$
- // small
- // letter a
- // with
- // grave =
- // latin
- // small
- // letter a
- // grave
- entities.put("aacute", Character.valueOf('\341')); // latin //$NON-NLS-1$
- // small
- // letter a
- // with
- // acute
- entities.put("acirc", Character.valueOf('\342')); // latin //$NON-NLS-1$
- // small
- // letter a
- // with
- // circumflex
- entities.put("atilde", Character.valueOf('\343')); // latin //$NON-NLS-1$
- // small
- // letter a
- // with
- // tilde
- entities.put("auml", Character.valueOf('\344')); // latin //$NON-NLS-1$
- // small
- // letter a
- // with
- // diaeresis
- entities.put("aring", Character.valueOf('\345')); // latin //$NON-NLS-1$
- // small
- // letter a
- // with ring
- // above =
- // latin
- // small
- // letter a
- // ring
- entities.put("aelig", Character.valueOf('\346')); // latin //$NON-NLS-1$
- // small
- // letter ae
- // = latin
- // small
- // ligature
- // ae
- entities.put("ccedil", Character.valueOf('\347')); // latin //$NON-NLS-1$
- // small
- // letter c
- // with
- // cedilla
- entities.put("egrave", Character.valueOf('\350')); // latin //$NON-NLS-1$
- // small
- // letter e
- // with
- // grave
- entities.put("eacute", Character.valueOf('\351')); // latin //$NON-NLS-1$
- // small
- // letter e
- // with
- // acute
- entities.put("ecirc", Character.valueOf('\352')); // latin //$NON-NLS-1$
- // small
- // letter e
- // with
- // circumflex
- entities.put("euml", Character.valueOf('\353')); // latin //$NON-NLS-1$
- // small
- // letter e
- // with
- // diaeresis
- entities.put("igrave", Character.valueOf('\354')); // latin //$NON-NLS-1$
- // small
- // letter i
- // with
- // grave
- entities.put("iacute", Character.valueOf('\355')); // latin //$NON-NLS-1$
- // small
- // letter i
- // with
- // acute
- entities.put("icirc", Character.valueOf('\356')); // latin //$NON-NLS-1$
- // small
- // letter i
- // with
- // circumflex
- entities.put("iuml", Character.valueOf('\357')); // latin //$NON-NLS-1$
- // small
- // letter i
- // with
- // diaeresis
- entities.put("eth", Character.valueOf('\360')); // latin small //$NON-NLS-1$
- // letter eth
- entities.put("ntilde", Character.valueOf('\361')); // latin //$NON-NLS-1$
- // small
- // letter n
- // with
- // tilde
- entities.put("ograve", Character.valueOf('\362')); // latin //$NON-NLS-1$
- // small
- // letter o
- // with
- // grave
- entities.put("oacute", Character.valueOf('\363')); // latin //$NON-NLS-1$
- // small
- // letter o
- // with
- // acute
- entities.put("ocirc", Character.valueOf('\364')); // latin //$NON-NLS-1$
- // small
- // letter o
- // with
- // circumflex
- entities.put("otilde", Character.valueOf('\365')); // latin //$NON-NLS-1$
- // small
- // letter o
- // with
- // tilde
- entities.put("ouml", Character.valueOf('\366')); // latin //$NON-NLS-1$
- // small
- // letter o
- // with
- // diaeresis
- entities.put("divide", Character.valueOf('\367')); // division //$NON-NLS-1$
- // sign
- entities.put("oslash", Character.valueOf('\370')); // latin //$NON-NLS-1$
- // small
- // letter o
- // with
- // stroke =
- // latin
- // small
- // letter o
- // slash
- entities.put("ugrave", Character.valueOf('\371')); // latin //$NON-NLS-1$
- // small
- // letter u
- // with
- // grave
- entities.put("uacute", Character.valueOf('\372')); // latin //$NON-NLS-1$
- // small
- // letter u
- // with
- // acute
- entities.put("ucirc", Character.valueOf('\373')); // latin //$NON-NLS-1$
- // small
- // letter u
- // with
- // circumflex
- entities.put("uuml", Character.valueOf('\374')); // latin //$NON-NLS-1$
- // small
- // letter u
- // with
- // diaeresis
- entities.put("yacute", Character.valueOf('\375')); // latin //$NON-NLS-1$
- // small
- // letter y
- // with
- // acute
- entities.put("thorn", Character.valueOf('\376')); // latin //$NON-NLS-1$
- // small
- // letter
- // thorn
- entities.put("yuml", Character.valueOf('\377')); // latin //$NON-NLS-1$
- // small
- // letter y
- // with
- // diaeresis
- // Special characters
- entities.put("quot", Character.valueOf('\42')); // quotation //$NON-NLS-1$
- // mark = APL
- // quote
- entities.put("amp", Character.valueOf('\46')); // ampersand //$NON-NLS-1$
- entities.put("lt", Character.valueOf('\74')); // less-than //$NON-NLS-1$
- // sign
- entities.put("gt", Character.valueOf('\76')); // greater-than //$NON-NLS-1$
- // sign
- // Latin Extended-A
- entities.put("OElig", Character.valueOf('\u0152')); // latin //$NON-NLS-1$
- // capital
- // ligature
- // OE
- entities.put("oelig", Character.valueOf('\u0153')); // latin //$NON-NLS-1$
- // small
- // ligature
- // oe,
- // ligature
- // is a
- // misnomer,
- // this is a
- // separate
- // character
- // in some
- // languages
- entities.put("Scaron", Character.valueOf('\u0160')); // latin //$NON-NLS-1$
- // capital
- // letter
- // S
- // with
- // caron
- entities.put("scaron", Character.valueOf('\u0161')); // latin //$NON-NLS-1$
- // small
- // letter
- // s
- // with
- // caron
- entities.put("Yuml", Character.valueOf('\u0178')); // latin //$NON-NLS-1$
- // capital
- // letter Y
- // with
- // diaeresis
- // Spacing Modifier Letters
- entities.put("circ", Character.valueOf('\u02c6')); // modifier //$NON-NLS-1$
- // letter
- // circumflex
- // accent
- entities.put("tilde", Character.valueOf('\u02dc')); // small //$NON-NLS-1$
- // tilde
- // General punctuation
- entities.put("ensp", Character.valueOf('\u2002')); // en space //$NON-NLS-1$
- entities.put("emsp", Character.valueOf('\u2003')); // em space //$NON-NLS-1$
- entities.put("thinsp", Character.valueOf('\u2009')); // thin //$NON-NLS-1$
- // space
- entities.put("zwnj", Character.valueOf('\u200c')); // zero //$NON-NLS-1$
- // width
- // non-joiner
- entities.put("zwj", Character.valueOf('\u200d')); // zero //$NON-NLS-1$
- // width
- // joiner
- entities.put("lrm", Character.valueOf('\u200e')); // left-to-right //$NON-NLS-1$
- // mark
- entities.put("rlm", Character.valueOf('\u200f')); // right-to-left //$NON-NLS-1$
- // mark
- entities.put("ndash", Character.valueOf('\u2013')); // en dash //$NON-NLS-1$
- entities.put("mdash", Character.valueOf('\u2014')); // em dash //$NON-NLS-1$
- entities.put("lsquo", Character.valueOf('\u2018')); // left //$NON-NLS-1$
- // single
- // quotation
- // mark
- entities.put("rsquo", Character.valueOf('\u2019')); // right //$NON-NLS-1$
- // single
- // quotation
- // mark
- entities.put("sbquo", Character.valueOf('\u201a')); // single //$NON-NLS-1$
- // low-9
- // quotation
- // mark
- entities.put("ldquo", Character.valueOf('\u201c')); // left //$NON-NLS-1$
- // double
- // quotation
- // mark
- entities.put("rdquo", Character.valueOf('\u201d')); // right //$NON-NLS-1$
- // double
- // quotation
- // mark
- entities.put("bdquo", Character.valueOf('\u201e')); // double //$NON-NLS-1$
- // low-9
- // quotation
- // mark
- entities.put("dagger", Character.valueOf('\u2020')); // dagger //$NON-NLS-1$
- entities.put("Dagger", Character.valueOf('\u2021')); // double //$NON-NLS-1$
- // dagger
- entities.put("permil", Character.valueOf('\u2030')); // per //$NON-NLS-1$
- // mille
- // sign
- entities.put("lsaquo", Character.valueOf('\u2039')); // single //$NON-NLS-1$
- // left-pointing
- // angle
- // quotation
- // mark,
- // not
- // yet
- // standardized
- entities.put("rsaquo", Character.valueOf('\u203a')); // single //$NON-NLS-1$
- // right-pointing
- // angle
- // quotation
- // mark,
- // not
- // yet
- // standardized
- entities.put("euro", Character.valueOf('\u20ac')); // euro sign //$NON-NLS-1$
- }
- }