PageRenderTime 59ms CodeModel.GetById 30ms RepoModel.GetById 0ms app.codeStats 0ms

/eclipse-mylyn-R_3_8_0-fetched-src/org.eclipse.mylyn.commons/org.eclipse.mylyn.commons.net/src/org/eclipse/mylyn/commons/net/HtmlStreamTokenizer.java

#
Java | 1145 lines | 1086 code | 18 blank | 41 comment | 50 complexity | 7b3d225cf2991b3f0afaf38b1120ee3c MD5 | raw file
Possible License(s): Apache-2.0
  1. /*******************************************************************************
  2. * Copyright (c) 2004, 2008 Tasktop Technologies and others.
  3. * All rights reserved. This program and the accompanying materials
  4. * are made available under the terms of the Eclipse Public License v1.0
  5. * which accompanies this distribution, and is available at
  6. * http://www.eclipse.org/legal/epl-v10.html
  7. *
  8. * Contributors:
  9. * Tasktop Technologies - initial API and implementation
  10. *******************************************************************************/
  11. package org.eclipse.mylyn.commons.net;
  12. import java.io.BufferedReader;
  13. import java.io.IOException;
  14. import java.io.Reader;
  15. import java.net.URL;
  16. import java.text.ParseException;
  17. import java.util.HashMap;
  18. import java.util.Locale;
  19. import org.apache.commons.lang.StringEscapeUtils;
  20. /**
  21. * Parses HTML into tokens.
  22. *
  23. * @author Shawn Minto
  24. * @since 2.0
  25. * @deprecated use org.eclipse.mylyn.commons.core.HtmlStreamTokenizer instead.
  26. */
  27. @Deprecated
  28. public class HtmlStreamTokenizer {
  29. /** parser state */
  30. private State state;
  31. /** reader from which to parse the text */
  32. private final BufferedReader in;
  33. /** base URL for resolving relative URLs */
  34. private final URL base;
  35. /** buffer holding the text of the current token */
  36. private final StringBuffer textBuffer;
  37. /** buffer holding whitespace preceding the current token */
  38. private final StringBuffer whitespaceBuffer;
  39. /**
  40. * holds a token that was read and then put back in the queue to be returned again on <code>nextToken</code> call
  41. */
  42. private Token pushbackToken;
  43. /**
  44. * holds a character that was read and then determined not to be part of the current token
  45. */
  46. private int pushbackChar;
  47. /** current quote delimiter (single or double) */
  48. private int quoteChar;
  49. /** Allow class client to choose if tag attributes are escaped or not */
  50. private boolean escapeTagValues;
  /**
   * Creates a tokenizer that starts in text mode with attribute-value unescaping switched on.
   *
   * @param in
   *            reader for the HTML document to tokenize; it is wrapped in a {@link BufferedReader}
   * @param base
   *            URL for resolving relative URLs
   */
  public HtmlStreamTokenizer(Reader in, URL base) {
    this.in = new BufferedReader(in);
    this.base = base;
    this.textBuffer = new StringBuffer();
    this.whitespaceBuffer = new StringBuffer();
    this.pushbackChar = 0; // 0 == no character pushed back
    this.state = State.TEXT;
    this.escapeTagValues = true;
  }
  68. public void escapeTagAttributes(boolean value) {
  69. escapeTagValues = value;
  70. }
  71. /**
  72. * Returns the next token from the stream.
  73. */
  74. public Token nextToken() throws IOException, ParseException {
  75. if (pushbackToken != null) {
  76. Token token = pushbackToken;
  77. pushbackToken = null;
  78. return token;
  79. }
  80. int closingComment = 0;
  81. textBuffer.setLength(0);
  82. whitespaceBuffer.setLength(0);
  83. do {
  84. int ch;
  85. if (pushbackChar != 0) {
  86. ch = pushbackChar;
  87. pushbackChar = 0;
  88. } else {
  89. ch = in.read();
  90. }
  91. if (ch < 0) {
  92. State oldState = state;
  93. state = State.EOF;
  94. if (textBuffer.length() > 0 && oldState == State.TEXT) {
  95. return new Token(textBuffer, whitespaceBuffer, false);
  96. } else {
  97. return new Token();
  98. }
  99. }
  100. if (state == State.TEXT) {
  101. if (ch == '<') {
  102. state = State.TAG;
  103. if (textBuffer.length() > 0) {
  104. return new Token(textBuffer, whitespaceBuffer, false);
  105. }
  106. } else if (Character.isWhitespace((char) ch)) {
  107. pushbackChar = ch;
  108. state = State.WS;
  109. if (textBuffer.length() > 0) {
  110. return new Token(textBuffer, whitespaceBuffer, false);
  111. }
  112. } else {
  113. textBuffer.append((char) ch);
  114. }
  115. } else if (state == State.WS) {
  116. if (!Character.isWhitespace((char) ch)) {
  117. pushbackChar = ch;
  118. state = State.TEXT;
  119. } else {
  120. whitespaceBuffer.append((char) ch);
  121. }
  122. } else if (state == State.TAG) {
  123. if (ch == '>') {
  124. state = State.TEXT;
  125. HtmlTag tag = new HtmlTag(base);
  126. parseTag(textBuffer.toString(), tag, escapeTagValues);
  127. return new Token(tag, whitespaceBuffer);
  128. }
  129. if (ch == '<' && textBuffer.length() == 0) {
  130. textBuffer.append("<<"); //$NON-NLS-1$
  131. state = State.TEXT;
  132. } else if (ch == '-' && textBuffer.length() == 2 && textBuffer.charAt(1) == '-'
  133. && textBuffer.charAt(0) == '!') {
  134. textBuffer.setLength(0);
  135. state = State.COMMENT;
  136. } else if (ch == '\'' || ch == '"') {
  137. quoteChar = ch;
  138. textBuffer.append((char) ch);
  139. state = State.TAG_QUOTE;
  140. } else {
  141. textBuffer.append((char) ch);
  142. }
  143. } else if (state == State.TAG_QUOTE) {
  144. if (ch == '>') {
  145. pushbackChar = ch;
  146. state = State.TAG;
  147. } else {
  148. textBuffer.append((char) ch);
  149. if (ch == quoteChar) {
  150. state = State.TAG;
  151. }
  152. }
  153. } else if (state == State.COMMENT) {
  154. if (ch == '>' && closingComment >= 2) {
  155. textBuffer.setLength(textBuffer.length() - 2);
  156. closingComment = 0;
  157. state = State.TEXT;
  158. return new Token(textBuffer, whitespaceBuffer, true);
  159. }
  160. if (ch == '-') {
  161. closingComment++;
  162. } else {
  163. closingComment = 0;
  164. }
  165. textBuffer.append((char) ch);
  166. }
  167. } while (true);
  168. }
  169. /**
  170. * Pushes the token back into the queue, to be returned by the subsequent call to <code>nextToken</code>
  171. */
  172. public void pushback(Token token) {
  173. pushbackToken = token;
  174. }
  175. /**
  176. * Parses an HTML tag out of a string of characters.
  177. */
  178. private static void parseTag(String s, HtmlTag tag, boolean escapeValues) throws ParseException {
  179. int i = 0;
  180. for (; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
  181. // just move forward
  182. }
  183. if (i == s.length()) {
  184. throw new ParseException("parse empty tag", 0); //$NON-NLS-1$
  185. }
  186. int start = i;
  187. for (; i < s.length() && !Character.isWhitespace(s.charAt(i)); i++) {
  188. // just move forward
  189. }
  190. tag.setTagName(s.substring(start, i));
  191. for (; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
  192. // just move forward
  193. }
  194. if (i == s.length()) {
  195. return;
  196. } else {
  197. parseAttributes(tag, s, i, escapeValues);
  198. return;
  199. }
  200. }
  201. /**
  202. * parses HTML tag attributes from a buffer and sets them in an HtmlTag
  203. */
  204. private static void parseAttributes(HtmlTag tag, String s, int i, boolean escapeValues) throws ParseException {
  205. while (i < s.length()) {
  206. // skip whitespace
  207. while (i < s.length() && Character.isWhitespace(s.charAt(i))) {
  208. i++;
  209. }
  210. if (i == s.length()) {
  211. return;
  212. }
  213. // read the attribute name -- the rule might be looser than the RFC
  214. // specifies:
  215. // everything up to a space or an equal sign is included
  216. int start = i;
  217. for (; i < s.length() && !Character.isWhitespace(s.charAt(i)) && s.charAt(i) != '='; i++) {
  218. // just move forward
  219. }
  220. String attributeName = s.substring(start, i).toLowerCase(Locale.ENGLISH);
  221. if (attributeName.equals("/")) { //$NON-NLS-1$
  222. tag.setSelfTerminating(true);
  223. continue;
  224. }
  225. for (; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
  226. // just move forward
  227. }
  228. if (i == s.length() || s.charAt(i) != '=') {
  229. // no attribute value
  230. tag.setAttribute(attributeName, ""); //$NON-NLS-1$
  231. continue;
  232. }
  233. // skip whitespace to the start of attribute value
  234. for (i = i + 1; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
  235. // just move forward
  236. }
  237. if (i == s.length()) {
  238. return;
  239. }
  240. // read the attribute value -- the rule for unquoted attribute value
  241. // is
  242. // looser than the one in Conolly's W3C 1996 lexical analyzer draft:
  243. // everything
  244. // is included up to the next space
  245. String attributeValue;
  246. if (s.charAt(i) == '"') {
  247. start = ++i;
  248. for (; i < s.length() && s.charAt(i) != '"'; i++) {
  249. // just move forward
  250. }
  251. if (i == s.length()) {
  252. return; // shouldn't happen if input returned by nextToken
  253. }
  254. if (escapeValues) {
  255. attributeValue = unescape(s.substring(start, i));
  256. } else {
  257. attributeValue = s.substring(start, i);
  258. }
  259. i++;
  260. } else if (s.charAt(i) == '\'') {
  261. start = ++i;
  262. for (; i < s.length() && s.charAt(i) != '\''; i++) {
  263. // just move forward
  264. }
  265. if (i == s.length()) {
  266. return; // shouldn't happen if input returned by nextToken
  267. }
  268. attributeValue = unescape(s.substring(start, i));
  269. i++;
  270. } else {
  271. start = i;
  272. for (; i < s.length() && !Character.isWhitespace(s.charAt(i)); i++) {
  273. // just move forward
  274. }
  275. attributeValue = s.substring(start, i);
  276. }
  277. tag.setAttribute(attributeName, attributeValue);
  278. }
  279. }
  280. /**
  281. * Returns a string with HTML escapes changed into their corresponding characters.
  282. *
  283. * @deprecated use {@link StringEscapeUtils#unescapeHtml(String)} instead
  284. */
  285. @Deprecated
  286. public static String unescape(String s) {
  287. if (s.indexOf('&') == -1) {
  288. return s;
  289. } else {
  290. StringBuffer sb = new StringBuffer(s);
  291. unescape(sb);
  292. return sb.toString();
  293. }
  294. }
  295. /**
  296. * Replaces (in-place) HTML escapes in a StringBuffer with their corresponding characters.
  297. *
  298. * @deprecated use {@link StringEscapeUtils#unescapeHtml(String)} instead
  299. */
  300. @Deprecated
  301. public static StringBuffer unescape(StringBuffer sb) {
  302. int i = 0; // index into the unprocessed section of the buffer
  303. int j = 0; // index into the processed section of the buffer
  304. while (i < sb.length()) {
  305. char ch = sb.charAt(i);
  306. if (ch == '&') {
  307. int start = i;
  308. String escape = null;
  309. for (i = i + 1; i < sb.length(); i++) {
  310. ch = sb.charAt(i);
  311. if (!Character.isLetterOrDigit(ch) && !(ch == '#' && i == (start + 1))) {
  312. escape = sb.substring(start + 1, i);
  313. break;
  314. }
  315. }
  316. if (i == sb.length() && i != (start + 1)) {
  317. escape = sb.substring(start + 1);
  318. }
  319. if (escape != null) {
  320. Character character = parseReference(escape);
  321. if (character != null
  322. && !((0x0A == character || 0x0D == character || 0x09 == ch)
  323. || (character >= 0x20 && character <= 0xD7FF)
  324. || (character >= 0xE000 && character <= 0xFFFD) || (character >= 0x10000 && character <= 0x10FFFF))) {
  325. // Character is an invalid xml character
  326. // http://www.w3.org/TR/REC-xml/#charsets
  327. character = null;
  328. }
  329. if (character != null) {
  330. ch = character.charValue();
  331. } else {
  332. // not an HTML escape; rewind
  333. i = start;
  334. ch = '&';
  335. }
  336. }
  337. }
  338. sb.setCharAt(j, ch);
  339. i++;
  340. j++;
  341. }
  342. sb.setLength(j);
  343. return sb;
  344. }
  345. /**
  346. * Parses HTML character and entity references and returns the corresponding character.
  347. */
  348. private static Character parseReference(String s) {
  349. if (s.length() == 0) {
  350. return null;
  351. }
  352. if (s.charAt(0) == '#') {
  353. // character reference
  354. if (s.length() == 1) {
  355. return null;
  356. }
  357. try {
  358. int value;
  359. if (s.charAt(1) == 'x') {
  360. // Hex reference
  361. value = Integer.parseInt(s.substring(2), 16);
  362. } else {
  363. // Decimal reference
  364. value = Integer.parseInt(s.substring(1));
  365. }
  366. return new Character((char) value);
  367. } catch (NumberFormatException e) {
  368. return null;
  369. }
  370. } else {
  371. return entities.get(s);
  372. }
  373. }
  374. /**
  375. * Class for current token.
  376. */
  377. public static class Token {
  378. public static final Type EOF = new Type();
  379. public static final Type TEXT = new Type();
  380. public static final Type TAG = new Type();
  381. public static final Type COMMENT = new Type();
  382. /** token's type */
  383. private Type type;
  384. /** token's value */
  385. private final Object value;
  386. /** whitespace preceding the token */
  387. private final StringBuffer whitespace;
  388. /**
  389. * Constructor for the EOF token.
  390. */
  391. protected Token() {
  392. type = EOF;
  393. value = null;
  394. whitespace = null;
  395. }
  396. /**
  397. * Constructor for the HTML tag tokens.
  398. */
  399. protected Token(HtmlTag tag, StringBuffer whitespace) {
  400. type = TAG;
  401. value = tag;
  402. this.whitespace = whitespace;
  403. }
  404. /**
  405. * Constructor for regular text and comments.
  406. */
  407. protected Token(StringBuffer text, StringBuffer whitespace, boolean comment) {
  408. if (comment) {
  409. type = COMMENT;
  410. } else {
  411. type = TEXT;
  412. }
  413. this.value = text;
  414. this.whitespace = whitespace;
  415. }
  416. /**
  417. * Returns the token's type.
  418. */
  419. public Type getType() {
  420. return type;
  421. }
  422. /**
  423. * Returns the whitespace preceding the token.
  424. */
  425. public StringBuffer getWhitespace() {
  426. return whitespace;
  427. }
  428. /**
  429. * Returns the token's value. This is an HtmlTag for tokens of type <code>TAG</code> and a StringBuffer for
  430. * tokens of type <code>TEXT</code> and <code>COMMENT</code>. For tokens of type <code>EOF</code>, the value is
  431. * <code>null</code>.
  432. */
  433. public Object getValue() {
  434. return value;
  435. }
  436. /**
  437. * Returns the string representation of the token, including the preceding whitespace.
  438. */
  439. @Override
  440. public String toString() {
  441. StringBuffer sb = new StringBuffer();
  442. if (whitespace != null) {
  443. sb.append(whitespace);
  444. }
  445. if (value != null) {
  446. if (type == TAG) {
  447. // sb.append('<');
  448. } else if (type == COMMENT) {
  449. sb.append("<!--"); //$NON-NLS-1$
  450. }
  451. sb.append(value);
  452. if (type == TAG) {
  453. // if(value instanceof HtmlTag) {
  454. // HtmlTag htmlTag = (HtmlTag)value;
  455. // if(htmlTag.getTagName().startsWith("?xml")) {
  456. // sb.append("?>");
  457. // }
  458. // } else {
  459. // sb.append('>');
  460. } else if (type == COMMENT) {
  461. sb.append("-->"); //$NON-NLS-1$
  462. }
  463. }
  464. return sb.toString();
  465. }
  466. /**
  467. * Private enum class for token type.
  468. */
  469. private static class Type {
  470. private Type() {
  471. // don't need to do anything
  472. }
  473. }
  474. }
  475. /**
  476. * Enum class for parser state.
  477. */
  478. private static class State {
  479. static final State EOF = new State();
  480. static final State COMMENT = new State();
  481. static final State TEXT = new State();
  482. static final State TAG = new State();
  483. static final State WS = new State();
  484. static final State TAG_QUOTE = new State();
  485. private State() {
  486. // don't need to do anything
  487. }
  488. }
  489. /** names and values of HTML entity references */
  490. private static HashMap<String, Character> entities;
  491. /*
  492. * Based on ISO 8879.
  493. *
  494. * Portions (c) International Organization for Standardization 1986
  495. * Permission to copy in any form is granted for use with conforming SGML
  496. * systems and applications as defined in ISO 8879, provided this notice is
  497. * included in all copies.
  498. *
  499. */
  500. static {
  501. entities = new HashMap<String, Character>();
  502. entities.put("nbsp", Character.valueOf('\240')); // no-break //$NON-NLS-1$
  503. // space =
  504. // non-breaking
  505. // space
  506. entities.put("iexcl", Character.valueOf('\241')); // inverted //$NON-NLS-1$
  507. // exclamation
  508. // mark
  509. entities.put("cent", Character.valueOf('\242')); // cent sign //$NON-NLS-1$
  510. entities.put("pound", Character.valueOf('\243')); // pound //$NON-NLS-1$
  511. // sign
  512. entities.put("curren", Character.valueOf('\244')); // currency //$NON-NLS-1$
  513. // sign
  514. entities.put("yen", Character.valueOf('\245')); // yen sign = //$NON-NLS-1$
  515. // yuan sign
  516. entities.put("brvbar", Character.valueOf('\246')); // broken //$NON-NLS-1$
  517. // bar =
  518. // broken
  519. // vertical
  520. // bar
  521. entities.put("sect", Character.valueOf('\247')); // section //$NON-NLS-1$
  522. // sign
  523. entities.put("uml", Character.valueOf('\250')); // diaeresis = //$NON-NLS-1$
  524. // spacing
  525. // diaeresis
  526. entities.put("copy", Character.valueOf('\251')); // copyright //$NON-NLS-1$
  527. // sign
  528. entities.put("ordf", Character.valueOf('\252')); // feminine //$NON-NLS-1$
  529. // ordinal
  530. // indicator
  531. entities.put("laquo", Character.valueOf('\253')); // left-pointing //$NON-NLS-1$
  532. // double
  533. // angle
  534. // quotation
  535. // mark =
  536. // left
  537. // pointing
  538. // guillemet
  539. entities.put("not", Character.valueOf('\254')); // not sign //$NON-NLS-1$
  540. entities.put("shy", Character.valueOf('\255')); // soft hyphen = //$NON-NLS-1$
  541. // discretionary
  542. // hyphen
  543. entities.put("reg", Character.valueOf('\256')); // registered //$NON-NLS-1$
  544. // sign =
  545. // registered
  546. // trade mark
  547. // sign
  548. entities.put("macr", Character.valueOf('\257')); // macron = //$NON-NLS-1$
  549. // spacing
  550. // macron =
  551. // overline
  552. // = APL
  553. // overbar
  554. entities.put("deg", Character.valueOf('\260')); // degree sign //$NON-NLS-1$
  555. entities.put("plusmn", Character.valueOf('\261')); // plus-minus //$NON-NLS-1$
  556. // sign =
  557. // plus-or-minus
  558. // sign
  559. entities.put("sup2", Character.valueOf('\262')); // superscript //$NON-NLS-1$
  560. // two =
  561. // superscript
  562. // digit two
  563. // = squared
  564. entities.put("sup3", Character.valueOf('\263')); // superscript //$NON-NLS-1$
  565. // three =
  566. // superscript
  567. // digit
  568. // three =
  569. // cubed
  570. entities.put("acute", Character.valueOf('\264')); // acute //$NON-NLS-1$
  571. // accent =
  572. // spacing
  573. // acute
  574. entities.put("micro", Character.valueOf('\265')); // micro //$NON-NLS-1$
  575. // sign
  576. entities.put("para", Character.valueOf('\266')); // pilcrow //$NON-NLS-1$
  577. // sign =
  578. // paragraph
  579. // sign
  580. entities.put("middot", Character.valueOf('\267')); // middle //$NON-NLS-1$
  581. // dot =
  582. // Georgian
  583. // comma =
  584. // Greek
  585. // middle
  586. // dot
  587. entities.put("cedil", Character.valueOf('\270')); // cedilla = //$NON-NLS-1$
  588. // spacing
  589. // cedilla
  590. entities.put("sup1", Character.valueOf('\271')); // superscript //$NON-NLS-1$
  591. // one =
  592. // superscript
  593. // digit one
  594. entities.put("ordm", Character.valueOf('\272')); // masculine //$NON-NLS-1$
  595. // ordinal
  596. // indicator
  597. entities.put("raquo", Character.valueOf('\273')); // right-pointing //$NON-NLS-1$
  598. // double
  599. // angle
  600. // quotation
  601. // mark =
  602. // right
  603. // pointing
  604. // guillemet
  605. entities.put("frac14", Character.valueOf('\274')); // vulgar //$NON-NLS-1$
  606. // fraction
  607. // one
  608. // quarter =
  609. // fraction
  610. // one
  611. // quarter
  612. entities.put("frac12", Character.valueOf('\275')); // vulgar //$NON-NLS-1$
  613. // fraction
  614. // one half
  615. // =
  616. // fraction
  617. // one half
  618. entities.put("frac34", Character.valueOf('\276')); // vulgar //$NON-NLS-1$
  619. // fraction
  620. // three
  621. // quarters
  622. // =
  623. // fraction
  624. // three
  625. // quarters
  626. entities.put("iquest", Character.valueOf('\277')); // inverted //$NON-NLS-1$
  627. // question
  628. // mark =
  629. // turned
  630. // question
  631. // mark
  632. entities.put("Agrave", Character.valueOf('\300')); // latin //$NON-NLS-1$
  633. // capital
  634. // letter A
  635. // with
  636. // grave =
  637. // latin
  638. // capital
  639. // letter A
  640. // grave
  641. entities.put("Aacute", Character.valueOf('\301')); // latin //$NON-NLS-1$
  642. // capital
  643. // letter A
  644. // with
  645. // acute
  646. entities.put("Acirc", Character.valueOf('\302')); // latin //$NON-NLS-1$
  647. // capital
  648. // letter A
  649. // with
  650. // circumflex
  651. entities.put("Atilde", Character.valueOf('\303')); // latin //$NON-NLS-1$
  652. // capital
  653. // letter A
  654. // with
  655. // tilde
  656. entities.put("Auml", Character.valueOf('\304')); // latin //$NON-NLS-1$
  657. // capital
  658. // letter A
  659. // with
  660. // diaeresis
  661. entities.put("Aring", Character.valueOf('\305')); // latin //$NON-NLS-1$
  662. // capital
  663. // letter A
  664. // with ring
  665. // above =
  666. // latin
  667. // capital
  668. // letter A
  669. // ring
  670. entities.put("AElig", Character.valueOf('\306')); // latin //$NON-NLS-1$
  671. // capital
  672. // letter AE
  673. // = latin
  674. // capital
  675. // ligature
  676. // AE
  677. entities.put("Ccedil", Character.valueOf('\307')); // latin //$NON-NLS-1$
  678. // capital
  679. // letter C
  680. // with
  681. // cedilla
  682. entities.put("Egrave", Character.valueOf('\310')); // latin //$NON-NLS-1$
  683. // capital
  684. // letter E
  685. // with
  686. // grave
  687. entities.put("Eacute", Character.valueOf('\311')); // latin //$NON-NLS-1$
  688. // capital
  689. // letter E
  690. // with
  691. // acute
  692. entities.put("Ecirc", Character.valueOf('\312')); // latin //$NON-NLS-1$
  693. // capital
  694. // letter E
  695. // with
  696. // circumflex
  697. entities.put("Euml", Character.valueOf('\313')); // latin //$NON-NLS-1$
  698. // capital
  699. // letter E
  700. // with
  701. // diaeresis
  702. entities.put("Igrave", Character.valueOf('\314')); // latin //$NON-NLS-1$
  703. // capital
  704. // letter I
  705. // with
  706. // grave
  707. entities.put("Iacute", Character.valueOf('\315')); // latin //$NON-NLS-1$
  708. // capital
  709. // letter I
  710. // with
  711. // acute
  712. entities.put("Icirc", Character.valueOf('\316')); // latin //$NON-NLS-1$
  713. // capital
  714. // letter I
  715. // with
  716. // circumflex
  717. entities.put("Iuml", Character.valueOf('\317')); // latin //$NON-NLS-1$
  718. // capital
  719. // letter I
  720. // with
  721. // diaeresis
  722. entities.put("ETH", Character.valueOf('\320')); // latin capital //$NON-NLS-1$
  723. // letter ETH
  724. entities.put("Ntilde", Character.valueOf('\321')); // latin //$NON-NLS-1$
  725. // capital
  726. // letter N
  727. // with
  728. // tilde
  729. entities.put("Ograve", Character.valueOf('\322')); // latin //$NON-NLS-1$
  730. // capital
  731. // letter O
  732. // with
  733. // grave
  734. entities.put("Oacute", Character.valueOf('\323')); // latin //$NON-NLS-1$
  735. // capital
  736. // letter O
  737. // with
  738. // acute
  739. entities.put("Ocirc", Character.valueOf('\324')); // latin //$NON-NLS-1$
  740. // capital
  741. // letter O
  742. // with
  743. // circumflex
  744. entities.put("Otilde", Character.valueOf('\325')); // latin //$NON-NLS-1$
  745. // capital
  746. // letter O
  747. // with
  748. // tilde
  749. entities.put("Ouml", Character.valueOf('\326')); // latin //$NON-NLS-1$
  750. // capital
  751. // letter O
  752. // with
  753. // diaeresis
  754. entities.put("times", Character.valueOf('\327')); // multiplication //$NON-NLS-1$
  755. // sign
  756. entities.put("Oslash", Character.valueOf('\330')); // latin //$NON-NLS-1$
  757. // capital
  758. // letter O
  759. // with
  760. // stroke =
  761. // latin
  762. // capital
  763. // letter O
  764. // slash
  765. entities.put("Ugrave", Character.valueOf('\331')); // latin //$NON-NLS-1$
  766. // capital
  767. // letter U
  768. // with
  769. // grave
  770. entities.put("Uacute", Character.valueOf('\332')); // latin //$NON-NLS-1$
  771. // capital
  772. // letter U
  773. // with
  774. // acute
  775. entities.put("Ucirc", Character.valueOf('\333')); // latin //$NON-NLS-1$
  776. // capital
  777. // letter U
  778. // with
  779. // circumflex
  780. entities.put("Uuml", Character.valueOf('\334')); // latin //$NON-NLS-1$
  781. // capital
  782. // letter U
  783. // with
  784. // diaeresis
  785. entities.put("Yacute", Character.valueOf('\335')); // latin //$NON-NLS-1$
  786. // capital
  787. // letter Y
  788. // with
  789. // acute
  790. entities.put("THORN", Character.valueOf('\336')); // latin //$NON-NLS-1$
  791. // capital
  792. // letter
  793. // THORN
  794. entities.put("szlig", Character.valueOf('\337')); // latin //$NON-NLS-1$
  795. // small
  796. // letter
  797. // sharp s =
  798. // ess-zed
  799. entities.put("agrave", Character.valueOf('\340')); // latin //$NON-NLS-1$
  800. // small
  801. // letter a
  802. // with
  803. // grave =
  804. // latin
  805. // small
  806. // letter a
  807. // grave
  808. entities.put("aacute", Character.valueOf('\341')); // latin //$NON-NLS-1$
  809. // small
  810. // letter a
  811. // with
  812. // acute
  813. entities.put("acirc", Character.valueOf('\342')); // latin //$NON-NLS-1$
  814. // small
  815. // letter a
  816. // with
  817. // circumflex
  818. entities.put("atilde", Character.valueOf('\343')); // latin //$NON-NLS-1$
  819. // small
  820. // letter a
  821. // with
  822. // tilde
  823. entities.put("auml", Character.valueOf('\344')); // latin //$NON-NLS-1$
  824. // small
  825. // letter a
  826. // with
  827. // diaeresis
  828. entities.put("aring", Character.valueOf('\345')); // latin //$NON-NLS-1$
  829. // small
  830. // letter a
  831. // with ring
  832. // above =
  833. // latin
  834. // small
  835. // letter a
  836. // ring
  837. entities.put("aelig", Character.valueOf('\346')); // latin //$NON-NLS-1$
  838. // small
  839. // letter ae
  840. // = latin
  841. // small
  842. // ligature
  843. // ae
  844. entities.put("ccedil", Character.valueOf('\347')); // latin //$NON-NLS-1$
  845. // small
  846. // letter c
  847. // with
  848. // cedilla
  849. entities.put("egrave", Character.valueOf('\350')); // latin //$NON-NLS-1$
  850. // small
  851. // letter e
  852. // with
  853. // grave
  854. entities.put("eacute", Character.valueOf('\351')); // latin //$NON-NLS-1$
  855. // small
  856. // letter e
  857. // with
  858. // acute
  859. entities.put("ecirc", Character.valueOf('\352')); // latin //$NON-NLS-1$
  860. // small
  861. // letter e
  862. // with
  863. // circumflex
  864. entities.put("euml", Character.valueOf('\353')); // latin //$NON-NLS-1$
  865. // small
  866. // letter e
  867. // with
  868. // diaeresis
  869. entities.put("igrave", Character.valueOf('\354')); // latin //$NON-NLS-1$
  870. // small
  871. // letter i
  872. // with
  873. // grave
  874. entities.put("iacute", Character.valueOf('\355')); // latin //$NON-NLS-1$
  875. // small
  876. // letter i
  877. // with
  878. // acute
  879. entities.put("icirc", Character.valueOf('\356')); // latin //$NON-NLS-1$
  880. // small
  881. // letter i
  882. // with
  883. // circumflex
  884. entities.put("iuml", Character.valueOf('\357')); // latin //$NON-NLS-1$
  885. // small
  886. // letter i
  887. // with
  888. // diaeresis
  889. entities.put("eth", Character.valueOf('\360')); // latin small //$NON-NLS-1$
  890. // letter eth
  891. entities.put("ntilde", Character.valueOf('\361')); // latin //$NON-NLS-1$
  892. // small
  893. // letter n
  894. // with
  895. // tilde
  896. entities.put("ograve", Character.valueOf('\362')); // latin //$NON-NLS-1$
  897. // small
  898. // letter o
  899. // with
  900. // grave
  901. entities.put("oacute", Character.valueOf('\363')); // latin //$NON-NLS-1$
  902. // small
  903. // letter o
  904. // with
  905. // acute
  906. entities.put("ocirc", Character.valueOf('\364')); // latin //$NON-NLS-1$
  907. // small
  908. // letter o
  909. // with
  910. // circumflex
  911. entities.put("otilde", Character.valueOf('\365')); // latin //$NON-NLS-1$
  912. // small
  913. // letter o
  914. // with
  915. // tilde
  916. entities.put("ouml", Character.valueOf('\366')); // latin //$NON-NLS-1$
  917. // small
  918. // letter o
  919. // with
  920. // diaeresis
  921. entities.put("divide", Character.valueOf('\367')); // division //$NON-NLS-1$
  922. // sign
  923. entities.put("oslash", Character.valueOf('\370')); // latin //$NON-NLS-1$
  924. // small
  925. // letter o
  926. // with
  927. // stroke =
  928. // latin
  929. // small
  930. // letter o
  931. // slash
  932. entities.put("ugrave", Character.valueOf('\371')); // latin //$NON-NLS-1$
  933. // small
  934. // letter u
  935. // with
  936. // grave
  937. entities.put("uacute", Character.valueOf('\372')); // latin //$NON-NLS-1$
  938. // small
  939. // letter u
  940. // with
  941. // acute
  942. entities.put("ucirc", Character.valueOf('\373')); // latin //$NON-NLS-1$
  943. // small
  944. // letter u
  945. // with
  946. // circumflex
  947. entities.put("uuml", Character.valueOf('\374')); // latin //$NON-NLS-1$
  948. // small
  949. // letter u
  950. // with
  951. // diaeresis
  952. entities.put("yacute", Character.valueOf('\375')); // latin //$NON-NLS-1$
  953. // small
  954. // letter y
  955. // with
  956. // acute
  957. entities.put("thorn", Character.valueOf('\376')); // latin //$NON-NLS-1$
  958. // small
  959. // letter
  960. // thorn
  961. entities.put("yuml", Character.valueOf('\377')); // latin //$NON-NLS-1$
  962. // small
  963. // letter y
  964. // with
  965. // diaeresis
  966. // Special characters
  967. entities.put("quot", Character.valueOf('\42')); // quotation //$NON-NLS-1$
  968. // mark = APL
  969. // quote
  970. entities.put("amp", Character.valueOf('\46')); // ampersand //$NON-NLS-1$
  971. entities.put("lt", Character.valueOf('\74')); // less-than //$NON-NLS-1$
  972. // sign
  973. entities.put("gt", Character.valueOf('\76')); // greater-than //$NON-NLS-1$
  974. // sign
  975. // Latin Extended-A
  976. entities.put("OElig", Character.valueOf('\u0152')); // latin //$NON-NLS-1$
  977. // capital
  978. // ligature
  979. // OE
  980. entities.put("oelig", Character.valueOf('\u0153')); // latin //$NON-NLS-1$
  981. // small
  982. // ligature
  983. // oe,
  984. // ligature
  985. // is a
  986. // misnomer,
  987. // this is a
  988. // separate
  989. // character
  990. // in some
  991. // languages
  992. entities.put("Scaron", Character.valueOf('\u0160')); // latin //$NON-NLS-1$
  993. // capital
  994. // letter
  995. // S
  996. // with
  997. // caron
  998. entities.put("scaron", Character.valueOf('\u0161')); // latin //$NON-NLS-1$
  999. // small
  1000. // letter
  1001. // s
  1002. // with
  1003. // caron
  1004. entities.put("Yuml", Character.valueOf('\u0178')); // latin //$NON-NLS-1$
  1005. // capital
  1006. // letter Y
  1007. // with
  1008. // diaeresis
  1009. // Spacing Modifier Letters
  1010. entities.put("circ", Character.valueOf('\u02c6')); // modifier //$NON-NLS-1$
  1011. // letter
  1012. // circumflex
  1013. // accent
  1014. entities.put("tilde", Character.valueOf('\u02dc')); // small //$NON-NLS-1$
  1015. // tilde
  1016. // General punctuation
  1017. entities.put("ensp", Character.valueOf('\u2002')); // en space //$NON-NLS-1$
  1018. entities.put("emsp", Character.valueOf('\u2003')); // em space //$NON-NLS-1$
  1019. entities.put("thinsp", Character.valueOf('\u2009')); // thin //$NON-NLS-1$
  1020. // space
  1021. entities.put("zwnj", Character.valueOf('\u200c')); // zero //$NON-NLS-1$
  1022. // width
  1023. // non-joiner
  1024. entities.put("zwj", Character.valueOf('\u200d')); // zero //$NON-NLS-1$
  1025. // width
  1026. // joiner
  1027. entities.put("lrm", Character.valueOf('\u200e')); // left-to-right //$NON-NLS-1$
  1028. // mark
  1029. entities.put("rlm", Character.valueOf('\u200f')); // right-to-left //$NON-NLS-1$
  1030. // mark
  1031. entities.put("ndash", Character.valueOf('\u2013')); // en dash //$NON-NLS-1$
  1032. entities.put("mdash", Character.valueOf('\u2014')); // em dash //$NON-NLS-1$
  1033. entities.put("lsquo", Character.valueOf('\u2018')); // left //$NON-NLS-1$
  1034. // single
  1035. // quotation
  1036. // mark
  1037. entities.put("rsquo", Character.valueOf('\u2019')); // right //$NON-NLS-1$
  1038. // single
  1039. // quotation
  1040. // mark
  1041. entities.put("sbquo", Character.valueOf('\u201a')); // single //$NON-NLS-1$
  1042. // low-9
  1043. // quotation
  1044. // mark
  1045. entities.put("ldquo", Character.valueOf('\u201c')); // left //$NON-NLS-1$
  1046. // double
  1047. // quotation
  1048. // mark
  1049. entities.put("rdquo", Character.valueOf('\u201d')); // right //$NON-NLS-1$
  1050. // double
  1051. // quotation
  1052. // mark
  1053. entities.put("bdquo", Character.valueOf('\u201e')); // double //$NON-NLS-1$
  1054. // low-9
  1055. // quotation
  1056. // mark
  1057. entities.put("dagger", Character.valueOf('\u2020')); // dagger //$NON-NLS-1$
  1058. entities.put("Dagger", Character.valueOf('\u2021')); // double //$NON-NLS-1$
  1059. // dagger
  1060. entities.put("permil", Character.valueOf('\u2030')); // per //$NON-NLS-1$
  1061. // mille
  1062. // sign
  1063. entities.put("lsaquo", Character.valueOf('\u2039')); // single //$NON-NLS-1$
  1064. // left-pointing
  1065. // angle
  1066. // quotation
  1067. // mark,
  1068. // not
  1069. // yet
  1070. // standardized
  1071. entities.put("rsaquo", Character.valueOf('\u203a')); // single //$NON-NLS-1$
  1072. // right-pointing
  1073. // angle
  1074. // quotation
  1075. // mark,
  1076. // not
  1077. // yet
  1078. // standardized
  1079. entities.put("euro", Character.valueOf('\u20ac')); // euro sign //$NON-NLS-1$
  1080. }
  1081. }