PageRenderTime 60ms CodeModel.GetById 30ms RepoModel.GetById 0ms app.codeStats 0ms

/eclipse-mylyn-R_3_8_0-fetched-src/org.eclipse.mylyn.commons/org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlStreamTokenizer.java

#
Java | 1141 lines | 1084 code | 17 blank | 40 comment | 50 complexity | 36609adde6ece6c6f1b280a257b12612 MD5 | raw file
Possible License(s): Apache-2.0
  1. /*******************************************************************************
  2. * Copyright (c) 2004, 2008 Tasktop Technologies and others.
  3. * All rights reserved. This program and the accompanying materials
  4. * are made available under the terms of the Eclipse Public License v1.0
  5. * which accompanies this distribution, and is available at
  6. * http://www.eclipse.org/legal/epl-v10.html
  7. *
  8. * Contributors:
  9. * Tasktop Technologies - initial API and implementation
  10. *******************************************************************************/
  11. package org.eclipse.mylyn.commons.core;
  12. import java.io.BufferedReader;
  13. import java.io.IOException;
  14. import java.io.Reader;
  15. import java.net.URL;
  16. import java.text.ParseException;
  17. import java.util.HashMap;
  18. import java.util.Locale;
  19. /**
  20. * Parses HTML into tokens.
  21. *
  22. * @author Shawn Minto
  23. * @since 3.7
  24. */
  25. public class HtmlStreamTokenizer {
  26. /** current parser state; selects how nextToken interprets the next character it reads */
  27. private State state;
  28. /** reader from which to parse the text (the constructor wraps the supplied reader in a BufferedReader) */
  29. private final BufferedReader in;
  30. /** base URL for resolving relative URLs; passed to every HtmlTag created by nextToken */
  31. private final URL base;
  32. /** buffer holding the text of the current token; emptied by nextToken before it starts scanning */
  33. private final StringBuffer textBuffer;
  34. /** buffer holding whitespace preceding the current token; emptied by nextToken before it starts scanning */
  35. private final StringBuffer whitespaceBuffer;
  36. /**
  37. * holds a token that was handed to <code>pushback</code> and is returned again by the next <code>nextToken</code> call; <code>null</code> when nothing is pending
  38. */
  39. private Token pushbackToken;
  40. /**
  41. * holds a character that was read and then determined not to be part of the current token; 0 means no character is pending
  42. */
  43. private int pushbackChar;
  44. /** quote delimiter (single or double) that opened the attribute value currently being read inside a tag */
  45. private int quoteChar;
  46. /** whether HTML escapes in tag attribute values are decoded while a tag is parsed; initially true, changed through escapeTagAttributes */
  47. private boolean escapeTagValues;
  /**
   * Creates a tokenizer positioned at the start of the given HTML document. The tokenizer starts in the text state
   * with attribute-value unescaping switched on.
   *
   * @param in
   *            reader for the HTML document to tokenize; it is wrapped in a {@link BufferedReader}
   * @param base
   *            URL for resolving relative URLs
   */
  public HtmlStreamTokenizer(Reader in, URL base) {
    this.in = new BufferedReader(in);
    this.base = base;
    this.textBuffer = new StringBuffer();
    this.whitespaceBuffer = new StringBuffer();
    this.state = State.TEXT;
    this.pushbackChar = 0; // 0 == no pushed-back character
    this.escapeTagValues = true;
  }
  65. public void escapeTagAttributes(boolean value) {
  66. escapeTagValues = value;
  67. }
  68. /**
  69. * Returns the next token from the stream.
  70. */
  71. public Token nextToken() throws IOException, ParseException {
  72. if (pushbackToken != null) {
  73. Token token = pushbackToken;
  74. pushbackToken = null;
  75. return token;
  76. }
  77. int closingComment = 0;
  78. textBuffer.setLength(0);
  79. whitespaceBuffer.setLength(0);
  80. do {
  81. int ch;
  82. if (pushbackChar != 0) {
  83. ch = pushbackChar;
  84. pushbackChar = 0;
  85. } else {
  86. ch = in.read();
  87. }
  88. if (ch < 0) {
  89. State oldState = state;
  90. state = State.EOF;
  91. if (textBuffer.length() > 0 && oldState == State.TEXT) {
  92. return new Token(textBuffer, whitespaceBuffer, false);
  93. } else {
  94. return new Token();
  95. }
  96. }
  97. if (state == State.TEXT) {
  98. if (ch == '<') {
  99. state = State.TAG;
  100. if (textBuffer.length() > 0) {
  101. return new Token(textBuffer, whitespaceBuffer, false);
  102. }
  103. } else if (Character.isWhitespace((char) ch)) {
  104. pushbackChar = ch;
  105. state = State.WS;
  106. if (textBuffer.length() > 0) {
  107. return new Token(textBuffer, whitespaceBuffer, false);
  108. }
  109. } else {
  110. textBuffer.append((char) ch);
  111. }
  112. } else if (state == State.WS) {
  113. if (!Character.isWhitespace((char) ch)) {
  114. pushbackChar = ch;
  115. state = State.TEXT;
  116. } else {
  117. whitespaceBuffer.append((char) ch);
  118. }
  119. } else if (state == State.TAG) {
  120. if (ch == '>') {
  121. state = State.TEXT;
  122. HtmlTag tag = new HtmlTag(base);
  123. parseTag(textBuffer.toString(), tag, escapeTagValues);
  124. return new Token(tag, whitespaceBuffer);
  125. }
  126. if (ch == '<' && textBuffer.length() == 0) {
  127. textBuffer.append("<<"); //$NON-NLS-1$
  128. state = State.TEXT;
  129. } else if (ch == '-' && textBuffer.length() == 2 && textBuffer.charAt(1) == '-'
  130. && textBuffer.charAt(0) == '!') {
  131. textBuffer.setLength(0);
  132. state = State.COMMENT;
  133. } else if (ch == '\'' || ch == '"') {
  134. quoteChar = ch;
  135. textBuffer.append((char) ch);
  136. state = State.TAG_QUOTE;
  137. } else {
  138. textBuffer.append((char) ch);
  139. }
  140. } else if (state == State.TAG_QUOTE) {
  141. if (ch == '>') {
  142. pushbackChar = ch;
  143. state = State.TAG;
  144. } else {
  145. textBuffer.append((char) ch);
  146. if (ch == quoteChar) {
  147. state = State.TAG;
  148. }
  149. }
  150. } else if (state == State.COMMENT) {
  151. if (ch == '>' && closingComment >= 2) {
  152. textBuffer.setLength(textBuffer.length() - 2);
  153. closingComment = 0;
  154. state = State.TEXT;
  155. return new Token(textBuffer, whitespaceBuffer, true);
  156. }
  157. if (ch == '-') {
  158. closingComment++;
  159. } else {
  160. closingComment = 0;
  161. }
  162. textBuffer.append((char) ch);
  163. }
  164. } while (true);
  165. }
  166. /**
  167. * Pushes the token back into the queue, to be returned by the subsequent call to <code>nextToken</code>
  168. */
  169. public void pushback(Token token) {
  170. pushbackToken = token;
  171. }
  172. /**
  173. * Parses an HTML tag out of a string of characters.
  174. */
  175. private static void parseTag(String s, HtmlTag tag, boolean escapeValues) throws ParseException {
  176. int i = 0;
  177. for (; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
  178. // just move forward
  179. }
  180. if (i == s.length()) {
  181. throw new ParseException("parse empty tag", 0); //$NON-NLS-1$
  182. }
  183. int start = i;
  184. for (; i < s.length() && !Character.isWhitespace(s.charAt(i)); i++) {
  185. // just move forward
  186. }
  187. tag.setTagName(s.substring(start, i));
  188. for (; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
  189. // just move forward
  190. }
  191. if (i == s.length()) {
  192. return;
  193. } else {
  194. parseAttributes(tag, s, i, escapeValues);
  195. return;
  196. }
  197. }
  198. /**
  199. * parses HTML tag attributes from a buffer and sets them in an HtmlTag
  200. */
  201. private static void parseAttributes(HtmlTag tag, String s, int i, boolean escapeValues) throws ParseException {
  202. while (i < s.length()) {
  203. // skip whitespace
  204. while (i < s.length() && Character.isWhitespace(s.charAt(i))) {
  205. i++;
  206. }
  207. if (i == s.length()) {
  208. return;
  209. }
  210. // read the attribute name -- the rule might be looser than the RFC
  211. // specifies:
  212. // everything up to a space or an equal sign is included
  213. int start = i;
  214. for (; i < s.length() && !Character.isWhitespace(s.charAt(i)) && s.charAt(i) != '='; i++) {
  215. // just move forward
  216. }
  217. String attributeName = s.substring(start, i).toLowerCase(Locale.ENGLISH);
  218. if (attributeName.equals("/")) { //$NON-NLS-1$
  219. tag.setSelfTerminating(true);
  220. continue;
  221. }
  222. for (; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
  223. // just move forward
  224. }
  225. if (i == s.length() || s.charAt(i) != '=') {
  226. // no attribute value
  227. tag.setAttribute(attributeName, ""); //$NON-NLS-1$
  228. continue;
  229. }
  230. // skip whitespace to the start of attribute value
  231. for (i = i + 1; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
  232. // just move forward
  233. }
  234. if (i == s.length()) {
  235. return;
  236. }
  237. // read the attribute value -- the rule for unquoted attribute value
  238. // is
  239. // looser than the one in Conolly's W3C 1996 lexical analyzer draft:
  240. // everything
  241. // is included up to the next space
  242. String attributeValue;
  243. if (s.charAt(i) == '"') {
  244. start = ++i;
  245. for (; i < s.length() && s.charAt(i) != '"'; i++) {
  246. // just move forward
  247. }
  248. if (i == s.length()) {
  249. return; // shouldn't happen if input returned by nextToken
  250. }
  251. if (escapeValues) {
  252. attributeValue = unescape(s.substring(start, i));
  253. } else {
  254. attributeValue = s.substring(start, i);
  255. }
  256. i++;
  257. } else if (s.charAt(i) == '\'') {
  258. start = ++i;
  259. for (; i < s.length() && s.charAt(i) != '\''; i++) {
  260. // just move forward
  261. }
  262. if (i == s.length()) {
  263. return; // shouldn't happen if input returned by nextToken
  264. }
  265. attributeValue = unescape(s.substring(start, i));
  266. i++;
  267. } else {
  268. start = i;
  269. for (; i < s.length() && !Character.isWhitespace(s.charAt(i)); i++) {
  270. // just move forward
  271. }
  272. attributeValue = s.substring(start, i);
  273. }
  274. tag.setAttribute(attributeName, attributeValue);
  275. }
  276. }
  277. /**
  278. * Returns a string with HTML escapes changed into their corresponding characters.
  279. *
  280. * @deprecated use {@link StringEscapeUtils#unescapeHtml(String)} instead
  281. */
  282. @Deprecated
  283. public static String unescape(String s) {
  284. if (s.indexOf('&') == -1) {
  285. return s;
  286. } else {
  287. StringBuffer sb = new StringBuffer(s);
  288. unescape(sb);
  289. return sb.toString();
  290. }
  291. }
  292. /**
  293. * Replaces (in-place) HTML escapes in a StringBuffer with their corresponding characters.
  294. *
  295. * @deprecated use {@link StringEscapeUtils#unescapeHtml(String)} instead
  296. */
  297. @Deprecated
  298. public static StringBuffer unescape(StringBuffer sb) {
  299. int i = 0; // index into the unprocessed section of the buffer
  300. int j = 0; // index into the processed section of the buffer
  301. while (i < sb.length()) {
  302. char ch = sb.charAt(i);
  303. if (ch == '&') {
  304. int start = i;
  305. String escape = null;
  306. for (i = i + 1; i < sb.length(); i++) {
  307. ch = sb.charAt(i);
  308. if (!Character.isLetterOrDigit(ch) && !(ch == '#' && i == (start + 1))) {
  309. escape = sb.substring(start + 1, i);
  310. break;
  311. }
  312. }
  313. if (i == sb.length() && i != (start + 1)) {
  314. escape = sb.substring(start + 1);
  315. }
  316. if (escape != null) {
  317. Character character = parseReference(escape);
  318. if (character != null
  319. && !((0x0A == character || 0x0D == character || 0x09 == ch)
  320. || (character >= 0x20 && character <= 0xD7FF)
  321. || (character >= 0xE000 && character <= 0xFFFD) || (character >= 0x10000 && character <= 0x10FFFF))) {
  322. // Character is an invalid xml character
  323. // http://www.w3.org/TR/REC-xml/#charsets
  324. character = null;
  325. }
  326. if (character != null) {
  327. ch = character.charValue();
  328. } else {
  329. // not an HTML escape; rewind
  330. i = start;
  331. ch = '&';
  332. }
  333. }
  334. }
  335. sb.setCharAt(j, ch);
  336. i++;
  337. j++;
  338. }
  339. sb.setLength(j);
  340. return sb;
  341. }
  342. /**
  343. * Parses HTML character and entity references and returns the corresponding character.
  344. */
  345. private static Character parseReference(String s) {
  346. if (s.length() == 0) {
  347. return null;
  348. }
  349. if (s.charAt(0) == '#') {
  350. // character reference
  351. if (s.length() == 1) {
  352. return null;
  353. }
  354. try {
  355. int value;
  356. if (s.charAt(1) == 'x') {
  357. // Hex reference
  358. value = Integer.parseInt(s.substring(2), 16);
  359. } else {
  360. // Decimal reference
  361. value = Integer.parseInt(s.substring(1));
  362. }
  363. return new Character((char) value);
  364. } catch (NumberFormatException e) {
  365. return null;
  366. }
  367. } else {
  368. return entities.get(s);
  369. }
  370. }
  371. /**
  372. * Class for current token.
  373. */
  374. public static class Token {
  375. public static final Type EOF = new Type();
  376. public static final Type TEXT = new Type();
  377. public static final Type TAG = new Type();
  378. public static final Type COMMENT = new Type();
  379. /** token's type */
  380. private Type type;
  381. /** token's value */
  382. private final Object value;
  383. /** whitespace preceding the token */
  384. private final StringBuffer whitespace;
  385. /**
  386. * Constructor for the EOF token.
  387. */
  388. protected Token() {
  389. type = EOF;
  390. value = null;
  391. whitespace = null;
  392. }
  393. /**
  394. * Constructor for the HTML tag tokens.
  395. */
  396. protected Token(HtmlTag tag, StringBuffer whitespace) {
  397. type = TAG;
  398. value = tag;
  399. this.whitespace = whitespace;
  400. }
  401. /**
  402. * Constructor for regular text and comments.
  403. */
  404. protected Token(StringBuffer text, StringBuffer whitespace, boolean comment) {
  405. if (comment) {
  406. type = COMMENT;
  407. } else {
  408. type = TEXT;
  409. }
  410. this.value = text;
  411. this.whitespace = whitespace;
  412. }
  413. /**
  414. * Returns the token's type.
  415. */
  416. public Type getType() {
  417. return type;
  418. }
  419. /**
  420. * Returns the whitespace preceding the token.
  421. */
  422. public StringBuffer getWhitespace() {
  423. return whitespace;
  424. }
  425. /**
  426. * Returns the token's value. This is an HtmlTag for tokens of type <code>TAG</code> and a StringBuffer for
  427. * tokens of type <code>TEXT</code> and <code>COMMENT</code>. For tokens of type <code>EOF</code>, the value is
  428. * <code>null</code>.
  429. */
  430. public Object getValue() {
  431. return value;
  432. }
  433. /**
  434. * Returns the string representation of the token, including the preceding whitespace.
  435. */
  436. @Override
  437. public String toString() {
  438. StringBuffer sb = new StringBuffer();
  439. if (whitespace != null) {
  440. sb.append(whitespace);
  441. }
  442. if (value != null) {
  443. if (type == TAG) {
  444. // sb.append('<');
  445. } else if (type == COMMENT) {
  446. sb.append("<!--"); //$NON-NLS-1$
  447. }
  448. sb.append(value);
  449. if (type == TAG) {
  450. // if(value instanceof HtmlTag) {
  451. // HtmlTag htmlTag = (HtmlTag)value;
  452. // if(htmlTag.getTagName().startsWith("?xml")) {
  453. // sb.append("?>");
  454. // }
  455. // } else {
  456. // sb.append('>');
  457. } else if (type == COMMENT) {
  458. sb.append("-->"); //$NON-NLS-1$
  459. }
  460. }
  461. return sb.toString();
  462. }
  463. /**
  464. * Private enum class for token type.
  465. */
  466. private static class Type {
  467. private Type() {
  468. // don't need to do anything
  469. }
  470. }
  471. }
  472. /**
  473. * Enum class for parser state.
  474. */
  475. private static class State {
  476. static final State EOF = new State();
  477. static final State COMMENT = new State();
  478. static final State TEXT = new State();
  479. static final State TAG = new State();
  480. static final State WS = new State();
  481. static final State TAG_QUOTE = new State();
  482. private State() {
  483. // don't need to do anything
  484. }
  485. }
  486. /** names and values of HTML entity references */
  487. private static HashMap<String, Character> entities;
  488. /*
  489. * Based on ISO 8879.
  490. *
  491. * Portions (c) International Organization for Standardization 1986
  492. * Permission to copy in any form is granted for use with conforming SGML
  493. * systems and applications as defined in ISO 8879, provided this notice is
  494. * included in all copies.
  495. *
  496. */
  497. static {
  498. entities = new HashMap<String, Character>();
  499. entities.put("nbsp", Character.valueOf('\240')); // no-break //$NON-NLS-1$
  500. // space =
  501. // non-breaking
  502. // space
  503. entities.put("iexcl", Character.valueOf('\241')); // inverted //$NON-NLS-1$
  504. // exclamation
  505. // mark
  506. entities.put("cent", Character.valueOf('\242')); // cent sign //$NON-NLS-1$
  507. entities.put("pound", Character.valueOf('\243')); // pound //$NON-NLS-1$
  508. // sign
  509. entities.put("curren", Character.valueOf('\244')); // currency //$NON-NLS-1$
  510. // sign
  511. entities.put("yen", Character.valueOf('\245')); // yen sign = //$NON-NLS-1$
  512. // yuan sign
  513. entities.put("brvbar", Character.valueOf('\246')); // broken //$NON-NLS-1$
  514. // bar =
  515. // broken
  516. // vertical
  517. // bar
  518. entities.put("sect", Character.valueOf('\247')); // section //$NON-NLS-1$
  519. // sign
  520. entities.put("uml", Character.valueOf('\250')); // diaeresis = //$NON-NLS-1$
  521. // spacing
  522. // diaeresis
  523. entities.put("copy", Character.valueOf('\251')); // copyright //$NON-NLS-1$
  524. // sign
  525. entities.put("ordf", Character.valueOf('\252')); // feminine //$NON-NLS-1$
  526. // ordinal
  527. // indicator
  528. entities.put("laquo", Character.valueOf('\253')); // left-pointing //$NON-NLS-1$
  529. // double
  530. // angle
  531. // quotation
  532. // mark =
  533. // left
  534. // pointing
  535. // guillemet
  536. entities.put("not", Character.valueOf('\254')); // not sign //$NON-NLS-1$
  537. entities.put("shy", Character.valueOf('\255')); // soft hyphen = //$NON-NLS-1$
  538. // discretionary
  539. // hyphen
  540. entities.put("reg", Character.valueOf('\256')); // registered //$NON-NLS-1$
  541. // sign =
  542. // registered
  543. // trade mark
  544. // sign
  545. entities.put("macr", Character.valueOf('\257')); // macron = //$NON-NLS-1$
  546. // spacing
  547. // macron =
  548. // overline
  549. // = APL
  550. // overbar
  551. entities.put("deg", Character.valueOf('\260')); // degree sign //$NON-NLS-1$
  552. entities.put("plusmn", Character.valueOf('\261')); // plus-minus //$NON-NLS-1$
  553. // sign =
  554. // plus-or-minus
  555. // sign
  556. entities.put("sup2", Character.valueOf('\262')); // superscript //$NON-NLS-1$
  557. // two =
  558. // superscript
  559. // digit two
  560. // = squared
  561. entities.put("sup3", Character.valueOf('\263')); // superscript //$NON-NLS-1$
  562. // three =
  563. // superscript
  564. // digit
  565. // three =
  566. // cubed
  567. entities.put("acute", Character.valueOf('\264')); // acute //$NON-NLS-1$
  568. // accent =
  569. // spacing
  570. // acute
  571. entities.put("micro", Character.valueOf('\265')); // micro //$NON-NLS-1$
  572. // sign
  573. entities.put("para", Character.valueOf('\266')); // pilcrow //$NON-NLS-1$
  574. // sign =
  575. // paragraph
  576. // sign
  577. entities.put("middot", Character.valueOf('\267')); // middle //$NON-NLS-1$
  578. // dot =
  579. // Georgian
  580. // comma =
  581. // Greek
  582. // middle
  583. // dot
  584. entities.put("cedil", Character.valueOf('\270')); // cedilla = //$NON-NLS-1$
  585. // spacing
  586. // cedilla
  587. entities.put("sup1", Character.valueOf('\271')); // superscript //$NON-NLS-1$
  588. // one =
  589. // superscript
  590. // digit one
  591. entities.put("ordm", Character.valueOf('\272')); // masculine //$NON-NLS-1$
  592. // ordinal
  593. // indicator
  594. entities.put("raquo", Character.valueOf('\273')); // right-pointing //$NON-NLS-1$
  595. // double
  596. // angle
  597. // quotation
  598. // mark =
  599. // right
  600. // pointing
  601. // guillemet
  602. entities.put("frac14", Character.valueOf('\274')); // vulgar //$NON-NLS-1$
  603. // fraction
  604. // one
  605. // quarter =
  606. // fraction
  607. // one
  608. // quarter
  609. entities.put("frac12", Character.valueOf('\275')); // vulgar //$NON-NLS-1$
  610. // fraction
  611. // one half
  612. // =
  613. // fraction
  614. // one half
  615. entities.put("frac34", Character.valueOf('\276')); // vulgar //$NON-NLS-1$
  616. // fraction
  617. // three
  618. // quarters
  619. // =
  620. // fraction
  621. // three
  622. // quarters
  623. entities.put("iquest", Character.valueOf('\277')); // inverted //$NON-NLS-1$
  624. // question
  625. // mark =
  626. // turned
  627. // question
  628. // mark
  629. entities.put("Agrave", Character.valueOf('\300')); // latin //$NON-NLS-1$
  630. // capital
  631. // letter A
  632. // with
  633. // grave =
  634. // latin
  635. // capital
  636. // letter A
  637. // grave
  638. entities.put("Aacute", Character.valueOf('\301')); // latin //$NON-NLS-1$
  639. // capital
  640. // letter A
  641. // with
  642. // acute
  643. entities.put("Acirc", Character.valueOf('\302')); // latin //$NON-NLS-1$
  644. // capital
  645. // letter A
  646. // with
  647. // circumflex
  648. entities.put("Atilde", Character.valueOf('\303')); // latin //$NON-NLS-1$
  649. // capital
  650. // letter A
  651. // with
  652. // tilde
  653. entities.put("Auml", Character.valueOf('\304')); // latin //$NON-NLS-1$
  654. // capital
  655. // letter A
  656. // with
  657. // diaeresis
  658. entities.put("Aring", Character.valueOf('\305')); // latin //$NON-NLS-1$
  659. // capital
  660. // letter A
  661. // with ring
  662. // above =
  663. // latin
  664. // capital
  665. // letter A
  666. // ring
  667. entities.put("AElig", Character.valueOf('\306')); // latin //$NON-NLS-1$
  668. // capital
  669. // letter AE
  670. // = latin
  671. // capital
  672. // ligature
  673. // AE
  674. entities.put("Ccedil", Character.valueOf('\307')); // latin //$NON-NLS-1$
  675. // capital
  676. // letter C
  677. // with
  678. // cedilla
  679. entities.put("Egrave", Character.valueOf('\310')); // latin //$NON-NLS-1$
  680. // capital
  681. // letter E
  682. // with
  683. // grave
  684. entities.put("Eacute", Character.valueOf('\311')); // latin //$NON-NLS-1$
  685. // capital
  686. // letter E
  687. // with
  688. // acute
  689. entities.put("Ecirc", Character.valueOf('\312')); // latin //$NON-NLS-1$
  690. // capital
  691. // letter E
  692. // with
  693. // circumflex
  694. entities.put("Euml", Character.valueOf('\313')); // latin //$NON-NLS-1$
  695. // capital
  696. // letter E
  697. // with
  698. // diaeresis
  699. entities.put("Igrave", Character.valueOf('\314')); // latin //$NON-NLS-1$
  700. // capital
  701. // letter I
  702. // with
  703. // grave
  704. entities.put("Iacute", Character.valueOf('\315')); // latin //$NON-NLS-1$
  705. // capital
  706. // letter I
  707. // with
  708. // acute
  709. entities.put("Icirc", Character.valueOf('\316')); // latin //$NON-NLS-1$
  710. // capital
  711. // letter I
  712. // with
  713. // circumflex
  714. entities.put("Iuml", Character.valueOf('\317')); // latin //$NON-NLS-1$
  715. // capital
  716. // letter I
  717. // with
  718. // diaeresis
  719. entities.put("ETH", Character.valueOf('\320')); // latin capital //$NON-NLS-1$
  720. // letter ETH
  721. entities.put("Ntilde", Character.valueOf('\321')); // latin //$NON-NLS-1$
  722. // capital
  723. // letter N
  724. // with
  725. // tilde
  726. entities.put("Ograve", Character.valueOf('\322')); // latin //$NON-NLS-1$
  727. // capital
  728. // letter O
  729. // with
  730. // grave
  731. entities.put("Oacute", Character.valueOf('\323')); // latin //$NON-NLS-1$
  732. // capital
  733. // letter O
  734. // with
  735. // acute
  736. entities.put("Ocirc", Character.valueOf('\324')); // latin //$NON-NLS-1$
  737. // capital
  738. // letter O
  739. // with
  740. // circumflex
  741. entities.put("Otilde", Character.valueOf('\325')); // latin //$NON-NLS-1$
  742. // capital
  743. // letter O
  744. // with
  745. // tilde
  746. entities.put("Ouml", Character.valueOf('\326')); // latin //$NON-NLS-1$
  747. // capital
  748. // letter O
  749. // with
  750. // diaeresis
  751. entities.put("times", Character.valueOf('\327')); // multiplication //$NON-NLS-1$
  752. // sign
  753. entities.put("Oslash", Character.valueOf('\330')); // latin //$NON-NLS-1$
  754. // capital
  755. // letter O
  756. // with
  757. // stroke =
  758. // latin
  759. // capital
  760. // letter O
  761. // slash
  762. entities.put("Ugrave", Character.valueOf('\331')); // latin //$NON-NLS-1$
  763. // capital
  764. // letter U
  765. // with
  766. // grave
  767. entities.put("Uacute", Character.valueOf('\332')); // latin //$NON-NLS-1$
  768. // capital
  769. // letter U
  770. // with
  771. // acute
  772. entities.put("Ucirc", Character.valueOf('\333')); // latin //$NON-NLS-1$
  773. // capital
  774. // letter U
  775. // with
  776. // circumflex
  777. entities.put("Uuml", Character.valueOf('\334')); // latin //$NON-NLS-1$
  778. // capital
  779. // letter U
  780. // with
  781. // diaeresis
  782. entities.put("Yacute", Character.valueOf('\335')); // latin //$NON-NLS-1$
  783. // capital
  784. // letter Y
  785. // with
  786. // acute
  787. entities.put("THORN", Character.valueOf('\336')); // latin //$NON-NLS-1$
  788. // capital
  789. // letter
  790. // THORN
  791. entities.put("szlig", Character.valueOf('\337')); // latin //$NON-NLS-1$
  792. // small
  793. // letter
  794. // sharp s =
  795. // ess-zed
  796. entities.put("agrave", Character.valueOf('\340')); // latin //$NON-NLS-1$
  797. // small
  798. // letter a
  799. // with
  800. // grave =
  801. // latin
  802. // small
  803. // letter a
  804. // grave
  805. entities.put("aacute", Character.valueOf('\341')); // latin //$NON-NLS-1$
  806. // small
  807. // letter a
  808. // with
  809. // acute
  810. entities.put("acirc", Character.valueOf('\342')); // latin //$NON-NLS-1$
  811. // small
  812. // letter a
  813. // with
  814. // circumflex
  815. entities.put("atilde", Character.valueOf('\343')); // latin //$NON-NLS-1$
  816. // small
  817. // letter a
  818. // with
  819. // tilde
  820. entities.put("auml", Character.valueOf('\344')); // latin //$NON-NLS-1$
  821. // small
  822. // letter a
  823. // with
  824. // diaeresis
  825. entities.put("aring", Character.valueOf('\345')); // latin //$NON-NLS-1$
  826. // small
  827. // letter a
  828. // with ring
  829. // above =
  830. // latin
  831. // small
  832. // letter a
  833. // ring
  834. entities.put("aelig", Character.valueOf('\346')); // latin //$NON-NLS-1$
  835. // small
  836. // letter ae
  837. // = latin
  838. // small
  839. // ligature
  840. // ae
  841. entities.put("ccedil", Character.valueOf('\347')); // latin //$NON-NLS-1$
  842. // small
  843. // letter c
  844. // with
  845. // cedilla
  846. entities.put("egrave", Character.valueOf('\350')); // latin //$NON-NLS-1$
  847. // small
  848. // letter e
  849. // with
  850. // grave
  851. entities.put("eacute", Character.valueOf('\351')); // latin //$NON-NLS-1$
  852. // small
  853. // letter e
  854. // with
  855. // acute
  856. entities.put("ecirc", Character.valueOf('\352')); // latin //$NON-NLS-1$
  857. // small
  858. // letter e
  859. // with
  860. // circumflex
  861. entities.put("euml", Character.valueOf('\353')); // latin //$NON-NLS-1$
  862. // small
  863. // letter e
  864. // with
  865. // diaeresis
  866. entities.put("igrave", Character.valueOf('\354')); // latin //$NON-NLS-1$
  867. // small
  868. // letter i
  869. // with
  870. // grave
  871. entities.put("iacute", Character.valueOf('\355')); // latin //$NON-NLS-1$
  872. // small
  873. // letter i
  874. // with
  875. // acute
  876. entities.put("icirc", Character.valueOf('\356')); // latin //$NON-NLS-1$
  877. // small
  878. // letter i
  879. // with
  880. // circumflex
  881. entities.put("iuml", Character.valueOf('\357')); // latin //$NON-NLS-1$
  882. // small
  883. // letter i
  884. // with
  885. // diaeresis
  886. entities.put("eth", Character.valueOf('\360')); // latin small //$NON-NLS-1$
  887. // letter eth
  888. entities.put("ntilde", Character.valueOf('\361')); // latin //$NON-NLS-1$
  889. // small
  890. // letter n
  891. // with
  892. // tilde
  893. entities.put("ograve", Character.valueOf('\362')); // latin //$NON-NLS-1$
  894. // small
  895. // letter o
  896. // with
  897. // grave
  898. entities.put("oacute", Character.valueOf('\363')); // latin //$NON-NLS-1$
  899. // small
  900. // letter o
  901. // with
  902. // acute
  903. entities.put("ocirc", Character.valueOf('\364')); // latin //$NON-NLS-1$
  904. // small
  905. // letter o
  906. // with
  907. // circumflex
  908. entities.put("otilde", Character.valueOf('\365')); // latin //$NON-NLS-1$
  909. // small
  910. // letter o
  911. // with
  912. // tilde
  913. entities.put("ouml", Character.valueOf('\366')); // latin //$NON-NLS-1$
  914. // small
  915. // letter o
  916. // with
  917. // diaeresis
  918. entities.put("divide", Character.valueOf('\367')); // division //$NON-NLS-1$
  919. // sign
  920. entities.put("oslash", Character.valueOf('\370')); // latin //$NON-NLS-1$
  921. // small
  922. // letter o
  923. // with
  924. // stroke =
  925. // latin
  926. // small
  927. // letter o
  928. // slash
  929. entities.put("ugrave", Character.valueOf('\371')); // latin //$NON-NLS-1$
  930. // small
  931. // letter u
  932. // with
  933. // grave
  934. entities.put("uacute", Character.valueOf('\372')); // latin //$NON-NLS-1$
  935. // small
  936. // letter u
  937. // with
  938. // acute
  939. entities.put("ucirc", Character.valueOf('\373')); // latin //$NON-NLS-1$
  940. // small
  941. // letter u
  942. // with
  943. // circumflex
  944. entities.put("uuml", Character.valueOf('\374')); // latin //$NON-NLS-1$
  945. // small
  946. // letter u
  947. // with
  948. // diaeresis
  949. entities.put("yacute", Character.valueOf('\375')); // latin //$NON-NLS-1$
  950. // small
  951. // letter y
  952. // with
  953. // acute
  954. entities.put("thorn", Character.valueOf('\376')); // latin //$NON-NLS-1$
  955. // small
  956. // letter
  957. // thorn
  958. entities.put("yuml", Character.valueOf('\377')); // latin //$NON-NLS-1$
  959. // small
  960. // letter y
  961. // with
  962. // diaeresis
  963. // Special characters
  964. entities.put("quot", Character.valueOf('\42')); // quotation //$NON-NLS-1$
  965. // mark = APL
  966. // quote
  967. entities.put("amp", Character.valueOf('\46')); // ampersand //$NON-NLS-1$
  968. entities.put("lt", Character.valueOf('\74')); // less-than //$NON-NLS-1$
  969. // sign
  970. entities.put("gt", Character.valueOf('\76')); // greater-than //$NON-NLS-1$
  971. // sign
  972. // Latin Extended-A
  973. entities.put("OElig", Character.valueOf('\u0152')); // latin //$NON-NLS-1$
  974. // capital
  975. // ligature
  976. // OE
  977. entities.put("oelig", Character.valueOf('\u0153')); // latin //$NON-NLS-1$
  978. // small
  979. // ligature
  980. // oe,
  981. // ligature
  982. // is a
  983. // misnomer,
  984. // this is a
  985. // separate
  986. // character
  987. // in some
  988. // languages
  989. entities.put("Scaron", Character.valueOf('\u0160')); // latin //$NON-NLS-1$
  990. // capital
  991. // letter
  992. // S
  993. // with
  994. // caron
  995. entities.put("scaron", Character.valueOf('\u0161')); // latin //$NON-NLS-1$
  996. // small
  997. // letter
  998. // s
  999. // with
  1000. // caron
  1001. entities.put("Yuml", Character.valueOf('\u0178')); // latin //$NON-NLS-1$
  1002. // capital
  1003. // letter Y
  1004. // with
  1005. // diaeresis
  1006. // Spacing Modifier Letters
  1007. entities.put("circ", Character.valueOf('\u02c6')); // modifier //$NON-NLS-1$
  1008. // letter
  1009. // circumflex
  1010. // accent
  1011. entities.put("tilde", Character.valueOf('\u02dc')); // small //$NON-NLS-1$
  1012. // tilde
  1013. // General punctuation
  1014. entities.put("ensp", Character.valueOf('\u2002')); // en space //$NON-NLS-1$
  1015. entities.put("emsp", Character.valueOf('\u2003')); // em space //$NON-NLS-1$
  1016. entities.put("thinsp", Character.valueOf('\u2009')); // thin //$NON-NLS-1$
  1017. // space
  1018. entities.put("zwnj", Character.valueOf('\u200c')); // zero //$NON-NLS-1$
  1019. // width
  1020. // non-joiner
  1021. entities.put("zwj", Character.valueOf('\u200d')); // zero //$NON-NLS-1$
  1022. // width
  1023. // joiner
  1024. entities.put("lrm", Character.valueOf('\u200e')); // left-to-right //$NON-NLS-1$
  1025. // mark
  1026. entities.put("rlm", Character.valueOf('\u200f')); // right-to-left //$NON-NLS-1$
  1027. // mark
  1028. entities.put("ndash", Character.valueOf('\u2013')); // en dash //$NON-NLS-1$
  1029. entities.put("mdash", Character.valueOf('\u2014')); // em dash //$NON-NLS-1$
  1030. entities.put("lsquo", Character.valueOf('\u2018')); // left //$NON-NLS-1$
  1031. // single
  1032. // quotation
  1033. // mark
  1034. entities.put("rsquo", Character.valueOf('\u2019')); // right //$NON-NLS-1$
  1035. // single
  1036. // quotation
  1037. // mark
  1038. entities.put("sbquo", Character.valueOf('\u201a')); // single //$NON-NLS-1$
  1039. // low-9
  1040. // quotation
  1041. // mark
  1042. entities.put("ldquo", Character.valueOf('\u201c')); // left //$NON-NLS-1$
  1043. // double
  1044. // quotation
  1045. // mark
  1046. entities.put("rdquo", Character.valueOf('\u201d')); // right //$NON-NLS-1$
  1047. // double
  1048. // quotation
  1049. // mark
  1050. entities.put("bdquo", Character.valueOf('\u201e')); // double //$NON-NLS-1$
  1051. // low-9
  1052. // quotation
  1053. // mark
  1054. entities.put("dagger", Character.valueOf('\u2020')); // dagger //$NON-NLS-1$
  1055. entities.put("Dagger", Character.valueOf('\u2021')); // double //$NON-NLS-1$
  1056. // dagger
  1057. entities.put("permil", Character.valueOf('\u2030')); // per //$NON-NLS-1$
  1058. // mille
  1059. // sign
  1060. entities.put("lsaquo", Character.valueOf('\u2039')); // single //$NON-NLS-1$
  1061. // left-pointing
  1062. // angle
  1063. // quotation
  1064. // mark,
  1065. // not
  1066. // yet
  1067. // standardized
  1068. entities.put("rsaquo", Character.valueOf('\u203a')); // single //$NON-NLS-1$
  1069. // right-pointing
  1070. // angle
  1071. // quotation
  1072. // mark,
  1073. // not
  1074. // yet
  1075. // standardized
  1076. entities.put("euro", Character.valueOf('\u20ac')); // euro sign //$NON-NLS-1$
  1077. }
  1078. }