/tcsproxy/java_stub/HTMLTokenizer.java

https://github.com/armandofox/glomop · Java · 471 lines · 407 code · 25 blank · 39 comment · 214 complexity · ad2b4590ac6edd89585520df2ecde2ad MD5 · raw file

  1. import java.io.*;
  2. import java.util.*;
  /**
   * Result holder for one tag recognised by {@link HTMLTokenizer}.
   *
   * <code>attributes</code> and <code>values</code> hold
   * <code>StringBuffer</code> elements.  For a successfully parsed start tag
   * they are parallel: the value at index i belongs to the attribute at
   * index i, and an attribute written without a value gets an empty buffer.
   */
  class HTMLTag
  {
    /** True when filled in from a start tag, false for an end tag. */
    public boolean start;
    /** Tag name exactly as it appeared in the input (case is not changed). */
    public StringBuffer name = new StringBuffer();
    /** Attribute names, one StringBuffer per attribute, in input order. */
    public Vector attributes = new Vector();
    /** Attribute values, one StringBuffer per attribute, in input order. */
    public Vector values = new Vector();

    /** Creates an empty tag: empty name, no attributes, no values. */
    public HTMLTag()
    {
    }
  }

  /**
   * A small pull tokenizer that splits a byte stream into HTML start tags,
   * end tags and runs of text.
   *
   * Every byte read is widened to a char as-is.  Comment declarations
   * (<code>&lt;!-- ... --&gt;</code> and <code>&lt;!&gt;</code>) are skipped
   * silently.  Malformed input is tolerated rather than reported: illegal
   * characters inside tags are ignored, and markup that cannot be a tag is
   * handed back as text.
   *
   * The tokenizer never pushes back more than one byte at a time, so a
   * PushbackInputStream with the default one-byte buffer is sufficient.
   */
  class HTMLTokenizer
  {
    protected PushbackInputStream in;
    /** The tag of the last TT_START_TAG / TT_END_TAG token. */
    public HTMLTag tag;
    /** The text of the last TT_TEXT token. */
    public StringBuffer text;

    public final static int TT_EOF = -1;
    public final static int TT_START_TAG = -2;
    public final static int TT_END_TAG = -3;
    public final static int TT_TEXT = -4;

    /** Internal marker: a comment was consumed and no token was produced. */
    private final static int COMMENT_SKIPPED = Integer.MIN_VALUE;
    /** Largest numeric character reference that is decoded; larger ones yield '?'. */
    private final static int MAX_NCR = 255;

    /**
     * @param input the stream to tokenize; it is read one byte at a time
     */
    public HTMLTokenizer(PushbackInputStream input)
    {
      in = input;
    }

    /**
     * Reads the next token and stores its payload in {@link #tag} or
     * {@link #text}; both fields are replaced by fresh objects on every call.
     *
     * A text token ends in front of the next '&amp;' or '&lt;', so text that
     * contains character references arrives as several TT_TEXT tokens.
     *
     * @return TT_START_TAG, TT_END_TAG or TT_TEXT, or TT_EOF at end of input.
     *         TT_EOF is also returned when the input ends inside a tag.
     * @throws IOException if the underlying stream fails
     */
    public int nextToken() throws IOException
    {
      // A loop rather than recursion, so that a long run of comments cannot
      // exhaust the call stack.
      while (true) {
        tag = new HTMLTag();
        text = new StringBuffer();
        int c = in.read();
        if (c == -1) {
          return TT_EOF;
        }
        if (c == '&') {
          readReference();
          return TT_TEXT;
        }
        if (c == '<') {
          int token = readMarkup();
          if (token == COMMENT_SKIPPED) {
            continue;
          }
          return token;
        }
        text.append((char)c);
        readText();
        return TT_TEXT;
      }
    }

    /**
     * Handles the input after a '&amp;': decodes a numeric or named character
     * reference into {@link #text}, or keeps the characters literally when
     * they do not form a reference, then appends the text that follows.
     */
    private void readReference() throws IOException
    {
      int c = in.read();
      if (c == -1) {
        text.append('&');
        return;
      }
      if (c == '#') {
        c = in.read();
        if (c == -1) {
          text.append("&#");
        }
        else if (Character.isDigit((char)c)) {
          text.append(readNCR(c));
          readText();
        }
        else {
          literalText("&#", c);
        }
        return;
      }
      if (Character.isLetter((char)c)) {
        text.append(readGER(c));
        readText();
      }
      else {
        literalText("&", c);
      }
    }

    /**
     * Handles the input after a '&lt;'.
     *
     * @return the token type, or COMMENT_SKIPPED when a comment declaration
     *         was consumed and the caller should look for the next token
     */
    private int readMarkup() throws IOException
    {
      int c = in.read();
      if (c == -1) {
        text.append('<');
        return TT_TEXT;
      }
      if (c == '!') {
        c = in.read();
        if (c == -1) {
          text.append("<!");
          return TT_TEXT;
        }
        if (c == '>') {
          // The empty comment declaration <!> is legal.
          return COMMENT_SKIPPED;
        }
        if (c != '-') {
          return literalText("<!", c);
        }
        c = in.read();
        if (c == -1) {
          text.append("<!-");
          return TT_TEXT;
        }
        if (c == '-') {
          readCommentDecl();
          return COMMENT_SKIPPED;
        }
        return literalText("<!-", c);
      }
      if (c == '/') {
        c = in.read();
        if (c == -1) {
          text.append("</");
          return TT_TEXT;
        }
        if (Character.isLetter((char)c)) {
          return readEndTag(c) == 0 ? TT_END_TAG : TT_EOF;
        }
        return literalText("</", c);
      }
      if (Character.isLetter((char)c)) {
        return readStartTag(c) == 0 ? TT_START_TAG : TT_EOF;
      }
      return literalText("<", c);
    }

    /**
     * Emits characters that looked like the start of markup but were not, as
     * plain text.  The byte that broke the pattern is appended too, unless it
     * is itself '&amp;' or '&lt;'; in that case it is pushed back so that the
     * reference or tag it starts is still recognised by the next call.
     *
     * @param prefix the characters already consumed
     * @param c the byte following the prefix; never -1
     * @return always TT_TEXT
     */
    private int literalText(String prefix, int c) throws IOException
    {
      text.append(prefix);
      if (c == '&' || c == '<') {
        in.unread(c);
      }
      else {
        text.append((char)c);
      }
      readText();
      return TT_TEXT;
    }

    /** Appends input to {@link #text} up to, not including, '&amp;', '&lt;' or EOF. */
    private void readText() throws IOException
    {
      int c;
      for (c = in.read(); c != -1 && c != '&' && c != '<'; c = in.read()) {
        text.append((char)c);
      }
      if (c != -1) {
        in.unread(c);
      }
    }

    /**
     * Reads the rest of a numeric character reference.
     *
     * @param c the first digit, already consumed by the caller
     * @return the referenced character, or '?' when the number exceeds 255
     */
    private char readNCR(int c) throws IOException
    {
      // Accumulate with saturation instead of parsing the whole digit run, so
      // an arbitrarily long run cannot overflow or raise NumberFormatException.
      int value = Character.digit((char)c, 10);
      c = in.read();
      while (c != -1 && Character.isDigit((char)c)) {
        if (value <= MAX_NCR) {
          value = value * 10 + Character.digit((char)c, 10);
        }
        c = in.read();
      }
      // A terminating ';' or white space character is consumed with the
      // reference; any other terminator belongs to what follows.
      if (c != -1 && c != ';' && !isSpace(c)) {
        in.unread(c);
      }
      if (value > MAX_NCR) {
        return '?';
      }
      return (char)value;
    }

    /**
     * Reads the rest of a named character reference.
     *
     * @param c the first letter, already consumed by the caller
     * @return the referenced character for copy, reg, amp, gt, lt, quot and
     *         nbsp; '?' for every other name
     */
    private char readGER(int c) throws IOException
    {
      StringBuffer ger = new StringBuffer();
      readName(c, ger);
      c = in.read();
      // Same terminator handling as for numeric references.
      if (c != -1 && c != ';' && !isSpace(c)) {
        in.unread(c);
      }
      String entity = ger.toString();
      if (entity.equals("copy")) {
        return (char)169;
      }
      if (entity.equals("reg")) {
        return (char)174;
      }
      if (entity.equals("amp")) {
        return '&';
      }
      if (entity.equals("gt")) {
        return '>';
      }
      if (entity.equals("lt")) {
        return '<';
      }
      if (entity.equals("quot")) {
        return '"';
      }
      if (entity.equals("nbsp")) {
        return (char)160;
      }
      return '?';
    }

    /**
     * Skips a comment declaration whose opening "&lt;!--" has been consumed:
     * one or more "-- ... --" comments, up to and including the closing '&gt;'.
     * Other characters between the comments are ignored; EOF ends the skip.
     */
    private void readCommentDecl() throws IOException
    {
      readComment();
      int c;
      while ((c = in.read()) != -1) {
        if (c == '>') {
          return;
        }
        if (c == '-') {
          c = in.read();
          if (c == -1) {
            return;
          }
          if (c == '-') {
            readComment();
          }
        }
      }
    }

    /** Skips input up to and including the next "--" pair, or to EOF. */
    private void readComment() throws IOException
    {
      int c;
      while ((c = in.read()) != -1) {
        if (c != '-') {
          continue;
        }
        c = in.read();
        if (c == -1 || c == '-') {
          return;
        }
      }
    }

    /**
     * Reads an end tag into {@link #tag}.
     *
     * @param c the first letter of the tag name, already consumed
     * @return 0 when the closing '&gt;' was found, -1 on EOF inside the tag
     */
    private int readEndTag(int c) throws IOException
    {
      tag.start = false;
      if (readName(c, tag.name) != 0) {
        return -1;
      }
      // Anything between the name and '>' is illegal in an end tag; skip it.
      while ((c = in.read()) != -1) {
        if (c == '>') {
          return 0;
        }
      }
      return -1;
    }

    /**
     * Reads a start tag, including its attributes, into {@link #tag}.
     *
     * @param c the first letter of the tag name, already consumed
     * @return 0 when the closing '&gt;' was found, -1 on EOF inside the tag
     */
    private int readStartTag(int c) throws IOException
    {
      tag.start = true;
      if (readName(c, tag.name) != 0) {
        return -1;
      }
      readSpace();
      c = in.read();
      if (c == -1) {
        return -1;
      }
      return readAttributesValues(c);
    }

    /**
     * Reads the attribute list of a start tag up to the closing '&gt;'.
     *
     * @param c the first byte after the tag name and its trailing white space
     * @return 0 when the closing '&gt;' was found, -1 on EOF inside the tag
     */
    private int readAttributesValues(int c) throws IOException
    {
      while (c != -1) {
        // Skip anything that cannot start an attribute name.
        while (c != -1 && c != '>' && !Character.isLetter((char)c)) {
          c = in.read();
        }
        if (c == -1) {
          return -1;
        }
        if (c == '>') {
          return 0;
        }
        // c is the first letter of an attribute name.
        StringBuffer attribute = new StringBuffer();
        tag.attributes.addElement(attribute);
        if (readName(c, attribute) != 0) {
          return -1;
        }
        readSpace();
        c = in.read();
        if (c == '=') {
          if (readValue() != 0) {
            return -1;
          }
          readSpace();
          c = in.read();
        }
        else if (c != -1) {
          // Attribute without "=value": record an empty value.  c is looked
          // at again by the next round ('>' closes the tag there).
          tag.values.addElement(new StringBuffer());
        }
      }
      return -1;
    }

    /**
     * Appends a name to the given buffer: the first character plus every
     * following letter, digit, '-' or '.'.  The terminating byte is pushed
     * back.
     *
     * @param c the first character of the name, already consumed
     * @param name the buffer to append to
     * @return 0 on success, -1 when EOF was hit while reading the name
     */
    private int readName(int c, StringBuffer name) throws IOException
    {
      name.append((char)c);
      for (c = in.read(); c != -1 && isNameChar(c); c = in.read()) {
        name.append((char)c);
      }
      if (c == -1) {
        return -1;
      }
      in.unread(c);
      return 0;
    }

    /**
     * Reads the value following an attribute's '=' and appends it to
     * <code>tag.values</code>.  A value is either a quoted literal (stored
     * with its quotes) or a bare name token.  White space and illegal
     * characters in front of the value are skipped.  A '&gt;' in that position
     * ends the tag: the value is then empty and the '&gt;' is pushed back for
     * the caller.
     *
     * Character references inside values are not decoded, and an unquoted
     * value is cut at the first character that is not a letter, digit, '-'
     * or '.'.
     *
     * @return 0 on success, -1 on EOF
     */
    private int readValue() throws IOException
    {
      int c = in.read();
      while (c != -1 && c != '>' && !isValueStart(c)) {
        c = in.read();
      }
      if (c == -1) {
        return -1;
      }
      StringBuffer value = new StringBuffer();
      tag.values.addElement(value);
      if (c == '>') {
        in.unread(c);
        return 0;
      }
      if (c == '"' || c == '\'') {
        return readLiteral(c);
      }
      return readName(c, value);
    }

    /**
     * Reads a quoted literal into the last element of <code>tag.values</code>.
     * The quotes are stored as part of the value.
     *
     * @param c the opening quote character, already consumed
     * @return 0 when the closing quote was found, -1 on EOF
     */
    private int readLiteral(int c) throws IOException
    {
      char delimiter = (char)c;
      StringBuffer literal = (StringBuffer)(tag.values.lastElement());
      literal.append(delimiter);
      while ((c = in.read()) != -1 && c != delimiter) {
        literal.append((char)c);
      }
      if (c == -1) {
        return -1;
      }
      literal.append(delimiter);
      return 0;
    }

    /** Skips white space; the first other byte is pushed back. */
    private void readSpace() throws IOException
    {
      int c = in.read();
      while (c != -1 && isSpace(c)) {
        c = in.read();
      }
      if (c != -1) {
        in.unread(c);
      }
    }

    /** True for space, tab, newline, form feed and carriage return. */
    private static boolean isSpace(int c)
    {
      return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r';
    }

    /** True for the characters allowed after the first character of a name. */
    private static boolean isNameChar(int c)
    {
      return Character.isLetterOrDigit((char)c) || c == '-' || c == '.';
    }

    /** True for a character that can begin an attribute value. */
    private static boolean isValueStart(int c)
    {
      return c == '"' || c == '\'' || isNameChar(c);
    }
  }