/node_modules/jsdom/node_modules/htmlparser2/lib/Tokenizer.js

https://bitbucket.org/itsTamas/crawler · JavaScript · 546 lines · 471 code · 37 blank · 38 comment · 285 complexity · 394b91c1983ba8d36c3dd852235bcd9a MD5 · raw file

  1. module.exports = Tokenizer;
  2. var i = 0,
  3. TEXT = i++,
  4. BEFORE_TAG_NAME = i++, //after <
  5. IN_TAG_NAME = i++,
  6. BEFORE_CLOSING_TAG_NAME = i++,
  7. IN_CLOSING_TAG_NAME = i++,
  8. AFTER_CLOSING_TAG_NAME = i++,
  9. //attributes
  10. BEFORE_ATTRIBUTE_NAME = i++,
  11. IN_ATTRIBUTE_NAME = i++,
  12. AFTER_ATTRIBUTE_NAME = i++,
  13. BEFORE_ATTRIBUTE_VALUE = i++,
  14. IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES = i++, // "
  15. IN_ATTRIBUTE_VALUE_SINGLE_QUOTES = i++, // '
  16. IN_ATTRIBUTE_VALUE_NO_QUOTES = i++,
  17. //declarations
  18. BEFORE_DECLARATION = i++, // !
  19. IN_DECLARATION = i++,
  20. //processing instructions
  21. IN_PROCESSING_INSTRUCTION = i++, // ?
  22. //comments
  23. BEFORE_COMMENT = i++,
  24. IN_COMMENT = i++,
  25. AFTER_COMMENT_1 = i++,
  26. AFTER_COMMENT_2 = i++,
  27. //cdata
  28. BEFORE_CDATA_1 = i++, // [
  29. BEFORE_CDATA_2 = i++, // C
  30. BEFORE_CDATA_3 = i++, // D
  31. BEFORE_CDATA_4 = i++, // A
  32. BEFORE_CDATA_5 = i++, // T
  33. BEFORE_CDATA_6 = i++, // A
  34. IN_CDATA = i++,// [
  35. AFTER_CDATA_1 = i++, // ]
  36. AFTER_CDATA_2 = i++, // ]
  37. //special tags
  38. BEFORE_SPECIAL = i++, //S
  39. BEFORE_SPECIAL_END = i++, //S
  40. BEFORE_SCRIPT_1 = i++, //C
  41. BEFORE_SCRIPT_2 = i++, //R
  42. BEFORE_SCRIPT_3 = i++, //I
  43. BEFORE_SCRIPT_4 = i++, //P
  44. BEFORE_SCRIPT_5 = i++, //T
  45. AFTER_SCRIPT_1 = i++, //C
  46. AFTER_SCRIPT_2 = i++, //R
  47. AFTER_SCRIPT_3 = i++, //I
  48. AFTER_SCRIPT_4 = i++, //P
  49. AFTER_SCRIPT_5 = i++, //T
  50. BEFORE_STYLE_1 = i++, //T
  51. BEFORE_STYLE_2 = i++, //Y
  52. BEFORE_STYLE_3 = i++, //L
  53. BEFORE_STYLE_4 = i++, //E
  54. AFTER_STYLE_1 = i++, //T
  55. AFTER_STYLE_2 = i++, //Y
  56. AFTER_STYLE_3 = i++, //L
  57. AFTER_STYLE_4 = i++; //E
  58. function whitespace(c){
  59. return c === " " || c === "\t" || c === "\r" || c === "\n";
  60. }
  61. function Tokenizer(options, cbs){
  62. this._state = TEXT;
  63. this._buffer = "";
  64. this._sectionStart = 0;
  65. this._index = 0;
  66. this._options = options;
  67. this._special = 0; // 1 for script, 2 for style
  68. this._cbs = cbs;
  69. this._running = true;
  70. }
  71. //TODO make events conditional
  72. Tokenizer.prototype.write = function(chunk){
  73. this._buffer += chunk;
  74. while(this._index < this._buffer.length && this._running){
  75. var c = this._buffer.charAt(this._index);
  76. if(this._state === TEXT){
  77. if(c === "<"){
  78. this._emitIfToken("text");
  79. this._state = BEFORE_TAG_NAME;
  80. this._sectionStart = this._index;
  81. }
  82. } else if(this._state === BEFORE_TAG_NAME){
  83. if(c === "/"){
  84. this._state = BEFORE_CLOSING_TAG_NAME;
  85. } else if(c === ">" || this._special > 0) {
  86. this._state = TEXT;
  87. } else {
  88. if(whitespace(c));
  89. else if(c === "!"){
  90. this._state = BEFORE_DECLARATION;
  91. this._sectionStart = this._index + 1;
  92. } else if(c === "?"){
  93. this._state = IN_PROCESSING_INSTRUCTION;
  94. this._sectionStart = this._index + 1;
  95. } else if(
  96. (!this._options || !this._options.xmlMode) &&
  97. (c === "s" || c === "S")
  98. ){
  99. this._state = BEFORE_SPECIAL;
  100. this._sectionStart = this._index;
  101. } else {
  102. this._state = IN_TAG_NAME;
  103. this._sectionStart = this._index;
  104. }
  105. }
  106. } else if(this._state === IN_TAG_NAME){
  107. if(c === "/"){
  108. this._emitToken("opentagname");
  109. this._cbs.onselfclosingtag();
  110. this._state = AFTER_CLOSING_TAG_NAME;
  111. } else if(c === ">"){
  112. this._emitToken("opentagname");
  113. this._cbs.onopentagend();
  114. this._state = TEXT;
  115. this._sectionStart = this._index + 1;
  116. } else if(whitespace(c)){
  117. this._emitToken("opentagname");
  118. this._state = BEFORE_ATTRIBUTE_NAME;
  119. }
  120. } else if(this._state === BEFORE_CLOSING_TAG_NAME){
  121. if(whitespace(c));
  122. else if(c === ">"){
  123. this._state = TEXT;
  124. } else if(this._special > 0){
  125. if(c === "s" || c === "S"){
  126. this._state = BEFORE_SPECIAL_END;
  127. }
  128. } else {
  129. this._state = IN_CLOSING_TAG_NAME;
  130. this._sectionStart = this._index;
  131. }
  132. } else if(this._state === IN_CLOSING_TAG_NAME){
  133. if(c === ">"){
  134. this._emitToken("closetag");
  135. this._state = TEXT;
  136. this._sectionStart = this._index + 1;
  137. this._special = 0;
  138. } else if(whitespace(c)){
  139. this._emitToken("closetag");
  140. this._state = AFTER_CLOSING_TAG_NAME;
  141. this._special = 0;
  142. }
  143. } else if(this._state === AFTER_CLOSING_TAG_NAME){
  144. //skip everything until ">"
  145. if(c === ">"){
  146. this._state = TEXT;
  147. this._sectionStart = this._index + 1;
  148. }
  149. }
  150. /*
  151. * attributes
  152. */
  153. else if(this._state === BEFORE_ATTRIBUTE_NAME){
  154. if(c === ">"){
  155. this._state = TEXT;
  156. this._cbs.onopentagend();
  157. this._sectionStart = this._index + 1;
  158. } else if(c === "/"){
  159. this._cbs.onselfclosingtag();
  160. this._state = AFTER_CLOSING_TAG_NAME;
  161. } else if(!whitespace(c)){
  162. this._state = IN_ATTRIBUTE_NAME;
  163. this._sectionStart = this._index;
  164. }
  165. } else if(this._state === IN_ATTRIBUTE_NAME){
  166. if(c === "="){
  167. this._emitIfToken("attribname");
  168. this._state = BEFORE_ATTRIBUTE_VALUE;
  169. } else if(whitespace(c)){
  170. this._emitIfToken("attribname");
  171. this._state = AFTER_ATTRIBUTE_NAME;
  172. } else if(c === "/" || c === ">"){
  173. this._emitIfToken("attribname");
  174. this._state = BEFORE_ATTRIBUTE_NAME;
  175. this._index--;
  176. }
  177. } else if(this._state === AFTER_ATTRIBUTE_NAME){
  178. if(c === "="){
  179. this._state = BEFORE_ATTRIBUTE_VALUE;
  180. } else if(c === "/" || c === ">"){
  181. this._state = BEFORE_ATTRIBUTE_NAME;
  182. this._index--;
  183. } else if(!whitespace(c)){
  184. this._state = IN_ATTRIBUTE_NAME;
  185. this._sectionStart = this._index;
  186. }
  187. } else if(this._state === BEFORE_ATTRIBUTE_VALUE){
  188. if(c === "\""){
  189. this._state = IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES;
  190. this._sectionStart = this._index + 1;
  191. } else if(c === "'"){
  192. this._state = IN_ATTRIBUTE_VALUE_SINGLE_QUOTES;
  193. this._sectionStart = this._index + 1;
  194. } else if(!whitespace(c)){
  195. this._state = IN_ATTRIBUTE_VALUE_NO_QUOTES;
  196. this._sectionStart = this._index;
  197. }
  198. } else if(this._state === IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES){
  199. if(c === "\""){
  200. this._emitToken("attribvalue");
  201. this._state = BEFORE_ATTRIBUTE_NAME;
  202. }
  203. } else if(this._state === IN_ATTRIBUTE_VALUE_SINGLE_QUOTES){
  204. if(c === "'"){
  205. this._emitToken("attribvalue");
  206. this._state = BEFORE_ATTRIBUTE_NAME;
  207. }
  208. } else if(this._state === IN_ATTRIBUTE_VALUE_NO_QUOTES){
  209. if(c === ">"){
  210. this._emitToken("attribvalue");
  211. this._state = TEXT;
  212. this._cbs.onopentagend();
  213. this._sectionStart = this._index + 1;
  214. } else if(whitespace(c)){
  215. this._emitToken("attribvalue");
  216. this._state = BEFORE_ATTRIBUTE_NAME;
  217. }
  218. }
  219. /*
  220. * declarations
  221. */
  222. else if(this._state === BEFORE_DECLARATION){
  223. if(c === "[") this._state = BEFORE_CDATA_1;
  224. else if(c === "-") this._state = BEFORE_COMMENT;
  225. else this._state = IN_DECLARATION;
  226. } else if(this._state === IN_DECLARATION){
  227. if(c === ">"){
  228. this._emitToken("declaration");
  229. this._state = TEXT;
  230. this._sectionStart = this._index + 1;
  231. }
  232. }
  233. /*
  234. * processing instructions
  235. */
  236. else if(this._state === IN_PROCESSING_INSTRUCTION){
  237. if(c === ">"){
  238. this._emitToken("processinginstruction");
  239. this._state = TEXT;
  240. this._sectionStart = this._index + 1;
  241. }
  242. }
  243. /*
  244. * comments
  245. */
  246. else if(this._state === BEFORE_COMMENT){
  247. if(c === "-"){
  248. this._state = IN_COMMENT;
  249. this._sectionStart = this._index + 1;
  250. } else {
  251. this._state = IN_DECLARATION;
  252. }
  253. } else if(this._state === IN_COMMENT){
  254. if(c === "-") this._state = AFTER_COMMENT_1;
  255. } else if(this._state === AFTER_COMMENT_1){
  256. if(c === "-") this._state = AFTER_COMMENT_2;
  257. else this._state = IN_COMMENT;
  258. } else if(this._state === AFTER_COMMENT_2){
  259. if(c === ">"){
  260. //remove 2 trailing chars
  261. this._cbs.oncomment(this._buffer.substring(this._sectionStart, this._index - 2));
  262. this._state = TEXT;
  263. this._sectionStart = this._index + 1;
  264. } else {
  265. this._state = IN_COMMENT;
  266. }
  267. }
  268. /*
  269. * cdata
  270. */
  271. else if(this._state === BEFORE_CDATA_1){
  272. if(c === "C") this._state = BEFORE_CDATA_2;
  273. else this._state = IN_DECLARATION;
  274. } else if(this._state === BEFORE_CDATA_2){
  275. if(c === "D") this._state = BEFORE_CDATA_3;
  276. else this._state = IN_DECLARATION;
  277. } else if(this._state === BEFORE_CDATA_3){
  278. if(c === "A") this._state = BEFORE_CDATA_4;
  279. else this._state = IN_DECLARATION;
  280. } else if(this._state === BEFORE_CDATA_4){
  281. if(c === "T") this._state = BEFORE_CDATA_5;
  282. else this._state = IN_DECLARATION;
  283. } else if(this._state === BEFORE_CDATA_5){
  284. if(c === "A") this._state = BEFORE_CDATA_6;
  285. else this._state = IN_DECLARATION;
  286. } else if(this._state === BEFORE_CDATA_6){
  287. if(c === "["){
  288. this._state = IN_CDATA;
  289. this._sectionStart = this._index + 1;
  290. } else {
  291. this._state = IN_DECLARATION;
  292. }
  293. } else if(this._state === IN_CDATA){
  294. if(c === "]") this._state = AFTER_CDATA_1;
  295. } else if(this._state === AFTER_CDATA_1){
  296. if(c === "]") this._state = AFTER_CDATA_2;
  297. else this._state = IN_CDATA;
  298. } else if(this._state === AFTER_CDATA_2){
  299. if(c === ">"){
  300. //remove 2 trailing chars
  301. this._cbs.oncdata(this._buffer.substring(this._sectionStart, this._index - 2));
  302. this._state = TEXT;
  303. this._sectionStart = this._index + 1;
  304. } else {
  305. this._state = IN_CDATA;
  306. }
  307. }
  308. /*
  309. * special tags
  310. */
  311. else if(this._state === BEFORE_SPECIAL){
  312. if(c === "c" || c === "C"){
  313. this._state = BEFORE_SCRIPT_1;
  314. } else if(c === "t" || c === "T"){
  315. this._state = BEFORE_STYLE_1;
  316. } else {
  317. this._state = IN_TAG_NAME;
  318. this._index--; //consume the token again
  319. }
  320. } else if(this._state === BEFORE_SPECIAL_END){
  321. if(this._special === 1 && (c === "c" || c === "C")){
  322. this._state = AFTER_SCRIPT_1;
  323. } else if(this._special === 2 && (c === "t" || c === "T")){
  324. this._state = AFTER_STYLE_1;
  325. }
  326. else this._state = TEXT;
  327. }
  328. /*
  329. * script
  330. */
  331. else if(this._state === BEFORE_SCRIPT_1){
  332. if(c === "r" || c === "R"){
  333. this._state = BEFORE_SCRIPT_2;
  334. } else {
  335. this._state = IN_TAG_NAME;
  336. this._index--; //consume the token again
  337. }
  338. } else if(this._state === BEFORE_SCRIPT_2){
  339. if(c === "i" || c === "I"){
  340. this._state = BEFORE_SCRIPT_3;
  341. } else {
  342. this._state = IN_TAG_NAME;
  343. this._index--; //consume the token again
  344. }
  345. } else if(this._state === BEFORE_SCRIPT_3){
  346. if(c === "p" || c === "P"){
  347. this._state = BEFORE_SCRIPT_4;
  348. } else {
  349. this._state = IN_TAG_NAME;
  350. this._index--; //consume the token again
  351. }
  352. } else if(this._state === BEFORE_SCRIPT_4){
  353. if(c === "t" || c === "T"){
  354. this._state = BEFORE_SCRIPT_5;
  355. } else {
  356. this._state = IN_TAG_NAME;
  357. this._index--; //consume the token again
  358. }
  359. } else if(this._state === BEFORE_SCRIPT_5){
  360. if(c === "/" || c === ">" || whitespace(c)){
  361. this._special = 1;
  362. }
  363. this._state = IN_TAG_NAME;
  364. this._index--; //consume the token again
  365. }
  366. else if(this._state === AFTER_SCRIPT_1){
  367. if(c === "r" || c === "R"){
  368. this._state = AFTER_SCRIPT_2;
  369. }
  370. else this._state = TEXT;
  371. } else if(this._state === AFTER_SCRIPT_2){
  372. if(c === "i" || c === "I"){
  373. this._state = AFTER_SCRIPT_3;
  374. }
  375. else this._state = TEXT;
  376. } else if(this._state === AFTER_SCRIPT_3){
  377. if(c === "p" || c === "P"){
  378. this._state = AFTER_SCRIPT_4;
  379. }
  380. else this._state = TEXT;
  381. } else if(this._state === AFTER_SCRIPT_4){
  382. if(c === "t" || c === "T"){
  383. this._state = AFTER_SCRIPT_5;
  384. }
  385. else this._state = TEXT;
  386. } else if(this._state === AFTER_SCRIPT_5){
  387. if(c === ">" || whitespace(c)){
  388. this._state = IN_CLOSING_TAG_NAME;
  389. this._sectionStart = this._index - 6;
  390. this._index--; //reconsume the token
  391. }
  392. else this._state = TEXT;
  393. }
  394. /*
  395. * style
  396. */
  397. else if(this._state === BEFORE_STYLE_1){
  398. if(c === "y" || c === "Y"){
  399. this._state = BEFORE_STYLE_2;
  400. } else {
  401. this._state = IN_TAG_NAME;
  402. this._index--; //consume the token again
  403. }
  404. } else if(this._state === BEFORE_STYLE_2){
  405. if(c === "l" || c === "L"){
  406. this._state = BEFORE_STYLE_3;
  407. } else {
  408. this._state = IN_TAG_NAME;
  409. this._index--; //consume the token again
  410. }
  411. } else if(this._state === BEFORE_STYLE_3){
  412. if(c === "e" || c === "E"){
  413. this._state = BEFORE_STYLE_4;
  414. } else {
  415. this._state = IN_TAG_NAME;
  416. this._index--; //consume the token again
  417. }
  418. } else if(this._state === BEFORE_STYLE_4){
  419. if(c === "/" || c === ">" || whitespace(c)){
  420. this._special = 2;
  421. }
  422. this._state = IN_TAG_NAME;
  423. this._index--; //consume the token again
  424. }
  425. else if(this._state === AFTER_STYLE_1){
  426. if(c === "y" || c === "Y"){
  427. this._state = AFTER_STYLE_2;
  428. }
  429. else this._state = TEXT;
  430. } else if(this._state === AFTER_STYLE_2){
  431. if(c === "l" || c === "L"){
  432. this._state = AFTER_STYLE_3;
  433. }
  434. else this._state = TEXT;
  435. } else if(this._state === AFTER_STYLE_3){
  436. if(c === "e" || c === "E"){
  437. this._state = AFTER_STYLE_4;
  438. }
  439. else this._state = TEXT;
  440. } else if(this._state === AFTER_STYLE_4){
  441. if(c === ">" || whitespace(c)){
  442. this._state = IN_CLOSING_TAG_NAME;
  443. this._sectionStart = this._index - 5;
  444. this._index--; //reconsume the token
  445. }
  446. else this._state = TEXT;
  447. }
  448. else {
  449. this._cbs.onerror(Error("unknown state"), this._state);
  450. }
  451. this._index++;
  452. }
  453. //cleanup
  454. if(this._sectionStart === -1){
  455. this._buffer = "";
  456. this._index = 0;
  457. } else {
  458. if(this._state === TEXT){
  459. this._emitIfToken("text");
  460. this._buffer = "";
  461. this._index = 0;
  462. } else if(this._sectionStart === this._index){
  463. //the section just started
  464. this._buffer = "";
  465. this._index = 0;
  466. } else if(this._sectionStart > 0){
  467. //remove everything unnecessary
  468. this._buffer = this._buffer.substr(this._sectionStart);
  469. this._index -= this._sectionStart;
  470. }
  471. this._sectionStart = 0;
  472. }
  473. };
  474. Tokenizer.prototype.pause = function(){
  475. this._running = false;
  476. };
  477. Tokenizer.prototype.resume = function(){
  478. this._running = true;
  479. };
  480. Tokenizer.prototype.end = function(chunk){
  481. if(chunk) this.write(chunk);
  482. //if there is remaining data, emit it in a reasonable way
  483. if(this._buffer === "" || this._sectionStart === -1 || this._sectionStart === this._index);
  484. else if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){
  485. this._emitIfToken("cdata");
  486. } else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){
  487. this._emitIfToken("comment");
  488. } else if(this._state === IN_TAG_NAME){
  489. this._emitIfToken("opentagname");
  490. } else if(this._state === IN_CLOSING_TAG_NAME){
  491. this._emitIfToken("closetag");
  492. } else {
  493. this._emitIfToken("text");
  494. }
  495. this._cbs.onend();
  496. };
  497. Tokenizer.prototype.reset = function(){
  498. Tokenizer.call(this, this._options, this._cbs);
  499. };
  500. Tokenizer.prototype._emitToken = function(name){
  501. this._cbs["on" + name](this._buffer.substring(this._sectionStart, this._index));
  502. this._sectionStart = -1;
  503. };
  504. Tokenizer.prototype._emitIfToken = function(name){
  505. if(this._index > this._sectionStart){
  506. this._cbs["on" + name](this._buffer.substring(this._sectionStart, this._index));
  507. }
  508. this._sectionStart = -1;
  509. };