/TextProcessor/HTMLWikiProcessor.cs

http://wikiprepsharp.codeplex.com · C# · 673 lines · 563 code · 18 blank · 92 comment · 174 complexity · c4efc7b4eea4598a30f8a8128b88575e MD5 · raw file

  1. //Copyright (c) Microsoft Corporation
  2. //
  3. //All rights reserved.
  4. //
  5. //Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
  6. //
  7. //THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABLITY OR NON-INFRINGEMENT.
  8. //
  9. //See the Apache Version 2.0 License for specific language governing permissions and limitations under the License.
  10. using System;
  11. using System.Collections.Generic;
  12. using System.Linq;
  13. using System.Text;
  14. namespace TextProcessor
  15. {
  16. public class HTMLWikiProcessor
  17. {
  // Helper objects created once in the constructor and reused for every document.
  private TagProcessor tp;
  private BufferProcessor bp;
  private WordProcessor wp;
  private WikiProcessor wikip;
  // Word hashes checked by AddWords; a word whose hash is in this set is recorded as a stop word.
  private HashSet<int> exclusions;
  // Buffer handed to wikip.GetSegment by reference; its contents are replayed through the parsing loops.
  private char[] extrawords;
  // Number of characters in extrawords still waiting to be replayed (0 means nothing pending).
  private int extraword_counter;
  // True when the word most recently passed to content.AddWord was a stop word.
  private bool lastwordstopstatus;
  // Forwarded unchanged to wp.GetStemmedCharArrays.
  private bool ignoreintegers;
  // Output object installed through LoadDecodedTextClass.
  private DecodedTextClass content;
  // Division counter, reset at the start of each document and bumped by sticky/division tags.
  private int currentdivision;
  // Builds a processor with its own tag, buffer, word and wiki helpers.
  //   exclusions     - hashes of words that AddWords records as stop words
  //   ignoreintegers - forwarded to the word processor when words are extracted
  public HTMLWikiProcessor(HashSet<int> exclusions, bool ignoreintegers)
  {
      // parsing helpers (buffer and word processors are created with sizes 20 and 150)
      tp = new TagProcessor();
      bp = new BufferProcessor(20);
      wp = new WordProcessor(150);
      wikip = new WikiProcessor();

      // replay buffer for characters handed back by the wiki processor; starts empty
      extrawords = new char[500];
      extraword_counter = 0;

      // caller-supplied configuration
      this.ignoreintegers = ignoreintegers;
      this.exclusions = exclusions;
  }
  // Installs the output object that the Process*/Add* methods write words and
  // wiki constructs into. Must be called before any document is processed,
  // because those methods dereference the stored object.
  // The argument is only read here; the caller's variable is never reassigned.
  public void LoadDecodedTextClass(ref DecodedTextClass content)
  {
      DecodedTextClass target = content;
      this.content = target;
  }
  // Walks through the document one character at a time and feeds the words it
  // finds outside of tags to AddWords. When content.wikimedia is set, text
  // enclosed in {{ }} or [[ ]] is routed to the wiki processor instead.
  // One extra blank is processed after the last character of html so that a
  // word ending exactly at the end of the document is still flushed.
  public void ProcessHTML(string html)
  {
      // reset the per-document state
      currentdivision = 0;
      bp.initialize();
      extraword_counter = 0;
      lastwordstopstatus = false;
      wp.initialize();

      bool insideTag = false;      // between '<' and '>'
      bool insideWiki = false;     // between {{ }} or [[ ]]
      bool wikiIsCurly = false;    // which of the two wiki bracket kinds is open
      bool isOpeningTag = false;   // current tag has not been recognised as a closing tag
      bool feedTagName = false;    // tag name characters are still being sent to the tag processor
      int hiddenTag = -1;          // id of the open hidden tag, -1 if none
      int stickyTag = -1;          // id of the open sticky tag, -1 if none
      int lastTag = -1;            // id of the tag read most recently
      int position = 0;
      int replayIndex = 0;

      while (extraword_counter > 0 || position <= html.Length)
      {
          char letter;
          if (extraword_counter > 0)
          {
              // characters queued by AddWikiSegments take priority over the document
              letter = extrawords[replayIndex];
              replayIndex++;
              if (replayIndex == extraword_counter)
              {
                  extraword_counter = 0;
                  replayIndex = 0;
              }
          }
          else
          {
              letter = position < html.Length ? html[position] : ' ';
              position++;
          }

          // wiki markup is looked at before any html handling
          if (content.wikimedia && ConsumeWikiLetter(letter, ref insideWiki, ref wikiIsCurly))
          {
              continue;
          }

          if (letter == '<')
          {
              // a tag starts: terminate and emit the word in progress
              wp.AddLetter(' ');
              AddWords(!wp.ValidAsciiOrInteger(), currentdivision, stickyTag, hiddenTag);
              insideTag = true;
              isOpeningTag = true;
              feedTagName = true;
              tp.initialize();
          }
          else if (letter == '/' && insideTag)
          {
              // a slash directly after '<' marks a closing tag; other slashes inside a tag are skipped
              if (bp.GetPastLetter(-1) == '<')
              {
                  isOpeningTag = false;
              }
          }
          else if (letter == '>')
          {
              // the tag name may end right at the bracket
              if (feedTagName && !tp.AddLetter(' '))
              {
                  CaptureFinishedTag(isOpeningTag, ref lastTag, ref hiddenTag, ref stickyTag);
              }
              // a slash directly before '>' means the tag closes itself
              if (bp.GetPastLetter(-1) == '/')
              {
                  isOpeningTag = false;
              }
              // a closing tag ends the hidden / sticky section it belongs to
              if (!isOpeningTag && insideTag)
              {
                  if (hiddenTag == lastTag)
                  {
                      hiddenTag = -1;
                  }
                  if (stickyTag == lastTag)
                  {
                      stickyTag = -1;
                  }
              }
              insideTag = false;
              isOpeningTag = false;
              feedTagName = false;
              wp.initialize();
          }
          else if (insideTag)
          {
              // tag name characters go to the tag processor until it reports the end of the name
              if (feedTagName && !tp.AddLetter(letter))
              {
                  CaptureFinishedTag(isOpeningTag, ref lastTag, ref hiddenTag, ref stickyTag);
                  feedTagName = false;
              }
          }
          else if (hiddenTag == -1)
          {
              // plain text outside hidden sections; a rejected letter ends the current word
              if (!wp.AddLetter(letter))
              {
                  AddWords(!wp.ValidAsciiOrInteger(), currentdivision, stickyTag, hiddenTag);
                  wp.initialize();
              }
          }

          // every character that was not consumed as wiki markup is remembered here
          bp.AddLetter(letter);
      }
  }

  // Handles one character while wiki markup detection is enabled.
  // Returns true when the character was consumed as wiki markup (it has then
  // already been added to the buffer processor), false when the caller should
  // treat it as ordinary html.
  private bool ConsumeWikiLetter(char letter, ref bool wikimode, ref bool wikimode_curly)
  {
      // a doubled opening bracket outside wiki mode starts a wiki segment
      bool opensCurly = letter == '{' && bp.GetPastLetter(-1) == '{' && !wikimode;
      bool opensSquare = !opensCurly && letter == '[' && bp.GetPastLetter(-1) == '[' && !wikimode;
      if (opensCurly || opensSquare)
      {
          wikimode = true;
          wikimode_curly = opensCurly;
          wikip.reset(wikimode_curly);
          bp.AddLetter(letter);
          return true;
      }

      if (!wikimode)
      {
          return false;
      }

      // a doubled closing bracket of the matching kind ends the segment
      char closer = wikimode_curly ? '}' : ']';
      if (letter == closer && bp.GetPastLetter(-1) == closer)
      {
          wikip.Close();
          AddWikiSegments();
          bp.AddLetter(letter);
          wikimode = false;
          return true;
      }

      // anything else inside the segment is collected by the wiki processor
      wikip.AddLetter(letter);
      bp.AddLetter(letter);
      return true;
  }

  // Reads the tag the tag processor has just finished and updates the
  // hidden / sticky / division bookkeeping for it.
  private void CaptureFinishedTag(bool openingtag, ref int currenttag, ref int currenthiddentag, ref int currentstickytag)
  {
      bool ishidden, issticky, isdivision;
      tp.GetTag(out currenttag, out ishidden, out issticky, out isdivision);

      if (openingtag)
      {
          // only the outermost hidden tag is tracked
          if (ishidden && currenthiddentag == -1)
          {
              currenthiddentag = currenttag;
          }
          // only the outermost sticky tag is tracked; it also starts a new division
          if (issticky && currentstickytag == -1)
          {
              currentstickytag = currenttag;
              currentdivision++;
          }
      }

      // division tags advance the division counter whether opening or closing
      if (isdivision)
      {
          currentdivision++;
      }
  }
  // Tag id recorded for each div nesting level (up to 100 levels) by ProcessDivHTML.
  private int[] tag_hierarchy = new int[100];
  // Like ProcessHTML, but words are only emitted while inside a sticky tag or
  // directly inside a div whose first attribute matches one of divfilters.
  //
  //   html       - document text to scan
  //   divfilters - first-attribute texts to match, compared exactly as they
  //                appear in the tag (for example class="content"); '/'
  //                characters inside a tag are never accumulated
  //
  // Text in divs nested below the matched div is not emitted; emission resumes
  // when the nested div closes and stops for good when the matched div closes.
  // One extra blank is processed after the last character so a trailing word
  // is flushed. Wiki markup is not interpreted by this method.
  //
  // Fixes relative to the previous version:
  //   * a match found at the blank after the first attribute now records the
  //     same depth (depth + 1) as a match found at '>', so the content of the
  //     matched div is emitted instead of the content following it;
  //   * the attribute accumulator is cleared whenever a tag name ends, also
  //     when it ends directly at '>', so text left over from an earlier tag
  //     can no longer match a later one;
  //   * unbalanced closing divs (negative depth) no longer cause an
  //     out-of-range write into tag_hierarchy.
  public void ProcessDivHTML(string html, HashSet<string> divfilters)
  {
      currentdivision = 0;
      int depth = 0;                       // current div nesting level
      bp.initialize();
      extraword_counter = 0;
      lastwordstopstatus = false;

      bool div_matchflag = false;          // inside a div that matched a filter (at any depth)
      bool div_matchflag_deep = false;     // at exactly the level of the matched div: words are emitted
      int div_reading = 0;                 // 0 = no attribute text yet, 1 = accumulating, 2 = first attribute evaluated
      int div_depth = 0;                   // depth value while directly inside the matched div
      string div_tagname = "";             // accumulated first attribute text of the current div
      int div_tagid = tp.tagidlist["div"];

      bool tagmode = false;                // between '<' and '>'
      wp.initialize();
      bool openingtag = false;
      bool calltagprocessor = false;       // tag name is still being fed to the tag processor
      int currenthiddentag = -1;
      int currentstickytag = -1;
      bool issticky, isdivision, ishidden;
      int currenttag = -1;
      int poscounter = 0;
      int extraword_i = 0;

      while (extraword_counter > 0 || poscounter <= html.Length)
      {
          char letter;
          if (extraword_counter > 0)
          {
              // replay queued characters first
              letter = extrawords[extraword_i];
              extraword_i++;
              if (extraword_i == extraword_counter)
              {
                  extraword_counter = 0;
                  extraword_i = 0;
              }
          }
          else
          {
              if (poscounter < html.Length)
              {
                  letter = html[poscounter];
              }
              else
              {
                  // sentinel blank that flushes the last word
                  letter = ' ';
              }
              poscounter++;
          }

          switch (letter)
          {
              case '<':
                  // a tag starts: terminate the word in progress and emit it if we are in a wanted region
                  wp.AddLetter(' ');
                  bool stopBeforeTag = !wp.ValidAsciiOrInteger();
                  if ((currentstickytag != -1) || div_matchflag_deep)
                  {
                      AddWords(stopBeforeTag, currentdivision, currentstickytag, currenthiddentag);
                  }
                  tagmode = true;
                  openingtag = true;
                  calltagprocessor = true;
                  tp.initialize();
                  break;

              case '/':
                  if (tagmode)
                  {
                      // a slash directly after '<' marks a closing tag; other slashes in a tag are skipped
                      if (bp.GetPastLetter(-1) == '<')
                      {
                          openingtag = false;
                      }
                      break;
                  }
                  goto default;

              case '>':
                  if (calltagprocessor)
                  {
                      if (!tp.AddLetter(' '))
                      {
                          // the tag name ended right at the bracket
                          tp.GetTag(out currenttag, out ishidden, out issticky, out isdivision);
                          // no attribute was read for this tag: drop anything left from an earlier tag
                          div_tagname = "";
                          div_reading = 0;
                          if (ishidden && (currenthiddentag == -1) && openingtag)
                          {
                              currenthiddentag = currenttag;
                          }
                          if (issticky && (currentstickytag == -1) && openingtag)
                          {
                              currentstickytag = currenttag;
                              currentdivision++;
                          }
                          if (isdivision)
                          {
                              currentdivision++;
                          }
                          calltagprocessor = false;
                      }
                  }

                  if (bp.GetPastLetter(-1) == '/')
                  {
                      // self-closing tag: no change to the div nesting
                      openingtag = false;
                  }
                  else if (openingtag)
                  {
                      // the first attribute may end at the bracket without a blank after it
                      if (div_reading == 1)
                      {
                          if (divfilters.Contains(div_tagname) && (depth < tag_hierarchy.Length) && (!div_matchflag))
                          {
                              div_matchflag = true;
                              div_matchflag_deep = true;
                              div_depth = depth + 1;
                          }
                          div_reading = 2;
                      }
                      if (currenttag == div_tagid)
                      {
                          // depth can be negative after unbalanced closing tags, so guard both ends
                          if ((depth >= 0) && (depth < tag_hierarchy.Length))
                          {
                              tag_hierarchy[depth] = currenttag;
                          }
                          depth++;
                          if (div_matchflag)
                          {
                              // emit only at the level of the matched div, not in nested divs
                              div_matchflag_deep = depth <= div_depth;
                          }
                      }
                  }
                  else if (currenttag == div_tagid)
                  {
                      // closing div
                      depth--;
                      if ((depth < div_depth) && div_matchflag)
                      {
                          // the matched div itself was closed
                          div_matchflag = false;
                          div_matchflag_deep = false;
                      }
                      if (div_matchflag)
                      {
                          div_matchflag_deep = depth <= div_depth;
                      }
                  }

                  // a closing tag ends the hidden / sticky section it belongs to
                  if ((currenthiddentag == currenttag) && (!openingtag) && tagmode)
                  {
                      currenthiddentag = -1;
                  }
                  if ((currentstickytag == currenttag) && (!openingtag) && tagmode)
                  {
                      currentstickytag = -1;
                  }
                  tagmode = false;
                  openingtag = false;
                  calltagprocessor = false;
                  wp.initialize();
                  break;

              default:
                  if (tagmode)
                  {
                      if (calltagprocessor)
                      {
                          if (!tp.AddLetter(letter))
                          {
                              // the tag name ended inside the tag; attribute text may follow
                              tp.GetTag(out currenttag, out ishidden, out issticky, out isdivision);
                              div_tagname = "";
                              div_reading = 0;
                              if (ishidden && (currenthiddentag == -1) && openingtag)
                              {
                                  currenthiddentag = currenttag;
                              }
                              if (issticky && (currentstickytag == -1) && openingtag)
                              {
                                  currentstickytag = currenttag;
                                  currentdivision++;
                              }
                              if (isdivision)
                              {
                                  currentdivision++;
                              }
                              calltagprocessor = false;
                          }
                      }
                      else if ((currenttag == div_tagid) && (!div_matchflag))
                      {
                          // inside a div tag, after its name: collect the first attribute
                          if (letter == ' ')
                          {
                              if (div_reading == 1)
                              {
                                  // the first attribute is complete: compare it with the filters
                                  if (divfilters.Contains(div_tagname) && (depth < tag_hierarchy.Length))
                                  {
                                      div_matchflag = true;
                                      div_matchflag_deep = true;
                                      // depth is incremented when '>' is reached, so the
                                      // matched level is one above the current value
                                      div_depth = depth + 1;
                                  }
                                  div_reading = 2;
                              }
                          }
                          else
                          {
                              if (div_reading == 0)
                              {
                                  div_reading = 1;
                              }
                              if (div_reading == 1)
                              {
                                  div_tagname += letter;
                              }
                          }
                      }
                  }
                  else
                  {
                      // plain text; ignored while a hidden tag is open
                      if (currenthiddentag == -1)
                      {
                          bool AddedLetter = wp.AddLetter(letter);
                          if (!AddedLetter)
                          {
                              // the word has finished; emit it only inside a wanted region
                              bool stopAfterWord = !wp.ValidAsciiOrInteger();
                              if ((currentstickytag != -1) || div_matchflag_deep)
                              {
                                  AddWords(stopAfterWord, currentdivision, currentstickytag, currenthiddentag);
                              }
                              wp.initialize();
                          }
                      }
                  }
                  break;
          }

          // remember the character for the look-behind checks above
          bp.AddLetter(letter);
      }
  }
  // Pulls every segment out of the wiki processor, stores the non-empty ones
  // in the output object and records how many replay characters (if any) the
  // wiki processor placed in extrawords.
  public void AddWikiSegments()
  {
      char[] segmentChars = new char[0];
      int segmentOffset = 0;
      int segmentLength = 0;
      int segmentType = 0;
      int replayLength = 0;
      bool replayRequested = false;

      // nothing is pending unless a segment asks for a replay below
      extraword_counter = 0;

      for (int index = 0;
           wikip.GetSegment(index, ref segmentChars, ref segmentOffset, ref segmentLength, ref segmentType, ref extrawords, ref replayLength, ref replayRequested);
           index++)
      {
          if (segmentLength > 0)
          {
              content.AddWikiConstruct(segmentChars, segmentOffset, segmentLength, segmentType);
          }
          if (replayRequested)
          {
              extraword_counter = replayLength;
          }
      }
  }
  // Emits the word(s) currently held by the word processor to the output object.
  //
  //   stopword         - true when the caller already classified the text as a stop word
  //   currentdivision  - division number stored with each word
  //   currentstickytag - id of the open sticky tag, -1 if none (decides the sticky flag)
  //   currenthiddentag - id of the open hidden tag, -1 if none; nothing is emitted while one is open
  //
  // A word is also treated as a stop word when its hash is in the exclusion set
  // or when it is flagged as an integer without a decoded value. Consecutive
  // stop words are collapsed: only the first of a run is recorded, with length 0.
  public void AddWords(bool stopword, int currentdivision, int currentstickytag, int currenthiddentag)
  {
      if (currenthiddentag != -1) { return; }

      // the sticky flag does not change between words, so compute it once
      bool stickiness = currentstickytag != -1;

      char[] chararray = new char[0];
      int offset = 0;
      int length = 0;
      int hcounter = 0;
      bool isInt = false;
      int decodedInt = -1;
      while (wp.GetStemmedCharArrays(ignoreintegers, ref hcounter, ref chararray, ref offset, ref length, ref isInt, ref decodedInt))
      {
          // Word hash used for the exclusion lookup. The arithmetic is meant to
          // wrap around; unchecked keeps that true (and the hashes unchanged)
          // even when the assembly is compiled with overflow checking.
          int hash = 7;
          unchecked
          {
              for (int k = offset; k < offset + length; k++)
              {
                  hash = (hash * 17) + (chararray[k] | (chararray[k] << 0x10));
              }
          }

          if (stopword || exclusions.Contains(hash) || (isInt && decodedInt == -1))
          {
              // stop word: only record it if the previous word was not a stop word
              if (!lastwordstopstatus)
              {
                  content.AddWord(chararray, 0, stickiness, currentdivision, true, isInt, decodedInt);
                  lastwordstopstatus = true;
              }
          }
          else
          {
              content.AddWord(chararray, length, stickiness, currentdivision, false, isInt, decodedInt);
              lastwordstopstatus = false;
          }
      }
  }
  654. }
  655. }