PageRenderTime 58ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/java/main/org/hedera/io/input/WikiRevisionDiffInputFormat.java

https://github.com/giangbinhtran/Hedera
Java | 513 lines | 392 code | 48 blank | 73 comment | 250 complexity | ea8a8861ba24a3da30f63afa5dc10251 MD5 | raw file
  1. package org.hedera.io.input;
  2. import java.io.IOException;
  3. import java.io.UnsupportedEncodingException;
  4. import java.util.LinkedList;
  5. import java.util.List;
  6. import org.apache.hadoop.fs.FSDataInputStream;
  7. import org.apache.hadoop.io.DataOutputBuffer;
  8. import org.apache.hadoop.io.LongWritable;
  9. import org.apache.hadoop.io.compress.CompressionInputStream;
  10. import org.apache.hadoop.mapreduce.InputSplit;
  11. import org.apache.hadoop.mapreduce.RecordReader;
  12. import org.apache.hadoop.mapreduce.TaskAttemptContext;
  13. import org.hedera.io.RevisionDiffOld;
  14. import difflib.Delta;
  15. import difflib.DiffUtils;
  16. import difflib.Patch;
  17. public class WikiRevisionDiffInputFormat
  18. extends WikiRevisionInputFormat<LongWritable, RevisionDiffOld> {
  19. @Override
  20. public RecordReader<LongWritable, RevisionDiffOld> createRecordReader(
  21. InputSplit input, TaskAttemptContext context) throws IOException,
  22. InterruptedException {
  23. return new DiffReader();
  24. }
  25. /**
  26. * Reads every pair of consecutive revisions and calculates their diff
  27. * using Myers' algorithm. Returns a RevisionDiffOld which, among other fields,
  28. * carries the list of diffs between the two texts
  29. *
  30. * @author tuan
  31. */
  32. // States of the flag:
  33. //
  34. // -1: EOF
  35. // 1 - outside the <page> tag
  36. // 2 - just passed the <page> tag but outside the <title>
  37. // 3 - just passed the <title> tag
  38. // 4 - just passed the </title> tag but outside the <namespace>
  39. // 5 - just passed the <namespace>
  40. // 6 - just passed the </namespace> but outside the <id>
  41. // 7 - just passed the (page's) <id>
  42. // 8 - just passed the </id> tag but outside the <revision>
  43. // 9 - just passed the (next) <revision>
  44. // 10 - just passed the inner <id> tag inside <revision>
  45. // 11 - just passed the inner </id> tag inside <revision>
  46. // 12 - just passed the <timestamp>
  47. // 13 - just passed the </timestamp> tag
  48. // 14 - just passed the <parentId>
  49. // 15 - just passed the </parentId> tag
  50. // 16 - just passed the <text> tag
  51. // 17 - just passed the </text> tag
  52. // 18 - just passed the </revision>
  53. // 19 - just passed the </page>
  54. public static class DiffReader extends WikiRevisionReader<RevisionDiffOld> {
  55. // Extra flags:
  56. //
  57. // indicating the flow condition within [flag = 16]
  58. // -1 - Unmatched
  59. // 1 - Matched <revision> tag partially
  60. // 2 - Matched </page> tag partially
  61. // 3 - Matched both <revision> and </page> partially
  62. private int revOrPage = -1;
  63. // indicating the flow condition within [flag = 9]
  64. // -1 - Unmatched
  65. // 1 - Matched <parentId> tag partially
  66. // 2 - Matched <timestamp> tag partially
  67. // 3 - Matched both <parentId> and <timestamp> partially
  68. private int parOrTs = -1;
  69. // We now convert and cache everything from pageHeader to the followin global variables
  70. // NOTE: they all need to be synchronized with pageHeader !!
  71. // private DataOutputBuffer pageHeader = new DataOutputBuffer();
  72. private DataOutputBuffer pageTitle = new DataOutputBuffer();
  73. private DataOutputBuffer nsBuf = new DataOutputBuffer();
  74. //////////////////////////////////////////////////////////////
  75. // END PageHeader variables
  76. //////////////////////////////////////////////////////////////
  77. // buffer for handling consecutive revisions
  78. private DataOutputBuffer timestampBuf = new DataOutputBuffer();
  79. private DataOutputBuffer revIdBuf = new DataOutputBuffer();
  80. private DataOutputBuffer parBuf = new DataOutputBuffer();
  81. private List<String> lastRevText = new LinkedList<>();
  82. private DataOutputBuffer contentBuf = new DataOutputBuffer();
  83. //////////////////////////////////////////////////////////////
  84. // END revision buffer variables
  85. //////////////////////////////////////////////////////////////
  86. @Override
  87. public void initialize(InputSplit input, TaskAttemptContext tac)
  88. throws IOException, InterruptedException {
  89. super.initialize(input, tac);
  90. value = new RevisionDiffOld();
  91. }
  92. private void resetEverything() {
  93. revOrPage = -1;
  94. parOrTs = -1;
  95. nsBuf.reset();
  96. timestampBuf.reset();
  97. revIdBuf.reset();
  98. parBuf.reset();
  99. contentBuf.reset();
  100. keyBuf.reset();
  101. pageTitle.reset();
  102. value.clear();
  103. lastRevText.clear();
  104. skipped = false;
  105. }
  106. @Override
  107. protected STATE doWhenMatch() throws IOException, InterruptedException {
  108. if (flag == 19) {
  109. resetEverything();
  110. }
  111. // emit the object when reaching </revision>
  112. else if (flag == 18) {
  113. if (!skipped)
  114. return STATE.STOP_TRUE;
  115. }
  116. // calculating the diff and shift the revision text when seeing </text>
  117. // inside the <revision> block
  118. else if (flag == 17) {
  119. if (!skipped) {
  120. // create a mass number of strings
  121. List<String> content = extractParagraph(contentBuf.getData(), 0,
  122. contentBuf.getLength() - END_TEXT.length);
  123. Patch patch = DiffUtils.diff(lastRevText, content);
  124. for (Delta d : patch.getDeltas()) {
  125. value.add(d);
  126. }
  127. lastRevText = content;
  128. }
  129. // release big chunk of bytes here
  130. contentBuf.reset();
  131. }
  132. else if (flag == 15) {
  133. if (!skipped) {
  134. String parIdStr = new String(parBuf.getData(), 0, parBuf.getLength()
  135. - END_PARENT_ID.length);
  136. long parId = Long.parseLong(parIdStr);
  137. value.setParentId(parId);
  138. }
  139. parBuf.reset();
  140. }
  141. else if (flag == 13) {
  142. if (!skipped) {
  143. String ts = new String(timestampBuf.getData(), 0, timestampBuf.getLength()
  144. - END_TIMESTAMP.length);
  145. long timestamp = TIME_FORMAT.parseMillis(ts);
  146. value.setTimestamp(timestamp);
  147. }
  148. timestampBuf.reset();
  149. }
  150. else if (flag == 11) {
  151. if (!skipped) {
  152. String idStr = new String(revIdBuf.getData(), 0, revIdBuf.getLength()
  153. - END_ID.length);
  154. long revId = Long.parseLong(idStr);
  155. value.setRevisionId(revId);
  156. }
  157. revIdBuf.reset();
  158. }
  159. else if (flag == 8) {
  160. if (!skipped) {
  161. String idStr = new String(keyBuf.getData(), 0, keyBuf.getLength()
  162. - END_ID.length);
  163. long pageId = Long.parseLong(idStr);
  164. key.set(pageId);
  165. value.setPageId(pageId);
  166. }
  167. keyBuf.reset();
  168. }
  169. else if (flag == 6) {
  170. String nsStr = new String(nsBuf.getData(), 0, nsBuf.getLength()
  171. - END_NAMESPACE.length);
  172. int ns = Integer.parseInt(nsStr);
  173. if (ns == 0) {
  174. skipped = skipNonArticles;
  175. }
  176. value.setNamespace(ns);
  177. nsBuf.reset();
  178. }
  179. else if (flag == 4) {
  180. String title = new String(pageTitle.getData(), 0, pageTitle.getLength()
  181. - END_TITLE.length);
  182. value.setPageTitle(title);
  183. pageTitle.reset();
  184. }
  185. else if (flag == -1) {
  186. return STATE.STOP_FALSE;
  187. }
  188. return STATE.CONTINUE;
  189. }
  190. public static List<String> extractParagraph(byte[] b, int offset, int len)
  191. throws UnsupportedEncodingException {
  192. List<String> res = new LinkedList<>();
  193. if (b != null && b.length > 0) {
  194. int start = offset;
  195. int i = offset;
  196. while (i < len) {
  197. char c = (char) (((b[i] & 0xFF) << 8) + (b[i+1] & 0xFF));
  198. if (c == '\n') {
  199. String s = new String(b,start,i,"UTF-8");
  200. res.add(s);
  201. while (Character.isWhitespace(c)) {
  202. i += 2;
  203. c = (char) (((b[i] & 0xFF) << 8) + (b[i+1] & 0xFF));
  204. }
  205. start = i;
  206. }
  207. else {
  208. i += 2;
  209. }
  210. }
  211. if (start < i) {
  212. String s = new String(b,start,i,"UTF-8");
  213. res.add(s);
  214. }
  215. }
  216. return res;
  217. }
  218. @Override
  219. protected boolean readUntilMatch() throws IOException {
  220. if (buf == null && pos.length != 2)
  221. throw new IOException("Internal buffer corrupted.");
  222. int i = 0;
  223. while (true) {
  224. if (pos[0] == pos[1]) {
  225. pos[1] = (compressed) ? ((CompressionInputStream)fsin).read(buf) :
  226. ((FSDataInputStream)fsin).read(buf);
  227. pos[0] = 0;
  228. LOG.info(pos[1] + " bytes read from the stream...");
  229. if (pos[1] == -1) {
  230. return false;
  231. }
  232. }
  233. while (pos[0] < pos[1]) {
  234. byte b = buf[pos[0]];
  235. pos[0]++;
  236. // ignore every character until reaching a new page
  237. if (flag == 1 || flag == 19) {
  238. if (b == START_PAGE[i]) {
  239. i++;
  240. if (i >= START_PAGE.length) {
  241. flag = 2;
  242. return true;
  243. }
  244. } else i = 0;
  245. }
  246. else if (flag == 2) {
  247. if (b == START_TITLE[i]) {
  248. i++;
  249. } else i = 0;
  250. if (i >= START_TITLE.length) {
  251. flag = 3;
  252. return true;
  253. }
  254. }
  255. // put everything between <title></title> block into title
  256. else if (flag == 3) {
  257. if (b == END_TITLE[i]) {
  258. i++;
  259. } else i = 0;
  260. pageTitle.write(b);
  261. if (i >= END_TITLE.length) {
  262. flag = 4;
  263. return true;
  264. }
  265. }
  266. else if (flag == 4) {
  267. if (b == START_NAMESPACE[i]) {
  268. i++;
  269. } else i = 0;
  270. if (i >= START_NAMESPACE.length) {
  271. flag = 5;
  272. return true;
  273. }
  274. }
  275. else if (flag == 5) {
  276. if (b == END_NAMESPACE[i]) {
  277. i++;
  278. } else i = 0;
  279. nsBuf.write(b);
  280. if (i >= END_NAMESPACE.length) {
  281. flag = 6;
  282. return true;
  283. }
  284. }
  285. // when passing the namespace and we realize that
  286. // this is not an article, and that the option of skipping
  287. // non-article pages is on, we simply skip everything until
  288. // the closing </page>
  289. else if (skipped && flag >= 6 && flag < 19) {
  290. if (b == END_PAGE[i]) {
  291. i++;
  292. } else i = 0;
  293. if (i >= END_PAGE.length) {
  294. flag = 19;
  295. return true;
  296. }
  297. }
  298. else if (flag == 6) {
  299. if (b == START_ID[i]) {
  300. i++;
  301. } else i = 0;
  302. if (i >= START_ID.length) {
  303. flag = 7;
  304. return true;
  305. }
  306. }
  307. // put everything in outer <id></id> block into keyBuf
  308. else if (flag == 7) {
  309. if (b == END_ID[i]) {
  310. i++;
  311. } else i = 0;
  312. keyBuf.write(b);
  313. if (i >= END_ID.length) {
  314. flag = 8;
  315. return true;
  316. }
  317. }
  318. else if (flag == 8) {
  319. if (b == START_REVISION[i]) {
  320. i++;
  321. } else i = 0;
  322. if (i >= START_REVISION.length) {
  323. flag = 9;
  324. return true;
  325. }
  326. }
  327. // inside <revision></revision> block, first check for id
  328. else if (flag == 9) {
  329. if (b == START_ID[i]) {
  330. i++;
  331. } else i = 0;
  332. if (i >= START_ID.length) {
  333. flag = 10;
  334. return true;
  335. }
  336. }
  337. // everything inside the inner <id></id> block goes to revision buffer
  338. else if (flag == 10) {
  339. if (b == END_ID[i]) {
  340. i++;
  341. } else i = 0;
  342. revIdBuf.write(b);
  343. if (i >= END_ID.length) {
  344. flag = 11;
  345. return true;
  346. }
  347. }
  348. // after the inner <id>, check for either <timestamp> or <parentId>
  349. else if (flag == 11) {
  350. int curMatch = 0;
  351. if ((i < START_PARENT_ID.length && b == START_PARENT_ID[i])
  352. && (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i])) {
  353. curMatch = 3;
  354. } else if (i < START_PARENT_ID.length && b == START_PARENT_ID[i]) {
  355. curMatch = 1;
  356. } else if (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i]) {
  357. curMatch = 2;
  358. }
  359. if (curMatch > 0 && (i == 0 || parOrTs == 3 || curMatch == parOrTs)) {
  360. i++;
  361. parOrTs = curMatch;
  362. } else i = 0;
  363. if ((parOrTs == 2 || parOrTs == 3) && i >= START_TIMESTAMP.length) {
  364. flag = 12;
  365. parOrTs = -1;
  366. return true;
  367. } else if ((parOrTs == 1 || parOrTs == 3) && i >= START_PARENT_ID.length) {
  368. flag = 14;
  369. parOrTs = -1;
  370. return true;
  371. }
  372. }
  373. // inside <timestamp></timestamp> block everything goes to timestamp buffer
  374. else if (flag == 12) {
  375. if (b == END_TIMESTAMP[i]) {
  376. i++;
  377. } else i = 0;
  378. timestampBuf.write(b);
  379. if (i >= END_TIMESTAMP.length) {
  380. flag = 13;
  381. return true;
  382. }
  383. }
  384. // inside <parentId></parentId> block everything goes to parentId buffer
  385. else if (flag == 14) {
  386. if (b == END_PARENT_ID[i]) {
  387. i++;
  388. } else i = 0;
  389. parBuf.write(b);
  390. if (i >= END_PARENT_ID.length) {
  391. flag = 15;
  392. return true;
  393. }
  394. }
  395. // after the </parentId>, search for <timestamp>
  396. else if (flag == 15) {
  397. if (b == START_TIMESTAMP[i]) {
  398. i++;
  399. } else i = 0;
  400. if (i >= START_TIMESTAMP.length) {
  401. flag = 12;
  402. return true;
  403. }
  404. }
  405. // after the </timestamp>, check for <text>
  406. else if (flag == 13) {
  407. if (b == START_TEXT[i]) {
  408. i++;
  409. } else i = 0;
  410. if (i >= START_TEXT.length) {
  411. flag = 16;
  412. return true;
  413. }
  414. }
  415. // inside <text></text> block everything goes to content buffer
  416. else if (flag == 16) {
  417. if (b == END_TEXT[i]) {
  418. i++;
  419. } else i = 0;
  420. contentBuf.write(b);
  421. if (i >= END_TEXT.length) {
  422. flag = 17;
  423. return true;
  424. }
  425. }
  426. // look for the closing </revision>
  427. else if (flag == 17) {
  428. if (b == END_REVISION[i]) {
  429. i++;
  430. } else i = 0;
  431. if (i >= END_REVISION.length) {
  432. flag = 18;
  433. return true;
  434. }
  435. }
  436. // Flag 16 can be the signal of a new record inside one old page
  437. else if (flag == 18) {
  438. int curMatch = 0;
  439. if ((i < END_PAGE.length && b == END_PAGE[i])
  440. && (i < START_REVISION.length && b == START_REVISION[i])) {
  441. curMatch = 3;
  442. } else if (i < END_PAGE.length && b == END_PAGE[i]) {
  443. curMatch = 2;
  444. } else if (i < START_REVISION.length && b == START_REVISION[i]) {
  445. curMatch = 1;
  446. }
  447. if (curMatch > 0 && (i == 0 || revOrPage == 3 || curMatch == revOrPage)) {
  448. i++;
  449. revOrPage = curMatch;
  450. } else i = 0;
  451. if ((revOrPage == 2 || revOrPage == 3) && i >= END_PAGE.length) {
  452. flag = 19;
  453. revOrPage = -1;
  454. return true;
  455. } else if ((revOrPage == 1 || revOrPage == 3) && i >= START_REVISION.length) {
  456. flag = 9;
  457. revOrPage = -1;
  458. return true;
  459. }
  460. }
  461. }
  462. }
  463. }
  464. }
  465. }