PageRenderTime 44ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/java/main/org/hedera/io/input/WikiRevisionPageInputFormat.java

https://github.com/giangbinhtran/Hedera
Java | 430 lines | 349 code | 37 blank | 44 comment | 240 complexity | 09d77394279cd9398c0d04a84bb0c907 MD5 | raw file
  1. package org.hedera.io.input;
  2. import java.io.IOException;
  3. import org.apache.hadoop.fs.FSDataInputStream;
  4. import org.apache.hadoop.io.DataOutputBuffer;
  5. import org.apache.hadoop.io.LongWritable;
  6. import org.apache.hadoop.io.compress.CompressionInputStream;
  7. import org.apache.hadoop.mapreduce.InputSplit;
  8. import org.apache.hadoop.mapreduce.RecordReader;
  9. import org.apache.hadoop.mapreduce.TaskAttemptContext;
  10. import org.hedera.io.Revision;
  11. public class WikiRevisionPageInputFormat extends
  12. WikiRevisionInputFormat<LongWritable, Revision> {
  13. @Override
  14. public RecordReader<LongWritable, Revision> createRecordReader(InputSplit split,
  15. TaskAttemptContext context) {
  16. return new RevisionReader();
  17. }
  18. /**
  * Reads each revision of a Wikipedia page and transforms it into a {@link Revision} object.
  20. * @author tuan
  21. *
  22. */
  23. public static class RevisionReader extends WikiRevisionReader<Revision> {
  // Extra matcher state, needed in the two parser states where the next tag
  // can be one of two candidates and both are matched in parallel.
  //
  // revOrPage: which candidate(s) the bytes seen so far agree with while in
  // [flag = 18] (just after a closing </revision>):
  // -1 - Unmatched
  // 1 - Matched <revision> tag partially
  // 2 - Matched </page> tag partially
  // 3 - Matched both <revision> and </page> partially
  private int revOrPage = -1;
  // parOrTs: same idea while in [flag = 11] (just after the revision's <id>):
  // -1 - Unmatched
  // 1 - Matched <parentId> tag partially
  // 2 - Matched <timestamp> tag partially
  // 3 - Matched both <parentId> and <timestamp> partially
  private int parOrTs = -1;
  // Page-header data is cached in the following buffers instead of a single
  // pageHeader buffer (the former pageHeader field is no longer declared).
  // NOTE: they are all cleared together in resetEverything().
  // pageTitle: raw bytes read while inside <title>, including the closing tag
  private DataOutputBuffer pageTitle = new DataOutputBuffer();
  // nsBuf: raw bytes read while inside <ns>, including the closing tag
  private DataOutputBuffer nsBuf = new DataOutputBuffer();
  //////////////////////////////////////////////////////////////
  // END PageHeader variables
  //////////////////////////////////////////////////////////////
  // revBuf: raw bytes of the revision's own <id> block, including the closing tag
  private DataOutputBuffer revBuf = new DataOutputBuffer();
  // timestampBuf: raw bytes of the <timestamp> block, including the closing tag
  private DataOutputBuffer timestampBuf = new DataOutputBuffer();
  // parBuf: raw bytes of the <parentId> block, including the closing tag
  private DataOutputBuffer parBuf = new DataOutputBuffer();
  // contentBuf: raw bytes of the <text> block, including the closing tag
  private DataOutputBuffer contentBuf = new DataOutputBuffer();
  50. @Override
  51. public void initialize(InputSplit input, TaskAttemptContext tac)
  52. throws IOException, InterruptedException {
  53. super.initialize(input, tac);
  54. value = new Revision();
  55. }
  56. private void resetEverything() {
  57. keyBuf.reset();
  58. pageTitle.reset();
  59. value.clear();
  60. contentBuf.reset();
  61. parBuf.reset();
  62. timestampBuf.reset();
  63. revBuf.reset();
  64. nsBuf.reset();
  65. pageTitle.reset();
  66. skipped = false;
  67. revOrPage = -1;
  68. parOrTs = -1;
  69. }
  70. @Override
  71. public STATE doWhenMatch() throws IOException, InterruptedException {
  72. if (flag == 19) {
  73. resetEverything();
  74. }
  75. else if (flag == 18) {
  76. if (!skipped)
  77. return STATE.STOP_TRUE;
  78. }
  79. else if (flag == 17) {
  80. if (!skipped) {
  81. value.loadText(contentBuf.getData(), 0, contentBuf.getLength()
  82. - END_TEXT.length);
  83. }
  84. // reset big chunk of data right away to save memory
  85. contentBuf.reset();
  86. }
  87. else if (flag == 15) {
  88. if (!skipped) {
  89. String parIdStr = new String(parBuf.getData(), 0, parBuf.getLength()
  90. - END_PARENT_ID.length);
  91. long parId = Long.parseLong(parIdStr);
  92. value.setParentId(parId);
  93. }
  94. parBuf.reset();
  95. }
  96. else if (flag == 13) {
  97. if (!skipped) {
  98. String ts = new String(timestampBuf.getData(), 0, timestampBuf.getLength()
  99. - END_TIMESTAMP.length);
  100. long timestamp = TIME_FORMAT.parseMillis(ts);
  101. value.setTimestamp(timestamp);
  102. }
  103. timestampBuf.reset();
  104. }
  105. else if (flag == 11) {
  106. if (!skipped) {
  107. String idStr = new String(revBuf.getData(), 0, revBuf.getLength()
  108. - END_ID.length);
  109. long revId = Long.parseLong(idStr);
  110. value.setRevisionId(revId);
  111. }
  112. revBuf.reset();
  113. }
  114. else if (flag == 8) {
  115. if (!skipped) {
  116. String idStr = new String(keyBuf.getData(), 0, keyBuf.getLength()
  117. - END_ID.length);
  118. long pageId = Long.parseLong(idStr);
  119. key.set(pageId);
  120. value.setPageId(pageId);
  121. }
  122. keyBuf.reset();
  123. }
  124. else if (flag == 6) {
  125. String nsStr = new String(nsBuf.getData(), 0, nsBuf.getLength()
  126. - END_NAMESPACE.length);
  127. int namespace = Integer.parseInt(nsStr);
  128. if (namespace == 0) {
  129. skipped = skipNonArticles;
  130. }
  131. value.setNamespace(namespace);
  132. }
  133. else if (flag == 4) {
  134. String title = new String(pageTitle.getData(), 0, pageTitle.getLength()
  135. - END_TITLE.length);
  136. value.setPageTitle(title);
  137. pageTitle.reset();
  138. }
  139. else if (flag == -1) {
  140. return STATE.STOP_FALSE;
  141. }
  142. return STATE.CONTINUE;
  143. }
  144. // Scan the tags in SAX manner. Return at every legit tag and inform the program via
  145. // the global flag. Flush into the caches if necessary
  146. @Override
  147. protected boolean readUntilMatch() throws IOException {
  148. if (buf == null && pos.length != 2)
  149. throw new IOException("Internal buffer corrupted.");
  150. int i = 0;
  151. while (true) {
  152. if (pos[0] == pos[1]) {
  153. pos[1] = (compressed) ? ((CompressionInputStream)fsin).read(buf) :
  154. ((FSDataInputStream)fsin).read(buf);
  155. pos[0] = 0;
  156. if (pos[1] == -1) {
  157. return false;
  158. }
  159. }
  160. while (pos[0] < pos[1]) {
  161. byte b = buf[pos[0]];
  162. pos[0]++;
  163. // ignore every character until reaching a new page
  164. if (flag == 1 || flag == 19) {
  165. if (b == START_PAGE[i]) {
  166. i++;
  167. if (i >= START_PAGE.length) {
  168. flag = 2;
  169. return true;
  170. }
  171. } else i = 0;
  172. }
  173. else if (flag == 2) {
  174. if (b == START_TITLE[i]) {
  175. i++;
  176. } else i = 0;
  177. if (i >= START_TITLE.length) {
  178. flag = 3;
  179. return true;
  180. }
  181. }
  182. // put everything between <title></title> block into title
  183. else if (flag == 3) {
  184. if (b == END_TITLE[i]) {
  185. i++;
  186. } else i = 0;
  187. pageTitle.write(b);
  188. if (i >= END_TITLE.length) {
  189. flag = 4;
  190. return true;
  191. }
  192. }
  193. else if (flag == 4) {
  194. if (b == START_NAMESPACE[i]) {
  195. i++;
  196. } else i = 0;
  197. if (i >= START_NAMESPACE.length) {
  198. flag = 5;
  199. return true;
  200. }
  201. }
  202. // everything within <ns></ns> block goes into nsBuf
  203. else if (flag == 5) {
  204. if (b == END_NAMESPACE[i]) {
  205. i++;
  206. } else i = 0;
  207. nsBuf.write(b);
  208. if (i >= END_NAMESPACE.length) {
  209. flag = 6;
  210. return true;
  211. }
  212. }
  213. // when passing the namespace and we realize that
  214. // this is not an article, and that the option of skipping
  215. // non-article pages is on, we simply skip everything till
  216. // the closing </page>
  217. else if (skipped && flag >= 6 && flag < 19) {
  218. if (b == END_PAGE[i]) {
  219. i++;
  220. } else i = 0;
  221. if (i >= END_PAGE.length) {
  222. flag = 19;
  223. return true;
  224. }
  225. }
  226. else if (flag == 6) {
  227. if (b == START_ID[i]) {
  228. i++;
  229. } else i = 0;
  230. if (i >= START_ID.length) {
  231. flag = 7;
  232. return true;
  233. }
  234. }
  235. // put everything in outer <id></id> block into keyBuf
  236. else if (flag == 7) {
  237. if (b == END_ID[i]) {
  238. i++;
  239. } else i = 0;
  240. keyBuf.write(b);
  241. if (i >= END_ID.length) {
  242. flag = 8;
  243. return true;
  244. }
  245. }
  246. else if (flag == 8) {
  247. if (b == START_REVISION[i]) {
  248. i++;
  249. } else i = 0;
  250. if (i >= START_REVISION.length) {
  251. flag = 9;
  252. return true;
  253. }
  254. }
  255. // inside <revision></revision> block, first check for id
  256. else if (flag == 9) {
  257. if (b == START_ID[i]) {
  258. i++;
  259. } else i = 0;
  260. if (i >= START_ID.length) {
  261. flag = 10;
  262. return true;
  263. }
  264. }
  265. // everything inside the inner <id></id> block goes to revision buffer
  266. else if (flag == 10) {
  267. if (b == END_ID[i]) {
  268. i++;
  269. } else i = 0;
  270. revBuf.write(b);
  271. if (i >= END_ID.length) {
  272. flag = 11;
  273. return true;
  274. }
  275. }
  276. // after the inner <id>, check for either <timestamp> or <parentId>
  277. else if (flag == 11) {
  278. int curMatch = 0;
  279. if ((i < START_PARENT_ID.length && b == START_PARENT_ID[i])
  280. && (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i])) {
  281. curMatch = 3;
  282. } else if (i < START_PARENT_ID.length && b == START_PARENT_ID[i]) {
  283. curMatch = 1;
  284. } else if (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i]) {
  285. curMatch = 2;
  286. }
  287. if (curMatch > 0 && (i == 0 || parOrTs == 3 || curMatch == parOrTs)) {
  288. i++;
  289. parOrTs = curMatch;
  290. } else i = 0;
  291. if ((parOrTs == 2 || parOrTs == 3) && i >= START_TIMESTAMP.length) {
  292. flag = 12;
  293. parOrTs = -1;
  294. return true;
  295. } else if ((parOrTs == 1 || parOrTs == 3) && i >= START_PARENT_ID.length) {
  296. flag = 14;
  297. parOrTs = -1;
  298. return true;
  299. }
  300. }
  301. // inside <timestamp></timestamp> block everything goes to timestamp buffer
  302. else if (flag == 12) {
  303. if (b == END_TIMESTAMP[i]) {
  304. i++;
  305. } else i = 0;
  306. timestampBuf.write(b);
  307. if (i >= END_TIMESTAMP.length) {
  308. flag = 13;
  309. return true;
  310. }
  311. }
  312. // inside <parentId></parentId> block everything goes to parentId buffer
  313. else if (flag == 14) {
  314. if (b == END_PARENT_ID[i]) {
  315. i++;
  316. } else i = 0;
  317. parBuf.write(b);
  318. if (i >= END_PARENT_ID.length) {
  319. flag = 15;
  320. return true;
  321. }
  322. }
  323. // after the </parentId>, search for <timestamp>
  324. else if (flag == 15) {
  325. if (b == START_TIMESTAMP[i]) {
  326. i++;
  327. } else i = 0;
  328. if (i >= START_TIMESTAMP.length) {
  329. flag = 12;
  330. return true;
  331. }
  332. }
  333. // after the </timestamp>, check for <text>
  334. else if (flag == 13) {
  335. if (b == START_TEXT[i]) {
  336. i++;
  337. } else i = 0;
  338. if (i >= START_TEXT.length) {
  339. flag = 16;
  340. return true;
  341. }
  342. }
  343. // inside <text></text> block everything goes to content buffer
  344. else if (flag == 16) {
  345. if (b == END_TEXT[i]) {
  346. i++;
  347. } else i = 0;
  348. contentBuf.write(b);
  349. if (i >= END_TEXT.length) {
  350. flag = 17;
  351. return true;
  352. }
  353. }
  354. // look for the closing </revision>
  355. else if (flag == 17) {
  356. if (b == END_REVISION[i]) {
  357. i++;
  358. } else i = 0;
  359. if (i >= END_REVISION.length) {
  360. flag = 18;
  361. return true;
  362. }
  363. }
  364. // Flag 16 can be the signal of a new record inside one old page
  365. else if (flag == 18) {
  366. int curMatch = 0;
  367. if ((i < END_PAGE.length && b == END_PAGE[i])
  368. && (i < START_REVISION.length && b == START_REVISION[i])) {
  369. curMatch = 3;
  370. } else if (i < END_PAGE.length && b == END_PAGE[i]) {
  371. curMatch = 2;
  372. } else if (i < START_REVISION.length && b == START_REVISION[i]) {
  373. curMatch = 1;
  374. }
  375. if (curMatch > 0 && (i == 0 || revOrPage == 3 || curMatch == revOrPage)) {
  376. i++;
  377. revOrPage = curMatch;
  378. } else i = 0;
  379. if ((revOrPage == 2 || revOrPage == 3) && i >= END_PAGE.length) {
  380. flag = 19;
  381. revOrPage = -1;
  382. return true;
  383. } else if ((revOrPage == 1 || revOrPage == 3) && i >= START_REVISION.length) {
  384. flag = 9;
  385. revOrPage = -1;
  386. return true;
  387. }
  388. }
  389. }
  390. }
  391. }
  392. }
  393. }