PageRenderTime 27ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/java/main/org/hedera/io/input/WikiRevisionPairInputFormat.java

https://github.com/giangbinhtran/Hedera
Java | 293 lines | 175 code | 18 blank | 100 comment | 102 complexity | b9445ac7f8bdf7113d22e8abec9cc7ee MD5 | raw file
  1. package org.hedera.io.input;
  2. import java.io.IOException;
  3. import java.nio.charset.StandardCharsets;
  4. import org.apache.hadoop.fs.FSDataInputStream;
  5. import org.apache.hadoop.io.DataOutputBuffer;
  6. import org.apache.hadoop.io.LongWritable;
  7. import org.apache.hadoop.io.Text;
  8. import org.apache.hadoop.io.compress.CompressionInputStream;
  9. import org.apache.hadoop.mapreduce.InputSplit;
  10. import org.apache.hadoop.mapreduce.RecordReader;
  11. import org.apache.hadoop.mapreduce.TaskAttemptContext;
  12. public class WikiRevisionPairInputFormat
  13. extends WikiRevisionInputFormat<LongWritable, Text> {
  14. @Override
  15. public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
  16. TaskAttemptContext context) {
  17. return new RevisionReader();
  18. }
  19. /** Reads a meta-history XML file and outputs every pair of consecutive revisions as a record.
  20. * For example, given the following input containing two pages and four revisions,
  21. * <pre><code>
  22. * &lt;page&gt;
  23. * &lt;title&gt;ABC&lt;/title&gt;
  24. * &lt;id&gt;123&lt;/id&gt;
  25. * &lt;revision&gt;
  26. * &lt;id&gt;100&lt;/id&gt;
  27. * ....
  28. * &lt;/revision&gt;
  29. * &lt;revision&gt;
  30. * &lt;id&gt;200&lt;/id&gt;
  31. * ....
  32. * &lt;/revision&gt;
  33. * &lt;revision&gt;
  34. * &lt;id&gt;300&lt;/id&gt;
  35. * ....
  36. * &lt;/revision&gt;
  37. * &lt;/page&gt;
  38. * &lt;page&gt;
  39. * &lt;title&gt;DEF&lt;/title&gt;
  40. * &lt;id&gt;456&lt;/id&gt;
  41. * &lt;revision&gt;
  42. * &lt;id&gt;400&lt;/id&gt;
  43. * ....
  44. * &lt;/revision&gt;
  45. * &lt;/page&gt;
  46. * </code></pre>
  47. * it will produce four records, whose values look like this:
  48. * <pre><code>
  49. * &lt;page&gt;
  50. * &lt;title&gt;ABC&lt;/title&gt;
  51. * &lt;id&gt;123&lt;/id&gt;
  52. * &lt;revision beginningofpage="true"&gt;&lt;text xml:space="preserve"&gt;
  53. * &lt;/text&gt;&lt;/revision&gt;&lt;revision&gt;
  54. * &lt;id&gt;100&lt;/id&gt;
  55. * ....
  56. * &lt;/revision&gt;
  57. * &lt;/page&gt;
  58. * </code></pre>
  59. * <pre><code>
  60. * &lt;page&gt;
  61. * &lt;title&gt;ABC&lt;/title&gt;
  62. * &lt;id&gt;123&lt;/id&gt;
  63. * &lt;revision&gt;
  64. * &lt;id&gt;100&lt;/id&gt;
  65. * ....
  66. * &lt;/revision&gt;
  67. * &lt;revision&gt;
  68. * &lt;id&gt;200&lt;/id&gt;
  69. * ....
  70. * &lt;/revision&gt;
  71. * &lt;/page&gt;
  72. * </code></pre>
  73. * <pre><code>
  74. * &lt;page&gt;
  75. * &lt;title&gt;ABC&lt;/title&gt;
  76. * &lt;id&gt;123&lt;/id&gt;
  77. * &lt;revision&gt;
  78. * &lt;id&gt;200&lt;/id&gt;
  79. * ....
  80. * &lt;/revision&gt;
  81. * &lt;revision&gt;
  82. * &lt;id&gt;300&lt;/id&gt;
  83. * ....
  84. * &lt;/revision&gt;
  85. * &lt;/page&gt;
  86. * </code></pre>
  87. * <pre><code>
  88. * &lt;page&gt;
  89. * &lt;title&gt;DEF&lt;/title&gt;
  90. * &lt;id&gt;456&lt;/id&gt;
  91. * &lt;revision beginningofpage="true"&gt;&lt;text xml:space="preserve"&gt;
  92. * &lt;/text&gt;&lt;/revision&gt;&lt;revision&gt;
  93. * &lt;id&gt;400&lt;/id&gt;
  94. * ....
  95. * &lt;/revision&gt;
  96. * &lt;/page&gt;
  97. * </code></pre>
  98. */
  99. // State of the flag:
  100. // 1 - beginning of the first <page> tag
  101. // 2 - just passed the <page> tag but outside the <id> tag
  102. // 3 - just passed the <id>
  103. // 4 - just passed the </id> but outside <revision>
  104. // 5 - just passed the <revision>
  105. // 6 - just passed the </revision>
  106. // 7 - just passed the </page>
  107. public static class RevisionReader extends WikiRevisionReader<Text> {
  108. private static final byte[] DUMMY_REV = ("<revision beginningofpage=\"true\">"
  109. + "<timestamp>1970-01-01T00:00:00Z</timestamp><text xml:space=\"preserve\">"
  110. + "</text></revision>\n")
  111. .getBytes(StandardCharsets.UTF_8);
  112. // indicating how many <revision> tags have been met, reset after every page end
  113. private int revisionVisited;
  114. // indicating the flow conditifion within [flag = 6]
  115. // -1 - Unmatched
  116. // 1 - Matched <revision> tag partially
  117. // 2 - Matched </page> tag partially
  118. // 3 - Matched both <revision> and </page> partially
  119. private int lastMatchTag = -1;
  120. private DataOutputBuffer pageHeader = new DataOutputBuffer();
  121. private DataOutputBuffer keyBuf = new DataOutputBuffer();
  122. private DataOutputBuffer rev1Buf = new DataOutputBuffer();
  123. private DataOutputBuffer rev2Buf = new DataOutputBuffer();
  124. @Override
  125. public void initialize(InputSplit input, TaskAttemptContext tac)
  126. throws IOException, InterruptedException {
  127. super.initialize(input, tac);
  128. revisionVisited = 0;
  129. value = new Text();
  130. }
  131. @Override
  132. public STATE doWhenMatch() throws IOException, InterruptedException {
  133. if (flag == 7) {
  134. pageHeader.reset();
  135. rev1Buf.reset();
  136. rev2Buf.reset();
  137. value.clear();
  138. revisionVisited = 0;
  139. }
  140. else if (flag == 6) {
  141. value.set(pageHeader.getData(), 0, pageHeader.getLength()
  142. - START_REVISION.length);
  143. value.append(rev1Buf.getData(), 0, rev1Buf.getLength());
  144. value.append(rev2Buf.getData(), 0, rev2Buf.getLength());
  145. value.append(END_PAGE, 0, END_PAGE.length);
  146. return STATE.STOP_TRUE;
  147. }
  148. else if (flag == 4) {
  149. String pageId = new String(keyBuf.getData(), 0, keyBuf.getLength()
  150. - END_ID.length);
  151. key.set(Long.parseLong(pageId));
  152. keyBuf.reset();
  153. }
  154. else if (flag == 2) {
  155. pageHeader.write(START_PAGE);
  156. }
  157. else if (flag == 5) {
  158. rev1Buf.reset();
  159. if (revisionVisited == 0) {
  160. rev1Buf.write(DUMMY_REV);
  161. } else {
  162. rev1Buf.write(rev2Buf.getData());
  163. }
  164. rev2Buf.reset();
  165. rev2Buf.write(START_REVISION);
  166. }
  167. else if (flag == -1) {
  168. pageHeader.reset();
  169. return STATE.STOP_FALSE;
  170. }
  171. return STATE.CONTINUE;
  172. }
  173. @Override
  174. protected boolean readUntilMatch() throws IOException {
  175. if (buf == null && pos.length != 2)
  176. throw new IOException("Internal buffer corrupted.");
  177. int i = 0;
  178. while (true) {
  179. if (pos[0] == pos[1]) {
  180. pos[1] = (compressed) ? ((CompressionInputStream)fsin).read(buf) :
  181. ((FSDataInputStream)fsin).read(buf);
  182. pos[0] = 0;
  183. if (pos[1] == -1) {
  184. return false;
  185. }
  186. }
  187. while (pos[0] < pos[1]) {
  188. byte b = buf[pos[0]];
  189. pos[0]++;
  190. // ignore every character until reaching a new page
  191. if (flag == 1 || flag == 7) {
  192. if (b == START_PAGE[i]) {
  193. i++;
  194. if (i >= START_PAGE.length) {
  195. flag = 2;
  196. return true;
  197. }
  198. } else i = 0;
  199. }
  200. // put everything between <page> tag and the first <id> tag into pageHeader
  201. else if (flag == 2) {
  202. if (b == START_ID[i]) {
  203. i++;
  204. } else i = 0;
  205. pageHeader.write(b);
  206. if (i >= START_ID.length) {
  207. flag = 3;
  208. return true;
  209. }
  210. }
  211. // put everything in <id></id> block into pageHeader and keyBuf
  212. else if (flag == 3) {
  213. if (b == END_ID[i]) {
  214. i++;
  215. } else i = 0;
  216. pageHeader.write(b);
  217. keyBuf.write(b);
  218. if (i >= END_ID.length) {
  219. flag = 4;
  220. return true;
  221. }
  222. }
  223. // put everything between </id> tag and the first <revision> tag into pageHeader
  224. else if (flag == 4) {
  225. if (b == START_REVISION[i]) {
  226. i++;
  227. } else i = 0;
  228. pageHeader.write(b);
  229. if (i >= START_REVISION.length) {
  230. flag = 5;
  231. return true;
  232. }
  233. }
  234. // inside <revision></revision> block
  235. else if (flag == 5) {
  236. if (b == END_REVISION[i]) {
  237. i++;
  238. } else i = 0;
  239. rev2Buf.write(b);
  240. if (i >= END_REVISION.length) {
  241. flag = 6;
  242. revisionVisited++;
  243. return true;
  244. }
  245. }
  246. // Note that flag 4 can be the signal of a new record inside one old page
  247. else if (flag == 6) {
  248. int curMatch = 0;
  249. if ((i < END_PAGE.length && b == END_PAGE[i])
  250. && (i < START_REVISION.length && b == START_REVISION[i])) {
  251. curMatch = 3;
  252. } else if (i < END_PAGE.length && b == END_PAGE[i]) {
  253. curMatch = 2;
  254. } else if (i < START_REVISION.length && b == START_REVISION[i]) {
  255. curMatch = 1;
  256. }
  257. if (curMatch > 0 && (i == 0 || lastMatchTag == 3 || curMatch == lastMatchTag)) {
  258. i++;
  259. lastMatchTag = curMatch;
  260. } else i = 0;
  261. if ((lastMatchTag == 2 || lastMatchTag == 3) && i >= END_PAGE.length) {
  262. flag = 7;
  263. lastMatchTag = -1;
  264. return true;
  265. } else if ((lastMatchTag == 1 || lastMatchTag == 3) && i >= START_REVISION.length) {
  266. flag = 5;
  267. lastMatchTag = -1;
  268. return true;
  269. }
  270. }
  271. }
  272. }
  273. }
  274. }
  275. }