PageRenderTime 46ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/java/main/org/hedera/io/input/WikiRevisionTextInputFormat.java

https://github.com/giangbinhtran/Hedera
Java | 274 lines | 160 code | 14 blank | 100 comment | 99 complexity | dec435243b2d56960f36e21a086f35d4 MD5 | raw file
  1. package org.hedera.io.input;
  import java.io.IOException;
  import java.nio.charset.StandardCharsets;

  import org.apache.hadoop.fs.FSDataInputStream;
  import org.apache.hadoop.io.DataOutputBuffer;
  import org.apache.hadoop.io.LongWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.io.compress.CompressionInputStream;
  import org.apache.hadoop.mapreduce.InputSplit;
  import org.apache.hadoop.mapreduce.RecordReader;
  import org.apache.hadoop.mapreduce.TaskAttemptContext;
  import org.apache.log4j.Logger;
  12. public class WikiRevisionTextInputFormat extends
  13. WikiRevisionInputFormat<LongWritable, Text> {
  14. @Override
  15. public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
  16. TaskAttemptContext context) {
  17. return new RevisionReader();
  18. }
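  19. /** Reads a meta-history XML file and outputs every pair of consecutive revisions as one record.
  20. * (NOTE(review): RevisionReader below builds each value from the page header plus a single revision buffer - confirm that the paired layout shown here is still accurate.) For example, given the following input containing two pages and four revisions,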
  21. * <pre><code>
  22. * &lt;page&gt;
  23. * &lt;title&gt;ABC&lt;/title&gt;
  24. * &lt;id&gt;123&lt;/id&gt;
  25. * &lt;revision&gt;
  26. * &lt;id&gt;100&lt;/id&gt;
  27. * ....
  28. * &lt;/revision&gt;
  29. * &lt;revision&gt;
  30. * &lt;id&gt;200&lt;/id&gt;
  31. * ....
  32. * &lt;/revision&gt;
  33. * &lt;revision&gt;
  34. * &lt;id&gt;300&lt;/id&gt;
  35. * ....
  36. * &lt;/revision&gt;
  37. * &lt;/page&gt;
  38. * &lt;page&gt;
  39. * &lt;title&gt;DEF&lt;/title&gt;
  40. * &lt;id&gt;456&lt;/id&gt;
  41. * &lt;revision&gt;
  42. * &lt;id&gt;400&lt;/id&gt;
  43. * ....
  44. * &lt;/revision&gt;
  45. * &lt;/page&gt;
  46. * </code></pre>
  47. * it will produce four records whose values look like this:
  48. * <pre><code>
  49. * &lt;page&gt;
  50. * &lt;title&gt;ABC&lt;/title&gt;
  51. * &lt;id&gt;123&lt;/id&gt;
  52. * &lt;revision beginningofpage="true"&gt;&lt;text xml:space="preserve"&gt;
  53. * &lt;/text&gt;&lt;/revision&gt;&lt;revision&gt;
  54. * &lt;id&gt;100&lt;/id&gt;
  55. * ....
  56. * &lt;/revision&gt;
  57. * &lt;/page&gt;
  58. * </code></pre>
  59. * <pre><code>
  60. * &lt;page&gt;
  61. * &lt;title&gt;ABC&lt;/title&gt;
  62. * &lt;id&gt;123&lt;/id&gt;
  63. * &lt;revision&gt;
  64. * &lt;id&gt;100&lt;/id&gt;
  65. * ....
  66. * &lt;/revision&gt;
  67. * &lt;revision&gt;
  68. * &lt;id&gt;200&lt;/id&gt;
  69. * ....
  70. * &lt;/revision&gt;
  71. * &lt;/page&gt;
  72. * </code></pre>
  73. * <pre><code>
  74. * &lt;page&gt;
  75. * &lt;title&gt;ABC&lt;/title&gt;
  76. * &lt;id&gt;123&lt;/id&gt;
  77. * &lt;revision&gt;
  78. * &lt;id&gt;200&lt;/id&gt;
  79. * ....
  80. * &lt;/revision&gt;
  81. * &lt;revision&gt;
  82. * &lt;id&gt;300&lt;/id&gt;
  83. * ....
  84. * &lt;/revision&gt;
  85. * &lt;/page&gt;
  86. * </code></pre>
  87. * <pre><code>
  88. * &lt;page&gt;
  89. * &lt;title&gt;DEF&lt;/title&gt;
  90. * &lt;id&gt;456&lt;/id&gt;
  91. * &lt;revision beginningofpage="true"&gt;&lt;text xml:space="preserve"&gt;
  92. * &lt;/text&gt;&lt;/revision&gt;&lt;revision&gt;
  93. * &lt;id&gt;400&lt;/id&gt;
  94. * ....
  95. * &lt;/revision&gt;
  96. * &lt;/page&gt;
  97. * </code></pre>
  98. */
  99. // State of the flag:
  100. // -1: EOF
  101. // 1 - scanning for the first <page> tag
  102. // 2 - just passed the <page> tag but outside the <id> tag
  103. // 3 - just passed the <id>
  104. // 4 - just passed the </id> but outside <revision>
  105. // 5 - just passed the <revision>
  106. // 6 - just passed the </revision>
  107. // 7 - just passed the </page>
  108. public static class RevisionReader extends WikiRevisionReader<Text> {
  109. private static final Logger LOG = Logger.getLogger(RevisionReader.class);
  110. // indicating the flow condition within [flag = 6]
  111. // -1 - Unmatched
  112. // 1 - Matched <revision> tag partially
  113. // 2 - Matched </page> tag partially
  114. // 3 - Matched both <revision> and </page> partially
  115. private int lastMatchTag = -1;
  116. private DataOutputBuffer pageHeader = new DataOutputBuffer();
  117. private DataOutputBuffer keyBuf = new DataOutputBuffer();
  118. private DataOutputBuffer revBuf = new DataOutputBuffer();
  119. @Override
  120. public void initialize(InputSplit input, TaskAttemptContext tac)
  121. throws IOException, InterruptedException {
  122. super.initialize(input, tac);
  123. value = new Text();
  124. }
  125. @Override
  126. public STATE doWhenMatch() throws IOException, InterruptedException {
  127. if (flag == 7) {
  128. pageHeader.reset();
  129. revBuf.reset();
  130. value.clear();
  131. }
  132. else if (flag == 6) {
  133. value.set(pageHeader.getData(), 0, pageHeader.getLength()
  134. - START_REVISION.length);
  135. value.append(revBuf.getData(), 0, revBuf.getLength());
  136. value.append(END_PAGE, 0, END_PAGE.length);
  137. return STATE.STOP_TRUE;
  138. }
  139. else if (flag == 4) {
  140. String pageId = new String(keyBuf.getData(), 0, keyBuf.getLength()
  141. - END_ID.length);
  142. key.set(Long.parseLong(pageId));
  143. keyBuf.reset();
  144. }
  145. else if (flag == 2) {
  146. pageHeader.write(START_PAGE);
  147. }
  148. else if (flag == 5) {
  149. revBuf.reset();
  150. revBuf.write(START_REVISION);
  151. }
  152. else if (flag == -1) {
  153. pageHeader.reset();
  154. return STATE.STOP_FALSE;
  155. }
  156. return STATE.CONTINUE;
  157. }
  158. @Override
  159. protected boolean readUntilMatch() throws IOException {
  160. if (buf == null && pos.length != 2)
  161. throw new IOException("Internal buffer corrupted.");
  162. int i = 0;
  163. while (true) {
  164. if (pos[0] == pos[1]) {
  165. pos[1] = (compressed) ? ((CompressionInputStream)fsin).read(buf) :
  166. ((FSDataInputStream)fsin).read(buf);
  167. LOG.info(pos[1] + " bytes read from the stream...");
  168. pos[0] = 0;
  169. if (pos[1] == -1) {
  170. return false;
  171. }
  172. }
  173. while (pos[0] < pos[1]) {
  174. byte b = buf[pos[0]];
  175. pos[0]++;
  176. // ignore every character until reaching a new page
  177. if (flag == 1 || flag == 7) {
  178. if (b == START_PAGE[i]) {
  179. i++;
  180. if (i >= START_PAGE.length) {
  181. flag = 2;
  182. return true;
  183. }
  184. } else i = 0;
  185. }
  186. // put everything between <page> tag and the first <id> tag into pageHeader
  187. else if (flag == 2) {
  188. if (b == START_ID[i]) {
  189. i++;
  190. } else i = 0;
  191. pageHeader.write(b);
  192. if (i >= START_ID.length) {
  193. flag = 3;
  194. return true;
  195. }
  196. }
  197. // put everything in <id></id> block into pageHeader and keyBuf
  198. else if (flag == 3) {
  199. if (b == END_ID[i]) {
  200. i++;
  201. } else i = 0;
  202. pageHeader.write(b);
  203. keyBuf.write(b);
  204. if (i >= END_ID.length) {
  205. flag = 4;
  206. return true;
  207. }
  208. }
  209. // put everything between </id> tag and the first <revision> tag into pageHeader
  210. else if (flag == 4) {
  211. if (b == START_REVISION[i]) {
  212. i++;
  213. } else i = 0;
  214. pageHeader.write(b);
  215. if (i >= START_REVISION.length) {
  216. flag = 5;
  217. return true;
  218. }
  219. }
  220. // inside <revision></revision> block
  221. else if (flag == 5) {
  222. if (b == END_REVISION[i]) {
  223. i++;
  224. } else i = 0;
  225. revBuf.write(b);
  226. if (i >= END_REVISION.length) {
  227. flag = 6;
  228. return true;
  229. }
  230. }
  231. // Note that flag 4 can be the signal of a new record inside one old page
  232. else if (flag == 6) {
  233. int curMatch = 0;
  234. if ((i < END_PAGE.length && b == END_PAGE[i])
  235. && (i < START_REVISION.length && b == START_REVISION[i])) {
  236. curMatch = 3;
  237. } else if (i < END_PAGE.length && b == END_PAGE[i]) {
  238. curMatch = 2;
  239. } else if (i < START_REVISION.length && b == START_REVISION[i]) {
  240. curMatch = 1;
  241. }
  242. if (curMatch > 0 && (i == 0 || lastMatchTag == 3 || curMatch == lastMatchTag)) {
  243. i++;
  244. lastMatchTag = curMatch;
  245. } else i = 0;
  246. if ((lastMatchTag == 2 || lastMatchTag == 3) && i >= END_PAGE.length) {
  247. flag = 7;
  248. lastMatchTag = -1;
  249. return true;
  250. } else if ((lastMatchTag == 1 || lastMatchTag == 3) && i >= START_REVISION.length) {
  251. flag = 5;
  252. lastMatchTag = -1;
  253. return true;
  254. }
  255. }
  256. }
  257. }
  258. }
  259. }
  260. }