PageRenderTime 48ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/java/main/org/hedera/io/etl/RevisionIdsFormat.java

https://github.com/giangbinhtran/Hedera
Java | 190 lines | 145 code | 23 blank | 22 comment | 43 complexity | 4ec7668bc48d13be6daea6bdb3cfa9a3 MD5 | raw file
  1. package org.hedera.io.etl;
  2. import java.io.IOException;
  3. import org.apache.hadoop.fs.Path;
  4. import org.apache.hadoop.io.DataOutputBuffer;
  5. import org.apache.hadoop.io.LongWritable;
  6. import org.apache.hadoop.mapreduce.InputSplit;
  7. import org.apache.hadoop.mapreduce.JobContext;
  8. import org.apache.hadoop.mapreduce.RecordReader;
  9. import org.apache.hadoop.mapreduce.TaskAttemptContext;
  10. import org.hedera.io.RevisionHeader;
  11. import org.hedera.io.input.WikiRevisionInputFormat;
  12. import edu.umd.cloud9.io.pair.PairOfLongs;
  13. public class RevisionIdsFormat extends
  14. WikiRevisionInputFormat<LongWritable, PairOfLongs> {
  15. // This job is not expensive, so don't bother set high parallel degree
  16. @Override
  17. public boolean isSplitable(JobContext context, Path file) {
  18. return false;
  19. }
  20. @Override
  21. public RecordReader<LongWritable, PairOfLongs> createRecordReader(
  22. InputSplit input, TaskAttemptContext context) throws IOException,
  23. InterruptedException {
  24. return new RevisionIdsReader();
  25. }
  26. /**
  27. * A lightweight ETL Reader that reads through Wikipedia Revision and extracts
  28. * revision ids, time stamp and page id / title. only those!!
  29. * @author tuan
  30. *
  31. */
  32. public class RevisionIdsReader extends
  33. DefaultRevisionETLReader<LongWritable, PairOfLongs> {
  34. @Override
  35. protected ETLExtractor<LongWritable, PairOfLongs,
  36. RevisionHeader> initializeExtractor() {
  37. return new IdExtractor();
  38. }
  39. @Override
  40. protected LongWritable initializeKey() {
  41. return new LongWritable();
  42. }
  43. @Override
  44. protected void freeKey(LongWritable key) {
  45. key.set(0);
  46. }
  47. @Override
  48. protected PairOfLongs initializeValue() {
  49. return new PairOfLongs();
  50. }
  51. @Override
  52. protected void freeValue(PairOfLongs value) {
  53. value.set(0, 0);
  54. }
  55. @Override
  56. // -1: EOF
  57. // 9 - default
  58. // 10 - just passed the inner <id> tag inside <revision>
  59. // 11 - just passed the inner </id> tag inside <revision>
  60. // 12 - just passed the <timestamp>
  61. // 13 - just passed the </timestamp> tag, skip to the </revision>
  62. // 14 - just passed the </revision>
  63. protected Ack readToNextRevision(DataOutputBuffer buffer,
  64. RevisionHeader meta) throws IOException {
  65. int i = 0;
  66. int flag = 9;
  67. try (DataOutputBuffer revIdBuf = new DataOutputBuffer();
  68. DataOutputBuffer timestampBuf = new DataOutputBuffer();
  69. DataOutputBuffer parBuf = new DataOutputBuffer()) {
  70. while (true) {
  71. if (!fetchMore()) return Ack.EOF;
  72. while (hasData()) {
  73. byte b = nextByte();
  74. if (flag == 9) {
  75. if (b == START_ID[i]) {
  76. i++;
  77. } else i = 0;
  78. if (i >= START_ID.length) {
  79. flag = 10;
  80. i = 0;
  81. }
  82. }
  83. // everything inside the inner <id></id>
  84. // block goes to revision buffer
  85. else if (flag == 10) {
  86. if (b == END_ID[i]) {
  87. i++;
  88. } else i = 0;
  89. revIdBuf.write(b);
  90. if (i >= END_ID.length) {
  91. flag = 11;
  92. String idStr = new String(revIdBuf.getData(), 0,
  93. revIdBuf.getLength() - END_ID.length);
  94. long revId = Long.parseLong(idStr);
  95. meta.setRevisionId(revId);
  96. revIdBuf.reset();
  97. i = 0;
  98. }
  99. }
  100. // after the inner <id>, check for <timestamp>
  101. else if (flag == 11) {
  102. if (b == START_TIMESTAMP[i]) {
  103. i++;
  104. } else i = 0;
  105. if (i >= START_TIMESTAMP.length) {
  106. flag = 12;
  107. i = 0;
  108. }
  109. }
  110. // inside <timestamp></timestamp> block everything goes to
  111. // timestamp buffer
  112. else if (flag == 12) {
  113. if (b == END_TIMESTAMP[i]) {
  114. i++;
  115. } else i = 0;
  116. timestampBuf.write(b);
  117. if (i >= END_TIMESTAMP.length) {
  118. flag = 13;
  119. String ts = new String(timestampBuf.getData(), 0,
  120. timestampBuf.getLength()
  121. - END_TIMESTAMP.length);
  122. long timestamp = TIME_FORMAT.parseMillis(ts);
  123. meta.setTimestamp(timestamp);
  124. timestampBuf.reset();
  125. i = 0;
  126. }
  127. }
  128. // after the </timestamp>, check for <text>
  129. else if (flag == 13) {
  130. if (b == END_REVISION[i]) {
  131. i++;
  132. } else i = 0;
  133. if (i >= END_REVISION.length) {
  134. // the flag is not anymore useful
  135. flag = 14;
  136. return Ack.PASSED_TO_NEXT_TAG;
  137. }
  138. }
  139. }
  140. }
  141. }
  142. }
  143. }
  144. public static class IdExtractor implements
  145. ETLExtractor<LongWritable, PairOfLongs,
  146. RevisionHeader> {
  147. @Override
  148. // We keep everything for articles
  149. public float check(RevisionHeader meta1,
  150. RevisionHeader meta2) {
  151. return 1f;
  152. }
  153. @Override
  154. public boolean extract(DataOutputBuffer content, RevisionHeader meta,
  155. LongWritable key, PairOfLongs value) {
  156. if (meta == null || (meta.getRevisionId() == 0
  157. && meta.getTimestamp() == 0)) {
  158. return false;
  159. }
  160. long revId = meta.getRevisionId();
  161. long ts = meta.getTimestamp();
  162. key.set(meta.getPageId());
  163. value.set(revId, ts);
  164. return true;
  165. }
  166. }
  167. }