PageRenderTime 53ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/java/main/org/hedera/io/etl/IntervalRevisionETLReader.java

https://github.com/giangbinhtran/Hedera
Java | 272 lines | 210 code | 21 blank | 41 comment | 136 complexity | 234b6936b247d8d4cbadc49f6d70e677 MD5 | raw file
  1. /**
  2. *
  3. */
  4. package org.hedera.io.etl;
  5. import static org.hedera.io.input.WikiRevisionInputFormat.END_ID;
  6. import static org.hedera.io.input.WikiRevisionInputFormat.END_PARENT_ID;
  7. import static org.hedera.io.input.WikiRevisionInputFormat.END_REVISION;
  8. import static org.hedera.io.input.WikiRevisionInputFormat.END_TEXT;
  9. import static org.hedera.io.input.WikiRevisionInputFormat.END_TIMESTAMP;
  10. import static org.hedera.io.input.WikiRevisionInputFormat.MINOR_TAG;
  11. import static org.hedera.io.input.WikiRevisionInputFormat.START_ID;
  12. import static org.hedera.io.input.WikiRevisionInputFormat.START_PARENT_ID;
  13. import static org.hedera.io.input.WikiRevisionInputFormat.START_TEXT;
  14. import static org.hedera.io.input.WikiRevisionInputFormat.START_TIMESTAMP;
  15. import static org.hedera.io.input.WikiRevisionInputFormat.TIME_FORMAT;
  16. import java.io.IOException;
  17. import org.apache.hadoop.conf.Configuration;
  18. import org.apache.hadoop.io.DataOutputBuffer;
  19. import org.apache.hadoop.mapreduce.InputSplit;
  20. import org.apache.hadoop.mapreduce.TaskAttemptContext;
  21. import org.hedera.io.RevisionHeader;
  22. /**
  23. * A WikiRevsionETLReader that skips all revisions out of a specific range
  24. * @author tuan
  25. *
  26. */
  27. public abstract class IntervalRevisionETLReader<KEYIN, VALUEIN> extends
  28. DefaultRevisionETLReader<KEYIN, VALUEIN> {
  29. public static final String START_TIME_OPT = "org.hedera.io.etl.starttime";
  30. public static final String END_TIME_OPT = "org.hedera.io.etl.endtime";
  31. public static final String SCALE_OPT = "org.hedera.io.etl.bow.scale";
  32. public static final String HOUR_SCALE_OPT = "hour";
  33. public static final String DAY_SCALE_OPT = "day";
  34. public static final String WEEK_SCALE_OPT = "week";
  35. public static final String MONTH_SCALE_OPT = "month";
  36. private long startTs = Long.MIN_VALUE;
  37. private long endTs = Long.MAX_VALUE;
  38. @Override
  39. public void initialize(InputSplit input, TaskAttemptContext tac)
  40. throws IOException, InterruptedException {
  41. super.initialize(input, tac);
  42. Configuration conf = tac.getConfiguration();
  43. String startTime = conf.get(START_TIME_OPT);
  44. if (startTime != null) {
  45. startTs = TIME_FORMAT.parseMillis(startTime);
  46. }
  47. String endTime = conf.get(END_TIME_OPT);
  48. if (endTime != null) {
  49. endTs = TIME_FORMAT.parseMillis(endTime);
  50. }
  51. }
  52. @Override
  53. // -1: EOF
  54. // 9 - default
  55. // 10 - just passed the inner <id> tag inside <revision>
  56. // 11 - just passed the inner </id> tag inside <revision>
  57. // 12 - just passed the <timestamp>
  58. // 13 - just passed the </timestamp> tag
  59. // 14 - just passed the <parentId>
  60. // 15 - just passed the </parentId> tag
  61. // 16 - just passed the <minor/> (or not)
  62. // 17 - just passed the <text> tag
  63. // 18 - just passed the </text> tag
  64. // 19 - just passed the </revision>
  65. protected Ack readToNextRevision(DataOutputBuffer buffer,
  66. RevisionHeader meta) throws IOException {
  67. int i = 0;
  68. int flag = 9;
  69. int parOrTs = -1;
  70. int minorOrText = -1;
  71. try (DataOutputBuffer revIdBuf = new DataOutputBuffer();
  72. DataOutputBuffer timestampBuf = new DataOutputBuffer();
  73. DataOutputBuffer parBuf = new DataOutputBuffer()) {
  74. while (true) {
  75. if (!fetchMore()) return Ack.EOF;
  76. while (hasData()) {
  77. byte b = nextByte();
  78. if (flag == 9) {
  79. if (b == START_ID[i]) {
  80. i++;
  81. } else i = 0;
  82. if (i >= START_ID.length) {
  83. flag = 10;
  84. i = 0;
  85. }
  86. }
  87. // everything inside the inner <id></id>
  88. // block goes to revision buffer
  89. else if (flag == 10) {
  90. if (b == END_ID[i]) {
  91. i++;
  92. } else i = 0;
  93. revIdBuf.write(b);
  94. if (i >= END_ID.length) {
  95. flag = 11;
  96. String idStr = new String(revIdBuf.getData(), 0,
  97. revIdBuf.getLength() - END_ID.length);
  98. long revId = Long.parseLong(idStr);
  99. meta.setRevisionId(revId);
  100. revIdBuf.reset();
  101. i = 0;
  102. }
  103. }
  104. // after the inner <id>, check for either <timestamp> or <parentId>
  105. else if (flag == 11) {
  106. int curMatch = 0;
  107. if ((i < START_PARENT_ID.length && b == START_PARENT_ID[i])
  108. && (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i])) {
  109. curMatch = 3;
  110. } else if (i < START_PARENT_ID.length && b == START_PARENT_ID[i]) {
  111. curMatch = 1;
  112. } else if (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i]) {
  113. curMatch = 2;
  114. }
  115. if (curMatch > 0 && (i == 0 || parOrTs == 3 || curMatch == parOrTs)) {
  116. i++;
  117. parOrTs = curMatch;
  118. } else i = 0;
  119. if ((parOrTs == 2 || parOrTs == 3) && i >= START_TIMESTAMP.length) {
  120. flag = 12;
  121. parOrTs = -1;
  122. i = 0;
  123. } else if ((parOrTs == 1 || parOrTs == 3) && i >= START_PARENT_ID.length) {
  124. flag = 14;
  125. parOrTs = -1;
  126. i = 0;
  127. }
  128. }
  129. // inside <timestamp></timestamp> block everything goes to timestamp buffer
  130. else if (flag == 12) {
  131. if (b == END_TIMESTAMP[i]) {
  132. i++;
  133. } else i = 0;
  134. timestampBuf.write(b);
  135. if (i >= END_TIMESTAMP.length) {
  136. flag = 13;
  137. String ts = new String(timestampBuf.getData(), 0,
  138. timestampBuf.getLength()
  139. - END_TIMESTAMP.length);
  140. long timestamp = TIME_FORMAT.parseMillis(ts);
  141. if (timestamp < startTs || timestamp >= endTs) {
  142. meta.clear();
  143. return Ack.SKIPPED;
  144. }
  145. meta.setTimestamp(timestamp);
  146. timestampBuf.reset();
  147. i = 0;
  148. }
  149. }
  150. // inside <parentId></parentId> block everything goes to parentId buffer
  151. else if (flag == 14) {
  152. if (b == END_PARENT_ID[i]) {
  153. i++;
  154. } else i = 0;
  155. parBuf.write(b);
  156. if (i >= END_PARENT_ID.length) {
  157. flag = 15;
  158. String parIdStr = new String(parBuf.getData(), 0, parBuf.getLength()
  159. - END_PARENT_ID.length);
  160. long parId = Long.parseLong(parIdStr);
  161. meta.setParentId(parId);
  162. parBuf.reset();
  163. i = 0;
  164. }
  165. }
  166. // after the </parentId>, search for <timestamp>
  167. else if (flag == 15) {
  168. if (b == START_TIMESTAMP[i]) {
  169. i++;
  170. } else i = 0;
  171. if (i >= START_TIMESTAMP.length) {
  172. flag = 12;
  173. i = 0;
  174. }
  175. }
  176. // After the timestamp, sometimes we can make a quick check to see
  177. // whether we should skip this revision
  178. // after the </timestamp>, check for <minor/>, if they exist
  179. else if (flag == 13) {
  180. int curMatch = 0;
  181. if ((i < START_TEXT.length && b == START_TEXT[i])
  182. && (i < MINOR_TAG.length && b == MINOR_TAG[i])) {
  183. curMatch = 3;
  184. } else if (i < START_TEXT.length && b == START_TEXT[i]) {
  185. curMatch = 1;
  186. } else if (i < MINOR_TAG.length && b == MINOR_TAG[i]) {
  187. curMatch = 2;
  188. }
  189. if (curMatch > 0 && (i == 0 || minorOrText == 3 || curMatch == minorOrText)) {
  190. i++;
  191. minorOrText = curMatch;
  192. } else i = 0;
  193. if ((minorOrText == 2 || minorOrText == 3) && i >= MINOR_TAG.length) {
  194. // update the meta
  195. meta.setMinor(true);
  196. flag = 16;
  197. minorOrText = -1;
  198. i = 0;
  199. } else if ((minorOrText == 1 || minorOrText == 3) && i >= START_TEXT.length) {
  200. flag = 17;
  201. minorOrText = -1;
  202. i = 0;
  203. }
  204. }
  205. // after the <minor/>, and search for <text>
  206. else if (flag == 16) {
  207. if (b == START_TEXT[i]) {
  208. i++;
  209. } else i = 0;
  210. if (i >= START_TEXT.length) {
  211. flag = 17;
  212. i = 0;
  213. }
  214. }
  215. // inside <text></text> block everything goes to content buffer
  216. else if (flag == 17) {
  217. if (b == END_TEXT[i]) {
  218. i++;
  219. } else i = 0;
  220. buffer.write(b);
  221. if (i >= END_TEXT.length) {
  222. flag = 18;
  223. meta.setLength(buffer.getLength());
  224. processMetaData(buffer, meta);
  225. i = 0;
  226. }
  227. }
  228. // look for the closing </revision>
  229. else if (flag == 18) {
  230. if (b == END_REVISION[i]) {
  231. i++;
  232. } else i = 0;
  233. if (i >= END_REVISION.length) {
  234. // the flag is not anymore useful
  235. flag = 19;
  236. return Ack.PASSED_TO_NEXT_TAG;
  237. }
  238. }
  239. }
  240. }
  241. }
  242. }
  243. /**
  244. * This method processes after caching the currently visited revision.
  245. * It performs the meta-data quick updates before the actual extraction
  246. * (in WikiRevisionETLReader's code)
  247. * @param buffer
  248. * @param meta
  249. */
  250. protected void processMetaData(DataOutputBuffer buffer, RevisionHeader meta) {}
  251. }