PageRenderTime 38ms CodeModel.GetById 13ms RepoModel.GetById 1ms app.codeStats 0ms

/java/test/org/hedera/LocalIntervalWikiRevisionETLReader.java

https://github.com/giangbinhtran/Hedera
Java | 251 lines | 192 code | 21 blank | 38 comment | 132 complexity | d25c0836c24e82727cc1a85c76449cff MD5 | raw file
  1. package org.hedera;
  2. import static org.hedera.io.input.WikiRevisionInputFormat.END_PARENT_ID;
  3. import static org.hedera.io.input.WikiRevisionInputFormat.MINOR_TAG;
  4. import static org.hedera.io.input.WikiRevisionInputFormat.START_PARENT_ID;
  5. import static org.hedera.io.input.WikiRevisionInputFormat.TIME_FORMAT;
  6. import java.io.IOException;
  7. import org.apache.hadoop.io.DataOutputBuffer;
  8. import org.hedera.io.RevisionHeader;
  9. /**
  10. * A WikiRevsionETLReader that skips all revisions out of a specific range
  11. * @author tuan
  12. *
  13. */
  14. public abstract class LocalIntervalWikiRevisionETLReader<KEYIN, VALUEIN> extends
  15. LocalDefaultWikiRevisionETLReader<KEYIN, VALUEIN> {
  16. public static final String START_TIME_OPT = "org.hedera.io.etl.starttime";
  17. public static final String END_TIME_OPT = "org.hedera.io.etl.endtime";
  18. public static final String SCALE_OPT = "org.hedera.io.etl.bow.scale";
  19. public static final String HOUR_SCALE_OPT = "hour";
  20. public static final String DAY_SCALE_OPT = "day";
  21. public static final String WEEK_SCALE_OPT = "week";
  22. public static final String MONTH_SCALE_OPT = "month";
  23. private long startTs = Long.MIN_VALUE;
  24. private long endTs = Long.MAX_VALUE;
  25. @Override
  26. public void initialize() throws IOException {
  27. super.initialize();
  28. endTs = TIME_FORMAT.parseMillis("2005-03-28T07:41:42Z");
  29. startTs = TIME_FORMAT.parseMillis("2003-05-30T12:57:20Z");
  30. }
  31. @Override
  32. // -1: EOF
  33. // 9 - default
  34. // 10 - just passed the inner <id> tag inside <revision>
  35. // 11 - just passed the inner </id> tag inside <revision>
  36. // 12 - just passed the <timestamp>
  37. // 13 - just passed the </timestamp> tag
  38. // 14 - just passed the <parentId>
  39. // 15 - just passed the </parentId> tag
  40. // 16 - just passed the <minor/> (or not)
  41. // 17 - just passed the <text> tag
  42. // 18 - just passed the </text> tag
  43. // 19 - just passed the </revision>
  44. protected Ack readToNextRevision(DataOutputBuffer buffer,
  45. RevisionHeader meta) throws IOException {
  46. int i = 0;
  47. int flag = 9;
  48. int parOrTs = -1;
  49. int minorOrText = -1;
  50. try (DataOutputBuffer revIdBuf = new DataOutputBuffer();
  51. DataOutputBuffer timestampBuf = new DataOutputBuffer();
  52. DataOutputBuffer parBuf = new DataOutputBuffer()) {
  53. while (true) {
  54. if (!fetchMore()) return Ack.EOF;
  55. while (hasData()) {
  56. byte b = nextByte();
  57. if (flag == 9) {
  58. if (b == START_ID[i]) {
  59. i++;
  60. } else i = 0;
  61. if (i >= START_ID.length) {
  62. flag = 10;
  63. i = 0;
  64. }
  65. }
  66. // everything inside the inner <id></id>
  67. // block goes to revision buffer
  68. else if (flag == 10) {
  69. if (b == END_ID[i]) {
  70. i++;
  71. } else i = 0;
  72. revIdBuf.write(b);
  73. if (i >= END_ID.length) {
  74. flag = 11;
  75. String idStr = new String(revIdBuf.getData(), 0,
  76. revIdBuf.getLength() - END_ID.length);
  77. long revId = Long.parseLong(idStr);
  78. meta.setRevisionId(revId);
  79. revIdBuf.reset();
  80. i = 0;
  81. }
  82. }
  83. // after the inner <id>, check for either <timestamp> or <parentId>
  84. else if (flag == 11) {
  85. int curMatch = 0;
  86. if ((i < START_PARENT_ID.length && b == START_PARENT_ID[i])
  87. && (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i])) {
  88. curMatch = 3;
  89. } else if (i < START_PARENT_ID.length && b == START_PARENT_ID[i]) {
  90. curMatch = 1;
  91. } else if (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i]) {
  92. curMatch = 2;
  93. }
  94. if (curMatch > 0 && (i == 0 || parOrTs == 3 || curMatch == parOrTs)) {
  95. i++;
  96. parOrTs = curMatch;
  97. } else i = 0;
  98. if ((parOrTs == 2 || parOrTs == 3) && i >= START_TIMESTAMP.length) {
  99. flag = 12;
  100. parOrTs = -1;
  101. i = 0;
  102. } else if ((parOrTs == 1 || parOrTs == 3) && i >= START_PARENT_ID.length) {
  103. flag = 14;
  104. parOrTs = -1;
  105. i = 0;
  106. }
  107. }
  108. // inside <timestamp></timestamp> block everything goes to timestamp buffer
  109. else if (flag == 12) {
  110. if (b == END_TIMESTAMP[i]) {
  111. i++;
  112. } else i = 0;
  113. timestampBuf.write(b);
  114. if (i >= END_TIMESTAMP.length) {
  115. flag = 13;
  116. String ts = new String(timestampBuf.getData(), 0,
  117. timestampBuf.getLength()
  118. - END_TIMESTAMP.length);
  119. long timestamp = TIME_FORMAT.parseMillis(ts);
  120. if (timestamp < startTs || timestamp >= endTs) {
  121. meta.clear();
  122. return Ack.SKIPPED;
  123. }
  124. meta.setTimestamp(timestamp);
  125. timestampBuf.reset();
  126. i = 0;
  127. }
  128. }
  129. // inside <parentId></parentId> block everything goes to parentId buffer
  130. else if (flag == 14) {
  131. if (b == END_PARENT_ID[i]) {
  132. i++;
  133. } else i = 0;
  134. parBuf.write(b);
  135. if (i >= END_PARENT_ID.length) {
  136. flag = 15;
  137. String parIdStr = new String(parBuf.getData(), 0, parBuf.getLength()
  138. - END_PARENT_ID.length);
  139. long parId = Long.parseLong(parIdStr);
  140. meta.setParentId(parId);
  141. parBuf.reset();
  142. i = 0;
  143. }
  144. }
  145. // after the </parentId>, search for <timestamp>
  146. else if (flag == 15) {
  147. if (b == START_TIMESTAMP[i]) {
  148. i++;
  149. } else i = 0;
  150. if (i >= START_TIMESTAMP.length) {
  151. flag = 12;
  152. i = 0;
  153. }
  154. }
  155. // After the timestamp, sometimes we can make a quick check to see
  156. // whether we should skip this revision
  157. // after the </timestamp>, check for <minor/>, if they exist
  158. else if (flag == 13) {
  159. int curMatch = 0;
  160. if ((i < START_TEXT.length && b == START_TEXT[i])
  161. && (i < MINOR_TAG.length && b == MINOR_TAG[i])) {
  162. curMatch = 3;
  163. } else if (i < START_TEXT.length && b == START_TEXT[i]) {
  164. curMatch = 1;
  165. } else if (i < MINOR_TAG.length && b == MINOR_TAG[i]) {
  166. curMatch = 2;
  167. }
  168. if (curMatch > 0 && (i == 0 || minorOrText == 3 || curMatch == minorOrText)) {
  169. i++;
  170. minorOrText = curMatch;
  171. } else i = 0;
  172. if ((minorOrText == 2 || minorOrText == 3) && i >= MINOR_TAG.length) {
  173. // update the meta
  174. meta.setMinor(true);
  175. flag = 16;
  176. minorOrText = -1;
  177. i = 0;
  178. } else if ((minorOrText == 1 || minorOrText == 3) && i >= START_TEXT.length) {
  179. flag = 17;
  180. minorOrText = -1;
  181. i = 0;
  182. }
  183. }
  184. // after the <minor/>, and search for <text>
  185. else if (flag == 16) {
  186. if (b == START_TEXT[i]) {
  187. i++;
  188. } else i = 0;
  189. if (i >= START_TEXT.length) {
  190. flag = 17;
  191. i = 0;
  192. }
  193. }
  194. // inside <text></text> block everything goes to content buffer
  195. else if (flag == 17) {
  196. if (b == END_TEXT[i]) {
  197. i++;
  198. } else i = 0;
  199. buffer.write(b);
  200. if (i >= END_TEXT.length) {
  201. flag = 18;
  202. meta.setLength(buffer.getLength());
  203. processMetaData(buffer, meta);
  204. i = 0;
  205. }
  206. }
  207. // look for the closing </revision>
  208. else if (flag == 18) {
  209. if (b == END_REVISION[i]) {
  210. i++;
  211. } else i = 0;
  212. if (i >= END_REVISION.length) {
  213. // the flag is not anymore useful
  214. flag = 19;
  215. return Ack.PASSED_TO_NEXT_TAG;
  216. }
  217. }
  218. }
  219. }
  220. }
  221. }
  222. /**
  223. * This method processes after caching the currently visited revision.
  224. * It performs the meta-data quick updates before the actual extraction
  225. * (in WikiRevisionETLReader's code)
  226. * @param buffer
  227. * @param meta
  228. */
  229. protected void processMetaData(DataOutputBuffer buffer, RevisionHeader meta) {}
  230. }