PageRenderTime 54ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/java/test/org/hedera/LocalWikiRevisionLinkReader.java

https://github.com/giangbinhtran/Hedera
Java | 200 lines | 168 code | 21 blank | 11 comment | 94 complexity | 7a1462023f4416ade5156078081b11f8 MD5 | raw file
  1. package org.hedera;
  2. import static org.hedera.io.input.WikiRevisionInputFormat.END_PARENT_ID;
  3. import static org.hedera.io.input.WikiRevisionInputFormat.START_PARENT_ID;
  4. import static org.hedera.io.input.WikiRevisionInputFormat.TIME_FORMAT;
  5. import java.io.IOException;
  6. import org.apache.hadoop.io.DataOutputBuffer;
  7. import org.apache.hadoop.io.LongWritable;
  8. import org.hedera.io.LinkProfile;
  9. import org.hedera.io.RevisionHeader;
  10. import org.hedera.io.etl.ETLExtractor;
  11. import org.hedera.io.etl.RevisionLinkInputFormat;
  12. public class LocalWikiRevisionLinkReader extends
  13. LocalDefaultWikiRevisionETLReader<LongWritable, LinkProfile> {
  14. @Override
  15. protected LongWritable initializeKey() {
  16. return new LongWritable();
  17. }
  18. @Override
  19. protected LinkProfile initializeValue() {
  20. return new LinkProfile();
  21. }
  22. @Override
  23. protected void freeKey(LongWritable key) {
  24. }
  25. @Override
  26. protected void freeValue(LinkProfile value) {
  27. value.clear();
  28. }
  29. @Override
  30. protected ETLExtractor<LongWritable, LinkProfile,
  31. RevisionHeader> initializeExtractor() {
  32. return new RevisionLinkInputFormat.LinkExtractor();
  33. }
  34. @Override
  35. protected Ack readToNextRevision(DataOutputBuffer buffer,
  36. RevisionHeader meta) throws IOException {
  37. int i = 0;
  38. int flag = 9;
  39. int parOrTs = -1;
  40. try (DataOutputBuffer revIdBuf = new DataOutputBuffer();
  41. DataOutputBuffer timestampBuf = new DataOutputBuffer();
  42. DataOutputBuffer parBuf = new DataOutputBuffer()) {
  43. while (true) {
  44. if (!fetchMore()) return Ack.EOF;
  45. while (hasData()) {
  46. byte b = nextByte();
  47. if (flag == 9) {
  48. if (b == START_ID[i]) {
  49. i++;
  50. } else i = 0;
  51. if (i >= START_ID.length) {
  52. flag = 10;
  53. i = 0;
  54. }
  55. }
  56. // everything inside the inner <id></id>
  57. // block goes to revision buffer
  58. else if (flag == 10) {
  59. if (b == END_ID[i]) {
  60. i++;
  61. } else i = 0;
  62. revIdBuf.write(b);
  63. if (i >= END_ID.length) {
  64. flag = 11;
  65. String idStr = new String(revIdBuf.getData(), 0,
  66. revIdBuf.getLength() - END_ID.length);
  67. long revId = Long.parseLong(idStr);
  68. meta.setRevisionId(revId);
  69. revIdBuf.reset();
  70. i = 0;
  71. }
  72. }
  73. // after the inner <id>, check for either <timestamp> or <parentId>
  74. else if (flag == 11) {
  75. int curMatch = 0;
  76. if ((i < START_PARENT_ID.length && b == START_PARENT_ID[i])
  77. && (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i])) {
  78. curMatch = 3;
  79. } else if (i < START_PARENT_ID.length && b == START_PARENT_ID[i]) {
  80. curMatch = 1;
  81. } else if (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i]) {
  82. curMatch = 2;
  83. }
  84. if (curMatch > 0 && (i == 0 || parOrTs == 3 || curMatch == parOrTs)) {
  85. i++;
  86. parOrTs = curMatch;
  87. } else i = 0;
  88. if ((parOrTs == 2 || parOrTs == 3) && i >= START_TIMESTAMP.length) {
  89. flag = 12;
  90. parOrTs = -1;
  91. i = 0;
  92. } else if ((parOrTs == 1 || parOrTs == 3) && i >= START_PARENT_ID.length) {
  93. flag = 14;
  94. parOrTs = -1;
  95. i = 0;
  96. }
  97. }
  98. // inside <timestamp></timestamp> block everything goes to timestamp buffer
  99. else if (flag == 12) {
  100. if (b == END_TIMESTAMP[i]) {
  101. i++;
  102. } else i = 0;
  103. timestampBuf.write(b);
  104. if (i >= END_TIMESTAMP.length) {
  105. flag = 13;
  106. String ts = new String(timestampBuf.getData(), 0,
  107. timestampBuf.getLength()
  108. - END_TIMESTAMP.length);
  109. long timestamp = TIME_FORMAT.parseMillis(ts);
  110. meta.setTimestamp(timestamp);
  111. timestampBuf.reset();
  112. i = 0;
  113. }
  114. }
  115. // inside <parentId></parentId> block everything goes to parentId buffer
  116. else if (flag == 14) {
  117. if (b == END_PARENT_ID[i]) {
  118. i++;
  119. } else i = 0;
  120. parBuf.write(b);
  121. if (i >= END_PARENT_ID.length) {
  122. flag = 15;
  123. String parIdStr = new String(parBuf.getData(), 0, parBuf.getLength()
  124. - END_PARENT_ID.length);
  125. long parId = Long.parseLong(parIdStr);
  126. meta.setParentId(parId);
  127. parBuf.reset();
  128. i = 0;
  129. }
  130. }
  131. // after the </parentId>, search for <timestamp>
  132. else if (flag == 15) {
  133. if (b == START_TIMESTAMP[i]) {
  134. i++;
  135. } else i = 0;
  136. if (i >= START_TIMESTAMP.length) {
  137. flag = 12;
  138. i = 0;
  139. }
  140. }
  141. // After the timestamp, sometimes we can make a quick check to see
  142. // whether we should skip this revision
  143. // after the </timestamp>, check for <text>
  144. else if (flag == 13) {
  145. if (b == START_TEXT[i]) {
  146. i++;
  147. } else i = 0;
  148. if (i >= START_TEXT.length) {
  149. flag = 16;
  150. i = 0;
  151. }
  152. }
  153. // inside <text></text> block everything goes to content buffer
  154. else if (flag == 16) {
  155. if (b == END_TEXT[i]) {
  156. i++;
  157. } else i = 0;
  158. buffer.write(b);
  159. if (i >= END_TEXT.length) {
  160. flag = 17;
  161. meta.setLength(buffer.getLength());
  162. i = 0;
  163. }
  164. }
  165. // look for the closing </revision>
  166. else if (flag == 17) {
  167. if (b == END_REVISION[i]) {
  168. i++;
  169. } else i = 0;
  170. if (i >= END_REVISION.length) {
  171. flag = 18;
  172. return Ack.PASSED_TO_NEXT_TAG;
  173. }
  174. }
  175. }
  176. }
  177. }
  178. }
  179. }