PageRenderTime 53ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 0ms

/java/test/org/hedera/LocalDefaultWikiRevisionETLReader.java

https://github.com/giangbinhtran/Hedera
Java | 200 lines | 158 code | 17 blank | 25 comment | 104 complexity | 9849c67c1e13b6ef5ea2083b4421be47 MD5 | raw file
  1. package org.hedera;
  2. import static org.hedera.io.input.WikiRevisionInputFormat.START_REDIRECT;
  3. import java.io.IOException;
  4. import org.apache.hadoop.io.DataOutputBuffer;
  5. import org.hedera.io.RevisionHeader;
  6. import org.mortbay.log.Log;
  7. public abstract class LocalDefaultWikiRevisionETLReader<KEYIN, VALUEIN> extends
  8. LocalWikiRevisionETLReader<RevisionHeader, KEYIN, VALUEIN> {
  9. // option to whether skip non-article pages
  10. protected boolean skipNonArticles = false;
  11. protected boolean skipRedirect = false;
  12. @Override
  13. protected RevisionHeader initializeMeta() {
  14. return new RevisionHeader();
  15. }
  16. @Override
  17. // Read the page header
  18. // -1: EOF
  19. // 1 - outside the <page> tag
  20. // 2 - just passed the <page> tag but outside the <title>
  21. // 3 - just passed the <title> tag
  22. // 4 - just passed the </title> tag but outside the <namespace>
  23. // 5 - just passed the <namespace>
  24. // 6 - just passed the </namespace> but outside the <id>
  25. // 7 - just passed the (page's) <id>
  26. // 8 - just passed the </id> tag but outside the <revision>
  27. // 9 - just passed the (next) <revision>
  28. protected Ack readToPageHeader(RevisionHeader meta)
  29. throws IOException {
  30. int i = 0;
  31. int flag = 2;
  32. boolean skipped = false;
  33. int revOrRedirect = -1;
  34. try (DataOutputBuffer pageTitle = new DataOutputBuffer();
  35. DataOutputBuffer nsBuf = new DataOutputBuffer();
  36. DataOutputBuffer keyBuf = new DataOutputBuffer()) {
  37. while (true) {
  38. if (!fetchMore()) return Ack.EOF;
  39. while (hasData()) {
  40. byte b = nextByte();
  41. if (flag == 2) {
  42. if (b == START_TITLE[i]) {
  43. i++;
  44. } else i = 0;
  45. if (i >= START_TITLE.length) {
  46. flag = 3;
  47. i = 0;
  48. }
  49. }
  50. // put everything between <title></title> block into title
  51. else if (flag == 3) {
  52. if (b == END_TITLE[i]) {
  53. i++;
  54. } else i = 0;
  55. pageTitle.write(b);
  56. if (i >= END_TITLE.length) {
  57. flag = 4;
  58. String title = new String(pageTitle.getData(), 0,
  59. pageTitle.getLength() - END_TITLE.length);
  60. meta.setPageTitle(title);
  61. pageTitle.reset();
  62. i = 0;
  63. }
  64. }
  65. else if (flag == 4) {
  66. if (b == START_NAMESPACE[i]) {
  67. i++;
  68. } else i = 0;
  69. if (i >= START_NAMESPACE.length) {
  70. flag = 5;
  71. i = 0;
  72. }
  73. }
  74. else if (flag == 5) {
  75. if (b == END_NAMESPACE[i]) {
  76. i++;
  77. } else i = 0;
  78. nsBuf.write(b);
  79. if (i >= END_NAMESPACE.length) {
  80. flag = 6;
  81. String nsStr = new String(nsBuf.getData(), 0,
  82. nsBuf.getLength() - END_NAMESPACE.length);
  83. int ns = Integer.parseInt(nsStr);
  84. nsBuf.reset();
  85. if (ns != 0) {
  86. if (skipNonArticles) {
  87. skipped = true;
  88. meta.clear();
  89. return Ack.SKIPPED;
  90. }
  91. }
  92. meta.setNamespace(ns);
  93. i = 0;
  94. }
  95. }
  96. // when passing the namespace and we realize that
  97. // this is not an article, and that the option of skipping
  98. // non-article pages is on, we simply skip everything until
  99. // the closing </page>
  100. else if (skipped && flag >= 6) {
  101. Log.warn("Peculiar read after skipping namespace");
  102. /* if (b == END_PAGE[i]) {
  103. i++;
  104. } else i = 0;
  105. if (i >= END_PAGE.length) {
  106. return Ack.SKIPPED;
  107. } */
  108. return Ack.FAILED;
  109. }
  110. else if (flag == 6) {
  111. if (b == START_ID[i]) {
  112. i++;
  113. } else i = 0;
  114. if (i >= START_ID.length) {
  115. flag = 7;
  116. i = 0;
  117. }
  118. }
  119. // put everything in outer <id></id> block into keyBuf
  120. else if (flag == 7) {
  121. if (b == END_ID[i]) {
  122. i++;
  123. } else i = 0;
  124. keyBuf.write(b);
  125. if (i >= END_ID.length) {
  126. flag = 8;
  127. String idStr = new String(keyBuf.getData(), 0,
  128. keyBuf.getLength() - END_ID.length);
  129. long pageId = Long.parseLong(idStr);
  130. meta.setPageId(pageId);
  131. i = 0;
  132. }
  133. }
  134. else if (flag == 8) {
  135. int curMatch = 0;
  136. if ((i < START_REVISION.length && b == START_REVISION[i])
  137. && (i < START_REDIRECT.length && b == START_REDIRECT[i])
  138. // subtle bug here: some tag names can overlap multiple times
  139. && (revOrRedirect == 3 || revOrRedirect == -1)) {
  140. curMatch = 3;
  141. } else if (i < START_REVISION.length && b == START_REVISION[i]
  142. && revOrRedirect != 2) {
  143. curMatch = 1;
  144. } else if (i < START_REDIRECT.length && b == START_REDIRECT[i]
  145. && revOrRedirect != 1) {
  146. curMatch = 2;
  147. } else curMatch = 0;
  148. if (curMatch > 0 && (i == 0 || revOrRedirect == 3
  149. || curMatch == revOrRedirect)) {
  150. i++;
  151. revOrRedirect = curMatch;
  152. } else i = 0;
  153. if ((revOrRedirect == 2 || revOrRedirect == 3)
  154. && i >= START_REDIRECT.length) {
  155. if (skipRedirect) {
  156. skipped = true;
  157. meta.clear();
  158. return Ack.SKIPPED;
  159. }
  160. revOrRedirect = -1;
  161. flag = 9;
  162. i = 0;
  163. } else if ((revOrRedirect == 1 || revOrRedirect == 3)
  164. && i >= START_REVISION.length) {
  165. flag = 10;
  166. revOrRedirect = -1;
  167. return Ack.PASSED_TO_NEXT_TAG;
  168. }
  169. }
  170. else if (flag == 9 && !skipRedirect) {
  171. if (b == START_REVISION[i]) {
  172. i++;
  173. } else i = 0;
  174. if (i >= START_REVISION.length) {
  175. flag = 10;
  176. return Ack.PASSED_TO_NEXT_TAG;
  177. }
  178. }
  179. }
  180. }
  181. }
  182. }
  183. }