PageRenderTime 45ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/java/main/org/hedera/io/etl/DefaultRevisionETLReader.java

https://github.com/giangbinhtran/Hedera
Java | 247 lines | 193 code | 22 blank | 32 comment | 104 complexity | 5196245da1dd6071821f474cdaae823a MD5 | raw file
  1. package org.hedera.io.etl;
  2. import static org.hedera.io.input.WikiRevisionInputFormat.SKIP_NON_ARTICLES;
  3. import static org.hedera.io.input.WikiRevisionInputFormat.SKIP_REDIRECT;
  4. import static org.hedera.io.input.WikiRevisionInputFormat.START_TITLE;
  5. import static org.hedera.io.input.WikiRevisionInputFormat.END_TITLE;
  6. import static org.hedera.io.input.WikiRevisionInputFormat.START_NAMESPACE;
  7. import static org.hedera.io.input.WikiRevisionInputFormat.END_NAMESPACE;
  8. import static org.hedera.io.input.WikiRevisionInputFormat.START_ID;
  9. import static org.hedera.io.input.WikiRevisionInputFormat.END_ID;
  10. import static org.hedera.io.input.WikiRevisionInputFormat.START_REVISION;
  11. import static org.hedera.io.input.WikiRevisionInputFormat.START_REDIRECT;
  12. import java.io.IOException;
  13. import org.apache.hadoop.io.DataOutputBuffer;
  14. import org.apache.hadoop.mapreduce.InputSplit;
  15. import org.apache.hadoop.mapreduce.TaskAttemptContext;
  16. import org.apache.log4j.Logger;
  17. import org.hedera.io.RevisionHeader;
  18. import org.mortbay.log.Log;
  19. /**
  20. * A default WikiRevisionETLReader that extracts title, page id, namespace
  21. * from the page header
  22. */
  23. public abstract class DefaultRevisionETLReader<KEYIN, VALUEIN> extends
  24. RevisionETLReader<KEYIN, VALUEIN, RevisionHeader> {
  25. private static final Logger LOG =
  26. Logger.getLogger(DefaultRevisionETLReader.class);
  27. // option to whether skip non-article or redirect pages
  28. protected boolean skipNonArticles = false;
  29. protected boolean skipRedirect = false;
  30. @Override
  31. public void initialize(InputSplit input, TaskAttemptContext tac)
  32. throws IOException, InterruptedException {
  33. super.initialize(input, tac);
  34. skipNonArticles = tac.getConfiguration()
  35. .getBoolean(SKIP_NON_ARTICLES, false);
  36. skipRedirect = tac.getConfiguration()
  37. .getBoolean(SKIP_REDIRECT, false);
  38. LOG.info("Splitting option: [skip non-article: "
  39. + skipNonArticles + ", skip redirect: "
  40. + SKIP_REDIRECT + "]");
  41. }
  42. @Override
  43. protected RevisionHeader initializeMeta() {
  44. return new RevisionHeader();
  45. }
  46. @Override
  47. // Read the page header
  48. // -1: EOF
  49. // 1 - outside the <page> tag
  50. // 2 - just passed the <page> tag but outside the <title>
  51. // 3 - just passed the <title> tag
  52. // 4 - just passed the </title> tag but outside the <namespace>
  53. // 5 - just passed the <namespace>
  54. // 6 - just passed the </namespace> but outside the <id>
  55. // 7 - just passed the (page's) <id>
  56. // 8 - just passed the </id> tag but outside the <revision>
  57. // 9 - (optionally) just passed the <redirect>
  58. // 10 - just passed the (next) <revision>
  59. protected Ack readToPageHeader(RevisionHeader meta)
  60. throws IOException {
  61. int i = 0;
  62. int flag = 2;
  63. boolean skipped = false;
  64. int revOrRedirect = -1;
  65. try (DataOutputBuffer pageTitle = new DataOutputBuffer();
  66. DataOutputBuffer nsBuf = new DataOutputBuffer();
  67. DataOutputBuffer keyBuf = new DataOutputBuffer()) {
  68. while (true) {
  69. if (!fetchMore()) return Ack.EOF;
  70. while (hasData()) {
  71. byte b = nextByte();
  72. // when passing the namespace and we realize that
  73. // this is not an article, and that the option of skipping
  74. // non-article pages is on, we simply skip everything until
  75. // the closing </page>
  76. if (skipped) {
  77. if (flag >= 6) {
  78. Log.warn("Peculiar read after skipping namespace");
  79. /*
  80. if (b == END_PAGE[i]) {
  81. i++;
  82. } else i = 0;
  83. if (i >= END_PAGE.length) {
  84. return Ack.SKIPPED;
  85. } */
  86. return Ack.FAILED;
  87. } else return Ack.SKIPPED;
  88. }
  89. if (flag == 2) {
  90. if (b == START_TITLE[i]) {
  91. i++;
  92. } else i = 0;
  93. if (i >= START_TITLE.length) {
  94. flag = 3;
  95. i = 0;
  96. }
  97. }
  98. // put everything between <title></title> block into title
  99. else if (flag == 3) {
  100. if (b == END_TITLE[i]) {
  101. i++;
  102. } else i = 0;
  103. pageTitle.write(b);
  104. if (i >= END_TITLE.length) {
  105. flag = 4;
  106. String title = new String(pageTitle.getData(), 0,
  107. pageTitle.getLength() - END_TITLE.length);
  108. meta.setPageTitle(title);
  109. pageTitle.reset();
  110. i = 0;
  111. }
  112. }
  113. else if (flag == 4) {
  114. if (b == START_NAMESPACE[i]) {
  115. i++;
  116. } else i = 0;
  117. if (i >= START_NAMESPACE.length) {
  118. flag = 5;
  119. i = 0;
  120. }
  121. }
  122. else if (flag == 5) {
  123. if (b == END_NAMESPACE[i]) {
  124. i++;
  125. } else i = 0;
  126. nsBuf.write(b);
  127. if (i >= END_NAMESPACE.length) {
  128. flag = 6;
  129. String nsStr = new String(nsBuf.getData(), 0,
  130. nsBuf.getLength() - END_NAMESPACE.length);
  131. int ns = Integer.parseInt(nsStr);
  132. nsBuf.reset();
  133. if (ns != 0) {
  134. if (skipNonArticles) {
  135. skipped = true;
  136. meta.clear();
  137. return Ack.SKIPPED;
  138. }
  139. }
  140. meta.setNamespace(ns);
  141. i = 0;
  142. }
  143. }
  144. else if (flag == 6) {
  145. if (b == START_ID[i]) {
  146. i++;
  147. } else i = 0;
  148. if (i >= START_ID.length) {
  149. flag = 7;
  150. i = 0;
  151. }
  152. }
  153. // put everything in outer <id></id> block into keyBuf
  154. else if (flag == 7) {
  155. if (b == END_ID[i]) {
  156. i++;
  157. } else i = 0;
  158. keyBuf.write(b);
  159. if (i >= END_ID.length) {
  160. flag = 8;
  161. String idStr = new String(keyBuf.getData(), 0,
  162. keyBuf.getLength() - END_ID.length);
  163. long pageId = Long.parseLong(idStr);
  164. meta.setPageId(pageId);
  165. i = 0;
  166. }
  167. }
  168. else if (flag == 8) {
  169. int curMatch = 0;
  170. if ((i < START_REVISION.length
  171. && b == START_REVISION[i])
  172. && (i < START_REDIRECT.length
  173. && b == START_REDIRECT[i])
  174. // subtle bug here: some tag names can overlap
  175. // multiple times
  176. && (revOrRedirect == 3
  177. || revOrRedirect == -1)){
  178. curMatch = 3;
  179. } else if (i < START_REVISION.length
  180. && b == START_REVISION[i]
  181. && revOrRedirect != 2) {
  182. curMatch = 1;
  183. } else if (i < START_REDIRECT.length
  184. && b == START_REDIRECT[i]
  185. && revOrRedirect != 1) {
  186. curMatch = 2;
  187. } else {
  188. curMatch = 0;
  189. }
  190. if (curMatch > 0 && (i == 0 || revOrRedirect == 3
  191. || curMatch == revOrRedirect)) {
  192. i++;
  193. revOrRedirect = curMatch;
  194. } else i = 0;
  195. if ((revOrRedirect == 2 || revOrRedirect == 3)
  196. && i >= START_REDIRECT.length) {
  197. if (skipRedirect) {
  198. skipped = true;
  199. meta.clear();
  200. return Ack.SKIPPED;
  201. }
  202. revOrRedirect = -1;
  203. flag = 9;
  204. i = 0;
  205. } else if ((revOrRedirect == 1 || revOrRedirect == 3)
  206. && i >= START_REVISION.length) {
  207. flag = 10;
  208. revOrRedirect = -1;
  209. return Ack.PASSED_TO_NEXT_TAG;
  210. }
  211. }
  212. else if (flag == 9 && !skipRedirect) {
  213. if (b == START_REVISION[i]) {
  214. i++;
  215. } else i = 0;
  216. if (i >= START_REVISION.length) {
  217. flag = 10;
  218. return Ack.PASSED_TO_NEXT_TAG;
  219. }
  220. }
  221. }
  222. }
  223. }
  224. }
  225. }