PageRenderTime 47ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/java/test/org/wikimedia/pig/load/TestWikipediaRevisionPairLoader.java

https://github.com/giangbinhtran/Hedera
Java | 174 lines | 138 code | 18 blank | 18 comment | 79 complexity | 6db1a8686c9b61f1739b6cbe236e5f90 MD5 | raw file
  1. package org.wikimedia.pig.load;
  2. import static org.junit.Assert.*;
  3. import java.io.ByteArrayOutputStream;
  4. import java.io.FileInputStream;
  5. import java.io.IOException;
  6. import java.nio.charset.StandardCharsets;
  7. import org.junit.After;
  8. import org.junit.Before;
  9. import org.junit.Test;
  10. public class TestWikipediaRevisionPairLoader {
  11. private static final String START_PAGE_TAG = "<page>";
  12. private static final String END_PAGE_TAG = "</page>";
  13. private static final byte[] START_PAGE = START_PAGE_TAG.getBytes(StandardCharsets.UTF_8);
  14. private static final byte[] END_PAGE = END_PAGE_TAG.getBytes(StandardCharsets.UTF_8);
  15. private static final byte[] START_REVISION = "<revision>".getBytes(StandardCharsets.UTF_8);
  16. private static final byte[] END_REVISION = "</revision>".getBytes(StandardCharsets.UTF_8);
  17. private static final byte[] DUMMY_REV = ("<revision beginningofpage=\"true\">"
  18. + "<text xml:space=\"preserve\"></text></revision>\n")
  19. .getBytes(StandardCharsets.UTF_8);
  20. @Before
  21. public void setUp() throws Exception {
  22. }
  23. @After
  24. public void tearDown() throws Exception {
  25. }
  26. private final StringBuffer value = new StringBuffer();
  27. // @Test
  28. public void testParser() {
  29. ByteArrayOutputStream value = new ByteArrayOutputStream();
  30. try (FileInputStream fis = new FileInputStream("files/testwiki.txt")) {
  31. flag = 1;
  32. revisionVisited = 0;
  33. while (readUntilMatch(fis)) {
  34. if (flag == 5) {
  35. System.out.println(value);
  36. value.reset();
  37. pageHeader.reset();
  38. rev1Buf.reset();
  39. rev2Buf.reset();
  40. revisionVisited = 0;
  41. }
  42. else if (flag == 4) {
  43. value.write(pageHeader.toByteArray(), 0, pageHeader.size() - 10);
  44. value.write(rev1Buf.toByteArray());
  45. value.write(rev2Buf.toByteArray());
  46. value.write(END_PAGE, 0, END_PAGE.length);
  47. }
  48. else if (flag == 2) {
  49. pageHeader.write(START_PAGE);
  50. }
  51. else if (flag == 3) {
  52. rev1Buf.reset();
  53. if (revisionVisited == 0) {
  54. rev1Buf.write(DUMMY_REV);
  55. } else {
  56. rev1Buf.write(rev2Buf.toByteArray());
  57. }
  58. rev2Buf.reset();
  59. rev2Buf.write(START_REVISION);
  60. }
  61. else if (flag == -1) {
  62. pageHeader.reset();
  63. }
  64. }
  65. } catch (IOException e) {
  66. e.printStackTrace();
  67. }
  68. }
  69. // A flag that tells in which block the cursor is:
  70. // -1: EOF
  71. // 1 - outside the <page> tag
  72. // 2 - just passed the <page> tag but outside the <revision>
  73. // 3 - just passed the (next) <revision>
  74. // 4 - just passed the </revision>
  75. // 5 - just passed the </page>
  76. private byte flag;
  77. // indicating how many <revision> tags have been met, reset after every record
  78. private int revisionVisited;
  79. // indicating the flow conditifion within [flag = 4]
  80. // -1 - Unmatched
  81. // 1 - Matched <revision> tag partially
  82. // 2 - Matched </page> tag partially
  83. // 3 - Matched both <revision> and </page> partially
  84. private int lastMatchTag = -1;
  85. private ByteArrayOutputStream pageHeader = new ByteArrayOutputStream();
  86. private ByteArrayOutputStream rev1Buf = new ByteArrayOutputStream();
  87. private ByteArrayOutputStream rev2Buf = new ByteArrayOutputStream();
  88. private boolean readUntilMatch(FileInputStream fsin) throws IOException {
  89. int i = 0;
  90. while (true) {
  91. int b = fsin.read();
  92. if (b == -1) {
  93. flag = -1;
  94. return false;
  95. }
  96. // ignore every character until reaching a new page
  97. if (flag == 1 || flag == 5) {
  98. if (b == START_PAGE[i]) {
  99. i++;
  100. if (i >= START_PAGE.length) {
  101. flag = 2;
  102. return true;
  103. }
  104. } else i = 0;
  105. }
  106. // put everything between <page> tag and the first <revision> tag into pageHeader
  107. else if (flag == 2) {
  108. if (b == START_REVISION[i]) {
  109. i++;
  110. } else i = 0;
  111. pageHeader.write(b);
  112. if (i >= START_REVISION.length) {
  113. flag = 3;
  114. return true;
  115. }
  116. }
  117. // inside <revision></revision> block
  118. else if (flag == 3) {
  119. if (b == END_REVISION[i]) {
  120. i++;
  121. } else i = 0;
  122. rev2Buf.write(b);
  123. if (i >= END_REVISION.length) {
  124. flag = 4;
  125. revisionVisited++;
  126. return true;
  127. }
  128. }
  129. // Note that flag 4 can be the signal of a new record inside one old page
  130. else if (flag == 4) {
  131. int curMatch = 0;
  132. if ((i < END_PAGE.length && b == END_PAGE[i])
  133. && (i < START_REVISION.length && b == START_REVISION[i])) {
  134. curMatch = 3;
  135. } else if (i < END_PAGE.length && b == END_PAGE[i]) {
  136. curMatch = 2;
  137. } else if (i < START_REVISION.length && b == START_REVISION[i]) {
  138. curMatch = 1;
  139. }
  140. if (curMatch > 0 && (i == 0 || lastMatchTag == 3 || curMatch == lastMatchTag)) {
  141. i++;
  142. lastMatchTag = curMatch;
  143. } else i = 0;
  144. if ((lastMatchTag == 2 || lastMatchTag == 3) && i >= END_PAGE.length) {
  145. flag = 5;
  146. lastMatchTag = -1;
  147. return true;
  148. } else if ((lastMatchTag == 1 || lastMatchTag == 3) && i >= START_REVISION.length) {
  149. flag = 3;
  150. lastMatchTag = -1;
  151. return true;
  152. }
  153. }
  154. }
  155. }
  156. }