PageRenderTime 50ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/java/test/org/hedera/TestLocalWikiRevisionTextInputFormat.java

https://github.com/giangbinhtran/Hedera
Java | 236 lines | 188 code | 35 blank | 13 comment | 101 complexity | c88520ae7b1952ab8238e0805c23c7bd MD5 | raw file
  1. package org.hedera;
  2. import java.io.FileInputStream;
  3. import java.io.FileNotFoundException;
  4. import java.io.IOException;
  5. import java.nio.charset.StandardCharsets;
  6. import org.apache.hadoop.io.DataOutputBuffer;
  7. import org.apache.hadoop.io.LongWritable;
  8. import org.apache.hadoop.io.Text;
  9. import org.junit.After;
  10. import org.junit.Before;
  11. import org.junit.Test;
  12. /**
  13. * Simulate and test WikiRevisionTextInputFormat in local setting
  14. */
  15. public class TestLocalWikiRevisionTextInputFormat {
  16. public static final String START_PAGE_TAG = "<page>";
  17. public static final String END_PAGE_TAG = "</page>";
  18. public static final byte[] START_PAGE = START_PAGE_TAG.getBytes(StandardCharsets.UTF_8);
  19. public static final byte[] END_PAGE = END_PAGE_TAG.getBytes(StandardCharsets.UTF_8);
  20. public static final byte[] START_REVISION = "<revision>".getBytes(StandardCharsets.UTF_8);
  21. public static final byte[] END_REVISION = "</revision>".getBytes(StandardCharsets.UTF_8);
  22. public static final byte[] START_ID = "<id>".getBytes(StandardCharsets.UTF_8);
  23. public static final byte[] END_ID = "</id>".getBytes(StandardCharsets.UTF_8);
  24. public static final byte[] START_TITLE = "<title>".getBytes(StandardCharsets.UTF_8);
  25. public static final byte[] END_TITLE = "</title>".getBytes(StandardCharsets.UTF_8);
  26. public static final String START_TIMESTAMP_TAG = "<timestamp>";
  27. public static final String END_TIMESTAMP_TAG = "</timestamp>";
  28. public static final byte[] START_TIMESTAMP = START_TIMESTAMP_TAG.getBytes(StandardCharsets.UTF_8);
  29. public static final byte[] END_TIMESTAMP = END_TIMESTAMP_TAG.getBytes(StandardCharsets.UTF_8);
  30. protected static long DEFAULT_MAX_BLOCK_SIZE = 134217728l;
  31. protected static long THRESHOLD = 137438953472l;
  32. private static final String INPUT = "/home/tuan/Downloads/enwiki-latest-pages-meta-history6.xml-p000236948p000252388";
  33. private DataOutputBuffer pageHeader = new DataOutputBuffer();
  34. // We now convert and cache everything from pageHeader to the followin global variables
  35. // NOTE: they all need to be synchronized with pageHeader !!
  36. private DataOutputBuffer revBuf = new DataOutputBuffer();
  37. private DataOutputBuffer keyBuf = new DataOutputBuffer();
  38. private final LongWritable key = new LongWritable();
  39. private final Text value = new Text();
  40. private byte[] buf = new byte[134217728];
  41. private int[] pos = new int[2];
  42. FileInputStream fis;
  43. private int flag;
  44. private int lastMatchTag = -1;
  45. @Before
  46. public void initialize() throws FileNotFoundException {
  47. fis = new FileInputStream(INPUT);
  48. flag = 1;
  49. }
  50. public boolean nextKeyValue() throws NumberFormatException, IOException {
  51. while (readUntilMatch()) {
  52. if (flag == 7) {
  53. pageHeader.reset();
  54. value.clear();
  55. }
  56. else if (flag == 6) {
  57. value.set(pageHeader.getData(), 0, pageHeader.getLength()
  58. - START_REVISION.length);
  59. value.append(revBuf.getData(), 0, revBuf.getLength());
  60. value.append(END_PAGE, 0, END_PAGE.length);
  61. revBuf.reset();
  62. return true;
  63. }
  64. else if (flag == 4) {
  65. String pageId = new String(keyBuf.getData(), 0, keyBuf.getLength()
  66. - END_ID.length);
  67. key.set(Long.parseLong(pageId));
  68. keyBuf.reset();
  69. }
  70. else if (flag == 2) {
  71. pageHeader.write(START_PAGE);
  72. }
  73. else if (flag == 5) {
  74. revBuf.write(START_REVISION);
  75. }
  76. else if (flag == -1) {
  77. pageHeader.reset();
  78. }
  79. }
  80. return false;
  81. }
  82. // Scan the tags in SAX manner. Return at every legit tag and inform the program via the global flag
  83. // Flush into the caches if necessary
  84. private boolean readUntilMatch() throws IOException {
  85. if (buf == null && pos.length != 2)
  86. throw new IOException("Internal buffer corrupted.");
  87. int i = 0;
  88. while (true) {
  89. if (pos[0] == pos[1]) {
  90. pos[1] = fis.read(buf);
  91. pos[0] = 0;
  92. if (pos[1] == -1) {
  93. return false;
  94. }
  95. }
  96. while (pos[0] < pos[1]) {
  97. byte b = buf[pos[0]];
  98. pos[0]++;
  99. // ignore every character until reaching a new page
  100. if (flag == 1 || flag == 7) {
  101. if (b == START_PAGE[i]) {
  102. i++;
  103. if (i >= START_PAGE.length) {
  104. flag = 2;
  105. return true;
  106. }
  107. } else i = 0;
  108. }
  109. // put everything between <page> tag and the first <id> tag into pageHeader
  110. else if (flag == 2) {
  111. if (b == START_ID[i]) {
  112. i++;
  113. } else i = 0;
  114. pageHeader.write(b);
  115. if (i >= START_ID.length) {
  116. flag = 3;
  117. return true;
  118. }
  119. }
  120. // put everything in <id></id> block into pageHeader and keyBuf
  121. else if (flag == 3) {
  122. if (b == END_ID[i]) {
  123. i++;
  124. } else i = 0;
  125. pageHeader.write(b);
  126. keyBuf.write(b);
  127. if (i >= END_ID.length) {
  128. flag = 4;
  129. return true;
  130. }
  131. }
  132. // put everything between </id> tag and the first <revision> tag into pageHeader
  133. else if (flag == 4) {
  134. if (b == START_REVISION[i]) {
  135. i++;
  136. } else i = 0;
  137. pageHeader.write(b);
  138. if (i >= START_REVISION.length) {
  139. flag = 5;
  140. return true;
  141. }
  142. }
  143. // inside <revision></revision> block
  144. else if (flag == 5) {
  145. if (b == END_REVISION[i]) {
  146. i++;
  147. } else i = 0;
  148. revBuf.write(b);
  149. if (i >= END_REVISION.length) {
  150. flag = 6;
  151. return true;
  152. }
  153. }
  154. // Note that flag 4 can be the signal of a new record inside one old page
  155. else if (flag == 6) {
  156. int curMatch = 0;
  157. if ((i < END_PAGE.length && b == END_PAGE[i])
  158. && (i < START_REVISION.length && b == START_REVISION[i])) {
  159. curMatch = 3;
  160. } else if (i < END_PAGE.length && b == END_PAGE[i]) {
  161. curMatch = 2;
  162. } else if (i < START_REVISION.length && b == START_REVISION[i]) {
  163. curMatch = 1;
  164. }
  165. if (curMatch > 0 && (i == 0 || lastMatchTag == 3 || curMatch == lastMatchTag)) {
  166. i++;
  167. lastMatchTag = curMatch;
  168. } else i = 0;
  169. if ((lastMatchTag == 2 || lastMatchTag == 3) && i >= END_PAGE.length) {
  170. flag = 7;
  171. lastMatchTag = -1;
  172. return true;
  173. } else if ((lastMatchTag == 1 || lastMatchTag == 3) && i >= START_REVISION.length) {
  174. flag = 5;
  175. lastMatchTag = -1;
  176. return true;
  177. }
  178. }
  179. }
  180. }
  181. }
  182. public LongWritable currentKey() {
  183. return key;
  184. }
  185. public Text currentValue() {
  186. return value;
  187. }
  188. @After
  189. public void finish() throws IOException {
  190. fis.close();
  191. }
  192. @Test
  193. public void main() throws IOException {
  194. while (nextKeyValue()) {
  195. LongWritable k = currentKey();
  196. Text content = currentValue();
  197. System.out.println(k.get());
  198. System.out.println(content.toString());
  199. }
  200. }
  201. }