PageRenderTime 46ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/java/test/org/hedera/LocalWikiRevisionETLReader.java

https://github.com/giangbinhtran/Hedera
Java | 395 lines | 263 code | 61 blank | 71 comment | 107 complexity | a6903bf69c786b2b6afd05e9c1744604 MD5 | raw file
  1. package org.hedera;
  2. import java.io.FileInputStream;
  3. import java.io.IOException;
  4. import java.nio.charset.StandardCharsets;
  5. import org.apache.hadoop.io.DataOutputBuffer;
  6. import org.hedera.io.CloneableObject;
  7. import org.hedera.io.etl.ETLExtractor;
  8. /** The local variant of WikiRevisionETLReader for testing purposes */
  9. public abstract class LocalWikiRevisionETLReader<
  10. META extends CloneableObject<META>,KEYIN,VALUEIN> {
  11. public static final String START_PAGE_TAG = "<page>";
  12. public static final String END_PAGE_TAG = "</page>";
  13. public static final byte[] START_PAGE = START_PAGE_TAG.getBytes(StandardCharsets.UTF_8);
  14. public static final byte[] END_PAGE = END_PAGE_TAG.getBytes(StandardCharsets.UTF_8);
  15. public static final byte[] START_REVISION = "<revision>".getBytes(StandardCharsets.UTF_8);
  16. public static final byte[] END_REVISION = "</revision>".getBytes(StandardCharsets.UTF_8);
  17. public static final byte[] START_ID = "<id>".getBytes(StandardCharsets.UTF_8);
  18. public static final byte[] END_ID = "</id>".getBytes(StandardCharsets.UTF_8);
  19. public static final byte[] START_TITLE = "<title>".getBytes(StandardCharsets.UTF_8);
  20. public static final byte[] END_TITLE = "</title>".getBytes(StandardCharsets.UTF_8);
  21. public static final byte[] START_NAMESPACE = "<ns>".getBytes(StandardCharsets.UTF_8);
  22. public static final byte[] END_NAMESPACE = "</ns>".getBytes(StandardCharsets.UTF_8);
  23. public static final String START_TIMESTAMP_TAG = "<timestamp>";
  24. public static final String END_TIMESTAMP_TAG = "</timestamp>";
  25. public static final byte[] START_TIMESTAMP = START_TIMESTAMP_TAG.getBytes(StandardCharsets.UTF_8);
  26. public static final byte[] END_TIMESTAMP = END_TIMESTAMP_TAG.getBytes(StandardCharsets.UTF_8);
  27. public static final byte[] START_TEXT = "<text xml:space=\"preserve\">"
  28. .getBytes(StandardCharsets.UTF_8);
  29. public static final byte[] END_TEXT = "</text>".getBytes(StandardCharsets.UTF_8);
  30. private static final String INPUT = "files/testwiki.txt";
  31. private static final float DEFAULT_LOWER_THRESHOLD = 0.01f;
  32. private static final float DEFAULT_UPPER_THRESHOLD = 0.1f;
  33. private FileInputStream fis;
  34. public static enum Ack {
  35. PASSED_TO_NEXT_TAG,
  36. EOF,
  37. SKIPPED,
  38. FAILED
  39. }
  40. private KEYIN key;
  41. private VALUEIN value;
  42. // caches for the last established revision
  43. private META meta;
  44. private DataOutputBuffer prevBuf = new DataOutputBuffer();
  45. // cache for the currently visited revision
  46. private DataOutputBuffer curBuf = new DataOutputBuffer();
  47. private META curMeta;
  48. protected ETLExtractor<KEYIN, VALUEIN, META> extractor;
  49. // A flag that tells in which block the cursor is.
  50. // Generic setting:
  51. // -1: EOF
  52. // 1: Before the first page
  53. // 2: Inside the page, does not reach the end revision yet
  54. // 3: outside the page block
  55. // 4: The boundary case - The last and second last revisions are
  56. // both worth extracting for information
  57. private byte flag;
  58. // a direct buffer to improve the local IO performance
  59. private byte[] buf = new byte[134217728];
  60. private int[] pos = new int[2];
  61. protected abstract META initializeMeta();
  62. protected abstract ETLExtractor<KEYIN, VALUEIN, META> initializeExtractor();
  63. public KEYIN getCurrentKey() throws IOException, InterruptedException {
  64. return key;
  65. }
  66. protected abstract KEYIN initializeKey();
  67. protected abstract void freeKey(KEYIN key);
  68. public VALUEIN getCurrentValue() throws IOException, InterruptedException {
  69. return value;
  70. }
  71. protected abstract VALUEIN initializeValue();
  72. protected abstract void freeValue(VALUEIN value);
  73. /**
  74. * Each ETLReader must set the key, value Mapper input as well as specify
  75. * the extractor, and instantiate the meta object (curMeta)
  76. */
  77. public void initialize() throws IOException {
  78. fis = new FileInputStream(INPUT);
  79. flag = 1;
  80. pos[0] = pos[1] = 0;
  81. meta = null;
  82. initializeOutput();
  83. }
  84. private void initializeOutput() {
  85. key = initializeKey();
  86. value = initializeValue();
  87. curMeta = initializeMeta();
  88. extractor = initializeExtractor();
  89. }
  90. protected void updateRevision() throws IOException {
  91. if (meta == null) {
  92. meta = initializeMeta();
  93. }
  94. meta.clone(curMeta);
  95. prevBuf.reset();
  96. prevBuf.write(curBuf.getData(), 0, curBuf.getLength()
  97. - END_TEXT.length);
  98. curBuf.reset();
  99. }
  100. protected void clearRevisions() {
  101. meta = null;
  102. prevBuf.reset();
  103. curBuf.reset();
  104. freeKey(key);
  105. freeValue(value);
  106. }
  107. //
  108. // Tuan: This is one of the most error-prone, tedious code I've ever written :(
  109. //
  110. public boolean nextKeyValue() throws IOException, InterruptedException {
  111. while (flag != -1) {
  112. // the rare case: One last revision from last page still needs
  113. // to be processed
  114. if (flag == 4) {
  115. extractor.extract(prevBuf, meta, key, value);
  116. flag = 3;
  117. return true;
  118. }
  119. else if (flag == 1 || flag == 3) {
  120. while (hasNextPage()) {
  121. // before we start, let's clean all buffers
  122. clearRevisions();
  123. Ack r = readToPageHeader(curMeta);
  124. // debug hook
  125. System.out.println("Header: " + curMeta);
  126. if (r == Ack.EOF)
  127. return false;
  128. else if (r == Ack.FAILED)
  129. throw new IOException("error when reading the next "
  130. + "<revision>");
  131. // Next_Tag = Revision in this case
  132. else if (r == Ack.PASSED_TO_NEXT_TAG) {
  133. flag = 2;
  134. break;
  135. }
  136. else continue;
  137. }
  138. }
  139. if (flag == 2) {
  140. Ack r = readToNextRevision(curBuf, curMeta);
  141. if (r == Ack.EOF)
  142. return false;
  143. else if (r == Ack.FAILED)
  144. throw new IOException("error when reading the next "
  145. + "</revision");
  146. // We never have skipped inside the revision block
  147. else if (r == Ack.PASSED_TO_NEXT_TAG) {
  148. // The first revision always replace the previous (empty) one
  149. if (meta == null) {
  150. updateRevision();
  151. if (hasNextRevision()) {
  152. continue;
  153. }
  154. // the last revision, extract and stop
  155. else {
  156. flag = 3;
  157. freeKey(key);
  158. freeValue(value);
  159. extractor.extract(prevBuf,meta,key,value);
  160. return true;
  161. }
  162. }
  163. // heuristics:
  164. // - If the two revisions are too similar (< 0.01), throw away
  165. // the previous revision and get the new one and continue.
  166. // - If the two revisions are different enough (> 0.1), perform
  167. // the extraction on the previous revision, then throw it away
  168. // and get the new one and stop.
  169. else {
  170. float score = extractor.check(curMeta, meta);
  171. if (score < DEFAULT_LOWER_THRESHOLD) {
  172. updateRevision();
  173. if (hasNextRevision()) {
  174. continue;
  175. }
  176. // the last revision, extract and stop
  177. else {
  178. flag = 3;
  179. extractor.extract(prevBuf,meta,key,value);
  180. return true;
  181. }
  182. }
  183. else if (score > DEFAULT_UPPER_THRESHOLD) {
  184. if (meta != null) {
  185. freeKey(key);
  186. freeValue(value);
  187. extractor.extract(prevBuf,meta,key,value);
  188. // Tricky scenario: The very last revision just has
  189. // a big change.
  190. if (!hasNextRevision()) {
  191. // By turning a special flag value, we hope it will not
  192. // be forgotten the next read
  193. flag = 4;
  194. }
  195. updateRevision();
  196. return true;
  197. }
  198. // Boundary case: We have only one revision. Emit it right away and stop
  199. else if (!hasNextRevision()) {
  200. updateRevision();
  201. if (meta != null) {
  202. flag = 3;
  203. freeKey(key);
  204. freeValue(value);
  205. extractor.extract(prevBuf,meta,key,value);
  206. return true;
  207. }
  208. }
  209. // there are still more revisions to check, just shift the revision one
  210. // step ahead and continue
  211. else {
  212. updateRevision();
  213. }
  214. }
  215. }
  216. }
  217. else if (r == Ack.SKIPPED) {
  218. if (hasNextRevision()) {
  219. continue;
  220. }
  221. // the last revision, extract and stop
  222. else {
  223. flag = 3;
  224. // it might happen that you skipped all the revisions and so,
  225. // just move on when meta is null
  226. if (meta != null) {
  227. freeKey(key);
  228. freeValue(value);
  229. extractor.extract(prevBuf,meta,key,value);
  230. return true;
  231. }
  232. }
  233. }
  234. }
  235. }
  236. return false;
  237. }
  238. /**
  239. * Consume all the tags from page tag till the first revision tag. Cache
  240. * the values to meta data if needed
  241. * @return true when reaching revision, false when EOF
  242. */
  243. protected abstract Ack readToPageHeader(META meta) throws IOException;
  244. /**
  245. * This method reads bytes inside the input stream into the buffer
  246. * until reaching EOF or the revision close tag. In case of success,
  247. * it extracts the meta-data into the meta form
  248. */
  249. protected abstract Ack readToNextRevision(DataOutputBuffer buffer, META meta)
  250. throws IOException;
  251. /**
  252. * Outside the <page> block, check if next <page> tag comes
  253. * @return true if next page has been found,
  254. * false if the EOF has been found
  255. * @throws IOException
  256. */
  257. private boolean hasNextPage() throws IOException {
  258. int i = 0;
  259. while (true) {
  260. if (!fetchMore()) return false;
  261. while (hasData()) {
  262. byte b = nextByte();
  263. if (b == START_PAGE[i]) {
  264. i++;
  265. if (i >= START_PAGE.length) {
  266. return true;
  267. }
  268. } else i = 0;
  269. }
  270. }
  271. }
  272. /**
  273. * Outside the revision block, check for next revision tag of the
  274. * page. Return true if next revision found, false if EOF or closing of page
  275. * found
  276. * @throws IOException
  277. */
  278. private boolean hasNextRevision() throws IOException {
  279. int i = 0;
  280. int revOrPage = -1;
  281. while (true) {
  282. if (!fetchMore()) return false;
  283. while (hasData()) {
  284. byte b = nextByte();
  285. int curMatch = 0;
  286. if ((i < END_PAGE.length && b == END_PAGE[i])
  287. && (i < START_REVISION.length && b == START_REVISION[i])) {
  288. curMatch = 3;
  289. } else if (i < END_PAGE.length && b == END_PAGE[i]) {
  290. curMatch = 2;
  291. } else if (i < START_REVISION.length && b == START_REVISION[i]) {
  292. curMatch = 1;
  293. }
  294. if (curMatch > 0 && (i == 0 || revOrPage == 3 || curMatch == revOrPage)) {
  295. i++;
  296. revOrPage = curMatch;
  297. } else i = 0;
  298. if ((revOrPage == 2 || revOrPage == 3) && i >= END_PAGE.length) {
  299. return false;
  300. } else if ((revOrPage == 1 || revOrPage == 3) && i >= START_REVISION.length) {
  301. return true;
  302. }
  303. }
  304. }
  305. }
  306. /** Read the stream and update the internal buffer if necessary. Always return
  307. * true except when reaching EOF
  308. * @throws IOException */
  309. protected final boolean fetchMore() throws IOException {
  310. if (buf == null && pos.length != 2)
  311. throw new IOException("Internal buffer corrupted.");
  312. if (pos[0] == pos[1]) {
  313. pos[1] = fis.read(buf);
  314. pos[0] = 0;
  315. if (pos[1] == -1) {
  316. flag = -1;
  317. return false;
  318. }
  319. } return true;
  320. }
  321. /** Check whether there are still data to read */
  322. protected boolean hasData() {
  323. return (pos[0] < pos[1]);
  324. }
  325. /** Get the next byte in the stream and move the cursor forward */
  326. protected byte nextByte() {
  327. byte b = buf[pos[0]];
  328. pos[0]++;
  329. return b;
  330. }
  331. public void close() throws IOException {
  332. fis.close();
  333. }
  334. }