PageRenderTime 50ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 1ms

/java/main/org/hedera/io/etl/RevisionETLReader.java

https://github.com/giangbinhtran/Hedera
Java | 553 lines | 374 code | 81 blank | 98 comment | 141 complexity | b6a123a8d5cafe124b0769bcebc811a4 MD5 | raw file
  1. package org.hedera.io.etl;
  2. import java.io.IOException;
  3. import java.io.InputStream;
  4. import org.apache.hadoop.conf.Configuration;
  5. import org.apache.hadoop.fs.FSDataInputStream;
  6. import org.apache.hadoop.fs.FileSystem;
  7. import org.apache.hadoop.fs.Path;
  8. import org.apache.hadoop.fs.Seekable;
  9. import org.apache.hadoop.io.DataOutputBuffer;
  10. import org.apache.hadoop.io.compress.CompressionCodec;
  11. import org.apache.hadoop.io.compress.CompressionCodecFactory;
  12. import org.apache.hadoop.io.compress.CompressionInputStream;
  13. import org.apache.hadoop.mapreduce.InputSplit;
  14. import org.apache.hadoop.mapreduce.RecordReader;
  15. import org.apache.hadoop.mapreduce.TaskAttemptContext;
  16. import org.apache.hadoop.mapreduce.lib.input.FileSplit;
  17. import org.apache.log4j.Logger;
  18. import org.hedera.io.CloneableObject;
  19. import com.twitter.elephantbird.util.TaskHeartbeatThread;
  20. import static org.hedera.io.input.WikiRevisionInputFormat.START_PAGE;
  21. import static org.hedera.io.input.WikiRevisionInputFormat.END_PAGE;
  22. import static org.hedera.io.input.WikiRevisionInputFormat.START_REVISION;
  23. import static org.hedera.io.input.WikiRevisionInputFormat.END_TEXT;
  24. public abstract class RevisionETLReader<KEYIN, VALUEIN,
  25. META extends CloneableObject<META>> extends RecordReader<KEYIN, VALUEIN> {
  26. private static final Logger LOG = Logger.getLogger(RevisionETLReader.class);
  27. protected static long DEFAULT_MAX_BLOCK_SIZE = 134217728l;
  28. private static final float DEFAULT_LOWER_THRESHOLD = 0.01f;
  29. private static final float DEFAULT_UPPER_THRESHOLD = 0.1f;
  30. // add a few break after five iterations to give other jobs in the cluster chances
  31. // to get executed
  32. private int threadCnt;
  33. // threshold for checking the revision seriously
  34. private static final long GOOD_ENOUGH_REVISION = 10;
  /**
   * The acknowledgement signal returned by the internal consuming methods.
   * There are four states that can be returned:
   * <ul>
   * <li>PASSED_TO_NEXT_TAG: the consumer succeeded and has now passed the
   * next tag</li>
   * <li>EOF: the consumer did not encounter the desired tag and reached the
   * end of the file</li>
   * <li>SKIPPED: the consumer did not reach the desired tag, but it skips
   * to the end of the page</li>
   * <li>FAILED: the consumer failed due to internal errors</li>
   * </ul>
   */
  public enum Ack {
    PASSED_TO_NEXT_TAG,
    EOF,
    SKIPPED,
    FAILED
  }
  50. // mark the cursor of the input stream
  51. private long start;
  52. private long end;
  53. // A flag that tells in which block the cursor is.
  54. // Generic setting:
  55. // -1: EOF
  56. // 1: Before the first page
  57. // 2: Inside the page, does not reach the end revision yet
  58. // 3: outside the page block
  59. // 4: The boundary case - The last and second last revisions are
  60. // both worth extracting for information
  61. private byte flag;
  62. // compression mode checking
  63. private boolean compressed = false;
  64. // a direct buffer to improve the local IO performance
  65. private byte[] buf = new byte[134217728];
  66. private int[] pos = new int[2];
  67. private Seekable fsin;
  68. private KEYIN key;
  69. private VALUEIN value;
  70. // caches for the last established revision
  71. private META meta;
  72. private DataOutputBuffer prevBuf = new DataOutputBuffer();
  73. // cache for the currently visited revision
  74. private DataOutputBuffer curBuf = new DataOutputBuffer();
  75. private META curMeta;
  76. protected ETLExtractor<KEYIN, VALUEIN, META> extractor;
  77. protected abstract META initializeMeta();
  78. protected abstract ETLExtractor<KEYIN, VALUEIN, META> initializeExtractor();
  79. private TaskAttemptContext context;
  80. @Override
  81. public KEYIN getCurrentKey() throws IOException, InterruptedException {
  82. return key;
  83. }
  84. protected abstract KEYIN initializeKey();
  85. protected abstract void freeKey(KEYIN key);
  86. @Override
  87. public VALUEIN getCurrentValue() throws IOException, InterruptedException {
  88. return value;
  89. }
  90. protected abstract VALUEIN initializeValue();
  91. protected abstract void freeValue(VALUEIN value);
  92. @Override
  93. public float getProgress() throws IOException, InterruptedException {
  94. return (fsin.getPos() - start) / (float) (end - start);
  95. }
  96. protected TaskAttemptContext getTaskAttemptContext() {
  97. return context;
  98. }
  99. @Override
  100. /**
  101. * Each ETLReader must set the key, value Mapper input as well as specify
  102. * the extractor, and instantiate the meta object (curMeta)
  103. */
  104. public void initialize(InputSplit input, TaskAttemptContext tac)
  105. throws IOException, InterruptedException {
  106. Configuration conf = tac.getConfiguration();
  107. setBlockSize(conf);
  108. FileSplit split = (FileSplit) input;
  109. start = split.getStart();
  110. end = start + split.getLength();
  111. Path file = split.getPath();
  112. CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
  113. CompressionCodec codec = compressionCodecs.getCodec(file);
  114. FileSystem fs = file.getFileSystem(conf);
  115. if (codec != null) { // file is compressed
  116. compressed = true;
  117. // fsin = new FSDataInputStream(codec.createInputStream(fs.open(file)));
  118. CompressionInputStream cis = codec.createInputStream(fs.open(file));
  119. cis.skip(start - 1);
  120. fsin = cis;
  121. } else { // file is uncompressed
  122. compressed = false;
  123. fsin = fs.open(file);
  124. fsin.seek(start);
  125. }
  126. flag = 1;
  127. threadCnt = 0;
  128. pos[0] = pos[1] = 0;
  129. meta = null;
  130. this.context = tac;
  131. initializeObjects();
  132. }
  133. private void initializeObjects() {
  134. key = initializeKey();
  135. value = initializeValue();
  136. curMeta = initializeMeta();
  137. extractor = initializeExtractor();
  138. }
  139. protected static void setBlockSize(Configuration conf) {
  140. conf.setLong("mapreduce.input.fileinputformat.split.maxsize",
  141. DEFAULT_MAX_BLOCK_SIZE);
  142. }
  143. protected void updateRevision() throws IOException {
  144. if (meta == null) {
  145. meta = initializeMeta();
  146. }
  147. meta.clone(curMeta);
  148. prevBuf.reset();
  149. // some ETL Reader dont read the content at all !!
  150. if (curBuf.getLength() > 0) {
  151. prevBuf.write(curBuf.getData(), 0, curBuf.getLength());
  152. curBuf.reset();
  153. }
  154. }
  155. protected void clearRevisions() {
  156. meta = null;
  157. prevBuf.reset();
  158. curBuf.reset();
  159. freeKey(key);
  160. freeValue(value);
  161. }
  162. @Override
  163. //
  164. // Tuan: This is one of the most error-prone, tedious code I've ever written :(
  165. // I feel like I have to write the documentation for this method somewhere. Otherwise
  166. // I will lose understanding it next few months
  167. //
  168. public boolean nextKeyValue() throws IOException, InterruptedException {
  169. while (fsin.getPos() < end) {
  170. // take a break for other jobs running in the cluster
  171. threadCnt++;
  172. if (threadCnt % 10 == 0) {
  173. Thread.sleep(500);
  174. }
  175. if (flag == -1) {
  176. return false;
  177. }
  178. // the rare case: One last revision from last page still needs
  179. // to be processed
  180. if (flag == 4) {
  181. if (meta != null) {
  182. freeKey(key);
  183. freeValue(value);
  184. boolean res = extractor.extract(prevBuf, meta, key, value);
  185. flag = 3;
  186. if (!res) {
  187. throw new RuntimeException("This should not happen: "
  188. + " error in offset " + fsin.getPos());
  189. }
  190. return true;
  191. }
  192. // this should never happen !!
  193. else throw new RuntimeException("This should not happen: "
  194. + " error in offset " + fsin.getPos());
  195. }
  196. else if (flag == 1 || flag == 3) {
  197. while (hasNextPage()) {
  198. // before we start, let's clean all buffers
  199. clearRevisions();
  200. Ack r = readToPageHeader(curMeta);
  201. if (r == Ack.EOF)
  202. return false;
  203. else if (r == Ack.FAILED)
  204. throw new IOException("error when reading the next "
  205. + "<revision>");
  206. // Next_Tag = Revision in this case
  207. else if (r == Ack.PASSED_TO_NEXT_TAG) {
  208. flag = 2;
  209. break;
  210. }
  211. else continue;
  212. }
  213. }
  214. if (flag == 2) {
  215. Ack r = readToNextRevision(curBuf, curMeta);
  216. if (r == Ack.EOF)
  217. return false;
  218. else if (r == Ack.FAILED)
  219. throw new IOException("error when reading the next "
  220. + "</revision");
  221. else if (r == Ack.PASSED_TO_NEXT_TAG) {
  222. // if the current revision is too small, just skip it
  223. if (curBuf.getLength() < GOOD_ENOUGH_REVISION) {
  224. if (hasNextRevision()) {
  225. continue;
  226. }
  227. // the last revision, extract and stop
  228. else {
  229. flag = 3;
  230. if (meta != null) {
  231. freeKey(key);
  232. freeValue(value);
  233. boolean res = extractor.extract(prevBuf,meta,key,value);
  234. // every revsion that is checked is empty
  235. if (!res) {
  236. continue;
  237. }
  238. else return true;
  239. }
  240. }
  241. }
  242. // The first revision always replace the previous (empty) one
  243. if (meta == null) {
  244. updateRevision();
  245. if (hasNextRevision()) {
  246. continue;
  247. }
  248. // the last revision, extract and stop
  249. else {
  250. flag = 3;
  251. if (meta != null) {
  252. freeKey(key);
  253. freeValue(value);
  254. boolean res = extractor.extract(prevBuf,meta,key,value);
  255. // every revsion that is checked is empty
  256. if (!res) {
  257. continue;
  258. }
  259. else return true;
  260. }
  261. }
  262. }
  263. // heuristics:
  264. // - If the two revisions are too similar (< 0.01), throw
  265. // away the previous revision and get the new one and
  266. // continue.
  267. // - If the two revisions are different enough (> 0.1),
  268. // perform the extraction on the previous revision, making
  269. // sure the value is clean when seeing the revisions
  270. // independently. Then throw it away, get the new rev and
  271. // stop.
  272. else {
  273. float score = extractor.check(curMeta, meta);
  274. if (score < DEFAULT_LOWER_THRESHOLD) {
  275. updateRevision();
  276. if (hasNextRevision()) {
  277. continue;
  278. }
  279. // the last revision, extract and stop
  280. else {
  281. flag = 3;
  282. if (meta != null) {
  283. freeKey(key);
  284. freeValue(value);
  285. boolean res = extractor.extract(prevBuf,meta,key,value);
  286. // every revision that is checked is empty --> skip this page
  287. if (!res) {
  288. continue;
  289. }
  290. else return true;
  291. }
  292. }
  293. }
  294. else if (score > DEFAULT_UPPER_THRESHOLD) {
  295. if (meta != null) {
  296. freeKey(key);
  297. freeValue(value);
  298. boolean res = extractor.extract(prevBuf,meta,key,value);
  299. // Tricky scenario: The very last revision just has
  300. // a big change.
  301. if (!hasNextRevision()) {
  302. // By turning a special flag value, we hope it will not
  303. // be forgotten the next read
  304. flag = 4;
  305. }
  306. updateRevision();
  307. if (res)
  308. return true;
  309. else continue;
  310. }
  311. // Boundary case: We have only one revision. Emit it right away and stop
  312. else if (!hasNextRevision()) {
  313. updateRevision();
  314. if (meta != null) {
  315. flag = 3;
  316. freeKey(key);
  317. freeValue(value);
  318. boolean res = extractor.extract(prevBuf,meta,key,value);
  319. if (res)
  320. return true;
  321. else throw new RuntimeException("No way! String to " +
  322. "inspect: " + new String(prevBuf.getData(), "UTF-8"));
  323. }
  324. }
  325. // there are still more revisions to check, just shift the revision one
  326. // step ahead and continue
  327. else {
  328. updateRevision();
  329. }
  330. }
  331. }
  332. }
  333. else if (r == Ack.SKIPPED) {
  334. if (hasNextRevision()) {
  335. continue;
  336. }
  337. // the last revision, extract and stop
  338. else {
  339. flag = 3;
  340. // it might happen that you skipped all the revisions,
  341. // so just move on when meta is null
  342. if (meta != null) {
  343. freeKey(key);
  344. freeValue(value);
  345. boolean res = extractor.extract(prevBuf,meta,key,value);
  346. if (res)
  347. return true;
  348. else continue;
  349. }
  350. }
  351. }
  352. }
  353. }
  354. return false;
  355. }
  /**
   * Consume all the tags from the page tag till the first revision tag. Cache
   * the values to meta data if needed.
   *
   * The result is interpreted by nextKeyValue() as follows: on
   * PASSED_TO_NEXT_TAG the first revision of the page is read next, on EOF
   * the reading stops, on FAILED an IOException is thrown, and on any other
   * value the reader moves on to the next page.
   *
   * @param meta the meta-data object of the currently visited revision
   * @return the acknowledgement describing how the consumption ended
   * @throws IOException if reading the stream fails
   */
  protected abstract Ack readToPageHeader(META meta) throws IOException;
  /**
   * This method reads bytes inside the input stream into the buffer
   * until reaching EOF or the revision close tag. In case of success,
   * it extracts the meta-data into the meta form.
   *
   * The result is interpreted by nextKeyValue() as follows: on
   * PASSED_TO_NEXT_TAG the revision is compared with the cached one, on
   * SKIPPED the revision is ignored, on EOF the reading stops and on FAILED
   * an IOException is thrown.
   *
   * @param buffer receives the content of the revision
   * @param meta receives the meta-data of the revision
   * @return the acknowledgement describing how the consumption ended
   * @throws IOException if reading the stream fails
   */
  protected abstract Ack readToNextRevision(DataOutputBuffer buffer, META meta)
      throws IOException;
  369. /**
  370. * Outside the <page> block, check if next <page> tag comes
  371. * @return true if next page has been found,
  372. * false if the EOF has been found
  373. * @throws IOException
  374. */
  375. private boolean hasNextPage() throws IOException {
  376. int i = 0;
  377. while (true) {
  378. if (!fetchMore()) return false;
  379. while (hasData()) {
  380. byte b = nextByte();
  381. if (b == START_PAGE[i]) {
  382. i++;
  383. if (i >= START_PAGE.length) {
  384. return true;
  385. }
  386. } else i = 0;
  387. }
  388. }
  389. }
  390. /**
  391. * Outside the revision block, check for next revision tag of the
  392. * page. Return true if next revision found, false if EOF or closing of page
  393. * found
  394. * @throws IOException
  395. */
  396. private boolean hasNextRevision() throws IOException {
  397. int i = 0;
  398. int revOrPage = -1;
  399. while (true) {
  400. if (!fetchMore()) return false;
  401. while (hasData()) {
  402. byte b = nextByte();
  403. int curMatch = 0;
  404. if ((i < END_PAGE.length && b == END_PAGE[i])
  405. && (i < START_REVISION.length && b == START_REVISION[i])) {
  406. curMatch = 3;
  407. } else if (i < END_PAGE.length && b == END_PAGE[i]) {
  408. curMatch = 2;
  409. } else if (i < START_REVISION.length && b == START_REVISION[i]) {
  410. curMatch = 1;
  411. }
  412. if (curMatch > 0 && (i == 0 || revOrPage == 3 || curMatch == revOrPage)) {
  413. i++;
  414. revOrPage = curMatch;
  415. } else i = 0;
  416. if ((revOrPage == 2 || revOrPage == 3) && i >= END_PAGE.length) {
  417. return false;
  418. } else if ((revOrPage == 1 || revOrPage == 3) && i >= START_REVISION.length) {
  419. return true;
  420. }
  421. }
  422. }
  423. }
  424. /** Read the stream and update the internal buffer if necessary. Always return
  425. * true except when reaching EOF
  426. * @throws IOException */
  427. protected final boolean fetchMore() throws IOException {
  428. if (buf == null && pos.length != 2)
  429. throw new IOException("Internal buffer corrupted.");
  430. if (pos[0] == pos[1]) {
  431. // We use a thread that pings back to the cluster every 5 minutes
  432. // to avoid getting killed for slow read
  433. TaskHeartbeatThread heartbeat = new TaskHeartbeatThread(context, 60 * 5000) {
  434. @Override
  435. protected void progress() {
  436. LOG.info("Task " + context.getTaskAttemptID()
  437. + " pings back...");
  438. }
  439. };
  440. try {
  441. heartbeat.start();
  442. pos[1] = (compressed) ? ((InputStream)fsin).read(buf) :
  443. ((FSDataInputStream)fsin).read(buf);
  444. pos[0] = 0;
  445. } finally {
  446. heartbeat.stop();
  447. }
  448. if (pos[1] == -1) {
  449. flag = -1;
  450. return false;
  451. }
  452. } return true;
  453. }
  454. /** Check whether there are still data to read */
  455. protected boolean hasData() {
  456. return (pos[0] < pos[1]);
  457. }
  458. /** Get the next byte in the stream and move the cursor forward */
  459. protected byte nextByte() {
  460. byte b = buf[pos[0]];
  461. pos[0]++;
  462. return b;
  463. }
  464. @Override
  465. public void close() throws IOException {
  466. if (compressed) {
  467. ((CompressionInputStream)fsin).close();
  468. } else {
  469. ((FSDataInputStream)fsin).close();
  470. }
  471. }
  472. }