PageRenderTime 41ms CodeModel.GetById 13ms RepoModel.GetById 1ms app.codeStats 0ms

/java/main/org/hedera/io/etl/RevisionLinkInputFormat.java

https://github.com/giangbinhtran/Hedera
Java | 350 lines | 285 code | 31 blank | 34 comment | 155 complexity | f7d43665bbbb70d98587bc0c5160a84f MD5 | raw file
  1. package org.hedera.io.etl;
  2. import java.io.IOException;
  3. import java.nio.charset.StandardCharsets;
  4. import org.apache.hadoop.conf.Configuration;
  5. import org.apache.hadoop.fs.Path;
  6. import org.apache.hadoop.io.DataOutputBuffer;
  7. import org.apache.hadoop.io.LongWritable;
  8. import org.apache.hadoop.io.compress.CompressionCodec;
  9. import org.apache.hadoop.io.compress.SplittableCompressionCodec;
  10. import org.apache.hadoop.mapreduce.InputSplit;
  11. import org.apache.hadoop.mapreduce.JobContext;
  12. import org.apache.hadoop.mapreduce.RecordReader;
  13. import org.apache.hadoop.mapreduce.TaskAttemptContext;
  14. import org.apache.log4j.Logger;
  15. import org.hedera.io.LinkProfile;
  16. import org.hedera.io.RevisionHeader;
  17. import org.hedera.io.LinkProfile.Link;
  18. import org.hedera.io.input.WikiRevisionInputFormat;
  19. /**
  20. * An input format that supports ETL reading and extracts link structures from
  21. * each revision on the fly.
  22. */
  23. public class RevisionLinkInputFormat extends
  24. WikiRevisionInputFormat<LongWritable, LinkProfile> {
  25. @Override
  26. public RecordReader<LongWritable, LinkProfile> createRecordReader(
  27. InputSplit input, TaskAttemptContext context)
  28. throws IOException, InterruptedException {
  29. return new RevisionLinkReader();
  30. }
  31. public static class RevisionLinkReader
  32. extends DefaultRevisionETLReader<LongWritable,
  33. LinkProfile> {
  34. @Override
  35. protected LongWritable initializeKey() {
  36. return new LongWritable();
  37. }
  38. @Override
  39. protected void freeKey(LongWritable key) {
  40. }
  41. @Override
  42. protected void freeValue(LinkProfile value) {
  43. value.clear();
  44. }
  45. @Override
  46. protected LinkProfile initializeValue() {
  47. return new LinkProfile();
  48. }
  49. @Override
  50. protected ETLExtractor<LongWritable, LinkProfile,
  51. RevisionHeader> initializeExtractor() {
  52. return new LinkExtractor();
  53. }
  54. @Override
  55. // -1: EOF
  56. // 9 - default
  57. // 10 - just passed the inner <id> tag inside <revision>
  58. // 11 - just passed the inner </id> tag inside <revision>
  59. // 12 - just passed the <timestamp>
  60. // 13 - just passed the </timestamp> tag
  61. // 14 - just passed the <parentId>
  62. // 15 - just passed the </parentId> tag
  63. // 16 - just passed the <minor/> (or not)
  64. // 17 - just passed the <text> tag
  65. // 18 - just passed the </text> tag
  66. // 19 - just passed the </revision>
  67. protected Ack readToNextRevision(DataOutputBuffer buffer,
  68. RevisionHeader meta) throws IOException {
  69. int i = 0;
  70. int flag = 9;
  71. int parOrTs = -1;
  72. int minorOrText = -1;
  73. try (DataOutputBuffer revIdBuf = new DataOutputBuffer();
  74. DataOutputBuffer timestampBuf = new DataOutputBuffer();
  75. DataOutputBuffer parBuf = new DataOutputBuffer()) {
  76. while (true) {
  77. if (!fetchMore()) return Ack.EOF;
  78. while (hasData()) {
  79. byte b = nextByte();
  80. if (flag == 9) {
  81. if (b == START_ID[i]) {
  82. i++;
  83. } else i = 0;
  84. if (i >= START_ID.length) {
  85. flag = 10;
  86. i = 0;
  87. }
  88. }
  89. // everything inside the inner <id></id>
  90. // block goes to revision buffer
  91. else if (flag == 10) {
  92. if (b == END_ID[i]) {
  93. i++;
  94. } else i = 0;
  95. revIdBuf.write(b);
  96. if (i >= END_ID.length) {
  97. flag = 11;
  98. String idStr = new String(revIdBuf.getData(), 0,
  99. revIdBuf.getLength() - END_ID.length);
  100. long revId = Long.parseLong(idStr);
  101. meta.setRevisionId(revId);
  102. revIdBuf.reset();
  103. i = 0;
  104. }
  105. }
  106. // after the inner <id>, check for either <timestamp> or <parentId>
  107. else if (flag == 11) {
  108. int curMatch = 0;
  109. if ((i < START_PARENT_ID.length && b == START_PARENT_ID[i])
  110. && (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i])) {
  111. curMatch = 3;
  112. } else if (i < START_PARENT_ID.length && b == START_PARENT_ID[i]) {
  113. curMatch = 1;
  114. } else if (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i]) {
  115. curMatch = 2;
  116. }
  117. if (curMatch > 0 && (i == 0 || parOrTs == 3 || curMatch == parOrTs)) {
  118. i++;
  119. parOrTs = curMatch;
  120. } else i = 0;
  121. if ((parOrTs == 2 || parOrTs == 3) && i >= START_TIMESTAMP.length) {
  122. flag = 12;
  123. parOrTs = -1;
  124. i = 0;
  125. } else if ((parOrTs == 1 || parOrTs == 3) && i >= START_PARENT_ID.length) {
  126. flag = 14;
  127. parOrTs = -1;
  128. i = 0;
  129. }
  130. }
  131. // inside <timestamp></timestamp> block everything goes to timestamp buffer
  132. else if (flag == 12) {
  133. if (b == END_TIMESTAMP[i]) {
  134. i++;
  135. } else i = 0;
  136. timestampBuf.write(b);
  137. if (i >= END_TIMESTAMP.length) {
  138. flag = 13;
  139. String ts = new String(timestampBuf.getData(), 0,
  140. timestampBuf.getLength()
  141. - END_TIMESTAMP.length);
  142. long timestamp = TIME_FORMAT.parseMillis(ts);
  143. meta.setTimestamp(timestamp);
  144. timestampBuf.reset();
  145. i = 0;
  146. }
  147. }
  148. // inside <parentId></parentId> block everything goes to parentId buffer
  149. else if (flag == 14) {
  150. if (b == END_PARENT_ID[i]) {
  151. i++;
  152. } else i = 0;
  153. parBuf.write(b);
  154. if (i >= END_PARENT_ID.length) {
  155. flag = 15;
  156. String parIdStr = new String(parBuf.getData(), 0, parBuf.getLength()
  157. - END_PARENT_ID.length);
  158. long parId = Long.parseLong(parIdStr);
  159. meta.setParentId(parId);
  160. parBuf.reset();
  161. i = 0;
  162. }
  163. }
  164. // after the </parentId>, search for <timestamp>
  165. else if (flag == 15) {
  166. if (b == START_TIMESTAMP[i]) {
  167. i++;
  168. } else i = 0;
  169. if (i >= START_TIMESTAMP.length) {
  170. flag = 12;
  171. i = 0;
  172. }
  173. }
  174. // After the timestamp, sometimes we can make a quick check to see
  175. // whether we should skip this revision
  176. // after the </timestamp>, check for <minor/>, if they exist
  177. else if (flag == 13) {
  178. int curMatch = 0;
  179. if ((i < START_TEXT.length && b == START_TEXT[i])
  180. && (i < MINOR_TAG.length && b == MINOR_TAG[i])) {
  181. curMatch = 3;
  182. } else if (i < START_TEXT.length && b == START_TEXT[i]) {
  183. curMatch = 1;
  184. } else if (i < MINOR_TAG.length && b == MINOR_TAG[i]) {
  185. curMatch = 2;
  186. }
  187. if (curMatch > 0 && (i == 0 || minorOrText == 3 || curMatch == minorOrText)) {
  188. i++;
  189. minorOrText = curMatch;
  190. } else i = 0;
  191. if ((minorOrText == 2 || minorOrText == 3) && i >= MINOR_TAG.length) {
  192. // update the meta
  193. meta.setMinor(true);
  194. flag = 16;
  195. minorOrText = -1;
  196. i = 0;
  197. } else if ((minorOrText == 1 || minorOrText == 3) && i >= START_TEXT.length) {
  198. flag = 17;
  199. minorOrText = -1;
  200. i = 0;
  201. }
  202. }
  203. // after the <minor/>, and search for <text>
  204. else if (flag == 16) {
  205. if (b == START_TEXT[i]) {
  206. i++;
  207. } else i = 0;
  208. if (i >= START_TEXT.length) {
  209. flag = 17;
  210. i = 0;
  211. }
  212. }
  213. // inside <text></text> block everything goes to content buffer
  214. else if (flag == 17) {
  215. if (b == END_TEXT[i]) {
  216. i++;
  217. } else i = 0;
  218. buffer.write(b);
  219. if (i >= END_TEXT.length) {
  220. flag = 18;
  221. meta.setLength(buffer.getLength());
  222. i = 0;
  223. }
  224. }
  225. // look for the closing </revision>
  226. else if (flag == 18) {
  227. if (b == END_REVISION[i]) {
  228. i++;
  229. } else i = 0;
  230. if (i >= END_REVISION.length) {
  231. // the flag is not anymore useful
  232. flag = 19;
  233. return Ack.PASSED_TO_NEXT_TAG;
  234. }
  235. }
  236. }
  237. }
  238. }
  239. }
  240. }
  241. public static class LinkExtractor implements
  242. ETLExtractor<LongWritable, LinkProfile, RevisionHeader> {
  243. private static final Logger LOG = Logger.getLogger(LinkExtractor.class);
  244. private static final byte[] OPEN_BRACKET = "[[".getBytes(StandardCharsets.UTF_8);
  245. private static final byte[] CLOSE_BRACKET = "]]".getBytes(StandardCharsets.UTF_8);
  246. @Override
  247. public float check(RevisionHeader curMeta, RevisionHeader prevMeta) {
  248. if (prevMeta == null || prevMeta.getLength() == 0) return 1f;
  249. if (curMeta.isMinor()) return 0.0005f;
  250. return Math.abs(curMeta.getLength() - prevMeta.getLength()) / (float)prevMeta.getLength();
  251. }
  252. @Override
  253. public boolean extract(DataOutputBuffer content, RevisionHeader meta,
  254. LongWritable key, LinkProfile value) {
  255. if (meta == null || meta.getLength() == 0) {
  256. return false;
  257. }
  258. // add meta-data
  259. key.set(meta.getPageId());
  260. value.clear();
  261. value.setNamespace(meta.getNamespace());
  262. value.setPageId(meta.getPageId());
  263. value.setPageTitle(meta.getPageTitle());
  264. value.setParentId(meta.getParentId());
  265. value.setRevisionId(meta.getRevisionId());
  266. value.setTimestamp(meta.getTimestamp());
  267. // add content (here the list of links)
  268. DataOutputBuffer linkBuffer = new DataOutputBuffer();
  269. byte[] bytes = content.getData();
  270. int len = content.getLength();
  271. int i = 0;
  272. // flag = 1: not see [[ or has passed ]] token
  273. // flag = 2: seen [[ but not ]] yet
  274. int flag = 1;
  275. try {
  276. for (int cursor = 0; cursor < len; cursor++) {
  277. byte b = bytes[cursor];
  278. if (flag == 1) {
  279. if (b == OPEN_BRACKET[i]) {
  280. i++;
  281. } else i = 0;
  282. if (i >= OPEN_BRACKET.length) {
  283. flag = 2;
  284. i = 0;
  285. }
  286. }
  287. else if (flag == 2) {
  288. if (b == CLOSE_BRACKET[i]) {
  289. i++;
  290. } else i = 0;
  291. linkBuffer.write(b);
  292. if (i >= CLOSE_BRACKET.length) {
  293. String linkText = new String(linkBuffer.getData(), 0,
  294. linkBuffer.getLength() - CLOSE_BRACKET.length,
  295. StandardCharsets.UTF_8);
  296. Link l = Link.convert(linkText, false);
  297. if (l != null) {
  298. value.addLink(l);
  299. }
  300. linkBuffer.reset();
  301. flag = 1;
  302. i = 0;
  303. }
  304. }
  305. }
  306. } catch (IOException e) {
  307. LOG.error("Error extracting link from revision: ["
  308. + value.getPageId() + ", rev: " + value.getRevisionId() + "]");
  309. } finally {
  310. try {
  311. linkBuffer.close();
  312. } catch (IOException e) {
  313. LOG.warn("Cannot close link buffer afterwards.");
  314. }
  315. }
  316. return true;
  317. }
  318. }
  319. }