PageRenderTime 56ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/java/main/org/hedera/io/input/WikiRevisionTimeInputFormat.java

https://github.com/giangbinhtran/Hedera
Java | 399 lines | 330 code | 41 blank | 28 comment | 160 complexity | c6d8bd5749fc1369aab1a79ce55c0d8b MD5 | raw file
  1. package org.hedera.io.input;
  2. import java.io.IOException;
  3. import java.nio.charset.StandardCharsets;
  4. import org.apache.commons.cli.CommandLine;
  5. import org.apache.commons.cli.GnuParser;
  6. import org.apache.commons.cli.HelpFormatter;
  7. import org.apache.commons.cli.Options;
  8. import org.apache.hadoop.fs.FSDataInputStream;
  9. import org.apache.hadoop.io.DataOutputBuffer;
  10. import org.apache.hadoop.io.LongWritable;
  11. import org.apache.hadoop.io.Text;
  12. import org.apache.hadoop.io.compress.CompressionInputStream;
  13. import org.apache.hadoop.mapreduce.InputSplit;
  14. import org.apache.hadoop.mapreduce.RecordReader;
  15. import org.apache.hadoop.mapreduce.TaskAttemptContext;
  16. import org.apache.log4j.Logger;
  17. import org.joda.time.DateTime;
  18. import org.joda.time.DateTimeConstants;
  19. import org.joda.time.MutableDateTime;
  20. public class WikiRevisionTimeInputFormat extends
  21. WikiRevisionInputFormat<LongWritable, Text> {
  22. public static final String TIME_SCALE_OPT = "timescale";
  23. public static enum TimeScale {
  24. HOUR("hour"),
  25. DAY("day"),
  26. WEEK("week"),
  27. MONTH("month");
  28. private final String val;
  29. private TimeScale(String v) {val = v;}
  30. @Override
  31. public String toString() {
  32. return val;
  33. }
  34. public boolean equalsName(String name) {
  35. return (val.equals(name));
  36. }
  37. }
  38. private static Options opts = new Options();
  39. private static final GnuParser parser = new GnuParser();
  40. private CommandLine options;
  41. private static void initOptions() {
  42. opts.addOption(TIME_SCALE_OPT, true, "The time scale used to coalesce the timeline");
  43. }
  44. public WikiRevisionTimeInputFormat() {
  45. super();
  46. }
  47. public WikiRevisionTimeInputFormat(String optString) {
  48. super();
  49. initOptions();
  50. if (optString != null && !optString.isEmpty()) {
  51. try {
  52. options = parser.parse(opts, optString.split(" "));
  53. } catch (org.apache.commons.cli.ParseException e) {
  54. HelpFormatter formatter = new HelpFormatter();
  55. formatter.printHelp("[-" + TIME_SCALE_OPT + "]", opts);
  56. throw new RuntimeException(e);
  57. }
  58. }
  59. }
  60. @Override
  61. public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
  62. TaskAttemptContext context) {
  63. if (options != null) {
  64. if (!options.hasOption(TIME_SCALE_OPT)) {
  65. throw new RuntimeException("Must specify the time scale for RevisionDistant");
  66. } else {
  67. String scale = options.getOptionValue(TIME_SCALE_OPT);
  68. TimeScale ts = null;
  69. for (TimeScale t : TimeScale.values()) {
  70. if (t.equalsName(scale)) {
  71. ts = t;
  72. }
  73. }
  74. return new RevisionReader(ts);
  75. }
  76. } else throw new RuntimeException("Must specify the time scale for RevisionDistant");
  77. }
    // State flag telling in which part of the XML dump the cursor currently is:
    //  -1 - EOF
    //   1 - outside the <page> tag
    //   2 - just passed the <page> tag but outside the <id> tag
    //   3 - just passed the <id> tag
    //   4 - just passed the </id> tag but outside the <revision> tag
    //   5 - just passed the (next) <revision> tag
    //   6 - just passed the <timestamp> tag inside the <revision>
    //   7 - just passed the </timestamp> tag but still inside the <revision></revision> block
    //   8 - just passed the </revision> tag
    //   9 - just passed the </page> tag
  89. public static class RevisionReader extends WikiRevisionReader<Text> {
  90. private static final Logger LOG = Logger.getLogger(RevisionReader.class);
  91. private static final byte[] DUMMY_REV = ("<revision beginningofpage=\"true\">"
  92. + "<timestamp>1970-01-01T00:00:00Z</timestamp><text xml:space=\"preserve\">"
  93. + "</text></revision>\n")
  94. .getBytes(StandardCharsets.UTF_8);
  95. // indicating the flow condition within [flag = 8]
  96. // -1 - Unmatched
  97. // 1 - Matched <revision> tag partially
  98. // 2 - Matched </page> tag partially
  99. // 3 - Matched both <revision> and </page> partially
  100. private int lastMatchTag = -1;
  101. private DataOutputBuffer pageHeader = new DataOutputBuffer();
  102. private DataOutputBuffer rev1Buf = new DataOutputBuffer();
  103. private DataOutputBuffer rev2Buf = new DataOutputBuffer();
  104. private DataOutputBuffer tmpBuf = new DataOutputBuffer();
  105. private DataOutputBuffer tsBuf = new DataOutputBuffer();
  106. private DataOutputBuffer keyBuf = new DataOutputBuffer();
  107. // remember the last time point
  108. private DateTime curTs;
  109. // remember the time scale constant
  110. private TimeScale timeScale;
  111. public RevisionReader() {
  112. super();
  113. }
  114. public RevisionReader(TimeScale ts) {
  115. super();
  116. this.timeScale = ts;
  117. }
  118. @Override
  119. public void initialize(InputSplit input, TaskAttemptContext tac)
  120. throws IOException, InterruptedException {
  121. super.initialize(input, tac);
  122. value = new Text();
  123. }
  124. @Override
  125. public STATE doWhenMatch() throws IOException, InterruptedException {
  126. if (flag == 9) {
  127. key.set(fsin.getPos() - rev2Buf.getLength() - END_PAGE.length);
  128. value.set(pageHeader.getData(), 0, pageHeader.getLength() - START_REVISION.length);
  129. value.append(rev1Buf.getData(), 0, rev1Buf.getLength());
  130. value.append(rev2Buf.getData(), 0, rev1Buf.getLength());
  131. value.append(END_PAGE, 0, END_PAGE.length);
  132. // flush the last pair
  133. pageHeader.reset();
  134. rev1Buf.reset();
  135. rev2Buf.reset();
  136. tmpBuf.reset();
  137. curTs = null;
  138. return STATE.STOP_TRUE;
  139. }
  140. else if (flag == 7) {
  141. String ts = new String(tsBuf.getData(),0, tsBuf.getLength() - END_TIMESTAMP.length);
  142. tsBuf.reset();
  143. DateTime dt = roundup(ts);
  144. if (curTs != null && dt.isAfter(curTs)) {
  145. key.set(fsin.getPos() - tmpBuf.getLength() - rev2Buf.getLength());
  146. value.set(pageHeader.getData(), 0, pageHeader.getLength() - START_REVISION.length);
  147. value.append(rev1Buf.getData(), 0, rev1Buf.getLength());
  148. value.append(rev2Buf.getData(), 0, rev1Buf.getLength());
  149. value.append(END_PAGE, 0, END_PAGE.length);
  150. rev1Buf.reset();
  151. rev1Buf.write(rev2Buf.getData());
  152. rev2Buf.reset();
  153. rev2Buf.write(tmpBuf.getData());
  154. tmpBuf.reset();
  155. curTs = dt;
  156. return STATE.STOP_TRUE;
  157. } else {
  158. rev2Buf.reset();
  159. rev2Buf.write(tmpBuf.getData());
  160. tmpBuf.reset();
  161. curTs = dt;
  162. }
  163. }
  164. else if (flag == 4) {
  165. String pageId = new String(keyBuf.getData(), 0, keyBuf.getLength() - END_ID.length);
  166. key.set(Long.parseLong(pageId));
  167. keyBuf.reset();
  168. }
  169. else if (flag == 2) {
  170. pageHeader.write(START_PAGE);
  171. }
  172. else if (flag == 5) {
  173. if (curTs == null) {
  174. rev1Buf.write(DUMMY_REV);
  175. }
  176. tmpBuf.write(START_REVISION);
  177. }
  178. else if (flag == 6) {
  179. tsBuf.reset();
  180. }
  181. else if (flag == -1) {
  182. pageHeader.reset();
  183. rev1Buf.reset();
  184. rev2Buf.reset();
  185. tmpBuf.reset();
  186. value.clear();
  187. return STATE.STOP_FALSE;
  188. }
  189. return STATE.CONTINUE;
  190. }
  191. private DateTime roundup(String timestamp) {
  192. MutableDateTime mdt = TIME_FORMAT.parseMutableDateTime(timestamp);
  193. if (timeScale == TimeScale.HOUR) {
  194. if (mdt.getMinuteOfHour() > 0 || mdt.getSecondOfMinute() > 0 || mdt.getMillisOfSecond() > 0) {
  195. mdt.addHours(1);
  196. }
  197. mdt.setMinuteOfHour(0);
  198. mdt.setSecondOfMinute(0);
  199. mdt.setMillisOfSecond(0);
  200. } else if (timeScale == TimeScale.DAY) {
  201. if (mdt.getHourOfDay() > 1 || mdt.getMinuteOfHour() > 0 || mdt.getSecondOfMinute() > 0
  202. || mdt.getMillisOfSecond() > 0) {
  203. mdt.addDays(1);
  204. }
  205. mdt.setHourOfDay(1);
  206. mdt.setMinuteOfHour(0);
  207. mdt.setSecondOfMinute(0);
  208. mdt.setMillisOfSecond(0);
  209. } else if (timeScale == TimeScale.WEEK) {
  210. if (mdt.getDayOfWeek() > 1 || mdt.getHourOfDay() > 1 || mdt.getMinuteOfHour() > 0
  211. || mdt.getSecondOfMinute() > 0 || mdt.getMillisOfSecond() > 0) {
  212. mdt.addWeeks(1);
  213. }
  214. mdt.setDayOfWeek(DateTimeConstants.MONDAY);
  215. mdt.setHourOfDay(1);
  216. mdt.setMinuteOfHour(0);
  217. mdt.setSecondOfMinute(0);
  218. mdt.setMillisOfSecond(0);
  219. } else if (timeScale == TimeScale.MONTH) {
  220. if (mdt.getDayOfMonth() > 1 || mdt.getHourOfDay() > 1 || mdt.getMinuteOfHour() > 0
  221. || mdt.getSecondOfMinute() > 0 || mdt.getMillisOfSecond() > 0) {
  222. mdt.addWeeks(1);
  223. }
  224. mdt.setDayOfMonth(1);
  225. mdt.setHourOfDay(1);
  226. mdt.setMinuteOfHour(0);
  227. mdt.setSecondOfMinute(0);
  228. mdt.setMillisOfSecond(0);
  229. }
  230. return mdt.toDateTimeISO();
  231. }
  232. @Override
  233. protected boolean readUntilMatch() throws IOException {
  234. if (buf == null && pos.length != 2)
  235. throw new IOException("Internal buffer corrupted.");
  236. int i = 0;
  237. while (true) {
  238. if (pos[0] == pos[1]) {
  239. pos[1] = (compressed) ? ((CompressionInputStream)fsin).read(buf) :
  240. ((FSDataInputStream)fsin).read(buf);
  241. LOG.info(pos[1] + " bytes read from the stream...");
  242. pos[0] = 0;
  243. if (pos[1] == -1) {
  244. return false;
  245. }
  246. }
  247. while (pos[0] < pos[1]) {
  248. byte b = buf[pos[0]];
  249. pos[0]++;
  250. // ignore every character until reaching a new page
  251. if (flag == 1 || flag == 9) {
  252. if (b == START_PAGE[i]) {
  253. i++;
  254. if (i >= START_PAGE.length) {
  255. flag = 2;
  256. return true;
  257. }
  258. } else i = 0;
  259. }
  260. // put everything between <page> tag and the first <id> tag into pageHeader
  261. else if (flag == 2) {
  262. if (b == START_ID[i]) {
  263. i++;
  264. } else i = 0;
  265. pageHeader.write(b);
  266. if (i >= START_ID.length) {
  267. flag = 3;
  268. return true;
  269. }
  270. }
  271. // put everything in <id></id> block into pageHeader and keyBuf
  272. else if (flag == 3) {
  273. if (b == END_ID[i]) {
  274. i++;
  275. } else i = 0;
  276. pageHeader.write(b);
  277. keyBuf.write(b);
  278. if (i >= END_ID.length) {
  279. flag = 4;
  280. return true;
  281. }
  282. }
  283. // put everything between </id> tag and the first <revision> tag into pageHeader
  284. else if (flag == 4) {
  285. if (b == START_REVISION[i]) {
  286. i++;
  287. } else i = 0;
  288. pageHeader.write(b);
  289. if (i >= START_REVISION.length) {
  290. flag = 5;
  291. return true;
  292. }
  293. }
  294. // everything between <revision> and <timestamp> goes into tmpBuf buffer
  295. else if (flag == 5) {
  296. if (b == START_TIMESTAMP[i]) {
  297. i++;
  298. } else i = 0;
  299. tmpBuf.write(b);
  300. if (i >= START_TIMESTAMP.length) {
  301. flag = 6;
  302. // tsBuf.reset();
  303. return true;
  304. }
  305. }
  306. // everything between <timestamp> </timestamp> block goes into tmpBuf and tsBuf buffers
  307. else if (flag == 6) {
  308. if (b == END_TIMESTAMP[i]) {
  309. i++;
  310. } else i = 0;
  311. tsBuf.write(b);
  312. tmpBuf.write(b);
  313. if (i >= END_TIMESTAMP.length) {
  314. flag = 7;
  315. return true;
  316. }
  317. }
  318. // everything up to </revision> goes into rev2Buf
  319. else if (flag == 7) {
  320. if (b == END_REVISION[i]) {
  321. i++;
  322. } else i = 0;
  323. rev2Buf.write(b);
  324. if (i >= END_REVISION.length) {
  325. flag = 8;
  326. return true;
  327. }
  328. }
  329. // Note that flag 6 can be the signal of a new revision inside one old page
  330. else if (flag == 8) {
  331. int curMatch = 0;
  332. if ((i < END_PAGE.length && b == END_PAGE[i])
  333. && (i < START_REVISION.length && b == START_REVISION[i])) {
  334. curMatch = 3;
  335. } else if (i < END_PAGE.length && b == END_PAGE[i]) {
  336. curMatch = 2;
  337. } else if (i < START_REVISION.length && b == START_REVISION[i]) {
  338. curMatch = 1;
  339. }
  340. if (curMatch > 0 && (i == 0 || lastMatchTag == 3 || curMatch == lastMatchTag)) {
  341. i++;
  342. lastMatchTag = curMatch;
  343. } else i = 0;
  344. if ((lastMatchTag == 2 || lastMatchTag == 3) && i >= END_PAGE.length) {
  345. flag = 9;
  346. lastMatchTag = -1;
  347. return true;
  348. } else if ((lastMatchTag == 1 || lastMatchTag == 3) && i >= START_REVISION.length) {
  349. flag = 5;
  350. lastMatchTag = -1;
  351. return true;
  352. }
  353. }
  354. }
  355. }
  356. }
  357. }
  358. }