/java/main/org/hedera/io/input/WikiRevisionDiffInputFormat.java
Java | 513 lines | 392 code | 48 blank | 73 comment | 250 complexity | ea8a8861ba24a3da30f63afa5dc10251 MD5 | raw file
- package org.hedera.io.input;
- import java.io.IOException;
- import java.io.UnsupportedEncodingException;
- import java.util.LinkedList;
- import java.util.List;
- import org.apache.hadoop.fs.FSDataInputStream;
- import org.apache.hadoop.io.DataOutputBuffer;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.compress.CompressionInputStream;
- import org.apache.hadoop.mapreduce.InputSplit;
- import org.apache.hadoop.mapreduce.RecordReader;
- import org.apache.hadoop.mapreduce.TaskAttemptContext;
- import org.hedera.io.RevisionDiffOld;
- import difflib.Delta;
- import difflib.DiffUtils;
- import difflib.Patch;
- public class WikiRevisionDiffInputFormat
- extends WikiRevisionInputFormat<LongWritable, RevisionDiffOld> {
-
- @Override
- public RecordReader<LongWritable, RevisionDiffOld> createRecordReader(
- InputSplit input, TaskAttemptContext context) throws IOException,
- InterruptedException {
- return new DiffReader();
- }
- /**
- * Reads every pair of consecutive revisions and calculates their diff
- * using Myers' algorithm. Returns a RevisionDiffOld which, among other fields,
- * carries the list of deltas between the two texts.
- *
- * @author tuan
- */
- // States of the flag:
- //
- // -1: EOF
- // 1 - outside the <page> tag
- // 2 - just passed the <page> tag but outside the <title>
- // 3 - just passed the <title> tag
- // 4 - just passed the </title> tag but outside the <namespace>
- // 5 - just passed the <namespace>
- // 6 - just passed the </namespace> but outside the <id>
- // 7 - just passed the (page's) <id>
- // 8 - just passed the </id> tag but outside the <revision>
- // 9 - just passed the (next) <revision>
- // 10 - just passed the inner <id> tag inside <revision>
- // 11 - just passed the inner </id> tag inside <revision>
- // 12 - just passed the <timestamp>
- // 13 - just passed the </timestamp> tag
- // 14 - just passed the <parentId>
- // 15 - just passed the </parentId> tag
- // 16 - just passed the <text> tag
- // 17 - just passed the </text> tag
- // 18 - just passed the </revision>
- // 19 - just passed the </page>
- public static class DiffReader extends WikiRevisionReader<RevisionDiffOld> {
- // Extra flags:
- //
- // indicating the flow condition within [flag = 16]
- // -1 - Unmatched
- // 1 - Matched <revision> tag partially
- // 2 - Matched </page> tag partially
- // 3 - Matched both <revision> and </page> partially
- private int revOrPage = -1;
- // indicating the flow condition within [flag = 9]
- // -1 - Unmatched
- // 1 - Matched <parentId> tag partially
- // 2 - Matched <timestamp> tag partially
- // 3 - Matched both <parentId> and <timestamp> partially
- private int parOrTs = -1;
- // We now convert and cache everything from pageHeader to the following global variables
- // NOTE: they all need to be synchronized with pageHeader !!
- // private DataOutputBuffer pageHeader = new DataOutputBuffer();
- private DataOutputBuffer pageTitle = new DataOutputBuffer();
- private DataOutputBuffer nsBuf = new DataOutputBuffer();
- //////////////////////////////////////////////////////////////
- // END PageHeader variables
- //////////////////////////////////////////////////////////////
- // buffer for handling consecutive revisions
- private DataOutputBuffer timestampBuf = new DataOutputBuffer();
- private DataOutputBuffer revIdBuf = new DataOutputBuffer();
- private DataOutputBuffer parBuf = new DataOutputBuffer();
- private List<String> lastRevText = new LinkedList<>();
- private DataOutputBuffer contentBuf = new DataOutputBuffer();
- //////////////////////////////////////////////////////////////
- // END revision buffer variables
- //////////////////////////////////////////////////////////////
- @Override
- public void initialize(InputSplit input, TaskAttemptContext tac)
- throws IOException, InterruptedException {
- super.initialize(input, tac);
- value = new RevisionDiffOld();
- }
- private void resetEverything() {
- revOrPage = -1;
- parOrTs = -1;
- nsBuf.reset();
- timestampBuf.reset();
- revIdBuf.reset();
- parBuf.reset();
- contentBuf.reset();
- keyBuf.reset();
- pageTitle.reset();
- value.clear();
- lastRevText.clear();
- skipped = false;
- }
- @Override
- protected STATE doWhenMatch() throws IOException, InterruptedException {
- if (flag == 19) {
- resetEverything();
- }
- // emit the object when reaching </revision>
- else if (flag == 18) {
- if (!skipped)
- return STATE.STOP_TRUE;
- }
- // calculating the diff and shift the revision text when seeing </text>
- // inside the <revision> block
- else if (flag == 17) {
- if (!skipped) {
- // create a mass number of strings
- List<String> content = extractParagraph(contentBuf.getData(), 0,
- contentBuf.getLength() - END_TEXT.length);
- Patch patch = DiffUtils.diff(lastRevText, content);
- for (Delta d : patch.getDeltas()) {
- value.add(d);
- }
- lastRevText = content;
- }
- // release big chunk of bytes here
- contentBuf.reset();
- }
- else if (flag == 15) {
- if (!skipped) {
- String parIdStr = new String(parBuf.getData(), 0, parBuf.getLength()
- - END_PARENT_ID.length);
- long parId = Long.parseLong(parIdStr);
- value.setParentId(parId);
- }
- parBuf.reset();
- }
- else if (flag == 13) {
- if (!skipped) {
- String ts = new String(timestampBuf.getData(), 0, timestampBuf.getLength()
- - END_TIMESTAMP.length);
- long timestamp = TIME_FORMAT.parseMillis(ts);
- value.setTimestamp(timestamp);
- }
- timestampBuf.reset();
- }
- else if (flag == 11) {
- if (!skipped) {
- String idStr = new String(revIdBuf.getData(), 0, revIdBuf.getLength()
- - END_ID.length);
- long revId = Long.parseLong(idStr);
- value.setRevisionId(revId);
- }
- revIdBuf.reset();
- }
- else if (flag == 8) {
- if (!skipped) {
- String idStr = new String(keyBuf.getData(), 0, keyBuf.getLength()
- - END_ID.length);
- long pageId = Long.parseLong(idStr);
- key.set(pageId);
- value.setPageId(pageId);
- }
- keyBuf.reset();
- }
- else if (flag == 6) {
- String nsStr = new String(nsBuf.getData(), 0, nsBuf.getLength()
- - END_NAMESPACE.length);
- int ns = Integer.parseInt(nsStr);
- if (ns == 0) {
- skipped = skipNonArticles;
- }
- value.setNamespace(ns);
- nsBuf.reset();
- }
- else if (flag == 4) {
- String title = new String(pageTitle.getData(), 0, pageTitle.getLength()
- - END_TITLE.length);
- value.setPageTitle(title);
- pageTitle.reset();
- }
- else if (flag == -1) {
- return STATE.STOP_FALSE;
- }
- return STATE.CONTINUE;
- }
- public static List<String> extractParagraph(byte[] b, int offset, int len)
- throws UnsupportedEncodingException {
- List<String> res = new LinkedList<>();
- if (b != null && b.length > 0) {
- int start = offset;
- int i = offset;
- while (i < len) {
- char c = (char) (((b[i] & 0xFF) << 8) + (b[i+1] & 0xFF));
- if (c == '\n') {
- String s = new String(b,start,i,"UTF-8");
- res.add(s);
- while (Character.isWhitespace(c)) {
- i += 2;
- c = (char) (((b[i] & 0xFF) << 8) + (b[i+1] & 0xFF));
- }
- start = i;
- }
- else {
- i += 2;
- }
- }
- if (start < i) {
- String s = new String(b,start,i,"UTF-8");
- res.add(s);
- }
- }
- return res;
- }
- @Override
- protected boolean readUntilMatch() throws IOException {
- if (buf == null && pos.length != 2)
- throw new IOException("Internal buffer corrupted.");
- int i = 0;
- while (true) {
- if (pos[0] == pos[1]) {
- pos[1] = (compressed) ? ((CompressionInputStream)fsin).read(buf) :
- ((FSDataInputStream)fsin).read(buf);
- pos[0] = 0;
- LOG.info(pos[1] + " bytes read from the stream...");
- if (pos[1] == -1) {
- return false;
- }
- }
- while (pos[0] < pos[1]) {
- byte b = buf[pos[0]];
- pos[0]++;
- // ignore every character until reaching a new page
- if (flag == 1 || flag == 19) {
- if (b == START_PAGE[i]) {
- i++;
- if (i >= START_PAGE.length) {
- flag = 2;
- return true;
- }
- } else i = 0;
- }
- else if (flag == 2) {
- if (b == START_TITLE[i]) {
- i++;
- } else i = 0;
- if (i >= START_TITLE.length) {
- flag = 3;
- return true;
- }
- }
- // put everything between <title></title> block into title
- else if (flag == 3) {
- if (b == END_TITLE[i]) {
- i++;
- } else i = 0;
- pageTitle.write(b);
- if (i >= END_TITLE.length) {
- flag = 4;
- return true;
- }
- }
- else if (flag == 4) {
- if (b == START_NAMESPACE[i]) {
- i++;
- } else i = 0;
- if (i >= START_NAMESPACE.length) {
- flag = 5;
- return true;
- }
- }
- else if (flag == 5) {
- if (b == END_NAMESPACE[i]) {
- i++;
- } else i = 0;
- nsBuf.write(b);
- if (i >= END_NAMESPACE.length) {
- flag = 6;
- return true;
- }
- }
-
- // when passing the namespace and we realize that
- // this is not an article, and that the option of skipping
- // non-article pages is on, we simply skip everything until
- // the closing </page>
- else if (skipped && flag >= 6 && flag < 19) {
- if (b == END_PAGE[i]) {
- i++;
- } else i = 0;
- if (i >= END_PAGE.length) {
- flag = 19;
- return true;
- }
- }
- else if (flag == 6) {
- if (b == START_ID[i]) {
- i++;
- } else i = 0;
- if (i >= START_ID.length) {
- flag = 7;
- return true;
- }
- }
- // put everything in outer <id></id> block into keyBuf
- else if (flag == 7) {
- if (b == END_ID[i]) {
- i++;
- } else i = 0;
- keyBuf.write(b);
- if (i >= END_ID.length) {
- flag = 8;
- return true;
- }
- }
- else if (flag == 8) {
- if (b == START_REVISION[i]) {
- i++;
- } else i = 0;
- if (i >= START_REVISION.length) {
- flag = 9;
- return true;
- }
- }
- // inside <revision></revision> block, first check for id
- else if (flag == 9) {
- if (b == START_ID[i]) {
- i++;
- } else i = 0;
- if (i >= START_ID.length) {
- flag = 10;
- return true;
- }
- }
- // everything inside the inner <id></id> block goes to revision buffer
- else if (flag == 10) {
- if (b == END_ID[i]) {
- i++;
- } else i = 0;
- revIdBuf.write(b);
- if (i >= END_ID.length) {
- flag = 11;
- return true;
- }
- }
- // after the inner <id>, check for either <timestamp> or <parentId>
- else if (flag == 11) {
- int curMatch = 0;
- if ((i < START_PARENT_ID.length && b == START_PARENT_ID[i])
- && (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i])) {
- curMatch = 3;
- } else if (i < START_PARENT_ID.length && b == START_PARENT_ID[i]) {
- curMatch = 1;
- } else if (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i]) {
- curMatch = 2;
- }
- if (curMatch > 0 && (i == 0 || parOrTs == 3 || curMatch == parOrTs)) {
- i++;
- parOrTs = curMatch;
- } else i = 0;
- if ((parOrTs == 2 || parOrTs == 3) && i >= START_TIMESTAMP.length) {
- flag = 12;
- parOrTs = -1;
- return true;
- } else if ((parOrTs == 1 || parOrTs == 3) && i >= START_PARENT_ID.length) {
- flag = 14;
- parOrTs = -1;
- return true;
- }
- }
- // inside <timestamp></timestamp> block everything goes to timestamp buffer
- else if (flag == 12) {
- if (b == END_TIMESTAMP[i]) {
- i++;
- } else i = 0;
- timestampBuf.write(b);
- if (i >= END_TIMESTAMP.length) {
- flag = 13;
- return true;
- }
- }
- // inside <parentId></parentId> block everything goes to parentId buffer
- else if (flag == 14) {
- if (b == END_PARENT_ID[i]) {
- i++;
- } else i = 0;
- parBuf.write(b);
- if (i >= END_PARENT_ID.length) {
- flag = 15;
- return true;
- }
- }
- // after the </parentId>, search for <timestamp>
- else if (flag == 15) {
- if (b == START_TIMESTAMP[i]) {
- i++;
- } else i = 0;
- if (i >= START_TIMESTAMP.length) {
- flag = 12;
- return true;
- }
- }
- // after the </timestamp>, check for <text>
- else if (flag == 13) {
- if (b == START_TEXT[i]) {
- i++;
- } else i = 0;
- if (i >= START_TEXT.length) {
- flag = 16;
- return true;
- }
- }
- // inside <text></text> block everything goes to content buffer
- else if (flag == 16) {
- if (b == END_TEXT[i]) {
- i++;
- } else i = 0;
- contentBuf.write(b);
- if (i >= END_TEXT.length) {
- flag = 17;
- return true;
- }
- }
- // look for the closing </revision>
- else if (flag == 17) {
- if (b == END_REVISION[i]) {
- i++;
- } else i = 0;
- if (i >= END_REVISION.length) {
- flag = 18;
- return true;
- }
- }
- // Flag 16 can be the signal of a new record inside one old page
- else if (flag == 18) {
- int curMatch = 0;
- if ((i < END_PAGE.length && b == END_PAGE[i])
- && (i < START_REVISION.length && b == START_REVISION[i])) {
- curMatch = 3;
- } else if (i < END_PAGE.length && b == END_PAGE[i]) {
- curMatch = 2;
- } else if (i < START_REVISION.length && b == START_REVISION[i]) {
- curMatch = 1;
- }
- if (curMatch > 0 && (i == 0 || revOrPage == 3 || curMatch == revOrPage)) {
- i++;
- revOrPage = curMatch;
- } else i = 0;
- if ((revOrPage == 2 || revOrPage == 3) && i >= END_PAGE.length) {
- flag = 19;
- revOrPage = -1;
- return true;
- } else if ((revOrPage == 1 || revOrPage == 3) && i >= START_REVISION.length) {
- flag = 9;
- revOrPage = -1;
- return true;
- }
- }
- }
- }
- }
- }
- }