WikiRevisionDiffInputFormat.java - States of the flag:

/java/main/org/hedera/io/input/WikiRevisionDiffInputFormat.java

https://github.com/giangbinhtran/Hedera · Java · 513 lines · 392 code · 48 blank · 73 comment · 250 complexity · ea8a8861ba24a3da30f63afa5dc10251 MD5 · raw file

package org.hedera.io.input;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.hedera.io.RevisionDiffOld;

import difflib.Delta;
import difflib.DiffUtils;
import difflib.Patch;

public class WikiRevisionDiffInputFormat 
		extends WikiRevisionInputFormat<LongWritable, RevisionDiffOld> {
	
	/**
	 * Supplies the record reader for this input format.
	 * Neither argument is inspected here; a new {@link DiffReader} is
	 * created on every call.
	 *
	 * @param input the split the reader will later be initialized with
	 * @param context the context of the running task attempt
	 * @return a newly created {@link DiffReader}
	 */
	@Override
	public RecordReader<LongWritable, RevisionDiffOld> createRecordReader(
			InputSplit input, TaskAttemptContext context) throws IOException,
			InterruptedException {
		final RecordReader<LongWritable, RevisionDiffOld> reader = new DiffReader();
		return reader;
	}

	/**
	 * Reads every pair of consecutive revisions and calculates their diff
	 * using Myers' algorithm. Returns a RevisionDiffOld which, among other fields,
	 * carries the list of diffs between the two texts.
	 *
	 * @author tuan
	 */
	// States of the flag:
	// 				
	// -1: EOF
	// 1 - outside the <page> tag
	// 2 - just passed the <page> tag but outside the <title>
	// 3 - just passed the <title> tag		
	// 4 - just passed the </title> tag but outside the <namespace>
	// 5 - just passed the <namespace>
	// 6 - just passed the </namespace> but outside the <id>
	// 7 - just passed the (page's) <id>
	// 8 - just passed the </id> tag but outside the <revision>	
	// 9 - just passed the (next) <revision>
	// 10 - just passed the inner <id> tag inside <revision>
	// 11 - just passed the inner </id> tag inside <revision>
	// 12 - just passed the <timestamp>
	// 13 - just passed the </timestamp> tag
	// 14 - just passed the <parentId>
	// 15 - just passed the </parentId> tag
	// 16 - just passed the <text> tag
	// 17 - just passed the </text> tag
	// 18 - just passed the </revision>
	// 19 - just passed the </page>
	public static class DiffReader extends WikiRevisionReader<RevisionDiffOld> {

		// Extra flags:
		//
		// indicating the flow condition within [flag = 16]
		// -1 - Unmatched
		//  1 - Matched <revision> tag partially
		//  2 - Matched </page> tag partially
		//  3 - Matched both <revision> and </page> partially
		private int revOrPage = -1;

		// indicating the flow condition within [flag = 9]
		// -1 - Unmatched
		//  1 - Matched <parentId> tag partially
		//  2 - Matched <timestamp> tag partially
		//  3 - Matched both <parentId> and <timestamp> partially
		private int parOrTs = -1;

		// We now convert and cache everything from pageHeader into the following global variables.
		// NOTE: they all need to be kept in sync with pageHeader!
		// private DataOutputBuffer pageHeader = new DataOutputBuffer();
		private DataOutputBuffer pageTitle = new DataOutputBuffer();
		private DataOutputBuffer nsBuf = new DataOutputBuffer();
		//////////////////////////////////////////////////////////////
		// END PageHeader variables
		//////////////////////////////////////////////////////////////

		// buffer for handling consecutive revisions
		private DataOutputBuffer timestampBuf = new DataOutputBuffer();		
		private DataOutputBuffer revIdBuf = new DataOutputBuffer();		
		private DataOutputBuffer parBuf = new DataOutputBuffer();

		private List<String> lastRevText = new LinkedList<>();
		private DataOutputBuffer contentBuf = new DataOutputBuffer();
		//////////////////////////////////////////////////////////////
		// END revision buffer variables
		//////////////////////////////////////////////////////////////

		/**
		 * Runs the parent initialization for the given split and then
		 * allocates the {@link RevisionDiffOld} instance that is held in
		 * {@code value}.
		 *
		 * @param input the split handed to the parent reader
		 * @param tac the task attempt context handed to the parent reader
		 */
		@Override
		public void initialize(InputSplit input, TaskAttemptContext tac)
				throws IOException, InterruptedException {
			super.initialize(input, tac);
			this.value = new RevisionDiffOld();
		}

		/**
		 * Puts the reader back into a clean state: all byte buffers are
		 * emptied, both partial-match markers return to "unmatched" (-1),
		 * the skip marker is lowered, and the value object as well as the
		 * text of the previous revision are cleared.
		 */
		private void resetEverything() {
			// page-level state
			skipped = false;
			keyBuf.reset();
			pageTitle.reset();
			nsBuf.reset();

			// revision-level buffers
			revIdBuf.reset();
			parBuf.reset();
			timestampBuf.reset();
			contentBuf.reset();

			// partial-match markers used by the two-way tag lookups
			parOrTs = -1;
			revOrPage = -1;

			// emitted value and the text kept for the next diff
			value.clear();
			lastRevText.clear();
		}

		/**
		 * Handles the tag boundary identified by the current value of
		 * {@code flag} (see the state table above this class).
		 * NOTE(review): presumably called by the base reader each time
		 * {@code readUntilMatch()} returns - confirm in WikiRevisionReader.
		 *
		 * <ul>
		 * <li>19 (&lt;/page&gt;): reset all per-page state.</li>
		 * <li>18 (&lt;/revision&gt;): emit the record unless the page is skipped.</li>
		 * <li>17 (&lt;/text&gt;): diff the revision text against the previous one.</li>
		 * <li>15 / 13 / 11 / 8: parse parent id, timestamp, revision id, page id.</li>
		 * <li>6 (&lt;/namespace&gt;): parse the namespace and decide whether the
		 *     page is skipped.</li>
		 * <li>4 (&lt;/title&gt;): store the page title.</li>
		 * <li>-1: end of input.</li>
		 * </ul>
		 *
		 * Every buffer still holds the closing tag that terminated it, which
		 * is why the closing tag length is subtracted before decoding.
		 *
		 * @return {@code STATE.STOP_TRUE} when a non-skipped revision is complete,
		 *         {@code STATE.STOP_FALSE} when {@code flag} is -1,
		 *         {@code STATE.CONTINUE} in every other case
		 * @throws IOException if a buffer cannot be decoded
		 * @throws NumberFormatException if an id or namespace is not numeric
		 */
		@Override
		protected STATE doWhenMatch() throws IOException, InterruptedException {
			switch (flag) {
			case 19:
				resetEverything();
				break;

			// emit the object when reaching </revision>
			// NOTE(review): within this class value is only cleared on </page>;
			// whether deltas of earlier revisions should still be present when
			// the next revision is emitted depends on RevisionDiffOld - confirm.
			case 18:
				if (!skipped) {
					return STATE.STOP_TRUE;
				}
				break;

			// calculate the diff and shift the revision text when seeing </text>
			// inside the <revision> block
			case 17:
				if (!skipped) {
					// create a mass number of strings
					List<String> content = extractParagraph(contentBuf.getData(), 0,
							contentBuf.getLength() - END_TEXT.length);

					Patch patch = DiffUtils.diff(lastRevText, content);
					for (Delta d : patch.getDeltas()) {
						value.add(d);
					}

					lastRevText = content;
				}
				// release big chunk of bytes here
				contentBuf.reset();
				break;

			case 15:
				if (!skipped) {
					String parIdStr = decodeWithoutClosingTag(parBuf, END_PARENT_ID.length);
					value.setParentId(Long.parseLong(parIdStr));
				}
				parBuf.reset();
				break;

			case 13:
				if (!skipped) {
					String ts = decodeWithoutClosingTag(timestampBuf, END_TIMESTAMP.length);
					value.setTimestamp(TIME_FORMAT.parseMillis(ts));
				}
				timestampBuf.reset();
				break;

			case 11:
				if (!skipped) {
					String idStr = decodeWithoutClosingTag(revIdBuf, END_ID.length);
					value.setRevisionId(Long.parseLong(idStr));
				}
				revIdBuf.reset();
				break;

			case 8:
				if (!skipped) {
					String idStr = decodeWithoutClosingTag(keyBuf, END_ID.length);
					long pageId = Long.parseLong(idStr);
					key.set(pageId);
					value.setPageId(pageId);
				}
				keyBuf.reset();
				break;

			case 6: {
				String nsStr = decodeWithoutClosingTag(nsBuf, END_NAMESPACE.length);
				int ns = Integer.parseInt(nsStr);
				// Namespace 0 holds the articles. Only pages OUTSIDE of it may be
				// skipped, and only when the skip-non-articles option is on.
				if (ns != 0) {
					skipped = skipNonArticles;
				}
				value.setNamespace(ns);
				nsBuf.reset();
				break;
			}

			case 4: {
				String title = decodeWithoutClosingTag(pageTitle, END_TITLE.length);
				value.setPageTitle(title);
				pageTitle.reset();
				break;
			}

			case -1:
				return STATE.STOP_FALSE;

			default:
				break;
			}
			return STATE.CONTINUE;
		}

		/**
		 * Decodes the valid bytes of {@code buffer}, minus the trailing closing
		 * tag, as UTF-8 so the result does not depend on the platform charset.
		 *
		 * @param buffer the buffer holding element content followed by its closing tag
		 * @param closingTagLength number of trailing bytes to leave out
		 * @return the decoded element content
		 * @throws IOException if UTF-8 is reported as unsupported
		 */
		private static String decodeWithoutClosingTag(DataOutputBuffer buffer,
				int closingTagLength) throws IOException {
			return new String(buffer.getData(), 0,
					buffer.getLength() - closingTagLength, "UTF-8");
		}

		/**
		 * Splits a region of UTF-8 encoded bytes into paragraphs.
		 *
		 * A paragraph ends at a line feed. The line feed and every ASCII
		 * whitespace byte directly after it are dropped, so blank lines and
		 * the leading indentation of the next paragraph do not show up in the
		 * result. The scan works byte by byte: in UTF-8 a line feed is always
		 * the single byte 0x0A and never occurs inside a multi-byte sequence.
		 *
		 * @param b the bytes to split; may be longer than the valid region,
		 *          may be {@code null}
		 * @param offset index of the first byte to look at
		 * @param len number of bytes to look at, starting at {@code offset}
		 * @return the paragraphs in order of appearance; empty when {@code b}
		 *         is {@code null} or empty, or {@code len} is not positive
		 * @throws UnsupportedEncodingException if UTF-8 is reported as unsupported
		 */
		public static List<String> extractParagraph(byte[] b, int offset, int len) 
				throws UnsupportedEncodingException {
			List<String> res = new LinkedList<>();
			if (b == null || b.length == 0 || len <= 0) {
				return res;
			}

			// The array may be a backing buffer that is larger than the valid
			// region, so never scan past offset + len (or past the array).
			int end = Math.min(b.length, offset + len);
			int start = offset;
			int i = offset;
			while (i < end) {
				if (b[i] == '\n') {
					res.add(new String(b, start, i - start, "UTF-8"));

					// skip the line feed and any ASCII whitespace after it;
					// negative bytes belong to multi-byte characters
					while (i < end && b[i] >= 0 && Character.isWhitespace((char) b[i])) {
						i++;
					}
					start = i;
				}
				else {
					i++;
				}
			}
			// whatever follows the last line feed is the final paragraph
			if (start < end) {
				res.add(new String(b, start, end - start, "UTF-8"));
			}
			return res;
		}

		/**
		 * Consumes bytes from the input until the tag that the current state
		 * ({@code flag}) is waiting for has been read completely, then moves
		 * {@code flag} to the next state and returns.
		 *
		 * The internal buffer {@code buf} is refilled from {@code fsin}
		 * whenever the read position {@code pos[0]} reaches the fill level
		 * {@code pos[1]}. The match progress {@code i} is local to one call
		 * but survives buffer refills, so a tag that straddles two reads is
		 * still recognized. In the capturing states (3, 5, 7, 10, 12, 14, 16)
		 * every byte, including the closing tag, is appended to the buffer of
		 * that state.
		 *
		 * @return {@code true} when a tag was matched and {@code flag} was
		 *         advanced, {@code false} when the stream reported end of input
		 * @throws IOException if the internal buffer state is invalid or
		 *         reading from the stream fails
		 */
		@Override
		protected boolean readUntilMatch() throws IOException {
			// either condition alone already makes the buffer unusable
			if (buf == null || pos.length != 2)
				throw new IOException("Internal buffer corrupted.");
			// number of bytes of the awaited tag matched so far
			int i = 0;
			while (true) {
				// buffer exhausted: refill it from the (possibly compressed) stream
				if (pos[0] == pos[1]) {				
					pos[1] = (compressed) ? ((CompressionInputStream)fsin).read(buf) :
						((FSDataInputStream)fsin).read(buf);
					pos[0] = 0;
					LOG.info(pos[1] + " bytes read from the stream...");
					if (pos[1] == -1) {
						return false;
					}
				} 
				while (pos[0] < pos[1]) {
					byte b = buf[pos[0]];
					pos[0]++;

					// ignore every character until reaching a new page
					if (flag == 1 || flag == 19) {
						if (b == START_PAGE[i]) {
							i++;
							if (i >= START_PAGE.length) {
								flag = 2;
								return true;
							}
						} else i = 0;
					}

					// after <page>, look for <title>
					else if (flag == 2) {
						if (b == START_TITLE[i]) {
							i++;
						} else i = 0;
						if (i >= START_TITLE.length) {
							flag = 3;
							return true;
						}
					}

					// put everything between <title></title> block into title
					else if (flag == 3) {
						if (b == END_TITLE[i]) {
							i++;
						} else i = 0;
						pageTitle.write(b);
						if (i >= END_TITLE.length) {
							flag = 4;
							return true;
						}
					}

					// after </title>, look for the opening namespace tag
					else if (flag == 4) {
						if (b == START_NAMESPACE[i]) {
							i++;
						} else i = 0;
						if (i >= START_NAMESPACE.length) {
							flag = 5;
							return true;
						}
					}

					// everything inside the namespace element goes to nsBuf
					else if (flag == 5) {
						if (b == END_NAMESPACE[i]) {
							i++;
						} else i = 0;
						nsBuf.write(b);
						if (i >= END_NAMESPACE.length) {
							flag = 6;
							return true;
						}
					}
					
					// when passing the namespace and we realize that 
					// this is not an article, and that the option of skipping
					// non-article pages is on, we simply skip everything until
					// the closing </page>
					else if (skipped && flag >= 6 && flag < 19) {
						if (b == END_PAGE[i]) {
							i++;
						} else i = 0;
						if (i >= END_PAGE.length) {
							flag = 19;
							return true;
						}
					}

					// after the namespace, look for the page's <id>
					else if (flag == 6) {
						if (b == START_ID[i]) {
							i++;
						} else i = 0;
						if (i >= START_ID.length) {
							flag = 7;
							return true;
						}
					}

					// put everything in outer <id></id> block into keyBuf
					else if (flag == 7) {
						if (b == END_ID[i]) {
							i++;
						} else i = 0;
						keyBuf.write(b);
						if (i >= END_ID.length) {
							flag = 8;
							return true;
						}
					}

					// after the page's </id>, look for the first <revision>
					else if (flag == 8) {
						if (b == START_REVISION[i]) {
							i++;
						} else i = 0;
						if (i >= START_REVISION.length) {
							flag = 9;
							return true;
						}
					}

					// inside <revision></revision> block, first check for id
					else if (flag == 9) {
						if (b == START_ID[i]) {
							i++;
						} else i = 0;
						if (i >= START_ID.length) {
							flag = 10;
							return true;
						}
					}

					// everything inside the inner <id></id> block goes to revision buffer
					else if (flag == 10) {
						if (b == END_ID[i]) {
							i++;
						} else i = 0;
						revIdBuf.write(b);
						if (i >= END_ID.length) {
							flag = 11;
							return true;
						}
					}

					// after the inner <id>, check for either <timestamp> or <parentId>
					else if (flag == 11) {
						// which of the two candidate tags the current byte fits at
						// position i: 1 = <parentId>, 2 = <timestamp>, 3 = both
						int curMatch = 0;				
						if ((i < START_PARENT_ID.length && b == START_PARENT_ID[i]) 
								&& (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i])) {
							curMatch = 3;
						} else if (i < START_PARENT_ID.length && b == START_PARENT_ID[i]) {
							curMatch = 1;
						} else if (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i]) {
							curMatch = 2;
						}				
						// extend the match only if the byte is compatible with the
						// candidate(s) tracked so far; otherwise start over
						if (curMatch > 0 && (i == 0 || parOrTs == 3 || curMatch == parOrTs)) {					
							i++;			
							parOrTs = curMatch;
						} else i = 0;
						if ((parOrTs == 2 || parOrTs == 3) && i >= START_TIMESTAMP.length) {
							flag = 12;
							parOrTs = -1;
							return true;							
						} else if ((parOrTs == 1 || parOrTs == 3) && i >= START_PARENT_ID.length) {
							flag = 14;
							parOrTs = -1;
							return true;
						}		
					}

					// inside <timestamp></timestamp> block everything goes to timestamp buffer
					else if (flag == 12) {
						if (b == END_TIMESTAMP[i]) {
							i++;
						} else i = 0;
						timestampBuf.write(b);
						if (i >= END_TIMESTAMP.length) {
							flag = 13;
							return true;
						}
					}

					// inside <parentId></parentId> block everything goes to parentId buffer
					else if (flag == 14) {
						if (b == END_PARENT_ID[i]) {
							i++;
						} else i = 0;
						parBuf.write(b);
						if (i >= END_PARENT_ID.length) {
							flag = 15;
							return true;
						}
					}

					// after the </parentId>, search for <timestamp>
					else if (flag == 15) {
						if (b == START_TIMESTAMP[i]) {
							i++;
						} else i = 0;
						if (i >= START_TIMESTAMP.length) {
							flag = 12;
							return true;
						}
					}

					// after the </timestamp>, check for <text>
					else if (flag == 13) {
						if (b == START_TEXT[i]) {
							i++;
						} else i = 0;
						if (i >= START_TEXT.length) {
							flag = 16;
							return true;
						}
					}

					// inside <text></text> block everything goes to content buffer
					else if (flag == 16) {
						if (b == END_TEXT[i]) {
							i++;
						} else i = 0;
						contentBuf.write(b);
						if (i >= END_TEXT.length) {
							flag = 17;
							return true;
						}
					}

					// look for the closing </revision>
					else if (flag == 17) {
						if (b == END_REVISION[i]) {
							i++;
						} else i = 0;
						if (i >= END_REVISION.length) {
							flag = 18;
							return true;
						}
					}

					// Flag 18 can be followed by a new <revision> inside the same
					// page or by the closing </page>
					else if (flag == 18) {
						// which of the two candidate tags the current byte fits at
						// position i: 1 = <revision>, 2 = </page>, 3 = both
						int curMatch = 0;				
						if ((i < END_PAGE.length && b == END_PAGE[i]) 
								&& (i < START_REVISION.length && b == START_REVISION[i])) {
							curMatch = 3;
						} else if (i < END_PAGE.length && b == END_PAGE[i]) {
							curMatch = 2;
						} else if (i < START_REVISION.length && b == START_REVISION[i]) {
							curMatch = 1;
						}				
						// extend the match only if the byte is compatible with the
						// candidate(s) tracked so far; otherwise start over
						if (curMatch > 0 && (i == 0 || revOrPage == 3 || curMatch == revOrPage)) {					
							i++;			
							revOrPage = curMatch;
						} else i = 0;
						if ((revOrPage == 2 || revOrPage == 3) && i >= END_PAGE.length) {
							flag = 19;
							revOrPage = -1;
							return true;							
						} else if ((revOrPage == 1 || revOrPage == 3) && i >= START_REVISION.length) {
							flag = 9;
							revOrPage = -1;
							return true;
						}				
					} 
				}		
			}
		}
	}
}
Tech Fingerprint

Alerts (63)

'==' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
122 127 134 151 161 171 181 192 203 249 267 277 288 299 309 334 345 356 367 378 390 400 404 408 416 428 440 451 462 474 485 495 499 503
'+' Performance Info: Using string concatenation ('+' or '+=') inside loops can be inefficient due to repeated String object creation. Use StringBuilder (or StringBuffer for thread-safety) instead.
257
'=' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
271 282 294 304 315 329 339 351 361 372 384 394 398 405 409 422 434 445 456 468 479 489 491 500 504
'<' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
324
Complexity hotspot; line 400 (total complexity: 7)
400
Complexity hotspot; line 495 (total complexity: 7)
495