RevisionLinkInputFormat.java

/java/main/org/hedera/io/etl/RevisionLinkInputFormat.java

https://github.com/giangbinhtran/Hedera · Java · 350 lines · 285 code · 31 blank · 34 comment · 155 complexity · f7d43665bbbb70d98587bc0c5160a84f MD5 · raw file

package org.hedera.io.etl;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.log4j.Logger;
import org.hedera.io.LinkProfile;
import org.hedera.io.RevisionHeader;
import org.hedera.io.LinkProfile.Link;
import org.hedera.io.input.WikiRevisionInputFormat;

/**
 * Input format that supports ETL-style reading and extracts the link structure
 * of each revision on the fly.
 */
public class RevisionLinkInputFormat extends 
		WikiRevisionInputFormat<LongWritable, LinkProfile> {

	/**
	 * Builds the record reader for this input format.
	 *
	 * Neither argument is consulted here; a fresh {@link RevisionLinkReader}
	 * is returned on every call.
	 *
	 * @param input the split to be read (unused in this method)
	 * @param context the task attempt context (unused in this method)
	 * @return a new {@link RevisionLinkReader}
	 */
	@Override
	public RecordReader<LongWritable, LinkProfile> createRecordReader(
			InputSplit input, TaskAttemptContext context) 
					throws IOException, InterruptedException {
		final RecordReader<LongWritable, LinkProfile> reader = new RevisionLinkReader();
		return reader;
	}

	/**
	 * Reader that scans the input byte by byte for the tags of a revision
	 * element, fills a {@link RevisionHeader} with what it finds and supplies a
	 * {@link LinkExtractor} as its extractor.
	 */
	public static class RevisionLinkReader 
	extends DefaultRevisionETLReader<LongWritable, 
	LinkProfile> {

		/** @return a fresh, empty key object */
		@Override
		protected LongWritable initializeKey() {		
			return new LongWritable();		
		}

		/** Nothing to release for a key. */
		@Override
		protected void freeKey(LongWritable key) {		
		}

		/** Clears the given value so it can be reused. */
		@Override
		protected void freeValue(LinkProfile value) {
			value.clear();
		}

		/** @return a fresh, empty value object */
		@Override
		protected LinkProfile initializeValue() {		
			return new LinkProfile();		
		}

		/** @return the extractor that turns revision text into links */
		@Override
		protected ETLExtractor<LongWritable, LinkProfile,
		RevisionHeader> initializeExtractor() {		
			return new LinkExtractor();		
		}

		/**
		 * Tells which of two candidate tags the byte {@code b} fits at offset
		 * {@code pos}.
		 *
		 * @return a bit mask: 1 if {@code b} equals {@code first[pos]}, 2 if it
		 *         equals {@code second[pos]}, 3 for both, 0 for neither
		 */
		private static int matchMask(byte b, int pos, byte[] first, byte[] second) {
			int mask = 0;
			if (pos < first.length && b == first[pos]) {
				mask |= 1;
			}
			if (pos < second.length && b == second[pos]) {
				mask |= 2;
			}
			return mask;
		}

		/**
		 * Consumes input up to and including the end of the next revision
		 * element. On the way the revision id, the parent id (when present),
		 * the timestamp, the minor flag and the text length are stored in
		 * {@code meta}, and every byte read while inside the text block is
		 * appended to {@code buffer}.
		 *
		 * @param buffer receives the bytes of the text block; the closing text
		 *               tag is written too, because each byte is appended
		 *               before the end-of-tag check
		 * @param meta   header object updated with the values parsed
		 * @return {@code Ack.EOF} when no more input can be fetched,
		 *         {@code Ack.PASSED_TO_NEXT_TAG} once the closing revision tag
		 *         has been read
		 * @throws IOException if reading the input or writing a buffer fails
		 */
		@Override
		// -1: EOF
		// 9 - default
		// 10 - just passed the inner <id> tag inside <revision>
		// 11 - just passed the inner </id> tag inside <revision>
		// 12 - just passed the <timestamp>
		// 13 - just passed the </timestamp> tag
		// 14 - just passed the <parentId>
		// 15 - just passed the </parentId> tag
		// 16 - just passed the <minor/> (or not)
		// 17 - just passed the <text> tag
		// 18 - just passed the </text> tag
		// 19 - just passed the </revision>
		protected Ack readToNextRevision(DataOutputBuffer buffer, 
				RevisionHeader meta) throws IOException {
			// number of bytes of the currently searched tag matched so far
			int i = 0;
			int flag = 9;	
			// candidates still alive in state 11: 1 = parent id, 2 = timestamp, 3 = both
			int parOrTs = -1;
			// candidates still alive in state 13: 1 = text, 2 = minor, 3 = both
			int minorOrText = -1;
			try (DataOutputBuffer revIdBuf = new DataOutputBuffer(); 
					DataOutputBuffer timestampBuf = new DataOutputBuffer(); 
					DataOutputBuffer parBuf = new DataOutputBuffer()) {

				while (true) {
					if (!fetchMore()) return Ack.EOF;
					while (hasData()) {
						byte b = nextByte();
						if (flag == 9) {
							if (b == START_ID[i]) {
								i++;
							} else i = 0;
							if (i >= START_ID.length) {
								flag = 10;
								i = 0;
							}
						}

						// everything inside the inner <id></id> 
						// block goes to revision buffer
						else if (flag == 10) {
							if (b == END_ID[i]) {
								i++;
							} else i = 0;
							revIdBuf.write(b);
							if (i >= END_ID.length) {
								flag = 11;
								// drop the closing tag that was buffered along with the digits
								String idStr = new String(revIdBuf.getData(), 0, 
										revIdBuf.getLength() - END_ID.length,
										StandardCharsets.UTF_8);
								long revId = Long.parseLong(idStr);
								meta.setRevisionId(revId);
								revIdBuf.reset();
								i = 0;
							}
						}

						// after the inner <id>, check for either <timestamp> or <parentId>
						else if (flag == 11) {
							int curMatch = matchMask(b, i, START_PARENT_ID, START_TIMESTAMP);
							// At the first byte any candidate may start; afterwards only
							// candidates that matched every previous byte stay alive. The
							// intersection keeps a match going when this byte happens to
							// fit both tags although one of them was ruled out earlier.
							int alive = (i == 0) ? curMatch : (curMatch & parOrTs);
							if (alive != 0) {					
								i++;			
								parOrTs = alive;
							} else i = 0;
							if ((parOrTs == 2 || parOrTs == 3) && i >= START_TIMESTAMP.length) {
								flag = 12;
								parOrTs = -1;		
								i = 0;
							} else if ((parOrTs == 1 || parOrTs == 3) && i >= START_PARENT_ID.length) {
								flag = 14;
								parOrTs = -1;
								i = 0;
							}		
						}

						// inside <timestamp></timestamp> block everything goes to timestamp buffer
						else if (flag == 12) {
							if (b == END_TIMESTAMP[i]) {
								i++;
							} else i = 0;
							timestampBuf.write(b);
							if (i >= END_TIMESTAMP.length) {
								flag = 13;
								String ts = new String(timestampBuf.getData(), 0, 
										timestampBuf.getLength() 
										- END_TIMESTAMP.length,
										StandardCharsets.UTF_8);
								long timestamp = TIME_FORMAT.parseMillis(ts);
								meta.setTimestamp(timestamp);
								timestampBuf.reset();
								i = 0;
							}
						}

						// inside <parentId></parentId> block everything goes to parentId buffer
						else if (flag == 14) {
							if (b == END_PARENT_ID[i]) {
								i++;
							} else i = 0;
							parBuf.write(b);
							if (i >= END_PARENT_ID.length) {
								flag = 15;
								String parIdStr = new String(parBuf.getData(), 0, parBuf.getLength() 
										- END_PARENT_ID.length,
										StandardCharsets.UTF_8);
								long parId = Long.parseLong(parIdStr);
								meta.setParentId(parId);
								parBuf.reset();
								i = 0;
							}
						}

						// after the </parentId>, search for <timestamp>
						else if (flag == 15) {
							if (b == START_TIMESTAMP[i]) {
								i++;
							} else i = 0;
							if (i >= START_TIMESTAMP.length) {
								flag = 12;
								i = 0;
							}
						}

						// After the timestamp, sometimes we can make a quick check to see
						// whether we should  skip this revision

						// after the </timestamp>, check for <minor/>, if they exist
						else if (flag == 13) {
							int curMatch = matchMask(b, i, START_TEXT, MINOR_TAG);
							// same candidate bookkeeping as in state 11
							int alive = (i == 0) ? curMatch : (curMatch & minorOrText);
							if (alive != 0) {					
								i++;			
								minorOrText = alive;
							} else i = 0;
							if ((minorOrText == 2 || minorOrText == 3) && i >= MINOR_TAG.length) {
								// update the meta
								meta.setMinor(true);
								flag = 16;
								minorOrText = -1;		
								i = 0;
							} else if ((minorOrText == 1 || minorOrText == 3) && i >= START_TEXT.length) {
								flag = 17;
								minorOrText = -1;
								i = 0;
							}	
						}

						// after the <minor/>, and search for <text>
						else if (flag == 16) {
							if (b == START_TEXT[i]) {
								i++;
							} else i = 0;
							if (i >= START_TEXT.length) {
								flag = 17;
								i = 0;
							}
						}

						// inside <text></text> block everything goes to content buffer
						else if (flag == 17) {
							if (b == END_TEXT[i]) {
								i++;
							} else i = 0;
							buffer.write(b);
							if (i >= END_TEXT.length) {
								flag = 18;
								// length of the content buffer as it stands, closing tag included
								meta.setLength(buffer.getLength());
								i = 0;
							}
						}

						// look for the closing </revision>
						else if (flag == 18) {
							if (b == END_REVISION[i]) {
								i++;
							} else i = 0;
							if (i >= END_REVISION.length) {
								// the flag is not anymore useful
								flag = 19;
								return Ack.PASSED_TO_NEXT_TAG;
							}
						}
					}
				}
			}
		}
	}

	/**
	 * Extractor that copies the header fields of a revision into a
	 * {@link LinkProfile} and adds every {@code [[...]]} span of the revision
	 * text that {@code Link.convert} accepts.
	 */
	public static class LinkExtractor implements
	ETLExtractor<LongWritable, LinkProfile, RevisionHeader> {

		private static final Logger LOG = Logger.getLogger(LinkExtractor.class);
		private static final byte[] OPEN_BRACKET = "[[".getBytes(StandardCharsets.UTF_8);
		private static final byte[] CLOSE_BRACKET = "]]".getBytes(StandardCharsets.UTF_8);

		/**
		 * Scores how much the current revision differs from the previous one.
		 *
		 * @param curMeta  header of the current revision
		 * @param prevMeta header of the previous revision, may be {@code null}
		 * @return {@code 1} when there is no previous revision or its length is
		 *         zero, {@code 0.0005} for a minor revision, otherwise the
		 *         absolute length difference relative to the previous length
		 */
		@Override
		public float check(RevisionHeader curMeta, RevisionHeader prevMeta) {		
			if (prevMeta == null || prevMeta.getLength() == 0) return 1f;
			if (curMeta.isMinor()) return 0.0005f;
			return Math.abs(curMeta.getLength() - prevMeta.getLength()) / (float)prevMeta.getLength();
		}

		/**
		 * Fills {@code key} and {@code value} from one revision.
		 *
		 * The key is set to the page id; the value is cleared, receives the
		 * header fields and then one link per {@code [[...]]} span found in
		 * {@code content} for which {@code Link.convert} returns non-null.
		 *
		 * @param content bytes of the revision text
		 * @param meta    header of the revision
		 * @param key     output key
		 * @param value   output value
		 * @return {@code false} when {@code meta} is {@code null} or reports a
		 *         zero length, {@code true} otherwise (also when an I/O error
		 *         was logged while collecting links)
		 */
		@Override
		public boolean extract(DataOutputBuffer content, RevisionHeader meta,
				LongWritable key, LinkProfile value) {
			if (meta == null || meta.getLength() == 0) {
				return false;
			}
			// add meta-data		
			key.set(meta.getPageId());

			value.clear();

			value.setNamespace(meta.getNamespace());
			value.setPageId(meta.getPageId());
			value.setPageTitle(meta.getPageTitle());
			value.setParentId(meta.getParentId());
			value.setRevisionId(meta.getRevisionId());
			value.setTimestamp(meta.getTimestamp());

			// add content (here the list of links)	
			byte[] bytes = content.getData();
			int len = content.getLength();
			// number of bytes of the currently searched bracket token matched so far
			int i = 0;

			// flag = 1: not see [[ or has passed ]] token
			// flag = 2: seen [[ but not ]] yet
			int flag = 1;
			// the buffer is closed automatically, also when writing fails
			try (DataOutputBuffer linkBuffer = new DataOutputBuffer()) {
				for (int cursor = 0; cursor < len; cursor++) {
					byte b = bytes[cursor];
					if (flag == 1) {				
						if (b == OPEN_BRACKET[i]) {
							i++;					
						} else i = 0;
						if (i >= OPEN_BRACKET.length) {
							flag = 2;
							i = 0;
						}
					}
					else if (flag == 2) {
						if (b == CLOSE_BRACKET[i]) {
							i++;					
						} else i = 0;
						linkBuffer.write(b);
						if (i >= CLOSE_BRACKET.length) {						
							// the closing brackets were buffered too; leave them out
							String linkText = new String(linkBuffer.getData(), 0,
									linkBuffer.getLength() - CLOSE_BRACKET.length,
									StandardCharsets.UTF_8);
							Link l = Link.convert(linkText, false);
							if (l != null) {
								value.addLink(l);
							}
							linkBuffer.reset();
							flag = 1;
							i = 0;					
						}
					}		
				}
			} catch (IOException e) {
				// keep the cause so the failure can be diagnosed from the log
				LOG.error("Error extracting link from revision: [" 
						+ value.getPageId() + ", rev: " + value.getRevisionId() + "]", e);
			}
			return true;
		}
	}
	
}
Tech Fingerprint

Alerts (38)

'=' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
81 97 110 125 129 136 140 153 171 187 200 204 213 217 229 241 254 313
'==' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
92 104 121 131 135 139 147 165 182 196 206 210 216 224 235 248 317
Complexity hotspot; line 131 (total complexity: 7)
131
Complexity hotspot; line 206 (total complexity: 7)
206
'.close()' Manual .close() call detected. Prefer using try-with-resources (Java 7+) for automatic and safer resource management, especially handling exceptions during close.
341