PageRenderTime 36ms CodeModel.GetById 1ms app.highlight 30ms RepoModel.GetById 1ms app.codeStats 1ms

/java/main/org/hedera/io/etl/RevisionLinkInputFormat.java

https://github.com/giangbinhtran/Hedera
Java | 350 lines | 285 code | 31 blank | 34 comment | 155 complexity | f7d43665bbbb70d98587bc0c5160a84f MD5 | raw file
  1package org.hedera.io.etl;
  2
  3import java.io.IOException;
  4import java.nio.charset.StandardCharsets;
  5
  6import org.apache.hadoop.conf.Configuration;
  7import org.apache.hadoop.fs.Path;
  8import org.apache.hadoop.io.DataOutputBuffer;
  9import org.apache.hadoop.io.LongWritable;
 10import org.apache.hadoop.io.compress.CompressionCodec;
 11import org.apache.hadoop.io.compress.SplittableCompressionCodec;
 12import org.apache.hadoop.mapreduce.InputSplit;
 13import org.apache.hadoop.mapreduce.JobContext;
 14import org.apache.hadoop.mapreduce.RecordReader;
 15import org.apache.hadoop.mapreduce.TaskAttemptContext;
 16import org.apache.log4j.Logger;
 17import org.hedera.io.LinkProfile;
 18import org.hedera.io.RevisionHeader;
 19import org.hedera.io.LinkProfile.Link;
 20import org.hedera.io.input.WikiRevisionInputFormat;
 21
/**
 * An input format that supports ETL-style reading and extracts the link
 * structure of each revision on the fly.
 */
 26public class RevisionLinkInputFormat extends 
 27		WikiRevisionInputFormat<LongWritable, LinkProfile> {
 28
 29	@Override
 30	public RecordReader<LongWritable, LinkProfile> createRecordReader(
 31			InputSplit input, TaskAttemptContext context) 
 32					throws IOException, InterruptedException {
 33		return new RevisionLinkReader();
 34	}
 35
 36	public static class RevisionLinkReader 
 37	extends DefaultRevisionETLReader<LongWritable, 
 38	LinkProfile> {
 39
 40		@Override
 41		protected LongWritable initializeKey() {		
 42			return new LongWritable();		
 43		}
 44
		/**
		 * Releases the key. This implementation is a no-op: nothing is done
		 * with the given key.
		 *
		 * @param key the key to release; ignored
		 */
		@Override
		protected void freeKey(LongWritable key) {		
		}
 48
		/**
		 * Releases the value by calling {@link LinkProfile#clear()} on it.
		 *
		 * @param value the link profile to clear; must not be {@code null}
		 */
		@Override
		protected void freeValue(LinkProfile value) {
			// Delegate the actual cleanup to the profile itself.
			value.clear();
		}
 53
 54		@Override
 55		protected LinkProfile initializeValue() {		
 56			return new LinkProfile();		
 57		}
 58
 59		@Override
 60		protected ETLExtractor<LongWritable, LinkProfile,
 61		RevisionHeader> initializeExtractor() {		
 62			return new LinkExtractor();		
 63		}
 64
 65		@Override
 66		// -1: EOF
 67		// 9 - default
 68		// 10 - just passed the inner <id> tag inside <revision>
 69		// 11 - just passed the inner </id> tag inside <revision>
 70		// 12 - just passed the <timestamp>
 71		// 13 - just passed the </timestamp> tag
 72		// 14 - just passed the <parentId>
 73		// 15 - just passed the </parentId> tag
 74		// 16 - just passed the <minor/> (or not)
 75		// 17 - just passed the <text> tag
 76		// 18 - just passed the </text> tag
 77		// 19 - just passed the </revision>
 78		protected Ack readToNextRevision(DataOutputBuffer buffer, 
 79				RevisionHeader meta) throws IOException {
 80			int i = 0;
 81			int flag = 9;	
 82			int parOrTs = -1;
 83			int minorOrText = -1;
 84			try (DataOutputBuffer revIdBuf = new DataOutputBuffer(); 
 85					DataOutputBuffer timestampBuf = new DataOutputBuffer(); 
 86					DataOutputBuffer parBuf = new DataOutputBuffer()) {
 87
 88				while (true) {
 89					if (!fetchMore()) return Ack.EOF;
 90					while (hasData()) {
 91						byte b = nextByte();
 92						if (flag == 9) {
 93							if (b == START_ID[i]) {
 94								i++;
 95							} else i = 0;
 96							if (i >= START_ID.length) {
 97								flag = 10;
 98								i = 0;
 99							}
100						}
101
102						// everything inside the inner <id></id> 
103						// block goes to revision buffer
104						else if (flag == 10) {
105							if (b == END_ID[i]) {
106								i++;
107							} else i = 0;
108							revIdBuf.write(b);
109							if (i >= END_ID.length) {
110								flag = 11;
111								String idStr = new String(revIdBuf.getData(), 0, 
112										revIdBuf.getLength() - END_ID.length);
113								long revId = Long.parseLong(idStr);
114								meta.setRevisionId(revId);
115								revIdBuf.reset();
116								i = 0;
117							}
118						}
119
120						// after the inner <id>, check for either <timestamp> or <parentId>
121						else if (flag == 11) {
122							int curMatch = 0;				
123							if ((i < START_PARENT_ID.length && b == START_PARENT_ID[i]) 
124									&& (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i])) {
125								curMatch = 3;
126							} else if (i < START_PARENT_ID.length && b == START_PARENT_ID[i]) {
127								curMatch = 1;
128							} else if (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i]) {
129								curMatch = 2;
130							}				
131							if (curMatch > 0 && (i == 0 || parOrTs == 3 || curMatch == parOrTs)) {					
132								i++;			
133								parOrTs = curMatch;
134							} else i = 0;
135							if ((parOrTs == 2 || parOrTs == 3) && i >= START_TIMESTAMP.length) {
136								flag = 12;
137								parOrTs = -1;		
138								i = 0;
139							} else if ((parOrTs == 1 || parOrTs == 3) && i >= START_PARENT_ID.length) {
140								flag = 14;
141								parOrTs = -1;
142								i = 0;
143							}		
144						}
145
146						// inside <timestamp></timestamp> block everything goes to timestamp buffer
147						else if (flag == 12) {
148							if (b == END_TIMESTAMP[i]) {
149								i++;
150							} else i = 0;
151							timestampBuf.write(b);
152							if (i >= END_TIMESTAMP.length) {
153								flag = 13;
154								String ts = new String(timestampBuf.getData(), 0, 
155										timestampBuf.getLength() 
156										- END_TIMESTAMP.length);
157								long timestamp = TIME_FORMAT.parseMillis(ts);
158								meta.setTimestamp(timestamp);
159								timestampBuf.reset();
160								i = 0;
161							}
162						}
163
164						// inside <parentId></parentId> block everything goes to parentId buffer
165						else if (flag == 14) {
166							if (b == END_PARENT_ID[i]) {
167								i++;
168							} else i = 0;
169							parBuf.write(b);
170							if (i >= END_PARENT_ID.length) {
171								flag = 15;
172								String parIdStr = new String(parBuf.getData(), 0, parBuf.getLength() 
173										- END_PARENT_ID.length);
174								long parId = Long.parseLong(parIdStr);
175								meta.setParentId(parId);
176								parBuf.reset();
177								i = 0;
178							}
179						}
180
181						// after the </parentId>, search for <timestamp>
182						else if (flag == 15) {
183							if (b == START_TIMESTAMP[i]) {
184								i++;
185							} else i = 0;
186							if (i >= START_TIMESTAMP.length) {
187								flag = 12;
188								i = 0;
189							}
190						}
191
192						// After the timestamp, sometimes we can make a quick check to see
193						// whether we should  skip this revision
194
195						// after the </timestamp>, check for <minor/>, if they exist
196						else if (flag == 13) {
197							int curMatch = 0;				
198							if ((i < START_TEXT.length && b == START_TEXT[i]) 
199									&& (i < MINOR_TAG.length && b == MINOR_TAG[i])) {
200								curMatch = 3;
201							} else if (i < START_TEXT.length && b == START_TEXT[i]) {
202								curMatch = 1;
203							} else if (i < MINOR_TAG.length && b == MINOR_TAG[i]) {
204								curMatch = 2;
205							}				
206							if (curMatch > 0 && (i == 0 || minorOrText == 3 || curMatch == minorOrText)) {					
207								i++;			
208								minorOrText = curMatch;
209							} else i = 0;
210							if ((minorOrText == 2 || minorOrText == 3) && i >= MINOR_TAG.length) {
211								// update the meta
212								meta.setMinor(true);
213								flag = 16;
214								minorOrText = -1;		
215								i = 0;
216							} else if ((minorOrText == 1 || minorOrText == 3) && i >= START_TEXT.length) {
217								flag = 17;
218								minorOrText = -1;
219								i = 0;
220							}	
221						}
222
223						// after the <minor/>, and search for <text>
224						else if (flag == 16) {
225							if (b == START_TEXT[i]) {
226								i++;
227							} else i = 0;
228							if (i >= START_TEXT.length) {
229								flag = 17;
230								i = 0;
231							}
232						}
233
234						// inside <text></text> block everything goes to content buffer
235						else if (flag == 17) {
236							if (b == END_TEXT[i]) {
237								i++;
238							} else i = 0;
239							buffer.write(b);
240							if (i >= END_TEXT.length) {
241								flag = 18;
242								meta.setLength(buffer.getLength());
243								i = 0;
244							}
245						}
246
247						// look for the closing </revision>
248						else if (flag == 18) {
249							if (b == END_REVISION[i]) {
250								i++;
251							} else i = 0;
252							if (i >= END_REVISION.length) {
253								// the flag is not anymore useful
254								flag = 19;
255								return Ack.PASSED_TO_NEXT_TAG;
256							}
257						}
258					}
259				}
260			}
261		}
262	}
263
264	public static class LinkExtractor implements
265	ETLExtractor<LongWritable, LinkProfile, RevisionHeader> {
266
267		private static final Logger LOG = Logger.getLogger(LinkExtractor.class);
268		private static final byte[] OPEN_BRACKET = "[[".getBytes(StandardCharsets.UTF_8);
269		private static final byte[] CLOSE_BRACKET = "]]".getBytes(StandardCharsets.UTF_8);
270
271		@Override
272		public float check(RevisionHeader curMeta, RevisionHeader prevMeta) {		
273			if (prevMeta == null || prevMeta.getLength() == 0) return 1f;
274			if (curMeta.isMinor()) return 0.0005f;
275			return Math.abs(curMeta.getLength() - prevMeta.getLength()) / (float)prevMeta.getLength();
276		}
277
278		@Override
279		public boolean extract(DataOutputBuffer content, RevisionHeader meta,
280				LongWritable key, LinkProfile value) {
281			if (meta == null || meta.getLength() == 0) {
282				return false;
283			}
284			// add meta-data		
285			key.set(meta.getPageId());
286
287			value.clear();
288
289			value.setNamespace(meta.getNamespace());
290			value.setPageId(meta.getPageId());
291			value.setPageTitle(meta.getPageTitle());
292			value.setParentId(meta.getParentId());
293			value.setRevisionId(meta.getRevisionId());
294			value.setTimestamp(meta.getTimestamp());
295
296			// add content (here the list of links)	
297			DataOutputBuffer linkBuffer = new DataOutputBuffer();
298			byte[] bytes = content.getData();
299			int len = content.getLength();
300			int i = 0;
301
302			// flag = 1: not see [[ or has passed ]] token
303			// flag = 2: seen [[ but not ]] yet
304			int flag = 1;
305			try {
306				for (int cursor = 0; cursor < len; cursor++) {
307					byte b = bytes[cursor];
308					if (flag == 1) {				
309						if (b == OPEN_BRACKET[i]) {
310							i++;					
311						} else i = 0;
312						if (i >= OPEN_BRACKET.length) {
313							flag = 2;
314							i = 0;
315						}
316					}
317					else if (flag == 2) {
318						if (b == CLOSE_BRACKET[i]) {
319							i++;					
320						} else i = 0;
321						linkBuffer.write(b);
322						if (i >= CLOSE_BRACKET.length) {						
323							String linkText = new String(linkBuffer.getData(), 0,
324									linkBuffer.getLength() - CLOSE_BRACKET.length,
325									StandardCharsets.UTF_8);
326							Link l = Link.convert(linkText, false);
327							if (l != null) {
328								value.addLink(l);
329							}
330							linkBuffer.reset();
331							flag = 1;
332							i = 0;					
333						}
334					}		
335				}
336			} catch (IOException e) {
337				LOG.error("Error extracting link from revision: [" 
338						+ value.getPageId() + ", rev: " + value.getRevisionId() + "]");
339			} finally {
340				try {
341					linkBuffer.close();
342				} catch (IOException e) {
343					LOG.warn("Cannot close link buffer afterwards.");
344				}
345			}
346			return true;
347		}
348	}
349	
350}