PageRenderTime 34ms CodeModel.GetById 2ms app.highlight 27ms RepoModel.GetById 1ms app.codeStats 0ms

/java/main/org/hedera/io/input/WikiRevisionDiffInputFormat.java

https://github.com/giangbinhtran/Hedera
Java | 513 lines | 392 code | 48 blank | 73 comment | 250 complexity | ea8a8861ba24a3da30f63afa5dc10251 MD5 | raw file
  1package org.hedera.io.input;
  2
  3import java.io.IOException;
  4import java.io.UnsupportedEncodingException;
  5import java.util.LinkedList;
  6import java.util.List;
  7
  8import org.apache.hadoop.fs.FSDataInputStream;
  9import org.apache.hadoop.io.DataOutputBuffer;
 10import org.apache.hadoop.io.LongWritable;
 11import org.apache.hadoop.io.compress.CompressionInputStream;
 12import org.apache.hadoop.mapreduce.InputSplit;
 13import org.apache.hadoop.mapreduce.RecordReader;
 14import org.apache.hadoop.mapreduce.TaskAttemptContext;
 15import org.hedera.io.RevisionDiffOld;
 16
 17import difflib.Delta;
 18import difflib.DiffUtils;
 19import difflib.Patch;
 20
 21public class WikiRevisionDiffInputFormat 
 22		extends WikiRevisionInputFormat<LongWritable, RevisionDiffOld> {
 23	
 24	@Override
 25	public RecordReader<LongWritable, RevisionDiffOld> createRecordReader(
 26			InputSplit input, TaskAttemptContext context) throws IOException,
 27			InterruptedException {
 28		return new DiffReader(); 
 29	}
 30
 31	/**
	 * Reads every pair of consecutive revisions and calculates their diff
	 * using Myers' algorithm. Returns a RevisionDiffOld which, among other fields,
	 * carries the list of deltas between the two texts.
 35	 *
 36	 * @author tuan
 37	 */
 38	// States of the flag:
 39	// 				
 40	// -1: EOF
 41	// 1 - outside the <page> tag
 42	// 2 - just passed the <page> tag but outside the <title>
 43	// 3 - just passed the <title> tag		
 44	// 4 - just passed the </title> tag but outside the <namespace>
 45	// 5 - just passed the <namespace>
 46	// 6 - just passed the </namespace> but outside the <id>
 47	// 7 - just passed the (page's) <id>
 48	// 8 - just passed the </id> tag but outside the <revision>	
 49	// 9 - just passed the (next) <revision>
 50	// 10 - just passed the inner <id> tag inside <revision>
 51	// 11 - just passed the inner </id> tag inside <revision>
 52	// 12 - just passed the <timestamp>
 53	// 13 - just passed the </timestamp> tag
 54	// 14 - just passed the <parentId>
 55	// 15 - just passed the </parentId> tag
 56	// 16 - just passed the <text> tag
 57	// 17 - just passed the </text> tag
 58	// 18 - just passed the </revision>
 59	// 19 - just passed the </page>
 60	public static class DiffReader extends WikiRevisionReader<RevisionDiffOld> {
 61
 62		// Extra flags:
 63		//
		// indicating the flow condition within [flag = 18]
 65		// -1 - Unmatched
 66		//  1 - Matched <revision> tag partially
 67		//  2 - Matched </page> tag partially
 68		//  3 - Matched both <revision> and </page> partially
 69		private int revOrPage = -1;
 70
		// indicating the flow condition within [flag = 11]
 72		// -1 - Unmatched
 73		//  1 - Matched <parentId> tag partially
 74		//  2 - Matched <timestamp> tag partially
 75		//  3 - Matched both <parentId> and <timestamp> partially
 76		private int parOrTs = -1;
 77
		// We now convert and cache everything from pageHeader to the following global variables
 79		// NOTE: they all need to be synchronized with pageHeader !!
 80		// private DataOutputBuffer pageHeader = new DataOutputBuffer();
 81		private DataOutputBuffer pageTitle = new DataOutputBuffer();
 82		private DataOutputBuffer nsBuf = new DataOutputBuffer();
 83		//////////////////////////////////////////////////////////////
 84		// END PageHeader variables
 85		//////////////////////////////////////////////////////////////
 86
 87		// buffer for handling consecutive revisions
 88		private DataOutputBuffer timestampBuf = new DataOutputBuffer();		
 89		private DataOutputBuffer revIdBuf = new DataOutputBuffer();		
 90		private DataOutputBuffer parBuf = new DataOutputBuffer();
 91
 92		private List<String> lastRevText = new LinkedList<>();
 93		private DataOutputBuffer contentBuf = new DataOutputBuffer();
 94		//////////////////////////////////////////////////////////////
 95		// END revision buffer variables
 96		//////////////////////////////////////////////////////////////
 97
 98		@Override
 99		public void initialize(InputSplit input, TaskAttemptContext tac)
100				throws IOException, InterruptedException {
101			super.initialize(input, tac);
102			value = new RevisionDiffOld(); 
103		}
104
105		private void resetEverything() {
106			revOrPage = -1;
107			parOrTs = -1;
108			nsBuf.reset();
109			timestampBuf.reset();
110			revIdBuf.reset();
111			parBuf.reset();
112			contentBuf.reset();
113			keyBuf.reset();
114			pageTitle.reset();
115			value.clear();	
116			lastRevText.clear();
117			skipped = false;
118		}
119
120		@Override
121		protected STATE doWhenMatch() throws IOException, InterruptedException {
122			if (flag == 19) {
123				resetEverything();
124			}
125
126			// emit the object when reaching </revision>
127			else if (flag == 18) {
128				if (!skipped)
129					return STATE.STOP_TRUE;
130			}
131
132			// calculating the diff and shift the revision text when seeing </text>
133			// inside the <revision> block
134			else if (flag == 17) {
135				if (!skipped) {
136					// create a mass number of strings
137					List<String> content = extractParagraph(contentBuf.getData(), 0,
138							contentBuf.getLength() - END_TEXT.length);
139
140					Patch patch = DiffUtils.diff(lastRevText, content);						
141					for (Delta d : patch.getDeltas()) {
142						value.add(d);
143					}						
144
145					lastRevText = content;						
146				}
147				// release big chunk of bytes here
148				contentBuf.reset();
149			}
150
151			else if (flag == 15) {
152				if (!skipped) {
153					String parIdStr = new String(parBuf.getData(), 0, parBuf.getLength() 
154							- END_PARENT_ID.length);
155					long parId = Long.parseLong(parIdStr);
156					value.setParentId(parId);
157				}
158				parBuf.reset();
159			}
160
161			else if (flag == 13) {
162				if (!skipped) {
163					String ts = new String(timestampBuf.getData(), 0, timestampBuf.getLength() 
164							- END_TIMESTAMP.length);
165					long timestamp = TIME_FORMAT.parseMillis(ts);
166					value.setTimestamp(timestamp);
167				}
168				timestampBuf.reset();
169			}
170
171			else if (flag == 11) {
172				if (!skipped) {
173					String idStr = new String(revIdBuf.getData(), 0, revIdBuf.getLength()
174							- END_ID.length);
175					long revId = Long.parseLong(idStr);
176					value.setRevisionId(revId);
177				}
178				revIdBuf.reset();
179			}
180
181			else if (flag == 8) {
182				if (!skipped) {
183					String idStr = new String(keyBuf.getData(), 0, keyBuf.getLength()
184							- END_ID.length);
185					long pageId = Long.parseLong(idStr);
186					key.set(pageId);
187					value.setPageId(pageId);
188				}
189				keyBuf.reset();
190			}
191
192			else if (flag == 6) {
193				String nsStr = new String(nsBuf.getData(), 0, nsBuf.getLength()
194						- END_NAMESPACE.length);
195				int ns = Integer.parseInt(nsStr);
196				if (ns == 0) {
197					skipped = skipNonArticles;					
198				}
199				value.setNamespace(ns);
200				nsBuf.reset();
201			}
202
203			else if (flag == 4) {
204				String title = new String(pageTitle.getData(), 0, pageTitle.getLength()
205						- END_TITLE.length);
206				value.setPageTitle(title);
207				pageTitle.reset();
208			}
209
210			else if (flag == -1) {
211				return STATE.STOP_FALSE;
212			}
213			return STATE.CONTINUE;
214
215		}
216
217		public static List<String> extractParagraph(byte[] b, int offset, int len) 
218				throws UnsupportedEncodingException {
219			List<String> res = new LinkedList<>();		
220			if (b != null && b.length > 0) {
221				int start = offset;
222				int i = offset;
223				while (i < len) {
224					char c = (char) (((b[i] & 0xFF) << 8) + (b[i+1] & 0xFF));
225					if (c == '\n') {
226						String s = new String(b,start,i,"UTF-8");
227						res.add(s);
228
229						while (Character.isWhitespace(c)) {
230							i += 2;
231							c = (char) (((b[i] & 0xFF) << 8) + (b[i+1] & 0xFF));							
232						}	
233						start = i;
234					}
235					else {
236						i += 2;
237					}
238				}
239				if (start < i) {
240					String s = new String(b,start,i,"UTF-8");
241					res.add(s);
242				}
243			}
244			return res;
245		}
246
247		@Override
248		protected boolean readUntilMatch() throws IOException {
249			if (buf == null && pos.length != 2)
250				throw new IOException("Internal buffer corrupted.");
251			int i = 0;
252			while (true) {
253				if (pos[0] == pos[1]) {				
254					pos[1] = (compressed) ? ((CompressionInputStream)fsin).read(buf) :
255						((FSDataInputStream)fsin).read(buf);
256					pos[0] = 0;
257					LOG.info(pos[1] + " bytes read from the stream...");
258					if (pos[1] == -1) {
259						return false;
260					}
261				} 
262				while (pos[0] < pos[1]) {
263					byte b = buf[pos[0]];
264					pos[0]++;
265
266					// ignore every character until reaching a new page
267					if (flag == 1 || flag == 19) {
268						if (b == START_PAGE[i]) {
269							i++;
270							if (i >= START_PAGE.length) {
271								flag = 2;
272								return true;
273							}
274						} else i = 0;
275					}
276
277					else if (flag == 2) {
278						if (b == START_TITLE[i]) {
279							i++;
280						} else i = 0;
281						if (i >= START_TITLE.length) {
282							flag = 3;
283							return true;
284						}
285					}
286
287					// put everything between <title></title> block into title
288					else if (flag == 3) {
289						if (b == END_TITLE[i]) {
290							i++;
291						} else i = 0;
292						pageTitle.write(b);
293						if (i >= END_TITLE.length) {
294							flag = 4;
295							return true;
296						}
297					}
298
299					else if (flag == 4) {
300						if (b == START_NAMESPACE[i]) {
301							i++;
302						} else i = 0;
303						if (i >= START_NAMESPACE.length) {
304							flag = 5;
305							return true;
306						}
307					}
308
309					else if (flag == 5) {
310						if (b == END_NAMESPACE[i]) {
311							i++;
312						} else i = 0;
313						nsBuf.write(b);
314						if (i >= END_NAMESPACE.length) {
315							flag = 6;
316							return true;
317						}
318					}
319					
320					// when passing the namespace and we realize that 
321					// this is not an article, and that the option of skipping
322					// non-article pages is on, we simply skip everything until
323					// the closing </page>
324					else if (skipped && flag >= 6 && flag < 19) {
325						if (b == END_PAGE[i]) {
326							i++;
327						} else i = 0;
328						if (i >= END_PAGE.length) {
329							flag = 19;
330							return true;
331						}
332					}
333
334					else if (flag == 6) {
335						if (b == START_ID[i]) {
336							i++;
337						} else i = 0;
338						if (i >= START_ID.length) {
339							flag = 7;
340							return true;
341						}
342					}
343
344					// put everything in outer <id></id> block into keyBuf
345					else if (flag == 7) {
346						if (b == END_ID[i]) {
347							i++;
348						} else i = 0;
349						keyBuf.write(b);
350						if (i >= END_ID.length) {
351							flag = 8;
352							return true;
353						}
354					}
355
356					else if (flag == 8) {
357						if (b == START_REVISION[i]) {
358							i++;
359						} else i = 0;
360						if (i >= START_REVISION.length) {
361							flag = 9;
362							return true;
363						}
364					}
365
366					// inside <revision></revision> block, first check for id
367					else if (flag == 9) {
368						if (b == START_ID[i]) {
369							i++;
370						} else i = 0;
371						if (i >= START_ID.length) {
372							flag = 10;
373							return true;
374						}
375					}
376
377					// everything inside the inner <id></id> block goes to revision buffer
378					else if (flag == 10) {
379						if (b == END_ID[i]) {
380							i++;
381						} else i = 0;
382						revIdBuf.write(b);
383						if (i >= END_ID.length) {
384							flag = 11;
385							return true;
386						}
387					}
388
389					// after the inner <id>, check for either <timestamp> or <parentId>
390					else if (flag == 11) {
391						int curMatch = 0;				
392						if ((i < START_PARENT_ID.length && b == START_PARENT_ID[i]) 
393								&& (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i])) {
394							curMatch = 3;
395						} else if (i < START_PARENT_ID.length && b == START_PARENT_ID[i]) {
396							curMatch = 1;
397						} else if (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i]) {
398							curMatch = 2;
399						}				
400						if (curMatch > 0 && (i == 0 || parOrTs == 3 || curMatch == parOrTs)) {					
401							i++;			
402							parOrTs = curMatch;
403						} else i = 0;
404						if ((parOrTs == 2 || parOrTs == 3) && i >= START_TIMESTAMP.length) {
405							flag = 12;
406							parOrTs = -1;
407							return true;							
408						} else if ((parOrTs == 1 || parOrTs == 3) && i >= START_PARENT_ID.length) {
409							flag = 14;
410							parOrTs = -1;
411							return true;
412						}		
413					}
414
415					// inside <timestamp></timestamp> block everything goes to timestamp buffer
416					else if (flag == 12) {
417						if (b == END_TIMESTAMP[i]) {
418							i++;
419						} else i = 0;
420						timestampBuf.write(b);
421						if (i >= END_TIMESTAMP.length) {
422							flag = 13;
423							return true;
424						}
425					}
426
427					// inside <parentId></parentId> block everything goes to parentId buffer
428					else if (flag == 14) {
429						if (b == END_PARENT_ID[i]) {
430							i++;
431						} else i = 0;
432						parBuf.write(b);
433						if (i >= END_PARENT_ID.length) {
434							flag = 15;
435							return true;
436						}
437					}
438
439					// after the </parentId>, search for <timestamp>
440					else if (flag == 15) {
441						if (b == START_TIMESTAMP[i]) {
442							i++;
443						} else i = 0;
444						if (i >= START_TIMESTAMP.length) {
445							flag = 12;
446							return true;
447						}
448					}
449
450					// after the </timestamp>, check for <text>
451					else if (flag == 13) {
452						if (b == START_TEXT[i]) {
453							i++;
454						} else i = 0;
455						if (i >= START_TEXT.length) {
456							flag = 16;
457							return true;
458						}
459					}
460
461					// inside <text></text> block everything goes to content buffer
462					else if (flag == 16) {
463						if (b == END_TEXT[i]) {
464							i++;
465						} else i = 0;
466						contentBuf.write(b);
467						if (i >= END_TEXT.length) {
468							flag = 17;
469							return true;
470						}
471					}
472
473					// look for the closing </revision>
474					else if (flag == 17) {
475						if (b == END_REVISION[i]) {
476							i++;
477						} else i = 0;
478						if (i >= END_REVISION.length) {
479							flag = 18;
480							return true;
481						}
482					}
483
484					// Flag 16 can be the signal of a new record inside one old page
485					else if (flag == 18) {
486						int curMatch = 0;				
487						if ((i < END_PAGE.length && b == END_PAGE[i]) 
488								&& (i < START_REVISION.length && b == START_REVISION[i])) {
489							curMatch = 3;
490						} else if (i < END_PAGE.length && b == END_PAGE[i]) {
491							curMatch = 2;
492						} else if (i < START_REVISION.length && b == START_REVISION[i]) {
493							curMatch = 1;
494						}				
495						if (curMatch > 0 && (i == 0 || revOrPage == 3 || curMatch == revOrPage)) {					
496							i++;			
497							revOrPage = curMatch;
498						} else i = 0;
499						if ((revOrPage == 2 || revOrPage == 3) && i >= END_PAGE.length) {
500							flag = 19;
501							revOrPage = -1;
502							return true;							
503						} else if ((revOrPage == 1 || revOrPage == 3) && i >= START_REVISION.length) {
504							flag = 9;
505							revOrPage = -1;
506							return true;
507						}				
508					} 
509				}		
510			}
511		}
512	}
513}