PageRenderTime 35ms CodeModel.GetById 1ms app.highlight 30ms RepoModel.GetById 1ms app.codeStats 0ms

/java/main/org/hedera/io/input/WikiRevisionPageInputFormat.java

https://github.com/giangbinhtran/Hedera
Java | 430 lines | 349 code | 37 blank | 44 comment | 240 complexity | 09d77394279cd9398c0d04a84bb0c907 MD5 | raw file
  1package org.hedera.io.input;
  2
  3
  4import java.io.IOException;
  5
  6import org.apache.hadoop.fs.FSDataInputStream;
  7import org.apache.hadoop.io.DataOutputBuffer;
  8import org.apache.hadoop.io.LongWritable;
  9import org.apache.hadoop.io.compress.CompressionInputStream;
 10import org.apache.hadoop.mapreduce.InputSplit;
 11import org.apache.hadoop.mapreduce.RecordReader;
 12import org.apache.hadoop.mapreduce.TaskAttemptContext;
 13import org.hedera.io.Revision;
 14
 15public class WikiRevisionPageInputFormat extends 
 16		WikiRevisionInputFormat<LongWritable, Revision> {
 17
 18	@Override
 19	public RecordReader<LongWritable, Revision> createRecordReader(InputSplit split, 
 20			TaskAttemptContext context) {
 21		return new RevisionReader();
 22	}
 23
 24	/**
	 * Reads each revision of a Wikipedia page and transforms it into a {@link Revision} object.
 26	 * @author tuan
 27	 * 
 28	 */ 
 29	public static class RevisionReader extends WikiRevisionReader<Revision> {
 30
		// Extra flags: 
		// 
		// Tracks which of the two candidate tags is still being matched while the
		// scanner is in state [flag = 18], i.e. after a closing revision tag:
		// -1 - Unmatched
		//  1 - Matched <revision> tag partially (START_REVISION)
		//  2 - Matched </page> tag partially (END_PAGE)
		//  3 - Matched both <revision> and </page> partially
		private int revOrPage = -1;

		// Tracks which of the two candidate tags is still being matched while the
		// scanner is in state [flag = 11], i.e. after the revision id:
		// -1 - Unmatched
		//  1 - Matched <parentId> tag partially (START_PARENT_ID)
		//  2 - Matched <timestamp> tag partially (START_TIMESTAMP)
		//  3 - Matched both <parentId> and <timestamp> partially
		private int parOrTs = -1;

		// We now convert and cache everything from pageHeader into the following fields.
		// NOTE: they all need to be kept in sync with pageHeader !!
		// private DataOutputBuffer pageHeader = new DataOutputBuffer();
		// Bytes read in state 3, up to and including END_TITLE.
		private DataOutputBuffer pageTitle = new DataOutputBuffer();		
		// Bytes read in state 5, up to and including END_NAMESPACE.
		private DataOutputBuffer nsBuf = new DataOutputBuffer();

		//////////////////////////////////////////////////////////////
		// END PageHeader variables
		//////////////////////////////////////////////////////////////

		// Bytes of the revision id, read in state 10, up to and including END_ID.
		private DataOutputBuffer revBuf = new DataOutputBuffer();			
		// Bytes of the timestamp, read in state 12, up to and including END_TIMESTAMP.
		private DataOutputBuffer timestampBuf = new DataOutputBuffer();		
		// Bytes of the parent id, read in state 14, up to and including END_PARENT_ID.
		private DataOutputBuffer parBuf = new DataOutputBuffer();		
		// Bytes of the revision text, read in state 16, up to and including END_TEXT.
		private DataOutputBuffer contentBuf = new DataOutputBuffer();
 61
 62		@Override
 63		public void initialize(InputSplit input, TaskAttemptContext tac)
 64				throws IOException, InterruptedException {
 65			super.initialize(input, tac);
 66			value = new Revision(); 
 67		}
 68
 69		private void resetEverything() {			
 70			keyBuf.reset();
 71			pageTitle.reset();
 72			value.clear();
 73			contentBuf.reset();
 74			parBuf.reset();
 75			timestampBuf.reset();
 76			revBuf.reset();
 77			nsBuf.reset();
 78			pageTitle.reset();
 79			skipped = false;
 80			revOrPage = -1;
 81			parOrTs = -1;
 82		}
 83
 84		@Override
 85		public STATE doWhenMatch() throws IOException, InterruptedException {
 86			if (flag == 19) {
 87				resetEverything();
 88			} 
 89			else if (flag == 18) {
 90				if (!skipped)
 91					return STATE.STOP_TRUE;
 92			}
 93			else if (flag == 17) {
 94				if (!skipped) {
 95					value.loadText(contentBuf.getData(), 0, contentBuf.getLength() 
 96							- END_TEXT.length);
 97				}
 98
 99				// reset big chunk of data right away to save memory
100				contentBuf.reset();
101
102			}
103			else if (flag == 15) {
104				if (!skipped) {
105					String parIdStr = new String(parBuf.getData(), 0, parBuf.getLength() 
106							- END_PARENT_ID.length);
107					long parId = Long.parseLong(parIdStr);
108					value.setParentId(parId);
109				}
110				parBuf.reset();
111			}
112			else if (flag == 13) {
113				if (!skipped) {
114					String ts = new String(timestampBuf.getData(), 0, timestampBuf.getLength() 
115							- END_TIMESTAMP.length);
116					long timestamp = TIME_FORMAT.parseMillis(ts);
117					value.setTimestamp(timestamp);
118				}
119				timestampBuf.reset();
120			}
121			else if (flag == 11) {
122				if (!skipped) {
123					String idStr = new String(revBuf.getData(), 0, revBuf.getLength()
124							- END_ID.length);
125					long revId = Long.parseLong(idStr);
126					value.setRevisionId(revId);
127				}
128				revBuf.reset();
129			}
130			else if (flag == 8) {
131				if (!skipped) {
132					String idStr = new String(keyBuf.getData(), 0, keyBuf.getLength()
133							- END_ID.length);
134					long pageId = Long.parseLong(idStr);
135					key.set(pageId);
136					value.setPageId(pageId);
137				}
138				keyBuf.reset();
139			}
140			else if (flag == 6) {
141				String nsStr = new String(nsBuf.getData(), 0, nsBuf.getLength()
142						- END_NAMESPACE.length);
143				int namespace = Integer.parseInt(nsStr);
144				if (namespace == 0) {
145					skipped = skipNonArticles;					
146				}
147				value.setNamespace(namespace);
148			}
149			else if (flag == 4) {
150				String title = new String(pageTitle.getData(), 0, pageTitle.getLength()
151						- END_TITLE.length);
152				value.setPageTitle(title);
153
154				pageTitle.reset();
155			}
156			else if (flag == -1) {
157				return STATE.STOP_FALSE;
158			} 
159			return STATE.CONTINUE;
160		}
161
162		// Scan the tags in SAX manner. Return at every legit tag and inform the program via 
163		// the global flag. Flush into the caches if necessary
164		@Override
165		protected boolean readUntilMatch() throws IOException {
166			if (buf == null && pos.length != 2)
167				throw new IOException("Internal buffer corrupted.");
168			int i = 0;
169			while (true) {
170				if (pos[0] == pos[1]) {				
171					pos[1] = (compressed) ? ((CompressionInputStream)fsin).read(buf) :
172						((FSDataInputStream)fsin).read(buf);
173					pos[0] = 0;
174					if (pos[1] == -1) {
175						return false;
176					}
177				} 
178				while (pos[0] < pos[1]) {
179					byte b = buf[pos[0]];
180					pos[0]++;
181
182					// ignore every character until reaching a new page
183					if (flag == 1 || flag == 19) {
184						if (b == START_PAGE[i]) {
185							i++;
186							if (i >= START_PAGE.length) {
187								flag = 2;
188								return true;
189							}
190						} else i = 0;
191					}
192
193					else if (flag == 2) {
194						if (b == START_TITLE[i]) {
195							i++;
196						} else i = 0;
197						if (i >= START_TITLE.length) {
198							flag = 3;
199							return true;
200						}
201					}
202
203					// put everything between <title></title> block into title
204					else if (flag == 3) {
205						if (b == END_TITLE[i]) {
206							i++;
207						} else i = 0;
208						pageTitle.write(b);
209						if (i >= END_TITLE.length) {
210							flag = 4;
211							return true;
212						}
213					}
214
215					else if (flag == 4) {
216						if (b == START_NAMESPACE[i]) {
217							i++;
218						} else i = 0;
219						if (i >= START_NAMESPACE.length) {
220							flag = 5;
221							return true;
222						}
223					}
224
225					// everything within <ns></ns> block goes into nsBuf
226					else if (flag == 5) {
227						if (b == END_NAMESPACE[i]) {
228							i++;
229						} else i = 0;
230						nsBuf.write(b);
231						if (i >= END_NAMESPACE.length) {
232							flag = 6;
233							return true;
234						}
235					}
236					
237					// when passing the namespace and we realize that 
238					// this is not an article, and that the option of skipping
239					// non-article pages is on, we simply skip everything till
240					// the closing </page>
241					else if (skipped && flag >= 6 && flag < 19) {
242						if (b == END_PAGE[i]) {
243							i++;
244						} else i = 0;
245						if (i >= END_PAGE.length) {
246							flag = 19;
247							return true;
248						}
249					}
250
251					else if (flag == 6) {
252						if (b == START_ID[i]) {
253							i++;
254						} else i = 0;
255						if (i >= START_ID.length) {
256							flag = 7;
257							return true;
258						}
259					}
260
261					// put everything in outer <id></id> block into keyBuf
262					else if (flag == 7) {
263						if (b == END_ID[i]) {
264							i++;
265						} else i = 0;
266						keyBuf.write(b);
267						if (i >= END_ID.length) {
268							flag = 8;
269							return true;
270						}
271					}
272
273					else if (flag == 8) {
274						if (b == START_REVISION[i]) {
275							i++;
276						} else i = 0;
277						if (i >= START_REVISION.length) {
278							flag = 9;
279							return true;
280						}
281					}
282
283					// inside <revision></revision> block, first check for id
284					else if (flag == 9) {
285						if (b == START_ID[i]) {
286							i++;
287						} else i = 0;
288						if (i >= START_ID.length) {
289							flag = 10;
290							return true;
291						}
292					}
293
294					// everything inside the inner <id></id> block goes to revision buffer
295					else if (flag == 10) {
296						if (b == END_ID[i]) {
297							i++;
298						} else i = 0;
299						revBuf.write(b);
300						if (i >= END_ID.length) {
301							flag = 11;
302							return true;
303						}
304					}
305
306					// after the inner <id>, check for either <timestamp> or <parentId>
307					else if (flag == 11) {
308						int curMatch = 0;				
309						if ((i < START_PARENT_ID.length && b == START_PARENT_ID[i]) 
310								&& (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i])) {
311							curMatch = 3;
312						} else if (i < START_PARENT_ID.length && b == START_PARENT_ID[i]) {
313							curMatch = 1;
314						} else if (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i]) {
315							curMatch = 2;
316						}				
317						if (curMatch > 0 && (i == 0 || parOrTs == 3 || curMatch == parOrTs)) {					
318							i++;			
319							parOrTs = curMatch;
320						} else i = 0;
321						if ((parOrTs == 2 || parOrTs == 3) && i >= START_TIMESTAMP.length) {
322							flag = 12;
323							parOrTs = -1;
324							return true;							
325						} else if ((parOrTs == 1 || parOrTs == 3) && i >= START_PARENT_ID.length) {
326							flag = 14;
327							parOrTs = -1;
328							return true;
329						}		
330					}
331
332					// inside <timestamp></timestamp> block everything goes to timestamp buffer
333					else if (flag == 12) {
334						if (b == END_TIMESTAMP[i]) {
335							i++;
336						} else i = 0;
337						timestampBuf.write(b);
338						if (i >= END_TIMESTAMP.length) {
339							flag = 13;
340							return true;
341						}
342					}
343
344					// inside <parentId></parentId> block everything goes to parentId buffer
345					else if (flag == 14) {
346						if (b == END_PARENT_ID[i]) {
347							i++;
348						} else i = 0;
349						parBuf.write(b);
350						if (i >= END_PARENT_ID.length) {
351							flag = 15;
352							return true;
353						}
354					}
355
356					// after the </parentId>, search for <timestamp>
357					else if (flag == 15) {
358						if (b == START_TIMESTAMP[i]) {
359							i++;
360						} else i = 0;
361						if (i >= START_TIMESTAMP.length) {
362							flag = 12;
363							return true;
364						}
365					}
366
367					// after the </timestamp>, check for <text>
368					else if (flag == 13) {
369						if (b == START_TEXT[i]) {
370							i++;
371						} else i = 0;
372						if (i >= START_TEXT.length) {
373							flag = 16;
374							return true;
375						}
376					}
377
378					// inside <text></text> block everything goes to content buffer
379					else if (flag == 16) {
380						if (b == END_TEXT[i]) {
381							i++;
382						} else i = 0;
383						contentBuf.write(b);
384						if (i >= END_TEXT.length) {
385							flag = 17;
386							return true;
387						}
388					}
389
390					// look for the closing </revision>
391					else if (flag == 17) {
392						if (b == END_REVISION[i]) {
393							i++;
394						} else i = 0;
395						if (i >= END_REVISION.length) {
396							flag = 18;
397							return true;
398						}
399					}
400
401					// Flag 16 can be the signal of a new record inside one old page
402					else if (flag == 18) {
403						int curMatch = 0;				
404						if ((i < END_PAGE.length && b == END_PAGE[i]) 
405								&& (i < START_REVISION.length && b == START_REVISION[i])) {
406							curMatch = 3;
407						} else if (i < END_PAGE.length && b == END_PAGE[i]) {
408							curMatch = 2;
409						} else if (i < START_REVISION.length && b == START_REVISION[i]) {
410							curMatch = 1;
411						}				
412						if (curMatch > 0 && (i == 0 || revOrPage == 3 || curMatch == revOrPage)) {					
413							i++;			
414							revOrPage = curMatch;
415						} else i = 0;
416						if ((revOrPage == 2 || revOrPage == 3) && i >= END_PAGE.length) {
417							flag = 19;
418							revOrPage = -1;
419							return true;							
420						} else if ((revOrPage == 1 || revOrPage == 3) && i >= START_REVISION.length) {
421							flag = 9;
422							revOrPage = -1;
423							return true;
424						}				
425					} 
426				}		
427			}
428		}
429	}
430}