PageRenderTime 74ms CodeModel.GetById 20ms app.highlight 33ms RepoModel.GetById 16ms app.codeStats 0ms

/java/test/org/hedera/LocalIntervalWikiRevisionETLReader.java

https://github.com/giangbinhtran/Hedera
Java | 251 lines | 192 code | 21 blank | 38 comment | 132 complexity | d25c0836c24e82727cc1a85c76449cff MD5 | raw file
  1package org.hedera;
  2
import static org.hedera.io.input.WikiRevisionInputFormat.END_PARENT_ID;
import static org.hedera.io.input.WikiRevisionInputFormat.MINOR_TAG;
import static org.hedera.io.input.WikiRevisionInputFormat.START_PARENT_ID;
import static org.hedera.io.input.WikiRevisionInputFormat.TIME_FORMAT;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.DataOutputBuffer;
import org.hedera.io.RevisionHeader;
 12
 13/**
 14 * A WikiRevsionETLReader that skips all revisions out of a specific range
 15 * @author tuan
 16 *
 17 */
 18public abstract class LocalIntervalWikiRevisionETLReader<KEYIN, VALUEIN> extends
 19		LocalDefaultWikiRevisionETLReader<KEYIN, VALUEIN> {
 20
 21	public static final String START_TIME_OPT = "org.hedera.io.etl.starttime";
 22	public static final String END_TIME_OPT = "org.hedera.io.etl.endtime";
 23
 24	public static final String SCALE_OPT = "org.hedera.io.etl.bow.scale";
 25	public static final String HOUR_SCALE_OPT = "hour";
 26	public static final String DAY_SCALE_OPT = "day";
 27	public static final String WEEK_SCALE_OPT = "week";
 28	public static final String MONTH_SCALE_OPT = "month";
 29
 30	private long startTs = Long.MIN_VALUE;
 31	private long endTs = Long.MAX_VALUE;
 32
 33	@Override
 34	public void initialize() throws IOException {
 35		super.initialize();
 36		endTs = TIME_FORMAT.parseMillis("2005-03-28T07:41:42Z");
 37		startTs = TIME_FORMAT.parseMillis("2003-05-30T12:57:20Z");		
 38	}
 39
 40	@Override
 41	// -1: EOF
 42	// 9 - default
 43	// 10 - just passed the inner <id> tag inside <revision>
 44	// 11 - just passed the inner </id> tag inside <revision>
 45	// 12 - just passed the <timestamp>
 46	// 13 - just passed the </timestamp> tag
 47	// 14 - just passed the <parentId>
 48	// 15 - just passed the </parentId> tag
 49	// 16 - just passed the <minor/> (or not)
 50	// 17 - just passed the <text> tag
 51	// 18 - just passed the </text> tag
 52	// 19 - just passed the </revision>
 53	protected Ack readToNextRevision(DataOutputBuffer buffer, 
 54			RevisionHeader meta) throws IOException {
 55		int i = 0;
 56		int flag = 9;	
 57		int parOrTs = -1;
 58		int minorOrText = -1;
 59		try (DataOutputBuffer revIdBuf = new DataOutputBuffer(); 
 60				DataOutputBuffer timestampBuf = new DataOutputBuffer(); 
 61				DataOutputBuffer parBuf = new DataOutputBuffer()) {
 62
 63			while (true) {
 64				if (!fetchMore()) return Ack.EOF;
 65				while (hasData()) {
 66					byte b = nextByte();
 67					if (flag == 9) {
 68						if (b == START_ID[i]) {
 69							i++;
 70						} else i = 0;
 71						if (i >= START_ID.length) {
 72							flag = 10;
 73							i = 0;
 74						}
 75					}
 76
 77					// everything inside the inner <id></id> 
 78					// block goes to revision buffer
 79					else if (flag == 10) {
 80						if (b == END_ID[i]) {
 81							i++;
 82						} else i = 0;
 83						revIdBuf.write(b);
 84						if (i >= END_ID.length) {
 85							flag = 11;
 86							String idStr = new String(revIdBuf.getData(), 0, 
 87									revIdBuf.getLength() - END_ID.length);
 88							long revId = Long.parseLong(idStr);
 89							meta.setRevisionId(revId);
 90							revIdBuf.reset();
 91							i = 0;
 92						}
 93					}
 94
 95					// after the inner <id>, check for either <timestamp> or <parentId>
 96					else if (flag == 11) {
 97						int curMatch = 0;				
 98						if ((i < START_PARENT_ID.length && b == START_PARENT_ID[i]) 
 99								&& (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i])) {
100							curMatch = 3;
101						} else if (i < START_PARENT_ID.length && b == START_PARENT_ID[i]) {
102							curMatch = 1;
103						} else if (i < START_TIMESTAMP.length && b == START_TIMESTAMP[i]) {
104							curMatch = 2;
105						}				
106						if (curMatch > 0 && (i == 0 || parOrTs == 3 || curMatch == parOrTs)) {					
107							i++;			
108							parOrTs = curMatch;
109						} else i = 0;
110						if ((parOrTs == 2 || parOrTs == 3) && i >= START_TIMESTAMP.length) {
111							flag = 12;
112							parOrTs = -1;		
113							i = 0;
114						} else if ((parOrTs == 1 || parOrTs == 3) && i >= START_PARENT_ID.length) {
115							flag = 14;
116							parOrTs = -1;
117							i = 0;
118						}		
119					}
120
121					// inside <timestamp></timestamp> block everything goes to timestamp buffer
122					else if (flag == 12) {
123						if (b == END_TIMESTAMP[i]) {
124							i++;
125						} else i = 0;
126						timestampBuf.write(b);
127						if (i >= END_TIMESTAMP.length) {
128							flag = 13;
129							String ts = new String(timestampBuf.getData(), 0, 
130									timestampBuf.getLength() 
131									- END_TIMESTAMP.length);
132							long timestamp = TIME_FORMAT.parseMillis(ts);
133							if (timestamp < startTs || timestamp >= endTs) {
134								meta.clear();
135								return Ack.SKIPPED;
136							}							
137							meta.setTimestamp(timestamp);
138							timestampBuf.reset();
139							i = 0;
140						}
141					}
142
143					// inside <parentId></parentId> block everything goes to parentId buffer
144					else if (flag == 14) {
145						if (b == END_PARENT_ID[i]) {
146							i++;
147						} else i = 0;
148						parBuf.write(b);
149						if (i >= END_PARENT_ID.length) {
150							flag = 15;
151							String parIdStr = new String(parBuf.getData(), 0, parBuf.getLength() 
152									- END_PARENT_ID.length);
153							long parId = Long.parseLong(parIdStr);
154							meta.setParentId(parId);
155							parBuf.reset();
156							i = 0;
157						}
158					}
159
160					// after the </parentId>, search for <timestamp>
161					else if (flag == 15) {
162						if (b == START_TIMESTAMP[i]) {
163							i++;
164						} else i = 0;
165						if (i >= START_TIMESTAMP.length) {
166							flag = 12;
167							i = 0;
168						}
169					}
170
171					// After the timestamp, sometimes we can make a quick check to see
172					// whether we should  skip this revision
173
174					// after the </timestamp>, check for <minor/>, if they exist
175					else if (flag == 13) {
176						int curMatch = 0;				
177						if ((i < START_TEXT.length && b == START_TEXT[i]) 
178								&& (i < MINOR_TAG.length && b == MINOR_TAG[i])) {
179							curMatch = 3;
180						} else if (i < START_TEXT.length && b == START_TEXT[i]) {
181							curMatch = 1;
182						} else if (i < MINOR_TAG.length && b == MINOR_TAG[i]) {
183							curMatch = 2;
184						}				
185						if (curMatch > 0 && (i == 0 || minorOrText == 3 || curMatch == minorOrText)) {					
186							i++;			
187							minorOrText = curMatch;
188						} else i = 0;
189						if ((minorOrText == 2 || minorOrText == 3) && i >= MINOR_TAG.length) {
190							// update the meta
191							meta.setMinor(true);
192							flag = 16;
193							minorOrText = -1;		
194							i = 0;
195						} else if ((minorOrText == 1 || minorOrText == 3) && i >= START_TEXT.length) {
196							flag = 17;
197							minorOrText = -1;
198							i = 0;
199						}	
200					}
201
202					// after the <minor/>, and search for <text>
203					else if (flag == 16) {
204						if (b == START_TEXT[i]) {
205							i++;
206						} else i = 0;
207						if (i >= START_TEXT.length) {
208							flag = 17;
209							i = 0;
210						}
211					}
212
213					// inside <text></text> block everything goes to content buffer
214					else if (flag == 17) {
215						if (b == END_TEXT[i]) {
216							i++;
217						} else i = 0;
218						buffer.write(b);
219						if (i >= END_TEXT.length) {
220							flag = 18;							
221							meta.setLength(buffer.getLength());
222							processMetaData(buffer, meta);
223							i = 0;
224						}
225					}
226
227					// look for the closing </revision>
228					else if (flag == 18) {
229						if (b == END_REVISION[i]) {
230							i++;
231						} else i = 0;
232						if (i >= END_REVISION.length) {
233							// the flag is not anymore useful
234							flag = 19;
235							return Ack.PASSED_TO_NEXT_TAG;
236						}
237					}
238				}
239			}
240		}
241	}
242
243	/**
244	 * This method processes after caching the currently visited revision.
245	 * It performs the meta-data quick updates before the actual extraction 
246	 * (in WikiRevisionETLReader's code)
247	 * @param buffer
248	 * @param meta
249	 */
250	protected void processMetaData(DataOutputBuffer buffer, RevisionHeader meta) {}
251}