/tags/release-0.0.0-rc0/src/test/org/apache/hcatalog/pig/TestHCatLoader.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hcatalog.pig;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import junit.framework.TestCase;

import org.apache.hadoop.hive.cli.CliSessionState;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.Driver;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hcatalog.MiniCluster;
import org.apache.hcatalog.data.Pair;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.UDFContext;
public class TestHCatLoader extends TestCase {

  private static final String BASIC_TABLE = "junit_unparted_basic";
  private static final String COMPLEX_TABLE = "junit_unparted_complex";
  private static final String PARTITIONED_TABLE = "junit_parted_basic";
  private static MiniCluster cluster = MiniCluster.buildCluster();
  private static Driver driver;
  private static Properties props;

  private static final String basicFile = "/tmp/basic.input.data";
  private static final String complexFile = "/tmp/complex.input.data";
  private static String fullFileNameBasic;
  private static String fullFileNameComplex;

  private static int guardTestCount = 5; // number of test methods in this class; ideally derived via introspection in guardedSetUpBeforeClass
  private static boolean setupHasRun = false;

  private static Map<Integer,Pair<Integer,String>> basicInputData;
  private void dropTable(String tablename) throws IOException{
    driver.run("drop table "+tablename);
  }

  private void createTable(String tablename, String schema, String partitionedBy) throws IOException{
    String createTable;
    createTable = "create table "+tablename+"("+schema+") ";
    if ((partitionedBy != null)&&(!partitionedBy.trim().isEmpty())){
      createTable = createTable + "partitioned by ("+partitionedBy+") ";
    }
    createTable = createTable + "stored as RCFILE tblproperties('hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver'," +
        "'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver') ";
    int retCode = driver.run(createTable).getResponseCode();
    if(retCode != 0) {
      throw new IOException("Failed to create table. ["+createTable+"], return code from hive driver : ["+retCode+"]");
    }
  }

  private void createTable(String tablename, String schema) throws IOException{
    createTable(tablename,schema,null);
  }

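  // JUnit 3's TestCase has no @BeforeClass/@AfterClass, so class-level setup is
  // emulated here: setUp() calls this before every test, and the setupHasRun flag
  // ensures the Hive tables and MiniCluster input files are created only once.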
  protected void guardedSetUpBeforeClass() throws Exception {
    if (!setupHasRun){
      setupHasRun = true;
    }else{
      return;
    }

    HiveConf hiveConf = new HiveConf(this.getClass());
    hiveConf.set(HiveConf.ConfVars.PREEXECHOOKS.varname, "");
    hiveConf.set(HiveConf.ConfVars.POSTEXECHOOKS.varname, "");
    hiveConf.set(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY.varname, "false");
    driver = new Driver(hiveConf);
    SessionState.start(new CliSessionState(hiveConf));
    props = new Properties();
    props.setProperty("fs.default.name", cluster.getProperties().getProperty("fs.default.name"));
    fullFileNameBasic = cluster.getProperties().getProperty("fs.default.name") + basicFile;
    fullFileNameComplex = cluster.getProperties().getProperty("fs.default.name") + complexFile;

    cleanup();

    createTable(BASIC_TABLE,"a int, b string");
    createTable(COMPLEX_TABLE,
        "name string, studentid int, "
        + "contact struct<phno:string,email:string>, "
        + "currently_registered_courses array<string>, "
        + "current_grades map<string,string>, "
        + "phnos array<struct<phno:string,type:string>>");

    createTable(PARTITIONED_TABLE,"a int, b string","bkt string");

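    // Generate the basic input: a 3x3 cross product of a=1..3 and b="S1S".."S3S"
    // (nine tab-separated rows). The same values are kept in basicInputData,
    // keyed by row index, so the read tests can verify what comes back.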
    int LOOP_SIZE = 3;
    String[] input = new String[LOOP_SIZE*LOOP_SIZE];
    basicInputData = new HashMap<Integer,Pair<Integer,String>>();
    int k = 0;
    for(int i = 1; i <= LOOP_SIZE; i++) {
      String si = i + "";
      for(int j=1;j<=LOOP_SIZE;j++) {
        String sj = "S"+j+"S";
        input[k] = si + "\t" + sj;
        basicInputData.put(k, new Pair<Integer,String>(i,sj));
        k++;
      }
    }
    MiniCluster.createInputFile(cluster, basicFile, input);

    MiniCluster.createInputFile(cluster, complexFile,
        new String[]{
        //"Henry Jekyll\t42\t(415-253-6367,hjekyll@contemporary.edu.uk)\t{(PHARMACOLOGY),(PSYCHIATRY)},[PHARMACOLOGY#A-,PSYCHIATRY#B+],{(415-253-6367,cell),(408-253-6367,landline)}",
        //"Edward Hyde\t1337\t(415-253-6367,anonymous@b44chan.org)\t{(CREATIVE_WRITING),(COPYRIGHT_LAW)},[CREATIVE_WRITING#A+,COPYRIGHT_LAW#D],{(415-253-6367,cell),(408-253-6367,landline)}",
        }
    );

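    // Use Pig in local mode to populate the tables through HCatStorer: the basic rows
    // go into BASIC_TABLE and COMPLEX_TABLE, and the same basic rows are split by the
    // predicate a < 2 into partitions bkt=0 and bkt=1 of PARTITIONED_TABLE.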
    PigServer server = new PigServer(ExecType.LOCAL, props);
    UDFContext.getUDFContext().setClientSystemProps();
    server.setBatchOn();
    server.registerQuery("A = load '"+fullFileNameBasic+"' as (a:int, b:chararray);");

    server.registerQuery("store A into '"+BASIC_TABLE+"' using org.apache.hcatalog.pig.HCatStorer();");
    server.registerQuery("B = foreach A generate a,b;");
    server.registerQuery("B2 = filter B by a < 2;");
    server.registerQuery("store B2 into '"+PARTITIONED_TABLE+"' using org.apache.hcatalog.pig.HCatStorer('bkt=0');");

    server.registerQuery("C = foreach A generate a,b;");
    server.registerQuery("C2 = filter C by a >= 2;");
    server.registerQuery("store C2 into '"+PARTITIONED_TABLE+"' using org.apache.hcatalog.pig.HCatStorer('bkt=1');");

    server.registerQuery("D = load '"+fullFileNameComplex+"' as (name:chararray, studentid:int, contact:tuple(phno:chararray,email:chararray), currently_registered_courses:bag{innertup:tuple(course:chararray)}, current_grades:map[ ] , phnos :bag{innertup:tuple(phno:chararray,type:chararray)});");
    server.registerQuery("store D into '"+COMPLEX_TABLE+"' using org.apache.hcatalog.pig.HCatStorer();");
    server.executeBatch();

  }
  private void cleanup() throws IOException {
    MiniCluster.deleteFile(cluster, basicFile);
    MiniCluster.deleteFile(cluster, complexFile);
    dropTable(BASIC_TABLE);
    dropTable(COMPLEX_TABLE);
    dropTable(PARTITIONED_TABLE);
  }

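  // Mirror of guardedSetUpBeforeClass: tearDown() runs after every test, and the
  // countdown only triggers cleanup() once the last of the guardTestCount tests
  // has finished.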
  protected void guardedTearDownAfterClass() throws Exception {
    guardTestCount--;
    if (guardTestCount > 0){
      return;
    }
    cleanup();
  }

  @Override
  protected void setUp() throws Exception {
    guardedSetUpBeforeClass();
  }

  @Override
  protected void tearDown() throws Exception {
    guardedTearDownAfterClass();
  }

  public void testSchemaLoadBasic() throws IOException{

    PigServer server = new PigServer(ExecType.LOCAL, props);

    // test that schema was loaded correctly
    server.registerQuery("X = load '"+BASIC_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
    Schema dumpedXSchema = server.dumpSchema("X");
    List<FieldSchema> Xfields = dumpedXSchema.getFields();
    assertEquals(2,Xfields.size());
    assertTrue(Xfields.get(0).alias.equalsIgnoreCase("a"));
    assertTrue(Xfields.get(0).type == DataType.INTEGER);
    assertTrue(Xfields.get(1).alias.equalsIgnoreCase("b"));
    assertTrue(Xfields.get(1).type == DataType.CHARARRAY);

  }

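  // Read every row of the unpartitioned basic table back through HCatLoader and
  // compare it, in order, against the basicInputData generated during setup.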
  public void testReadDataBasic() throws IOException {
    PigServer server = new PigServer(ExecType.LOCAL, props);

    server.registerQuery("X = load '"+BASIC_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
    Iterator<Tuple> XIter = server.openIterator("X");
    int numTuplesRead = 0;
    while( XIter.hasNext() ){
      Tuple t = XIter.next();
      assertEquals(2,t.size());
      assertTrue(t.get(0).getClass() == Integer.class);
      assertTrue(t.get(1).getClass() == String.class);
      assertEquals(t.get(0),basicInputData.get(numTuplesRead).first);
      assertEquals(t.get(1),basicInputData.get(numTuplesRead).second);
      numTuplesRead++;
    }
    assertEquals(basicInputData.size(),numTuplesRead);
  }

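  // The complex table exercises the Hive-to-Pig type mapping surfaced by HCatLoader:
  // struct -> tuple, array -> bag of tuples, map -> map, with field names from the
  // Hive schema preserved where the metadata carries them.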
  public void testSchemaLoadComplex() throws IOException{

    PigServer server = new PigServer(ExecType.LOCAL, props);

    // test that schema was loaded correctly
    server.registerQuery("K = load '"+COMPLEX_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
    Schema dumpedKSchema = server.dumpSchema("K");
    List<FieldSchema> Kfields = dumpedKSchema.getFields();
    assertEquals(6,Kfields.size());

    assertEquals(DataType.CHARARRAY,Kfields.get(0).type);
    assertEquals("name",Kfields.get(0).alias.toLowerCase());

    assertEquals(DataType.INTEGER,Kfields.get(1).type);
    assertEquals("studentid",Kfields.get(1).alias.toLowerCase());

    assertEquals(DataType.TUPLE,Kfields.get(2).type);
    assertEquals("contact",Kfields.get(2).alias.toLowerCase());
    {
      assertNotNull(Kfields.get(2).schema);
      assertTrue(Kfields.get(2).schema.getFields().size() == 2);
      assertTrue(Kfields.get(2).schema.getFields().get(0).type == DataType.CHARARRAY);
      assertTrue(Kfields.get(2).schema.getFields().get(0).alias.equalsIgnoreCase("phno"));
      assertTrue(Kfields.get(2).schema.getFields().get(1).type == DataType.CHARARRAY);
      assertTrue(Kfields.get(2).schema.getFields().get(1).alias.equalsIgnoreCase("email"));
    }
    assertEquals(DataType.BAG,Kfields.get(3).type);
    assertEquals("currently_registered_courses",Kfields.get(3).alias.toLowerCase());
    {
      assertNotNull(Kfields.get(3).schema);
      assertEquals(1,Kfields.get(3).schema.getFields().size());
      assertEquals(DataType.TUPLE,Kfields.get(3).schema.getFields().get(0).type);
      assertNotNull(Kfields.get(3).schema.getFields().get(0).schema);
      assertEquals(1,Kfields.get(3).schema.getFields().get(0).schema.getFields().size());
      assertEquals(DataType.CHARARRAY,Kfields.get(3).schema.getFields().get(0).schema.getFields().get(0).type);
      // assertEquals("course",Kfields.get(3).schema.getFields().get(0).schema.getFields().get(0).alias.toLowerCase());
      // The assertion above is commented out because the alias becomes "innerfield" by default: we call the
      // field "course" in Pig, but it is anonymous in the metadata, so the name is autogenerated, which is fine.
    }
    assertEquals(DataType.MAP,Kfields.get(4).type);
    assertEquals("current_grades",Kfields.get(4).alias.toLowerCase());
    assertEquals(DataType.BAG,Kfields.get(5).type);
    assertEquals("phnos",Kfields.get(5).alias.toLowerCase());
    {
      assertNotNull(Kfields.get(5).schema);
      assertEquals(1,Kfields.get(5).schema.getFields().size());
      assertEquals(DataType.TUPLE,Kfields.get(5).schema.getFields().get(0).type);
      assertNotNull(Kfields.get(5).schema.getFields().get(0).schema);
      assertTrue(Kfields.get(5).schema.getFields().get(0).schema.getFields().size() == 2);
      assertEquals(DataType.CHARARRAY,Kfields.get(5).schema.getFields().get(0).schema.getFields().get(0).type);
      assertEquals("phno",Kfields.get(5).schema.getFields().get(0).schema.getFields().get(0).alias.toLowerCase());
      assertEquals(DataType.CHARARRAY,Kfields.get(5).schema.getFields().get(0).schema.getFields().get(1).type);
      assertEquals("type",Kfields.get(5).schema.getFields().get(0).schema.getFields().get(1).alias.toLowerCase());
    }

  }

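  // The partitioned table is read three ways: via the Hive driver, via HCatLoader with no
  // filter (the partition column bkt appears as a trailing chararray field), and via
  // HCatLoader with a filter on bkt, which should return only the matching partition.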
  public void testReadPartitionedBasic() throws IOException {
    PigServer server = new PigServer(ExecType.LOCAL, props);

    driver.run("select * from "+PARTITIONED_TABLE);
    ArrayList<String> valuesReadFromHiveDriver = new ArrayList<String>();
    driver.getResults(valuesReadFromHiveDriver);
    assertEquals(basicInputData.size(),valuesReadFromHiveDriver.size());

    server.registerQuery("W = load '"+PARTITIONED_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
    Schema dumpedWSchema = server.dumpSchema("W");
    List<FieldSchema> Wfields = dumpedWSchema.getFields();
    assertEquals(3,Wfields.size());
    assertTrue(Wfields.get(0).alias.equalsIgnoreCase("a"));
    assertTrue(Wfields.get(0).type == DataType.INTEGER);
    assertTrue(Wfields.get(1).alias.equalsIgnoreCase("b"));
    assertTrue(Wfields.get(1).type == DataType.CHARARRAY);
    assertTrue(Wfields.get(2).alias.equalsIgnoreCase("bkt"));
    assertTrue(Wfields.get(2).type == DataType.CHARARRAY);

    Iterator<Tuple> WIter = server.openIterator("W");
    Collection<Pair<Integer,String>> valuesRead = new ArrayList<Pair<Integer,String>>();
    while( WIter.hasNext() ){
      Tuple t = WIter.next();
      assertTrue(t.size() == 3);
      assertTrue(t.get(0).getClass() == Integer.class);
      assertTrue(t.get(1).getClass() == String.class);
      assertTrue(t.get(2).getClass() == String.class);
      valuesRead.add(new Pair<Integer,String>((Integer)t.get(0),(String)t.get(1)));
      if ((Integer)t.get(0) < 2){
        assertEquals("0",t.get(2));
      }else{
        assertEquals("1",t.get(2));
      }
    }
    assertEquals(valuesReadFromHiveDriver.size(),valuesRead.size());

    server.registerQuery("P1 = load '"+PARTITIONED_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
    server.registerQuery("P1filter = filter P1 by bkt == '0';");
    Iterator<Tuple> P1Iter = server.openIterator("P1filter");
    int count1 = 0;
    while( P1Iter.hasNext() ) {
      Tuple t = P1Iter.next();

      assertEquals("0", t.get(2));
      assertEquals(1, t.get(0));
      count1++;
    }
    assertEquals(3, count1);

    server.registerQuery("P2 = load '"+PARTITIONED_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
    server.registerQuery("P2filter = filter P2 by bkt == '1';");
    Iterator<Tuple> P2Iter = server.openIterator("P2filter");
    int count2 = 0;
    while( P2Iter.hasNext() ) {
      Tuple t = P2Iter.next();

      assertEquals("1", t.get(2));
      assertTrue(((Integer) t.get(0)) > 1);
      count2++;
    }
    assertEquals(6, count2);
  }

  public void testProjectionsBasic() throws IOException {

    PigServer server = new PigServer(ExecType.LOCAL, props);

    // projections are handled by using generate, not "as" on the Load

    server.registerQuery("Y1 = load '"+BASIC_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
    server.registerQuery("Y2 = foreach Y1 generate a;");
    server.registerQuery("Y3 = foreach Y1 generate b,a;");
    Schema dumpedY2Schema = server.dumpSchema("Y2");
    Schema dumpedY3Schema = server.dumpSchema("Y3");
    List<FieldSchema> Y2fields = dumpedY2Schema.getFields();
    List<FieldSchema> Y3fields = dumpedY3Schema.getFields();
    assertEquals(1,Y2fields.size());
    assertEquals("a",Y2fields.get(0).alias.toLowerCase());
    assertEquals(DataType.INTEGER,Y2fields.get(0).type);
    assertEquals(2,Y3fields.size());
    assertEquals("b",Y3fields.get(0).alias.toLowerCase());
    assertEquals(DataType.CHARARRAY,Y3fields.get(0).type);
    assertEquals("a",Y3fields.get(1).alias.toLowerCase());
    assertEquals(DataType.INTEGER,Y3fields.get(1).type);

    int numTuplesRead = 0;
    Iterator<Tuple> Y2Iter = server.openIterator("Y2");
    while( Y2Iter.hasNext() ){
      Tuple t = Y2Iter.next();
      assertEquals(t.size(),1);
      assertTrue(t.get(0).getClass() == Integer.class);
      assertEquals(t.get(0),basicInputData.get(numTuplesRead).first);
      numTuplesRead++;
    }
    numTuplesRead = 0;
    Iterator<Tuple> Y3Iter = server.openIterator("Y3");
    while( Y3Iter.hasNext() ){
      Tuple t = Y3Iter.next();
      assertEquals(t.size(),2);
      assertTrue(t.get(0).getClass() == String.class);
      assertEquals(t.get(0),basicInputData.get(numTuplesRead).second);
      assertTrue(t.get(1).getClass() == Integer.class);
      assertEquals(t.get(1),basicInputData.get(numTuplesRead).first);
      numTuplesRead++;
    }
    assertEquals(basicInputData.size(),numTuplesRead);
  }
}