PageRenderTime 42ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/tags/release-0.0.0-rc0/src/test/org/apache/hcatalog/pig/TestHCatLoader.java

#
Java | 381 lines | 304 code | 52 blank | 25 comment | 27 complexity | 2ee95e5b9d06641f1863080ca26c97fa MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause, JSON, CPL-1.0
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
  18. package org.apache.hcatalog.pig;
  19. import java.io.IOException;
  20. import java.util.ArrayList;
  21. import java.util.Collection;
  22. import java.util.HashMap;
  23. import java.util.Iterator;
  24. import java.util.List;
  25. import java.util.Map;
  26. import java.util.Properties;
  27. import junit.framework.TestCase;
  28. import org.apache.hadoop.hive.cli.CliSessionState;
  29. import org.apache.hadoop.hive.conf.HiveConf;
  30. import org.apache.hadoop.hive.ql.Driver;
  31. import org.apache.hadoop.hive.ql.session.SessionState;
  32. import org.apache.hcatalog.MiniCluster;
  33. import org.apache.hcatalog.data.Pair;
  34. import org.apache.pig.ExecType;
  35. import org.apache.pig.PigServer;
  36. import org.apache.pig.data.DataType;
  37. import org.apache.pig.data.Tuple;
  38. import org.apache.pig.impl.logicalLayer.schema.Schema;
  39. import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
  40. import org.apache.pig.impl.util.UDFContext;
  41. public class TestHCatLoader extends TestCase {
  42. private static final String BASIC_TABLE = "junit_unparted_basic";
  43. private static final String COMPLEX_TABLE = "junit_unparted_complex";
  44. private static final String PARTITIONED_TABLE = "junit_parted_basic";
  45. private static MiniCluster cluster = MiniCluster.buildCluster();
  46. private static Driver driver;
  47. private static Properties props;
  48. private static final String basicFile = "/tmp/basic.input.data";
  49. private static final String complexFile = "/tmp/complex.input.data";
  50. private static String fullFileNameBasic;
  51. private static String fullFileNameComplex;
  52. private static int guardTestCount = 5; // ugh, instantiate using introspection in guardedSetupBeforeClass
  53. private static boolean setupHasRun = false;
  54. private static Map<Integer,Pair<Integer,String>> basicInputData;
  55. private void dropTable(String tablename) throws IOException{
  56. driver.run("drop table "+tablename);
  57. }
  58. private void createTable(String tablename, String schema, String partitionedBy) throws IOException{
  59. String createTable;
  60. createTable = "create table "+tablename+"("+schema+") ";
  61. if ((partitionedBy != null)&&(!partitionedBy.trim().isEmpty())){
  62. createTable = createTable + "partitioned by ("+partitionedBy+") ";
  63. }
  64. createTable = createTable + "stored as RCFILE tblproperties('hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver'," +
  65. "'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver') ";
  66. int retCode = driver.run(createTable).getResponseCode();
  67. if(retCode != 0) {
  68. throw new IOException("Failed to create table. ["+createTable+"], return code from hive driver : ["+retCode+"]");
  69. }
  70. }
  71. private void createTable(String tablename, String schema) throws IOException{
  72. createTable(tablename,schema,null);
  73. }
  74. protected void guardedSetUpBeforeClass() throws Exception {
  75. if (!setupHasRun){
  76. setupHasRun = true;
  77. }else{
  78. return;
  79. }
  80. HiveConf hiveConf = new HiveConf(this.getClass());
  81. hiveConf.set(HiveConf.ConfVars.PREEXECHOOKS.varname, "");
  82. hiveConf.set(HiveConf.ConfVars.POSTEXECHOOKS.varname, "");
  83. hiveConf.set(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY.varname, "false");
  84. driver = new Driver(hiveConf);
  85. SessionState.start(new CliSessionState(hiveConf));
  86. props = new Properties();
  87. props.setProperty("fs.default.name", cluster.getProperties().getProperty("fs.default.name"));
  88. fullFileNameBasic = cluster.getProperties().getProperty("fs.default.name") + basicFile;
  89. fullFileNameComplex = cluster.getProperties().getProperty("fs.default.name") + complexFile;
  90. cleanup();
  91. createTable(BASIC_TABLE,"a int, b string");
  92. createTable(COMPLEX_TABLE,
  93. "name string, studentid int, "
  94. + "contact struct<phno:string,email:string>, "
  95. + "currently_registered_courses array<string>, "
  96. + "current_grades map<string,string>, "
  97. + "phnos array<struct<phno:string,type:string>>");
  98. createTable(PARTITIONED_TABLE,"a int, b string","bkt string");
  99. int LOOP_SIZE = 3;
  100. String[] input = new String[LOOP_SIZE*LOOP_SIZE];
  101. basicInputData = new HashMap<Integer,Pair<Integer,String>>();
  102. int k = 0;
  103. for(int i = 1; i <= LOOP_SIZE; i++) {
  104. String si = i + "";
  105. for(int j=1;j<=LOOP_SIZE;j++) {
  106. String sj = "S"+j+"S";
  107. input[k] = si + "\t" + sj;
  108. basicInputData.put(k, new Pair<Integer,String>(i,sj));
  109. k++;
  110. }
  111. }
  112. MiniCluster.createInputFile(cluster, basicFile, input);
  113. MiniCluster.createInputFile(cluster, complexFile,
  114. new String[]{
  115. //"Henry Jekyll\t42\t(415-253-6367,hjekyll@contemporary.edu.uk)\t{(PHARMACOLOGY),(PSYCHIATRY)},[PHARMACOLOGY#A-,PSYCHIATRY#B+],{(415-253-6367,cell),(408-253-6367,landline)}",
  116. //"Edward Hyde\t1337\t(415-253-6367,anonymous@b44chan.org)\t{(CREATIVE_WRITING),(COPYRIGHT_LAW)},[CREATIVE_WRITING#A+,COPYRIGHT_LAW#D],{(415-253-6367,cell),(408-253-6367,landline)}",
  117. }
  118. );
  119. PigServer server = new PigServer(ExecType.LOCAL, props);
  120. UDFContext.getUDFContext().setClientSystemProps();
  121. server.setBatchOn();
  122. server.registerQuery("A = load '"+fullFileNameBasic+"' as (a:int, b:chararray);");
  123. server.registerQuery("store A into '"+BASIC_TABLE+"' using org.apache.hcatalog.pig.HCatStorer();");
  124. server.registerQuery("B = foreach A generate a,b;");
  125. server.registerQuery("B2 = filter B by a < 2;");
  126. server.registerQuery("store B2 into '"+PARTITIONED_TABLE+"' using org.apache.hcatalog.pig.HCatStorer('bkt=0');");
  127. server.registerQuery("C = foreach A generate a,b;");
  128. server.registerQuery("C2 = filter C by a >= 2;");
  129. server.registerQuery("store C2 into '"+PARTITIONED_TABLE+"' using org.apache.hcatalog.pig.HCatStorer('bkt=1');");
  130. server.registerQuery("D = load '"+fullFileNameComplex+"' as (name:chararray, studentid:int, contact:tuple(phno:chararray,email:chararray), currently_registered_courses:bag{innertup:tuple(course:chararray)}, current_grades:map[ ] , phnos :bag{innertup:tuple(phno:chararray,type:chararray)});");
  131. server.registerQuery("store D into '"+COMPLEX_TABLE+"' using org.apache.hcatalog.pig.HCatStorer();");
  132. server.executeBatch();
  133. }
  134. private void cleanup() throws IOException {
  135. MiniCluster.deleteFile(cluster, basicFile);
  136. MiniCluster.deleteFile(cluster, complexFile);
  137. dropTable(BASIC_TABLE);
  138. dropTable(COMPLEX_TABLE);
  139. dropTable(PARTITIONED_TABLE);
  140. }
  141. protected void guardedTearDownAfterClass() throws Exception {
  142. guardTestCount--;
  143. if (guardTestCount > 0){
  144. return;
  145. }
  146. cleanup();
  147. }
  148. @Override
  149. protected void setUp() throws Exception {
  150. guardedSetUpBeforeClass();
  151. }
  152. @Override
  153. protected void tearDown() throws Exception {
  154. guardedTearDownAfterClass();
  155. }
  156. public void testSchemaLoadBasic() throws IOException{
  157. PigServer server = new PigServer(ExecType.LOCAL, props);
  158. // test that schema was loaded correctly
  159. server.registerQuery("X = load '"+BASIC_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
  160. Schema dumpedXSchema = server.dumpSchema("X");
  161. List<FieldSchema> Xfields = dumpedXSchema.getFields();
  162. assertEquals(2,Xfields.size());
  163. assertTrue(Xfields.get(0).alias.equalsIgnoreCase("a"));
  164. assertTrue(Xfields.get(0).type == DataType.INTEGER);
  165. assertTrue(Xfields.get(1).alias.equalsIgnoreCase("b"));
  166. assertTrue(Xfields.get(1).type == DataType.CHARARRAY);
  167. }
  168. public void testReadDataBasic() throws IOException {
  169. PigServer server = new PigServer(ExecType.LOCAL, props);
  170. server.registerQuery("X = load '"+BASIC_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
  171. Iterator<Tuple> XIter = server.openIterator("X");
  172. int numTuplesRead = 0;
  173. while( XIter.hasNext() ){
  174. Tuple t = XIter.next();
  175. assertEquals(2,t.size());
  176. assertTrue(t.get(0).getClass() == Integer.class);
  177. assertTrue(t.get(1).getClass() == String.class);
  178. assertEquals(t.get(0),basicInputData.get(numTuplesRead).first);
  179. assertEquals(t.get(1),basicInputData.get(numTuplesRead).second);
  180. numTuplesRead++;
  181. }
  182. assertEquals(basicInputData.size(),numTuplesRead);
  183. }
  184. public void testSchemaLoadComplex() throws IOException{
  185. PigServer server = new PigServer(ExecType.LOCAL, props);
  186. // test that schema was loaded correctly
  187. server.registerQuery("K = load '"+COMPLEX_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
  188. Schema dumpedKSchema = server.dumpSchema("K");
  189. List<FieldSchema> Kfields = dumpedKSchema.getFields();
  190. assertEquals(6,Kfields.size());
  191. assertEquals(DataType.CHARARRAY,Kfields.get(0).type);
  192. assertEquals("name",Kfields.get(0).alias.toLowerCase());
  193. assertEquals( DataType.INTEGER,Kfields.get(1).type);
  194. assertEquals("studentid",Kfields.get(1).alias.toLowerCase());
  195. assertEquals(DataType.TUPLE,Kfields.get(2).type);
  196. assertEquals("contact",Kfields.get(2).alias.toLowerCase());
  197. {
  198. assertNotNull(Kfields.get(2).schema);
  199. assertTrue(Kfields.get(2).schema.getFields().size() == 2);
  200. assertTrue(Kfields.get(2).schema.getFields().get(0).type == DataType.CHARARRAY);
  201. assertTrue(Kfields.get(2).schema.getFields().get(0).alias.equalsIgnoreCase("phno"));
  202. assertTrue(Kfields.get(2).schema.getFields().get(1).type == DataType.CHARARRAY);
  203. assertTrue(Kfields.get(2).schema.getFields().get(1).alias.equalsIgnoreCase("email"));
  204. }
  205. assertEquals(DataType.BAG,Kfields.get(3).type);
  206. assertEquals("currently_registered_courses",Kfields.get(3).alias.toLowerCase());
  207. {
  208. assertNotNull(Kfields.get(3).schema);
  209. assertEquals(1,Kfields.get(3).schema.getFields().size());
  210. assertEquals(DataType.TUPLE,Kfields.get(3).schema.getFields().get(0).type);
  211. assertNotNull(Kfields.get(3).schema.getFields().get(0).schema);
  212. assertEquals(1,Kfields.get(3).schema.getFields().get(0).schema.getFields().size());
  213. assertEquals(DataType.CHARARRAY,Kfields.get(3).schema.getFields().get(0).schema.getFields().get(0).type);
  214. // assertEquals("course",Kfields.get(3).schema.getFields().get(0).schema.getFields().get(0).alias.toLowerCase());
  215. // commented out, because the name becomes "innerfield" by default - we call it "course" in pig,
  216. // but in the metadata, it'd be anonymous, so this would be autogenerated, which is fine
  217. }
  218. assertEquals(DataType.MAP,Kfields.get(4).type);
  219. assertEquals("current_grades",Kfields.get(4).alias.toLowerCase());
  220. assertEquals(DataType.BAG,Kfields.get(5).type);
  221. assertEquals("phnos",Kfields.get(5).alias.toLowerCase());
  222. {
  223. assertNotNull(Kfields.get(5).schema);
  224. assertEquals(1,Kfields.get(5).schema.getFields().size());
  225. assertEquals(DataType.TUPLE,Kfields.get(5).schema.getFields().get(0).type);
  226. assertNotNull(Kfields.get(5).schema.getFields().get(0).schema);
  227. assertTrue(Kfields.get(5).schema.getFields().get(0).schema.getFields().size() == 2);
  228. assertEquals(DataType.CHARARRAY,Kfields.get(5).schema.getFields().get(0).schema.getFields().get(0).type);
  229. assertEquals("phno",Kfields.get(5).schema.getFields().get(0).schema.getFields().get(0).alias.toLowerCase());
  230. assertEquals(DataType.CHARARRAY,Kfields.get(5).schema.getFields().get(0).schema.getFields().get(1).type);
  231. assertEquals("type",Kfields.get(5).schema.getFields().get(0).schema.getFields().get(1).alias.toLowerCase());
  232. }
  233. }
  234. public void testReadPartitionedBasic() throws IOException {
  235. PigServer server = new PigServer(ExecType.LOCAL, props);
  236. driver.run("select * from "+PARTITIONED_TABLE);
  237. ArrayList<String> valuesReadFromHiveDriver = new ArrayList<String>();
  238. driver.getResults(valuesReadFromHiveDriver);
  239. assertEquals(basicInputData.size(),valuesReadFromHiveDriver.size());
  240. server.registerQuery("W = load '"+PARTITIONED_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
  241. Schema dumpedWSchema = server.dumpSchema("W");
  242. List<FieldSchema> Wfields = dumpedWSchema.getFields();
  243. assertEquals(3,Wfields.size());
  244. assertTrue(Wfields.get(0).alias.equalsIgnoreCase("a"));
  245. assertTrue(Wfields.get(0).type == DataType.INTEGER);
  246. assertTrue(Wfields.get(1).alias.equalsIgnoreCase("b"));
  247. assertTrue(Wfields.get(1).type == DataType.CHARARRAY);
  248. assertTrue(Wfields.get(2).alias.equalsIgnoreCase("bkt"));
  249. assertTrue(Wfields.get(2).type == DataType.CHARARRAY);
  250. Iterator<Tuple> WIter = server.openIterator("W");
  251. Collection<Pair<Integer,String>> valuesRead = new ArrayList<Pair<Integer,String>>();
  252. while( WIter.hasNext() ){
  253. Tuple t = WIter.next();
  254. assertTrue(t.size() == 3);
  255. assertTrue(t.get(0).getClass() == Integer.class);
  256. assertTrue(t.get(1).getClass() == String.class);
  257. assertTrue(t.get(2).getClass() == String.class);
  258. valuesRead.add(new Pair<Integer,String>((Integer)t.get(0),(String)t.get(1)));
  259. if ((Integer)t.get(0) < 2){
  260. assertEquals("0",t.get(2));
  261. }else{
  262. assertEquals("1",t.get(2));
  263. }
  264. }
  265. assertEquals(valuesReadFromHiveDriver.size(),valuesRead.size());
  266. server.registerQuery("P1 = load '"+PARTITIONED_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
  267. server.registerQuery("P1filter = filter P1 by bkt == '0';");
  268. Iterator<Tuple> P1Iter = server.openIterator("P1filter");
  269. int count1 = 0;
  270. while( P1Iter.hasNext() ) {
  271. Tuple t = P1Iter.next();
  272. assertEquals("0", t.get(2));
  273. assertEquals(1, t.get(0));
  274. count1++;
  275. }
  276. assertEquals(3, count1);
  277. server.registerQuery("P2 = load '"+PARTITIONED_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
  278. server.registerQuery("P2filter = filter P2 by bkt == '1';");
  279. Iterator<Tuple> P2Iter = server.openIterator("P2filter");
  280. int count2 = 0;
  281. while( P2Iter.hasNext() ) {
  282. Tuple t = P2Iter.next();
  283. assertEquals("1", t.get(2));
  284. assertTrue(((Integer) t.get(0)) > 1);
  285. count2++;
  286. }
  287. assertEquals(6, count2);
  288. }
  289. public void testProjectionsBasic() throws IOException {
  290. PigServer server = new PigServer(ExecType.LOCAL, props);
  291. // projections are handled by using generate, not "as" on the Load
  292. server.registerQuery("Y1 = load '"+BASIC_TABLE+"' using org.apache.hcatalog.pig.HCatLoader();");
  293. server.registerQuery("Y2 = foreach Y1 generate a;");
  294. server.registerQuery("Y3 = foreach Y1 generate b,a;");
  295. Schema dumpedY2Schema = server.dumpSchema("Y2");
  296. Schema dumpedY3Schema = server.dumpSchema("Y3");
  297. List<FieldSchema> Y2fields = dumpedY2Schema.getFields();
  298. List<FieldSchema> Y3fields = dumpedY3Schema.getFields();
  299. assertEquals(1,Y2fields.size());
  300. assertEquals("a",Y2fields.get(0).alias.toLowerCase());
  301. assertEquals(DataType.INTEGER,Y2fields.get(0).type);
  302. assertEquals(2,Y3fields.size());
  303. assertEquals("b",Y3fields.get(0).alias.toLowerCase());
  304. assertEquals(DataType.CHARARRAY,Y3fields.get(0).type);
  305. assertEquals("a",Y3fields.get(1).alias.toLowerCase());
  306. assertEquals(DataType.INTEGER,Y3fields.get(1).type);
  307. int numTuplesRead = 0;
  308. Iterator<Tuple> Y2Iter = server.openIterator("Y2");
  309. while( Y2Iter.hasNext() ){
  310. Tuple t = Y2Iter.next();
  311. assertEquals(t.size(),1);
  312. assertTrue(t.get(0).getClass() == Integer.class);
  313. assertEquals(t.get(0),basicInputData.get(numTuplesRead).first);
  314. numTuplesRead++;
  315. }
  316. numTuplesRead = 0;
  317. Iterator<Tuple> Y3Iter = server.openIterator("Y3");
  318. while( Y3Iter.hasNext() ){
  319. Tuple t = Y3Iter.next();
  320. assertEquals(t.size(),2);
  321. assertTrue(t.get(0).getClass() == String.class);
  322. assertEquals(t.get(0),basicInputData.get(numTuplesRead).second);
  323. assertTrue(t.get(1).getClass() == Integer.class);
  324. assertEquals(t.get(1),basicInputData.get(numTuplesRead).first);
  325. numTuplesRead++;
  326. }
  327. assertEquals(basicInputData.size(),numTuplesRead);
  328. }
  329. }