/test/e2e/pig/tests/nightly.conf
Perl | 4983 lines | 4534 code | 111 blank | 338 comment | 102 complexity | baec46f614bd4cfd568cf24e6026f5d7 MD5 | raw file
Possible License(s): Apache-2.0
- #!/usr/bin/env perl
- ############################################################################
- # Licensed to the Apache Software Foundation (ASF) under one or more
- # contributor license agreements. See the NOTICE file distributed with
- # this work for additional information regarding copyright ownership.
- # The ASF licenses this file to You under the Apache License, Version 2.0
- # (the "License"); you may not use this file except in compliance with
- # the License. You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- ###############################################################################
- # Nightly tests for pig.
- #
- #
- #PigSetup::setup();
- #my $me = `whoami`;
- #chomp $me;
- $cfg = {
- 'driver' => 'Pig',
- 'nummachines' => 5,
- 'verify_with_pig' => 1,
- 'verify_pig_version' => 'old',
- 'groups' => [
- {
- 'name' => 'Checkin',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- store a into ':OUTPATH:';\,
- },
- {
- 'num' => 2,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 50;
- d = filter b by age < 50;
- e = cogroup c by (name, age), d by (name, age) ;
- f = foreach e generate flatten(c), flatten(d);
- g = group f by registration;
- h = foreach g generate group, SUM(f.d::contributions);
- i = order h by $1;
- store i into ':OUTPATH:';\,
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'sortArgs' => ['-t', ' ', '-k', '2,2'],
- }
- ]
- },
- {
- 'name' => 'LoaderDefaultDir',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\a = load ':INPATH:/dir/studenttab10k' as (name, age, gpa);
- store a into ':OUTPATH:';\,
- },
- ]
- },
- {
- 'name' => 'LoaderPigStorageArg',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\a = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa);
- store a into ':OUTPATH:';\,
- },
- {
- # load with control character
- 'num' => 2,
- 'pig' => q#a = load ':INPATH:/singlefile/studentctrla10k' using PigStorage('\\u0001') as (name, age, gpa);
- store a into ':OUTPATH:';#,
- },
- {
- # load and store with control character
- 'num' => 3,
- 'pig' => q#a = load ':INPATH:/singlefile/studentctrla10k' using PigStorage('\\u0001') as (name, age, gpa);
- store a into ':OUTPATH:.intermediate' using PigStorage('\\u0001');
- b = load ':OUTPATH:.intermediate' using PigStorage('\\u0001') as (name, age, gpa);
- store b into ':OUTPATH:'; #,
- 'notmq' => 1,
- },
- ]
- },
- {
- # Results doctored, if you change this query you need to copy the
- # expected results into test/nightly/benchmarks
- 'name' => 'LoaderBinStorage',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate name, org.apache.pig.test.udf.evalfunc.Swap(name, age), TOKENIZE((chararray)name), org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, age);
- store b into ':OUTPATH:.intermediate' using BinStorage();
- c = load ':OUTPATH:.intermediate' using BinStorage();
- store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
- 'notmq' => 1,
- },
- ]
- },
- {
- # Results doctored, if you change this query you need to copy the
- # expected results into test/nightly/benchmarks
- 'name' => 'LoaderTextLoader',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/textdoc' using TextLoader();
- b = foreach a generate TOKENIZE((chararray)$0);
- store b into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
- },
- ]
- },
- {
- 'name' => 'FilterBoolean',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by name == 'fred allen' and age > 50;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 2,
- 'pig' => q\a = load ':INPATH:/dir/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by name != 'fred allen' or age < 10;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 3,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by not (age == 50);
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 4,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by (age >= 50 or name > 'fred') and (gpa <= 3.0 or name >= 'bob');
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 5,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by age >= 50 or name > 'fred' and gpa <= 3.0 or name >= 'bob';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test filter <= and >= for chararray, int and double
- {
- 'num' => 6,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
- b = filter a by age >= 40 and age <=50 and gpa >= 2.0 and gpa <= 3.0 and name >= 'bob' and name <= 'fred';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test filter <= and >= for bytearray, long and float
- {
- 'num' => 7,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float);
- b = filter a by age >= 40 and age <=50 and gpa >= 2.0f and gpa <= 3.0f and name >= 'bob' and name <= 'fred';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test filter < and > for chararray, int and double
- {
- 'num' => 8,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
- b = filter a by age > 40 and age <50 and gpa > 2.0 and gpa < 3.0 and name > 'bob' and name < 'fred';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test filter < and > for bytearray, long and float
- {
- 'num' => 9,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float);
- b = filter a by age > 40 and age <50 and gpa > 2.0f and gpa < 3.0f and name > 'bob' and name < 'fred';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test filter <= and >= for explicit cast for chararray, int and double
- {
- 'num' => 10,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by (int)age >= 40 and (int)age <=50 and (double)gpa >= 2.0 and (double)gpa <= 3.0 and (chararray)name >= 'bob' and (chararray)name <= 'fred';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test filter <= and >= for explicit cast for bytearray, long and float
- {
- 'num' => 11,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by (long)age >= 40 and (long)age <=50 and (float)gpa >= 2.0f and (float)gpa <= 3.0f and name >= 'bob' and name <= 'fred';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test filter < and > for explicit cast for chararray, int and double
- {
- 'num' => 12,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by (int)age > 40 and (int)age <50 and (double)gpa > 2.0 and (double)gpa < 3.0 and (chararray)name > 'bob' and (chararray)name < 'fred';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test filter < and > for explicit cast for bytearray, long and float
- {
- 'num' => 13,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by (long)age > 40 and (long)age <50 and (float)gpa > 2.0f and (float)gpa < 3.0f and name > 'bob' and name < 'fred';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test AND with nulls
- {
- 'num' => 14,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);
- b = filter a by name == 'fred allen' and age > 50;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test OR with nulls
- {
- 'num' => 15,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);
- b = filter a by name != 'fred allen' or age < 10;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test with nulls filter <= and >= for chararray, int and double
- {
- 'num' => 16,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
- b = filter a by age >= 40 and age <=50 and gpa >= 2.0 and gpa <= 3.0 and name >= 'bob' and name <= 'fred';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test with nulls filter < and > for explicit cast for chararray, int and double
- {
- 'num' => 17,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);
- b = filter a by (int)age > 40 and (int)age <50 and (double)gpa > 2.0 and (double)gpa < 3.0 and (chararray)name > 'bob' and (chararray)name < 'fred';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 18,
- 'ignore' => 1, # PIG-2593 this case is not supported as instate need to be declared as boolean
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
- b = filter a by instate;
- store b into ':OUTPATH:' using PigStorage;\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
- b = filter a by instate == 'true';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 19,
- 'ignore' => 1, # PIG-2593 this case is not supported as instate need to be declared as boolean
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
- b = filter a by not instate;
- store b into ':OUTPATH:' using PigStorage;\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
- b = filter a by instate == 'false';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 20,
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
- b = filter a by instate is null;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 21,
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
- b = filter a by instate == true;
- store b into ':OUTPATH:' using PigStorage;\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
- b = filter a by instate == 'true';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 22,
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
- b = filter a by instate == false;
- store b into ':OUTPATH:' using PigStorage;\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
- b = filter a by instate == 'false';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 23,
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
- b = filter a by instate;
- store b into ':OUTPATH:' using PigStorage;\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
- b = filter a by instate == 'true';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 24,
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
- b = filter a by not instate;
- store b into ':OUTPATH:' using PigStorage;\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
- b = filter a by instate == 'false';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 25,
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
- b = filter a by instate is null;
- store b into ':OUTPATH:' using PigStorage;\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
- b = filter a by instate is null;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 26,
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
- b = filter a by instate == true;
- store b into ':OUTPATH:' using PigStorage;\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
- b = filter a by instate == 'true';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 27,
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
- b = filter a by instate == false;
- store b into ':OUTPATH:' using PigStorage;\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
- b = filter a by instate == 'false';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- ],
- },
- {
- 'name' => 'FilterEq',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by name == 'alice johnson' and age == 64 and gpa == 3.99;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 2,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by name > 'fred allen' and age > 40 and gpa > 2.50;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 3,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by name >= 'fred allen' and age >= 40 and gpa >= 2.50;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 4,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by name lt 'fred allen' and age < 40 and gpa < 2.50;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 5,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by name lte 'fred allen' and age <= 40 and gpa <= 2.50;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 6,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage();
- b = filter a by $0 neq 'fred allen' and $1 != '40' and $2 != '2.50';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test for filter == for chararray, int and double
- {
- 'num' => 7,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
- b = filter a by name == 'fred allen' and age == 61 and gpa == 1.42;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test for filter == for bytearray, long and float
- {
- 'num' => 8,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float);
- b = filter a by name == 'fred allen' and age == 61 and gpa == 1.42f;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test for filter != for chararray, int and double
- {
- 'num' => 9,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
- b = filter a by $0 != 'fred allen' and $1 != 40 and $2 != 2.50;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test for filter != for bytearray, long and float
- {
- 'num' => 10,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float);
- b = filter a by $0 != 'fred allen' and $1 != 40 and $2 != 2.50f;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test for filter == for explicit casts to chararray, int and double
- {
- 'num' => 11,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by (chararray)name == 'fred allen' and (int)age == 61 and (double)gpa == 1.42;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test for filter == for explicit casts to bytearray, long and float
- {
- 'num' => 12,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by name == 'fred allen' and (long)age == 61 and (float)gpa == 1.42f;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test for filter != for explicit casts to chararray, int and double
- {
- 'num' => 13,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() ;
- b = filter a by (chararray)$0 != 'fred allen' and (int)$1 != 40 and (double)$2 != 2.50;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- # test for filter != for explicit casts to bytearray, long and float
- {
- 'num' => 14,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() ;
- b = filter a by $0 != 'fred allen' and (long)$1 != 40 and (float)$2 != 2.50f;
- store b into ':OUTPATH:' using PigStorage;\,
- },
- ]
- },
- {
- 'name' => 'FilterMatches',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = filter a by name matches '^fred.*';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 2,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage();
- b = filter a by not $0 matches '^fred.*';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- # test for filter on matches for chararray (declared and explicit cast)
- 'num' => 3,
- 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);
- b = filter a by name matches '^fred.*' and (chararray)registration matches '^dem.*';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 4,
- 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);
- b = filter a by name matches 'f.ed' and (chararray)registration matches 'd.m';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 5,
- 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);
- b = filter a by name matches 'f[^f]ed.*';
- store b into ':OUTPATH:' using PigStorage;\,
- },
- {
- 'num' => 6,
- 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches '.*\\\\wan.*';\nstore b into ':OUTPATH:' using PigStorage;",
- },
- {
- 'num' => 7,
- 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches '^e.*\\\\sc.*';\nstore b into ':OUTPATH:' using PigStorage;",
- },
- {
- 'num' => 8,
- 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches 'ethan white';\nstore b into ':OUTPATH:' using PigStorage;",
- },
- {
- 'num' => 9,
- 'pig' => "a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);\nb = filter a by gpa matches '\\\\d\\\\.45';\nstore b into ':OUTPATH:' using PigStorage;",
- },
- ]
- },
- {
- 'name' => 'FilterUdf',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = cogroup a by (name, age), b by (name, age);
- d = filter c by not IsEmpty(a);
- e = filter d by not IsEmpty(b);
- f = foreach e generate flatten(a), flatten(b);
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 2,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 50;
- d = filter b by age < 50;
- e = cogroup c by (name, age), d by (name, age);
- f = filter e by COUNT(c)> 0 AND COUNT(d)>0;
- store f into ':OUTPATH:';\,
- 'rc' => 0
- },
- ]
- },
- # TODO Groups that don't flatten via Agg functions
- {
- 'name' => 'GroupAggFunc',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, COUNT(a.age);
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 2,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = group a by $0;
- c = foreach b generate group, COUNT(a.$1);
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 3,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by (name, age);
- c = foreach b generate group.name, group.age, COUNT(a.gpa);
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 5,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a all;
- c = foreach b generate COUNT(a.$0);
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 6,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, SUM(a.age);
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 7,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, SUM(a.gpa);
- store c into ':OUTPATH:';\,
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- },
- {
- 'num' => 8,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, AVG(a.age);
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 9,
- 'ignore23' => 'I cannot get it right due to float precision, temporarily disable',
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, AVG(a.gpa);
- store c into ':OUTPATH:';\,
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- },
- {
- 'num' => 10,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, MIN(a.gpa);
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 11,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, MAX(a.gpa);
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 12,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by (name, age);
- c = foreach b generate flatten(group), SUM(a.gpa);
- store c into ':OUTPATH:';\,
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- },
- {
- 'num' => 13,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by (name);
- c = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- d = cogroup b by group, c by name;
- e = foreach d generate flatten(group), SUM(c.gpa), COUNT(c.name);
- store e into ':OUTPATH:';\,
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- },
- {
- 'num' => 14,
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
- b = group a by (name);
- e = foreach b generate COUNT(a.name);
- store e into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
- b = group a by (name);
- e = foreach b generate COUNT(a.name);
- store e into ':OUTPATH:';\,
- }
- ],
- },
- {
- 'name' => 'MapPartialAgg',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, COUNT(a.age);
- store c into ':OUTPATH:';\,
- 'java_params' => ['-Dpig.exec.mapPartAgg=true']
- },
- {
- #multiquery with group in one sub query
- 'num' => 2,
- 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
- b = filter a by age < 22; store b into ':OUTPATH:.1';
- c = group b by age;
- d = foreach c generate group, SUM(b.gpa);
- store d into ':OUTPATH:.2'; #,
- 'java_params' => ['-Dpig.exec.mapPartAgg=true']
-
- },
- {
- #multi query with two groups on diff columns
- 'num' => 3,
- 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
- g1 = group a by name;
- f1 = foreach g1 generate group as name, MAX(a.gpa);
- store f1 into ':OUTPATH:.1';
- g2 = group a by age;
- f2 = foreach g2 generate group as age, AVG(a.gpa);
- store f2 into ':OUTPATH:.2'; #,
- 'java_params' => ['-Dpig.exec.mapPartAgg=true']
-
- },
- {
- #multi query with three groups on diff columns, group key being an expression
- 'num' => 4,
- 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
- g1 = group a by name;
- f1 = foreach g1 generate group as name, MAX(a.gpa);
- store f1 into ':OUTPATH:.1';
- g2 = group a by age%10;
- f2 = foreach g2 generate group as age_mod10, AVG(a.gpa);
- store f2 into ':OUTPATH:.2';
- g3 = group a by age;
- f3 = foreach g3 generate group%10, AVG(a.gpa);
- store f3 into ':OUTPATH:.3';
- g4 = group a by gpa;
- f4 = foreach g4 generate group as gpa, COUNT(a);
- store f4 into ':OUTPATH:.4';
-
- #,
- 'java_params' => ['-Dpig.exec.mapPartAgg=true']
-
- },
- {
- #aggregation gets more than one tuple for every tuple from load func
-
- 'num' => 5,
- 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
- b = foreach a generate name, age, gpa, flatten(TOBAG(age,age)) as x;
- c = group b by age;
- d = foreach c generate group, AVG(b.gpa);
- store d into ':OUTPATH:'; #,
- 'java_params' => ['-Dpig.exec.mapPartAgg=true']
-
- },
-
- ],
- },
- {
- 'name' => 'EvalFunc',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = filter a by name lt 'b';
- c = foreach b generate ARITY(name, age, gpa);
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 2,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa);
- b = filter a by name lt 'b';
- c = foreach b generate TOKENIZE(name);
- d = foreach c generate flatten($0);
- store d into ':OUTPATH:';\,
- },
- {
- 'num' => 3,
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = filter a by name lt 'b';
- c = foreach b generate org.apache.pig.test.udf.evalfunc.Swap(name, age);
- store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
- },
- {
- 'num' => 4,
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = filter a by name lt 'b';
- c = foreach b generate org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, age);
- store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
- },
- {
- 'num' => 5,
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
- b = foreach a generate org.apache.pig.test.udf.evalfunc.TestBoolean(instate);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
- b = foreach a generate (instate is null ? '' : (instate == 'true' ? 'false' : 'true'));
- store b into ':OUTPATH:';\,
- }
- ]
- },
- # TODO DIFF
- # TODO User defined grouping function
- {
- 'name' => 'CoGroupFlatten',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = cogroup c by name, d by name;
- f = foreach e generate flatten (c), flatten(d);
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 2,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by $1 < 20;
- d = filter b by $1 < 20;
- e = cogroup c by $0, d by $0;
- f = foreach e generate flatten (c), flatten(d);
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 3,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = cogroup c by (name, age), d by (name, age);
- f = foreach e generate flatten (c), flatten(d);
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 4,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- d = filter b by age < 20;
- e = cogroup a by (name, age) inner, d by (name, age);
- f = foreach e generate flatten (a), flatten(d);
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 5,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- e = cogroup c by (name, age), b by (name, age) inner;
- f = foreach e generate flatten (c), flatten(b);
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 6,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = cogroup a by (name, age) inner, b by (name, age) inner;
- f = foreach e generate flatten (a), flatten(b);
- store f into ':OUTPATH:';\,
- },
- {
- # Test cogrouping data loaded from two separate loaders. We don't have any data that can join with studenttab that isn't also loaded with PigStorage, so the
- # first step is an intermediate load and store using BinStorage.
- 'num' => 7,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- store a into ':OUTPATH:.intermediate' using BinStorage();
- b = load ':OUTPATH:.intermediate' using BinStorage() as (name, age, gpa);
- c = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = cogroup b by (name, age) inner, c by (name, age) inner;
- f = foreach e generate flatten (b), flatten(c);
- store f into ':OUTPATH:';\,
- 'notmq' => 1,
- },
-
- ]
- },
- {
- 'name' => 'CoGroup',
- 'tests' => [
- {
- 'num' => 1,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = cogroup a by name, b by name;
- d = foreach c generate flatten(group), COUNT(a) + COUNT(b);
- store d into ':OUTPATH:';\,
- },
- ]
- },
- { # Test group: inner, replicated, outer and self joins over the student and voter data sets.
- 'name' => 'Join',
- 'tests' => [
- {
- 'num' => 1, # inner join on the named key after filtering both inputs to age < 20
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by name, d by name;
- store e into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # same join as test 1, but the key is given positionally as $0
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by $0, d by $0;
- store e into ':OUTPATH:';\,
- },
- {
- 'num' => 3, # join on the composite key (name, age)
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by (name, age), d by (name, age);
- store e into ':OUTPATH:';\,
- },
- # self join with implicit split
- # JIRA PIG-429
- {
- 'num' => 4, # relation a feeds both the filter and the join, and is loaded without a schema
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = filter a by $1 > 25;
- c = join a by $0, b by $0;
- store c into ':OUTPATH:';\,
- },
- # join with one input having schema and another without
- # JIRA PIG-428
- {
- 'num' => 5, # a has a typed schema; the other side is loaded without one and projected with arithmetic
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double);
- another = load ':INPATH:/singlefile/studenttab10k';
- c = foreach another generate $0, $1+ 10, $2 + 10.0;
- d = join a by $0, c by $0;
- store d into ':OUTPATH:';\,
- },
- # self join using fragment replicate join
- # no types
- {
- 'num' => 6, # NOTE(review): verify_pig_script is presumably the script used to produce the expected output - confirm against the driver; here it is the same script without the repl join hint
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- c = join a by name, b by name using 'repl';
- store c into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- c = join a by name, b by name ;
- store c into ':OUTPATH:';\,
- },
- # self join using fragment replicate join
- # with types and no cast for join key
- {
- 'num' => 7, # both sides declare name as chararray; verify_pig_script is the same script without the repl join hint
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- c = join a by name, b by name using 'repl';
- store c into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- c = join a by name, b by name ;
- store c into ':OUTPATH:';\,
- },
- # self join using fragment replicate join
- # with types and cast for join key
- {
- 'num' => 8, # the join key gpa is declared double in a but left untyped in b; verify_pig_script omits the repl join hint
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
- c = join a by gpa, b by gpa using 'repl';
- store c into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
- c = join a by gpa, b by gpa ;
- store c into ':OUTPATH:';\,
- },
- # left outer join
- {
- 'num' => 9, # uses the studentnulltab10k and voternulltab10k inputs with fully typed schemas
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double);
- c = join a by name left outer, b by name;
- store c into ':OUTPATH:';\,
- },
- # right outer join
- {
- 'num' => 10, # same inputs and key as test 9
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double);
- c = join a by name right outer, b by name;
- store c into ':OUTPATH:';\,
- },
- # full outer join
- {
- 'num' => 11, # same inputs and key as test 9
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double);
- c = join a by name full outer, b by name;
- store c into ':OUTPATH:';\,
- },
- # see PIG-1209: the join package now uses internalcachedBag, so every tuple on the reduce side in this test will be spilled to disk.
- {
- 'num' => 12, # same script as test 1, run with an extra JVM property
- 'java_params' => ['-Dpig.cachedbag.memusage=0'], # sets pig.cachedbag.memusage to 0 for this run
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by name, d by name;
- store e into ':OUTPATH:';\,
- },
- {
- 'num' => 13, # join on a boolean key with parallel 5; verify_pig_script declares instate as chararray instead of boolean
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
- b = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by instate, d by instate parallel 5;
- store e into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
- b = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by instate, d by instate parallel 5;
- store e into ':OUTPATH:';\,
- }
- ]
- },
- { # Test group: foreach/generate projections, nested foreach blocks, flatten and UDF calls.
- 'name' => 'Foreach',
- 'tests' => [
- {
- 'num' => 1, # generate * on a relation loaded with a schema
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate *;
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # generate * on a relation loaded without a schema
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = foreach a generate *;
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 3, # project two columns by name
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate name, age;
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 4, # project two columns by position
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = foreach a generate $0, $2;
- store b into ':OUTPATH:';\,
- },
- {
- # test filter, projection, sort, duplicate elimination inside a nested foreach (nested filter: gpa < 3.0)
- 'num' => 5,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = filter a by age < 20;
- c = group b by age;
- d = foreach c {
- cf = filter b by gpa < 3.0;
- cp = cf.gpa;
- cd = distinct cp;
- co = order cd by $0;
- generate group, flatten(co);
- }
- store d into ':OUTPATH:';\,
- },
- {
- # test flatten for map and scalar
- 'num' => 6, # registers testudf.jar and stores through the StringStore store function from that test package
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate flatten(name) as n, flatten(org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, gpa)) as m;
- store b into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
- },
- {
- # test flatten for UDF that returns bag with multiple tuples with multiple columns
- 'num' => 7,
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate name, flatten(org.apache.pig.test.udf.evalfunc.CreateTupleBag(age, gpa)) as foo;
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 8, # arithmetic on two aggregates: MAX(a.age) + MIN(a.age) per name, with age declared int
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age: int, gpa);
- c = group a by name;
- d = foreach c generate flatten(group), MAX(a.age) + MIN(a.age);
- store d into ':OUTPATH:';\,
- },
- {
- # test filter, projection, sort, duplicate elimination; the nested filter uses a closed range (gpa >= 3.0 and gpa <= 3.5)
- 'num' => 9,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = filter a by age < 20;
- c = group b by age;
- d = foreach c {
- cf = filter b by gpa >= 3.0 and gpa <= 3.5;
- cp = cf.gpa;
- cd = distinct cp;
- co = order cd by $0;
- generate group, flatten(co);
- }
- store d into ':OUTPATH:';\,
- },
- {
- # test filter, projection, sort, duplicate elimination; the nested filter mixes or/and with a string literal comparison
- 'num' => 10,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = filter a by age < 20;
- c = group b by age;
- d = foreach c {
- cf = filter b by (gpa == 4.0 or gpa != 2.0) and name > 'a';
- cp = cf.gpa;
- cd = distinct cp;
- co = order cd by $0;
- generate group, flatten(co);
- }
- store d into ':OUTPATH:';\,
- },
- {
- # test a filter followed by a nested foreach that chains aliased arithmetic expressions (exp2 reuses exp1)
- 'num' => 11,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = filter a by age < 20;
- c = foreach b {
- exp1 = age + gpa;
- exp2 = exp1 + age;
- generate exp1, exp2;
- }
- store c into ':OUTPATH:';\,
- },
- {
- # test a udf with no args
- 'num' => 12,
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate name, org.apache.pig.test.udf.evalfunc.Fred() as fred;
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 13, # generate * with a boolean column; verify_pig_script declares instate as chararray instead of boolean
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
- b = foreach a generate *;
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
- b = foreach a generate *;
- store b into ':OUTPATH:';\,
- }
- ]
- },
- { # Test group: top-level and nested order by. NOTE(review): sortArgs looks like the argument list for an external sort used when checking output order - confirm against the driver.
- 'name' => 'Order',
- 'tests' => [
- {
- 'num' => 1, # order a single projected column by name
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate name;
- c = order b by name;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,1'],
- },
- {
- 'num' => 2, # order a single projected column by position
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = foreach a generate $1;
- c = order b by $0;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,1'],
- },
- {
- 'num' => 3, # order the projected gpa column
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate gpa;
- c = order b by gpa;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,1'],
- },
- {
- 'num' => 4, # order by * on a schema-less relation; sortArgs gives no -k key
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = order a by *;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' '],
- },
- {
- 'num' => 5, # order by two named columns
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate name, age;
- c = order b by name, age;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,2'],
- },
- {
- 'num' => 6, # order the full tuple by its first column
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- c = order a by $0;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,1'],
- },
- {
- 'num' => 7, # order the full tuple by its second column
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- c = order a by $1;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '2,2'],
- },
- {
- 'num' => 8, # order by first then second column
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- c = order a by $0, $1;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,2'],
- },
- {
- 'num' => 9, # order by second then first column; sortArgs lists the keys in that same order
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- c = order a by $1, $0;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '2,2', '-k', '1,1'],
- },
- {
- 'num' => 10, # order by * using the OrdDesc comparator UDF; carries an ignore reason
- 'ignore' => 'order by UDF is not supported',
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k';
- c = order a by * using org.apache.pig.test.udf.orderby.OrdDesc;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-r'],
- },
- {
- 'num' => 11, # order by $0 using the OrdDesc comparator UDF; carries an ignore reason
- 'ignore' => 'order by UDF is not supported',
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k';
- c = order a by $0 using org.apache.pig.test.udf.orderby.OrdDesc;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-r', '-k', '1,1'],
- },
- {
- 'num' => 12, # order by $0, $1 using the OrdDesc comparator UDF; carries an ignore reason
- 'ignore' => 'order by UDF is not supported',
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k';
- c = order a by $0, $1 using org.apache.pig.test.udf.orderby.OrdDesc;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-r', '-k', '1,2'],
- },
- # ALERT All these tests with inner order bys aren't testing the inner
- # ordering. We need to develop a sorting tool to do that.
- {
- 'num' => 13, # nested order of each group bag by its second column; no sortArgs
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = group a by $0;
- c = foreach b {c1 = order $1 by $1; generate flatten(c1); };
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 14, # nested order of each group bag by *
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = group a by $0;
- c = foreach b {c1 = order $1 by *; generate flatten(c1); };
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 15, # nested order by * using OrdDesc. NOTE(review): tests 10-12 mark order by UDF as unsupported and are ignored, yet this test is not - confirm that the nested form is expected to run
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k';
- b = group a by $0;
- c = foreach b {c1 = order $1 by * using org.apache.pig.test.udf.orderby.OrdDesc; generate flatten(c1); };
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 16, # nested order by $1 using OrdDesc. NOTE(review): same question as test 15 about UDF ordering support
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k';
- b = group a by $0;
- c = foreach b {c1 = order $1 by $1 using org.apache.pig.test.udf.orderby.OrdDesc; generate flatten(c1);};
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 17, # nested order combined with an aggregate (MAX of the second column) in the same generate
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = group a by $0;
- c = foreach b {c1 = order $1 by $1; generate flatten(c1), MAX($1.$1); };
- store c into ':OUTPATH:';\,
- },
- {
- # test to make sure the weighted range partitioning
- # works correctly when a sort key value repeats across
- # reduce partitions
- 'num' => 18, # the script requests parallel 100 for the order
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = order a by $1 parallel 100;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '2,2'],
- },
- {
- 'num' => 19, # order by a boolean column; verify_pig_script declares instate as chararray and names PigStorage explicitly
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
- b = foreach a generate instate;
- c = order b by instate;
- store c into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
- b = foreach a generate instate;
- c = order b by instate;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,1'],
- },
- ]
- },
- { # Test group: distinct on projected columns, on whole tuples and inside a nested foreach.
- 'name' => 'Distinct',
- 'tests' => [
- {
- 'num' => 1, # distinct over a single column projected by name
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate name;
- c = distinct b;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # distinct over a single column projected by position
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = foreach a generate $1;
- c = distinct b;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 3, # distinct over the projected gpa column
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate gpa;
- c = distinct b;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 4, # distinct over whole tuples of a schema-less relation
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = distinct a;
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 5, # distinct over a two-column projection
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate name, age;
- c = distinct b;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 6, # nested distinct: count the distinct ages within each name group
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b { aa = distinct a.age; generate group, COUNT(aa); }
- store c into ':OUTPATH:';\,
- }
- ]
- },
- { # Test group: cross products of filtered student and voter relations.
- 'name' => 'Cross',
- 'tests' => [
- {
- 'num' => 1, # plain cross; both inputs are filtered first
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 19 and gpa < 1.0;
- d = filter b by age < 19;
- e = cross c, d;
- store e into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # same as test 1 with an explicit parallel 10 on the cross
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 19 and gpa < 1.0;
- d = filter b by age < 19;
- e = cross c, d parallel 10;
- store e into ':OUTPATH:';\,
- },
- {
- 'num' => 3, # same as test 1 with the parallelism given by set default_parallel 10 instead
- 'pig' => q\set default_parallel 10;
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 19 and gpa < 1.0;
- d = filter b by age < 19;
- e = cross c, d;
- store e into ':OUTPATH:';\,
- },
- {
- 'num' => 4, # cross followed by a filter that compares c::age with d::age; the script text starts with a newline
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 25;
- d = filter b by age < 25;
- e = cross c, d;
- f = filter e by c::age < d::age;
- store f into ':OUTPATH:';\,
- }
- ]
- },
- { # Test group: union of two relations projected to the same two columns.
- 'name' => 'Union',
- 'tests' => [
- {
- 'num' => 1, # project (name, age) from students and from voters, then union the two
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = foreach a generate name, age;
- d = foreach b generate name, age;
- e = union c, d;
- store e into ':OUTPATH:';\,
- },
- ]
- },
- { # Test group: the ?: conditional (bincond) operator.
- 'name' => 'Bincond',
- 'tests' => [
- {
- 'num' => 1, # the condition is a regex match on name; the branches are age - 10 and (int)age
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate name, (name matches 'yuri.*' ? age - 10 : (int)age);
- store b into ':OUTPATH:';\,
- },
- ]
- },
- { # Test group: glob patterns in load paths; every test then filters on the same name value.
- 'name' => 'Glob',
- 'tests' => [
- {
- 'num' => 1, # a single ? wildcard as the last character of the file name
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10?' as (name, age, gpa);
- b = filter a by name == 'nick miller';
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # a * wildcard in the middle of the file name
- 'pig' => q\a = load ':INPATH:/singlefile/st*ttab10k' as (name, age, gpa);
- b = filter a by name == 'nick miller';
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 3, # a trailing * wildcard
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab*' as (name, age, gpa);
- b = filter a by name == 'nick miller';
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 4, # three consecutive ? wildcards
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab???' as (name, age, gpa);
- b = filter a by name == 'nick miller';
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 5, # a character range [1-9] and a character set [km]
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab[1-9]0[km]' as (name, age, gpa);
- b = filter a by name == 'nick miller';
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 6, # two character sets, [13] and [km]
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab[13]0[km]' as (name, age, gpa);
- b = filter a by name == 'nick miller';
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 7, # a character set [12] and a character range [a-l]
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab[12]0[a-l]' as (name, age, gpa);
- b = filter a by name == 'nick miller';
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 8, # a leading * followed by a literal suffix, under the glob/star directory
- 'pig' => q\a = load ':INPATH:/glob/star/*good' as (name, age, gpa);
- b = filter a by name == 'nick miller';
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 9, # a bare * for everything under the glob/star directory
- 'pig' => q\a = load ':INPATH:/glob/star/*' as (name, age, gpa);
- b = filter a by name == 'nick miller';
- store b into ':OUTPATH:';\,
- }
- ]
- },
- { # Test group: +, -, * and / on untyped columns, with integer constants and explicit casts (odd tests) or double constants (even tests 2-8).
- 'name' => 'Arithmetic',
- 'tests' => [
- {
- 'num' => 1, # addition with integer constants; gpa is cast to int
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- c = foreach a generate age + 1, (int)gpa + 1;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # addition with double constants; age is cast to double
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- c = foreach a generate (double)age + 1.5, gpa + 1.5;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 3, # subtraction with integer constants
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- c = foreach a generate age - 30, (int)gpa - 3;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 4, # subtraction with double constants
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- c = foreach a generate (double)age - 30.1, gpa - 3.199;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 5, # multiplication with integer constants
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- c = foreach a generate age * 10, (int)gpa * 2;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 6, # multiplication with double constants
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- c = foreach a generate (double)age * 10.1, gpa * 2.752342;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 7, # division with integer constants
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- c = foreach a generate age / 30, (int)gpa / 3;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 8, # division with double constants
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- c = foreach a generate (double)age / 30.323, gpa / 3.22;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 9, # mixed operators in one expression without parentheses
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- c = foreach a generate 3 * age + gpa / 9.1 - 2;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 10, # the operands of test 9 regrouped with parentheses
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- c = foreach a generate 3 * (age + gpa) / (9.1 - 2);
- store c into ':OUTPATH:';\,
- }
- ]
- },
- { # Test group: regression scripts. NOTE(review): the test numbers look like bug or ticket ids - confirm.
- 'name' => 'Regression',
- 'tests' => [
- {
- 'num' => 1459894, # group the reg1459894 input by its first column and COUNT the second
- 'pig' => q\a = load ':INPATH:/singlefile/reg1459894';
- b = group a by $0;
- c = foreach b generate group, COUNT(a.$1);
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 97, # cogroup by name and emit the per-relation counts separately
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = cogroup a by name, b by name;
- f = foreach e generate group, COUNT(a), COUNT(b);
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 203, # a short group/count script padded with Pig comment lines, which the script itself says push it past 1k
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, COUNT($1);
- store c into ':OUTPATH:';
- --This is a really long script to test that when script size exceeds 1k we can still parse it.
- --The quick sly fox jumped over the lazy brown dog.
- --he quick sly fox jumped over the lazy brown dog.T
- --e quick sly fox jumped over the lazy brown dog.Th
- -- quick sly fox jumped over the lazy brown dog.The
- --quick sly fox jumped over the lazy brown dog.The
- --uick sly fox jumped over the lazy brown dog.The q
- --ick sly fox jumped over the lazy brown dog.The qu
- --ck sly fox jumped over the lazy brown dog.The qui
- --k sly fox jumped over the lazy brown dog.The quic
- -- sly fox jumped over the lazy brown dog.The quick
- --sly fox jumped over the lazy brown dog.The quick
- --ly fox jumped over the lazy brown dog.The quick s
- --y fox jumped over the lazy brown dog.The quick sl
- -- fox jumped over the lazy brown dog.The quick sly
- --fox jumped over the lazy brown dog.The quick sly
- --ox jumped over the lazy brown dog.The quick sly f
- --x jumped over the lazy brown dog.The quick sly fo
- -- jumped over the lazy brown dog.The quick sly fox
- --jumped over the lazy brown dog.The quick sly fox
- --umped over the lazy brown dog.The quick sly fox j
- --mped over the lazy brown dog.The quick sly fox ju
- --ped over the lazy brown dog.The quick sly fox jum\,
- }
- ]
- },
- { # Test group: load and store of the unicode100 input with no transformation in between.
- 'name' => 'Unicode',
- 'tests' => [
- {
- 'num' => 1, # schema-less load followed directly by a store
- 'pig' => q\a = load ':INPATH:/singlefile/unicode100';
- store a into ':OUTPATH:';\,
- },
- ]
- },
- { # Test group: Pig parameter substitution via %default, %declare, and the pig_params entries -p and -m.
- 'name' => 'Parameters',
- 'tests' => [
- {
- # test default
- 'num' => 1, # fname comes from a %default line in the script itself
- 'pig' => q\%default fname 'studenttab10k'
- a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
- b = foreach a generate name;
- store b into ':OUTPATH:';\,
- },
- {
- # test parameter from command line
- 'num' => 2, # fname is supplied through a -p entry in pig_params
- 'pig_params' => ['-p', qq(fname='studenttab10k')],
- 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
- b = foreach a generate name;
- store b into ':OUTPATH:';\,
- },
- {
- # test parameter from param file
- 'num' => 3, # fname is expected to come from the file named by the -m entry in pig_params
- 'pig_params' => ['-m', ":PARAMPATH:/params_3"],
- 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
- b = foreach a generate name;
- store b into ':OUTPATH:';\,
- },
- {
- # test command
- 'num' => 4, # cmd is set by %declare from a backquoted command; perl is looked up on PATH instead of a fixed /usr/local/bin location so the test is not tied to one host layout
- 'pig' => q\%declare cmd `perl -e 'print "studenttab10k"'`
- a = load ':INPATH:/singlefile/$cmd' using PigStorage() as (name, age, gpa);
- b = foreach a generate name;
- store b into ':OUTPATH:';\,
- },
- {
- # test parameter with a space
- 'num' => 5, # the setting parameter value contains spaces and is substituted as a whole line; verify_pig_script is the same script without the $setting line
- 'pig_params' => ['-p', qq(setting='set default_parallel 100;'),'-p',qq(fname='studenttab10k')],
- 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
- $setting
- b = foreach a generate name;
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
- b = foreach a generate name;
- store b into ':OUTPATH:';\,
- },
- ]
- },
- {
- 'name' => 'Types',
- 'tests' => [
- {
- # constants
- 'num' => 1,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = foreach a generate age + 1 + 0.2f + 253645L, gpa+1;
- store b into ':OUTPATH:';\,
- },
- {
- # NULL and cast
- 'num' => 2,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = foreach a generate (int)((int)gpa/((int)gpa - 1)) as norm_gpa:int;
- c = foreach b generate (norm_gpa is null? 0 :norm_gpa);
- store c into ':OUTPATH:';\,
- # 'expected_err_regex' => "Encountered Warning DIVIDE_BY_ZERO 2387 time.*",
- # The driver currently does not support using both 'sql' and 'expected_...' verification directives.
- },
- {
- # arithmetic operators and SIZE for int, double and size and concat operators for chararrays
- 'num' => 3,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = foreach a generate age, gpa, age % 25, age + 25, age - 25, age/2, age * 2, SIZE(age), gpa + 10.1, gpa - 1.1 , gpa / 1.2, gpa * 2.5, SIZE(gpa), SIZE(name), CONCAT(name, 'test');
- store b into ':OUTPATH:';\,
- },
- {
- # arithmetic operators and SIZE for long, float and size and concat operators for bytearrays
- 'num' => 4,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
- b = foreach a generate age, gpa, age % 2L, age + 2500000000L, age - 2500000000L, age/2L, age * 250000000L, SIZE(age), gpa + 10.1f, gpa - 1.1f , gpa / 1.2f, gpa * 2.6f, SIZE(gpa), SIZE(name), CONCAT(name, name);
- store b into ':OUTPATH:';\,
- },
- {
- # equality and implicit cast
- 'num' => 5,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
- b = filter a by age == '25' and gpa < 3;
- store b into ':OUTPATH:';\,
- },
- {
- # will need to test against previous version of pig
- # because in pig currently count includes nulls - this affects
- # avg
- 'num' => 6,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = group a ALL;
- c = foreach b generate SUM(a.age), MIN(a.age), MAX(a.age), AVG(a.age), MIN(a.name), MAX(a.name), SUM(a.gpa), MIN(a.gpa), MAX(a.gpa), AVG(a.gpa);
- store c into ':OUTPATH:';\,
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- },
- {
- # sum, min, max, avg for long and float (declared)
- 'num' => 7,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
- b = group a ALL;
- c = foreach b generate SUM(a.age), MIN(a.age), MAX(a.age), AVG(a.age), SUM(a.gpa), MIN(a.gpa), MAX(a.gpa), AVG(a.gpa);
- store c into ':OUTPATH:';\,
- },
- {
- # Explicit casts - arithmetic operators and SIZE for int, double and size and concat operators for chararrays
- 'num' => 8,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
- b = foreach a generate (int)age % 25, (int)age + 25, (int)age - 25, (int)age/2, (int)age * 2, SIZE((int)age), (double)gpa + 10.1, (double)gpa - 1.1 , (double)gpa / 1.2, (double)gpa * 2.5, SIZE((double)gpa), SIZE((chararray)name), CONCAT((chararray)name, 'test');
- store b into ':OUTPATH:';\,
- },
- {
- # Explicit casts - arithmetic operators and SIZE for long, float
- 'num' => 9,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
- b = foreach a generate (long)age, (long)age % 2L, (long)age + 2500000000L, (long)age - 2500000000L, (long)age/2L, (long)age * 250000000L, SIZE((long)age), (float)gpa + 10.1f, (float)gpa - 1.1f , (float)gpa / 1.2f, (float)gpa * 2.6f, SIZE((float)gpa);
- store b into ':OUTPATH:';\,
- },
- {
- # Filter is null for chararray and double and is not null for int
- 'num' => 10,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = filter a by name is null and age is not null and gpa is null;
- c = group b ALL;
- d = foreach c generate COUNT(b);
- store d into ':OUTPATH:';\,
- },
- {
- # Filter is not null for chararray and double and is null for int
- 'num' => 11,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = filter a by name is not null and age is null and gpa is not null;
- c = group b ALL;
- d = foreach c generate COUNT(b);
- store d into ':OUTPATH:';\,
- },
- {
- # Filter is null for bytearray and float and is not null for long
- 'num' => 12,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
- b = filter a by name is null and age is not null and gpa is null;
- c = group b ALL;
- d = foreach c generate COUNT(b);
- store d into ':OUTPATH:';\,
- },
- {
- # Filter is not null for bytearray and float and is null for long
- 'num' => 13,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
- b = filter a by name is not null and age is null and gpa is not null;
- c = group b ALL;
- d = foreach c generate COUNT(b);
- store d into ':OUTPATH:';\,
- },
- {
- # test that sorting is based on the type for chararray, int and double
- 'num' => 14,
- 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = order a by name, age, gpa;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,1', '-k', '2n,2n'],
- },
- {
- # test that sorting descending is based on the type for chararray, int and double
- 'num' => 15,
- 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = order a by name desc, age desc, gpa desc;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1r,1r', '-k', '2nr,2nr'],
- },
- {
- # test that sorting is based on the type for bytearray, long and float
- 'num' => 16,
- 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
- b = order a by name, age, gpa;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,1', '-k', '2n,2n'],
- },
- {
- # test that sorting descending is based on the type for bytearray, long and float
- 'num' => 17,
- 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
- b = order a by name desc, age desc, gpa desc;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1r,1r', '-k', '2nr,2nr'],
- },
- {
- # test precision for doubles is at least 15 digits
- 'num' => 18,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = foreach a generate 0.123456789123456+0.123456789123456;
- store b into ':OUTPATH:';\,
- },
- {
- # order by string
- 'num' => 20,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = order a by name;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,1'],
- },
- {
- # order by string desc
- 'num' => 21,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = order a by name desc;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1r,1r'],
- },
- {
- # order by int
- 'num' => 22,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = order a by age;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '2n,2n'],
- },
- {
- # order by int desc
- 'num' => 23,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = order a by age desc;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '2nr,2nr'],
- },
- {
- # order by long
- 'num' => 24,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:long, gpa:double);
- b = order a by age;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '2n,2n'],
- },
- {
- # order by long desc
- 'num' => 25,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:long, gpa:double);
- b = order a by age desc;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '2nr,2nr'],
- },
- {
- # order by float
- 'num' => 26,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float);
- b = order a by gpa;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '3n'],
- },
- {
- # order by float desc
- 'num' => 27,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float);
- b = order a by gpa desc;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '3nr'],
- },
- {
- # order by double
- 'num' => 28,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = order a by gpa;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '3n'],
- },
- {
- # order by double desc
- 'num' => 29,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = order a by gpa desc;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '3nr'],
- },
- {
- # order by *
- 'num' => 30,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = order a by *;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,1', '-k', '2n,2n'],
- },
- {
- # order by * desc
- 'num' => 31,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = order a by * desc;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1r,1r', '-k', '2nr,2nr'],
- },
- {
- 'num' => 32,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
- b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:double);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = cogroup c by name, d by name;
- f = foreach e generate flatten (c), flatten(d);
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 33,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
- b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:double);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = cogroup c by age, d by age;
- f = foreach e generate flatten (c), flatten(d);
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 34,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa:double);
- b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:long, registration:chararray, contributions:double);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = cogroup c by age, d by age;
- f = foreach e generate flatten (c), flatten(d);
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 35,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:double);
- b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:float, registration:chararray, contributions:double);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = cogroup c by age, d by age;
- f = foreach e generate flatten (c), flatten(d);
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 36,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:double);
- b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:double, registration:chararray, contributions:double);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = cogroup c by age, d by age;
- f = foreach e generate flatten (c), flatten(d);
- store f into ':OUTPATH:';\,
- },
- {
- # NULL and cast
- 'num' => 37,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = foreach a generate (int)((int)gpa/((int)gpa - 1)) as norm_gpa:int;
- c = foreach b generate (norm_gpa is not null? norm_gpa: 0);
- store c into ':OUTPATH:';\,
- },
- {
- # constants
- 'num' => 38,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- b = foreach a generate -(age + 1 + 0.2f + 253645L), -(gpa+1);
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 39,
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
- b = foreach a generate instate, true, false;
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
- b = foreach a generate instate, 'true', 'false';
- store b into ':OUTPATH:';\,
- },
- ]
- },
- {
- 'name' => 'Limit', # tests of the limit operator at different positions in a script
- 'tests' => [
- {
- 'num' => 1, # order, then filter, then limit 100
- 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
- b = order a by $0, $1;
- c = filter b by $0 > 'a'; -- break the sort/limit optimization
- d = limit c 100;
- store d into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,1'],
- },
- {
- 'num' => 2, # limit 100 applied directly to the output of order
- 'ignore23' => 'The record limit pick is different in 23',
- 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
- b = order a by $0, $1;
- c = limit b 100;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,1'],
- },
- {
- # Make sure that limit higher than number of rows doesn't mess stuff up
- 'num' => 3, # limit 100000 after order and filter
- 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
- b = order a by $0, $1;
- c = filter b by $1 < 1000;
- d = limit c 100000;
- store d into ':OUTPATH:';\,
- },
- {
- 'num' => 4, # limit 100 after distinct
- 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
- b = distinct a;
- c = limit b 100;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 5, # limit 100 after a union of two projected relations
- 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- a1 = foreach a generate $0, $1;
- b1 = foreach b generate $0, $1;
- c = union a1, b1;
- d = limit c 100;
- store d into ':OUTPATH:';\,
- },
- {
- 'num' => 6, # limit 40 applied before filter, group and COUNT
- 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
- B = limit A 40;
- C = filter B by age == 40;
- D = group C by name;
- E = foreach D generate group, COUNT(C);
- store E into ':OUTPATH:';\,
- },
- {
- 'num' => 7, # limit used inside a nested foreach block
- 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
- B = group A by name;
- C = foreach B {
- C1 = limit A 10;
- generate group, COUNT(C1);
- }
- store C into ':OUTPATH:';\,
- },
- {
- 'num' => 8, # filter followed by limit inside a nested foreach block, then an outer filter on the count
- 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
- B = group A by name;
- C = foreach B {
- C1 = filter A by age < 40;
- C2 = limit C1 10;
- generate group, COUNT(C2);
- }
- D = filter C by $1 > 0;
- store D into ':OUTPATH:';\,
- },
- {
- 'num' => 9, # limit count is the constant expression 1000/10; the verify script uses the literal 100
- 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
- b = order a by $0, $1;
- c = limit b 1000/10;
- store c into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
- b = order a by $0, $1;
- c = limit b 100;
- store c into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,2'],
- },
- {
- 'num' => 10, # limit count is the scalar expression c.count/10; the verify script uses the literal 1000
- 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- b = group a all;
- c = foreach b generate COUNT(a) as count;
- d = limit a c.count/10;
- store d into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- b = limit a 1000;
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 11, # limit count is the sum of two scalar expressions; the verify script uses the literal 2000
- 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- b = group a all;
- c = foreach b generate COUNT(a) as count;
- d = load ':INPATH:/singlefile/votertab10k';
- e = group d all;
- f = foreach e generate COUNT(d) as count;
- d = limit a c.count/10+f.count/10;
- store d into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- b = limit a 2000;
- store b into ':OUTPATH:';\,
- }
- ]
- },
- {
- 'name' => 'Split', # tests of the explicit split statement
- 'tests' => [
- {
- 'num' => 1, # split on $0 > 'm' / $0 <= 'm'; stores the first branch (a1)
- 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
- split a into a1 if $0 > 'm', a2 if $0 <= 'm';
- store a1 into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # same split as test 1; stores the second branch (a2)
- 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
- split a into a1 if $0 > 'm', a2 if $0 <= 'm';
- store a2 into ':OUTPATH:';\,
- },
- {
- 'num' => 3, # cogroups the two branches by $1 and flattens them by alias (a1, a2)
- 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
- split a into a1 if $0 > 'm', a2 if $0 <= 'm';
- b = cogroup a1 by $1, a2 by $1;
- c = foreach b generate flatten(a1), flatten(a2);
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 4, # same as test 3, but the cogrouped bags are flattened by position ($1, $2)
- 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
- split a into a1 if $0 > 'm', a2 if $0 <= 'm';
- b = cogroup a1 by $1, a2 by $1;
- c = foreach b generate flatten($1), flatten($2);
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 5, # split on a named field with no declared types; distinct on a1
- 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
- split a into a1 if name > 'm', a2 if name <= 'm';
- b = distinct a1;
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 6, # split conditions leave a gap (age > 50, age <= 25); a2 is ordered by name
- 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
- split a into a1 if age > 50, a2 if age <= 25;
- b = order a2 by name;
- store b into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '1,1'],
- },
- {
- 'num' => 7, # the two split conditions test different fields (name, age); only a1 is used
- 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
- split a into a1 if name > 'm', a2 if age < 50;
- b = distinct a1;
- store b into ':OUTPATH:';\,
- },
- {
- 'num' => 8, # each branch is projected with a constant, then the branches are cogrouped by name and counted
- 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- split a into a1 if age > 50, a2 if name < 'm';
- b2 = foreach a2 generate name, 1;
- b1 = foreach a1 generate name, 2;
- c = cogroup b2 by name, b1 by name;
- d = foreach c generate flatten(group), COUNT($1), COUNT($2);
- store d into ':OUTPATH:';\,
- },
- {
- 'num' => 9, # distinct on a2 and order on a1 before cogrouping by name and counting
- 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- split a into a1 if age > 50, a2 if name < 'm';
- b2 = distinct a2;
- b1 = order a1 by name;
- c = cogroup b2 by name, b1 by name;
- d = foreach c generate flatten(group), COUNT($1), COUNT($2);
- store d into ':OUTPATH:';\,
- },
- {
- 'num' => 10, # split with an otherwise branch; the verify script spells the second condition out as age<=50
- 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- split a into a1 if age > 50, a2 otherwise;
- store a1 into ':OUTPATH:.1';
- store a2 into ':OUTPATH:.2';\,
- 'verify_pig_script' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- split a into a1 if age > 50, a2 if age<=50;
- store a1 into ':OUTPATH:.1';
- store a2 into ':OUTPATH:.2';\,
- }
- ]
- },
- {
- 'name' => 'ImplicitSplit', # one loaded relation feeds two filters without an explicit split statement
- 'tests' => [
- {
- 'num' => 1, # positional references, no schema; the two filtered relations are cogrouped by $0
- 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
- b = filter a by $1 > 50;
- c = filter a by $2 > 3.0;
- d = cogroup b by $0, c by $0;
- e = foreach d generate flatten(b), flatten(c);
- store e into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # typed schema; adds a filter on b::age after the flatten
- 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- b = filter a by age > 50;
- c = filter a by gpa > 3.0;
- d = cogroup b by name, c by name;
- e = foreach d generate flatten(b), flatten(c);
- f = filter e by b::age < 75;
- store f into ':OUTPATH:';\,
- }
- ]
- },
- {
- 'name' => 'describe', # a describe statement inside a script that also stores output
- 'tests' => [
- #JIRA[PIG-373]
- {
- 'num' => 1, # describe A is issued between the load and the store of A
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
- describe A;
- store A into ':OUTPATH:';\,
- },
- ],
- },
- {
- 'name' => 'Sample', # tests of the sample operator with an expression as the sample rate
- 'tests' => [
- {
- 'num' => 1, # sample rate is the constant expression 2-1-1; the verify script uses the literal 0
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
- S = sample A 2-1-1;
- store S into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
- S = sample A 0;
- store S into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # sample rate is the scalar expression E.count/C.count; the verify script uses the literal 1
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
- B = group A all;
- C = foreach B generate COUNT(A) as count;
- D = group A all;
- E = foreach D generate (double)COUNT(A) as count;
- S = sample A E.count/C.count;
- store S into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
- S = sample A 1;
- store S into ':OUTPATH:';\,
- },
- ],
- },
- {
- 'name' => 'MissingColumns', # scripts that reference more fields than the three (name, age, gpa) declared for studenttab10k by other tests
- 'tests' => [
- {
- 'num' => 1, # the declared schema has a fourth field (extra); the filter and the order both reference it
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age: int, gpa: double, extra: chararray);
- B = filter A by age > 50 or extra > 'm';
- D = order B by age, extra;
- store D into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' ', '-k', '2n,2n'],
- },
- {
- 'num' => 2, # no schema; $3 is referenced by position and the value derived from it is part of the group key
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage();
- B = foreach A generate $0, $1 + 1, $3 + 1;
- C = group B by ($0, $2);
- D = foreach C generate flatten(group), COUNT($1);
- store D into ':OUTPATH:';\,
- },
- {
- 'num' => 3, # B declares five fields; the join key on B's side uses extra1
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: double);
- B = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa, extra1, extra2);
- C = join A by (name, age), B by (name, extra1);
- store C into ':OUTPATH:';\,
- # The pig query is expected to produce empty results. NOTE(review): this comment used to refer to a "following SQL" verification query, which is not present in this entry.
- }
- ],
- },
- {
- 'name' => 'Aliases',
- # check access of a field using multiple valid aliases
- 'tests' => [
- {
- # check that a free standing alias reference works
- # when it is unambiguous
- # check that a fully qualified alias reference works
- # check that a partially qualified unambiguous alias reference works
- 'num' => 1, # the final foreach references the field as name, d::a::name and a::name
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
- b = group a by name;
- c = foreach b generate flatten(a);
- d = filter c by name != 'fred';
- e = group d by name;
- f = foreach e generate flatten(d);
- g = foreach f generate name, d::a::name as dname, a::name as aname;
- store g into ':OUTPATH:';\,
- },
- {
- # check that the "group" alias is available
- # after a flatten(group)
- 'num' => 2, # d projects the field named group from the output of flatten(group)
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
- b = group a by name;
- c = foreach b generate flatten(group), COUNT(a) as cnt;
- d = foreach c generate group;
- store d into ':OUTPATH:';\,
- },
- ],
- },
- {
- 'name' => 'Lineage',
- # test if the right cast function is picked
- 'tests' => [
- {
- 'num' => 1, # fields loaded by PigStorage and by TextLoader are combined with cogroup ALL, then cast to chararray
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/textdoc' using TextLoader() as (sentence);
- c = cogroup a ALL, b ALL;
- d = foreach c generate flatten(a), flatten(b);
- e = foreach d generate name, flatten(TOKENIZE((chararray)sentence)) as sentence;
- f = foreach e generate CONCAT((chararray)name, sentence);
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # same casts as test 1, with the two relations combined by cross
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa: double);
- b = load ':INPATH:/singlefile/textdoc' using TextLoader() as (sentence);
- c = cross a, b;
- d = foreach c generate name, flatten(TOKENIZE((chararray)sentence)) as sentence;
- e = foreach d generate CONCAT((chararray)name, sentence);
- store e into ':OUTPATH:';\,
- },
- {
- 'num' => 3, # the untyped age field is renamed to student_age, then compared with 50 and added to 10
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa: double);
- b = foreach a generate age as student_age;
- c = filter b by student_age > 50;
- d = foreach c generate student_age + 10;
- store d into ':OUTPATH:';\,
- },
- {
- 'num' => 4, # untyped name and age are cast when passed to the CreateMap UDF; only the non-null lookups (e) are stored
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = filter a by name lt 'b';
- c = foreach b generate org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, (int)age);
- d = foreach c generate $0#'alice young';
- split d into e if $0 is not null, f if $0 is null;
- store e into ':OUTPATH:';\,
- }
- ],
- },
- {
- 'name' => 'Casts', # tests 1-3 round-trip data through BinStorage, tests 4-6 through PigStorage, test 7 casts to boolean
- 'tests' => [
- {
- # check that a cast of a value of type
- # same as the result type of the cast works
- # when the value is treated as a bytearray
- 'num' => 1, # BinStorage round trip of (chararray, int, double); reloaded without a schema
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
- b = foreach a generate name, age, gpa;
- store b into ':OUTPATH:.intermediate' using BinStorage();
- c = load ':OUTPATH:.intermediate' using BinStorage();
- -- after this load, the fields are treated as bytearrays though
- -- they are actually "typed", test that the implicit casts
- -- introduced by the operations in the foreach below will work fine
- d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1, $2 + 0.2;
- store d into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- {
- # check that a cast of a value of type
- # same as the result type of the cast works
- # when the value is treated as a bytearray
- 'num' => 2, # BinStorage round trip of (chararray, long, float); arithmetic uses 1L and 0.2f
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float);
- b = foreach a generate name, age, gpa;
- store b into ':OUTPATH:.intermediate' using BinStorage();
- c = load ':OUTPATH:.intermediate' using BinStorage();
- -- after this load, the fields are treated as bytearrays though
- -- they are actually "typed", test that the implicit casts
- -- introduced by the operations in the foreach below will work fine
- d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1L, $2 + 0.2f;
- store d into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- {
- # check that a cast of a value of type
- # same as the result type of the cast works
- # when the value is treated as a bytearray
- 'num' => 3, # BinStorage round trip of a bag, a tuple and a map; reloaded with a schema that names but does not type the inner fields
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float);
- b = group a by name;
- c = foreach b generate a, (1,2,3), ['key1'#'value1','key2'#'value2'];
- -- store the bag, tuple and map
- store c into ':OUTPATH:.intermediate' using BinStorage();
- d = load ':OUTPATH:.intermediate' using BinStorage() as (b:bag{t:tuple(x,y,z)}, t2:tuple(a,b,c), m:map[]);
- -- after this load, the fields are treated as bytearrays though
- -- they are actually "typed", test that the implicit casts
- -- introduced by the operations in the foreach below will work fine
- e = foreach d generate COUNT(b), t2.a, t2.b, t2.c, m#'key1', m#'key2';
- store e into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- {
- # check that a cast of a value of type
- # same as the result type of the cast works
- # when the value is treated as a bytearray
- 'num' => 4, # same script as test 1, with PigStorage for the intermediate store and load
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
- b = foreach a generate name, age, gpa;
- store b into ':OUTPATH:.intermediate' using PigStorage();
- c = load ':OUTPATH:.intermediate' using PigStorage();
- -- after this load, the fields are treated as bytearrays though
- -- they are actually "typed", test that the implicit casts
- -- introduced by the operations in the foreach below will work fine
- d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1, $2 + 0.2;
- store d into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- {
- # check that a cast of a value of type
- # same as the result type of the cast works
- # when the value is treated as a bytearray
- 'num' => 5, # same script as test 2, with PigStorage for the intermediate store and load
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float);
- b = foreach a generate name, age, gpa;
- store b into ':OUTPATH:.intermediate' using PigStorage();
- c = load ':OUTPATH:.intermediate' using PigStorage();
- -- after this load, the fields are treated as bytearrays though
- -- they are actually "typed", test that the implicit casts
- -- introduced by the operations in the foreach below will work fine
- d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1L, $2 + 0.2f;
- store d into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- {
- # check that a cast of a value of type
- # same as the result type of the cast works
- # when the value is treated as a bytearray
- 'num' => 6, # same script as test 3, with PigStorage for the intermediate store and load
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float);
- b = group a by name;
- c = foreach b generate a, (1,2,3), ['key1'#'value1','key2'#'value2'];
- -- store the bag, tuple and map
- store c into ':OUTPATH:.intermediate' using PigStorage();
- d = load ':OUTPATH:.intermediate' using PigStorage() as (b:bag{t:tuple(x,y,z)}, t2:tuple(a,b,c), m:map[]);
- -- after this load, the fields are treated as bytearrays though
- -- they are actually "typed", test that the implicit casts
- -- introduced by the operations in the foreach below will work fine
- e = foreach d generate COUNT(b), t2.a, t2.b, t2.c, m#'key1', m#'key2';
- store e into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- {
- 'num' => 7, # (boolean) cast of the untyped instate field; the verify script loads instate as chararray and compares it with 'true'
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name, age, gpa, instate);
- b = foreach a generate (boolean)instate;
- c = filter b by instate == true;
- store c into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
- b = foreach a generate instate;
- c = filter b by instate == 'true';
- store c into ':OUTPATH:';\,
- }
- ],
- },
- {
- 'name' => 'ClassResolution', # loader class lookup with udf.import.list set through java_params
- 'tests' => [
- {
- # check that Loader specified without a package
- # name works if that package name is specified
- # in udf.import.list
- 'num' => 1, # NOTE(review): the load below names DumpLoader with its full package even though udf.import.list is set - confirm whether the unqualified name was intended
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'java_params' => ['-Dudf.import.list=org.apache.pig.test.udf.storefunc'],
- 'pig' => q\
- register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
- b = foreach a generate CONCAT('(', name), CONCAT((chararray)age, ' )');
- store b into ':OUTPATH:.intermediate' using PigStorage(',');
- c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.DumpLoader();
- store c into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- ],
- },
- {
- 'name' => 'MergeJoin', # each test orders and stores both inputs, runs exec, reloads them and joins using 'merge'; the verify script joins the original inputs without 'merge'
- 'tests' => [
- # Simplest merge-join.
- {
- 'num' => 1, # both inputs ordered by $0 and joined on $0, no schema
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- c = order a by $0;
- d = order b by $0;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2';
- exec;
- e = load ':OUTPATH:.intermediate1';
- f = load ':OUTPATH:.intermediate2';
- g = join e by $0, f by $0 using 'merge';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- g = join a by $0, b by $0;
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- # Merge-join with left-side filter
- {
- 'num' => 2, # the left input is filtered by $1 > 30 after the reload
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- c = order a by $0;
- d = order b by $0;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2';
- exec;
- e = load ':OUTPATH:.intermediate1';
- h = filter e by $1 > 30;
- f = load ':OUTPATH:.intermediate2';
- g = join h by $0, f by $0 using 'merge';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- h = filter a by $1 > 30;
- g = join h by $0, b by $0;
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- # Merge-join with right-side filter
- {
- 'num' => 3, # the right input is filtered by $2 != 'democrat' after the reload
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- c = order a by $0;
- d = order b by $0;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2';
- exec;
- e = load ':OUTPATH:.intermediate1';
- f = load ':OUTPATH:.intermediate2';
- i = filter f by $2 != 'democrat';
- g = join e by $0, i by $0 using 'merge';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- i = filter b by $2 != 'democrat';
- g = join a by $0, i by $0;
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- # Merge-join with schemas
- {
- 'num' => 4, # typed schemas are declared only on the reloaded intermediates; the verify script declares none
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- c = order a by $0;
- d = order b by $0;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2';
- exec;
- e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float);
- f = load ':OUTPATH:.intermediate2' as (name:chararray, age:int, reg:chararray, contrib:float);
- g = join e by $0, f by $0 using 'merge';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- g = join a by $0, b by $0;
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- # Merge-join with key as expression
- {
- 'num' => 5, # the join key is the pair ($0,$1); both inputs are ordered by $0,$1
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- c = order a by $0,$1;
- d = order b by $0,$1;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2';
- exec;
- e = load ':OUTPATH:.intermediate1';
- f = load ':OUTPATH:.intermediate2';
- g = join e by ($0,$1), f by ($0,$1) using 'merge';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- g = join a by ($0,$1), b by ($0,$1);
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- # Merge-join with key as expression. This expression guarantees ordering.
- {
- 'num' => 6, # both inputs are ordered by $1 and the join key is $1+10
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- c = order a by $1;
- d = order b by $1;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2';
- exec;
- e = load ':OUTPATH:.intermediate1';
- f = load ':OUTPATH:.intermediate2';
- g = join e by ($1+10), f by ($1+10) using 'merge';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- g = join a by ($1+10), b by ($1+10) ;
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- # Merge-join with nulls in keys and data.
- {
- 'num' => 7, # same script as test 1, run on studentnulltab10k and voternulltab10k
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
- b = load ':INPATH:/singlefile/voternulltab10k';
- c = order a by $0;
- d = order b by $0;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2';
- exec;
- e = load ':OUTPATH:.intermediate1';
- f = load ':OUTPATH:.intermediate2';
- g = join e by $0, f by $0 using 'merge';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
- b = load ':INPATH:/singlefile/voternulltab10k';
- g = join a by $0, b by $0;
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- # Merge-join with one file across multiple blocks
- {
- 'num' => 8, # the right input comes from studenttab20m, filtered by $2 < 1.5 and ordered with parallel 1
- 'execonly' => 'mapred', # since this join will run out of memory in local mode
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'pig' => q\a = load ':INPATH:/singlefile/votertab10k';
- b = load ':INPATH:/singlefile/studenttab20m';
- h = filter b by $2 < 1.5;
- c = order a by $0;
- d = order h by $0 parallel 1;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2';
- exec;
- e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, reg:chararray, contrib:float);
- f = load ':OUTPATH:.intermediate2'as (name:chararray, age:int, gpa:float);
- g = join e by $0, f by $0 using 'merge';
- i = filter g by $2 == 'democrat' and $1 > 76;
- store i into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k';
- b = load ':INPATH:/singlefile/studenttab20m';
- h = filter b by $2 < 1.5;
- g = join a by $0, h by $0;
- i = filter g by $2 == 'democrat' and $1 > 76;
- store i into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- # Merge-join with join on numeric key
- {
- 'num' => 9, # inputs are ordered by and joined on age, declared as int
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
- c = order a by age;
- d = order b by age;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2';
- exec;
- e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float);
- f = load ':OUTPATH:.intermediate2' as (name:chararray, age:int, reg:chararray, contrib:float);
- g = join e by age, f by age using 'merge';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
- g = join a by age, b by age;
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- ]
- },
- {
- 'name' => 'SkewedJoin',
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'tests' => [
- {
- 'num' => 1,
- 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = join a by name, b by name using 'skewed' parallel 8;
- store e into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = join a by name, b by name;
- store e into ':OUTPATH:';\,
- },
- # basic join with no skewed keys
- {
- 'num' => 2,
- 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=10000'],
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age,
- gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = join a by name, b by name using 'skewed';
- store e into ':OUTPATH:';\,
- 'verify_pig_script' =>q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age,
- gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = join a by name, b by name ;
- store e into ':OUTPATH:';\,
- },
- # join after filtering
- {
- 'num' => 3,
- 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=3'],
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age,
- gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by $0, d by $0 using 'skewed' parallel 8;
- store e into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age,
- gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by $0, d by $0 ;
- store e into ':OUTPATH:';\,
- },
- # join by two columns
- {
- 'num' => 4,
- 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=3'],
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by (name, age), d by (name, age) using 'skewed' parallel 8;
- store e into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by (name, age), d by (name, age) ;
- store e into ':OUTPATH:';\,
- },
- # join with add
- {
- 'num' => 5,
- 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=50'],
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by age+10, d by age + 20 using 'skewed' parallel 10;
- store e into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by age+10, d by age + 20 ;
- store e into ':OUTPATH:';\,
- },
- # join with split
- {
- 'num' => 6,
- 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = filter a by $1 > 25;
- c = join a by $0, b by $0 using 'skewed' parallel 7;
- store c into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = filter a by $1 > 25;
- c = join a by $0, b by $0 ;
- store c into ':OUTPATH:';\,
- },
- # join with UDF
- {
- 'num' => 7,
- 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=20'],
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by SIZE(name), d by SIZE(name) using 'skewed' parallel 7;
- store e into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- e = join c by SIZE(name), d by SIZE(name) ;
- store e into ':OUTPATH:';\,
- },
- # left outer join
- {
- 'num' => 8,
- 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = join a by name left outer, b by name using 'skewed' parallel 8;
- store e into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = join a by name left outer, b by name ;
- store e into ':OUTPATH:';\,
- },
- # right outer join
- {
- 'num' => 9,
- 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = join a by name right outer, b by name using 'skewed' parallel 8;
- store e into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = join a by name right outer, b by name ;
- store e into ':OUTPATH:';\,
- },
- # full outer join
- {
- 'num' => 10,
- 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = join a by name full outer, b by name using 'skewed' parallel 8;
- store e into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = join a by name full outer, b by name ;
- store e into ':OUTPATH:';\,
- },
- ]
- },
- {
- 'name' => 'CollectedGroup',
- 'tests' => [
- # Simplest collected group.
- {
- 'num' => 1,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = order a by $0;
- store b into ':OUTPATH:.intermediate';
- exec;
- register :FUNCPATH:/testudf.jar;
- c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader();
- d = group c by $0 using 'collected';
- e = foreach d generate group, COUNT(c);
- store e into ':OUTPATH:';\,
- 'notmq' => 1,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
- d = group a by $0 ;
- e = foreach d generate group, COUNT(a);
- store e into ':OUTPATH:';\,
- },
- # Collected group with filter
- {
- 'num' => 2,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = order a by $0;
- store b into ':OUTPATH:.intermediate';
- exec;
- register :FUNCPATH:/testudf.jar;
- c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader();
- d = filter c by $1 > 30;
- e = group d by $0 using 'collected';
- f = foreach e generate group, COUNT(d);
- store f into ':OUTPATH:';\,
- 'notmq' => 1,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
- d = filter a by $1 > 30;
- e = group d by $0 ;
- f = foreach e generate group, COUNT(d);
- store f into ':OUTPATH:';\,
- },
- # Collected group with schemas
- {
- 'num' => 3,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = order a by $0;
- store b into ':OUTPATH:.intermediate';
- exec;
- register :FUNCPATH:/testudf.jar;
- c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float);
- d = group c by $0 using 'collected';
- e = foreach d generate group, MAX(c.age);
- store e into ':OUTPATH:';\,
- 'notmq' => 1,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- d = group a by $0 ;
- e = foreach d generate group, MAX(a.$1);
- store e into ':OUTPATH:';\,
- },
- # Collected group with multiple columns
- {
- 'num' => 4,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = order a by name, age;
- store b into ':OUTPATH:.intermediate';
- exec;
- register :FUNCPATH:/testudf.jar;
- c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float);
- d = group c by (name, age) using 'collected';
- e = foreach d generate group.name, group.age, MIN(c.gpa);
- store e into ':OUTPATH:';\,
- 'notmq' => 1,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- d = group a by (name, age) ;
- e = foreach d generate group.name, group.age, MIN(a.gpa);
- store e into ':OUTPATH:';\,
- },
- # Collected group with nulls in keys and data.
- {
- 'num' => 5,
- 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
- b = order a by $0;
- store b into ':OUTPATH:.intermediate';
- exec;
- register :FUNCPATH:/testudf.jar;
- c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float);
- d = group c by $0 using 'collected';
- e = foreach d generate group, SUM(c.$1);
- store e into ':OUTPATH:';\,
- 'notmq' => 1,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float);
- d = group a by $0 ;
- e = foreach d generate group, SUM(a.$1);
- store e into ':OUTPATH:';\,
- },
- # Collected group with numeric key
- {
- 'num' => 6,
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- b = order a by age;
- store b into ':OUTPATH:.intermediate';
- exec;
- register :FUNCPATH:/testudf.jar;
- c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float);
- d = group c by age using 'collected';
- e = foreach d generate group, AVG(c.gpa), COUNT(c.name);
- store e into ':OUTPATH:';\,
- 'notmq' => 1,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- d = group a by age ;
- e = foreach d generate group, AVG(a.gpa), COUNT(a.name);
- store e into ':OUTPATH:';\,
- },
- ]
- },
- {
- 'name' => 'SecondarySort',
- 'tests' => [
- {
- # simple order by
- 'num' => 1,
- 'java_params' => ['-Dpig.accumulative.batchsize=5'],
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by age parallel 10;
- c = foreach b {
- d = order a by name;
- generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d);
- };
- store c into ':OUTPATH:';\,
- },
- {
- # order by desc
- 'num' => 2,
- 'java_params' => ['-Dpig.accumulative.batchsize=5'],
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by age parallel 10;
- c = foreach b {
- d = order a by name desc;
- generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d);
- };
- store c into ':OUTPATH:';\,
- },
- {
- # order by float type
- 'num' => 3,
- 'java_params' => ['-Dpig.accumulative.batchsize=5'],
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- b = group a by age parallel 10;
- c = foreach b {
- d = order a by gpa;
- generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.gpa);
- };
- store c into ':OUTPATH:';\,
- },
- # order by string type
- {
- 'num' => 4,
- 'java_params' => ['-Dpig.accumulative.batchsize=5'],
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- b = group a by age parallel 10;
- c = foreach b {
- d = order a by name;
- generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.name);
- };
- store c into ':OUTPATH:';\,
- },
- # simple distinct
- {
- 'num' => 5,
- 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- b = group a by age parallel 10;
- c = foreach b {
- d = a.name;
- e = distinct d;
- generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(e);
- };
- store c into ':OUTPATH:';\,
- },
- # distinct on tuple
- {
- 'num' => 6,
- 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- b = group a by age parallel 10;
- c = foreach b {
- d = distinct a;
- generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d);
- };
- store c into ':OUTPATH:';\,
- },
- # sort by two columns
- {
- 'num' => 7,
- 'java_params' => ['-Dpig.accumulative.batchsize=5'],
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- b = group a by age parallel 10;
- c = foreach b {
- d = order a by gpa, name desc;
- generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.gpa), org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.name);
- };
- store c into ':OUTPATH:';\,
- },
- # sort, distinct mix (order by name, then distinct on gpa)
- {
- 'num' => 8,
- 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- b = group a by age parallel 10;
- c = foreach b {
- d = order a by name;
- e = d.gpa;
- f = distinct e;
- generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(f);
- };
- store c into ':OUTPATH:';\,
- },
- # sort, distinct mix (order by gpa, then distinct on the same column)
- {
- 'num' => 9,
- 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- b = group a by age parallel 10;
- c = foreach b {
- d = order a by gpa;
- e = d.gpa;
- f = distinct e;
- generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(f);
- };
- store c into ':OUTPATH:';\,
- },
- {
- # secondary sort boolean
- 'num' => 10,
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
- b = group a by age;
- c = foreach b {
- d = order a by instate;
- generate group, flatten(d);
- };
- store c into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
- b = group a by age;
- c = foreach b {
- d = order a by instate;
- generate group, flatten(d);
- };
- store c into ':OUTPATH:';\,
- }
- ]
- },
- {
- 'name' => 'Accumulator',
- 'tests' => [
- {
- 'num' => 1,
- 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
- 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- e = cogroup a by name, b by name parallel 8;
- f = foreach e generate group, SUM(a.age) as s;
- g = filter f by s>0;
- store g into ':OUTPATH:';\,
- },
- {
- 'num' => 2,
- 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
- 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
- e = group a by name parallel 8;
- f = foreach e generate group, COUNT(a), MAX(a.contributions), MIN(a.contributions) ;
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 3,
- 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
- 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
- e = group a by name parallel 8;
- f = foreach e generate group, (MAX(a.contributions)-MIN(a.contributions))*COUNT(a) ;
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 4,
- 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
- 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
- e = group a by name parallel 8;
- f = foreach e { g = distinct a.age; generate group, COUNT(g);}
- store f into ':OUTPATH:';\,
- },
- {
- 'num' => 5,
- 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=1'],
- 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
- register :FUNCPATH:/testudf.jar;
- DEFINE YesAccumulate org.apache.pig.TestingAccumulatorHelper('false');
- DEFINE NoAccumulate org.apache.pig.TestingAccumulatorHelper('true');
- b = foreach (group a all) generate COUNT(a) as ct, YesAccumulate(a) as yes_acc, NoAccumulate(a) as no_acc;
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
- b = foreach (group a all) generate COUNT(a) as ct;
- c = foreach b generate ct, ct as ct2, 0;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 6,
- 'java_params' => ['-Dpig.exec.nocombiner=true'],
- 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
- register :FUNCPATH:/testudf.jar;
- DEFINE YesAccumulate org.apache.pig.TestingAccumulatorHelper('false');
- DEFINE NoAccumulate org.apache.pig.TestingAccumulatorHelper('true');
- b = foreach (group a all) generate org.apache.pig.test.udf.evalfunc.NonAlgNonAccCount(a) as ct, YesAccumulate(a) as yes_acc, NoAccumulate(a) as no_acc;
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
- register :FUNCPATH:/testudf.jar;
- b = foreach (group a all) generate org.apache.pig.test.udf.evalfunc.NonAlgNonAccCount(a) as ct;
- c = foreach b generate ct, 1, 1;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 7,
- 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
- 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:long, registration, contributions);
- register :FUNCPATH:/testudf.jar;
- b = foreach (group a all) generate COUNT(a),
- org.apache.pig.test.udf.evalfunc.IteratingAccumulatorCount(a),
- org.apache.pig.test.udf.evalfunc.IteratingAccumulatorSum(a.age),
- org.apache.pig.test.udf.evalfunc.IteratingAccumulatorIsEmpty(a);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:long, registration, contributions);
- b = foreach (group a all) generate COUNT(a), SUM(a.age), IsEmpty(a);
- c = foreach b generate $0, *;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 8,
- 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
- 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:long, registration, contributions);
- register :FUNCPATH:/testudf.jar;
- b = foreach (group a all) generate org.apache.pig.test.udf.evalfunc.NonAlgNonAccCount(a),
- org.apache.pig.test.udf.evalfunc.IteratingAccumulatorCount(a),
- org.apache.pig.test.udf.evalfunc.IteratingAccumulatorSum(a.age),
- org.apache.pig.test.udf.evalfunc.IteratingAccumulatorIsEmpty(a);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:long, registration, contributions);
- register :FUNCPATH:/testudf.jar;
- b = foreach (group a all) generate org.apache.pig.test.udf.evalfunc.NonAlgNonAccCount(a), COUNT(a), SUM(a.age), IsEmpty(a);
- store b into ':OUTPATH:';\,
- },
- ]
- },
- {
- 'name' => 'PruneColumns',
- 'tests' => [
- {
- 'num' => 1,
- 'execonly' => 'mapred', # studenttab20m not available in local mode
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab20m' using PigStorage() as (name, age, gpa);
- b = foreach a generate age;
- store b into ':OUTPATH:';\,
- }
- ]
- },
- {
- 'name' => 'Bzip',
- 'tests' => [
- {
- # test reading and writing out files with .bz2 extension
- 'num' => 1,
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- store a into ':OUTPATH:.intermediate.bz2';
- b = load ':OUTPATH:.intermediate.bz2';
- store b into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- {
- # test reading and writing with .bz extension
- 'num' => 2,
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- store a into ':OUTPATH:.intermediate.bz';
- b = load ':OUTPATH:.intermediate.bz';
- store b into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- ]
- },
- {
- 'name' => 'Scalar',
- 'tests' => [
- {
- # test scalar in foreach (most common)
- 'num' => 1,
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = group a all;
- c = foreach b generate AVG(a.gpa) as avg, MAX(a.gpa) as max;
- y = foreach a generate name, (gpa - c.avg) / c.max;
- store y into ':OUTPATH:';\,
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- },
- {
- # test scalar in filter
- 'num' => 2,
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = group a all;
- c = foreach b generate AVG(a.gpa) as avg;
- y = filter a by gpa > c.avg;
- store y into ':OUTPATH:';\,
- },
- {
- # test scalar with two branches (scalar computed from one input is used to filter a second input)
- 'num' => 3,
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = group a all;
- c = foreach b generate AVG(a.age) as avg;
- x = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age, registration, contributions);
- y = filter x by age > c.avg;
- store y into ':OUTPATH:';\,
- },
- {
- # test with scalar from two inputs
- 'num' => 4,
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = group a all;
- c = foreach b generate AVG(a.age) as avg;
- d = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age, registration, contributions);
- e = group d all;
- f = foreach e generate AVG(d.age) as avg;
- y = foreach a generate age/c.avg, age/f.avg;
- store y into ':OUTPATH:';\,
- },
- ]
- },
- {
- 'name' => 'Scripting',
- 'tests' => [
- {
- # test integer square
- 'num' => 1,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate myfuncs.square(age);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate age * age;
- store b into ':OUTPATH:';\,
- },
- {
- # test string concat and referencing function without a namespace
- 'num' => 2,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa);
- b = foreach a generate concat(name) as name;
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
- b = foreach a generate CONCAT(name, name);
- store b into ':OUTPATH:';\,
- },
- {
- # test long and double square, plus two references to the same UDF with different schemas
- 'num' => 3,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:double);
- b = foreach a generate myfuncs.square(age), myfuncs.square(gpa);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate age * age, gpa * gpa;
- store b into ':OUTPATH:';\,
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- },
- {
- # test method with no schema decorator (ie, returns bytearray)
- 'num' => 4,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa);
- b = foreach a generate myfuncs.byteconcat(name);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = foreach a generate CONCAT(name, name);
- store b into ':OUTPATH:';\,
- },
- {
- # test method with complex types (map, tuple and bag arguments)
- 'num' => 5,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
- a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
- b = foreach a generate flatten(myfuncs.complexTypes(m, t, b)) as (mm, mt, mb);
- c = foreach b generate mm#'name', mt.$0, mb.$0;
- store c into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
- b = foreach a generate SIZE(m#'name'), t.$2, b.$2;
- store b into ':OUTPATH:';\,
- },
- {
- # test null input and output
- 'num' => 6,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
- a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate myfuncs.square(age);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate age * age;
- store b into ':OUTPATH:';\,
- },
- {
- # test functions that call other functions and include other files
- 'num' => 7,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate myfuncs.redirect(age);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate age * age;
- store b into ':OUTPATH:';\,
- },
- {
- # test that functions with same names resolve correctly across name spaces
- 'num' => 8,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
- register ':SCRIPTHOMEPATH:/python/morepythonudfs.py' using jython as morefuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate myfuncs.square(age), morefuncs.square(age);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate age * age, age * age * age;
- store b into ':OUTPATH:';\,
- },
- {
- # test a UDF that takes a bag as input (count over a grouped relation)
- 'num' => 9,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = group a by name;
- c = foreach b generate group, myfuncs.count(a);
- store c into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = group a by name;
- c = foreach b generate group, COUNT(a);
- store c into ':OUTPATH:';\,
- },
- {
- # test a UDF that takes a boolean argument
- 'num' => 10,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
- a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
- b = foreach a generate name, myfuncs.adjustgpa(gpa, instate);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
- b = foreach a generate name, (instate=='true'?gpa:gpa+1);
- store b into ':OUTPATH:';\,
- },
- {
- # test a UDF whose result is verified against 'true'/'false' (presumably a boolean return value)
- 'num' => 11,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- b = foreach a generate name, myfuncs.isretired(age);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
- b = foreach a generate name, (age>=60?'true':'false');
- store b into ':OUTPATH:';\,
- },
- {
- # jython udf which returns an array
- 'num' => 12,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:chararray, gpa:chararray);
- b = foreach a generate CONCAT(CONCAT(age, ' '), gpa) as sentence;
- c = foreach b generate flatten(myfuncs.tokenize(sentence));
- store c into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:chararray, gpa:chararray);
- b = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:chararray, gpa:chararray);
- c = foreach a generate age;
- d = foreach b generate gpa;
- e = union c, d;
- store e into ':OUTPATH:';\,
- }
- ]
- },
- {
- 'name' => 'RubyUDFs',
- 'tests' => [
- {
- # test integer square
- 'num' => 1,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate myfuncs.square(age);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate age * age;
- store b into ':OUTPATH:';\,
- },
- {
- # test string concat (function referenced through the myfuncs namespace)
- 'num' => 2,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa);
- b = foreach a generate myfuncs.concat(name, name);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
- b = foreach a generate CONCAT(name, name);
- store b into ':OUTPATH:';\,
- },
- {
- # test long and double square, plus two references to the same UDF with different schemas
- 'num' => 3,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:double);
- b = foreach a generate myfuncs.square(age), myfuncs.square(gpa);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate age * age, gpa * gpa;
- store b into ':OUTPATH:';\,
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- },
- {
- # test method with no schema decorator (ie, returns bytearray)
- 'num' => 4,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = foreach a generate myfuncs.byteconcat(name, name);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = foreach a generate CONCAT(name, name);
- store b into ':OUTPATH:';\,
- },
- {
- # test method with complex types
- 'num' => 5,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
- a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
- b = foreach a generate flatten(myfuncs.complexTypes(m, t, b)) as (mm, mt, mb);
- c = foreach b generate mm#'name', mt.$0, mb.$0;
- store c into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
- b = foreach a generate SIZE(m#'name'), t.$2, b.$2;
- store b into ':OUTPATH:';\,
- },
- {
- # test null input and output
- 'num' => 6,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
- a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate myfuncs.square(age);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate age * age;
- store b into ':OUTPATH:';\,
- },
- {
- # test functions that call other functions and include other files
- 'num' => 7,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate myfuncs.redirect(age);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate age * age;
- store b into ':OUTPATH:';\,
- },
- {
- # test that functions with same names resolve correctly across name spaces
- 'num' => 8,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
- register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as morefuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate myfuncs.square(age), morefuncs.cube(age), morefuncs.CUBE(age);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate age * age, age * age * age, age * age * age;
- store b into ':OUTPATH:';\,
- },
- {
- # test algebraic functions
- 'num' => 9,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = group a by name;
- c = foreach b generate group, myfuncs.Count(a);
- store c into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = group a by name;
- c = foreach b generate group, COUNT(a);
- store c into ':OUTPATH:';\,
- },
- {
- # test accumulator functions
- 'num' => 10,
- 'java_params' => ['-Dpig.accumulative.batchsize=5'],
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = group a by name;
- c = foreach b generate group, myfuncs.Sum(a.age), myfuncs.Sum(a.gpa);
- d = foreach c generate $0, $1, (double)((int)$2*100)/100;
- store d into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = group a by name;
- c = foreach b generate group, SUM(a.age), SUM(a.gpa);
- d = foreach c generate $0, $1, (double)((int)$2*100)/100;
- store d into ':OUTPATH:';\,
- },
- {
- # test a UDF called with (name, age) whose flattened result is expected
- # to equal the two arguments in swapped order, i.e. (age, name)
- 'num' => 11,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate flatten(myfuncs.reverse(name, age));
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach a generate age, name;
- store b into ':OUTPATH:';\,
- },
- {
- # test a UDF used as a filter predicate; the verify script expects
- # exactly the rows whose age is even (age%2==0)
- 'num' => 12,
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = filter a by myfuncs.ISEVEN(age);
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = filter a by age%2==0;
- store b into ':OUTPATH:';\,
- },
- {
- # test a UDF applied to the single bag produced by "group a all", run with
- # pig.accumulative.batchsize=5; the result is compared against the Java
- # AppendIndex UDF from testudf.jar.
- # NOTE(review): presumably this exercises the accumulator code path (same
- # java_params as the accumulator test above) -- confirm.
- 'num' => 13,
- 'java_params' => ['-Dpig.accumulative.batchsize=5'],
- 'pig' => q\
- register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as myfuncs;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach (group a all) generate FLATTEN(myfuncs.AppendIndex(a));
- store b into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
- b = foreach (group a all) generate FLATTEN(org.apache.pig.test.udf.evalfunc.AppendIndex(a));
- store b into ':OUTPATH:';\,
- },
- ]
- },
- {
- 'name' => 'Native',
- 'tests' => [
- {
- # test common
- 'num' => 1,
- 'pig' => q\
- rmf table_testNativeMRJobSimple_input
- rmf table_testNativeMRJobSimple_output
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = mapreduce ':MAPREDJARS:/hadoop-examples.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`;
- store b into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- {
- # test complex
- 'num' => 2,
- 'pig' => q\
- rmf table_testNativeMRJobSimple_input
- rmf table_testNativeMRJobSimple_output
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = foreach a generate name;
- c = distinct b;
- d = mapreduce ':MAPREDJARS:/hadoop-examples.jar' Store c into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`;
- e = order d by name;
- store e into ':OUTPATH:';\,
- 'sortArgs' => ['-t', ' '],
- 'notmq' => 1,
- },
- {
- # test streaming
- 'num' => 3,
- 'pig' => q\
- rmf table_testNativeMRJobSimple_input
- rmf table_testNativeMRJobSimple_output
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = mapreduce ':MAPREDJARS:/hadoop-streaming.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `-input table_testNativeMRJobSimple_input -output table_testNativeMRJobSimple_output -mapper /bin/cat -reducer /usr/bin/wc`;
- store b into ':OUTPATH:';\,
- 'pig23' => q\
- rmf table_testNativeMRJobSimple_input
- rmf table_testNativeMRJobSimple_output
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = mapreduce ':MAPREDJARS:/hadoop-0.23.0-streaming.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `-input table_testNativeMRJobSimple_input -output table_testNativeMRJobSimple_output -mapper /bin/cat -reducer /usr/bin/wc`;
- store b into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- ]
- },
- {
- 'name' => 'Partitioner',
- 'tests' => [
- {
- # test group with a custom partitioner (SimpleCustomPartitioner2) and parallel 2
- # NOTE(review): the trailing comment on 'execonly' mentions a join, but this
- # script only does a group -- confirm the real reason for mapred-only.
- 'num' => 1,
- 'execonly' => 'mapred', # since this join will run out of memory in local mode
- 'pig' => q\register :FUNCPATH:/testudf.jar;
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa);
- b = group a by age PARTITION BY org.apache.pig.test.utils.SimpleCustomPartitioner2 parallel 2;
- c = foreach b generate group, COUNT(a);
- store c into ':OUTPATH:';\,
- },
- ]
- },
- {
- ####################################################################
- # SUB : CastScalar
- # FEATURE: adds functionality that allows to cast elements of a single-tuple relation into a scalar value.
- # JIRA: Pig-1434
- #
- # TEST ITEMS:
- # 1 Test syntax
- # 2 Test scalar for simple data type
- # 3 Test scalar for complex data type: tuple, bag, map
- # 4 Test implicit cast
- # 5 Test explicit cast
- # 6 Positional parameter
- # 7 Cast within an aggregate function
- # 8 Cast within an UDF function
- # 9 Cast with a FOREACH
- # 10 Cast with a FILTER
- # 11 Cast with a SPLIT
- # 12 Cast in a JOIN
- # 13 Multiquery
- # 14 Cast on a schema that cannot be inferred should result in bytearray
- # 15 Replicated Join
- # 16 Test operations such as R1 * (int)R1
- # 17 CheckSingular(*)
- # 18 missing field in scalar file
- # 19 scalar referenced from an empty file
- # 20 empty input directory
- # 21 Single row vs Multiple Row
- # 22 Cast on a multi-field tuple
- # 23 Reference a non-scalar as a scalar
- # 24 Test multiple loaders
- 'name' => 'CastScalar',
- 'tests' => [
- {
- # 2 Test scalar for simple data type
- # 3 Test scalar for complex data type: tuple, bag, map
- # 9 Cast with a FOREACH
- #INPATH = /user/hadoopqa/pig/tests/data
- 'num' => 1,
- 'pig' => q#
- a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
- b = group a all;
- c = foreach b generate SUM(a.age) as total;
- d = foreach a generate name, age+(double)c.total as d_sum;
- e = order d by name, d_sum;
- store d into ':OUTPATH:';
- #,
- # 6 Positional parameter
- }, {
- 'num' => 2,
- 'pig' => q#
- a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
- b = group a all;
- c = foreach b generate SUM(a.age) as total;
- d = foreach a generate name, age+(double)c.$0 as d_sum;
- e = order d by name, d_sum;
- store d into ':OUTPATH:';
- #,
- # 2 Test scalar for simple data type
- # 3 Test scalar for complex data type:map
- # 9 Cast with a FOREACH
- # 13 Multiquery
- # 24 Test multiple loaders
- #INPATH = /user/hadoopqa/pig/tests/data
- }, {
- # 4 Test implicit cast
- # 10 Cast with a FILTER
- #
- # I set the benchmark to use "19" because pig truncates during cast and sql rounds up.
- 'num' => 7,
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, AVG(a.gpa)+20 as avg_gpa;
- d = order c by avg_gpa;
- simple_scalar = limit d 1;
- f = filter a by age < (int) simple_scalar.avg_gpa;
- g = order f by name, age, gpa;
- store g into ':OUTPATH:';\,
- }, {
- # 5 Test explicit cast
- # 10 Cast with a FILTER
- 'num' => 8,
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, AVG(a.age) AS average;
- d = order c by average;
- simple_scalar = limit d 1;
- d = filter a by age > (int) simple_scalar.average;
- e = foreach d generate name, age;
- store e into ':OUTPATH:';
- \,
- }, {
- # 5 Test explicit cast
- # 6 Positional parameter
- 'num' => 9,
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, AVG(a.age) AS average;
- d = order c by average;
- simple_scalar = limit d 1;
- d = filter a by age > (int) simple_scalar.$1;
- e = foreach d generate name, age;
- store e into ':OUTPATH:';
- \,
- }, {
- # 4 Test implicit cast
- # 6 Positional parameter
- # 10 Cast with a FILTER
- 'num' => 10,
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, AVG(a.age) AS average;
- d = order c by average;
- simple_scalar = limit d 1;
- d = filter a by age > simple_scalar.$1;
- e = foreach d generate name, age;
- store e into ':OUTPATH:';
- \,
- }, {
- # 4 Test implicit cast
- # 6 Positional parameter
- # 11 Cast with a SPLIT
- 'num' => 11,
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = group a by name;
- c = foreach b generate group, AVG(a.age) AS average;
- d = order c by average;
- simple_scalar = limit d 1;
- split a into X1 if age > (int) simple_scalar.$1, X2 if age < 20;
- split a into X3 if age > (int) simple_scalar.$1, X4 if age > 70;
- store X1 into ':OUTPATH:.1';
- store X2 into ':OUTPATH:.2';
- store X3 into ':OUTPATH:.3';
- store X4 into ':OUTPATH:.4';
- \,
- }, {
- # 4 Test implicit cast
- # 6 Positional parameter
- # 12 Cast with a JOIN
- 'num' => 12,
- 'pig' => q\
- a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
- b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
- c = filter a by age < 20;
- d = filter b by age < 20;
- simple_scalar = limit d 1;
- e = join c by name, d by name;
- f= filter e by c::age <(int)simple_scalar.age;
- store f into ':OUTPATH:';\,
- },
- ]
- },{
-
- 'name' => 'udf_TOBAGandTOTUPLE',
-
- 'sortResults' => 1,
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'tests' => [
- {
- # TEST : resulting schema for TOBAG/TOTUPLE with simple types
- # TEST : resulting schema for TOBAG/TOTUPLE with positional parameters
- # TEST : resulting schema for various projects using a combination of TOBAG/TOTUPLE and standard projections
- # TEST : resulting schema for various projects using a combination of TOBAG/TOTUPLE using AS clause
- 'num' => 1
- ,'pig' => q?
- A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
- B = limit A 10;
- Gen1 = FOREACH B GENERATE $0, $1, $2 ;
- GroupById = GROUP B BY id;
- B1 = foreach B generate TOBAG( intnum1000, id, intnum5);
- B3 = foreach B generate TOBAG( $0, $1, $2);
- T1= foreach B generate TOTUPLE( intnum1000, id, intnum5);
- T2= foreach B generate TOTUPLE( $0, $1, $2);
- T3 = foreach B generate TOTUPLE( $0, $0, $0);
- T4= foreach B generate TOBAG($0, $1, $2), TOTUPLE($3, $4, $5), $6, $7;
- T5= foreach B generate $0, $1, TOTUPLE($2, $3, $4), TOBAG($5, $6), $7;
- T6= foreach B generate $0, TOTUPLE($0, $0, $0), TOBAG($0, $0), $0 AS duplicate;
- describe Gen1;
- describe GroupById;
- describe B1;
- describe B3;
- describe T1;
- describe T2;
- describe T3;
- describe T4;
- describe T5;
- describe T6;
- ?
- # NOTE(review): 'expected_out_regex' is given eight times in this hash. Perl keeps
- # only the last value for a repeated key, so only the T6 pattern survives; the
- # B1..T5 patterns are silently dropped.
- # If they are ever merged into one check, note that T1..T5 use unescaped
- # parentheses, which a regex treats as groups rather than literal "(" and ")".
- ,'expected_out_regex' => 'B1: {{int}}'
- ,'expected_out_regex' => 'B3: {{int}}'
- ,'expected_out_regex' => 'T1: {org.apache.pig.builtin.totuple_id_.*: (intnum1000: int,id: int,intnum5: int)}'
- ,'expected_out_regex' => 'T2: {org.apache.pig.builtin.totuple_id_.*: (intnum1000: int,id: int,intnum5: int)}'
- ,'expected_out_regex' => 'T3: {org.apache.pig.builtin.totuple_intnum1000.*: (intnum1000: int,intnum1000: int,intnum1000: int)}'
- ,'expected_out_regex' => 'T4: {{int},org.apache.pig.builtin.totuple_intnum100.*: (intnum100: int,intnum: int,longnum: long),floatnum: float,doublenum: double}'
- ,'expected_out_regex' => 'T5: {intnum1000: int,id: int,org.apache.pig.builtin.totuple_intnum100.*: (intnum5: int,intnum100: int,intnum: int).*{NULL}.*doublenum: double}'
- ,'expected_out_regex' => "T6: {intnum1000: int,org.apache.pig.builtin.totuple_intnum1000.*: \\(intnum1000: int,intnum1000: int,intnum1000: int\\),{\\(int\\)},duplicate: int}"
- }, {
- # TEST : bag of mixed data types
- # TEST : Order
- # TEST : positional parameters
- 'num' => 2
- ,'pig' => q?
- A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
- C = foreach A generate TOBAG( id, floatnum, doublenum );
- D = foreach A generate TOBAG( id, intnum);
- E = foreach A generate TOBAG( (float) id,floatnum );
- F = foreach A generate TOBAG( (long) id,longnum );
- G = foreach A generate TOBAG( (double) id,doublenum );
- describe C;
- describe D;
- describe E;
- describe F;
- describe G;
- ?
- # NOTE(review): 'expected_out_regex' is given five times in this hash. Perl keeps
- # only the last value for a repeated key, so only the G pattern survives; the
- # C, D, E and F patterns are silently dropped.
- ,'expected_out_regex' => 'C: {{\\(NULL\\)}}'
- ,'expected_out_regex' => 'D: {{\\(int\\)}}'
- ,'expected_out_regex' => 'E: {{\\(float\\)}}'
- ,'expected_out_regex' => 'F: {{\\(long\\)}}'
- ,'expected_out_regex' => 'G: {{\\(double\\)}}'
- }, {
- # TEST : TOBAG/TOTUPLE with simple types
- # TEST : TOBAG/TOTUPLE with positional parameters
- # TEST : various projects using a combination of TOBAG/TOTUPLE and standard projections
- # TEST : various projects using a combination of TOBAG/TOTUPLE using AS clause
- 'num' => 3
- ,'pig' => q?
- A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
- B = limit A 10;
- B1 = foreach B generate TOBAG( intnum1000, id, intnum5);
- B2 = foreach B generate TOBAG( $0, $1, $2);
- T1= foreach B generate TOTUPLE( intnum1000, id, intnum5);
- T2= foreach B generate TOTUPLE( $0, $1, $2);
- T3 = foreach B generate TOTUPLE( $0, $0, $0);
- T4= foreach B generate TOBAG($0, $1, $2), TOTUPLE($3, $4, $5), $6, $7;
- T5= foreach B generate $0, $1, TOTUPLE($2, $3, $4), TOBAG($5, $6), $7;
- T6= foreach B generate $0, TOTUPLE($0, $0, $0), TOBAG($0, $0), $0 AS duplicate;
- Gen1 = FOREACH B GENERATE $0, $1, $2 ;
- GroupById = GROUP B BY id;
- store Gen1 into ':OUTPATH:.1';
- store GroupById into ':OUTPATH:.2';
- store B1 into ':OUTPATH:.3';
- store B2 into ':OUTPATH:.4';
- store T1 into ':OUTPATH:.5';
- store T2 into ':OUTPATH:.6';
- store T3 into ':OUTPATH:.7';
- store T4 into ':OUTPATH:.8';
- ?
- }, {
- # TEST : cast for TOTUPLE/TOBAG
- 'num' => 4
- ,'ignore' => 1 # different error message for different version of hadoop
- ,'pig' => q?
- A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
- B= limit A 10;
- C = foreach B generate $0, TOTUPLE((int) $0, (long) $0, (double) $0), TOBAG( (float) $0, (chararray) $0), $0;
- store C into ':OUTPATH:';
- ?
- ,'expected_err_regex' => 'ERROR 1108: Duplicate schema alias'
- ,'rc' => 6
- }, {
- # TEST : cast for TOTUPLE/TOBAG
- 'num' => 5
- ,'pig' => q?
- A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
- B= limit A 1;
- C = foreach B generate $0, TOTUPLE((int) $0);
- D = foreach B generate $0, TOTUPLE((long) $0);
- E = foreach B generate $0, TOTUPLE((double) $0);
- F = foreach B generate $0, TOTUPLE((float) $0);
- G = foreach B generate $0, TOTUPLE((chararray) $0);
- store B into ':OUTPATH:.1';
- store C into ':OUTPATH:.2';
- store D into ':OUTPATH:.3';
- store E into ':OUTPATH:.4';
- store F into ':OUTPATH:.5';
- store G into ':OUTPATH:.6';
- ?
- }, {
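- #TEST more complicated nested functions such as TOTUPLE(TOBAG())
- #TEST more complicated nested functions such as TOBAG(TOTUPLE())
- #TEST more complicated nested functions such as TOTUPLE(TOTUPLE())
- #TEST more complicated nested functions such as TOBAG(TOBAG())
- # NOTE(review): bint and tinb in this script are the same expression, and no
- # statement in this test actually nests TOTUPLE inside TOBAG.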
- 'num' => 6
- ,'pig' => q?
- A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
- B = limit A 10;
- tint = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) );
- bint = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) );
- binb = foreach B generate TOBAG( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) );
- tinb = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) );
- store B into ':OUTPATH:.1';
- store tint into ':OUTPATH:.2';
- store bint into ':OUTPATH:.3';
- store binb into ':OUTPATH:.4';
- store tinb into ':OUTPATH:.5';
- ?
- }, {
- #TEST arithmetic operation in TOTUPLE and TOBAG
- #TEST aggregate function - NOT IMPLEMENTED
- #TEST tuple with 50+ items
- #TEST with null
- 'num' => 7
- ,'pig' => q?
- A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
- B = limit A 10;
- B1= foreach B generate TOTUPLE( $1, $2, $3);
- T1= foreach B generate TOTUPLE( $1, $2, $3);
- R1= foreach B generate TOTUPLE( $1, $0+1, $0+2, $0+3),TOBAG($0+4, $0+1 );
- R2= foreach B generate TOTUPLE( $0, $1, $2, $3, $4, $5, $6, $7, (int) 8, (int) 9 , $1, $2, $3, $4, $5, $6, $7, (int) 19, (int) 20, $0, $1, $2, $3, $4, $5, $6, $7 , (int) 29, (int) 30, $0, $1, $2, $3, $4, $5, $6, $7, (int) 39, (int) 40 , $1, $2, $3, $4, $5, $6, $7, (int) 19, (int) 20, $0, $1, $2, $3, $4, $5, $5, $7 );
- R3= foreach B generate $0, TOTUPLE(0,0,0), TOBAG( 0, 0 );
- R4= foreach B generate $0, TOTUPLE(null, id, null), TOBAG( id, null, id,null );
- describe R1;
- describe R2;
- describe R3;
- describe R4;
- store B into ':OUTPATH:.1';
- store B1 into ':OUTPATH:.2';
- store R1 into ':OUTPATH:.3';
- store R2 into ':OUTPATH:.4';
- store R3 into ':OUTPATH:.5';
- store R4 into ':OUTPATH:.6';
- ?
- }, {
- # TEST more TOTUPLE and TOBAG nested combinations
- 'num' => 8
- ,'pig' => q?
- A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
- B = limit A 10;
- C = foreach B generate TOBAG( $0, $1, $2);
- T1= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) );
- T2= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) );
- T3= foreach B generate TOBAG( TOTUPLE( $1, $2, $3), TOTUPLE($4,$5), TOTUPLE($6,$7));
- store B into ':OUTPATH:.1';
- store C into ':OUTPATH:.2';
- store T1 into ':OUTPATH:.3';
- store T2 into ':OUTPATH:.4';
- store T3 into ':OUTPATH:.5';
- ?
- ,'verify_pig_script' => q?register :FUNCPATH:/testudf.jar;
- A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
- B = limit A 10;
- C = foreach B generate TOBAG( $0, $1, $2);
- T1= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) );
- T2= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) );
- T3= foreach B generate org.apache.pig.test.udf.evalfunc.TOBAG2( TOTUPLE( $1, $2, $3), TOTUPLE($4,$5), TOTUPLE($6,$7));
- store B into ':OUTPATH:.1';
- store C into ':OUTPATH:.2';
- store T1 into ':OUTPATH:.3';
- store T2 into ':OUTPATH:.4';
- store T3 into ':OUTPATH:.5';
- ?
- }, {
- #TEST negative test case: out of bounds positional parameter
- # EVERYTHING IS CORRECT
- 'num' => 9
- ,'ignore' => 1 # different error message for different version of hadoop
- ,'pig' => q?
- A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
- B = limit A 10;
- C = foreach B generate $0, $1, TOTUPLE($2, $998, $4), TOBAG($5, $6), $7;
- ?
- ,'expected_err_regex' => 'Out of bound access.*non-existent column: 998'
- }, {
- #TEST negative test case: out of bounds positional parameter
- # EVERYTHING IS CORRECT
- 'num' => 10
- ,'ignore' => 1 # different error message for different version of hadoop
- ,'pig' => q?
- A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
- B = limit A 10;
- C = foreach B generate $0, $1, TOBAG($5, $999), $7;
- ?
- ,'expected_err_regex' => 'Out of bound access.*non-existent column: 999'
- },
- ] # end of tests
- },{
- 'name' => 'ToStuffSyntaxSugar',
- 'tests' => [
- {
- #TEST TOTUPLE syntax sugar
- 'num' => 1,
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = foreach A generate (name, age);
- store B into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = foreach A generate TOTUPLE(name, age);
- store B into ':OUTPATH:';\,
- }, {
- #TEST TOBAG syntax sugar
- 'num' => 2,
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = foreach A generate {name, age};
- store B into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = foreach A generate TOBAG(name, age);
- store B into ':OUTPATH:';\,
- }, {
- #TEST TOMAP syntax sugar
- 'num' => 3,
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = foreach A generate [name, age];
- store B into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = foreach A generate TOMAP(name, age);
- store B into ':OUTPATH:';\,
- }, {
- #TEST verify single element inside parenthesis does NOT call TOTUPLE
- 'num' => 4,
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = foreach A generate (age) + 1;
- store B into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = foreach A generate (age + 1);
- store B into ':OUTPATH:';\,
- }
- ] # end of tests
- },{
-
- 'name' => 'MergeOperator',
- 'tests' => [
- {
- # Test Union using merge where schema is identical | A&B have identical schema
- 'num' => 1,
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- C = union onschema A, B;
- store C into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- C = union A, B;
- store C into ':OUTPATH:';\,
- },{
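- # Test Union using merge with type promotions, int->long and float->double
- # NOTE(review): the verify script builds D with the (long)/(double) casts but
- # stores C, so D is unused -- confirm whether "store D" was intended.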
- 'num' => 2,
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa:double);
- C = union onschema A, B;
- store C into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- C = union A, B;
- D = foreach C generate name, (long)age, (double)gpa;
- store C into ':OUTPATH:';\,
- },{
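- # Test Union using merge with type promotions, int->float
- # NOTE(review): the verify script builds D with the (float) cast but stores C,
- # so D is unused -- confirm whether "store D" was intended.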
- 'num' => 3,
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float);
- C = union onschema A, B;
- store C into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float);
- B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float);
- C = union A, B;
- D = foreach C generate name, (float)age, gpa;
- store C into ':OUTPATH:';\,
- },{
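- # Test Union using merge with type promotions, int->double
- # NOTE(review): the verify script builds D with the (double) cast but stores C,
- # so D is unused -- confirm whether "store D" was intended.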
- 'num' => 4,
- 'floatpostprocess' => 1,
- 'delimiter' => ' ',
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float);
- C = union onschema A, B;
- store C into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float);
- B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float);
- C = union A, B;
- D = foreach C generate name, (double)age, gpa;
- store C into ':OUTPATH:';\,
- },{
- # Test Union of an intersection
- 'num' => 5,
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:float);
- C = union onschema A, B;
- store C into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- register :FUNCPATH:/testudf.jar;
- define Nil org.apache.pig.test.udf.evalfunc.Nil();
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:float);
- C = foreach A generate name, age, (chararray)gpa, Nil(), Nil();
- D = foreach B generate name, age, Nil(), registration, (chararray)contributions;
- E = union C, D;
- store E into ':OUTPATH:';\,
- },
- {
- # Test Union where the intersection is null
- 'num' => 6,
- 'pig' => q\
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = load ':INPATH:/singlefile/textdoc' as (line:chararray);
- C = union onschema A, B;
- store C into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- register :FUNCPATH:/testudf.jar;
- define Nil org.apache.pig.test.udf.evalfunc.Nil();
- A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
- B = load ':INPATH:/singlefile/textdoc' as (line:chararray);
- C = foreach A generate name, (chararray)age, (chararray)gpa, Nil(name);
- D = foreach B generate Nil(line), Nil(line), Nil(line), line;
- E = union C, D;
- store E into ':OUTPATH:';\,
- },
- {
- # Test Union using merge where schema is identical | A&B have identical schema
- 'num' => 7,
- 'pig' => q\
- a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
- b = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
- C = union onschema a, b;
- store C into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
- b = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
- C = union a, b;
- store C into ':OUTPATH:';\,
- }
- ]
- },
- {
- # Test a UDF (Udfcachetest) that depends on the distributed cache
- 'name' => 'UdfDistributedCache',
- 'tests' => [
- {
- 'num' => 1,
- 'execonly' => 'mapred', # since distributed cache is not supported in local mode
- 'pig' => q?
- register :FUNCPATH:/testudf.jar;
- define udfdc org.apache.pig.test.udf.evalfunc.Udfcachetest(':INPATH:/singlefile/votertab10k#foodle');
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = limit a 1;
- c = foreach b generate udfdc(age);
- STORE c into ':OUTPATH:';?,
- 'verify_pig_script' => q?
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = limit a 1;
- c = foreach b generate 'tom van buren', 68, 'socialist', 390.19;
- STORE c into ':OUTPATH:';?,
- },
- ]
- }, {
- 'name' => 'MonitoredUDF',
- 'tests' => [
- {
- 'num' => 1,
- 'ignore23' => 'guava version of Pig is higher than hadoop 23',
- 'pig' => q?register :FUNCPATH:/testudf.jar;
- define gm org.apache.pig.test.udf.evalfunc.GoodMonitored();
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate gm(name);
- store b into ':OUTPATH:';?,
- 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = foreach a generate 'fred';
- store b into ':OUTPATH:';?,
- },{
- # NOTE(review): both scripts store b, not c, so the relation that calls
- # bad(name) is never stored and this test only compares "limit a 1".
- # Presumably "store c" was intended -- confirm against BadMonitored's behaviour.
- 'num' => 2,
- 'pig' => q?register :FUNCPATH:/testudf.jar;
- define bad org.apache.pig.test.udf.evalfunc.BadMonitored();
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = limit a 1;
- c = foreach b generate bad(name);
- store b into ':OUTPATH:';?,
- 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = limit a 1;
- c = foreach b generate '';
- store b into ':OUTPATH:';?,
- },{
- 'num' => 3,
- 'pig' => q?register :FUNCPATH:/testudf.jar;
- define bad org.apache.pig.test.udf.evalfunc.BadMonitored();
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = limit a 1;
- c = foreach b generate bad(name);
- store b into ':OUTPATH:';?,
- 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = limit a 1;
- c = foreach b generate 'barney';
- store b into ':OUTPATH:';?,
- }
- ],
- },{
- 'name' => 'MergeSparseJoin', # each test sorts both inputs, stores the right side via piggybank IndexedStorage, joins using 'merge-sparse', and is verified against a plain join
- 'tests' => [
- # Simplest merge-sparse-join.
- {
- 'num' => 1,
- 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
- a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- c = order a by $0;
- d = order b by $0;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
- exec;
- e = load ':OUTPATH:.intermediate1';
- f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
- g = join e by $0, f by $0 using 'merge-sparse';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- g = join a by $0, b by $0;
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- # Merge-sparse-join with left-side filter
- {
- 'num' => 2,
- 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
- a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- c = order a by $0;
- d = order b by $0;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
- exec;
- e = load ':OUTPATH:.intermediate1';
- h = filter e by $1 > 30;
- f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
- g = join h by $0, f by $0 using 'merge-sparse';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- h = filter a by $1 > 30;
- g = join h by $0, b by $0;
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- # Merge-sparse-join with right-side filter
- {
- 'num' => 3,
- 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
- a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- c = order a by $0;
- d = order b by $0;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
- exec;
- e = load ':OUTPATH:.intermediate1';
- f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
- i = filter f by $2 != 'democrat';
- g = join e by $0, i by $0 using 'merge-sparse';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- i = filter b by $2 != 'democrat';
- g = join a by $0, i by $0;
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- # Merge-sparse-join with a composite key ($0,$1)
- {
- 'num' => 4,
- 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
- a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- c = order a by $0,$1;
- d = order b by $0,$1;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0,1');
- exec;
- e = load ':OUTPATH:.intermediate1';
- f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0,1');
- g = join e by ($0,$1), f by ($0,$1) using 'merge-sparse';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
- b = load ':INPATH:/singlefile/votertab10k';
- g = join a by ($0,$1), b by ($0,$1);
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- # Merge-sparse-join with nulls in keys and data.
- {
- 'num' => 5,
- 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
- a = load ':INPATH:/singlefile/studentnulltab10k';
- b = load ':INPATH:/singlefile/voternulltab10k';
- c = order a by $0;
- d = order b by $0;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
- exec;
- e = load ':OUTPATH:.intermediate1';
- f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
- g = join e by $0, f by $0 using 'merge-sparse';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
- b = load ':INPATH:/singlefile/voternulltab10k';
- g = join a by $0, b by $0;
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- },
- # Merge-sparse-join with join on numeric key
- {
- 'num' => 6, # NOTE(review): IndexedStorage is given index column '0' although the sort/join key is age (second field) - confirm
- 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
- a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
- c = order a by age;
- d = order b by age;
- store c into ':OUTPATH:.intermediate1';
- store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
- exec;
- e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float);
- f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0') as (name:chararray, age:int, reg:chararray, contrib:float);
- g = join e by age, f by age using 'merge-sparse';
- store g into ':OUTPATH:';\,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
- b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
- g = join a by age, b by age;
- store g into ':OUTPATH:';\,
- 'notmq' => 1,
- }
- ],
- },{
- 'name' => 'BugFix', # regression tests; each test names its JIRA issue in the comment above 'num'
- 'tests' => [
- {
- # PIG-2286
- 'num' => 1, # COR over two columns; the verify script runs the same query with pig.exec.nocombiner set to true
- 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double, gpa:double);
- B = group A all;
- C = foreach B generate group, COR(A.age, A.gpa);
- store C into ':OUTPATH:';?,
- 'verify_pig_script' => q?set pig.exec.nocombiner true
- A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double);
- B = group A all;
- C = foreach B generate group, COR(A.age, A.gpa);
- store C into ':OUTPATH:';?,
- }, {
- # PIG-2286, with 3 inputs to COR
- 'num' => 2, # same nocombiner comparison as num 1, with gpa*gpa added as a third COR input
- 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double);
- B = foreach A generate age, gpa, gpa*gpa as gpa2;
- C = group B all;
- D = foreach C generate group, COR(B.age, B.gpa, B.gpa2);
- store D into ':OUTPATH:';?,
- 'verify_pig_script' => q?set pig.exec.nocombiner true
- A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double);
- B = foreach A generate age, gpa, gpa*gpa as gpa2;
- C = group B all;
- D = foreach C generate group, COR(B.age, B.gpa, B.gpa2);
- store D into ':OUTPATH:';?,
- }, {
- # PIG-2385
- 'num' => 3, # Z1.avg is used as a scalar inside a foreach; results are stored to both .1 and (after DISTINCT) .2
- 'pig_params' => ['-M'], # presumably turns off multi-query execution - TODO confirm against the pig command-line help
- 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
- Z = group A all;
- Z1 = foreach Z generate AVG(A.gpa) as avg;
- B = foreach A generate name, age, gpa-Z1.avg as diff;
- STORE B INTO ':OUTPATH:.1';
- C = DISTINCT B ;
- store C into ':OUTPATH:.2';?,
- 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
- Z = group A all;
- Z1 = foreach Z generate AVG(A.gpa) as avg;
- B = cross A, Z1;
- B1 = foreach B generate name, age, gpa-Z1.avg as diff;
- STORE B1 INTO ':OUTPATH:.1';
- C = DISTINCT B1 ;
- store C into ':OUTPATH:.2';?,
- }, {
- # PIG-2576
- 'num' => 4, # no verify script: this test is judged by 'rc' and the two output regexes below
- 'execonly' => 'mapred',
- 'pig' => q?register :FUNCPATH:/testudf.jar;
- define printconf org.apache.pig.test.udf.evalfunc.UdfContextFrontend('dummy');
- a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
- b = limit a 1;
- c = foreach b generate printconf(name);
- store c into ':OUTPATH:';
- fs -ls;
- ?,
- 'rc' => 0,
- 'not_expected_out_regex' => "checkJobConf: conf is null: false",
- 'expected_out_regex' => "checkJobConf: conf is null: true",
- }
- ],
- },{
- 'name' => 'Bloom', # each test builds a bloom filter with BuildBloom, stores it under :HDFSTMP:, then filters with Bloom
- 'execonly' => 'mapred', # distributed cache does not work in local mode
- 'tests' => [
- {
- 'num' => 1, # JENKINS_HASH with 'fixed', '128', '3'; the bloom-filtered rows are stored directly
- 'pig' => "define bb BuildBloom('Hash.JENKINS_HASH', 'fixed', '128', '3');
- A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
- B = filter A by name == 'alice allen';
- C = group B all;
- D = foreach C generate bb(B.name);
- store D into ':HDFSTMP:/mybloom_1';
- exec;
- define bloom Bloom(':HDFSTMP:/mybloom_1');
- E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
- F = filter E by bloom(name);
- store F into ':OUTPATH:';",
- 'notmq' => 1,
- 'verify_pig_script' => "
- A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double);
- B = filter A by name == 'alice allen';
- store B into ':OUTPATH:';",
- }, {
- 'num' => 2, # MURMUR_HASH variant; the bloom-filtered rows are then joined with votertab10k by name
- 'pig' => "define bb BuildBloom('Hash.MURMUR_HASH', 'fixed', '128', '3');
- A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
- B = filter A by name == 'alice allen';
- C = group B all;
- D = foreach C generate bb(B.name);
- store D into ':HDFSTMP:/mybloom_2';
- exec;
- define bloom Bloom(':HDFSTMP:/mybloom_2');
- E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
- F = filter E by bloom(name);
- G = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
- H = join F by name, G by name;
- store H into ':OUTPATH:';",
- 'notmq' => 1,
- 'verify_pig_script' => "
- A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double);
- B = filter A by name == 'alice allen';
- C = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
- D = join B by name, C by name;
- store D into ':OUTPATH:';",
- },{
- 'num' => 3, # three-argument BuildBloom form ('1', '0.0001'); the filtered rows are the second input of a 'repl' join
- 'pig' => "define bb BuildBloom('Hash.JENKINS_HASH', '1', '0.0001');
- A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
- B = filter A by name == 'alice allen';
- C = group B all;
- D = foreach C generate bb(B.name);
- store D into ':HDFSTMP:/mybloom_3';
- exec;
- define bloom Bloom(':HDFSTMP:/mybloom_3');
- E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
- F = filter E by bloom(name);
- G = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
- H = join G by name, F by name using 'repl';
- store H into ':OUTPATH:';",
- 'notmq' => 1,
- 'verify_pig_script' => "
- A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double);
- B = filter A by name == 'alice allen';
- C = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
- D = join C by name, B by name;
- store D into ':OUTPATH:';",
- }
- ],
- },{
- 'name' => 'UDFContext', # applies UDFContextTestUDF to an untyped column and to a chararray column, then unions the results
- 'tests' => [
- {
- # See PIG-2338
- 'num' => 1, # verify script expects the literal strings '{a0: bytearray}' and '{c0: chararray}' from the two UDF calls
- 'pig' => q?register :FUNCPATH:/testudf.jar
- a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
- b = foreach a generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(a0);
- c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray);
- d = foreach c generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(c0);
- e = union b, d;
- store e into ':OUTPATH:';?,
- 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
- b = foreach a generate '{a0: bytearray}';
- c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray);
- d = foreach c generate '{c0: chararray}';
- e = union b, d;
- store e into ':OUTPATH:';?,
- }
- ],
-
- },{
- 'name' => 'UDFContextAuto', # NOTE(review): pig and verify scripts are identical to the test in the 'UDFContext' group - confirm what this group adds
- 'tests' => [
- {
- # See PIG-2337
- 'num' => 1, # verify script expects the literal strings '{a0: bytearray}' and '{c0: chararray}' from the two UDF calls
- 'pig' => q?register :FUNCPATH:/testudf.jar
- a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
- b = foreach a generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(a0);
- c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray);
- d = foreach c generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(c0);
- e = union b, d;
- store e into ':OUTPATH:';?,
- 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
- b = foreach a generate '{a0: bytearray}';
- c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray);
- d = foreach c generate '{c0: chararray}';
- e = union b, d;
- store e into ':OUTPATH:';?,
- }
- ],
- },{
- 'name' => 'JsonLoaderStorage', # round trip: store with JsonStorage, reload with JsonLoader, compare with the original input
- 'tests' => [
- {
- 'num' => 1, # single relation written to .intermediate and read back
- 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
- store A into ':OUTPATH:.intermediate' using JsonStorage();
- exec
- A = LOAD ':OUTPATH:.intermediate' using JsonLoader();
- store A into ':OUTPATH:';?,
- 'notmq' => 1,
- 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
- store A into ':OUTPATH:';?,
- }, {
- 'num' => 2, # two relations round-tripped through JSON, then joined by name
- 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
- store A into ':OUTPATH:.intermediate1' using JsonStorage();
- B = LOAD ':INPATH:/singlefile/votertab10k' AS (name:chararray, age:int, registration:chararray, contributions:double);
- store B into ':OUTPATH:.intermediate2' using JsonStorage();
- exec
- A = LOAD ':OUTPATH:.intermediate1' using JsonLoader();
- B = LOAD ':OUTPATH:.intermediate2' using JsonLoader();
- C = JOIN A by name, B by name;
- store C into ':OUTPATH:';?,
- 'notmq' => 1,
- 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
- B = LOAD ':INPATH:/singlefile/votertab10k' AS (name:chararray, age:int, registration:chararray, contributions:double);
- C = JOIN A by name, B by name;
- store C into ':OUTPATH:';?,
- }, {
- 'num' => 3, # round trip with instate declared boolean; the verify script declares the same column as chararray
- 'ignore' => 1, # PIG-2594
- 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
- store a into ':OUTPATH:.intermediate' using JsonStorage();
- exec
- B = LOAD ':OUTPATH:.intermediate' using JsonLoader();
- store B into ':OUTPATH:';\,
- 'notmq' => 1,
- 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
- store a into ':OUTPATH:';\,
- }
- ],
- },{
- 'name' => 'STRSPLIT', # single-argument STRSPLIT on a column loaded without a declared type
- 'tests' => [
- {
- # See PIG-2311
- 'num' => 1, # the verify script casts a0 to chararray explicitly before calling STRSPLIT
- 'pig' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
- b= filter a by NOT (a0 is null);
- c= foreach b generate STRSPLIT(a0);
- store c into ':OUTPATH:';?,
- 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
- b= filter a by NOT (a0 is null);
- b= foreach b generate (chararray)a0 as a0 ;
- c= foreach b generate STRSPLIT(a0);
- store c into ':OUTPATH:';?,
- }
- ],
- },
- {
- 'name' => 'Tokenize', # TOKENIZE with its default delimiters and with an explicit delimiter argument
- 'tests' => [
- {
- 'num' => 1, # one-argument TOKENIZE on the first column; no verify_pig_script is given for this test
- 'pig' => q\
- A = LOAD ':INPATH:/singlefile/studenttab10k';
- B = foreach A generate TOKENIZE($0);
- store B into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # TOKENIZE on the second column with '9' as delimiter; verify replaces '9' by ',' and uses one-argument TOKENIZE
- 'pig' => q\
- A = LOAD ':INPATH:/singlefile/studenttab10k';
- B = foreach A generate TOKENIZE($1,'9');
- store B into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = LOAD ':INPATH:/singlefile/studenttab10k';
- -- TOKENIZE has tokens hardcoded so have to replace the '9' with
- -- one of the hardcoded tokens
- B = foreach A generate TOKENIZE(REPLACE($1, '9', ','));
- store B into ':OUTPATH:';\,
- }
- ]
- }, {
- 'name' => 'Realias', # assigning one alias to another (B = A)
- 'tests' => [
- {
- 'num' => 1, # storing B must give the same output as storing A directly
- 'pig' => q\
- A = LOAD ':INPATH:/singlefile/studenttab10k';
- B = A;
- store B into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = LOAD ':INPATH:/singlefile/studenttab10k';
- store A into ':OUTPATH:';\,
- }
- ]
- },
- {
- 'name' => 'NestedForEach', # operators nested inside a foreach block over grouped data, verified against the flat equivalent
- 'tests' => [
- {
- 'num' => 1, # a foreach nested inside the outer foreach, flattened afterwards
- 'pig' => q\
- A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- B = group A by name;
- C = foreach B {
- C1 = foreach A generate UPPER(name), age+1 as age, gpa;
- generate C1;
- }
- D = foreach C generate flatten(C1);
- store D into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- B = foreach A generate UPPER(name), age+1, gpa;
- store B into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # nested projection, filter, foreach and order chained in one block; NOTE(review): the load statement ends with ';;' - confirm the extra semicolon is intended
- 'pig' => q\
- A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);;
- B = group A by name;
- C = foreach B {
- C1 = A.age;
- C2 = filter C1 by age>=30;
- C3 = foreach C2 generate age+1 as age;
- C4 = order C3 by age desc;
- generate C4;
- }
- D = foreach C generate flatten(C4);
- store D into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- B = filter A by age>=30;
- C = foreach B generate age+1 as age;
- D = order C by age desc;
- store D into ':OUTPATH:';\,
- }
- ]
- },
- {
- 'name' => 'BagToTuple', # BagToTuple on bags built with TOBAG, verified against tuples built without it
- 'tests' => [
- {
- # Basic test of converting a bag to a tuple. Builds a two-tuple bag from the rows of bag b with the highest and lowest age.
- 'num' => 1, # verify script builds the expected six-field tuple directly with TOTUPLE
- 'pig' => q\
- a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
- filterA = filter a by b is not null and COUNT(b) > 1;
- b = foreach filterA {
- order_desc = order b by age desc;
- limit_desc = limit order_desc 1;
- order_asc = order b by age asc;
- limit_asc = limit order_asc 1;
- generate FLATTEN(limit_desc), FLATTEN(limit_asc);
- };
- c = foreach b generate TOTUPLE(limit_desc::name, limit_desc::age, limit_desc::gpa) as first_t,
- TOTUPLE(limit_asc::name, limit_asc::age, limit_asc::gpa) as second_t;
- d = foreach c generate TOBAG(first_t, second_t) as n_bag;
- e = foreach d generate BagToTuple(n_bag);
- store e into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
- filterA = filter a by b is not null and COUNT(b) > 1;
- b = foreach filterA {
- order_desc = order b by age desc;
- limit_desc = limit order_desc 1;
- order_asc = order b by age asc;
- limit_asc = limit order_asc 1;
- generate FLATTEN(limit_desc), FLATTEN(limit_asc);
- };
- c = foreach b generate TOTUPLE(limit_desc::name, limit_desc::age, limit_desc::gpa, limit_asc::name,limit_asc::age, limit_asc::gpa) as big_t;
- store c into ':OUTPATH:';\,
- },
- {
- # Convert an existing tuple to a bag and use the output of BagToTuple
- 'num' => 2, # verify script simply emits the original tuple t
- 'pig' => q\
- a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
- b = filter a by t is not null;
- c = foreach b generate TOBAG(t) as newBag;
- d = foreach c generate BagToTuple(newBag);
- store d into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
- b = filter a by t is not null;
- c = foreach b generate t;
- store c into ':OUTPATH:';\,
- },
- ]
- },
- {
- 'name' => 'BagToString', # one-argument BagToString, verified against nested CONCAT calls that join the fields with '_'
- 'tests' => [
- {
- 'num' => 1, # bag of two tuples taken from the rows of bag b with the highest and lowest age
- 'pig' => q\
- a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
- filterA = filter a by b is not null and COUNT(b) > 1;
- b = foreach filterA {
- order_desc = order b by age desc;
- limit_desc = limit order_desc 1;
- order_asc = order b by age asc;
- limit_asc = limit order_asc 1;
- generate FLATTEN(limit_desc), FLATTEN(limit_asc);
- };
- c = foreach b generate TOTUPLE(limit_desc::name, limit_desc::age, limit_desc::gpa) as first_t,
- TOTUPLE(limit_asc::name, limit_asc::age, limit_asc::gpa) as second_t;
- d = foreach c generate TOBAG(first_t, second_t) as n_bag;
- e = foreach d generate BagToString(n_bag);
- store e into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
- filterA = filter a by b is not null and COUNT(b) > 1;
- b = foreach filterA {
- order_desc = order b by age desc;
- limit_desc = limit order_desc 1;
- order_asc = order b by age asc;
- limit_asc = limit order_asc 1;
- generate FLATTEN(limit_desc), FLATTEN(limit_asc);
- };
- c = foreach b generate CONCAT(limit_desc::name, CONCAT('_', CONCAT((chararray)limit_desc::age, CONCAT('_', CONCAT((chararray)limit_desc::gpa, CONCAT('_',CONCAT(limit_asc::name,CONCAT('_',CONCAT((chararray)limit_asc::age, CONCAT('_',(chararray)limit_asc::gpa)))))))))) as big_t;
- store c into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # single-tuple bag built from the existing tuple t
- 'pig' => q\
- a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
- b = filter a by t is not null;
- c = foreach b generate TOBAG(t) as newBag;
- d = foreach c generate BagToString(newBag);
- store d into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
- b = filter a by t is not null;
- c = foreach b generate CONCAT(t.name, CONCAT('_', CONCAT((chararray)t.age, CONCAT('_', (chararray)t.gpa))));
- store c into ':OUTPATH:';\,
- },
- ]
- },
- {
- 'name' => 'NestedCross', # cross used inside a foreach block over a cogroup, verified against a JOIN by name
- 'tests' => [
- {
- 'num' => 1, # plain nested cross of the two cogrouped bags
- 'pig' => q\
- A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
- C = cogroup A by name, B by name;
- D = foreach C {
- C1 = cross A, B;
- generate flatten(C1);
- }
- store D into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
- C = JOIN A by name, B by name;
- store C into ':OUTPATH:';\,
- },
- {
- 'num' => 2, # nested filters on both bags, then cross, then a nested foreach that concatenates gpa and contributions
- 'pig' => q\
- A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
- C = cogroup A by name, B by name;
- D = foreach C {
- C1 = filter A by gpa > 4;
- C2 = filter B by contributions > 500;
- C3 = cross C1, C2;
- C4 = foreach C3 generate CONCAT(CONCAT((chararray)gpa, '_'), (chararray)contributions);
- generate flatten(C4);
- }
- store D into ':OUTPATH:';\,
- 'verify_pig_script' => q\
- A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
- B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
- C = filter A by gpa > 4;
- D = filter B by contributions > 500;
- E = JOIN C by name, D by name;
- F = foreach E generate CONCAT(CONCAT((chararray)gpa, '_'), (chararray)contributions);
- store F into ':OUTPATH:';\,
- }
- ]
- }
- ],
- },
- ;