PageRenderTime 84ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 1ms

/test/e2e/pig/tests/nightly.conf

https://github.com/ftian/pig
Perl | 4983 lines | 4534 code | 111 blank | 338 comment | 102 complexity | baec46f614bd4cfd568cf24e6026f5d7 MD5 | raw file
Possible License(s): Apache-2.0
  1. #!/usr/bin/env perl
  2. ############################################################################
  3. # Licensed to the Apache Software Foundation (ASF) under one or more
  4. # contributor license agreements. See the NOTICE file distributed with
  5. # this work for additional information regarding copyright ownership.
  6. # The ASF licenses this file to You under the Apache License, Version 2.0
  7. # (the "License"); you may not use this file except in compliance with
  8. # the License. You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. ###############################################################################
  18. # Nightly tests for pig.
  19. #
  20. #
  21. #PigSetup::setup();
  22. #my $me = `whoami`;
  23. #chomp $me;
  24. $cfg = {
  25. 'driver' => 'Pig',
  26. 'nummachines' => 5,
  27. 'verify_with_pig' => 1,
  28. 'verify_pig_version' => 'old',
  29. 'groups' => [
  30. {
  31. 'name' => 'Checkin',
  32. 'tests' => [
  33. {
  34. 'num' => 1,
  35. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  36. store a into ':OUTPATH:';\,
  37. },
  38. {
  39. 'num' => 2,
  40. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  41. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  42. c = filter a by age < 50;
  43. d = filter b by age < 50;
  44. e = cogroup c by (name, age), d by (name, age) ;
  45. f = foreach e generate flatten(c), flatten(d);
  46. g = group f by registration;
  47. h = foreach g generate group, SUM(f.d::contributions);
  48. i = order h by $1;
  49. store i into ':OUTPATH:';\,
  50. 'floatpostprocess' => 1,
  51. 'delimiter' => ' ',
  52. 'sortArgs' => ['-t', ' ', '-k', '2,2'],
  53. }
  54. ]
  55. },
  56. {
  57. 'name' => 'LoaderDefaultDir',
  58. 'tests' => [
  59. {
  60. 'num' => 1,
  61. 'pig' => q\a = load ':INPATH:/dir/studenttab10k' as (name, age, gpa);
  62. store a into ':OUTPATH:';\,
  63. },
  64. ]
  65. },
  66. {
  67. 'name' => 'LoaderPigStorageArg',
  68. 'tests' => [
  69. {
  70. 'num' => 1,
  71. 'pig' => q\a = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa);
  72. store a into ':OUTPATH:';\,
  73. },
  74. {
  75. # load with control character
  76. 'num' => 2,
  77. 'pig' => q#a = load ':INPATH:/singlefile/studentctrla10k' using PigStorage('\\u0001') as (name, age, gpa);
  78. store a into ':OUTPATH:';#,
  79. },
  80. {
  81. # load and store with control character
  82. 'num' => 3,
  83. 'pig' => q#a = load ':INPATH:/singlefile/studentctrla10k' using PigStorage('\\u0001') as (name, age, gpa);
  84. store a into ':OUTPATH:.intermediate' using PigStorage('\\u0001');
  85. b = load ':OUTPATH:.intermediate' using PigStorage('\\u0001') as (name, age, gpa);
  86. store b into ':OUTPATH:'; #,
  87. 'notmq' => 1,
  88. },
  89. ]
  90. },
  91. {
  92. # Results doctored; if you change this query, you need to copy the
  93. # expected results into test/nightly/benchmarks.
  94. 'name' => 'LoaderBinStorage',
  95. 'tests' => [
  96. {
  97. 'num' => 1,
  98. 'pig' => q\register :FUNCPATH:/testudf.jar;
  99. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  100. b = foreach a generate name, org.apache.pig.test.udf.evalfunc.Swap(name, age), TOKENIZE((chararray)name), org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, age);
  101. store b into ':OUTPATH:.intermediate' using BinStorage();
  102. c = load ':OUTPATH:.intermediate' using BinStorage();
  103. store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
  104. 'notmq' => 1,
  105. },
  106. ]
  107. },
  108. {
  109. # Results doctored; if you change this query, you need to copy the
  110. # expected results into test/nightly/benchmarks.
  111. 'name' => 'LoaderTextLoader',
  112. 'tests' => [
  113. {
  114. 'num' => 1,
  115. 'pig' => q\register :FUNCPATH:/testudf.jar;
  116. a = load ':INPATH:/singlefile/textdoc' using TextLoader();
  117. b = foreach a generate TOKENIZE((chararray)$0);
  118. store b into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
  119. },
  120. ]
  121. },
  122. {
  123. 'name' => 'FilterBoolean',
  124. 'tests' => [
  125. {
  126. 'num' => 1,
  127. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  128. b = filter a by name == 'fred allen' and age > 50;
  129. store b into ':OUTPATH:' using PigStorage;\,
  130. },
  131. {
  132. 'num' => 2,
  133. 'pig' => q\a = load ':INPATH:/dir/studenttab10k' using PigStorage() as (name, age, gpa);
  134. b = filter a by name != 'fred allen' or age < 10;
  135. store b into ':OUTPATH:' using PigStorage;\,
  136. },
  137. {
  138. 'num' => 3,
  139. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  140. b = filter a by not (age == 50);
  141. store b into ':OUTPATH:' using PigStorage;\,
  142. },
  143. {
  144. 'num' => 4,
  145. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  146. b = filter a by (age >= 50 or name > 'fred') and (gpa <= 3.0 or name >= 'bob');
  147. store b into ':OUTPATH:' using PigStorage;\,
  148. },
  149. {
  150. 'num' => 5,
  151. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  152. b = filter a by age >= 50 or name > 'fred' and gpa <= 3.0 or name >= 'bob';
  153. store b into ':OUTPATH:' using PigStorage;\,
  154. },
  155. # test filter <= and >= for chararray, int and double
  156. {
  157. 'num' => 6,
  158. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  159. b = filter a by age >= 40 and age <=50 and gpa >= 2.0 and gpa <= 3.0 and name >= 'bob' and name <= 'fred';
  160. store b into ':OUTPATH:' using PigStorage;\,
  161. },
  162. # test filter <= and >= for bytearray, long and float
  163. {
  164. 'num' => 7,
  165. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float);
  166. b = filter a by age >= 40 and age <=50 and gpa >= 2.0f and gpa <= 3.0f and name >= 'bob' and name <= 'fred';
  167. store b into ':OUTPATH:' using PigStorage;\,
  168. },
  169. # test filter < and > for chararray, int and double
  170. {
  171. 'num' => 8,
  172. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  173. b = filter a by age > 40 and age <50 and gpa > 2.0 and gpa < 3.0 and name > 'bob' and name < 'fred';
  174. store b into ':OUTPATH:' using PigStorage;\,
  175. },
  176. # test filter < and > for bytearray, long and float
  177. {
  178. 'num' => 9,
  179. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float);
  180. b = filter a by age > 40 and age <50 and gpa > 2.0f and gpa < 3.0f and name > 'bob' and name < 'fred';
  181. store b into ':OUTPATH:' using PigStorage;\,
  182. },
  183. # test filter <= and >= for explicit cast for chararray, int and double
  184. {
  185. 'num' => 10,
  186. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  187. b = filter a by (int)age >= 40 and (int)age <=50 and (double)gpa >= 2.0 and (double)gpa <= 3.0 and (chararray)name >= 'bob' and (chararray)name <= 'fred';
  188. store b into ':OUTPATH:' using PigStorage;\,
  189. },
  190. # test filter <= and >= for explicit cast for bytearray, long and float
  191. {
  192. 'num' => 11,
  193. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  194. b = filter a by (long)age >= 40 and (long)age <=50 and (float)gpa >= 2.0f and (float)gpa <= 3.0f and name >= 'bob' and name <= 'fred';
  195. store b into ':OUTPATH:' using PigStorage;\,
  196. },
  197. # test filter < and > for explicit cast for chararray, int and double
  198. {
  199. 'num' => 12,
  200. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  201. b = filter a by (int)age > 40 and (int)age <50 and (double)gpa > 2.0 and (double)gpa < 3.0 and (chararray)name > 'bob' and (chararray)name < 'fred';
  202. store b into ':OUTPATH:' using PigStorage;\,
  203. },
  204. # test filter < and > for explicit cast for bytearray, long and float
  205. {
  206. 'num' => 13,
  207. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  208. b = filter a by (long)age > 40 and (long)age <50 and (float)gpa > 2.0f and (float)gpa < 3.0f and name > 'bob' and name < 'fred';
  209. store b into ':OUTPATH:' using PigStorage;\,
  210. },
  211. # test AND with nulls
  212. {
  213. 'num' => 14,
  214. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);
  215. b = filter a by name == 'fred allen' and age > 50;
  216. store b into ':OUTPATH:' using PigStorage;\,
  217. },
  218. # test OR with nulls
  219. {
  220. 'num' => 15,
  221. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);
  222. b = filter a by name != 'fred allen' or age < 10;
  223. store b into ':OUTPATH:' using PigStorage;\,
  224. },
  225. # test with nulls filter <= and >= for chararray, int and double
  226. {
  227. 'num' => 16,
  228. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  229. b = filter a by age >= 40 and age <=50 and gpa >= 2.0 and gpa <= 3.0 and name >= 'bob' and name <= 'fred';
  230. store b into ':OUTPATH:' using PigStorage;\,
  231. },
  232. # test with nulls filter < and > for explicit cast for chararray, int and double
  233. {
  234. 'num' => 17,
  235. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);
  236. b = filter a by (int)age > 40 and (int)age <50 and (double)gpa > 2.0 and (double)gpa < 3.0 and (chararray)name > 'bob' and (chararray)name < 'fred';
  237. store b into ':OUTPATH:' using PigStorage;\,
  238. },
  239. {
  240. 'num' => 18,
  241. 'ignore' => 1, # PIG-2593: this case is not supported, as instate needs to be declared as boolean
  242. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  243. b = filter a by instate;
  244. store b into ':OUTPATH:' using PigStorage;\,
  245. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  246. b = filter a by instate == 'true';
  247. store b into ':OUTPATH:' using PigStorage;\,
  248. },
  249. {
  250. 'num' => 19,
  251. 'ignore' => 1, # PIG-2593: this case is not supported, as instate needs to be declared as boolean
  252. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  253. b = filter a by not instate;
  254. store b into ':OUTPATH:' using PigStorage;\,
  255. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  256. b = filter a by instate == 'false';
  257. store b into ':OUTPATH:' using PigStorage;\,
  258. },
  259. {
  260. 'num' => 20,
  261. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  262. b = filter a by instate is null;
  263. store b into ':OUTPATH:' using PigStorage;\,
  264. },
  265. {
  266. 'num' => 21,
  267. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  268. b = filter a by instate == true;
  269. store b into ':OUTPATH:' using PigStorage;\,
  270. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  271. b = filter a by instate == 'true';
  272. store b into ':OUTPATH:' using PigStorage;\,
  273. },
  274. {
  275. 'num' => 22,
  276. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  277. b = filter a by instate == false;
  278. store b into ':OUTPATH:' using PigStorage;\,
  279. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  280. b = filter a by instate == 'false';
  281. store b into ':OUTPATH:' using PigStorage;\,
  282. },
  283. {
  284. 'num' => 23,
  285. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  286. b = filter a by instate;
  287. store b into ':OUTPATH:' using PigStorage;\,
  288. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  289. b = filter a by instate == 'true';
  290. store b into ':OUTPATH:' using PigStorage;\,
  291. },
  292. {
  293. 'num' => 24,
  294. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  295. b = filter a by not instate;
  296. store b into ':OUTPATH:' using PigStorage;\,
  297. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  298. b = filter a by instate == 'false';
  299. store b into ':OUTPATH:' using PigStorage;\,
  300. },
  301. {
  302. 'num' => 25,
  303. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  304. b = filter a by instate is null;
  305. store b into ':OUTPATH:' using PigStorage;\,
  306. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  307. b = filter a by instate is null;
  308. store b into ':OUTPATH:' using PigStorage;\,
  309. },
  310. {
  311. 'num' => 26,
  312. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  313. b = filter a by instate == true;
  314. store b into ':OUTPATH:' using PigStorage;\,
  315. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  316. b = filter a by instate == 'true';
  317. store b into ':OUTPATH:' using PigStorage;\,
  318. },
  319. {
  320. 'num' => 27,
  321. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  322. b = filter a by instate == false;
  323. store b into ':OUTPATH:' using PigStorage;\,
  324. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  325. b = filter a by instate == 'false';
  326. store b into ':OUTPATH:' using PigStorage;\,
  327. },
  328. ],
  329. },
  330. {
  331. 'name' => 'FilterEq',
  332. 'tests' => [
  333. {
  334. 'num' => 1,
  335. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  336. b = filter a by name == 'alice johnson' and age == 64 and gpa == 3.99;
  337. store b into ':OUTPATH:' using PigStorage;\,
  338. },
  339. {
  340. 'num' => 2,
  341. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  342. b = filter a by name > 'fred allen' and age > 40 and gpa > 2.50;
  343. store b into ':OUTPATH:' using PigStorage;\,
  344. },
  345. {
  346. 'num' => 3,
  347. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  348. b = filter a by name >= 'fred allen' and age >= 40 and gpa >= 2.50;
  349. store b into ':OUTPATH:' using PigStorage;\,
  350. },
  351. {
  352. 'num' => 4,
  353. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  354. b = filter a by name lt 'fred allen' and age < 40 and gpa < 2.50;
  355. store b into ':OUTPATH:' using PigStorage;\,
  356. },
  357. {
  358. 'num' => 5,
  359. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  360. b = filter a by name lte 'fred allen' and age <= 40 and gpa <= 2.50;
  361. store b into ':OUTPATH:' using PigStorage;\,
  362. },
  363. {
  364. 'num' => 6,
  365. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage();
  366. b = filter a by $0 neq 'fred allen' and $1 != '40' and $2 != '2.50';
  367. store b into ':OUTPATH:' using PigStorage;\,
  368. },
  369. # test for filter == for chararray, int and double
  370. {
  371. 'num' => 7,
  372. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  373. b = filter a by name == 'fred allen' and age == 61 and gpa == 1.42;
  374. store b into ':OUTPATH:' using PigStorage;\,
  375. },
  376. # test for filter == for bytearray, long and float
  377. {
  378. 'num' => 8,
  379. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float);
  380. b = filter a by name == 'fred allen' and age == 61 and gpa == 1.42f;
  381. store b into ':OUTPATH:' using PigStorage;\,
  382. },
  383. # test for filter != for chararray, int and double
  384. {
  385. 'num' => 9,
  386. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  387. b = filter a by $0 != 'fred allen' and $1 != 40 and $2 != 2.50;
  388. store b into ':OUTPATH:' using PigStorage;\,
  389. },
  390. # test for filter != for bytearray, long and float
  391. {
  392. 'num' => 10,
  393. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float);
  394. b = filter a by $0 != 'fred allen' and $1 != 40 and $2 != 2.50f;
  395. store b into ':OUTPATH:' using PigStorage;\,
  396. },
  397. # test for filter == for explicit casts to chararray, int and double
  398. {
  399. 'num' => 11,
  400. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  401. b = filter a by (chararray)name == 'fred allen' and (int)age == 61 and (double)gpa == 1.42;
  402. store b into ':OUTPATH:' using PigStorage;\,
  403. },
  404. # test for filter == for explicit casts to bytearray, long and float
  405. {
  406. 'num' => 12,
  407. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  408. b = filter a by name == 'fred allen' and (long)age == 61 and (float)gpa == 1.42f;
  409. store b into ':OUTPATH:' using PigStorage;\,
  410. },
  411. # test for filter != for explicit casts to chararray, int and double
  412. {
  413. 'num' => 13,
  414. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() ;
  415. b = filter a by (chararray)$0 != 'fred allen' and (int)$1 != 40 and (double)$2 != 2.50;
  416. store b into ':OUTPATH:' using PigStorage;\,
  417. },
  418. # test for filter != for explicit casts to bytearray, long and float
  419. {
  420. 'num' => 14,
  421. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() ;
  422. b = filter a by $0 != 'fred allen' and (long)$1 != 40 and (float)$2 != 2.50f;
  423. store b into ':OUTPATH:' using PigStorage;\,
  424. },
  425. ]
  426. },
  427. {
  428. 'name' => 'FilterMatches',
  429. 'tests' => [
  430. {
  431. 'num' => 1,
  432. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  433. b = filter a by name matches '^fred.*';
  434. store b into ':OUTPATH:' using PigStorage;\,
  435. },
  436. {
  437. 'num' => 2,
  438. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage();
  439. b = filter a by not $0 matches '^fred.*';
  440. store b into ':OUTPATH:' using PigStorage;\,
  441. },
  442. {
  443. # test for filter on matches for chararray (declared and explicit cast)
  444. 'num' => 3,
  445. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);
  446. b = filter a by name matches '^fred.*' and (chararray)registration matches '^dem.*';
  447. store b into ':OUTPATH:' using PigStorage;\,
  448. },
  449. {
  450. 'num' => 4,
  451. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);
  452. b = filter a by name matches 'f.ed' and (chararray)registration matches 'd.m';
  453. store b into ':OUTPATH:' using PigStorage;\,
  454. },
  455. {
  456. 'num' => 5,
  457. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);
  458. b = filter a by name matches 'f[^f]ed.*';
  459. store b into ':OUTPATH:' using PigStorage;\,
  460. },
  461. {
  462. 'num' => 6,
  463. 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches '.*\\\\wan.*';\nstore b into ':OUTPATH:' using PigStorage;",
  464. },
  465. {
  466. 'num' => 7,
  467. 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches '^e.*\\\\sc.*';\nstore b into ':OUTPATH:' using PigStorage;",
  468. },
  469. {
  470. 'num' => 8,
  471. 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches 'ethan white';\nstore b into ':OUTPATH:' using PigStorage;",
  472. },
  473. {
  474. 'num' => 9,
  475. 'pig' => "a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);\nb = filter a by gpa matches '\\\\d\\\\.45';\nstore b into ':OUTPATH:' using PigStorage;",
  476. },
  477. ]
  478. },
  479. {
  480. 'name' => 'FilterUdf',
  481. 'tests' => [
  482. {
  483. 'num' => 1,
  484. 'pig' => q\
  485. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  486. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  487. c = cogroup a by (name, age), b by (name, age);
  488. d = filter c by not IsEmpty(a);
  489. e = filter d by not IsEmpty(b);
  490. f = foreach e generate flatten(a), flatten(b);
  491. store f into ':OUTPATH:';\,
  492. },
  493. {
  494. 'num' => 2,
  495. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  496. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  497. c = filter a by age < 50;
  498. d = filter b by age < 50;
  499. e = cogroup c by (name, age), d by (name, age);
  500. f = filter e by COUNT(c)> 0 AND COUNT(d)>0;
  501. store f into ':OUTPATH:';\,
  502. 'rc' => 0
  503. },
  504. ]
  505. },
  506. # TODO: Groups that don't flatten via Agg functions
  507. {
  508. 'name' => 'GroupAggFunc',
  509. 'tests' => [
  510. {
  511. 'num' => 1,
  512. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  513. b = group a by name;
  514. c = foreach b generate group, COUNT(a.age);
  515. store c into ':OUTPATH:';\,
  516. },
  517. {
  518. 'num' => 2,
  519. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  520. b = group a by $0;
  521. c = foreach b generate group, COUNT(a.$1);
  522. store c into ':OUTPATH:';\,
  523. },
  524. {
  525. 'num' => 3,
  526. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  527. b = group a by (name, age);
  528. c = foreach b generate group.name, group.age, COUNT(a.gpa);
  529. store c into ':OUTPATH:';\,
  530. },
  531. {
  532. 'num' => 5,
  533. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  534. b = group a all;
  535. c = foreach b generate COUNT(a.$0);
  536. store c into ':OUTPATH:';\,
  537. },
  538. {
  539. 'num' => 6,
  540. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  541. b = group a by name;
  542. c = foreach b generate group, SUM(a.age);
  543. store c into ':OUTPATH:';\,
  544. },
  545. {
  546. 'num' => 7,
  547. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  548. b = group a by name;
  549. c = foreach b generate group, SUM(a.gpa);
  550. store c into ':OUTPATH:';\,
  551. 'floatpostprocess' => 1,
  552. 'delimiter' => ' ',
  553. },
  554. {
  555. 'num' => 8,
  556. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  557. b = group a by name;
  558. c = foreach b generate group, AVG(a.age);
  559. store c into ':OUTPATH:';\,
  560. },
  561. {
  562. 'num' => 9,
  563. 'ignore23' => 'I cannot get it right due to float precision, temporarily disable',
  564. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  565. b = group a by name;
  566. c = foreach b generate group, AVG(a.gpa);
  567. store c into ':OUTPATH:';\,
  568. 'floatpostprocess' => 1,
  569. 'delimiter' => ' ',
  570. },
  571. {
  572. 'num' => 10,
  573. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  574. b = group a by name;
  575. c = foreach b generate group, MIN(a.gpa);
  576. store c into ':OUTPATH:';\,
  577. },
  578. {
  579. 'num' => 11,
  580. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  581. b = group a by name;
  582. c = foreach b generate group, MAX(a.gpa);
  583. store c into ':OUTPATH:';\,
  584. },
  585. {
  586. 'num' => 12,
  587. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  588. b = group a by (name, age);
  589. c = foreach b generate flatten(group), SUM(a.gpa);
  590. store c into ':OUTPATH:';\,
  591. 'floatpostprocess' => 1,
  592. 'delimiter' => ' ',
  593. },
  594. {
  595. 'num' => 13,
  596. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  597. b = group a by (name);
  598. c = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  599. d = cogroup b by group, c by name;
  600. e = foreach d generate flatten(group), SUM(c.gpa), COUNT(c.name);
  601. store e into ':OUTPATH:';\,
  602. 'floatpostprocess' => 1,
  603. 'delimiter' => ' ',
  604. },
  605. {
  606. 'num' => 14,
  607. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  608. b = group a by (name);
  609. e = foreach b generate COUNT(a.name);
  610. store e into ':OUTPATH:';\,
  611. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  612. b = group a by (name);
  613. e = foreach b generate COUNT(a.name);
  614. store e into ':OUTPATH:';\,
  615. }
  616. ],
  617. },
  618. {
  619. 'name' => 'MapPartialAgg',
  620. 'tests' => [
  621. {
  622. 'num' => 1,
  623. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  624. b = group a by name;
  625. c = foreach b generate group, COUNT(a.age);
  626. store c into ':OUTPATH:';\,
  627. 'java_params' => ['-Dpig.exec.mapPartAgg=true']
  628. },
  629. {
  630. #multiquery with group in one sub query
  631. 'num' => 2,
  632. 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  633. b = filter a by age < 22; store b into ':OUTPATH:.1';
  634. c = group b by age;
  635. d = foreach c generate group, SUM(b.gpa);
  636. store d into ':OUTPATH:.2'; #,
  637. 'java_params' => ['-Dpig.exec.mapPartAgg=true']
  638. },
  639. {
  640. #multi query with two group on diff columns
  641. 'num' => 3,
  642. 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  643. g1 = group a by name;
  644. f1 = foreach g1 generate group as name, MAX(a.gpa);
  645. store f1 into ':OUTPATH:.1';
  646. g2 = group a by age;
  647. f2 = foreach g2 generate group as age, AVG(a.gpa);
  648. store f2 into ':OUTPATH:.2'; #,
  649. 'java_params' => ['-Dpig.exec.mapPartAgg=true']
  650. },
  651. {
  652. #multi query with three groups on diff columns, group key being an expression
  653. 'num' => 4,
  654. 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  655. g1 = group a by name;
  656. f1 = foreach g1 generate group as name, MAX(a.gpa);
  657. store f1 into ':OUTPATH:.1';
  658. g2 = group a by age%10;
  659. f2 = foreach g2 generate group as age_mod10, AVG(a.gpa);
  660. store f2 into ':OUTPATH:.2';
  661. g3 = group a by age;
  662. f3 = foreach g3 generate group%10, AVG(a.gpa);
  663. store f3 into ':OUTPATH:.3';
  664. g4 = group a by gpa;
  665. f4 = foreach g4 generate group as gpa, COUNT(a);
  666. store f4 into ':OUTPATH:.4';
  667. #,
  668. 'java_params' => ['-Dpig.exec.mapPartAgg=true']
  669. },
  670. {
  671. #aggregation gets more than one tuple for every tuple from load func
  672. 'num' => 5,
  673. 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  674. b = foreach a generate name, age, gpa, flatten(TOBAG(age,age)) as x;
  675. c = group b by age;
  676. d = foreach c generate group, AVG(b.gpa);
  677. store d into ':OUTPATH:'; #,
  678. 'java_params' => ['-Dpig.exec.mapPartAgg=true']
  679. },
  680. ],
  681. },
  682. {
  683. 'name' => 'EvalFunc',
  684. 'tests' => [
  685. {
  686. 'num' => 1,
  687. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  688. b = filter a by name lt 'b';
  689. c = foreach b generate ARITY(name, age, gpa);
  690. store c into ':OUTPATH:';\,
  691. },
  692. {
  693. 'num' => 2,
  694. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa);
  695. b = filter a by name lt 'b';
  696. c = foreach b generate TOKENIZE(name);
  697. d = foreach c generate flatten($0);
  698. store d into ':OUTPATH:';\,
  699. },
  700. {
  701. 'num' => 3,
  702. 'pig' => q\register :FUNCPATH:/testudf.jar;
  703. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  704. b = filter a by name lt 'b';
  705. c = foreach b generate org.apache.pig.test.udf.evalfunc.Swap(name, age);
  706. store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
  707. },
  708. {
  709. 'num' => 4,
  710. 'pig' => q\register :FUNCPATH:/testudf.jar;
  711. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  712. b = filter a by name lt 'b';
  713. c = foreach b generate org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, age);
  714. store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
  715. },
  716. {
  717. 'num' => 5,
  718. 'pig' => q\register :FUNCPATH:/testudf.jar;
  719. a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  720. b = foreach a generate org.apache.pig.test.udf.evalfunc.TestBoolean(instate);
  721. store b into ':OUTPATH:';\,
  722. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  723. b = foreach a generate (instate is null ? '' : (instate == 'true' ? 'false' : 'true'));
  724. store b into ':OUTPATH:';\,
  725. }
  726. ]
  727. },
  728. # TODO DIFF
  729. # TODO User defined grouping function
  730. {
  731. 'name' => 'CoGroupFlatten',
  732. 'tests' => [
  733. {
  734. 'num' => 1,
  735. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  736. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  737. c = filter a by age < 20;
  738. d = filter b by age < 20;
  739. e = cogroup c by name, d by name;
  740. f = foreach e generate flatten (c), flatten(d);
  741. store f into ':OUTPATH:';\,
  742. },
  743. {
  744. 'num' => 2,
  745. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  746. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  747. c = filter a by $1 < 20;
  748. d = filter b by $1 < 20;
  749. e = cogroup c by $0, d by $0;
  750. f = foreach e generate flatten (c), flatten(d);
  751. store f into ':OUTPATH:';\,
  752. },
  753. {
  754. 'num' => 3,
  755. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  756. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  757. c = filter a by age < 20;
  758. d = filter b by age < 20;
  759. e = cogroup c by (name, age), d by (name, age);
  760. f = foreach e generate flatten (c), flatten(d);
  761. store f into ':OUTPATH:';\,
  762. },
  763. {
  764. 'num' => 4,
  765. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  766. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  767. d = filter b by age < 20;
  768. e = cogroup a by (name, age) inner, d by (name, age);
  769. f = foreach e generate flatten (a), flatten(d);
  770. store f into ':OUTPATH:';\,
  771. },
  772. {
  773. 'num' => 5,
  774. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  775. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  776. c = filter a by age < 20;
  777. e = cogroup c by (name, age), b by (name, age) inner;
  778. f = foreach e generate flatten (c), flatten(b);
  779. store f into ':OUTPATH:';\,
  780. },
  781. {
  782. 'num' => 6,
  783. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  784. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  785. e = cogroup a by (name, age) inner, b by (name, age) inner;
  786. f = foreach e generate flatten (a), flatten(b);
  787. store f into ':OUTPATH:';\,
  788. },
  789. {
  790. # Test cogrouping data loaded from two separate loaders. We don't have any data that can join with studenttab that isn't also loaded with PigStorage, so the
  791. # first step is an intermediate load and store using BinStorage.
  792. 'num' => 7,
  793. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  794. store a into ':OUTPATH:.intermediate' using BinStorage();
  795. b = load ':OUTPATH:.intermediate' using BinStorage() as (name, age, gpa);
  796. c = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  797. e = cogroup b by (name, age) inner, c by (name, age) inner;
  798. f = foreach e generate flatten (b), flatten(c);
  799. store f into ':OUTPATH:';\,
  800. 'notmq' => 1,
  801. },
  802. ]
  803. },
  804. {
  805. 'name' => 'CoGroup',
  806. 'tests' => [
  807. {
  808. 'num' => 1,
  809. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  810. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  811. c = cogroup a by name, b by name;
  812. d = foreach c generate flatten(group), COUNT(a) + COUNT(b);
  813. store d into ':OUTPATH:';\,
  814. },
  815. ]
  816. },
  817. {
  818. 'name' => 'Join',
  819. 'tests' => [
  820. {
  821. 'num' => 1,
  822. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  823. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  824. c = filter a by age < 20;
  825. d = filter b by age < 20;
  826. e = join c by name, d by name;
  827. store e into ':OUTPATH:';\,
  828. },
  829. {
  830. 'num' => 2,
  831. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  832. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  833. c = filter a by age < 20;
  834. d = filter b by age < 20;
  835. e = join c by $0, d by $0;
  836. store e into ':OUTPATH:';\,
  837. },
  838. {
  839. 'num' => 3,
  840. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  841. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  842. c = filter a by age < 20;
  843. d = filter b by age < 20;
  844. e = join c by (name, age), d by (name, age);
  845. store e into ':OUTPATH:';\,
  846. },
  847. # self join with implicit split
  848. # JIRA PIG-429
  849. {
  850. 'num' => 4,
  851. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  852. b = filter a by $1 > 25;
  853. c = join a by $0, b by $0;
  854. store c into ':OUTPATH:';\,
  855. },
  856. # join with one input having schema and another without
  857. # JIRA PIG-428
  858. {
  859. 'num' => 5,
  860. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double);
  861. another = load ':INPATH:/singlefile/studenttab10k';
  862. c = foreach another generate $0, $1+ 10, $2 + 10.0;
  863. d = join a by $0, c by $0;
  864. store d into ':OUTPATH:';\,
  865. },
  866. # self join using fragment replicate join
  867. # no types
  868. {
  869. 'num' => 6,
  870. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  871. b = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  872. c = join a by name, b by name using 'repl';
  873. store c into ':OUTPATH:';\,
  874. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  875. b = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  876. c = join a by name, b by name ;
  877. store c into ':OUTPATH:';\,
  878. },
  879. # self join using fragment replicate join
  880. # with types and no cast for join key
  881. {
  882. 'num' => 7,
  883. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  884. b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  885. c = join a by name, b by name using 'repl';
  886. store c into ':OUTPATH:';\,
  887. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  888. b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  889. c = join a by name, b by name ;
  890. store c into ':OUTPATH:';\,
  891. },
  892. # self join using fragment replicate join
  893. # with types and cast for join key
  894. {
  895. 'num' => 8,
  896. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  897. b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
  898. c = join a by gpa, b by gpa using 'repl';
  899. store c into ':OUTPATH:';\,
  900. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  901. b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
  902. c = join a by gpa, b by gpa ;
  903. store c into ':OUTPATH:';\,
  904. },
  905. # left outer join
  906. {
  907. 'num' => 9,
  908. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  909. b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double);
  910. c = join a by name left outer, b by name;
  911. store c into ':OUTPATH:';\,
  912. },
  913. # right outer join
  914. {
  915. 'num' => 10,
  916. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  917. b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double);
  918. c = join a by name right outer, b by name;
  919. store c into ':OUTPATH:';\,
  920. },
  921. # full outer join
  922. {
  923. 'num' => 11,
  924. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  925. b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double);
  926. c = join a by name full outer, b by name;
  927. store c into ':OUTPATH:';\,
  928. },
  929. # see PIG-1209: the join package now uses internalcachedBag, so every tuple on the reduce side in this test will be spilled to disk.
  930. {
  931. 'num' => 12,
  932. 'java_params' => ['-Dpig.cachedbag.memusage=0'],
  933. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  934. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  935. c = filter a by age < 20;
  936. d = filter b by age < 20;
  937. e = join c by name, d by name;
  938. store e into ':OUTPATH:';\,
  939. },
  940. {
  941. 'num' => 13,
  942. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  943. b = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  944. c = filter a by age < 20;
  945. d = filter b by age < 20;
  946. e = join c by instate, d by instate parallel 5;
  947. store e into ':OUTPATH:';\,
  948. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  949. b = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  950. c = filter a by age < 20;
  951. d = filter b by age < 20;
  952. e = join c by instate, d by instate parallel 5;
  953. store e into ':OUTPATH:';\,
  954. }
  955. ]
  956. },
  957. {
  958. 'name' => 'Foreach',
  959. 'tests' => [
  960. {
  961. 'num' => 1,
  962. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  963. b = foreach a generate *;
  964. store b into ':OUTPATH:';\,
  965. },
  966. {
  967. 'num' => 2,
  968. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  969. b = foreach a generate *;
  970. store b into ':OUTPATH:';\,
  971. },
  972. {
  973. 'num' => 3,
  974. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  975. b = foreach a generate name, age;
  976. store b into ':OUTPATH:';\,
  977. },
  978. {
  979. 'num' => 4,
  980. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  981. b = foreach a generate $0, $2;
  982. store b into ':OUTPATH:';\,
  983. },
  984. {
  985. # test filter, projection, sort , duplicate elimination
  986. 'num' => 5,
  987. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  988. b = filter a by age < 20;
  989. c = group b by age;
  990. d = foreach c {
  991. cf = filter b by gpa < 3.0;
  992. cp = cf.gpa;
  993. cd = distinct cp;
  994. co = order cd by $0;
  995. generate group, flatten(co);
  996. }
  997. store d into ':OUTPATH:';\,
  998. },
  999. {
  1000. # test flatten for map and scalar
  1001. 'num' => 6,
  1002. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1003. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1004. b = foreach a generate flatten(name) as n, flatten(org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, gpa)) as m;
  1005. store b into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
  1006. },
  1007. {
  1008. # test flatten for UDF that returns bag with multiple tuples with multiple columns
  1009. 'num' => 7,
  1010. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1011. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1012. b = foreach a generate name, flatten(org.apache.pig.test.udf.evalfunc.CreateTupleBag(age, gpa)) as foo;
  1013. store b into ':OUTPATH:';\,
  1014. },
  1015. {
  1016. 'num' => 8,
  1017. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age: int, gpa);
  1018. c = group a by name;
  1019. d = foreach c generate flatten(group), MAX(a.age) + MIN(a.age);
  1020. store d into ':OUTPATH:';\,
  1021. },
  1022. {
  1023. # test filter, projection, sort , duplicate elimination
  1024. 'num' => 9,
  1025. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1026. b = filter a by age < 20;
  1027. c = group b by age;
  1028. d = foreach c {
  1029. cf = filter b by gpa >= 3.0 and gpa <= 3.5;
  1030. cp = cf.gpa;
  1031. cd = distinct cp;
  1032. co = order cd by $0;
  1033. generate group, flatten(co);
  1034. }
  1035. store d into ':OUTPATH:';\,
  1036. },
  1037. {
  1038. # test filter, projection, sort , duplicate elimination
  1039. 'num' => 10,
  1040. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1041. b = filter a by age < 20;
  1042. c = group b by age;
  1043. d = foreach c {
  1044. cf = filter b by (gpa == 4.0 or gpa != 2.0) and name > 'a';
  1045. cp = cf.gpa;
  1046. cd = distinct cp;
  1047. co = order cd by $0;
  1048. generate group, flatten(co);
  1049. }
  1050. store d into ':OUTPATH:';\,
  1051. },
  1052. {
  1053. # test nested foreach with chained expression aliases (exp2 is built from exp1)
  1054. 'num' => 11,
  1055. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1056. b = filter a by age < 20;
  1057. c = foreach b {
  1058. exp1 = age + gpa;
  1059. exp2 = exp1 + age;
  1060. generate exp1, exp2;
  1061. }
  1062. store c into ':OUTPATH:';\,
  1063. },
  1064. {
  1065. # test a udf with no args
  1066. 'num' => 12,
  1067. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1068. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1069. b = foreach a generate name, org.apache.pig.test.udf.evalfunc.Fred() as fred;
  1070. store b into ':OUTPATH:';\,
  1071. },
  1072. {
  1073. 'num' => 13,
  1074. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  1075. b = foreach a generate *;
  1076. store b into ':OUTPATH:';\,
  1077. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  1078. b = foreach a generate *;
  1079. store b into ':OUTPATH:';\,
  1080. }
  1081. ]
  1082. },
  1083. {
  1084. 'name' => 'Order',
  1085. 'tests' => [
  1086. {
  1087. 'num' => 1,
  1088. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1089. b = foreach a generate name;
  1090. c = order b by name;
  1091. store c into ':OUTPATH:';\,
  1092. 'sortArgs' => ['-t', ' ', '-k', '1,1'],
  1093. },
  1094. {
  1095. 'num' => 2,
  1096. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1097. b = foreach a generate $1;
  1098. c = order b by $0;
  1099. store c into ':OUTPATH:';\,
  1100. 'sortArgs' => ['-t', ' ', '-k', '1,1'],
  1101. },
  1102. {
  1103. 'num' => 3,
  1104. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1105. b = foreach a generate gpa;
  1106. c = order b by gpa;
  1107. store c into ':OUTPATH:';\,
  1108. 'sortArgs' => ['-t', ' ', '-k', '1,1'],
  1109. },
  1110. {
  1111. 'num' => 4,
  1112. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1113. b = order a by *;
  1114. store b into ':OUTPATH:';\,
  1115. 'sortArgs' => ['-t', ' '],
  1116. },
  1117. {
  1118. 'num' => 5,
  1119. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1120. b = foreach a generate name, age;
  1121. c = order b by name, age;
  1122. store c into ':OUTPATH:';\,
  1123. 'sortArgs' => ['-t', ' ', '-k', '1,2'],
  1124. },
  1125. {
  1126. 'num' => 6,
  1127. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1128. c = order a by $0;
  1129. store c into ':OUTPATH:';\,
  1130. 'sortArgs' => ['-t', ' ', '-k', '1,1'],
  1131. },
  1132. {
  1133. 'num' => 7,
  1134. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1135. c = order a by $1;
  1136. store c into ':OUTPATH:';\,
  1137. 'sortArgs' => ['-t', ' ', '-k', '2,2'],
  1138. },
  1139. {
  1140. 'num' => 8,
  1141. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1142. c = order a by $0, $1;
  1143. store c into ':OUTPATH:';\,
  1144. 'sortArgs' => ['-t', ' ', '-k', '1,2'],
  1145. },
  1146. {
  1147. 'num' => 9,
  1148. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1149. c = order a by $1, $0;
  1150. store c into ':OUTPATH:';\,
  1151. 'sortArgs' => ['-t', ' ', '-k', '2,2', '-k', '1,1'],
  1152. },
  1153. {
  1154. 'num' => 10,
  1155. 'ignore' => 'order by UDF is not supported',
  1156. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1157. a = load ':INPATH:/singlefile/studenttab10k';
  1158. c = order a by * using org.apache.pig.test.udf.orderby.OrdDesc;
  1159. store c into ':OUTPATH:';\,
  1160. 'sortArgs' => ['-t', ' ', '-r'],
  1161. },
  1162. {
  1163. 'num' => 11,
  1164. 'ignore' => 'order by UDF is not supported',
  1165. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1166. a = load ':INPATH:/singlefile/studenttab10k';
  1167. c = order a by $0 using org.apache.pig.test.udf.orderby.OrdDesc;
  1168. store c into ':OUTPATH:';\,
  1169. 'sortArgs' => ['-t', ' ', '-r', '-k', '1,1'],
  1170. },
  1171. {
  1172. 'num' => 12,
  1173. 'ignore' => 'order by UDF is not supported',
  1174. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1175. a = load ':INPATH:/singlefile/studenttab10k';
  1176. c = order a by $0, $1 using org.apache.pig.test.udf.orderby.OrdDesc;
  1177. store c into ':OUTPATH:';\,
  1178. 'sortArgs' => ['-t', ' ', '-r', '-k', '1,2'],
  1179. },
  1180. # ALERT All these tests with inner order bys aren't testing the inner
  1181. # ordering. We need to develop a sorting tool to do that.
  1182. {
  1183. 'num' => 13,
  1184. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1185. b = group a by $0;
  1186. c = foreach b {c1 = order $1 by $1; generate flatten(c1); };
  1187. store c into ':OUTPATH:';\,
  1188. },
  1189. {
  1190. 'num' => 14,
  1191. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1192. b = group a by $0;
  1193. c = foreach b {c1 = order $1 by *; generate flatten(c1); };
  1194. store c into ':OUTPATH:';\,
  1195. },
  1196. {
  1197. 'num' => 15,
  1198. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1199. a = load ':INPATH:/singlefile/studenttab10k';
  1200. b = group a by $0;
  1201. c = foreach b {c1 = order $1 by * using org.apache.pig.test.udf.orderby.OrdDesc; generate flatten(c1); };
  1202. store c into ':OUTPATH:';\,
  1203. },
  1204. {
  1205. 'num' => 16,
  1206. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1207. a = load ':INPATH:/singlefile/studenttab10k';
  1208. b = group a by $0;
  1209. c = foreach b {c1 = order $1 by $1 using org.apache.pig.test.udf.orderby.OrdDesc; generate flatten(c1);};
  1210. store c into ':OUTPATH:';\,
  1211. },
  1212. {
  1213. 'num' => 17,
  1214. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1215. b = group a by $0;
  1216. c = foreach b {c1 = order $1 by $1; generate flatten(c1), MAX($1.$1); };
  1217. store c into ':OUTPATH:';\,
  1218. },
  1219. {
  1220. # test to make sure the weighted range partitioning
  1221. # works correctly when a sort key value repeats across
  1222. # reduce partitions
  1223. 'num' => 18,
  1224. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1225. b = order a by $1 parallel 100;
  1226. store b into ':OUTPATH:';\,
  1227. 'sortArgs' => ['-t', ' ', '-k', '2,2'],
  1228. },
  1229. {
  1230. 'num' => 19,
  1231. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  1232. b = foreach a generate instate;
  1233. c = order b by instate;
  1234. store c into ':OUTPATH:';\,
  1235. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  1236. b = foreach a generate instate;
  1237. c = order b by instate;
  1238. store c into ':OUTPATH:';\,
  1239. 'sortArgs' => ['-t', ' ', '-k', '1,1'],
  1240. },
  1241. ]
  1242. },
  1243. {
  1244. 'name' => 'Distinct',
  1245. 'tests' => [
  1246. {
  1247. 'num' => 1,
  1248. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1249. b = foreach a generate name;
  1250. c = distinct b;
  1251. store c into ':OUTPATH:';\,
  1252. },
  1253. {
  1254. 'num' => 2,
  1255. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1256. b = foreach a generate $1;
  1257. c = distinct b;
  1258. store c into ':OUTPATH:';\,
  1259. },
  1260. {
  1261. 'num' => 3,
  1262. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1263. b = foreach a generate gpa;
  1264. c = distinct b;
  1265. store c into ':OUTPATH:';\,
  1266. },
  1267. {
  1268. 'num' => 4,
  1269. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1270. b = distinct a;
  1271. store b into ':OUTPATH:';\,
  1272. },
  1273. {
  1274. 'num' => 5,
  1275. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1276. b = foreach a generate name, age;
  1277. c = distinct b;
  1278. store c into ':OUTPATH:';\,
  1279. },
  1280. {
  1281. 'num' => 6,
  1282. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1283. b = group a by name;
  1284. c = foreach b { aa = distinct a.age; generate group, COUNT(aa); }
  1285. store c into ':OUTPATH:';\,
  1286. }
  1287. ]
  1288. },
  1289. {
  1290. 'name' => 'Cross',
  1291. 'tests' => [
  1292. {
  1293. 'num' => 1,
  1294. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1295. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  1296. c = filter a by age < 19 and gpa < 1.0;
  1297. d = filter b by age < 19;
  1298. e = cross c, d;
  1299. store e into ':OUTPATH:';\,
  1300. },
  1301. {
  1302. 'num' => 2,
  1303. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1304. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  1305. c = filter a by age < 19 and gpa < 1.0;
  1306. d = filter b by age < 19;
  1307. e = cross c, d parallel 10;
  1308. store e into ':OUTPATH:';\,
  1309. },
  1310. {
  1311. 'num' => 3,
  1312. 'pig' => q\set default_parallel 10;
  1313. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1314. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  1315. c = filter a by age < 19 and gpa < 1.0;
  1316. d = filter b by age < 19;
  1317. e = cross c, d;
  1318. store e into ':OUTPATH:';\,
  1319. },
  1320. {
  1321. 'num' => 4,
  1322. 'pig' => q\
  1323. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1324. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  1325. c = filter a by age < 25;
  1326. d = filter b by age < 25;
  1327. e = cross c, d;
  1328. f = filter e by c::age < d::age;
  1329. store f into ':OUTPATH:';\,
  1330. }
  1331. ]
  1332. },
  1333. {
  1334. 'name' => 'Union',
  1335. 'tests' => [
  1336. {
  1337. 'num' => 1,
  1338. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1339. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  1340. c = foreach a generate name, age;
  1341. d = foreach b generate name, age;
  1342. e = union c, d;
  1343. store e into ':OUTPATH:';\,
  1344. },
  1345. ]
  1346. },
  1347. {
  1348. 'name' => 'Bincond',
  1349. 'tests' => [
  1350. {
  1351. 'num' => 1,
  1352. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1353. b = foreach a generate name, (name matches 'yuri.*' ? age - 10 : (int)age);
  1354. store b into ':OUTPATH:';\,
  1355. },
  1356. ]
  1357. },
  1358. {
  1359. 'name' => 'Glob',
  1360. 'tests' => [
  1361. {
  1362. 'num' => 1,
  1363. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10?' as (name, age, gpa);
  1364. b = filter a by name == 'nick miller';
  1365. store b into ':OUTPATH:';\,
  1366. },
  1367. {
  1368. 'num' => 2,
  1369. 'pig' => q\a = load ':INPATH:/singlefile/st*ttab10k' as (name, age, gpa);
  1370. b = filter a by name == 'nick miller';
  1371. store b into ':OUTPATH:';\,
  1372. },
  1373. {
  1374. 'num' => 3,
  1375. 'pig' => q\a = load ':INPATH:/singlefile/studenttab*' as (name, age, gpa);
  1376. b = filter a by name == 'nick miller';
  1377. store b into ':OUTPATH:';\,
  1378. },
  1379. {
  1380. 'num' => 4,
  1381. 'pig' => q\a = load ':INPATH:/singlefile/studenttab???' as (name, age, gpa);
  1382. b = filter a by name == 'nick miller';
  1383. store b into ':OUTPATH:';\,
  1384. },
  1385. {
  1386. 'num' => 5,
  1387. 'pig' => q\a = load ':INPATH:/singlefile/studenttab[1-9]0[km]' as (name, age, gpa);
  1388. b = filter a by name == 'nick miller';
  1389. store b into ':OUTPATH:';\,
  1390. },
  1391. {
  1392. 'num' => 6,
  1393. 'pig' => q\a = load ':INPATH:/singlefile/studenttab[13]0[km]' as (name, age, gpa);
  1394. b = filter a by name == 'nick miller';
  1395. store b into ':OUTPATH:';\,
  1396. },
  1397. {
  1398. 'num' => 7,
  1399. 'pig' => q\a = load ':INPATH:/singlefile/studenttab[12]0[a-l]' as (name, age, gpa);
  1400. b = filter a by name == 'nick miller';
  1401. store b into ':OUTPATH:';\,
  1402. },
  1403. {
  1404. 'num' => 8,
  1405. 'pig' => q\a = load ':INPATH:/glob/star/*good' as (name, age, gpa);
  1406. b = filter a by name == 'nick miller';
  1407. store b into ':OUTPATH:';\,
  1408. },
  1409. {
  1410. 'num' => 9,
  1411. 'pig' => q\a = load ':INPATH:/glob/star/*' as (name, age, gpa);
  1412. b = filter a by name == 'nick miller';
  1413. store b into ':OUTPATH:';\,
  1414. }
  1415. ]
  1416. },
  1417. {
  1418. 'name' => 'Arithmetic',
  1419. 'tests' => [
  1420. {
  1421. 'num' => 1,
  1422. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1423. c = foreach a generate age + 1, (int)gpa + 1;
  1424. store c into ':OUTPATH:';\,
  1425. },
  1426. {
  1427. 'num' => 2,
  1428. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1429. c = foreach a generate (double)age + 1.5, gpa + 1.5;
  1430. store c into ':OUTPATH:';\,
  1431. },
  1432. {
  1433. 'num' => 3,
  1434. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1435. c = foreach a generate age - 30, (int)gpa - 3;
  1436. store c into ':OUTPATH:';\,
  1437. },
  1438. {
  1439. 'num' => 4,
  1440. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1441. c = foreach a generate (double)age - 30.1, gpa - 3.199;
  1442. store c into ':OUTPATH:';\,
  1443. },
  1444. {
  1445. 'num' => 5,
  1446. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1447. c = foreach a generate age * 10, (int)gpa * 2;
  1448. store c into ':OUTPATH:';\,
  1449. },
  1450. {
  1451. 'num' => 6,
  1452. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1453. c = foreach a generate (double)age * 10.1, gpa * 2.752342;
  1454. store c into ':OUTPATH:';\,
  1455. },
  1456. {
  1457. 'num' => 7,
  1458. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1459. c = foreach a generate age / 30, (int)gpa / 3;
  1460. store c into ':OUTPATH:';\,
  1461. },
  1462. {
  1463. 'num' => 8,
  1464. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1465. c = foreach a generate (double)age / 30.323, gpa / 3.22;
  1466. store c into ':OUTPATH:';\,
  1467. },
  1468. {
  1469. 'num' => 9,
  1470. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1471. c = foreach a generate 3 * age + gpa / 9.1 - 2;
  1472. store c into ':OUTPATH:';\,
  1473. },
  1474. {
  1475. 'num' => 10,
  1476. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1477. c = foreach a generate 3 * (age + gpa) / (9.1 - 2);
  1478. store c into ':OUTPATH:';\,
  1479. }
  1480. ]
  1481. },
  1482. {
  1483. 'name' => 'Regression',
  1484. 'tests' => [
  1485. {
  1486. 'num' => 1459894,
  1487. 'pig' => q\a = load ':INPATH:/singlefile/reg1459894';
  1488. b = group a by $0;
  1489. c = foreach b generate group, COUNT(a.$1);
  1490. store c into ':OUTPATH:';\,
  1491. },
  1492. {
  1493. 'num' => 97,
  1494. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  1495. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  1496. e = cogroup a by name, b by name;
  1497. f = foreach e generate group, COUNT(a), COUNT(b);
  1498. store f into ':OUTPATH:';\,
  1499. },
  1500. {
  1501. 'num' => 203,
  1502. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  1503. b = group a by name;
  1504. c = foreach b generate group, COUNT($1);
  1505. store c into ':OUTPATH:';
  1506. --This is a really long script to test that when script size exceeds 1k we can still parse it.
  1507. --The quick sly fox jumped over the lazy brown dog.
  1508. --he quick sly fox jumped over the lazy brown dog.T
  1509. --e quick sly fox jumped over the lazy brown dog.Th
  1510. -- quick sly fox jumped over the lazy brown dog.The
  1511. --quick sly fox jumped over the lazy brown dog.The
  1512. --uick sly fox jumped over the lazy brown dog.The q
  1513. --ick sly fox jumped over the lazy brown dog.The qu
  1514. --ck sly fox jumped over the lazy brown dog.The qui
  1515. --k sly fox jumped over the lazy brown dog.The quic
  1516. -- sly fox jumped over the lazy brown dog.The quick
  1517. --sly fox jumped over the lazy brown dog.The quick
  1518. --ly fox jumped over the lazy brown dog.The quick s
  1519. --y fox jumped over the lazy brown dog.The quick sl
  1520. -- fox jumped over the lazy brown dog.The quick sly
  1521. --fox jumped over the lazy brown dog.The quick sly
  1522. --ox jumped over the lazy brown dog.The quick sly f
  1523. --x jumped over the lazy brown dog.The quick sly fo
  1524. -- jumped over the lazy brown dog.The quick sly fox
  1525. --jumped over the lazy brown dog.The quick sly fox
  1526. --umped over the lazy brown dog.The quick sly fox j
  1527. --mped over the lazy brown dog.The quick sly fox ju
  1528. --ped over the lazy brown dog.The quick sly fox jum\,
  1529. }
  1530. ]
  1531. },
  1532. {
  1533. 'name' => 'Unicode',
  1534. 'tests' => [
  1535. {
  1536. 'num' => 1,
  1537. 'pig' => q\a = load ':INPATH:/singlefile/unicode100';
  1538. store a into ':OUTPATH:';\,
  1539. },
  1540. ]
  1541. },
  1542. {
  1543. 'name' => 'Parameters',
  1544. 'tests' => [
  1545. {
  1546. # test default
  1547. 'num' => 1,
  1548. 'pig' => q\%default fname 'studenttab10k'
  1549. a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
  1550. b = foreach a generate name;
  1551. store b into ':OUTPATH:';\,
  1552. },
  1553. {
  1554. # test parameter from command line
  1555. 'num' => 2,
  1556. 'pig_params' => ['-p', qq(fname='studenttab10k')],
  1557. 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
  1558. b = foreach a generate name;
  1559. store b into ':OUTPATH:';\,
  1560. },
  1561. {
  1562. # test parameter from param file
  1563. 'num' => 3,
  1564. 'pig_params' => ['-m', ":PARAMPATH:/params_3"],
  1565. 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
  1566. b = foreach a generate name;
  1567. store b into ':OUTPATH:';\,
  1568. },
  1569. {
  1570. # test command
  1571. 'num' => 4,
  1572. 'pig' => q\%declare cmd `/usr/local/bin/perl -e 'print "studenttab10k"'`
  1573. a = load ':INPATH:/singlefile/$cmd' using PigStorage() as (name, age, gpa);
  1574. b = foreach a generate name;
  1575. store b into ':OUTPATH:';\,
  1576. },
  1577. {
  1578. # test parameter with a space
  1579. 'num' => 5,
  1580. 'pig_params' => ['-p', qq(setting='set default_parallel 100;'),'-p',qq(fname='studenttab10k')],
  1581. 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
  1582. $setting
  1583. b = foreach a generate name;
  1584. store b into ':OUTPATH:';\,
  1585. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
  1586. b = foreach a generate name;
  1587. store b into ':OUTPATH:';\,
  1588. },
  1589. ]
  1590. },
  1591. {
  1592. 'name' => 'Types',
  1593. 'tests' => [
  1594. {
  1595. # constants
  1596. 'num' => 1,
  1597. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1598. b = foreach a generate age + 1 + 0.2f + 253645L, gpa+1;
  1599. store b into ':OUTPATH:';\,
  1600. },
  1601. {
  1602. # NULL and cast
  1603. 'num' => 2,
  1604. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1605. b = foreach a generate (int)((int)gpa/((int)gpa - 1)) as norm_gpa:int;
  1606. c = foreach b generate (norm_gpa is null? 0 :norm_gpa);
  1607. store c into ':OUTPATH:';\,
  1608. # 'expected_err_regex' => "Encountered Warning DIVIDE_BY_ZERO 2387 time.*",
  1609. # The driver does not currently support both 'sql' and 'expected_...' verification directives.
  1610. },
  1611. {
  1612. # arithmetic operators and SIZE for int, double and size and concat operators for chararrays
  1613. 'num' => 3,
  1614. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1615. b = foreach a generate age, gpa, age % 25, age + 25, age - 25, age/2, age * 2, SIZE(age), gpa + 10.1, gpa - 1.1 , gpa / 1.2, gpa * 2.5, SIZE(gpa), SIZE(name), CONCAT(name, 'test');
  1616. store b into ':OUTPATH:';\,
  1617. },
  1618. {
  1619. # arithmetic operators and SIZE for long, float and size and concat operators for bytearrays
  1620. 'num' => 4,
  1621. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
  1622. b = foreach a generate age, gpa, age % 2L, age + 2500000000L, age - 2500000000L, age/2L, age * 250000000L, SIZE(age), gpa + 10.1f, gpa - 1.1f , gpa / 1.2f, gpa * 2.6f, SIZE(gpa), SIZE(name), CONCAT(name, name);
  1623. store b into ':OUTPATH:';\,
  1624. },
  1625. {
  1626. # equality and implicit cast
  1627. 'num' => 5,
  1628. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
  1629. b = filter a by age == '25' and gpa < 3;
  1630. store b into ':OUTPATH:';\,
  1631. },
  1632. {
  1633. # will need to test against previous version of pig
  1634. # because in pig currently count includes nulls - this affects
  1635. # avg
  1636. 'num' => 6,
  1637. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1638. b = group a ALL;
  1639. c = foreach b generate SUM(a.age), MIN(a.age), MAX(a.age), AVG(a.age), MIN(a.name), MAX(a.name), SUM(a.gpa), MIN(a.gpa), MAX(a.gpa), AVG(a.gpa);
  1640. store c into ':OUTPATH:';\,
  1641. 'floatpostprocess' => 1,
  1642. 'delimiter' => ' ',
  1643. },
  1644. {
  1645. # sum, min, max, avg for long and float (declared)
  1646. 'num' => 7,
  1647. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
  1648. b = group a ALL;
  1649. c = foreach b generate SUM(a.age), MIN(a.age), MAX(a.age), AVG(a.age), SUM(a.gpa), MIN(a.gpa), MAX(a.gpa), AVG(a.gpa);
  1650. store c into ':OUTPATH:';\,
  1651. },
  1652. {
  1653. # Explicit casts - arithmetic operators and SIZE for int, double and size and concat operators for chararrays
  1654. 'num' => 8,
  1655. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
  1656. b = foreach a generate (int)age % 25, (int)age + 25, (int)age - 25, (int)age/2, (int)age * 2, SIZE((int)age), (double)gpa + 10.1, (double)gpa - 1.1 , (double)gpa / 1.2, (double)gpa * 2.5, SIZE((double)gpa), SIZE((chararray)name), CONCAT((chararray)name, 'test');
  1657. store b into ':OUTPATH:';\,
  1658. },
  1659. {
  1660. # Explicit casts - arithmetic operators and SIZE for long, float
  1661. 'num' => 9,
  1662. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
  1663. b = foreach a generate (long)age, (long)age % 2L, (long)age + 2500000000L, (long)age - 2500000000L, (long)age/2L, (long)age * 250000000L, SIZE((long)age), (float)gpa + 10.1f, (float)gpa - 1.1f , (float)gpa / 1.2f, (float)gpa * 2.6f, SIZE((float)gpa);
  1664. store b into ':OUTPATH:';\,
  1665. },
  1666. {
  1667. # Filter is null for chararray and double and is not null for int
  1668. 'num' => 10,
  1669. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1670. b = filter a by name is null and age is not null and gpa is null;
  1671. c = group b ALL;
  1672. d = foreach c generate COUNT(b);
  1673. store d into ':OUTPATH:';\,
  1674. },
  1675. {
  1676. # Filter is not null for chararray and double and is null for int
  1677. 'num' => 11,
  1678. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1679. b = filter a by name is not null and age is null and gpa is not null;
  1680. c = group b ALL;
  1681. d = foreach c generate COUNT(b);
  1682. store d into ':OUTPATH:';\,
  1683. },
  1684. {
  1685. # Filter is null for bytearray and float and is not null for long
  1686. 'num' => 12,
  1687. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
  1688. b = filter a by name is null and age is not null and gpa is null;
  1689. c = group b ALL;
  1690. d = foreach c generate COUNT(b);
  1691. store d into ':OUTPATH:';\,
  1692. },
  1693. {
  1694. # Filter is not null for bytearray and float and is null for long
  1695. 'num' => 13,
  1696. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
  1697. b = filter a by name is not null and age is null and gpa is not null;
  1698. c = group b ALL;
  1699. d = foreach c generate COUNT(b);
  1700. store d into ':OUTPATH:';\,
  1701. },
  1702. {
  1703. # test that sorting is based on the type for chararray, int and double
  1704. 'num' => 14,
  1705. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1706. b = order a by name, age, gpa;
  1707. store b into ':OUTPATH:';\,
  1708. 'sortArgs' => ['-t', ' ', '-k', '1,1', '-k', '2n,2n'],
  1709. },
  1710. {
  1711. # test that sorting descending is based on the type for chararray, int and double
  1712. 'num' => 15,
  1713. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1714. b = order a by name desc, age desc, gpa desc;
  1715. store b into ':OUTPATH:';\,
  1716. 'sortArgs' => ['-t', ' ', '-k', '1r,1r', '-k', '2nr,2nr'],
  1717. },
  1718. {
  1719. # test that sorting is based on the type for bytearray, long and float
  1720. 'num' => 16,
  1721. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
  1722. b = order a by name, age, gpa;
  1723. store b into ':OUTPATH:';\,
  1724. 'sortArgs' => ['-t', ' ', '-k', '1,1', '-k', '2n,2n'],
  1725. },
  1726. {
  1727. # test that sorting descending is based on the type for bytearray, long and float
  1728. 'num' => 17,
  1729. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
  1730. b = order a by name desc, age desc, gpa desc;
  1731. store b into ':OUTPATH:';\,
  1732. 'sortArgs' => ['-t', ' ', '-k', '1r,1r', '-k', '2nr,2nr'],
  1733. },
  1734. {
  1735. # test precision for doubles is at least 15 digits
  1736. 'num' => 18,
  1737. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1738. b = foreach a generate 0.123456789123456+0.123456789123456;
  1739. store b into ':OUTPATH:';\,
  1740. },
  1741. {
  1742. # order by string
  1743. 'num' => 20,
  1744. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1745. b = order a by name;
  1746. store b into ':OUTPATH:';\,
  1747. 'sortArgs' => ['-t', ' ', '-k', '1,1'],
  1748. },
  1749. {
  1750. # order by string desc
  1751. 'num' => 21,
  1752. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1753. b = order a by name desc;
  1754. store b into ':OUTPATH:';\,
  1755. 'sortArgs' => ['-t', ' ', '-k', '1r,1r'],
  1756. },
  1757. {
  1758. # order by int
  1759. 'num' => 22,
  1760. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1761. b = order a by age;
  1762. store b into ':OUTPATH:';\,
  1763. 'sortArgs' => ['-t', ' ', '-k', '2n,2n'],
  1764. },
  1765. {
  1766. # order by int desc
  1767. 'num' => 23,
  1768. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1769. b = order a by age desc;
  1770. store b into ':OUTPATH:';\,
  1771. 'sortArgs' => ['-t', ' ', '-k', '2nr,2nr'],
  1772. },
  1773. {
  1774. # order by long
  1775. 'num' => 24,
  1776. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:long, gpa:double);
  1777. b = order a by age;
  1778. store b into ':OUTPATH:';\,
  1779. 'sortArgs' => ['-t', ' ', '-k', '2n,2n'],
  1780. },
  1781. {
  1782. # order by long desc
  1783. 'num' => 25,
  1784. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:long, gpa:double);
  1785. b = order a by age desc;
  1786. store b into ':OUTPATH:';\,
  1787. 'sortArgs' => ['-t', ' ', '-k', '2nr,2nr'],
  1788. },
  1789. {
  1790. # order by float
  1791. 'num' => 26,
  1792. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float);
  1793. b = order a by gpa;
  1794. store b into ':OUTPATH:';\,
  1795. 'sortArgs' => ['-t', ' ', '-k', '3n'],
  1796. },
  1797. {
  1798. # order by float desc
  1799. 'num' => 27,
  1800. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float);
  1801. b = order a by gpa desc;
  1802. store b into ':OUTPATH:';\,
  1803. 'sortArgs' => ['-t', ' ', '-k', '3nr'],
  1804. },
  1805. {
  1806. # order by double
  1807. 'num' => 28,
  1808. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1809. b = order a by gpa;
  1810. store b into ':OUTPATH:';\,
  1811. 'sortArgs' => ['-t', ' ', '-k', '3n'],
  1812. },
  1813. {
  1814. # order by double desc
  1815. 'num' => 29,
  1816. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1817. b = order a by gpa desc;
  1818. store b into ':OUTPATH:';\,
  1819. 'sortArgs' => ['-t', ' ', '-k', '3nr'],
  1820. },
  1821. {
  1822. # order by *
  1823. 'num' => 30,
  1824. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1825. b = order a by *;
  1826. store b into ':OUTPATH:';\,
  1827. 'sortArgs' => ['-t', ' ', '-k', '1,1', '-k', '2n,2n'],
  1828. },
  1829. {
  1830. # order by * desc
  1831. 'num' => 31,
  1832. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1833. b = order a by * desc;
  1834. store b into ':OUTPATH:';\,
  1835. 'sortArgs' => ['-t', ' ', '-k', '1r,1r', '-k', '2nr,2nr'],
  1836. },
  1837. {
  1838. 'num' => 32,
  1839. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  1840. b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:double);
  1841. c = filter a by age < 20;
  1842. d = filter b by age < 20;
  1843. e = cogroup c by name, d by name;
  1844. f = foreach e generate flatten (c), flatten(d);
  1845. store f into ':OUTPATH:';\,
  1846. },
  1847. {
  1848. 'num' => 33,
  1849. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  1850. b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:double);
  1851. c = filter a by age < 20;
  1852. d = filter b by age < 20;
  1853. e = cogroup c by age, d by age;
  1854. f = foreach e generate flatten (c), flatten(d);
  1855. store f into ':OUTPATH:';\,
  1856. },
  1857. {
  1858. 'num' => 34,
  1859. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa:double);
  1860. b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:long, registration:chararray, contributions:double);
  1861. c = filter a by age < 20;
  1862. d = filter b by age < 20;
  1863. e = cogroup c by age, d by age;
  1864. f = foreach e generate flatten (c), flatten(d);
  1865. store f into ':OUTPATH:';\,
  1866. },
  1867. {
  1868. 'num' => 35,
  1869. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:double);
  1870. b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:float, registration:chararray, contributions:double);
  1871. c = filter a by age < 20;
  1872. d = filter b by age < 20;
  1873. e = cogroup c by age, d by age;
  1874. f = foreach e generate flatten (c), flatten(d);
  1875. store f into ':OUTPATH:';\,
  1876. },
  1877. {
  1878. 'num' => 36,
  1879. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:double);
  1880. b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:double, registration:chararray, contributions:double);
  1881. c = filter a by age < 20;
  1882. d = filter b by age < 20;
  1883. e = cogroup c by age, d by age;
  1884. f = foreach e generate flatten (c), flatten(d);
  1885. store f into ':OUTPATH:';\,
  1886. },
  1887. {
  1888. # NULL and cast
  1889. 'num' => 37,
  1890. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1891. b = foreach a generate (int)((int)gpa/((int)gpa - 1)) as norm_gpa:int;
  1892. c = foreach b generate (norm_gpa is not null? norm_gpa: 0);
  1893. store c into ':OUTPATH:';\,
  1894. },
  1895. {
  1896. # constants
  1897. 'num' => 38,
  1898. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1899. b = foreach a generate -(age + 1 + 0.2f + 253645L), -(gpa+1);
  1900. store b into ':OUTPATH:';\,
  1901. },
  1902. {
  1903. 'num' => 39,
  1904. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
  1905. b = foreach a generate instate, true, false;
  1906. store b into ':OUTPATH:';\,
  1907. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  1908. b = foreach a generate instate, 'true', 'false';
  1909. store b into ':OUTPATH:';\,
  1910. },
  1911. ]
  1912. },
  1913. {
  1914. 'name' => 'Limit',
  1915. 'tests' => [
  1916. {
  1917. 'num' => 1,
  1918. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
  1919. b = order a by $0, $1;
  1920. c = filter b by $0 > 'a'; -- break the sort/limit optimization
  1921. d = limit c 100;
  1922. store d into ':OUTPATH:';\,
  1923. 'sortArgs' => ['-t', ' ', '-k', '1,1'],
  1924. },
  1925. {
  1926. 'num' => 2,
  1927. 'ignore23' => 'The record limit pick is different in 23',
  1928. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
  1929. b = order a by $0, $1;
  1930. c = limit b 100;
  1931. store c into ':OUTPATH:';\,
  1932. 'sortArgs' => ['-t', ' ', '-k', '1,1'],
  1933. },
  1934. {
  1935. # Make sure that limit higher than number of rows doesn't mess stuff up
  1936. 'num' => 3,
  1937. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
  1938. b = order a by $0, $1;
  1939. c = filter b by $1 < 1000;
  1940. d = limit c 100000;
  1941. store d into ':OUTPATH:';\,
  1942. },
  1943. {
  1944. 'num' => 4,
  1945. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
  1946. b = distinct a;
  1947. c = limit b 100;
  1948. store c into ':OUTPATH:';\,
  1949. },
  1950. {
  1951. 'num' => 5,
  1952. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
  1953. b = load ':INPATH:/singlefile/votertab10k';
  1954. a1 = foreach a generate $0, $1;
  1955. b1 = foreach b generate $0, $1;
  1956. c = union a1, b1;
  1957. d = limit c 100;
  1958. store d into ':OUTPATH:';\,
  1959. },
  1960. {
  1961. 'num' => 6,
  1962. 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  1963. B = limit A 40;
  1964. C = filter B by age == 40;
  1965. D = group C by name;
  1966. E = foreach D generate group, COUNT(C);
  1967. store E into ':OUTPATH:';\,
  1968. },
  1969. {
  1970. 'num' => 7,
  1971. 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  1972. B = group A by name;
  1973. C = foreach B {
  1974. C1 = limit A 10;
  1975. generate group, COUNT(C1);
  1976. }
  1977. store C into ':OUTPATH:';\,
  1978. },
  1979. {
  1980. 'num' => 8,
  1981. 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  1982. B = group A by name;
  1983. C = foreach B {
  1984. C1 = filter A by age < 40;
  1985. C2 = limit C1 10;
  1986. generate group, COUNT(C2);
  1987. }
  1988. D = filter C by $1 > 0;
  1989. store D into ':OUTPATH:';\,
  1990. },
  1991. {
  1992. 'num' => 9,
  1993. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
  1994. b = order a by $0, $1;
  1995. c = limit b 1000/10;
  1996. store c into ':OUTPATH:';\,
  1997. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
  1998. b = order a by $0, $1;
  1999. c = limit b 100;
  2000. store c into ':OUTPATH:';\,
  2001. 'sortArgs' => ['-t', ' ', '-k', '1,2'],
  2002. },
  2003. {
  2004. 'num' => 10,
  2005. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2006. b = group a all;
  2007. c = foreach b generate COUNT(a) as count;
  2008. d = limit a c.count/10;
  2009. store d into ':OUTPATH:';\,
  2010. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2011. b = limit a 1000;
  2012. store b into ':OUTPATH:';\,
  2013. },
  2014. {
  2015. 'num' => 11,
  2016. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2017. b = group a all;
  2018. c = foreach b generate COUNT(a) as count;
  2019. d = load ':INPATH:/singlefile/votertab10k';
  2020. e = group d all;
  2021. f = foreach e generate COUNT(d) as count;
  2022. d = limit a c.count/10+f.count/10;
  2023. store d into ':OUTPATH:';\,
  2024. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2025. b = limit a 2000;
  2026. store b into ':OUTPATH:';\,
  2027. }
  2028. ]
  2029. },
  2030. {
  2031. 'name' => 'Split',
  2032. 'tests' => [
  2033. {
  2034. 'num' => 1,
  2035. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
  2036. split a into a1 if $0 > 'm', a2 if $0 <= 'm';
  2037. store a1 into ':OUTPATH:';\,
  2038. },
  2039. {
  2040. 'num' => 2,
  2041. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
  2042. split a into a1 if $0 > 'm', a2 if $0 <= 'm';
  2043. store a2 into ':OUTPATH:';\,
  2044. },
  2045. {
  2046. 'num' => 3,
  2047. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
  2048. split a into a1 if $0 > 'm', a2 if $0 <= 'm';
  2049. b = cogroup a1 by $1, a2 by $1;
  2050. c = foreach b generate flatten(a1), flatten(a2);
  2051. store c into ':OUTPATH:';\,
  2052. },
  2053. {
  2054. 'num' => 4,
  2055. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
  2056. split a into a1 if $0 > 'm', a2 if $0 <= 'm';
  2057. b = cogroup a1 by $1, a2 by $1;
  2058. c = foreach b generate flatten($1), flatten($2);
  2059. store c into ':OUTPATH:';\,
  2060. },
  2061. {
  2062. 'num' => 5,
  2063. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
  2064. split a into a1 if name > 'm', a2 if name <= 'm';
  2065. b = distinct a1;
  2066. store b into ':OUTPATH:';\,
  2067. },
  2068. {
  2069. 'num' => 6,
  2070. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
  2071. split a into a1 if age > 50, a2 if age <= 25;
  2072. b = order a2 by name;
  2073. store b into ':OUTPATH:';\,
  2074. 'sortArgs' => ['-t', ' ', '-k', '1,1'],
  2075. },
  2076. {
  2077. 'num' => 7,
  2078. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  2079. split a into a1 if name > 'm', a2 if age < 50;
  2080. b = distinct a1;
  2081. store b into ':OUTPATH:';\,
  2082. },
  2083. {
  2084. 'num' => 8,
  2085. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2086. split a into a1 if age > 50, a2 if name < 'm';
  2087. b2 = foreach a2 generate name, 1;
  2088. b1 = foreach a1 generate name, 2;
  2089. c = cogroup b2 by name, b1 by name;
  2090. d = foreach c generate flatten(group), COUNT($1), COUNT($2);
  2091. store d into ':OUTPATH:';\,
  2092. },
  2093. {
  2094. 'num' => 9,
  2095. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2096. split a into a1 if age > 50, a2 if name < 'm';
  2097. b2 = distinct a2;
  2098. b1 = order a1 by name;
  2099. c = cogroup b2 by name, b1 by name;
  2100. d = foreach c generate flatten(group), COUNT($1), COUNT($2);
  2101. store d into ':OUTPATH:';\,
  2102. },
  2103. {
  2104. 'num' => 10,
  2105. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2106. split a into a1 if age > 50, a2 otherwise;
  2107. store a1 into ':OUTPATH:.1';
  2108. store a2 into ':OUTPATH:.2';\,
  2109. 'verify_pig_script' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2110. split a into a1 if age > 50, a2 if age<=50;
  2111. store a1 into ':OUTPATH:.1';
  2112. store a2 into ':OUTPATH:.2';\,
  2113. }
  2114. ]
  2115. },
  2116. {
  2117. 'name' => 'ImplicitSplit',
  2118. 'tests' => [
  2119. {
  2120. 'num' => 1,
  2121. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
  2122. b = filter a by $1 > 50;
  2123. c = filter a by $2 > 3.0;
  2124. d = cogroup b by $0, c by $0;
  2125. e = foreach d generate flatten(b), flatten(c);
  2126. store e into ':OUTPATH:';\,
  2127. },
  2128. {
  2129. 'num' => 2,
  2130. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2131. b = filter a by age > 50;
  2132. c = filter a by gpa > 3.0;
  2133. d = cogroup b by name, c by name;
  2134. e = foreach d generate flatten(b), flatten(c);
  2135. f = filter e by b::age < 75;
  2136. store f into ':OUTPATH:';\,
  2137. }
  2138. ]
  2139. },
  2140. {
  2141. 'name' => 'describe',
  2142. 'tests' => [
  2143. #JIRA[PIG-373]
  2144. {
  2145. 'num' => 1,
  2146. 'pig' => q\
  2147. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2148. describe A;
  2149. store A into ':OUTPATH:';\,
  2150. },
  2151. ],
  2152. },
  2153. {
  2154. 'name' => 'Sample',
  2155. 'tests' => [
  2156. {
  2157. 'num' => 1,
  2158. 'pig' => q\
  2159. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2160. S = sample A 2-1-1;
  2161. store S into ':OUTPATH:';\,
  2162. 'verify_pig_script' => q\
  2163. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2164. S = sample A 0;
  2165. store S into ':OUTPATH:';\,
  2166. },
  2167. {
  2168. 'num' => 2,
  2169. 'pig' => q\
  2170. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2171. B = group A all;
  2172. C = foreach B generate COUNT(A) as count;
  2173. D = group A all;
  2174. E = foreach D generate (double)COUNT(A) as count;
  2175. S = sample A E.count/C.count;
  2176. store S into ':OUTPATH:';\,
  2177. 'verify_pig_script' => q\
  2178. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2179. S = sample A 1;
  2180. store S into ':OUTPATH:';\,
  2181. },
  2182. ],
  2183. },
  2184. {
  2185. 'name' => 'MissingColumns',
  2186. 'tests' => [
  2187. {
  2188. 'num' => 1,
  2189. 'pig' => q\
  2190. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age: int, gpa: double, extra: chararray);
  2191. B = filter A by age > 50 or extra > 'm';
  2192. D = order B by age, extra;
  2193. store D into ':OUTPATH:';\,
  2194. 'sortArgs' => ['-t', ' ', '-k', '2n,2n'],
  2195. },
  2196. {
  2197. 'num' => 2,
  2198. 'pig' => q\
  2199. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage();
  2200. B = foreach A generate $0, $1 + 1, $3 + 1;
  2201. C = group B by ($0, $2);
  2202. D = foreach C generate flatten(group), COUNT($1);
  2203. store D into ':OUTPATH:';\,
  2204. },
  2205. {
  2206. 'num' => 3,
  2207. 'pig' => q\
  2208. A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: double);
  2209. B = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa, extra1, extra2);
  2210. C = join A by (name, age), B by (name, extra1);
  2211. store C into ':OUTPATH:';\,
  2212. # The following SQL should produce empty results, which will match what our pig query should produce.
  2213. }
  2214. ],
  2215. },
  2216. {
  2217. 'name' => 'Aliases',
  2218. # check access of a field using multiple valid aliases
  2219. 'tests' => [
  2220. {
  2221. # check that a free standing alias reference works
  2222. # when it is unambiguous
  2223. # check that a fully qualified alias reference works
  2224. # check that a partially qualified unambiguous alias reference works
  2225. 'num' => 1,
  2226. 'pig' => q\
  2227. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2228. b = group a by name;
  2229. c = foreach b generate flatten(a);
  2230. d = filter c by name != 'fred';
  2231. e = group d by name;
  2232. f = foreach e generate flatten(d);
  2233. g = foreach f generate name, d::a::name as dname, a::name as aname;
  2234. store g into ':OUTPATH:';\,
  2235. },
  2236. {
  2237. # check that the "group" alias is available
  2238. # after a flatten(group)
  2239. 'num' => 2,
  2240. 'pig' => q\
  2241. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2242. b = group a by name;
  2243. c = foreach b generate flatten(group), COUNT(a) as cnt;
  2244. d = foreach c generate group;
  2245. store d into ':OUTPATH:';\,
  2246. },
  2247. ],
  2248. },
  2249. {
  2250. 'name' => 'Lineage',
  2251. #test if the right cast function is picked
  2252. 'tests' => [
  2253. {
  2254. 'num' => 1,
  2255. 'pig' => q\
  2256. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2257. b = load ':INPATH:/singlefile/textdoc' using TextLoader() as (sentence);
  2258. c = cogroup a ALL, b ALL;
  2259. d = foreach c generate flatten(a), flatten(b);
  2260. e = foreach d generate name, flatten(TOKENIZE((chararray)sentence)) as sentence;
  2261. f = foreach e generate CONCAT((chararray)name, sentence);
  2262. store f into ':OUTPATH:';\,
  2263. },
  2264. {
  2265. 'num' => 2,
  2266. 'pig' => q\
  2267. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa: double);
  2268. b = load ':INPATH:/singlefile/textdoc' using TextLoader() as (sentence);
  2269. c = cross a, b;
  2270. d = foreach c generate name, flatten(TOKENIZE((chararray)sentence)) as sentence;
  2271. e = foreach d generate CONCAT((chararray)name, sentence);
  2272. store e into ':OUTPATH:';\,
  2273. },
  2274. {
  2275. 'num' => 3,
  2276. 'pig' => q\
  2277. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa: double);
  2278. b = foreach a generate age as student_age;
  2279. c = filter b by student_age > 50;
  2280. d = foreach c generate student_age + 10;
  2281. store d into ':OUTPATH:';\,
  2282. },
  2283. {
  2284. 'num' => 4,
  2285. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2286. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2287. b = filter a by name lt 'b';
  2288. c = foreach b generate org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, (int)age);
  2289. d = foreach c generate $0#'alice young';
  2290. split d into e if $0 is not null, f if $0 is null;
  2291. store e into ':OUTPATH:';\,
  2292. }
  2293. ],
  2294. },
  2295. {
  2296. 'name' => 'Casts',
  2297. 'tests' => [
  2298. {
  2299. # check that a cast of a value of type
  2300. # same as the result type of the cast works
  2301. # when the value is treated as a bytearray
  2302. 'num' => 1,
  2303. 'floatpostprocess' => 1,
  2304. 'delimiter' => ' ',
  2305. 'pig' => q\
  2306. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2307. b = foreach a generate name, age, gpa;
  2308. store b into ':OUTPATH:.intermediate' using BinStorage();
  2309. c = load ':OUTPATH:.intermediate' using BinStorage();
  2310. -- after this load, the fields are treated as bytearrays though
  2311. -- they are actually "typed", test that the implicit casts
  2312. -- introduced by the operations in the foreach below will work fine
  2313. d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1, $2 + 0.2;
  2314. store d into ':OUTPATH:';\,
  2315. 'notmq' => 1,
  2316. },
  2317. {
  2318. # check that a cast of a value of type
  2319. # same as the result type of the cast works
  2320. # when the value is treated as a bytearray
  2321. 'num' => 2,
  2322. 'floatpostprocess' => 1,
  2323. 'delimiter' => ' ',
  2324. 'pig' => q\
  2325. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float);
  2326. b = foreach a generate name, age, gpa;
  2327. store b into ':OUTPATH:.intermediate' using BinStorage();
  2328. c = load ':OUTPATH:.intermediate' using BinStorage();
  2329. -- after this load, the fields are treated as bytearrays though
  2330. -- they are actually "typed", test that the implicit casts
  2331. -- introduced by the operations in the foreach below will work fine
  2332. d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1L, $2 + 0.2f;
  2333. store d into ':OUTPATH:';\,
  2334. 'notmq' => 1,
  2335. },
  2336. {
  2337. #check that a cast of a value of type
  2338. #same as the result type of the cast works
  2339. #when the value is treated as a bytearray
  2340. 'num' => 3,
  2341. 'floatpostprocess' => 1,
  2342. 'delimiter' => ' ',
  2343. 'pig' => q\
  2344. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float);
  2345. b = group a by name;
  2346. c = foreach b generate a, (1,2,3), ['key1'#'value1','key2'#'value2'];
  2347. -- store the bag, tuple and map
  2348. store c into ':OUTPATH:.intermediate' using BinStorage();
  2349. d = load ':OUTPATH:.intermediate' using BinStorage() as (b:bag{t:tuple(x,y,z)}, t2:tuple(a,b,c), m:map[]);
  2350. -- after this load, the fields are treated as bytearrays though
  2351. -- they are actually "typed", test that the implicit casts
  2352. -- introduced by the operations in the foreach below will work fine
  2353. e = foreach d generate COUNT(b), t2.a, t2.b, t2.c, m#'key1', m#'key2';
  2354. store e into ':OUTPATH:';\,
  2355. 'notmq' => 1,
  2356. },
  2357. {
  2358. # check that a cast of a value of type
  2359. # same as the result type of the cast works
  2360. # when the value is treated as a bytearray
  2361. 'num' => 4,
  2362. 'floatpostprocess' => 1,
  2363. 'delimiter' => ' ',
  2364. 'pig' => q\
  2365. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2366. b = foreach a generate name, age, gpa;
  2367. store b into ':OUTPATH:.intermediate' using PigStorage();
  2368. c = load ':OUTPATH:.intermediate' using PigStorage();
  2369. -- after this load, the fields are treated as bytearrays though
  2370. -- they are actually "typed", test that the implicit casts
  2371. -- introduced by the operations in the foreach below will work fine
  2372. d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1, $2 + 0.2;
  2373. store d into ':OUTPATH:';\,
  2374. 'notmq' => 1,
  2375. },
  2376. {
  2377. # check that a cast of a value of type
  2378. # same as the result type of the cast works
  2379. # when the value is treated as a bytearray
  2380. 'num' => 5,
  2381. 'floatpostprocess' => 1,
  2382. 'delimiter' => ' ',
  2383. 'pig' => q\
  2384. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float);
  2385. b = foreach a generate name, age, gpa;
  2386. store b into ':OUTPATH:.intermediate' using PigStorage();
  2387. c = load ':OUTPATH:.intermediate' using PigStorage();
  2388. -- after this load, the fields are treated as bytearrays though
  2389. -- they are actually "typed", test that the implicit casts
  2390. -- introduced by the operations in the foreach below will work fine
  2391. d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1L, $2 + 0.2f;
  2392. store d into ':OUTPATH:';\,
  2393. 'notmq' => 1,
  2394. },
  2395. {
  2396. #check that a bag, tuple and map stored with PigStorage can be
  2397. #re-loaded with an explicit complex-type schema and then used
  2398. #by COUNT, tuple field projection and map lookup
  2399. 'num' => 6,
  2400. 'floatpostprocess' => 1,
  2401. 'delimiter' => ' ',
  2402. 'pig' => q\
  2403. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float);
  2404. b = group a by name;
  2405. c = foreach b generate a, (1,2,3), ['key1'#'value1','key2'#'value2'];
  2406. -- store the bag, tuple and map
  2407. store c into ':OUTPATH:.intermediate' using PigStorage();
  2408. d = load ':OUTPATH:.intermediate' using PigStorage() as (b:bag{t:tuple(x,y,z)}, t2:tuple(a,b,c), m:map[]);
  2409. -- after this load, the fields are treated as bytearrays though
  2410. -- they are actually "typed", test that the implicit casts
  2411. -- introduced by the operations in the foreach below will work fine
  2412. e = foreach d generate COUNT(b), t2.a, t2.b, t2.c, m#'key1', m#'key2';
  2413. store e into ':OUTPATH:';\,
  2414. 'notmq' => 1,
  2415. },
  2416. {
  2417. 'num' => 7, # cast an untyped field to boolean and filter on the literal true; the verify script filters the same column as the string 'true'
  2418. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name, age, gpa, instate);
  2419. b = foreach a generate (boolean)instate;
  2420. c = filter b by instate == true;
  2421. store c into ':OUTPATH:';\,
  2422. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  2423. b = foreach a generate instate;
  2424. c = filter b by instate == 'true';
  2425. store c into ':OUTPATH:';\,
  2426. }
  2427. ],
  2428. },
  2429. {
  2430. 'name' => 'ClassResolution',
  2431. 'tests' => [
  2432. {
  2433. # check that a Loader referenced by its simple class name (no package)
  2434. # resolves when its package is listed in udf.import.list; the script must
  2435. # NOT fully qualify DumpLoader, or the import list is never exercised
  2436. 'num' => 1,
  2437. 'floatpostprocess' => 1,
  2438. 'delimiter' => ' ',
  2439. 'java_params' => ['-Dudf.import.list=org.apache.pig.test.udf.storefunc'],
  2440. 'pig' => q\
  2441. register :FUNCPATH:/testudf.jar;
  2442. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2443. b = foreach a generate CONCAT('(', name), CONCAT((chararray)age, ' )');
  2444. store b into ':OUTPATH:.intermediate' using PigStorage(',');
  2445. c = load ':OUTPATH:.intermediate' using DumpLoader();
  2446. store c into ':OUTPATH:';\,
  2447. 'notmq' => 1,
  2448. },
  2449. ],
  2450. },
  2451. {
  2452. 'name' => 'MergeJoin',
  2453. 'tests' => [
  2454. # Simplest merge-join.
  2455. {
  2456. 'num' => 1,
  2457. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2458. b = load ':INPATH:/singlefile/votertab10k';
  2459. c = order a by $0;
  2460. d = order b by $0;
  2461. store c into ':OUTPATH:.intermediate1';
  2462. store d into ':OUTPATH:.intermediate2';
  2463. exec;
  2464. e = load ':OUTPATH:.intermediate1';
  2465. f = load ':OUTPATH:.intermediate2';
  2466. g = join e by $0, f by $0 using 'merge';
  2467. store g into ':OUTPATH:';\,
  2468. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2469. b = load ':INPATH:/singlefile/votertab10k';
  2470. g = join a by $0, b by $0;
  2471. store g into ':OUTPATH:';\,
  2472. 'notmq' => 1,
  2473. },
  2474. # Merge-join with left-side filter
  2475. {
  2476. 'num' => 2,
  2477. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2478. b = load ':INPATH:/singlefile/votertab10k';
  2479. c = order a by $0;
  2480. d = order b by $0;
  2481. store c into ':OUTPATH:.intermediate1';
  2482. store d into ':OUTPATH:.intermediate2';
  2483. exec;
  2484. e = load ':OUTPATH:.intermediate1';
  2485. h = filter e by $1 > 30;
  2486. f = load ':OUTPATH:.intermediate2';
  2487. g = join h by $0, f by $0 using 'merge';
  2488. store g into ':OUTPATH:';\,
  2489. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2490. b = load ':INPATH:/singlefile/votertab10k';
  2491. h = filter a by $1 > 30;
  2492. g = join h by $0, b by $0;
  2493. store g into ':OUTPATH:';\,
  2494. 'notmq' => 1,
  2495. },
  2496. # Merge-join with right-side filter
  2497. {
  2498. 'num' => 3,
  2499. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2500. b = load ':INPATH:/singlefile/votertab10k';
  2501. c = order a by $0;
  2502. d = order b by $0;
  2503. store c into ':OUTPATH:.intermediate1';
  2504. store d into ':OUTPATH:.intermediate2';
  2505. exec;
  2506. e = load ':OUTPATH:.intermediate1';
  2507. f = load ':OUTPATH:.intermediate2';
  2508. i = filter f by $2 != 'democrat';
  2509. g = join e by $0, i by $0 using 'merge';
  2510. store g into ':OUTPATH:';\,
  2511. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2512. b = load ':INPATH:/singlefile/votertab10k';
  2513. i = filter b by $2 != 'democrat';
  2514. g = join a by $0, i by $0;
  2515. store g into ':OUTPATH:';\,
  2516. 'notmq' => 1,
  2517. },
  2518. # Merge-join with schemas
  2519. {
  2520. 'num' => 4,
  2521. 'floatpostprocess' => 1,
  2522. 'delimiter' => ' ',
  2523. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2524. b = load ':INPATH:/singlefile/votertab10k';
  2525. c = order a by $0;
  2526. d = order b by $0;
  2527. store c into ':OUTPATH:.intermediate1';
  2528. store d into ':OUTPATH:.intermediate2';
  2529. exec;
  2530. e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float);
  2531. f = load ':OUTPATH:.intermediate2' as (name:chararray, age:int, reg:chararray, contrib:float);
  2532. g = join e by $0, f by $0 using 'merge';
  2533. store g into ':OUTPATH:';\,
  2534. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2535. b = load ':INPATH:/singlefile/votertab10k';
  2536. g = join a by $0, b by $0;
  2537. store g into ':OUTPATH:';\,
  2538. 'notmq' => 1,
  2539. },
  2540. # Merge-join on a composite (two-column) key
  2541. {
  2542. 'num' => 5,
  2543. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2544. b = load ':INPATH:/singlefile/votertab10k';
  2545. c = order a by $0,$1;
  2546. d = order b by $0,$1;
  2547. store c into ':OUTPATH:.intermediate1';
  2548. store d into ':OUTPATH:.intermediate2';
  2549. exec;
  2550. e = load ':OUTPATH:.intermediate1';
  2551. f = load ':OUTPATH:.intermediate2';
  2552. g = join e by ($0,$1), f by ($0,$1) using 'merge';
  2553. store g into ':OUTPATH:';\,
  2554. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2555. b = load ':INPATH:/singlefile/votertab10k';
  2556. g = join a by ($0,$1), b by ($0,$1);
  2557. store g into ':OUTPATH:';\,
  2558. 'notmq' => 1,
  2559. },
  2560. # Merge-join with key as an expression; the expression ($1+10) preserves the sort order of $1
  2561. {
  2562. 'num' => 6,
  2563. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2564. b = load ':INPATH:/singlefile/votertab10k';
  2565. c = order a by $1;
  2566. d = order b by $1;
  2567. store c into ':OUTPATH:.intermediate1';
  2568. store d into ':OUTPATH:.intermediate2';
  2569. exec;
  2570. e = load ':OUTPATH:.intermediate1';
  2571. f = load ':OUTPATH:.intermediate2';
  2572. g = join e by ($1+10), f by ($1+10) using 'merge';
  2573. store g into ':OUTPATH:';\,
  2574. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2575. b = load ':INPATH:/singlefile/votertab10k';
  2576. g = join a by ($1+10), b by ($1+10) ;
  2577. store g into ':OUTPATH:';\,
  2578. 'notmq' => 1,
  2579. },
  2580. # Merge-join with nulls in keys and data.
  2581. {
  2582. 'num' => 7,
  2583. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
  2584. b = load ':INPATH:/singlefile/voternulltab10k';
  2585. c = order a by $0;
  2586. d = order b by $0;
  2587. store c into ':OUTPATH:.intermediate1';
  2588. store d into ':OUTPATH:.intermediate2';
  2589. exec;
  2590. e = load ':OUTPATH:.intermediate1';
  2591. f = load ':OUTPATH:.intermediate2';
  2592. g = join e by $0, f by $0 using 'merge';
  2593. store g into ':OUTPATH:';\,
  2594. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
  2595. b = load ':INPATH:/singlefile/voternulltab10k';
  2596. g = join a by $0, b by $0;
  2597. store g into ':OUTPATH:';\,
  2598. 'notmq' => 1,
  2599. },
  2600. # Merge-join with one file across multiple blocks
  2601. {
  2602. 'num' => 8,
  2603. 'execonly' => 'mapred', # since this join will run out of memory in local mode
  2604. 'floatpostprocess' => 1,
  2605. 'delimiter' => ' ',
  2606. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k';
  2607. b = load ':INPATH:/singlefile/studenttab20m';
  2608. h = filter b by $2 < 1.5;
  2609. c = order a by $0;
  2610. d = order h by $0 parallel 1;
  2611. store c into ':OUTPATH:.intermediate1';
  2612. store d into ':OUTPATH:.intermediate2';
  2613. exec;
  2614. e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, reg:chararray, contrib:float);
  2615. f = load ':OUTPATH:.intermediate2'as (name:chararray, age:int, gpa:float);
  2616. g = join e by $0, f by $0 using 'merge';
  2617. i = filter g by $2 == 'democrat' and $1 > 76;
  2618. store i into ':OUTPATH:';\,
  2619. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k';
  2620. b = load ':INPATH:/singlefile/studenttab20m';
  2621. h = filter b by $2 < 1.5;
  2622. g = join a by $0, h by $0;
  2623. i = filter g by $2 == 'democrat' and $1 > 76;
  2624. store i into ':OUTPATH:';\,
  2625. 'notmq' => 1,
  2626. },
  2627. # Merge-join with join on numeric key
  2628. {
  2629. 'num' => 9,
  2630. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2631. b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  2632. c = order a by age;
  2633. d = order b by age;
  2634. store c into ':OUTPATH:.intermediate1';
  2635. store d into ':OUTPATH:.intermediate2';
  2636. exec;
  2637. e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float);
  2638. f = load ':OUTPATH:.intermediate2' as (name:chararray, age:int, reg:chararray, contrib:float);
  2639. g = join e by age, f by age using 'merge';
  2640. store g into ':OUTPATH:';\,
  2641. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2642. b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  2643. g = join a by age, b by age;
  2644. store g into ':OUTPATH:';\,
  2645. 'notmq' => 1,
  2646. },
  2647. ]
  2648. },
  2649. {
  2650. 'name' => 'SkewedJoin',
  2651. 'floatpostprocess' => 1,
  2652. 'delimiter' => ' ',
  2653. 'tests' => [
  2654. {
  2655. 'num' => 1, # basic skewed join on name with parallel 8; verified against the same join without 'skewed'
  2656. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
  2657. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2658. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2659. e = join a by name, b by name using 'skewed' parallel 8;
  2660. store e into ':OUTPATH:';\,
  2661. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2662. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2663. e = join a by name, b by name;
  2664. store e into ':OUTPATH:';\,
  2665. },
  2666. # basic join with no skewed keys
  2667. {
  2668. 'num' => 2,
  2669. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=10000'],
  2670. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age,
  2671. gpa);
  2672. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2673. e = join a by name, b by name using 'skewed';
  2674. store e into ':OUTPATH:';\,
  2675. 'verify_pig_script' =>q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age,
  2676. gpa);
  2677. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2678. e = join a by name, b by name ;
  2679. store e into ':OUTPATH:';\,
  2680. },
  2681. # join after filtering
  2682. {
  2683. 'num' => 3,
  2684. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=3'],
  2685. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age,
  2686. gpa);
  2687. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2688. c = filter a by age < 20;
  2689. d = filter b by age < 20;
  2690. e = join c by $0, d by $0 using 'skewed' parallel 8;
  2691. store e into ':OUTPATH:';\,
  2692. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age,
  2693. gpa);
  2694. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2695. c = filter a by age < 20;
  2696. d = filter b by age < 20;
  2697. e = join c by $0, d by $0 ;
  2698. store e into ':OUTPATH:';\,
  2699. },
  2700. # join by two columns
  2701. {
  2702. 'num' => 4,
  2703. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=3'],
  2704. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2705. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2706. c = filter a by age < 20;
  2707. d = filter b by age < 20;
  2708. e = join c by (name, age), d by (name, age) using 'skewed' parallel 8;
  2709. store e into ':OUTPATH:';\,
  2710. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2711. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2712. c = filter a by age < 20;
  2713. d = filter b by age < 20;
  2714. e = join c by (name, age), d by (name, age) ;
  2715. store e into ':OUTPATH:';\,
  2716. },
  2717. # join with add
  2718. {
  2719. 'num' => 5,
  2720. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=50'],
  2721. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double);
  2722. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2723. c = filter a by age < 20;
  2724. d = filter b by age < 20;
  2725. e = join c by age+10, d by age + 20 using 'skewed' parallel 10;
  2726. store e into ':OUTPATH:';\,
  2727. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double);
  2728. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2729. c = filter a by age < 20;
  2730. d = filter b by age < 20;
  2731. e = join c by age+10, d by age + 20 ;
  2732. store e into ':OUTPATH:';\,
  2733. },
  2734. # join with split
  2735. {
  2736. 'num' => 6,
  2737. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
  2738. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2739. b = filter a by $1 > 25;
  2740. c = join a by $0, b by $0 using 'skewed' parallel 7;
  2741. store c into ':OUTPATH:';\,
  2742. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2743. b = filter a by $1 > 25;
  2744. c = join a by $0, b by $0 ;
  2745. store c into ':OUTPATH:';\,
  2746. },
  2747. # join with UDF
  2748. {
  2749. 'num' => 7,
  2750. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=20'],
  2751. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2752. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2753. c = filter a by age < 20;
  2754. d = filter b by age < 20;
  2755. e = join c by SIZE(name), d by SIZE(name) using 'skewed' parallel 7;
  2756. store e into ':OUTPATH:';\,
  2757. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2758. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2759. c = filter a by age < 20;
  2760. d = filter b by age < 20;
  2761. e = join c by SIZE(name), d by SIZE(name) ;
  2762. store e into ':OUTPATH:';\,
  2763. },
  2764. # left outer join
  2765. {
  2766. 'num' => 8,
  2767. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
  2768. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2769. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2770. e = join a by name left outer, b by name using 'skewed' parallel 8;
  2771. store e into ':OUTPATH:';\,
  2772. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2773. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2774. e = join a by name left outer, b by name ;
  2775. store e into ':OUTPATH:';\,
  2776. },
  2777. # right outer join
  2778. {
  2779. 'num' => 9,
  2780. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
  2781. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2782. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2783. e = join a by name right outer, b by name using 'skewed' parallel 8;
  2784. store e into ':OUTPATH:';\,
  2785. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2786. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2787. e = join a by name right outer, b by name ;
  2788. store e into ':OUTPATH:';\,
  2789. },
  2790. # full outer join
  2791. {
  2792. 'num' => 10,
  2793. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
  2794. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2795. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2796. e = join a by name full outer, b by name using 'skewed' parallel 8;
  2797. store e into ':OUTPATH:';\,
  2798. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2799. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2800. e = join a by name full outer, b by name ;
  2801. store e into ':OUTPATH:';\,
  2802. },
  2803. ]
  2804. },
  2805. {
  2806. 'name' => 'CollectedGroup',
  2807. 'tests' => [
  2808. # Simplest collected group.
  2809. {
  2810. 'num' => 1,
  2811. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2812. b = order a by $0;
  2813. store b into ':OUTPATH:.intermediate';
  2814. exec;
  2815. register :FUNCPATH:/testudf.jar;
  2816. c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader();
  2817. d = group c by $0 using 'collected';
  2818. e = foreach d generate group, COUNT(c);
  2819. store e into ':OUTPATH:';\,
  2820. 'notmq' => 1,
  2821. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2822. d = group a by $0 ;
  2823. e = foreach d generate group, COUNT(a);
  2824. store e into ':OUTPATH:';\,
  2825. },
  2826. # Collected group with filter
  2827. {
  2828. 'num' => 2,
  2829. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2830. b = order a by $0;
  2831. store b into ':OUTPATH:.intermediate';
  2832. exec;
  2833. register :FUNCPATH:/testudf.jar;
  2834. c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader();
  2835. d = filter c by $1 > 30;
  2836. e = group d by $0 using 'collected';
  2837. f = foreach e generate group, COUNT(d);
  2838. store f into ':OUTPATH:';\,
  2839. 'notmq' => 1,
  2840. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2841. d = filter a by $1 > 30;
  2842. e = group d by $0 ;
  2843. f = foreach e generate group, COUNT(d);
  2844. store f into ':OUTPATH:';\,
  2845. },
  2846. # Collected group with schemas
  2847. {
  2848. 'num' => 3,
  2849. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2850. b = order a by $0;
  2851. store b into ':OUTPATH:.intermediate';
  2852. exec;
  2853. register :FUNCPATH:/testudf.jar;
  2854. c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float);
  2855. d = group c by $0 using 'collected';
  2856. e = foreach d generate group, MAX(c.age);
  2857. store e into ':OUTPATH:';\,
  2858. 'notmq' => 1,
  2859. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2860. d = group a by $0 ;
  2861. e = foreach d generate group, MAX(a.$1);
  2862. store e into ':OUTPATH:';\,
  2863. },
  2864. # Collected group with multiple columns
  2865. {
  2866. 'num' => 4,
  2867. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2868. b = order a by name, age;
  2869. store b into ':OUTPATH:.intermediate';
  2870. exec;
  2871. register :FUNCPATH:/testudf.jar;
  2872. c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float);
  2873. d = group c by (name, age) using 'collected';
  2874. e = foreach d generate group.name, group.age, MIN(c.gpa);
  2875. store e into ':OUTPATH:';\,
  2876. 'notmq' => 1,
  2877. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2878. d = group a by (name, age) ;
  2879. e = foreach d generate group.name, group.age, MIN(a.gpa);
  2880. store e into ':OUTPATH:';\,
  2881. },
  2882. # Collected group with nulls in keys and data.
  2883. {
  2884. 'num' => 5,
  2885. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
  2886. b = order a by $0;
  2887. store b into ':OUTPATH:.intermediate';
  2888. exec;
  2889. register :FUNCPATH:/testudf.jar;
  2890. c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float);
  2891. d = group c by $0 using 'collected';
  2892. e = foreach d generate group, SUM(c.$1);
  2893. store e into ':OUTPATH:';\,
  2894. 'notmq' => 1,
  2895. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float);
  2896. d = group a by $0 ;
  2897. e = foreach d generate group, SUM(a.$1);
  2898. store e into ':OUTPATH:';\,
  2899. },
  2900. # Collected group with numeric key
  2901. {
  2902. 'num' => 6,
  2903. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2904. b = order a by age;
  2905. store b into ':OUTPATH:.intermediate';
  2906. exec;
  2907. register :FUNCPATH:/testudf.jar;
  2908. c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float);
  2909. d = group c by age using 'collected';
  2910. e = foreach d generate group, AVG(c.gpa), COUNT(c.name);
  2911. store e into ':OUTPATH:';\,
  2912. 'notmq' => 1,
  2913. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2914. d = group a by age ;
  2915. e = foreach d generate group, AVG(a.gpa), COUNT(a.name);
  2916. store e into ':OUTPATH:';\,
  2917. },
  2918. ]
  2919. },
  2920. {
  2921. 'name' => 'SecondarySort',
  2922. 'tests' => [
  2923. {
  2924. # simple order by
  2925. 'num' => 1,
  2926. 'java_params' => ['-Dpig.accumulative.batchsize=5'],
  2927. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2928. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2929. b = group a by age parallel 10;
  2930. c = foreach b {
  2931. d = order a by name;
  2932. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d);
  2933. };
  2934. store c into ':OUTPATH:';\,
  2935. },
  2936. {
  2937. # order by desc
  2938. 'num' => 2,
  2939. 'java_params' => ['-Dpig.accumulative.batchsize=5'],
  2940. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2941. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2942. b = group a by age parallel 10;
  2943. c = foreach b {
  2944. d = order a by name desc;
  2945. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d);
  2946. };
  2947. store c into ':OUTPATH:';\,
  2948. },
  2949. {
  2950. # order by float type
  2951. 'num' => 3,
  2952. 'java_params' => ['-Dpig.accumulative.batchsize=5'],
  2953. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2954. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2955. b = group a by age parallel 10;
  2956. c = foreach b {
  2957. d = order a by gpa;
  2958. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.gpa);
  2959. };
  2960. store c into ':OUTPATH:';\,
  2961. },
  2962. # order by string type
  2963. {
  2964. 'num' => 4,
  2965. 'java_params' => ['-Dpig.accumulative.batchsize=5'],
  2966. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2967. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2968. b = group a by age parallel 10;
  2969. c = foreach b {
  2970. d = order a by name;
  2971. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.name);
  2972. };
  2973. store c into ':OUTPATH:';\,
  2974. },
  2975. # simple distinct
  2976. {
  2977. 'num' => 5,
  2978. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  2979. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2980. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2981. b = group a by age parallel 10;
  2982. c = foreach b {
  2983. d = a.name;
  2984. e = distinct d;
  2985. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(e);
  2986. };
  2987. store c into ':OUTPATH:';\,
  2988. },
  2989. # distinct on tuple
  2990. {
  2991. 'num' => 6,
  2992. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  2993. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2994. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2995. b = group a by age parallel 10;
  2996. c = foreach b {
  2997. d = distinct a;
  2998. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d);
  2999. };
  3000. store c into ':OUTPATH:';\,
  3001. },
  3002. # sort by two columns
  3003. {
  3004. 'num' => 7,
  3005. 'java_params' => ['-Dpig.accumulative.batchsize=5'],
  3006. 'pig' => q\register :FUNCPATH:/testudf.jar;
  3007. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  3008. b = group a by age parallel 10;
  3009. c = foreach b {
  3010. d = order a by gpa, name desc;
  3011. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.gpa), org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.name);
  3012. };
  3013. store c into ':OUTPATH:';\,
  3014. },
  3015. # sort, distinct mix (order by name, distinct on gpa)
  3016. {
  3017. 'num' => 8,
  3018. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  3019. 'pig' => q\register :FUNCPATH:/testudf.jar;
  3020. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  3021. b = group a by age parallel 10;
  3022. c = foreach b {
  3023. d = order a by name;
  3024. e = d.gpa;
  3025. f = distinct e;
  3026. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(f);
  3027. };
  3028. store c into ':OUTPATH:';\,
  3029. },
  3030. # sort, distinct mix (order by gpa, distinct on gpa)
  3031. {
  3032. 'num' => 9,
  3033. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  3034. 'pig' => q\register :FUNCPATH:/testudf.jar;
  3035. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  3036. b = group a by age parallel 10;
  3037. c = foreach b {
  3038. d = order a by gpa;
  3039. e = d.gpa;
  3040. f = distinct e;
  3041. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(f);
  3042. };
  3043. store c into ':OUTPATH:';\,
  3044. },
  3045. {
  3046. # secondary sort boolean
  3047. 'num' => 10,
  3048. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
  3049. b = group a by age;
  3050. c = foreach b {
  3051. d = order a by instate;
  3052. generate group, flatten(d);
  3053. };
  3054. store c into ':OUTPATH:';\,
  3055. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  3056. b = group a by age;
  3057. c = foreach b {
  3058. d = order a by instate;
  3059. generate group, flatten(d);
  3060. };
  3061. store c into ':OUTPATH:';\,
  3062. }
  3063. ]
  3064. },
  3065. {
  3066. 'name' => 'Accumulator',
  3067. 'tests' => [
  3068. {
  3069. 'num' => 1, # SUM over one input of a two-way cogroup, then filter on the aggregate
  3070. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], # sets nocombiner=true and batch size 5; presumably to exercise the accumulator path - confirm
  3071. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa);
  3072. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  3073. e = cogroup a by name, b by name parallel 8;
  3074. f = foreach e generate group, SUM(a.age) as s;
  3075. g = filter f by s>0;
  3076. store g into ':OUTPATH:';\,
  3077. },
  3078. {
  3079. 'num' => 2, # COUNT, MAX and MIN over the same grouped bag in a single foreach
  3080. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], # sets nocombiner=true and batch size 5; presumably to exercise the accumulator path - confirm
  3081. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
  3082. e = group a by name parallel 8;
  3083. f = foreach e generate group, COUNT(a), MAX(a.contributions), MIN(a.contributions) ;
  3084. store f into ':OUTPATH:';\,
  3085. },
  3086. {
  3087. 'num' => 3,
  3088. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  3089. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
  3090. e = group a by name parallel 8;
  3091. f = foreach e generate group, (MAX(a.contributions)-MIN(a.contributions))*COUNT(a) ;
  3092. store f into ':OUTPATH:';\,
  3093. },
  3094. {
  3095. 'num' => 4,
  3096. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  3097. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
  3098. e = group a by name parallel 8;
  3099. f = foreach e { g = distinct a.age; generate group, COUNT(g);}
  3100. store f into ':OUTPATH:';\,
  3101. },
  3102. {
  3103. 'num' => 5,
  3104. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=1'],
  3105. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
  3106. register :FUNCPATH:/testudf.jar;
  3107. DEFINE YesAccumulate org.apache.pig.TestingAccumulatorHelper('false');
  3108. DEFINE NoAccumulate org.apache.pig.TestingAccumulatorHelper('true');
  3109. b = foreach (group a all) generate COUNT(a) as ct, YesAccumulate(a) as yes_acc, NoAccumulate(a) as no_acc;
  3110. store b into ':OUTPATH:';\,
  3111. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
  3112. b = foreach (group a all) generate COUNT(a) as ct;
  3113. c = foreach b generate ct, ct as ct2, 0;
  3114. store c into ':OUTPATH:';\,
  3115. },
  3116. {
  3117. 'num' => 6,
  3118. 'java_params' => ['-Dpig.exec.nocombiner=true'],
  3119. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
  3120. register :FUNCPATH:/testudf.jar;
  3121. DEFINE YesAccumulate org.apache.pig.TestingAccumulatorHelper('false');
  3122. DEFINE NoAccumulate org.apache.pig.TestingAccumulatorHelper('true');
  3123. b = foreach (group a all) generate org.apache.pig.test.udf.evalfunc.NonAlgNonAccCount(a) as ct, YesAccumulate(a) as yes_acc, NoAccumulate(a) as no_acc;
  3124. store b into ':OUTPATH:';\,
  3125. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
  3126. register :FUNCPATH:/testudf.jar;
  3127. b = foreach (group a all) generate org.apache.pig.test.udf.evalfunc.NonAlgNonAccCount(a) as ct;
  3128. c = foreach b generate ct, 1, 1;
  3129. store c into ':OUTPATH:';\,
  3130. },
  3131. {
  3132. 'num' => 7,
  3133. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  3134. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:long, registration, contributions);
  3135. register :FUNCPATH:/testudf.jar;
  3136. b = foreach (group a all) generate COUNT(a),
  3137. org.apache.pig.test.udf.evalfunc.IteratingAccumulatorCount(a),
  3138. org.apache.pig.test.udf.evalfunc.IteratingAccumulatorSum(a.age),
  3139. org.apache.pig.test.udf.evalfunc.IteratingAccumulatorIsEmpty(a);
  3140. store b into ':OUTPATH:';\,
  3141. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:long, registration, contributions);
  3142. b = foreach (group a all) generate COUNT(a), SUM(a.age), IsEmpty(a);
  3143. c = foreach b generate $0, *;
  3144. store c into ':OUTPATH:';\,
  3145. },
  3146. {
  3147. 'num' => 8,
  3148. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  3149. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:long, registration, contributions);
  3150. register :FUNCPATH:/testudf.jar;
  3151. b = foreach (group a all) generate org.apache.pig.test.udf.evalfunc.NonAlgNonAccCount(a),
  3152. org.apache.pig.test.udf.evalfunc.IteratingAccumulatorCount(a),
  3153. org.apache.pig.test.udf.evalfunc.IteratingAccumulatorSum(a.age),
  3154. org.apache.pig.test.udf.evalfunc.IteratingAccumulatorIsEmpty(a);
  3155. store b into ':OUTPATH:';\,
  3156. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:long, registration, contributions);
  3157. register :FUNCPATH:/testudf.jar;
  3158. b = foreach (group a all) generate org.apache.pig.test.udf.evalfunc.NonAlgNonAccCount(a), COUNT(a), SUM(a.age), IsEmpty(a);
  3159. store b into ':OUTPATH:';\,
  3160. },
  3161. ]
  3162. },
  3163. {
  3164. 'name' => 'PruneColumns', # loads three columns but uses only one of them
  3165. 'tests' => [
  3166. {
  3167. 'num' => 1, # only the age column is projected and stored after the load
  3168. 'execonly' => 'mapred', # studenttab20m not available in local mode
  3169. 'pig' => q\
  3170. a = load ':INPATH:/singlefile/studenttab20m' using PigStorage() as (name, age, gpa);
  3171. b = foreach a generate age;
  3172. store b into ':OUTPATH:';\,
  3173. }
  3174. ]
  3175. },
  3176. {
  3177. 'name' => 'Bzip', # each test stores to an intermediate path with a bzip extension, loads it back and stores the result
  3178. 'tests' => [
  3179. {
  3180. # test writing and then reading back an intermediate output with the .bz2 extension
  3181. 'num' => 1,
  3182. 'pig' => q\
  3183. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3184. store a into ':OUTPATH:.intermediate.bz2';
  3185. b = load ':OUTPATH:.intermediate.bz2';
  3186. store b into ':OUTPATH:';\,
  3187. 'notmq' => 1,
  3188. },
  3189. {
  3190. # test writing and then reading back an intermediate output with the .bz extension
  3191. 'num' => 2,
  3192. 'pig' => q\
  3193. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3194. store a into ':OUTPATH:.intermediate.bz';
  3195. b = load ':OUTPATH:.intermediate.bz';
  3196. store b into ':OUTPATH:';\,
  3197. 'notmq' => 1,
  3198. },
  3199. ]
  3200. },
  3201. {
  3202. 'name' => 'Scalar', # fields of a "group ... all" aggregate relation referenced as scalars (e.g. c.avg)
  3203. 'tests' => [
  3204. {
  3205. # test scalar in foreach (most common)
  3206. 'num' => 1,
  3207. 'pig' => q\
  3208. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3209. b = group a all;
  3210. c = foreach b generate AVG(a.gpa) as avg, MAX(a.gpa) as max;
  3211. y = foreach a generate name, (gpa - c.avg) / c.max;
  3212. store y into ':OUTPATH:';\,
  3213. 'floatpostprocess' => 1,
  3214. 'delimiter' => ' ',
  3215. },
  3216. {
  3217. # test scalar in filter
  3218. 'num' => 2,
  3219. 'pig' => q\
  3220. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3221. b = group a all;
  3222. c = foreach b generate AVG(a.gpa) as avg;
  3223. y = filter a by gpa > c.avg;
  3224. store y into ':OUTPATH:';\,
  3225. },
  3226. {
  3227. # test scalar computed from one input and used to filter a second, separately loaded input
  3228. 'num' => 3,
  3229. 'pig' => q\
  3230. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3231. b = group a all;
  3232. c = foreach b generate AVG(a.age) as avg;
  3233. x = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age, registration, contributions);
  3234. y = filter x by age > c.avg;
  3235. store y into ':OUTPATH:';\,
  3236. },
  3237. {
  3238. # test two scalars, each computed from a different input, used in the same foreach
  3239. 'num' => 4,
  3240. 'pig' => q\
  3241. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3242. b = group a all;
  3243. c = foreach b generate AVG(a.age) as avg;
  3244. d = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age, registration, contributions);
  3245. e = group d all;
  3246. f = foreach e generate AVG(d.age) as avg;
  3247. y = foreach a generate age/c.avg, age/f.avg;
  3248. store y into ':OUTPATH:';\,
  3249. },
  3250. ]
  3251. },
  3252. {
  3253. 'name' => 'Scripting', # Jython UDFs registered from python/scriptingudf.py (and python/morepythonudfs.py in test 8)
  3254. 'tests' => [
  3255. {
  3256. # test integer square
  3257. 'num' => 1,
  3258. 'pig' => q\
  3259. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3260. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3261. b = foreach a generate myfuncs.square(age);
  3262. store b into ':OUTPATH:';\,
  3263. 'verify_pig_script' => q\
  3264. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3265. b = foreach a generate age * age;
  3266. store b into ':OUTPATH:';\,
  3267. },
  3268. {
  3269. # test string concat and referencing function without a namespace
  3270. 'num' => 2,
  3271. 'pig' => q\
  3272. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython;
  3273. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa);
  3274. b = foreach a generate concat(name) as name;
  3275. store b into ':OUTPATH:';\,
  3276. 'verify_pig_script' => q\
  3277. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  3278. b = foreach a generate CONCAT(name, name);
  3279. store b into ':OUTPATH:';\,
  3280. },
  3281. {
  3282. # test long and double square, plus two references to the same UDF with different schemas
  3283. 'num' => 3,
  3284. 'pig' => q\
  3285. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3286. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:double);
  3287. b = foreach a generate myfuncs.square(age), myfuncs.square(gpa);
  3288. store b into ':OUTPATH:';\,
  3289. 'verify_pig_script' => q\
  3290. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3291. b = foreach a generate age * age, gpa * gpa;
  3292. store b into ':OUTPATH:';\,
  3293. 'floatpostprocess' => 1,
  3294. 'delimiter' => ' ',
  3295. },
  3296. {
  3297. # test method with no schema decorator (ie, returns bytearray)
  3298. 'num' => 4,
  3299. 'pig' => q\
  3300. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3301. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa);
  3302. b = foreach a generate myfuncs.byteconcat(name);
  3303. store b into ':OUTPATH:';\,
  3304. 'verify_pig_script' => q\
  3305. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3306. b = foreach a generate CONCAT(name, name);
  3307. store b into ':OUTPATH:';\,
  3308. },
  3309. {
  3310. # test method with complex types (map, tuple and bag arguments)
  3311. 'num' => 5,
  3312. 'pig' => q\
  3313. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3314. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  3315. b = foreach a generate flatten(myfuncs.complexTypes(m, t, b)) as (mm, mt, mb);
  3316. c = foreach b generate mm#'name', mt.$0, mb.$0;
  3317. store c into ':OUTPATH:';\,
  3318. 'verify_pig_script' => q\
  3319. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  3320. b = foreach a generate SIZE(m#'name'), t.$2, b.$2;
  3321. store b into ':OUTPATH:';\,
  3322. },
  3323. {
  3324. # test null input and output
  3325. 'num' => 6,
  3326. 'pig' => q\
  3327. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3328. a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double);
  3329. b = foreach a generate myfuncs.square(age);
  3330. store b into ':OUTPATH:';\,
  3331. 'verify_pig_script' => q\
  3332. a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double);
  3333. b = foreach a generate age * age;
  3334. store b into ':OUTPATH:';\,
  3335. },
  3336. {
  3337. # test functions that call other functions and include other files
  3338. 'num' => 7,
  3339. 'pig' => q\
  3340. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3341. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3342. b = foreach a generate myfuncs.redirect(age);
  3343. store b into ':OUTPATH:';\,
  3344. 'verify_pig_script' => q\
  3345. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3346. b = foreach a generate age * age;
  3347. store b into ':OUTPATH:';\,
  3348. },
  3349. {
  3350. # test that functions with same names resolve correctly across name spaces
  3351. 'num' => 8,
  3352. 'pig' => q\
  3353. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3354. register ':SCRIPTHOMEPATH:/python/morepythonudfs.py' using jython as morefuncs;
  3355. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3356. b = foreach a generate myfuncs.square(age), morefuncs.square(age);
  3357. store b into ':OUTPATH:';\,
  3358. 'verify_pig_script' => q\
  3359. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3360. b = foreach a generate age * age, age * age * age;
  3361. store b into ':OUTPATH:';\,
  3362. },
  3363. {
  3364. # test a UDF applied to a grouped bag, verified against builtin COUNT
  3365. 'num' => 9,
  3366. 'pig' => q\
  3367. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3368. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3369. b = group a by name;
  3370. c = foreach b generate group, myfuncs.count(a);
  3371. store c into ':OUTPATH:';\,
  3372. 'verify_pig_script' => q\
  3373. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3374. b = group a by name;
  3375. c = foreach b generate group, COUNT(a);
  3376. store c into ':OUTPATH:';\,
  3377. },
  3378. {
  3379. # test passing a boolean field to a UDF (the verify script reads the same column as chararray)
  3380. 'num' => 10,
  3381. 'pig' => q\
  3382. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3383. a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  3384. b = foreach a generate name, myfuncs.adjustgpa(gpa, instate);
  3385. store b into ':OUTPATH:';\,
  3386. 'verify_pig_script' => q\
  3387. a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  3388. b = foreach a generate name, (instate=='true'?gpa:gpa+1);
  3389. store b into ':OUTPATH:';\,
  3390. },
  3391. {
  3392. # test a UDF whose output is verified against 'true'/'false' (presumably a boolean return value; confirm in scriptingudf.py)
  3393. 'num' => 11,
  3394. 'pig' => q\
  3395. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3396. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  3397. b = foreach a generate name, myfuncs.isretired(age);
  3398. store b into ':OUTPATH:';\,
  3399. 'verify_pig_script' => q\
  3400. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  3401. b = foreach a generate name, (age>=60?'true':'false');
  3402. store b into ':OUTPATH:';\,
  3403. },
  3404. {
  3405. # jython udf which returns an array
  3406. 'num' => 12,
  3407. 'pig' => q\
  3408. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3409. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:chararray, gpa:chararray);
  3410. b = foreach a generate CONCAT(CONCAT(age, ' '), gpa) as sentence;
  3411. c = foreach b generate flatten(myfuncs.tokenize(sentence));
  3412. store c into ':OUTPATH:';\,
  3413. 'verify_pig_script' => q\
  3414. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:chararray, gpa:chararray);
  3415. b = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:chararray, gpa:chararray);
  3416. c = foreach a generate age;
  3417. d = foreach b generate gpa;
  3418. e = union c, d;
  3419. store e into ':OUTPATH:';\,
  3420. }
  3421. ]
  3422. },
  3423. {
  3424. 'name' => 'RubyUDFs', # JRuby UDFs registered from ruby/scriptingudfs.rb and ruby/morerubyudfs.rb
  3425. 'tests' => [
  3426. {
  3427. # test integer square
  3428. 'num' => 1,
  3429. 'pig' => q\
  3430. register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
  3431. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3432. b = foreach a generate myfuncs.square(age);
  3433. store b into ':OUTPATH:';\,
  3434. 'verify_pig_script' => q\
  3435. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3436. b = foreach a generate age * age;
  3437. store b into ':OUTPATH:';\,
  3438. },
  3439. {
  3440. # test string concat (function referenced through the myfuncs namespace)
  3441. 'num' => 2,
  3442. 'pig' => q\
  3443. register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
  3444. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa);
  3445. b = foreach a generate myfuncs.concat(name, name);
  3446. store b into ':OUTPATH:';\,
  3447. 'verify_pig_script' => q\
  3448. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  3449. b = foreach a generate CONCAT(name, name);
  3450. store b into ':OUTPATH:';\,
  3451. },
  3452. {
  3453. # test long and double square, plus two references to the same UDF with different schemas
  3454. 'num' => 3,
  3455. 'pig' => q\
  3456. register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
  3457. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:double);
  3458. b = foreach a generate myfuncs.square(age), myfuncs.square(gpa);
  3459. store b into ':OUTPATH:';\,
  3460. 'verify_pig_script' => q\
  3461. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3462. b = foreach a generate age * age, gpa * gpa;
  3463. store b into ':OUTPATH:';\,
  3464. 'floatpostprocess' => 1,
  3465. 'delimiter' => ' ',
  3466. },
  3467. {
  3468. # test method with no schema decorator (ie, returns bytearray)
  3469. 'num' => 4,
  3470. 'pig' => q\
  3471. register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
  3472. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3473. b = foreach a generate myfuncs.byteconcat(name, name);
  3474. store b into ':OUTPATH:';\,
  3475. 'verify_pig_script' => q\
  3476. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3477. b = foreach a generate CONCAT(name, name);
  3478. store b into ':OUTPATH:';\,
  3479. },
  3480. {
  3481. # test method with complex types
  3482. 'num' => 5,
  3483. 'pig' => q\
  3484. register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
  3485. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  3486. b = foreach a generate flatten(myfuncs.complexTypes(m, t, b)) as (mm, mt, mb);
  3487. c = foreach b generate mm#'name', mt.$0, mb.$0;
  3488. store c into ':OUTPATH:';\,
  3489. 'verify_pig_script' => q\
  3490. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  3491. b = foreach a generate SIZE(m#'name'), t.$2, b.$2;
  3492. store b into ':OUTPATH:';\,
  3493. },
  3494. {
  3495. # test null input and output
  3496. 'num' => 6,
  3497. 'pig' => q\
  3498. register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
  3499. a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double);
  3500. b = foreach a generate myfuncs.square(age);
  3501. store b into ':OUTPATH:';\,
  3502. 'verify_pig_script' => q\
  3503. a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double);
  3504. b = foreach a generate age * age;
  3505. store b into ':OUTPATH:';\,
  3506. },
  3507. {
  3508. # test functions that call other functions and include other files
  3509. 'num' => 7,
  3510. 'pig' => q\
  3511. register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
  3512. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3513. b = foreach a generate myfuncs.redirect(age);
  3514. store b into ':OUTPATH:';\,
  3515. 'verify_pig_script' => q\
  3516. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3517. b = foreach a generate age * age;
  3518. store b into ':OUTPATH:';\,
  3519. },
  3520. {
  3521. # test functions from two scripts registered under different namespaces, including names differing only in case (cube / CUBE)
  3522. 'num' => 8,
  3523. 'pig' => q\
  3524. register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
  3525. register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as morefuncs;
  3526. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3527. b = foreach a generate myfuncs.square(age), morefuncs.cube(age), morefuncs.CUBE(age);
  3528. store b into ':OUTPATH:';\,
  3529. 'verify_pig_script' => q\
  3530. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3531. b = foreach a generate age * age, age * age * age, age * age * age;
  3532. store b into ':OUTPATH:';\,
  3533. },
  3534. {
  3535. # test algebraic functions
  3536. 'num' => 9,
  3537. 'pig' => q\
  3538. register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
  3539. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3540. b = group a by name;
  3541. c = foreach b generate group, myfuncs.Count(a);
  3542. store c into ':OUTPATH:';\,
  3543. 'verify_pig_script' => q\
  3544. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3545. b = group a by name;
  3546. c = foreach b generate group, COUNT(a);
  3547. store c into ':OUTPATH:';\,
  3548. },
  3549. {
  3550. # test accumulator functions
  3551. 'num' => 10,
  3552. 'java_params' => ['-Dpig.accumulative.batchsize=5'],
  3553. 'pig' => q\
  3554. register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs;
  3555. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3556. b = group a by name;
  3557. c = foreach b generate group, myfuncs.Sum(a.age), myfuncs.Sum(a.gpa);
  3558. d = foreach c generate $0, $1, (double)((int)$2*100)/100;
  3559. store d into ':OUTPATH:';\,
  3560. 'verify_pig_script' => q\
  3561. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3562. b = group a by name;
  3563. c = foreach b generate group, SUM(a.age), SUM(a.gpa);
  3564. d = foreach c generate $0, $1, (double)((int)$2*100)/100;
  3565. store d into ':OUTPATH:';\,
  3566. },
  3567. {
  3568. 'num' => 11, # UDF result is flattened and verified against the two inputs in swapped order (age, name)
  3569. 'pig' => q\
  3570. register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as myfuncs;
  3571. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3572. b = foreach a generate flatten(myfuncs.reverse(name, age));
  3573. store b into ':OUTPATH:';\,
  3574. 'verify_pig_script' => q\
  3575. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3576. b = foreach a generate age, name;
  3577. store b into ':OUTPATH:';\,
  3578. },
  3579. {
  3580. 'num' => 12, # UDF used as a filter predicate, verified against age%2==0
  3581. 'pig' => q\
  3582. register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as myfuncs;
  3583. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3584. b = filter a by myfuncs.ISEVEN(age);
  3585. store b into ':OUTPATH:';\,
  3586. 'verify_pig_script' => q\
  3587. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3588. b = filter a by age%2==0;
  3589. store b into ':OUTPATH:';\,
  3590. },
  3591. {
  3592. 'num' => 13, # UDF applied to the whole bag with batchsize=5, verified against the Java AppendIndex UDF from testudf.jar
  3593. 'java_params' => ['-Dpig.accumulative.batchsize=5'],
  3594. 'pig' => q\
  3595. register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as myfuncs;
  3596. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3597. b = foreach (group a all) generate FLATTEN(myfuncs.AppendIndex(a));
  3598. store b into ':OUTPATH:';\,
  3599. 'verify_pig_script' => q\
  3600. register :FUNCPATH:/testudf.jar;
  3601. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3602. b = foreach (group a all) generate FLATTEN(org.apache.pig.test.udf.evalfunc.AppendIndex(a));
  3603. store b into ':OUTPATH:';\,
  3604. },
  3605. ]
  3606. },
  3607. {
  3608. 'name' => 'Native', # the "mapreduce" operator running jars from :MAPREDJARS: between a Store and a Load
  3609. 'tests' => [
  3610. {
  3611. # basic case: run wordcount from hadoop-examples.jar on a stored relation and store what it loads back
  3612. 'num' => 1,
  3613. 'pig' => q\
  3614. rmf table_testNativeMRJobSimple_input
  3615. rmf table_testNativeMRJobSimple_output
  3616. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3617. b = mapreduce ':MAPREDJARS:/hadoop-examples.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`;
  3618. store b into ':OUTPATH:';\,
  3619. 'notmq' => 1,
  3620. },
  3621. {
  3622. # native job in the middle of a pipeline: distinct names go in, output is loaded with a schema and then ordered
  3623. 'num' => 2,
  3624. 'pig' => q\
  3625. rmf table_testNativeMRJobSimple_input
  3626. rmf table_testNativeMRJobSimple_output
  3627. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3628. b = foreach a generate name;
  3629. c = distinct b;
  3630. d = mapreduce ':MAPREDJARS:/hadoop-examples.jar' Store c into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`;
  3631. e = order d by name;
  3632. store e into ':OUTPATH:';\,
  3633. 'sortArgs' => ['-t', ' '],
  3634. 'notmq' => 1,
  3635. },
  3636. {
  3637. # streaming jar with /bin/cat as mapper and /usr/bin/wc as reducer; the 'pig23' script differs only in the jar file name
  3638. 'num' => 3,
  3639. 'pig' => q\
  3640. rmf table_testNativeMRJobSimple_input
  3641. rmf table_testNativeMRJobSimple_output
  3642. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3643. b = mapreduce ':MAPREDJARS:/hadoop-streaming.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `-input table_testNativeMRJobSimple_input -output table_testNativeMRJobSimple_output -mapper /bin/cat -reducer /usr/bin/wc`;
  3644. store b into ':OUTPATH:';\,
  3645. 'pig23' => q\
  3646. rmf table_testNativeMRJobSimple_input
  3647. rmf table_testNativeMRJobSimple_output
  3648. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3649. b = mapreduce ':MAPREDJARS:/hadoop-0.23.0-streaming.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `-input table_testNativeMRJobSimple_input -output table_testNativeMRJobSimple_output -mapper /bin/cat -reducer /usr/bin/wc`;
  3650. store b into ':OUTPATH:';\,
  3651. 'notmq' => 1,
  3652. },
  3653. ]
  3654. },
  3655. {
  3656. 'name' => 'Partitioner', # GROUP with a custom partitioner class supplied through PARTITION BY
  3657. 'tests' => [
  3658. {
  3659. # test group by age using SimpleCustomPartitioner2 with parallel 2
  3660. 'num' => 1,
  3661. 'execonly' => 'mapred', # NOTE(review): previously "since this join will run out of memory in local mode", but this script has no join - confirm the real reason for mapred-only
  3662. 'pig' => q\register :FUNCPATH:/testudf.jar;
  3663. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa);
  3664. b = group a by age PARTITION BY org.apache.pig.test.utils.SimpleCustomPartitioner2 parallel 2;
  3665. c = foreach b generate group, COUNT(a);
  3666. store c into ':OUTPATH:';\,
  3667. },
  3668. ]
  3669. },
  3670. {
  3671. ####################################################################
  3672. # SUB : CastScalar
  3673. # FEATURE: adds functionality that allows to cast elements of a single-tuple relation into a scalar value.
  3674. # JIRA: Pig-1434
  3675. #
  3676. # TEST ITEMS:
  3677. # 1 Test syntax
  3678. # 2 Test scalar for simple data type
  3679. # 3 Test scalar for complex data type: tuple, bag, map
  3680. # 4 Test implicit cast
  3681. # 5 Test explicit cast
  3682. # 6 Positional parameter
  3683. # 7 Cast within an aggregate function
  3684. # 8 Cast within an UDF function
  3685. # 9 Cast with a FOREACH
  3686. # 10 Cast with a FILTER
  3687. # 11 Cast with a SPLIT
  3688. # 12 Cast in a JOIN
  3689. # 13 Multiquery
  3690. # 14 Cast on a schema that cannot be inferred should result in bytearray
  3691. # 15 Replicated Join
  3692. # 16 Test operations such as R1 * (int)R1
  3693. # 17 CheckSingular(*)
  3694. # 18 missing field in scalar file
  3695. # 19 scalar referenced from an empty file
  3696. # 20 empty input directory
  3697. # 21 Single row vs Multiple Row
  3698. # 22 Cast on a multi-field tuple
  3699. # 23 Reference a non-scalar as a scalar
  3700. # 24 Test multiple loaders
  3701. 'name' => 'CastScalar',
  3702. 'tests' => [
  3703. {
  3704. # 2 Test scalar for simple data type
  3705. # 3 Test scalar for complex data type: tuple, bag, map
  3706. # 9 Cast with a FOREACH
  3707. #INPATH = /user/hadoopqa/pig/tests/data
  3708. 'num' => 1,
  3709. 'pig' => q#
  3710. a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  3711. b = group a all;
  3712. c = foreach b generate SUM(a.age) as total;
  3713. d = foreach a generate name, age+(double)c.total as d_sum;
  3714. e = order d by name, d_sum;
  3715. store d into ':OUTPATH:';
  3716. #,
  3717. # 6 Positional parameter
  3718. }, {
  3719. 'num' => 2,
  3720. 'pig' => q#
  3721. a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  3722. b = group a all;
  3723. c = foreach b generate SUM(a.age) as total;
  3724. d = foreach a generate name, age+(double)c.$0 as d_sum;
  3725. e = order d by name, d_sum;
  3726. store d into ':OUTPATH:';
  3727. #,
  3728. # 2 Test scalar for simple data type
  3729. # 3 Test scalar for complex data type:map
  3730. # 9 Cast with a FOREACH
  3731. # 13 Multiquery
  3732. # 24 Test multiple loaders
  3733. #INPATH = /user/hadoopqa/pig/tests/data
  3734. }, {
  3735. # 4 Test implicit cast
  3736. # 10 Cast with a FILTER
  3737. #
  3738. # I set the benchmark to use "19" because pig truncates during cast and sql rounds up.
  3739. 'num' => 7,
  3740. 'pig' => q\
  3741. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3742. b = group a by name;
  3743. c = foreach b generate group, AVG(a.gpa)+20 as avg_gpa;
  3744. d = order c by avg_gpa;
  3745. simple_scalar = limit d 1;
  3746. f = filter a by age < (int) simple_scalar.avg_gpa;
  3747. g = order f by name, age, gpa;
  3748. store g into ':OUTPATH:';\,
  3749. }, {
  3750. # 5 Test explicit cast
  3751. # 10 Cast with a FILTER
  3752. 'num' => 8,
  3753. 'pig' => q\
  3754. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3755. b = group a by name;
  3756. c = foreach b generate group, AVG(a.age) AS average;
  3757. d = order c by average;
  3758. simple_scalar = limit d 1;
  3759. d = filter a by age > (int) simple_scalar.average;
  3760. e = foreach d generate name, age;
  3761. store e into ':OUTPATH:';
  3762. \,
  3763. }, {
  3764. # 5 Test explicit cast
  3765. # 6 Positional parameter
  3766. 'num' => 9,
  3767. 'pig' => q\
  3768. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3769. b = group a by name;
  3770. c = foreach b generate group, AVG(a.age) AS average;
  3771. d = order c by average;
  3772. simple_scalar = limit d 1;
  3773. d = filter a by age > (int) simple_scalar.$1;
  3774. e = foreach d generate name, age;
  3775. store e into ':OUTPATH:';
  3776. \,
  3777. }, {
  3778. # 4 Test implicit cast
  3779. # 6 Positional parameter
  3780. # 10 Cast with a FILTER
  3781. 'num' => 10,
  3782. 'pig' => q\
  3783. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3784. b = group a by name;
  3785. c = foreach b generate group, AVG(a.age) AS average;
  3786. d = order c by average;
  3787. simple_scalar = limit d 1;
  3788. d = filter a by age > simple_scalar.$1;
  3789. e = foreach d generate name, age;
  3790. store e into ':OUTPATH:';
  3791. \,
  3792. }, {
  3793. # 4 Test implicit cast
  3794. # 6 Positional parameter
  3795. # 11 Cast with a SPLIT
  3796. 'num' => 11,
  3797. 'pig' => q\
  3798. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3799. b = group a by name;
  3800. c = foreach b generate group, AVG(a.age) AS average;
  3801. d = order c by average;
  3802. simple_scalar = limit d 1;
  3803. split a into X1 if age > (int) simple_scalar.$1, X2 if age < 20;
  3804. split a into X3 if age > (int) simple_scalar.$1, X4 if age > 70;
  3805. store X1 into ':OUTPATH:.1';
  3806. store X2 into ':OUTPATH:.2';
  3807. store X3 into ':OUTPATH:.3';
  3808. store X4 into ':OUTPATH:.4';
  3809. \,
  3810. }, {
  3811. # 4 Test implicit cast
  3812. # 6 Positional parameter
  3813. # 12 Cast with a JOIN
  3814. 'num' => 12,
  3815. 'pig' => q\
  3816. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3817. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  3818. c = filter a by age < 20;
  3819. d = filter b by age < 20;
  3820. simple_scalar = limit d 1;
  3821. e = join c by name, d by name;
  3822. f= filter e by c::age <(int)simple_scalar.age;
  3823. store f into ':OUTPATH:';\,
  3824. },
  3825. ]
  3826. },{
  3827. 'name' => 'udf_TOBAGandTOTUPLE',
  3828. 'sortResults' => 1,
  3829. 'floatpostprocess' => 1,
  3830. 'delimiter' => ' ',
  3831. 'tests' => [
  3832. {
  3833. # TEST : resulting schema for TOBAG/TOTUPLE with simple types
  3834. # TEST : resulting schema for TOBAG/TOTUPLE with positional parameters
  3835. # TEST : resulting schema for various projects using a combination of TOBAG/TOTUPLE and standard projections
  3836. # TEST : resulting schema for various projects using a combination of TOBAG/TOTUPLE using AS clause
  3837. 'num' => 1
  3838. ,'pig' => q?
  3839. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3840. B = limit A 10;
  3841. Gen1 = FOREACH B GENERATE $0, $1, $2 ;
  3842. GroupById = GROUP B BY id;
  3843. B1 = foreach B generate TOBAG( intnum1000, id, intnum5);
  3844. B3 = foreach B generate TOBAG( $0, $1, $2);
  3845. T1= foreach B generate TOTUPLE( intnum1000, id, intnum5);
  3846. T2= foreach B generate TOTUPLE( $0, $1, $2);
  3847. T3 = foreach B generate TOTUPLE( $0, $0, $0);
  3848. T4= foreach B generate TOBAG($0, $1, $2), TOTUPLE($3, $4, $5), $6, $7;
  3849. T5= foreach B generate $0, $1, TOTUPLE($2, $3, $4), TOBAG($5, $6), $7;
  3850. T6= foreach B generate $0, TOTUPLE($0, $0, $0), TOBAG($0, $0), $0 AS duplicate;
  3851. describe Gen1;
  3852. describe GroupById;
  3853. describe B1;
  3854. describe B3;
  3855. describe T1;
  3856. describe T2;
  3857. describe T3;
  3858. describe T4;
  3859. describe T5;
  3860. describe T6;
  3861. ?
  3862. ,'expected_out_regex' => 'B1: {{int}}' # NOTE(review): 'expected_out_regex' is repeated 8 times in this hash; Perl keeps only the last value (T6), so this pattern is discarded
  3863. ,'expected_out_regex' => 'B3: {{int}}' # discarded: overridden by the later duplicate key
  3864. ,'expected_out_regex' => 'T1: {org.apache.pig.builtin.totuple_id_.*: (intnum1000: int,id: int,intnum5: int)}' # discarded: overridden by the later duplicate key
  3865. ,'expected_out_regex' => 'T2: {org.apache.pig.builtin.totuple_id_.*: (intnum1000: int,id: int,intnum5: int)}' # discarded: overridden by the later duplicate key
  3866. ,'expected_out_regex' => 'T3: {org.apache.pig.builtin.totuple_intnum1000.*: (intnum1000: int,intnum1000: int,intnum1000: int)}' # discarded: overridden by the later duplicate key
  3867. ,'expected_out_regex' => 'T4: {{int},org.apache.pig.builtin.totuple_intnum100.*: (intnum100: int,intnum: int,longnum: long),floatnum: float,doublenum: double}' # discarded: overridden by the later duplicate key
  3868. ,'expected_out_regex' => 'T5: {intnum1000: int,id: int,org.apache.pig.builtin.totuple_intnum100.*: (intnum5: int,intnum100: int,intnum: int).*{NULL}.*doublenum: double}' # discarded: overridden by the later duplicate key
  3869. ,'expected_out_regex' => "T6: {intnum1000: int,org.apache.pig.builtin.totuple_intnum1000.*: \\(intnum1000: int,intnum1000: int,intnum1000: int\\),{\\(int\\)},duplicate: int}" # last duplicate key: the only pattern that survives in this hash
  3870. }, {
  3871. # TEST : bag of mixed data types
  3872. # TEST : Order
  3873. # TEST : positional parameters
  3874. 'num' => 2
  3875. ,'pig' => q?
  3876. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3877. C = foreach A generate TOBAG( id, floatnum, doublenum );
  3878. D = foreach A generate TOBAG( id, intnum);
  3879. E = foreach A generate TOBAG( (float) id,floatnum );
  3880. F = foreach A generate TOBAG( (long) id,longnum );
  3881. G = foreach A generate TOBAG( (double) id,doublenum );
  3882. describe C;
  3883. describe D;
  3884. describe E;
  3885. describe F;
  3886. describe G;
  3887. ?
  3888. ,'expected_out_regex' => 'C: {{\\(NULL\\)}}' # NOTE(review): 'expected_out_regex' is repeated 5 times in this hash; Perl keeps only the last value (G), so this pattern is discarded
  3889. ,'expected_out_regex' => 'D: {{\\(int\\)}}' # discarded: overridden by the later duplicate key
  3890. ,'expected_out_regex' => 'E: {{\\(float\\)}}' # discarded: overridden by the later duplicate key
  3891. ,'expected_out_regex' => 'F: {{\\(long\\)}}' # discarded: overridden by the later duplicate key
  3892. ,'expected_out_regex' => 'G: {{\\(double\\)}}' # last duplicate key: the only pattern that survives in this hash
  3893. }, {
  3894. # TEST : TOBAG/TOTUPLE with simple types
  3895. # TEST : TOBAG/TOTUPLE with positional parameters
  3896. # TEST : various projects using a combination of TOBAG/TOTUPLE and standard projections
  3897. # TEST : various projects using a combination of TOBAG/TOTUPLE using AS clause
  3898. 'num' => 3
  3899. ,'pig' => q?
  3900. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3901. B = limit A 10;
  3902. B1 = foreach B generate TOBAG( intnum1000, id, intnum5);
  3903. B2 = foreach B generate TOBAG( $0, $1, $2);
  3904. T1= foreach B generate TOTUPLE( intnum1000, id, intnum5);
  3905. T2= foreach B generate TOTUPLE( $0, $1, $2);
  3906. T3 = foreach B generate TOTUPLE( $0, $0, $0);
  3907. T4= foreach B generate TOBAG($0, $1, $2), TOTUPLE($3, $4, $5), $6, $7;
  3908. T5= foreach B generate $0, $1, TOTUPLE($2, $3, $4), TOBAG($5, $6), $7;
  3909. T6= foreach B generate $0, TOTUPLE($0, $0, $0), TOBAG($0, $0), $0 AS duplicate;
  3910. Gen1 = FOREACH B GENERATE $0, $1, $2 ;
  3911. GroupById = GROUP B BY id;
  3912. store Gen1 into ':OUTPATH:.1';
  3913. store GroupById into ':OUTPATH:.2';
  3914. store B1 into ':OUTPATH:.3';
  3915. store B2 into ':OUTPATH:.4';
  3916. store T1 into ':OUTPATH:.5';
  3917. store T2 into ':OUTPATH:.6';
  3918. store T3 into ':OUTPATH:.7';
  3919. store T4 into ':OUTPATH:.8';
  3920. ?
  3921. }, {
  3922. # TEST : cast for TOTUPLE/TOBAG
  3923. 'num' => 4
  3924. ,'ignore' => 1 # different error message for different version of hadoop
  3925. ,'pig' => q?
  3926. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3927. B= limit A 10;
  3928. C = foreach B generate $0, TOTUPLE((int) $0, (long) $0, (double) $0), TOBAG( (float) $0, (chararray) $0), $0;
  3929. store C into ':OUTPATH:';
  3930. ?
  3931. ,'expected_err_regex' => 'ERROR 1108: Duplicate schema alias'
  3932. ,'rc' => 6
  3933. }, {
  3934. # TEST : cast for TOTUPLE/TOBAG
  3935. 'num' => 5
  3936. ,'pig' => q?
  3937. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3938. B= limit A 1;
  3939. C = foreach B generate $0, TOTUPLE((int) $0);
  3940. D = foreach B generate $0, TOTUPLE((long) $0);
  3941. E = foreach B generate $0, TOTUPLE((double) $0);
  3942. F = foreach B generate $0, TOTUPLE((float) $0);
  3943. G = foreach B generate $0, TOTUPLE((chararray) $0);
  3944. store B into ':OUTPATH:.1';
  3945. store C into ':OUTPATH:.2';
  3946. store D into ':OUTPATH:.3';
  3947. store E into ':OUTPATH:.4';
  3948. store F into ':OUTPATH:.5';
  3949. store G into ':OUTPATH:.6';
  3950. ?
  3951. }, {
  3952. #TEST more complicated nested functions such as TOTUPLE(TOBAG())
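  3953. #TEST more complicated nested functions such as TOBAG(TOTUPLE()) -- NOTE(review): no statement below builds TOBAG(TOTUPLE()); 'tinb' repeats the TOTUPLE(TOBAG(), TOBAG()) expression of 'bint' - confirm intent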
  3954. #TEST more complicated nested functions such as TOTUPLE(TOTUPLE())
  3955. #TEST more complicated nested functions such as TOBAG(TOBAG())
  3956. 'num' => 6
  3957. ,'pig' => q?
  3958. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3959. B = limit A 10;
  3960. tint = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) );
  3961. bint = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) );
  3962. binb = foreach B generate TOBAG( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) );
  3963. tinb = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) );
  3964. store B into ':OUTPATH:.1';
  3965. store tint into ':OUTPATH:.2';
  3966. store bint into ':OUTPATH:.3';
  3967. store binb into ':OUTPATH:.4';
  3968. store tinb into ':OUTPATH:.5';
  3969. ?
  3970. }, {
  3971. #TEST arithmetic operation in TOTUPLE and TOBAG
  3972. #TEST aggregate function - NOT IMPLEMENTED
  3973. #TEST tuple with 50+ items
  3974. #TEST with null
  3975. 'num' => 7
  3976. ,'pig' => q?
  3977. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3978. B = limit A 10;
  3979. B1= foreach B generate TOTUPLE( $1, $2, $3);
  3980. T1= foreach B generate TOTUPLE( $1, $2, $3);
  3981. R1= foreach B generate TOTUPLE( $1, $0+1, $0+2, $0+3),TOBAG($0+4, $0+1 );
  3982. R2= foreach B generate TOTUPLE( $0, $1, $2, $3, $4, $5, $6, $7, (int) 8, (int) 9 , $1, $2, $3, $4, $5, $6, $7, (int) 19, (int) 20, $0, $1, $2, $3, $4, $5, $6, $7 , (int) 29, (int) 30, $0, $1, $2, $3, $4, $5, $6, $7, (int) 39, (int) 40 , $1, $2, $3, $4, $5, $6, $7, (int) 19, (int) 20, $0, $1, $2, $3, $4, $5, $5, $7 );
  3983. R3= foreach B generate $0, TOTUPLE(0,0,0), TOBAG( 0, 0 );
  3984. R4= foreach B generate $0, TOTUPLE(null, id, null), TOBAG( id, null, id,null );
  3985. describe R1;
  3986. describe R2;
  3987. describe R3;
  3988. describe R4;
  3989. store B into ':OUTPATH:.1';
  3990. store B1 into ':OUTPATH:.2';
  3991. store R1 into ':OUTPATH:.3';
  3992. store R2 into ':OUTPATH:.4';
  3993. store R3 into ':OUTPATH:.5';
  3994. store R4 into ':OUTPATH:.6';
  3995. ?
  3996. }, {
  3997. # TEST more TOTUPLE and TOBAG nested combinations
  3998. 'num' => 8
  3999. ,'pig' => q?
  4000. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  4001. B = limit A 10;
  4002. C = foreach B generate TOBAG( $0, $1, $2);
  4003. T1= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) );
  4004. T2= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) );
  4005. T3= foreach B generate TOBAG( TOTUPLE( $1, $2, $3), TOTUPLE($4,$5), TOTUPLE($6,$7));
  4006. store B into ':OUTPATH:.1';
  4007. store C into ':OUTPATH:.2';
  4008. store T1 into ':OUTPATH:.3';
  4009. store T2 into ':OUTPATH:.4';
  4010. store T3 into ':OUTPATH:.5';
  4011. ?
  4012. ,'verify_pig_script' => q?register :FUNCPATH:/testudf.jar;
  4013. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  4014. B = limit A 10;
  4015. C = foreach B generate TOBAG( $0, $1, $2);
  4016. T1= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) );
  4017. T2= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) );
  4018. T3= foreach B generate org.apache.pig.test.udf.evalfunc.TOBAG2( TOTUPLE( $1, $2, $3), TOTUPLE($4,$5), TOTUPLE($6,$7));
  4019. store B into ':OUTPATH:.1';
  4020. store C into ':OUTPATH:.2';
  4021. store T1 into ':OUTPATH:.3';
  4022. store T2 into ':OUTPATH:.4';
  4023. store T3 into ':OUTPATH:.5';
  4024. ?
  4025. }, {
  4026. #TEST negative test case: out of bounds positional parameter
  4027. # EVERYTHING IS CORRECT
  4028. 'num' => 9
  4029. ,'ignore' => 1 # different error message for different version of hadoop
  4030. ,'pig' => q?
  4031. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  4032. B = limit A 10;
  4033. C = foreach B generate $0, $1, TOTUPLE($2, $998, $4), TOBAG($5, $6), $7;
  4034. ?
  4035. ,'expected_err_regex' => 'Out of bound access.*non-existent column: 998'
  4036. }, {
  4037. #TEST negative test case: out of bounds positional parameter
  4038. # EVERYTHING IS CORRECT
  4039. 'num' => 10
  4040. ,'ignore' => 1 # different error message for different version of hadoop
  4041. ,'pig' => q?
  4042. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  4043. B = limit A 10;
  4044. C = foreach B generate $0, $1, TOBAG($5, $999), $7;
  4045. ?
  4046. ,'expected_err_regex' => 'Out of bound access.*non-existent column: 999'
  4047. },
  4048. ] # end of tests
  4049. },{
  4050. 'name' => 'ToStuffSyntaxSugar',
  4051. 'tests' => [
  4052. {
  4053. #TEST TOTUPLE syntax sugar
  4054. 'num' => 1,
  4055. 'pig' => q\
  4056. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4057. B = foreach A generate (name, age);
  4058. store B into ':OUTPATH:';\,
  4059. 'verify_pig_script' => q\
  4060. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4061. B = foreach A generate TOTUPLE(name, age);
  4062. store B into ':OUTPATH:';\,
  4063. }, {
  4064. #TEST TOBAG syntax sugar
  4065. 'num' => 2,
  4066. 'pig' => q\
  4067. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4068. B = foreach A generate {name, age};
  4069. store B into ':OUTPATH:';\,
  4070. 'verify_pig_script' => q\
  4071. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4072. B = foreach A generate TOBAG(name, age);
  4073. store B into ':OUTPATH:';\,
  4074. }, {
  4075. #TEST TOMAP syntax sugar
  4076. 'num' => 3,
  4077. 'pig' => q\
  4078. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4079. B = foreach A generate [name, age];
  4080. store B into ':OUTPATH:';\,
  4081. 'verify_pig_script' => q\
  4082. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4083. B = foreach A generate TOMAP(name, age);
  4084. store B into ':OUTPATH:';\,
  4085. }, {
  4086. #TEST verify single element inside parenthesis does NOT call TOTUPLE
  4087. 'num' => 4,
  4088. 'pig' => q\
  4089. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4090. B = foreach A generate (age) + 1;
  4091. store B into ':OUTPATH:';\,
  4092. 'verify_pig_script' => q\
  4093. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4094. B = foreach A generate (age + 1);
  4095. store B into ':OUTPATH:';\,
  4096. }
  4097. ] # end of tests
  4098. },{
  4099. 'name' => 'MergeOperator',
  4100. 'tests' => [
  4101. {
  4102. # Test Union using merge where schema is identical | A&B have identical schema
  4103. 'num' => 1,
  4104. 'pig' => q\
  4105. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4106. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4107. C = union onschema A, B;
  4108. store C into ':OUTPATH:';\,
  4109. 'verify_pig_script' => q\
  4110. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4111. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4112. C = union A, B;
  4113. store C into ':OUTPATH:';\,
  4114. },{
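  4115. # Test Union using merge with type promotions, int->long and float->double. NOTE(review): the verify script builds D (the explicit casts) but stores C, so D is unused - confirm which alias was intended.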
  4116. 'num' => 2,
  4117. 'floatpostprocess' => 1,
  4118. 'delimiter' => ' ',
  4119. 'pig' => q\
  4120. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4121. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa:double);
  4122. C = union onschema A, B;
  4123. store C into ':OUTPATH:';\,
  4124. 'verify_pig_script' => q\
  4125. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4126. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4127. C = union A, B;
  4128. D = foreach C generate name, (long)age, (double)gpa;
  4129. store C into ':OUTPATH:';\,
  4130. },{
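  4131. # Test Union using merge with type promotions, int->float. NOTE(review): the verify script builds D (the explicit cast) but stores C, so D is unused - confirm which alias was intended.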
  4132. 'num' => 3,
  4133. 'floatpostprocess' => 1,
  4134. 'delimiter' => ' ',
  4135. 'pig' => q\
  4136. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4137. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float);
  4138. C = union onschema A, B;
  4139. store C into ':OUTPATH:';\,
  4140. 'verify_pig_script' => q\
  4141. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float);
  4142. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float);
  4143. C = union A, B;
  4144. D = foreach C generate name, (float)age, gpa;
  4145. store C into ':OUTPATH:';\,
  4146. },{
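  4147. # Test Union using merge with type promotions, int->double. NOTE(review): the verify script builds D (the explicit cast) but stores C, so D is unused - confirm which alias was intended.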
  4148. 'num' => 4,
  4149. 'floatpostprocess' => 1,
  4150. 'delimiter' => ' ',
  4151. 'pig' => q\
  4152. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4153. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float);
  4154. C = union onschema A, B;
  4155. store C into ':OUTPATH:';\,
  4156. 'verify_pig_script' => q\
  4157. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float);
  4158. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float);
  4159. C = union A, B;
  4160. D = foreach C generate name, (double)age, gpa;
  4161. store C into ':OUTPATH:';\,
  4162. },{
  4163. # Test Union of an intersection
  4164. 'num' => 5,
  4165. 'pig' => q\
  4166. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4167. B = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:float);
  4168. C = union onschema A, B;
  4169. store C into ':OUTPATH:';\,
  4170. 'verify_pig_script' => q\
  4171. register :FUNCPATH:/testudf.jar;
  4172. define Nil org.apache.pig.test.udf.evalfunc.Nil();
  4173. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4174. B = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:float);
  4175. C = foreach A generate name, age, (chararray)gpa, Nil(), Nil();
  4176. D = foreach B generate name, age, Nil(), registration, (chararray)contributions;
  4177. E = union C, D;
  4178. store E into ':OUTPATH:';\,
  4179. },
  4180. {
  4181. # Test Union where the intersection is null
  4182. 'num' => 6,
  4183. 'pig' => q\
  4184. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4185. B = load ':INPATH:/singlefile/textdoc' as (line:chararray);
  4186. C = union onschema A, B;
  4187. store C into ':OUTPATH:';\,
  4188. 'verify_pig_script' => q\
  4189. register :FUNCPATH:/testudf.jar;
  4190. define Nil org.apache.pig.test.udf.evalfunc.Nil();
  4191. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  4192. B = load ':INPATH:/singlefile/textdoc' as (line:chararray);
  4193. C = foreach A generate name, (chararray)age, (chararray)gpa, Nil(name);
  4194. D = foreach B generate Nil(line), Nil(line), Nil(line), line;
  4195. E = union C, D;
  4196. store E into ':OUTPATH:';\,
  4197. },
  4198. {
  4199. # Test Union using merge where schema is identical | a&b have identical schema, including a boolean column (the verify script loads that column as chararray)
  4200. 'num' => 7,
  4201. 'pig' => q\
  4202. a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
  4203. b = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
  4204. C = union onschema a, b;
  4205. store C into ':OUTPATH:';\,
  4206. 'verify_pig_script' => q\
  4207. a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  4208. b = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  4209. C = union a, b;
  4210. store C into ':OUTPATH:';\,
  4211. }
  4212. ]
  4213. },
  4214. {
  4215. # Test a UDF that depends on the distributed cache (run in mapred mode only)
  4216. 'name' => 'UdfDistributedCache',
  4217. 'tests' => [
  4218. {
  4219. 'num' => 1,
  4220. 'execonly' => 'mapred', # since distributed cache is not supported in local mode
  4221. 'pig' => q?
  4222. register :FUNCPATH:/testudf.jar;
  4223. define udfdc org.apache.pig.test.udf.evalfunc.Udfcachetest(':INPATH:/singlefile/votertab10k#foodle');
  4224. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  4225. b = limit a 1;
  4226. c = foreach b generate udfdc(age);
  4227. STORE c into ':OUTPATH:';?,
  4228. 'verify_pig_script' => q?
  4229. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  4230. b = limit a 1;
  4231. c = foreach b generate 'tom van buren', 68, 'socialist', 390.19;
  4232. STORE c into ':OUTPATH:';?,
  4233. },
  4234. ]
  4235. }, {
  4236. 'name' => 'MonitoredUDF', # tests that call the GoodMonitored / BadMonitored UDFs from testudf.jar
  4237. 'tests' => [
  4238. {
  4239. 'num' => 1, # gm(name) output is compared with a verify script that emits the constant 'fred' per row
  4240. 'ignore23' => 'guava version of Pig is higher than hadoop 23',
  4241. 'pig' => q?register :FUNCPATH:/testudf.jar;
  4242. define gm org.apache.pig.test.udf.evalfunc.GoodMonitored();
  4243. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  4244. b = foreach a generate gm(name);
  4245. store b into ':OUTPATH:';?,
  4246. 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  4247. b = foreach a generate 'fred';
  4248. store b into ':OUTPATH:';?,
  4249. },{
  4250. 'num' => 2, # NOTE(review): both scripts define c but store b, so the result of bad(name) is never part of the stored output - confirm whether 'store c' was intended
  4251. 'pig' => q?register :FUNCPATH:/testudf.jar;
  4252. define bad org.apache.pig.test.udf.evalfunc.BadMonitored();
  4253. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  4254. b = limit a 1;
  4255. c = foreach b generate bad(name);
  4256. store b into ':OUTPATH:';?,
  4257. 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  4258. b = limit a 1;
  4259. c = foreach b generate '';
  4260. store b into ':OUTPATH:';?,
  4261. },{
  4262. 'num' => 3, # NOTE(review): 'pig' script is identical to test 2 and again stores b instead of c; only the unused constant in the verify script ('barney') differs - confirm intent
  4263. 'pig' => q?register :FUNCPATH:/testudf.jar;
  4264. define bad org.apache.pig.test.udf.evalfunc.BadMonitored();
  4265. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  4266. b = limit a 1;
  4267. c = foreach b generate bad(name);
  4268. store b into ':OUTPATH:';?,
  4269. 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  4270. b = limit a 1;
  4271. c = foreach b generate 'barney';
  4272. store b into ':OUTPATH:';?,
  4273. }
  4274. ],
  4275. },{
  4276. 'name' => 'MergeSparseJoin',
  4277. 'tests' => [
  4278. # Simplest merge-sparse-join.
  4279. {
  4280. 'num' => 1,
  4281. 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
  4282. a = load ':INPATH:/singlefile/studenttab10k';
  4283. b = load ':INPATH:/singlefile/votertab10k';
  4284. c = order a by $0;
  4285. d = order b by $0;
  4286. store c into ':OUTPATH:.intermediate1';
  4287. store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4288. exec;
  4289. e = load ':OUTPATH:.intermediate1';
  4290. f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4291. g = join e by $0, f by $0 using 'merge-sparse';
  4292. store g into ':OUTPATH:';\,
  4293. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  4294. b = load ':INPATH:/singlefile/votertab10k';
  4295. g = join a by $0, b by $0;
  4296. store g into ':OUTPATH:';\,
  4297. 'notmq' => 1,
  4298. },
  4299. # Merge-sparse-join with left-side filter
  4300. {
  4301. 'num' => 2,
  4302. 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
  4303. a = load ':INPATH:/singlefile/studenttab10k';
  4304. b = load ':INPATH:/singlefile/votertab10k';
  4305. c = order a by $0;
  4306. d = order b by $0;
  4307. store c into ':OUTPATH:.intermediate1';
  4308. store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4309. exec;
  4310. e = load ':OUTPATH:.intermediate1';
  4311. h = filter e by $1 > 30;
  4312. f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4313. g = join h by $0, f by $0 using 'merge-sparse';
  4314. store g into ':OUTPATH:';\,
  4315. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  4316. b = load ':INPATH:/singlefile/votertab10k';
  4317. h = filter a by $1 > 30;
  4318. g = join h by $0, b by $0;
  4319. store g into ':OUTPATH:';\,
  4320. 'notmq' => 1,
  4321. },
  4322. # Merge-sparse-join with right-side filter
  4323. {
  4324. 'num' => 3,
  4325. 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
  4326. a = load ':INPATH:/singlefile/studenttab10k';
  4327. b = load ':INPATH:/singlefile/votertab10k';
  4328. c = order a by $0;
  4329. d = order b by $0;
  4330. store c into ':OUTPATH:.intermediate1';
  4331. store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4332. exec;
  4333. e = load ':OUTPATH:.intermediate1';
  4334. f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4335. i = filter f by $2 != 'democrat';
  4336. g = join e by $0, i by $0 using 'merge-sparse';
  4337. store g into ':OUTPATH:';\,
  4338. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  4339. b = load ':INPATH:/singlefile/votertab10k';
  4340. i = filter b by $2 != 'democrat';
  4341. g = join a by $0, i by $0;
  4342. store g into ':OUTPATH:';\,
  4343. 'notmq' => 1,
  4344. },
  4345. # Merge-sparse-join with key as expression
  4346. {
  4347. 'num' => 4,
  4348. 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
  4349. a = load ':INPATH:/singlefile/studenttab10k';
  4350. b = load ':INPATH:/singlefile/votertab10k';
  4351. c = order a by $0,$1;
  4352. d = order b by $0,$1;
  4353. store c into ':OUTPATH:.intermediate1';
  4354. store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0,1');
  4355. exec;
  4356. e = load ':OUTPATH:.intermediate1';
  4357. f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0,1');
  4358. g = join e by ($0,$1), f by ($0,$1) using 'merge-sparse';
  4359. store g into ':OUTPATH:';\,
  4360. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  4361. b = load ':INPATH:/singlefile/votertab10k';
  4362. g = join a by ($0,$1), b by ($0,$1);
  4363. store g into ':OUTPATH:';\,
  4364. 'notmq' => 1,
  4365. },
  4366. # Merge-sparse-join with nulls in keys and data.
  4367. {
  4368. 'num' => 5,
  4369. 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
  4370. a = load ':INPATH:/singlefile/studentnulltab10k';
  4371. b = load ':INPATH:/singlefile/voternulltab10k';
  4372. c = order a by $0;
  4373. d = order b by $0;
  4374. store c into ':OUTPATH:.intermediate1';
  4375. store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4376. exec;
  4377. e = load ':OUTPATH:.intermediate1';
  4378. f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4379. g = join e by $0, f by $0 using 'merge-sparse';
  4380. store g into ':OUTPATH:';\,
  4381. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
  4382. b = load ':INPATH:/singlefile/voternulltab10k';
  4383. g = join a by $0, b by $0;
  4384. store g into ':OUTPATH:';\,
  4385. 'notmq' => 1,
  4386. },
  4387. # Merge-sparse-join with join on numeric key
  4388. {
  4389. 'num' => 6,
  4390. 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
  4391. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  4392. b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  4393. c = order a by age;
  4394. d = order b by age;
  4395. store c into ':OUTPATH:.intermediate1';
  4396. store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4397. exec;
  4398. e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float);
  4399. f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0') as (name:chararray, age:int, reg:chararray, contrib:float);
  4400. g = join e by age, f by age using 'merge-sparse';
  4401. store g into ':OUTPATH:';\,
  4402. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  4403. b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  4404. g = join a by age, b by age;
  4405. store g into ':OUTPATH:';\,
  4406. 'notmq' => 1,
  4407. }
  4408. ],
  4409. },{
  # BugFix group: every test is tagged with the JIRA id (PIG-nnnn) it relates to.
  4410. 'name' => 'BugFix',
  4411. 'tests' => [
  4412. {
  4413. # PIG-2286: COR over two columns; the verify script runs the same query with pig.exec.nocombiner set to true
  4414. 'num' => 1,
  4415. 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double, gpa:double);
  4416. B = group A all;
  4417. C = foreach B generate group, COR(A.age, A.gpa);
  4418. store C into ':OUTPATH:';?,
  4419. 'verify_pig_script' => q?set pig.exec.nocombiner true
  4420. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double);
  4421. B = group A all;
  4422. C = foreach B generate group, COR(A.age, A.gpa);
  4423. store C into ':OUTPATH:';?,
  4424. }, {
  4425. # PIG-2286, with 3 inputs to COR (verify script again sets pig.exec.nocombiner to true)
  4426. 'num' => 2,
  4427. 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double);
  4428. B = foreach A generate age, gpa, gpa*gpa as gpa2;
  4429. C = group B all;
  4430. D = foreach C generate group, COR(B.age, B.gpa, B.gpa2);
  4431. store D into ':OUTPATH:';?,
  4432. 'verify_pig_script' => q?set pig.exec.nocombiner true
  4433. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double);
  4434. B = foreach A generate age, gpa, gpa*gpa as gpa2;
  4435. C = group B all;
  4436. D = foreach C generate group, COR(B.age, B.gpa, B.gpa2);
  4437. store D into ':OUTPATH:';?,
  4438. }, {
  4439. # PIG-2385: pig_params is set to -M; the script has two stores (.1 and .2) and the verify script crosses A with Z1 before computing the difference
  4440. 'num' => 3,
  4441. 'pig_params' => ['-M'],
  4442. 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
  4443. Z = group A all;
  4444. Z1 = foreach Z generate AVG(A.gpa) as avg;
  4445. B = foreach A generate name, age, gpa-Z1.avg as diff;
  4446. STORE B INTO ':OUTPATH:.1';
  4447. C = DISTINCT B ;
  4448. store C into ':OUTPATH:.2';?,
  4449. 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
  4450. Z = group A all;
  4451. Z1 = foreach Z generate AVG(A.gpa) as avg;
  4452. B = cross A, Z1;
  4453. B1 = foreach B generate name, age, gpa-Z1.avg as diff;
  4454. STORE B1 INTO ':OUTPATH:.1';
  4455. C = DISTINCT B1 ;
  4456. store C into ':OUTPATH:.2';?,
  4457. }, {
  4458. # PIG-2576: restricted to mapred via execonly; there is no verify script, the test checks rc and the two output regexes below
  4459. 'num' => 4,
  4460. 'execonly' => 'mapred',
  4461. 'pig' => q?register :FUNCPATH:/testudf.jar;
  4462. define printconf org.apache.pig.test.udf.evalfunc.UdfContextFrontend('dummy');
  4463. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  4464. b = limit a 1;
  4465. c = foreach b generate printconf(name);
  4466. store c into ':OUTPATH:';
  4467. fs -ls;
  4468. ?,
  4469. 'rc' => 0,
  4470. 'not_expected_out_regex' => "checkJobConf: conf is null: false",
  4471. 'expected_out_regex' => "checkJobConf: conf is null: true",
  4472. }
  4473. ],
  4474. },{
  # Bloom group: each test builds a bloom filter with BuildBloom, stores it
  # under :HDFSTMP:, then uses it through Bloom to filter the student data.
  # NOTE: unlike most scripts in this file, the scripts in this group are
  # double-quoted Perl strings, so a $ or @ added to them would be interpolated.
  4475. 'name' => 'Bloom',
  4476. 'execonly' => 'mapred', # distributed cache does not work in local mode
  4477. 'tests' => [
  4478. {
  # Filter built with Hash.JENKINS_HASH over the rows named alice allen;
  # verified against a plain filter on name.
  4479. 'num' => 1,
  4480. 'pig' => "define bb BuildBloom('Hash.JENKINS_HASH', 'fixed', '128', '3');
  4481. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4482. B = filter A by name == 'alice allen';
  4483. C = group B all;
  4484. D = foreach C generate bb(B.name);
  4485. store D into ':HDFSTMP:/mybloom_1';
  4486. exec;
  4487. define bloom Bloom(':HDFSTMP:/mybloom_1');
  4488. E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4489. F = filter E by bloom(name);
  4490. store F into ':OUTPATH:';",
  4491. 'notmq' => 1,
  4492. 'verify_pig_script' => "
  4493. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double);
  4494. B = filter A by name == 'alice allen';
  4495. store B into ':OUTPATH:';",
  4496. }, {
  # Same flow with Hash.MURMUR_HASH; the bloom-filtered rows are then joined
  # to votertab10k on name.
  4497. 'num' => 2,
  4498. 'pig' => "define bb BuildBloom('Hash.MURMUR_HASH', 'fixed', '128', '3');
  4499. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4500. B = filter A by name == 'alice allen';
  4501. C = group B all;
  4502. D = foreach C generate bb(B.name);
  4503. store D into ':HDFSTMP:/mybloom_2';
  4504. exec;
  4505. define bloom Bloom(':HDFSTMP:/mybloom_2');
  4506. E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4507. F = filter E by bloom(name);
  4508. G = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  4509. H = join F by name, G by name;
  4510. store H into ':OUTPATH:';",
  4511. 'notmq' => 1,
  4512. 'verify_pig_script' => "
  4513. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double);
  4514. B = filter A by name == 'alice allen';
  4515. C = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  4516. D = join B by name, C by name;
  4517. store D into ':OUTPATH:';",
  4518. },{
  # BuildBloom is defined with three arguments here instead of four, and the
  # filtered rows are the right-hand input of a repl join; the verify script
  # uses a default join with the same input order.
  4519. 'num' => 3,
  4520. 'pig' => "define bb BuildBloom('Hash.JENKINS_HASH', '1', '0.0001');
  4521. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4522. B = filter A by name == 'alice allen';
  4523. C = group B all;
  4524. D = foreach C generate bb(B.name);
  4525. store D into ':HDFSTMP:/mybloom_3';
  4526. exec;
  4527. define bloom Bloom(':HDFSTMP:/mybloom_3');
  4528. E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4529. F = filter E by bloom(name);
  4530. G = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  4531. H = join G by name, F by name using 'repl';
  4532. store H into ':OUTPATH:';",
  4533. 'notmq' => 1,
  4534. 'verify_pig_script' => "
  4535. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double);
  4536. B = filter A by name == 'alice allen';
  4537. C = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  4538. D = join C by name, B by name;
  4539. store D into ':OUTPATH:';",
  4540. }
  4541. ],
  4542. },{
  # UDFContext group: applies UDFContextTestUDF to the same file loaded once
  # without a type (a0) and once as chararray (c0), then unions the results.
  4543. 'name' => 'UDFContext',
  4544. 'tests' => [
  4545. {
  4546. # See PIG-2338
  # The verify script emits the constant strings the UDF calls are expected
  # to produce: one schema string for the untyped load, one for the typed load.
  4547. 'num' => 1,
  4548. 'pig' => q?register :FUNCPATH:/testudf.jar
  4549. a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
  4550. b = foreach a generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(a0);
  4551. c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray);
  4552. d = foreach c generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(c0);
  4553. e = union b, d;
  4554. store e into ':OUTPATH:';?,
  4555. 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
  4556. b = foreach a generate '{a0: bytearray}';
  4557. c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray);
  4558. d = foreach c generate '{c0: chararray}';
  4559. e = union b, d;
  4560. store e into ':OUTPATH:';?,
  4561. }
  4562. ],
  4563. },{
  # UDFContextAuto group: applies UDFContextTestUDF to an untyped and a
  # chararray-typed load of the same file and unions the results.
  # NOTE(review): both scripts are textually identical to test 1 of the
  # UDFContext group; only the JIRA reference differs. Confirm this is intended.
  4564. 'name' => 'UDFContextAuto',
  4565. 'tests' => [
  4566. {
  4567. # See PIG-2337
  4568. 'num' => 1,
  4569. 'pig' => q?register :FUNCPATH:/testudf.jar
  4570. a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
  4571. b = foreach a generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(a0);
  4572. c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray);
  4573. d = foreach c generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(c0);
  4574. e = union b, d;
  4575. store e into ':OUTPATH:';?,
  4576. 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
  4577. b = foreach a generate '{a0: bytearray}';
  4578. c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray);
  4579. d = foreach c generate '{c0: chararray}';
  4580. e = union b, d;
  4581. store e into ':OUTPATH:';?,
  4582. }
  4583. ],
  4584. },{
  # JsonLoaderStorage group: data is written with JsonStorage, re-read with
  # JsonLoader after an exec, and compared with the directly loaded input.
  4585. 'name' => 'JsonLoaderStorage',
  4586. 'tests' => [
  4587. {
  # Round trip of a single input through JsonStorage and JsonLoader.
  4588. 'num' => 1,
  4589. 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4590. store A into ':OUTPATH:.intermediate' using JsonStorage();
  4591. exec
  4592. A = LOAD ':OUTPATH:.intermediate' using JsonLoader();
  4593. store A into ':OUTPATH:';?,
  4594. 'notmq' => 1,
  4595. 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
  4596. store A into ':OUTPATH:';?,
  4597. }, {
  # Two inputs are round-tripped separately and then joined on name.
  4598. 'num' => 2,
  4599. 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4600. store A into ':OUTPATH:.intermediate1' using JsonStorage();
  4601. B = LOAD ':INPATH:/singlefile/votertab10k' AS (name:chararray, age:int, registration:chararray, contributions:double);
  4602. store B into ':OUTPATH:.intermediate2' using JsonStorage();
  4603. exec
  4604. A = LOAD ':OUTPATH:.intermediate1' using JsonLoader();
  4605. B = LOAD ':OUTPATH:.intermediate2' using JsonLoader();
  4606. C = JOIN A by name, B by name;
  4607. store C into ':OUTPATH:';?,
  4608. 'notmq' => 1,
  4609. 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
  4610. B = LOAD ':INPATH:/singlefile/votertab10k' AS (name:chararray, age:int, registration:chararray, contributions:double);
  4611. C = JOIN A by name, B by name;
  4612. store C into ':OUTPATH:';?,
  4613. }, {
  # Round trip of an input with a boolean column (instate); the verify script
  # loads that column as chararray instead. Marked ignore, see PIG-2594.
  4614. 'num' => 3,
  4615. 'ignore' => 1, # PIG-2594
  4616. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
  4617. store a into ':OUTPATH:.intermediate' using JsonStorage();
  4618. exec
  4619. B = LOAD ':OUTPATH:.intermediate' using JsonLoader();
  4620. store B into ':OUTPATH:';\,
  4621. 'notmq' => 1,
  4622. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  4623. store a into ':OUTPATH:';\,
  4624. }
  4625. ],
  4626. },{
  # STRSPLIT group: calls STRSPLIT on a column loaded without a declared type.
  4627. 'name' => 'STRSPLIT',
  4628. 'tests' => [
  4629. {
  4630. # See PIG-2311
  # The verify script differs only in casting a0 to chararray before the call.
  4631. 'num' => 1,
  4632. 'pig' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
  4633. b= filter a by NOT (a0 is null);
  4634. c= foreach b generate STRSPLIT(a0);
  4635. store c into ':OUTPATH:';?,
  4636. 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
  4637. b= filter a by NOT (a0 is null);
  4638. b= foreach b generate (chararray)a0 as a0 ;
  4639. c= foreach b generate STRSPLIT(a0);
  4640. store c into ':OUTPATH:';?,
  4641. }
  4642. ],
  4643. },
  4644. {
  # Tokenize group: TOKENIZE with one argument and with an explicit delimiter.
  4645. 'name' => 'Tokenize',
  4646. 'tests' => [
  4647. {
  # One-argument TOKENIZE on the first column; no verify_pig_script is given.
  4648. 'num' => 1,
  4649. 'pig' => q\
  4650. A = LOAD ':INPATH:/singlefile/studenttab10k';
  4651. B = foreach A generate TOKENIZE($0);
  4652. store B into ':OUTPATH:';\,
  4653. },
  4654. {
  # Two-argument TOKENIZE splitting the second column on the character 9; the
  # verify script rewrites each 9 to a comma and calls one-argument TOKENIZE.
  4655. 'num' => 2,
  4656. 'pig' => q\
  4657. A = LOAD ':INPATH:/singlefile/studenttab10k';
  4658. B = foreach A generate TOKENIZE($1,'9');
  4659. store B into ':OUTPATH:';\,
  4660. 'verify_pig_script' => q\
  4661. A = LOAD ':INPATH:/singlefile/studenttab10k';
  4662. -- TOKENIZE has tokens hardcoded so have to replace the '9' with
  4663. -- one of the hardcoded tokens
  4664. B = foreach A generate TOKENIZE(REPLACE($1, '9', ','));
  4665. store B into ':OUTPATH:';\,
  4666. }
  4667. ]
  4668. }, {
  # Realias group: assigns an existing alias to a new name (B = A) and stores
  # it; the verify script stores the original alias directly.
  4669. 'name' => 'Realias',
  4670. 'tests' => [
  4671. {
  4672. 'num' => 1,
  4673. 'pig' => q\
  4674. A = LOAD ':INPATH:/singlefile/studenttab10k';
  4675. B = A;
  4676. store B into ':OUTPATH:';\,
  4677. 'verify_pig_script' => q\
  4678. A = LOAD ':INPATH:/singlefile/studenttab10k';
  4679. store A into ':OUTPATH:';\,
  4680. }
  4681. ]
  4682. },
  4683. {
  # NestedForEach group: relational operators used inside a foreach block
  # over grouped data, verified against the same operators applied to the
  # ungrouped input.
  4684. 'name' => 'NestedForEach',
  4685. 'tests' => [
  4686. {
  # A foreach nested inside the outer foreach, then flattened.
  4687. 'num' => 1,
  4688. 'pig' => q\
  4689. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4690. B = group A by name;
  4691. C = foreach B {
  4692. C1 = foreach A generate UPPER(name), age+1 as age, gpa;
  4693. generate C1;
  4694. }
  4695. D = foreach C generate flatten(C1);
  4696. store D into ':OUTPATH:';\,
  4697. 'verify_pig_script' => q\
  4698. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4699. B = foreach A generate UPPER(name), age+1, gpa;
  4700. store B into ':OUTPATH:';\,
  4701. },
  4702. {
  # Nested projection, filter, foreach and order chained in one block.
  # NOTE(review): the LOAD statement in the pig script below ends with two
  # semicolons while the verify script has one; presumably a typo, confirm
  # before changing the script text.
  4703. 'num' => 2,
  4704. 'pig' => q\
  4705. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);;
  4706. B = group A by name;
  4707. C = foreach B {
  4708. C1 = A.age;
  4709. C2 = filter C1 by age>=30;
  4710. C3 = foreach C2 generate age+1 as age;
  4711. C4 = order C3 by age desc;
  4712. generate C4;
  4713. }
  4714. D = foreach C generate flatten(C4);
  4715. store D into ':OUTPATH:';\,
  4716. 'verify_pig_script' => q\
  4717. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4718. B = filter A by age>=30;
  4719. C = foreach B generate age+1 as age;
  4720. D = order C by age desc;
  4721. store D into ':OUTPATH:';\,
  4722. }
  4723. ]
  4724. },
  4725. {
  # BagToTuple group: BagToTuple applied to bags built with TOBAG; each verify
  # script builds the expected tuple without calling BagToTuple.
  4726. 'name' => 'BagToTuple',
  4727. 'tests' => [
  4728. {
  4729. # basic test of converting bag to tuples. Use the first and last tuple in the bag b
  # (first and last by age: one row each from a descending and an ascending order)
  4730. 'num' => 1,
  4731. 'pig' => q\
  4732. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  4733. filterA = filter a by b is not null and COUNT(b) > 1;
  4734. b = foreach filterA {
  4735. order_desc = order b by age desc;
  4736. limit_desc = limit order_desc 1;
  4737. order_asc = order b by age asc;
  4738. limit_asc = limit order_asc 1;
  4739. generate FLATTEN(limit_desc), FLATTEN(limit_asc);
  4740. };
  4741. c = foreach b generate TOTUPLE(limit_desc::name, limit_desc::age, limit_desc::gpa) as first_t,
  4742. TOTUPLE(limit_asc::name, limit_asc::age, limit_asc::gpa) as second_t;
  4743. d = foreach c generate TOBAG(first_t, second_t) as n_bag;
  4744. e = foreach d generate BagToTuple(n_bag);
  4745. store e into ':OUTPATH:';\,
  4746. 'verify_pig_script' => q\
  4747. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  4748. filterA = filter a by b is not null and COUNT(b) > 1;
  4749. b = foreach filterA {
  4750. order_desc = order b by age desc;
  4751. limit_desc = limit order_desc 1;
  4752. order_asc = order b by age asc;
  4753. limit_asc = limit order_asc 1;
  4754. generate FLATTEN(limit_desc), FLATTEN(limit_asc);
  4755. };
  4756. c = foreach b generate TOTUPLE(limit_desc::name, limit_desc::age, limit_desc::gpa, limit_asc::name,limit_asc::age, limit_asc::gpa) as big_t;
  4757. store c into ':OUTPATH:';\,
  4758. },
  4759. {
  4760. # convert an existing tuple to a bag and use the output of BagToTuple; verified against the original tuple t
  4761. 'num' => 2,
  4762. 'pig' => q\
  4763. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  4764. b = filter a by t is not null;
  4765. c = foreach b generate TOBAG(t) as newBag;
  4766. d = foreach c generate BagToTuple(newBag);
  4767. store d into ':OUTPATH:';\,
  4768. 'verify_pig_script' => q\
  4769. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  4770. b = filter a by t is not null;
  4771. c = foreach b generate t;
  4772. store c into ':OUTPATH:';\,
  4773. },
  4774. ]
  4775. },
  4776. {
  # BagToString group: BagToString applied to bags built with TOBAG; each
  # verify script builds the expected string with nested CONCAT calls that
  # put an underscore between the fields.
  4777. 'name' => 'BagToString',
  4778. 'tests' => [
  4779. {
  # Bag of two tuples: one row each from a descending and an ascending order by age.
  4780. 'num' => 1,
  4781. 'pig' => q\
  4782. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  4783. filterA = filter a by b is not null and COUNT(b) > 1;
  4784. b = foreach filterA {
  4785. order_desc = order b by age desc;
  4786. limit_desc = limit order_desc 1;
  4787. order_asc = order b by age asc;
  4788. limit_asc = limit order_asc 1;
  4789. generate FLATTEN(limit_desc), FLATTEN(limit_asc);
  4790. };
  4791. c = foreach b generate TOTUPLE(limit_desc::name, limit_desc::age, limit_desc::gpa) as first_t,
  4792. TOTUPLE(limit_asc::name, limit_asc::age, limit_asc::gpa) as second_t;
  4793. d = foreach c generate TOBAG(first_t, second_t) as n_bag;
  4794. e = foreach d generate BagToString(n_bag);
  4795. store e into ':OUTPATH:';\,
  4796. 'verify_pig_script' => q\
  4797. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  4798. filterA = filter a by b is not null and COUNT(b) > 1;
  4799. b = foreach filterA {
  4800. order_desc = order b by age desc;
  4801. limit_desc = limit order_desc 1;
  4802. order_asc = order b by age asc;
  4803. limit_asc = limit order_asc 1;
  4804. generate FLATTEN(limit_desc), FLATTEN(limit_asc);
  4805. };
  4806. c = foreach b generate CONCAT(limit_desc::name, CONCAT('_', CONCAT((chararray)limit_desc::age, CONCAT('_', CONCAT((chararray)limit_desc::gpa, CONCAT('_',CONCAT(limit_asc::name,CONCAT('_',CONCAT((chararray)limit_asc::age, CONCAT('_',(chararray)limit_asc::gpa)))))))))) as big_t;
  4807. store c into ':OUTPATH:';\,
  4808. },
  4809. {
  # Bag holding the single existing tuple t.
  4810. 'num' => 2,
  4811. 'pig' => q\
  4812. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  4813. b = filter a by t is not null;
  4814. c = foreach b generate TOBAG(t) as newBag;
  4815. d = foreach c generate BagToString(newBag);
  4816. store d into ':OUTPATH:';\,
  4817. 'verify_pig_script' => q\
  4818. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  4819. b = filter a by t is not null;
  4820. c = foreach b generate CONCAT(t.name, CONCAT('_', CONCAT((chararray)t.age, CONCAT('_', (chararray)t.gpa))));
  4821. store c into ':OUTPATH:';\,
  4822. },
  4823. ]
  4824. },
  4825. {
  # NestedCross group: cross used inside a foreach block over cogrouped
  # inputs, verified against a JOIN of the same inputs on name.
  4826. 'name' => 'NestedCross',
  4827. 'tests' => [
  4828. {
  # Plain nested cross of the two cogrouped bags, flattened.
  4829. 'num' => 1,
  4830. 'pig' => q\
  4831. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4832. B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
  4833. C = cogroup A by name, B by name;
  4834. D = foreach C {
  4835. C1 = cross A, B;
  4836. generate flatten(C1);
  4837. }
  4838. store D into ':OUTPATH:';\,
  4839. 'verify_pig_script' => q\
  4840. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4841. B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
  4842. C = JOIN A by name, B by name;
  4843. store C into ':OUTPATH:';\,
  4844. },
  4845. {
  # Both bags are filtered inside the block before the cross, and a nested
  # foreach concatenates gpa and contributions; the verify script applies
  # the same filters before its JOIN.
  4846. 'num' => 2,
  4847. 'pig' => q\
  4848. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4849. B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
  4850. C = cogroup A by name, B by name;
  4851. D = foreach C {
  4852. C1 = filter A by gpa > 4;
  4853. C2 = filter B by contributions > 500;
  4854. C3 = cross C1, C2;
  4855. C4 = foreach C3 generate CONCAT(CONCAT((chararray)gpa, '_'), (chararray)contributions);
  4856. generate flatten(C4);
  4857. }
  4858. store D into ':OUTPATH:';\,
  4859. 'verify_pig_script' => q\
  4860. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4861. B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
  4862. C = filter A by gpa > 4;
  4863. D = filter B by contributions > 500;
  4864. E = JOIN C by name, D by name;
  4865. F = foreach E generate CONCAT(CONCAT((chararray)gpa, '_'), (chararray)contributions);
  4866. store F into ':OUTPATH:';\,
  4867. }
  4868. ]
  4869. }
  4870. ],
  4871. },
  4872. ;