PageRenderTime 80ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 1ms

/test/e2e/pig/tests/nightly.conf

https://github.com/zjffdu/pig
Perl | 4623 lines | 4186 code | 112 blank | 325 comment | 102 complexity | 735ef9c41158ef2319bd7bea7c6a5f6f MD5 | raw file
Possible License(s): Apache-2.0, CPL-1.0
  1. #!/usr/bin/env perl
  2. ############################################################################
  3. # Licensed to the Apache Software Foundation (ASF) under one or more
  4. # contributor license agreements. See the NOTICE file distributed with
  5. # this work for additional information regarding copyright ownership.
  6. # The ASF licenses this file to You under the Apache License, Version 2.0
  7. # (the "License"); you may not use this file except in compliance with
  8. # the License. You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. ###############################################################################
  18. # Nightly tests for pig.
  19. #
  20. #
  21. #PigSetup::setup();
  22. #my $me = `whoami`;
  23. #chomp $me;
  24. $cfg = {
  25. 'driver' => 'Pig',
  26. 'nummachines' => 5,
  27. 'verify_with_pig' => 1,
  28. 'verify_pig_version' => 'old',
  29. 'groups' => [
  30. {
  31. 'name' => 'Checkin',
  32. 'tests' => [
  33. {
  34. 'num' => 1,
  35. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  36. store a into ':OUTPATH:';\,
  37. },
  38. {
  39. 'num' => 2,
  40. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  41. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  42. c = filter a by age < 50;
  43. d = filter b by age < 50;
  44. e = cogroup c by (name, age), d by (name, age) ;
  45. f = foreach e generate flatten(c), flatten(d);
  46. g = group f by registration;
  47. h = foreach g generate group, SUM(f.d::contributions);
  48. i = order h by $1;
  49. store i into ':OUTPATH:';\,
  50. 'floatpostprocess' => 1,
  51. 'delimiter' => ' ',
  52. 'sortArgs' => ['-t', ' ', '+1', '-2'],
  53. }
  54. ]
  55. },
  56. {
  57. 'name' => 'LoaderDefaultDir',
  58. 'tests' => [
  59. {
  60. 'num' => 1,
  61. 'pig' => q\a = load ':INPATH:/dir/studenttab10k' as (name, age, gpa);
  62. store a into ':OUTPATH:';\,
  63. },
  64. ]
  65. },
  66. {
  67. 'name' => 'LoaderPigStorageArg',
  68. 'tests' => [
  69. {
  70. 'num' => 1,
  71. 'pig' => q\a = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa);
  72. store a into ':OUTPATH:';\,
  73. },
  74. {
  75. # load with control character
  76. 'num' => 2,
  77. 'pig' => q#a = load ':INPATH:/singlefile/studentctrla10k' using PigStorage('\\u0001') as (name, age, gpa);
  78. store a into ':OUTPATH:';#,
  79. },
  80. {
  81. # load and store with control character
  82. 'num' => 3,
  83. 'pig' => q#a = load ':INPATH:/singlefile/studentctrla10k' using PigStorage('\\u0001') as (name, age, gpa);
  84. store a into ':OUTPATH:.intermediate' using PigStorage('\\u0001');
  85. b = load ':OUTPATH:.intermediate' using PigStorage('\\u0001') as (name, age, gpa);
  86. store b into ':OUTPATH:'; #,
  87. 'notmq' => 1,
  88. },
  89. ]
  90. },
  91. {
  92. # Results doctored; if you change this query, you need to copy the
  93. # expected results into test/nightly/benchmarks.
  94. 'name' => 'LoaderBinStorage',
  95. 'tests' => [
  96. {
  97. 'num' => 1,
  98. 'pig' => q\register :FUNCPATH:/testudf.jar;
  99. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  100. b = foreach a generate name, org.apache.pig.test.udf.evalfunc.Swap(name, age), TOKENIZE((chararray)name), org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, age);
  101. store b into ':OUTPATH:.intermediate' using BinStorage();
  102. c = load ':OUTPATH:.intermediate' using BinStorage();
  103. store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
  104. 'notmq' => 1,
  105. },
  106. ]
  107. },
  108. {
  109. # Results doctored; if you change this query, you need to copy the
  110. # expected results into test/nightly/benchmarks.
  111. 'name' => 'LoaderTextLoader',
  112. 'tests' => [
  113. {
  114. 'num' => 1,
  115. 'pig' => q\register :FUNCPATH:/testudf.jar;
  116. a = load ':INPATH:/singlefile/textdoc' using TextLoader();
  117. b = foreach a generate TOKENIZE((chararray)$0);
  118. store b into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
  119. },
  120. ]
  121. },
  122. {
  123. 'name' => 'FilterBoolean',
  124. 'tests' => [
  125. {
  126. 'num' => 1,
  127. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  128. b = filter a by name == 'fred allen' and age > 50;
  129. store b into ':OUTPATH:' using PigStorage;\,
  130. },
  131. {
  132. 'num' => 2,
  133. 'pig' => q\a = load ':INPATH:/dir/studenttab10k' using PigStorage() as (name, age, gpa);
  134. b = filter a by name != 'fred allen' or age < 10;
  135. store b into ':OUTPATH:' using PigStorage;\,
  136. },
  137. {
  138. 'num' => 3,
  139. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  140. b = filter a by not (age == 50);
  141. store b into ':OUTPATH:' using PigStorage;\,
  142. },
  143. {
  144. 'num' => 4,
  145. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  146. b = filter a by (age >= 50 or name > 'fred') and (gpa <= 3.0 or name >= 'bob');
  147. store b into ':OUTPATH:' using PigStorage;\,
  148. },
  149. {
  150. 'num' => 5,
  151. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  152. b = filter a by age >= 50 or name > 'fred' and gpa <= 3.0 or name >= 'bob';
  153. store b into ':OUTPATH:' using PigStorage;\,
  154. },
  155. # test filter <= and >= for chararray, int and double
  156. {
  157. 'num' => 6,
  158. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  159. b = filter a by age >= 40 and age <=50 and gpa >= 2.0 and gpa <= 3.0 and name >= 'bob' and name <= 'fred';
  160. store b into ':OUTPATH:' using PigStorage;\,
  161. },
  162. # test filter <= and >= for bytearray, long and float
  163. {
  164. 'num' => 7,
  165. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float);
  166. b = filter a by age >= 40 and age <=50 and gpa >= 2.0f and gpa <= 3.0f and name >= 'bob' and name <= 'fred';
  167. store b into ':OUTPATH:' using PigStorage;\,
  168. },
  169. # test filter < and > for chararray, int and double
  170. {
  171. 'num' => 8,
  172. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  173. b = filter a by age > 40 and age <50 and gpa > 2.0 and gpa < 3.0 and name > 'bob' and name < 'fred';
  174. store b into ':OUTPATH:' using PigStorage;\,
  175. },
  176. # test filter < and > for bytearray, long and float
  177. {
  178. 'num' => 9,
  179. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float);
  180. b = filter a by age > 40 and age <50 and gpa > 2.0f and gpa < 3.0f and name > 'bob' and name < 'fred';
  181. store b into ':OUTPATH:' using PigStorage;\,
  182. },
  183. # test filter <= and >= for explicit cast for chararray, int and double
  184. {
  185. 'num' => 10,
  186. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  187. b = filter a by (int)age >= 40 and (int)age <=50 and (double)gpa >= 2.0 and (double)gpa <= 3.0 and (chararray)name >= 'bob' and (chararray)name <= 'fred';
  188. store b into ':OUTPATH:' using PigStorage;\,
  189. },
  190. # test filter <= and >= for explicit cast for bytearray, long and float
  191. {
  192. 'num' => 11,
  193. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  194. b = filter a by (long)age >= 40 and (long)age <=50 and (float)gpa >= 2.0f and (float)gpa <= 3.0f and name >= 'bob' and name <= 'fred';
  195. store b into ':OUTPATH:' using PigStorage;\,
  196. },
  197. # test filter < and > for explicit cast for chararray, int and double
  198. {
  199. 'num' => 12,
  200. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  201. b = filter a by (int)age > 40 and (int)age <50 and (double)gpa > 2.0 and (double)gpa < 3.0 and (chararray)name > 'bob' and (chararray)name < 'fred';
  202. store b into ':OUTPATH:' using PigStorage;\,
  203. },
  204. # test filter < and > for explicit cast for bytearray, long and float
  205. {
  206. 'num' => 13,
  207. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  208. b = filter a by (long)age > 40 and (long)age <50 and (float)gpa > 2.0f and (float)gpa < 3.0f and name > 'bob' and name < 'fred';
  209. store b into ':OUTPATH:' using PigStorage;\,
  210. },
  211. # test AND with nulls
  212. {
  213. 'num' => 14,
  214. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);
  215. b = filter a by name == 'fred allen' and age > 50;
  216. store b into ':OUTPATH:' using PigStorage;\,
  217. },
  218. # test OR with nulls
  219. {
  220. 'num' => 15,
  221. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);
  222. b = filter a by name != 'fred allen' or age < 10;
  223. store b into ':OUTPATH:' using PigStorage;\,
  224. },
  225. # test with nulls filter <= and >= for chararray, int and double
  226. {
  227. 'num' => 16,
  228. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  229. b = filter a by age >= 40 and age <=50 and gpa >= 2.0 and gpa <= 3.0 and name >= 'bob' and name <= 'fred';
  230. store b into ':OUTPATH:' using PigStorage;\,
  231. },
  232. # test with nulls filter < and > for explicit cast for chararray, int and double
  233. {
  234. 'num' => 17,
  235. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);
  236. b = filter a by (int)age > 40 and (int)age <50 and (double)gpa > 2.0 and (double)gpa < 3.0 and (chararray)name > 'bob' and (chararray)name < 'fred';
  237. store b into ':OUTPATH:' using PigStorage;\,
  238. },
  239. {
  240. 'num' => 18,
  241. 'ignore' => 1, # PIG-2593
  242. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  243. b = filter a by instate;
  244. store b into ':OUTPATH:' using PigStorage;\,
  245. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  246. b = filter a by instate == 'true';
  247. store b into ':OUTPATH:' using PigStorage;\,
  248. },
  249. {
  250. 'num' => 19,
  251. 'ignore' => 1, # PIG-2593
  252. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  253. b = filter a by not instate;
  254. store b into ':OUTPATH:' using PigStorage;\,
  255. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  256. b = filter a by instate == 'false';
  257. store b into ':OUTPATH:' using PigStorage;\,
  258. },
  259. {
  260. 'num' => 20,
  261. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  262. b = filter a by instate is null;
  263. store b into ':OUTPATH:' using PigStorage;\,
  264. },
  265. {
  266. 'num' => 21,
  267. 'ignore' => 1, # TODO Need to file a JIRA-2
  268. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  269. b = filter a by instate == true;
  270. store b into ':OUTPATH:' using PigStorage;\,
  271. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  272. b = filter a by instate == 'true';
  273. store b into ':OUTPATH:' using PigStorage;\,
  274. },
  275. {
  276. 'num' => 22,
  277. 'ignore' => 1, # TODO Need to file a JIRA-2
  278. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  279. b = filter a by instate == false;
  280. store b into ':OUTPATH:' using PigStorage;\,
  281. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate);
  282. b = filter a by instate == 'false';
  283. store b into ':OUTPATH:' using PigStorage;\,
  284. },
  285. {
  286. 'num' => 23,
  287. 'ignore' => 1, # TODO Need to file a JIRA-1
  288. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  289. b = filter a by instate;
  290. store b into ':OUTPATH:' using PigStorage;\,
  291. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  292. b = filter a by instate == 'true';
  293. store b into ':OUTPATH:' using PigStorage;\,
  294. },
  295. {
  296. 'num' => 24,
  297. 'ignore' => 1, # TODO Need to file a JIRA-1
  298. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  299. b = filter a by not instate;
  300. store b into ':OUTPATH:' using PigStorage;\,
  301. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  302. b = filter a by instate == 'false';
  303. store b into ':OUTPATH:' using PigStorage;\,
  304. },
  305. {
  306. 'num' => 25,
  307. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  308. b = filter a by instate is null;
  309. store b into ':OUTPATH:' using PigStorage;\,
  310. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  311. b = filter a by instate is null;
  312. store b into ':OUTPATH:' using PigStorage;\,
  313. },
  314. {
  315. 'num' => 26,
  316. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  317. b = filter a by instate == true;
  318. store b into ':OUTPATH:' using PigStorage;\,
  319. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  320. b = filter a by instate == 'true';
  321. store b into ':OUTPATH:' using PigStorage;\,
  322. },
  323. {
  324. 'num' => 27,
  325. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  326. b = filter a by instate == false;
  327. store b into ':OUTPATH:' using PigStorage;\,
  328. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  329. b = filter a by instate == 'false';
  330. store b into ':OUTPATH:' using PigStorage;\,
  331. },
  332. ],
  333. },
  334. {
  335. 'name' => 'FilterEq',
  336. 'tests' => [
  337. {
  338. 'num' => 1,
  339. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  340. b = filter a by name == 'alice johnson' and age == 64 and gpa == 3.99;
  341. store b into ':OUTPATH:' using PigStorage;\,
  342. },
  343. {
  344. 'num' => 2,
  345. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  346. b = filter a by name > 'fred allen' and age > 40 and gpa > 2.50;
  347. store b into ':OUTPATH:' using PigStorage;\,
  348. },
  349. {
  350. 'num' => 3,
  351. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  352. b = filter a by name >= 'fred allen' and age >= 40 and gpa >= 2.50;
  353. store b into ':OUTPATH:' using PigStorage;\,
  354. },
  355. {
  356. 'num' => 4,
  357. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  358. b = filter a by name lt 'fred allen' and age < 40 and gpa < 2.50;
  359. store b into ':OUTPATH:' using PigStorage;\,
  360. },
  361. {
  362. 'num' => 5,
  363. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  364. b = filter a by name lte 'fred allen' and age <= 40 and gpa <= 2.50;
  365. store b into ':OUTPATH:' using PigStorage;\,
  366. },
  367. {
  368. 'num' => 6,
  369. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage();
  370. b = filter a by $0 neq 'fred allen' and $1 != '40' and $2 != '2.50';
  371. store b into ':OUTPATH:' using PigStorage;\,
  372. },
  373. # test for filter == for chararray, int and double
  374. {
  375. 'num' => 7,
  376. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  377. b = filter a by name == 'fred allen' and age == 61 and gpa == 1.42;
  378. store b into ':OUTPATH:' using PigStorage;\,
  379. },
  380. # test for filter == for bytearray, long and float
  381. {
  382. 'num' => 8,
  383. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float);
  384. b = filter a by name == 'fred allen' and age == 61 and gpa == 1.42f;
  385. store b into ':OUTPATH:' using PigStorage;\,
  386. },
  387. # test for filter != for chararray, int and double
  388. {
  389. 'num' => 9,
  390. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  391. b = filter a by $0 != 'fred allen' and $1 != 40 and $2 != 2.50;
  392. store b into ':OUTPATH:' using PigStorage;\,
  393. },
  394. # test for filter != for bytearray, long and float
  395. {
  396. 'num' => 10,
  397. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float);
  398. b = filter a by $0 != 'fred allen' and $1 != 40 and $2 != 2.50f;
  399. store b into ':OUTPATH:' using PigStorage;\,
  400. },
  401. # test for filter == for explicit casts to chararray, int and double
  402. {
  403. 'num' => 11,
  404. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  405. b = filter a by (chararray)name == 'fred allen' and (int)age == 61 and (double)gpa == 1.42;
  406. store b into ':OUTPATH:' using PigStorage;\,
  407. },
  408. # test for filter == for explicit casts to bytearray, long and float
  409. {
  410. 'num' => 12,
  411. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  412. b = filter a by name == 'fred allen' and (long)age == 61 and (float)gpa == 1.42f;
  413. store b into ':OUTPATH:' using PigStorage;\,
  414. },
  415. # test for filter != for explicit casts to chararray, int and double
  416. {
  417. 'num' => 13,
  418. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() ;
  419. b = filter a by (chararray)$0 != 'fred allen' and (int)$1 != 40 and (double)$2 != 2.50;
  420. store b into ':OUTPATH:' using PigStorage;\,
  421. },
  422. # test for filter != for explicit casts to bytearray, long and float
  423. {
  424. 'num' => 14,
  425. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() ;
  426. b = filter a by $0 != 'fred allen' and (long)$1 != 40 and (float)$2 != 2.50f;
  427. store b into ':OUTPATH:' using PigStorage;\,
  428. },
  429. ]
  430. },
  431. {
  432. 'name' => 'FilterMatches',
  433. 'tests' => [
  434. {
  435. 'num' => 1,
  436. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  437. b = filter a by name matches '^fred.*';
  438. store b into ':OUTPATH:' using PigStorage;\,
  439. },
  440. {
  441. 'num' => 2,
  442. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage();
  443. b = filter a by not $0 matches '^fred.*';
  444. store b into ':OUTPATH:' using PigStorage;\,
  445. },
  446. {
  447. # test for filter on matches for chararray (declared and explicit cast)
  448. 'num' => 3,
  449. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);
  450. b = filter a by name matches '^fred.*' and (chararray)registration matches '^dem.*';
  451. store b into ':OUTPATH:' using PigStorage;\,
  452. },
  453. {
  454. 'num' => 4,
  455. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);
  456. b = filter a by name matches 'f.ed' and (chararray)registration matches 'd.m';
  457. store b into ':OUTPATH:' using PigStorage;\,
  458. },
  459. {
  460. 'num' => 5,
  461. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);
  462. b = filter a by name matches 'f[^f]ed.*';
  463. store b into ':OUTPATH:' using PigStorage;\,
  464. },
  465. {
  466. 'num' => 6,
  467. 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches '.*\\\\wan.*';\nstore b into ':OUTPATH:' using PigStorage;",
  468. },
  469. {
  470. 'num' => 7,
  471. 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches '^e.*\\\\sc.*';\nstore b into ':OUTPATH:' using PigStorage;",
  472. },
  473. {
  474. 'num' => 8,
  475. 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches 'ethan white';\nstore b into ':OUTPATH:' using PigStorage;",
  476. },
  477. {
  478. 'num' => 9,
  479. 'pig' => "a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);\nb = filter a by gpa matches '\\\\d\\\\.45';\nstore b into ':OUTPATH:' using PigStorage;",
  480. },
  481. ]
  482. },
  483. {
  484. 'name' => 'FilterUdf',
  485. 'tests' => [
  486. {
  487. 'num' => 1,
  488. 'pig' => q\
  489. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  490. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  491. c = cogroup a by (name, age), b by (name, age);
  492. d = filter c by not IsEmpty(a);
  493. e = filter d by not IsEmpty(b);
  494. f = foreach e generate flatten(a), flatten(b);
  495. store f into ':OUTPATH:';\,
  496. },
  497. {
  498. 'num' => 2,
  499. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  500. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  501. c = filter a by age < 50;
  502. d = filter b by age < 50;
  503. e = cogroup c by (name, age), d by (name, age);
  504. f = filter e by COUNT(c)> 0 AND COUNT(d)>0;
  505. store f into ':OUTPATH:';\,
  506. 'rc' => 0
  507. },
  508. ]
  509. },
  510. # TODO Groups that don't flatten via Agg functions
  511. {
  512. 'name' => 'GroupAggFunc',
  513. 'tests' => [
  514. {
  515. 'num' => 1,
  516. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  517. b = group a by name;
  518. c = foreach b generate group, COUNT(a.age);
  519. store c into ':OUTPATH:';\,
  520. },
  521. {
  522. 'num' => 2,
  523. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  524. b = group a by $0;
  525. c = foreach b generate group, COUNT(a.$1);
  526. store c into ':OUTPATH:';\,
  527. },
  528. {
  529. 'num' => 3,
  530. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  531. b = group a by (name, age);
  532. c = foreach b generate group.name, group.age, COUNT(a.gpa);
  533. store c into ':OUTPATH:';\,
  534. },
  535. {
  536. 'num' => 5,
  537. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  538. b = group a all;
  539. c = foreach b generate COUNT(a.$0);
  540. store c into ':OUTPATH:';\,
  541. },
  542. {
  543. 'num' => 6,
  544. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  545. b = group a by name;
  546. c = foreach b generate group, SUM(a.age);
  547. store c into ':OUTPATH:';\,
  548. },
  549. {
  550. 'num' => 7,
  551. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  552. b = group a by name;
  553. c = foreach b generate group, SUM(a.gpa);
  554. store c into ':OUTPATH:';\,
  555. 'floatpostprocess' => 1,
  556. 'delimiter' => ' ',
  557. },
  558. {
  559. 'num' => 8,
  560. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  561. b = group a by name;
  562. c = foreach b generate group, AVG(a.age);
  563. store c into ':OUTPATH:';\,
  564. },
  565. {
  566. 'num' => 9,
  567. 'ignore23' => 'I cannot get it right due to float precision, temporarily disable',
  568. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  569. b = group a by name;
  570. c = foreach b generate group, AVG(a.gpa);
  571. store c into ':OUTPATH:';\,
  572. 'floatpostprocess' => 1,
  573. 'delimiter' => ' ',
  574. },
  575. {
  576. 'num' => 10,
  577. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  578. b = group a by name;
  579. c = foreach b generate group, MIN(a.gpa);
  580. store c into ':OUTPATH:';\,
  581. },
  582. {
  583. 'num' => 11,
  584. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  585. b = group a by name;
  586. c = foreach b generate group, MAX(a.gpa);
  587. store c into ':OUTPATH:';\,
  588. },
  589. {
  590. 'num' => 12,
  591. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  592. b = group a by (name, age);
  593. c = foreach b generate flatten(group), SUM(a.gpa);
  594. store c into ':OUTPATH:';\,
  595. 'floatpostprocess' => 1,
  596. 'delimiter' => ' ',
  597. },
  598. {
  599. 'num' => 13,
  600. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  601. b = group a by (name);
  602. c = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  603. d = cogroup b by group, c by name;
  604. e = foreach d generate flatten(group), SUM(c.gpa), COUNT(c.name);
  605. store e into ':OUTPATH:';\,
  606. 'floatpostprocess' => 1,
  607. 'delimiter' => ' ',
  608. },
  609. {
  610. 'num' => 14,
  611. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  612. b = group a by (name);
  613. e = foreach b generate COUNT(a.name);
  614. store e into ':OUTPATH:';\,
  615. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  616. b = group a by (name);
  617. e = foreach b generate COUNT(a.name);
  618. store e into ':OUTPATH:';\,
  619. }
  620. ],
  621. },
  622. {
  623. 'name' => 'MapPartialAgg',
  624. 'tests' => [
  625. {
  626. 'num' => 1,
  627. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  628. b = group a by name;
  629. c = foreach b generate group, COUNT(a.age);
  630. store c into ':OUTPATH:';\,
  631. 'java_params' => ['-Dpig.exec.mapPartAgg=true']
  632. },
  633. {
  634. #multiquery with group in one sub query
  635. 'num' => 2,
  636. 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  637. b = filter a by age < 22; store b into ':OUTPATH:.1';
  638. c = group b by age;
  639. d = foreach c generate group, SUM(b.gpa);
  640. store d into ':OUTPATH:.2'; #,
  641. 'java_params' => ['-Dpig.exec.mapPartAgg=true']
  642. },
  643. {
  644. #multi query with two group on diff columns
  645. 'num' => 3,
  646. 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  647. g1 = group a by name;
  648. f1 = foreach g1 generate group as name, MAX(a.gpa);
  649. store f1 into ':OUTPATH:.1';
  650. g2 = group a by age;
  651. f2 = foreach g2 generate group as age, AVG(a.gpa);
  652. store f2 into ':OUTPATH:.2'; #,
  653. 'java_params' => ['-Dpig.exec.mapPartAgg=true']
  654. },
  655. {
  656. #multi query with three groups on diff columns, group key being an expression
  657. 'num' => 4,
  658. 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  659. g1 = group a by name;
  660. f1 = foreach g1 generate group as name, MAX(a.gpa);
  661. store f1 into ':OUTPATH:.1';
  662. g2 = group a by age%10;
  663. f2 = foreach g2 generate group as age_mod10, AVG(a.gpa);
  664. store f2 into ':OUTPATH:.2';
  665. g3 = group a by age;
  666. f3 = foreach g3 generate group%10, AVG(a.gpa);
  667. store f3 into ':OUTPATH:.3';
  668. g4 = group a by gpa;
  669. f4 = foreach g4 generate group as gpa, COUNT(a);
  670. store f4 into ':OUTPATH:.4';
  671. #,
  672. 'java_params' => ['-Dpig.exec.mapPartAgg=true']
  673. },
  674. {
  675. #aggregation gets more than one tuple for every tuple from load func
  676. 'num' => 5,
  677. 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  678. b = foreach a generate name, age, gpa, flatten(TOBAG(age,age)) as x;
  679. c = group b by age;
  680. d = foreach c generate group, AVG(b.gpa);
  681. store d into ':OUTPATH:'; #,
  682. 'java_params' => ['-Dpig.exec.mapPartAgg=true']
  683. },
  684. ],
  685. },
  686. {
  687. 'name' => 'EvalFunc',
  688. 'tests' => [
  689. {
  690. 'num' => 1,
  691. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  692. b = filter a by name lt 'b';
  693. c = foreach b generate ARITY(name, age, gpa);
  694. store c into ':OUTPATH:';\,
  695. },
  696. {
  697. 'num' => 2,
  698. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa);
  699. b = filter a by name lt 'b';
  700. c = foreach b generate TOKENIZE(name);
  701. d = foreach c generate flatten($0);
  702. store d into ':OUTPATH:';\,
  703. },
  704. {
  705. 'num' => 3,
  706. 'pig' => q\register :FUNCPATH:/testudf.jar;
  707. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  708. b = filter a by name lt 'b';
  709. c = foreach b generate org.apache.pig.test.udf.evalfunc.Swap(name, age);
  710. store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
  711. },
  712. {
  713. 'num' => 4,
  714. 'pig' => q\register :FUNCPATH:/testudf.jar;
  715. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  716. b = filter a by name lt 'b';
  717. c = foreach b generate org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, age);
  718. store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
  719. },
  720. {
  721. 'num' => 5,
  722. 'pig' => q\register :FUNCPATH:/testudf.jar;
  723. a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  724. b = foreach a generate org.apache.pig.test.udf.evalfunc.TestBoolean(instate);
  725. store b into ':OUTPATH:';\,
  726. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  727. b = foreach a generate (instate is null ? '' : (instate == 'true' ? 'false' : 'true'));
  728. store b into ':OUTPATH:';\,
  729. }
  730. ]
  731. },
  732. # TODO DIFF
  733. # TODO User defined grouping function
  734. {
  735. 'name' => 'CoGroupFlatten',
  736. 'tests' => [
  737. {
  738. 'num' => 1,
  739. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  740. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  741. c = filter a by age < 20;
  742. d = filter b by age < 20;
  743. e = cogroup c by name, d by name;
  744. f = foreach e generate flatten (c), flatten(d);
  745. store f into ':OUTPATH:';\,
  746. },
  747. {
  748. 'num' => 2,
  749. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  750. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  751. c = filter a by $1 < 20;
  752. d = filter b by $1 < 20;
  753. e = cogroup c by $0, d by $0;
  754. f = foreach e generate flatten (c), flatten(d);
  755. store f into ':OUTPATH:';\,
  756. },
  757. {
  758. 'num' => 3,
  759. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  760. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  761. c = filter a by age < 20;
  762. d = filter b by age < 20;
  763. e = cogroup c by (name, age), d by (name, age);
  764. f = foreach e generate flatten (c), flatten(d);
  765. store f into ':OUTPATH:';\,
  766. },
  767. {
  768. 'num' => 4,
  769. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  770. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  771. d = filter b by age < 20;
  772. e = cogroup a by (name, age) inner, d by (name, age);
  773. f = foreach e generate flatten (a), flatten(d);
  774. store f into ':OUTPATH:';\,
  775. },
  776. {
  777. 'num' => 5,
  778. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  779. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  780. c = filter a by age < 20;
  781. e = cogroup c by (name, age), b by (name, age) inner;
  782. f = foreach e generate flatten (c), flatten(b);
  783. store f into ':OUTPATH:';\,
  784. },
  785. {
  786. 'num' => 6,
  787. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  788. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  789. e = cogroup a by (name, age) inner, b by (name, age) inner;
  790. f = foreach e generate flatten (a), flatten(b);
  791. store f into ':OUTPATH:';\,
  792. },
  793. {
  794. # Test cogrouping data loaded from two separate loaders. We don't have any data that can join with studenttab that isn't also loaded with PigStorage, so the
  795. # first step is an intermediate load and store using BinStorage.
  796. 'num' => 7,
  797. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  798. store a into ':OUTPATH:.intermediate' using BinStorage();
  799. b = load ':OUTPATH:.intermediate' using BinStorage() as (name, age, gpa);
  800. c = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  801. e = cogroup b by (name, age) inner, c by (name, age) inner;
  802. f = foreach e generate flatten (b), flatten(c);
  803. store f into ':OUTPATH:';\,
  804. 'notmq' => 1,
  805. },
  806. ]
  807. },
  808. {
  809. 'name' => 'CoGroup',
  810. 'tests' => [
  811. {
  812. 'num' => 1,
  813. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  814. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  815. c = cogroup a by name, b by name;
  816. d = foreach c generate flatten(group), COUNT(a) + COUNT(b);
  817. store d into ':OUTPATH:';\,
  818. },
  819. ]
  820. },
  821. {
  822. 'name' => 'Join',
  823. 'tests' => [
  824. {
  825. 'num' => 1,
  826. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  827. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  828. c = filter a by age < 20;
  829. d = filter b by age < 20;
  830. e = join c by name, d by name;
  831. store e into ':OUTPATH:';\,
  832. },
  833. {
  834. 'num' => 2,
  835. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  836. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  837. c = filter a by age < 20;
  838. d = filter b by age < 20;
  839. e = join c by $0, d by $0;
  840. store e into ':OUTPATH:';\,
  841. },
  842. {
  843. 'num' => 3,
  844. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  845. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  846. c = filter a by age < 20;
  847. d = filter b by age < 20;
  848. e = join c by (name, age), d by (name, age);
  849. store e into ':OUTPATH:';\,
  850. },
  851. # self join with implicit split
  852. # JIRA PIG-429
  853. {
  854. 'num' => 4,
  855. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  856. b = filter a by $1 > 25;
  857. c = join a by $0, b by $0;
  858. store c into ':OUTPATH:';\,
  859. },
  860. # join with one input having schema and another without
  861. # JIRA PIG-428
  862. {
  863. 'num' => 5,
  864. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double);
  865. another = load ':INPATH:/singlefile/studenttab10k';
  866. c = foreach another generate $0, $1+ 10, $2 + 10.0;
  867. d = join a by $0, c by $0;
  868. store d into ':OUTPATH:';\,
  869. },
  870. # self join using fragment replicate join
  871. # no types
  872. {
  873. 'num' => 6,
  874. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  875. b = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  876. c = join a by name, b by name using 'repl';
  877. store c into ':OUTPATH:';\,
  878. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  879. b = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  880. c = join a by name, b by name ;
  881. store c into ':OUTPATH:';\,
  882. },
  883. # self join using fragment replicate join
  884. # with types and no cast for join key
  885. {
  886. 'num' => 7,
  887. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  888. b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  889. c = join a by name, b by name using 'repl';
  890. store c into ':OUTPATH:';\,
  891. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  892. b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  893. c = join a by name, b by name ;
  894. store c into ':OUTPATH:';\,
  895. },
  896. # self join using fragment replicate join
  897. # with types and cast for join key
  898. {
  899. 'num' => 8,
  900. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  901. b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
  902. c = join a by gpa, b by gpa using 'repl';
  903. store c into ':OUTPATH:';\,
  904. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  905. b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
  906. c = join a by gpa, b by gpa ;
  907. store c into ':OUTPATH:';\,
  908. },
  909. # left outer join
  910. {
  911. 'num' => 9,
  912. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  913. b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double);
  914. c = join a by name left outer, b by name;
  915. store c into ':OUTPATH:';\,
  916. },
  917. # right outer join
  918. {
  919. 'num' => 10,
  920. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  921. b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double);
  922. c = join a by name right outer, b by name;
  923. store c into ':OUTPATH:';\,
  924. },
  925. # full outer join
  926. {
  927. 'num' => 11,
  928. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  929. b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double);
  930. c = join a by name full outer, b by name;
  931. store c into ':OUTPATH:';\,
  932. },
  933. # See PIG-1209: the join package now uses InternalCachedBag, so with pig.cachedbag.memusage=0 every tuple on the reduce side in this test will be spilled to disk.
  934. {
  935. 'num' => 12,
  936. 'java_params' => ['-Dpig.cachedbag.memusage=0'],
  937. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  938. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  939. c = filter a by age < 20;
  940. d = filter b by age < 20;
  941. e = join c by name, d by name;
  942. store e into ':OUTPATH:';\,
  943. },
  944. {
  945. 'num' => 13,
  946. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  947. b = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  948. c = filter a by age < 20;
  949. d = filter b by age < 20;
  950. e = join c by instate, d by instate parallel 5;
  951. store e into ':OUTPATH:';\,
  952. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  953. b = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  954. c = filter a by age < 20;
  955. d = filter b by age < 20;
  956. e = join c by instate, d by instate parallel 5;
  957. store e into ':OUTPATH:';\,
  958. }
  959. ]
  960. },
  961. {
  962. 'name' => 'Foreach',
  963. 'tests' => [
  964. {
  965. 'num' => 1,
  966. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  967. b = foreach a generate *;
  968. store b into ':OUTPATH:';\,
  969. },
  970. {
  971. 'num' => 2,
  972. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  973. b = foreach a generate *;
  974. store b into ':OUTPATH:';\,
  975. },
  976. {
  977. 'num' => 3,
  978. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  979. b = foreach a generate name, age;
  980. store b into ':OUTPATH:';\,
  981. },
  982. {
  983. 'num' => 4,
  984. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  985. b = foreach a generate $0, $2;
  986. store b into ':OUTPATH:';\,
  987. },
  988. {
  989. # test nested foreach: filter, projection, sort, duplicate elimination
  990. 'num' => 5,
  991. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  992. b = filter a by age < 20;
  993. c = group b by age;
  994. d = foreach c {
  995. cf = filter b by gpa < 3.0;
  996. cp = cf.gpa;
  997. cd = distinct cp;
  998. co = order cd by $0;
  999. generate group, flatten(co);
  1000. }
  1001. store d into ':OUTPATH:';\,
  1002. },
  1003. {
  1004. # test flatten for map and scalar
  1005. 'num' => 6,
  1006. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1007. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1008. b = foreach a generate flatten(name) as n, flatten(org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, gpa)) as m;
  1009. store b into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\,
  1010. },
  1011. {
  1012. # test flatten for UDF that returns bag with multiple tuples with multiple columns
  1013. 'num' => 7,
  1014. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1015. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1016. b = foreach a generate name, flatten(org.apache.pig.test.udf.evalfunc.CreateTupleBag(age, gpa)) as foo;
  1017. store b into ':OUTPATH:';\,
  1018. },
  1019. {
  1020. 'num' => 8,
  1021. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age: int, gpa);
  1022. c = group a by name;
  1023. d = foreach c generate flatten(group), MAX(a.age) + MIN(a.age);
  1024. store d into ':OUTPATH:';\,
  1025. },
  1026. {
  1027. # test nested foreach: filter (range condition), projection, sort, duplicate elimination
  1028. 'num' => 9,
  1029. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1030. b = filter a by age < 20;
  1031. c = group b by age;
  1032. d = foreach c {
  1033. cf = filter b by gpa >= 3.0 and gpa <= 3.5;
  1034. cp = cf.gpa;
  1035. cd = distinct cp;
  1036. co = order cd by $0;
  1037. generate group, flatten(co);
  1038. }
  1039. store d into ':OUTPATH:';\,
  1040. },
  1041. {
  1042. # test nested foreach: filter (compound and/or condition), projection, sort, duplicate elimination
  1043. 'num' => 10,
  1044. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1045. b = filter a by age < 20;
  1046. c = group b by age;
  1047. d = foreach c {
  1048. cf = filter b by (gpa == 4.0 or gpa != 2.0) and name > 'a';
  1049. cp = cf.gpa;
  1050. cd = distinct cp;
  1051. co = order cd by $0;
  1052. generate group, flatten(co);
  1053. }
  1054. store d into ':OUTPATH:';\,
  1055. },
  1056. {
  1057. # test nested foreach with expression aliases, where one alias (exp2) is built on another (exp1)
  1058. 'num' => 11,
  1059. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1060. b = filter a by age < 20;
  1061. c = foreach b {
  1062. exp1 = age + gpa;
  1063. exp2 = exp1 + age;
  1064. generate exp1, exp2;
  1065. }
  1066. store c into ':OUTPATH:';\,
  1067. },
  1068. {
  1069. # test a udf with no args
  1070. 'num' => 12,
  1071. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1072. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1073. b = foreach a generate name, org.apache.pig.test.udf.evalfunc.Fred() as fred;
  1074. store b into ':OUTPATH:';\,
  1075. },
  1076. {
  1077. 'num' => 13,
  1078. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  1079. b = foreach a generate *;
  1080. store b into ':OUTPATH:';\,
  1081. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray);
  1082. b = foreach a generate *;
  1083. store b into ':OUTPATH:';\,
  1084. }
  1085. ]
  1086. },
  1087. {
  1088. 'name' => 'Order',
  1089. 'tests' => [
  1090. {
  1091. 'num' => 1,
  1092. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1093. b = foreach a generate name;
  1094. c = order b by name;
  1095. store c into ':OUTPATH:';\,
  1096. 'sortArgs' => ['-t', ' ', '+0', '-1'],
  1097. },
  1098. {
  1099. 'num' => 2,
  1100. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1101. b = foreach a generate $1;
  1102. c = order b by $0;
  1103. store c into ':OUTPATH:';\,
  1104. 'sortArgs' => ['-t', ' ', '+0', '-1'],
  1105. },
  1106. {
  1107. 'num' => 3,
  1108. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1109. b = foreach a generate gpa;
  1110. c = order b by gpa;
  1111. store c into ':OUTPATH:';\,
  1112. 'sortArgs' => ['-t', ' ', '+0', '-1'],
  1113. },
  1114. {
  1115. 'num' => 4,
  1116. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1117. b = order a by *;
  1118. store b into ':OUTPATH:';\,
  1119. 'sortArgs' => ['-t', ' '],
  1120. },
  1121. {
  1122. 'num' => 5,
  1123. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1124. b = foreach a generate name, age;
  1125. c = order b by name, age;
  1126. store c into ':OUTPATH:';\,
  1127. 'sortArgs' => ['-t', ' ', '+0', '-2'],
  1128. },
  1129. {
  1130. 'num' => 6,
  1131. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1132. c = order a by $0;
  1133. store c into ':OUTPATH:';\,
  1134. 'sortArgs' => ['-t', ' ', '+0', '-1'],
  1135. },
  1136. {
  1137. 'num' => 7,
  1138. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1139. c = order a by $1;
  1140. store c into ':OUTPATH:';\,
  1141. 'sortArgs' => ['-t', ' ', '+1', '-2'],
  1142. },
  1143. {
  1144. 'num' => 8,
  1145. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1146. c = order a by $0, $1;
  1147. store c into ':OUTPATH:';\,
  1148. 'sortArgs' => ['-t', ' ', '+0', '-2'],
  1149. },
  1150. {
  1151. 'num' => 9,
  1152. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1153. c = order a by $1, $0;
  1154. store c into ':OUTPATH:';\,
  1155. 'sortArgs' => ['-t', ' ', '+1', '-2', '+0', '-1'],
  1156. },
  1157. {
  1158. 'num' => 10,
  1159. 'ignore' => 'order by UDF is not supported',
  1160. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1161. a = load ':INPATH:/singlefile/studenttab10k';
  1162. c = order a by * using org.apache.pig.test.udf.orderby.OrdDesc;
  1163. store c into ':OUTPATH:';\,
  1164. 'sortArgs' => ['-t', ' ', '-r'],
  1165. },
  1166. {
  1167. 'num' => 11,
  1168. 'ignore' => 'order by UDF is not supported',
  1169. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1170. a = load ':INPATH:/singlefile/studenttab10k';
  1171. c = order a by $0 using org.apache.pig.test.udf.orderby.OrdDesc;
  1172. store c into ':OUTPATH:';\,
  1173. 'sortArgs' => ['-t', ' ', '-r', '+0', '-1'],
  1174. },
  1175. {
  1176. 'num' => 12,
  1177. 'ignore' => 'order by UDF is not supported',
  1178. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1179. a = load ':INPATH:/singlefile/studenttab10k';
  1180. c = order a by $0, $1 using org.apache.pig.test.udf.orderby.OrdDesc;
  1181. store c into ':OUTPATH:';\,
  1182. 'sortArgs' => ['-t', ' ', '-r', '+0', '-2'],
  1183. },
  1184. # ALERT: The tests below that use inner order bys aren't testing the inner
  1185. # ordering. We need to develop a sorting tool to do that.
  1186. {
  1187. 'num' => 13,
  1188. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1189. b = group a by $0;
  1190. c = foreach b {c1 = order $1 by $1; generate flatten(c1); };
  1191. store c into ':OUTPATH:';\,
  1192. },
  1193. {
  1194. 'num' => 14,
  1195. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1196. b = group a by $0;
  1197. c = foreach b {c1 = order $1 by *; generate flatten(c1); };
  1198. store c into ':OUTPATH:';\,
  1199. },
  1200. {
  1201. 'num' => 15,
  1202. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1203. a = load ':INPATH:/singlefile/studenttab10k';
  1204. b = group a by $0;
  1205. c = foreach b {c1 = order $1 by * using org.apache.pig.test.udf.orderby.OrdDesc; generate flatten(c1); };
  1206. store c into ':OUTPATH:';\,
  1207. },
  1208. {
  1209. 'num' => 16,
  1210. 'pig' => q\register :FUNCPATH:/testudf.jar;
  1211. a = load ':INPATH:/singlefile/studenttab10k';
  1212. b = group a by $0;
  1213. c = foreach b {c1 = order $1 by $1 using org.apache.pig.test.udf.orderby.OrdDesc; generate flatten(c1);};
  1214. store c into ':OUTPATH:';\,
  1215. },
  1216. {
  1217. 'num' => 17,
  1218. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1219. b = group a by $0;
  1220. c = foreach b {c1 = order $1 by $1; generate flatten(c1), MAX($1.$1); };
  1221. store c into ':OUTPATH:';\,
  1222. },
  1223. {
  1224. # test to make sure the weighted range partitioning
  1225. # works correctly when a sort key value repeats across
  1226. # reduce partitions
  1227. 'num' => 18,
  1228. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1229. b = order a by $1 parallel 100;
  1230. store b into ':OUTPATH:';\,
  1231. 'sortArgs' => ['-t', ' ', '+1', '-2'],
  1232. },
  1233. {
  1234. 'num' => 19,
  1235. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  1236. b = foreach a generate instate;
  1237. c = order b by instate;
  1238. store c into ':OUTPATH:';\,
  1239. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  1240. b = foreach a generate instate;
  1241. c = order b by instate;
  1242. store c into ':OUTPATH:';\,
  1243. 'sortArgs' => ['-t', ' ', '+0', '-1'],
  1244. },
  1245. ]
  1246. },
  1247. {
  1248. 'name' => 'Distinct',
  1249. 'tests' => [
  1250. {
  1251. 'num' => 1,
  1252. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1253. b = foreach a generate name;
  1254. c = distinct b;
  1255. store c into ':OUTPATH:';\,
  1256. },
  1257. {
  1258. 'num' => 2,
  1259. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1260. b = foreach a generate $1;
  1261. c = distinct b;
  1262. store c into ':OUTPATH:';\,
  1263. },
  1264. {
  1265. 'num' => 3,
  1266. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1267. b = foreach a generate gpa;
  1268. c = distinct b;
  1269. store c into ':OUTPATH:';\,
  1270. },
  1271. {
  1272. 'num' => 4,
  1273. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  1274. b = distinct a;
  1275. store b into ':OUTPATH:';\,
  1276. },
  1277. {
  1278. 'num' => 5,
  1279. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1280. b = foreach a generate name, age;
  1281. c = distinct b;
  1282. store c into ':OUTPATH:';\,
  1283. },
  1284. {
  1285. 'num' => 6,
  1286. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1287. b = group a by name;
  1288. c = foreach b { aa = distinct a.age; generate group, COUNT(aa); }
  1289. store c into ':OUTPATH:';\,
  1290. }
  1291. ]
  1292. },
  1293. {
  1294. 'name' => 'Cross',
  1295. 'tests' => [
  1296. {
  1297. 'num' => 1,
  1298. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1299. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  1300. c = filter a by age < 19 and gpa < 1.0;
  1301. d = filter b by age < 19;
  1302. e = cross c, d;
  1303. store e into ':OUTPATH:';\,
  1304. },
  1305. {
  1306. 'num' => 2,
  1307. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1308. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  1309. c = filter a by age < 19 and gpa < 1.0;
  1310. d = filter b by age < 19;
  1311. e = cross c, d parallel 10;
  1312. store e into ':OUTPATH:';\,
  1313. },
  1314. {
  1315. 'num' => 3,
  1316. 'pig' => q\set default_parallel 10;
  1317. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1318. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  1319. c = filter a by age < 19 and gpa < 1.0;
  1320. d = filter b by age < 19;
  1321. e = cross c, d;
  1322. store e into ':OUTPATH:';\,
  1323. },
  1324. {
  1325. 'num' => 4,
  1326. 'pig' => q\
  1327. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1328. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  1329. c = filter a by age < 25;
  1330. d = filter b by age < 25;
  1331. e = cross c, d;
  1332. f = filter e by c::age < d::age;
  1333. store f into ':OUTPATH:';\,
  1334. }
  1335. ]
  1336. },
  1337. {
  1338. 'name' => 'Union',
  1339. 'tests' => [
  1340. {
  1341. 'num' => 1,
  1342. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1343. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  1344. c = foreach a generate name, age;
  1345. d = foreach b generate name, age;
  1346. e = union c, d;
  1347. store e into ':OUTPATH:';\,
  1348. },
  1349. ]
  1350. },
  1351. {
  1352. 'name' => 'Bincond',
  1353. 'tests' => [
  1354. {
  1355. 'num' => 1,
  1356. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1357. b = foreach a generate name, (name matches 'yuri.*' ? age - 10 : (int)age);
  1358. store b into ':OUTPATH:';\,
  1359. },
  1360. ]
  1361. },
  1362. {
  1363. 'name' => 'Glob',
  1364. 'tests' => [
  1365. {
  1366. 'num' => 1,
  1367. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10?' as (name, age, gpa);
  1368. b = filter a by name == 'nick miller';
  1369. store b into ':OUTPATH:';\,
  1370. },
  1371. {
  1372. 'num' => 2,
  1373. 'pig' => q\a = load ':INPATH:/singlefile/st*ttab10k' as (name, age, gpa);
  1374. b = filter a by name == 'nick miller';
  1375. store b into ':OUTPATH:';\,
  1376. },
  1377. {
  1378. 'num' => 3,
  1379. 'pig' => q\a = load ':INPATH:/singlefile/studenttab*' as (name, age, gpa);
  1380. b = filter a by name == 'nick miller';
  1381. store b into ':OUTPATH:';\,
  1382. },
  1383. {
  1384. 'num' => 4,
  1385. 'pig' => q\a = load ':INPATH:/singlefile/studenttab???' as (name, age, gpa);
  1386. b = filter a by name == 'nick miller';
  1387. store b into ':OUTPATH:';\,
  1388. },
  1389. {
  1390. 'num' => 5,
  1391. 'pig' => q\a = load ':INPATH:/singlefile/studenttab[1-9]0[km]' as (name, age, gpa);
  1392. b = filter a by name == 'nick miller';
  1393. store b into ':OUTPATH:';\,
  1394. },
  1395. {
  1396. 'num' => 6,
  1397. 'pig' => q\a = load ':INPATH:/singlefile/studenttab[13]0[km]' as (name, age, gpa);
  1398. b = filter a by name == 'nick miller';
  1399. store b into ':OUTPATH:';\,
  1400. },
  1401. {
  1402. 'num' => 7,
  1403. 'pig' => q\a = load ':INPATH:/singlefile/studenttab[12]0[a-l]' as (name, age, gpa);
  1404. b = filter a by name == 'nick miller';
  1405. store b into ':OUTPATH:';\,
  1406. },
  1407. {
  1408. 'num' => 8,
  1409. 'pig' => q\a = load ':INPATH:/glob/star/*good' as (name, age, gpa);
  1410. b = filter a by name == 'nick miller';
  1411. store b into ':OUTPATH:';\,
  1412. },
  1413. {
  1414. 'num' => 9,
  1415. 'pig' => q\a = load ':INPATH:/glob/star/*' as (name, age, gpa);
  1416. b = filter a by name == 'nick miller';
  1417. store b into ':OUTPATH:';\,
  1418. }
  1419. ]
  1420. },
  1421. {
  1422. 'name' => 'Arithmetic',
  1423. 'tests' => [
  1424. {
  1425. 'num' => 1,
  1426. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1427. c = foreach a generate age + 1, (int)gpa + 1;
  1428. store c into ':OUTPATH:';\,
  1429. },
  1430. {
  1431. 'num' => 2,
  1432. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1433. c = foreach a generate (double)age + 1.5, gpa + 1.5;
  1434. store c into ':OUTPATH:';\,
  1435. },
  1436. {
  1437. 'num' => 3,
  1438. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1439. c = foreach a generate age - 30, (int)gpa - 3;
  1440. store c into ':OUTPATH:';\,
  1441. },
  1442. {
  1443. 'num' => 4,
  1444. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1445. c = foreach a generate (double)age - 30.1, gpa - 3.199;
  1446. store c into ':OUTPATH:';\,
  1447. },
  1448. {
  1449. 'num' => 5,
  1450. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1451. c = foreach a generate age * 10, (int)gpa * 2;
  1452. store c into ':OUTPATH:';\,
  1453. },
  1454. {
  1455. 'num' => 6,
  1456. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1457. c = foreach a generate (double)age * 10.1, gpa * 2.752342;
  1458. store c into ':OUTPATH:';\,
  1459. },
  1460. {
  1461. 'num' => 7,
  1462. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1463. c = foreach a generate age / 30, (int)gpa / 3;
  1464. store c into ':OUTPATH:';\,
  1465. },
  1466. {
  1467. 'num' => 8,
  1468. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1469. c = foreach a generate (double)age / 30.323, gpa / 3.22;
  1470. store c into ':OUTPATH:';\,
  1471. },
  1472. {
  1473. 'num' => 9,
  1474. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1475. c = foreach a generate 3 * age + gpa / 9.1 - 2;
  1476. store c into ':OUTPATH:';\,
  1477. },
  1478. {
  1479. 'num' => 10,
  1480. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  1481. c = foreach a generate 3 * (age + gpa) / (9.1 - 2);
  1482. store c into ':OUTPATH:';\,
  1483. }
  1484. ]
  1485. },
  1486. {
  1487. 'name' => 'Regression',
  1488. 'tests' => [
  1489. {
  1490. 'num' => 1459894,
  1491. 'pig' => q\a = load ':INPATH:/singlefile/reg1459894';
  1492. b = group a by $0;
  1493. c = foreach b generate group, COUNT(a.$1);
  1494. store c into ':OUTPATH:';\,
  1495. },
  1496. {
  1497. 'num' => 97,
  1498. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  1499. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  1500. e = cogroup a by name, b by name;
  1501. f = foreach e generate group, COUNT(a), COUNT(b);
  1502. store f into ':OUTPATH:';\,
  1503. },
  1504. {
  1505. 'num' => 203,
  1506. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  1507. b = group a by name;
  1508. c = foreach b generate group, COUNT($1);
  1509. store c into ':OUTPATH:';
  1510. --This is a really long script to test that when script size exceeds 1k we can still parse it.
  1511. --The quick sly fox jumped over the lazy brown dog.
  1512. --he quick sly fox jumped over the lazy brown dog.T
  1513. --e quick sly fox jumped over the lazy brown dog.Th
  1514. -- quick sly fox jumped over the lazy brown dog.The
  1515. --quick sly fox jumped over the lazy brown dog.The
  1516. --uick sly fox jumped over the lazy brown dog.The q
  1517. --ick sly fox jumped over the lazy brown dog.The qu
  1518. --ck sly fox jumped over the lazy brown dog.The qui
  1519. --k sly fox jumped over the lazy brown dog.The quic
  1520. -- sly fox jumped over the lazy brown dog.The quick
  1521. --sly fox jumped over the lazy brown dog.The quick
  1522. --ly fox jumped over the lazy brown dog.The quick s
  1523. --y fox jumped over the lazy brown dog.The quick sl
  1524. -- fox jumped over the lazy brown dog.The quick sly
  1525. --fox jumped over the lazy brown dog.The quick sly
  1526. --ox jumped over the lazy brown dog.The quick sly f
  1527. --x jumped over the lazy brown dog.The quick sly fo
  1528. -- jumped over the lazy brown dog.The quick sly fox
  1529. --jumped over the lazy brown dog.The quick sly fox
  1530. --umped over the lazy brown dog.The quick sly fox j
  1531. --mped over the lazy brown dog.The quick sly fox ju
  1532. --ped over the lazy brown dog.The quick sly fox jum\,
  1533. }
  1534. ]
  1535. },
  1536. {
  1537. 'name' => 'Unicode',
  1538. 'tests' => [
  1539. {
  1540. 'num' => 1,
  1541. 'pig' => q\a = load ':INPATH:/singlefile/unicode100';
  1542. store a into ':OUTPATH:';\,
  1543. },
  1544. ]
  1545. },
  1546. {
  1547. 'name' => 'Parameters',
  1548. 'tests' => [
  1549. {
  1550. # test default
  1551. 'num' => 1,
  1552. 'pig' => q\%default fname 'studenttab10k'
  1553. a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
  1554. b = foreach a generate name;
  1555. store b into ':OUTPATH:';\,
  1556. },
  1557. {
  1558. # test parameter from command line
  1559. 'num' => 2,
  1560. 'pig_params' => ['-p', qq(fname='studenttab10k')],
  1561. 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
  1562. b = foreach a generate name;
  1563. store b into ':OUTPATH:';\,
  1564. },
  1565. {
  1566. # test parameter from param file
  1567. 'num' => 3,
  1568. 'pig_params' => ['-m', ":PARAMPATH:/params_3"],
  1569. 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
  1570. b = foreach a generate name;
  1571. store b into ':OUTPATH:';\,
  1572. },
  1573. {
  1574. # test command
  1575. 'num' => 4,
  1576. 'pig' => q\%declare cmd `/usr/local/bin/perl -e 'print "studenttab10k"'`
  1577. a = load ':INPATH:/singlefile/$cmd' using PigStorage() as (name, age, gpa);
  1578. b = foreach a generate name;
  1579. store b into ':OUTPATH:';\,
  1580. },
  1581. {
  1582. # test parameter with a space
  1583. 'num' => 5,
  1584. 'pig_params' => ['-p', qq(setting='set default_parallel 100;'),'-p',qq(fname='studenttab10k')],
  1585. 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
  1586. $setting
  1587. b = foreach a generate name;
  1588. store b into ':OUTPATH:';\,
  1589. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa);
  1590. b = foreach a generate name;
  1591. store b into ':OUTPATH:';\,
  1592. },
  1593. ]
  1594. },
  1595. {
  1596. 'name' => 'Types',
  1597. 'tests' => [
  1598. {
  1599. # constants
  1600. 'num' => 1,
  1601. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1602. b = foreach a generate age + 1 + 0.2f + 253645L, gpa+1;
  1603. store b into ':OUTPATH:';\,
  1604. },
  1605. {
  1606. # NULL and cast
  1607. 'num' => 2,
  1608. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1609. b = foreach a generate (int)((int)gpa/((int)gpa - 1)) as norm_gpa:int;
  1610. c = foreach b generate (norm_gpa is null? 0 :norm_gpa);
  1611. store c into ':OUTPATH:';\,
  1612. # 'expected_err_regex' => "Encountered Warning DIVIDE_BY_ZERO 2387 time.*",
  1613. # The driver does not currently support both 'sql' and 'expected_...' verification directives.
  1614. },
  1615. {
  1616. # arithmetic operators and SIZE for int, double and size and concat operators for chararrays
  1617. 'num' => 3,
  1618. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1619. b = foreach a generate age, gpa, age % 25, age + 25, age - 25, age/2, age * 2, SIZE(age), gpa + 10.1, gpa - 1.1 , gpa / 1.2, gpa * 2.5, SIZE(gpa), SIZE(name), CONCAT(name, 'test');
  1620. store b into ':OUTPATH:';\,
  1621. },
  1622. {
  1623. # arithmetic operators and SIZE for long, float and size and concat operators for bytearrays
  1624. 'num' => 4,
  1625. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
  1626. b = foreach a generate age, gpa, age % 2L, age + 2500000000L, age - 2500000000L, age/2L, age * 250000000L, SIZE(age), gpa + 10.1f, gpa - 1.1f , gpa / 1.2f, gpa * 2.6f, SIZE(gpa), SIZE(name), CONCAT(name, name);
  1627. store b into ':OUTPATH:';\,
  1628. },
  1629. {
  1630. # equality and implicit cast
  1631. 'num' => 5,
  1632. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
  1633. b = filter a by age == '25' and gpa < 3;
  1634. store b into ':OUTPATH:';\,
  1635. },
  1636. {
  1637. # will need to test against previous version of pig
  1638. # because in pig currently count includes nulls - this affects
  1639. # avg
  1640. 'num' => 6,
  1641. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1642. b = group a ALL;
  1643. c = foreach b generate SUM(a.age), MIN(a.age), MAX(a.age), AVG(a.age), MIN(a.name), MAX(a.name), SUM(a.gpa), MIN(a.gpa), MAX(a.gpa), AVG(a.gpa);
  1644. store c into ':OUTPATH:';\,
  1645. 'floatpostprocess' => 1,
  1646. 'delimiter' => ' ',
  1647. },
  1648. {
  1649. # sum, min, max, avg for long and float (declared)
  1650. 'num' => 7,
  1651. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
  1652. b = group a ALL;
  1653. c = foreach b generate SUM(a.age), MIN(a.age), MAX(a.age), AVG(a.age), SUM(a.gpa), MIN(a.gpa), MAX(a.gpa), AVG(a.gpa);
  1654. store c into ':OUTPATH:';\,
  1655. },
  1656. {
  1657. # Explicit casts - arithmetic operators and SIZE for int, double and size and concat operators for chararrays
  1658. 'num' => 8,
  1659. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
  1660. b = foreach a generate (int)age % 25, (int)age + 25, (int)age - 25, (int)age/2, (int)age * 2, SIZE((int)age), (double)gpa + 10.1, (double)gpa - 1.1 , (double)gpa / 1.2, (double)gpa * 2.5, SIZE((double)gpa), SIZE((chararray)name), CONCAT((chararray)name, 'test');
  1661. store b into ':OUTPATH:';\,
  1662. },
  1663. {
  1664. # Explicit casts - arithmetic operators and SIZE for long, float
  1665. 'num' => 9,
  1666. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
  1667. b = foreach a generate (long)age, (long)age % 2L, (long)age + 2500000000L, (long)age - 2500000000L, (long)age/2L, (long)age * 250000000L, SIZE((long)age), (float)gpa + 10.1f, (float)gpa - 1.1f , (float)gpa / 1.2f, (float)gpa * 2.6f, SIZE((float)gpa);
  1668. store b into ':OUTPATH:';\,
  1669. },
  1670. {
  1671. # Filter is null for chararray and double and is not null for int
  1672. 'num' => 10,
  1673. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1674. b = filter a by name is null and age is not null and gpa is null;
  1675. c = group b ALL;
  1676. d = foreach c generate COUNT(b);
  1677. store d into ':OUTPATH:';\,
  1678. },
  1679. {
  1680. # Filter is not null for chararray and double and is null for int
  1681. 'num' => 11,
  1682. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1683. b = filter a by name is not null and age is null and gpa is not null;
  1684. c = group b ALL;
  1685. d = foreach c generate COUNT(b);
  1686. store d into ':OUTPATH:';\,
  1687. },
  1688. {
  1689. # Filter is null for bytearray and float and is not null for long
  1690. 'num' => 12,
  1691. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
  1692. b = filter a by name is null and age is not null and gpa is null;
  1693. c = group b ALL;
  1694. d = foreach c generate COUNT(b);
  1695. store d into ':OUTPATH:';\,
  1696. },
  1697. {
  1698. # Filter is not null for bytearray and float and is null for long
  1699. 'num' => 13,
  1700. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
  1701. b = filter a by name is not null and age is null and gpa is not null;
  1702. c = group b ALL;
  1703. d = foreach c generate COUNT(b);
  1704. store d into ':OUTPATH:';\,
  1705. },
  1706. {
  1707. # test that sorting is based on the type for chararray, int and double
  1708. 'num' => 14,
  1709. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1710. b = order a by name, age, gpa;
  1711. store b into ':OUTPATH:';\,
  1712. 'sortArgs' => ['-t', ' ', '+0', '-1', '+1n', '-2'],
  1713. },
  1714. {
  1715. # test that sorting descending is based on the type for chararray, int and double
  1716. 'num' => 15,
  1717. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1718. b = order a by name desc, age desc, gpa desc;
  1719. store b into ':OUTPATH:';\,
  1720. 'sortArgs' => ['-t', ' ', '+0r', '-1', '+1nr', '-2'],
  1721. },
  1722. {
  1723. # test that sorting is based on the type for bytearray, long and float
  1724. 'num' => 16,
  1725. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
  1726. b = order a by name, age, gpa;
  1727. store b into ':OUTPATH:';\,
  1728. 'sortArgs' => ['-t', ' ', '+0', '-1', '+1n', '-2'],
  1729. },
  1730. {
  1731. # test that sorting descending is based on the type for bytearray, long and float
  1732. 'num' => 17,
  1733. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float);
  1734. b = order a by name desc, age desc, gpa desc;
  1735. store b into ':OUTPATH:';\,
  1736. 'sortArgs' => ['-t', ' ', '+0r', '-1', '+1nr', '-2'],
  1737. },
  1738. {
  1739. # test precision for doubles is at least 15 digits
  1740. 'num' => 18,
  1741. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1742. b = foreach a generate 0.123456789123456+0.123456789123456;
  1743. store b into ':OUTPATH:';\,
  1744. },
  1745. {
  1746. # order by string
  1747. 'num' => 20,
  1748. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1749. b = order a by name;
  1750. store b into ':OUTPATH:';\,
  1751. 'sortArgs' => ['-t', ' ', '+0', '-1'],
  1752. },
  1753. {
  1754. # order by string desc
  1755. 'num' => 21,
  1756. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1757. b = order a by name desc;
  1758. store b into ':OUTPATH:';\,
  1759. 'sortArgs' => ['-t', ' ', '+0r', '-1'],
  1760. },
  1761. {
  1762. # order by int
  1763. 'num' => 22,
  1764. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1765. b = order a by age;
  1766. store b into ':OUTPATH:';\,
  1767. 'sortArgs' => ['-t', ' ', '+1n', '-2'],
  1768. },
  1769. {
  1770. # order by int desc
  1771. 'num' => 23,
  1772. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1773. b = order a by age desc;
  1774. store b into ':OUTPATH:';\,
  1775. 'sortArgs' => ['-t', ' ', '+1nr', '-2'],
  1776. },
  1777. {
  1778. # order by long
  1779. 'num' => 24,
  1780. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:long, gpa:double);
  1781. b = order a by age;
  1782. store b into ':OUTPATH:';\,
  1783. 'sortArgs' => ['-t', ' ', '+1n', '-2'],
  1784. },
  1785. {
  1786. # order by long desc
  1787. 'num' => 25,
  1788. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:long, gpa:double);
  1789. b = order a by age desc;
  1790. store b into ':OUTPATH:';\,
  1791. 'sortArgs' => ['-t', ' ', '+1nr', '-2'],
  1792. },
  1793. {
  1794. # order by float
  1795. 'num' => 26,
  1796. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float);
  1797. b = order a by gpa;
  1798. store b into ':OUTPATH:';\,
  1799. 'sortArgs' => ['-t', ' ', '-k 3n'],
  1800. },
  1801. {
  1802. # order by float desc
  1803. 'num' => 27,
  1804. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float);
  1805. b = order a by gpa desc;
  1806. store b into ':OUTPATH:';\,
  1807. 'sortArgs' => ['-t', ' ', '-k 3nr'],
  1808. },
  1809. {
  1810. # order by double
  1811. 'num' => 28,
  1812. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1813. b = order a by gpa;
  1814. store b into ':OUTPATH:';\,
  1815. 'sortArgs' => ['-t', ' ', '-k 3n'],
  1816. },
  1817. {
  1818. # order by double desc
  1819. 'num' => 29,
  1820. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1821. b = order a by gpa desc;
  1822. store b into ':OUTPATH:';\,
  1823. 'sortArgs' => ['-t', ' ', '-k 3nr'],
  1824. },
  1825. {
  1826. # order by *
  1827. 'num' => 30,
  1828. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1829. b = order a by *;
  1830. store b into ':OUTPATH:';\,
  1831. 'sortArgs' => ['-t', ' ', '+0', '-1', '+1n', '-2'],
  1832. },
  1833. {
  1834. # order by * desc
  1835. 'num' => 31,
  1836. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1837. b = order a by * desc;
  1838. store b into ':OUTPATH:';\,
  1839. 'sortArgs' => ['-t', ' ', '+0r', '-1', '+1nr', '-2'],
  1840. },
  1841. {
  1842. 'num' => 32,
  1843. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  1844. b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:double);
  1845. c = filter a by age < 20;
  1846. d = filter b by age < 20;
  1847. e = cogroup c by name, d by name;
  1848. f = foreach e generate flatten (c), flatten(d);
  1849. store f into ':OUTPATH:';\,
  1850. },
  1851. {
  1852. 'num' => 33,
  1853. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  1854. b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:double);
  1855. c = filter a by age < 20;
  1856. d = filter b by age < 20;
  1857. e = cogroup c by age, d by age;
  1858. f = foreach e generate flatten (c), flatten(d);
  1859. store f into ':OUTPATH:';\,
  1860. },
  1861. {
  1862. 'num' => 34,
  1863. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa:double);
  1864. b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:long, registration:chararray, contributions:double);
  1865. c = filter a by age < 20;
  1866. d = filter b by age < 20;
  1867. e = cogroup c by age, d by age;
  1868. f = foreach e generate flatten (c), flatten(d);
  1869. store f into ':OUTPATH:';\,
  1870. },
  1871. {
  1872. 'num' => 35,
  1873. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:double);
  1874. b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:float, registration:chararray, contributions:double);
  1875. c = filter a by age < 20;
  1876. d = filter b by age < 20;
  1877. e = cogroup c by age, d by age;
  1878. f = foreach e generate flatten (c), flatten(d);
  1879. store f into ':OUTPATH:';\,
  1880. },
  1881. {
  1882. 'num' => 36,
  1883. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:double);
  1884. b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:double, registration:chararray, contributions:double);
  1885. c = filter a by age < 20;
  1886. d = filter b by age < 20;
  1887. e = cogroup c by age, d by age;
  1888. f = foreach e generate flatten (c), flatten(d);
  1889. store f into ':OUTPATH:';\,
  1890. },
  1891. {
  1892. # NULL and cast
  1893. 'num' => 37,
  1894. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1895. b = foreach a generate (int)((int)gpa/((int)gpa - 1)) as norm_gpa:int;
  1896. c = foreach b generate (norm_gpa is not null? norm_gpa: 0);
  1897. store c into ':OUTPATH:';\,
  1898. },
  1899. {
  1900. # constants
  1901. 'num' => 38,
  1902. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  1903. b = foreach a generate -(age + 1 + 0.2f + 253645L), -(gpa+1);
  1904. store b into ':OUTPATH:';\,
  1905. },
  1906. {
  1907. 'num' => 39,
  1908. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
  1909. b = foreach a generate instate, true, false;
  1910. store b into ':OUTPATH:';\,
  1911. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  1912. b = foreach a generate instate, 'true', 'false';
  1913. store b into ':OUTPATH:';\,
  1914. },
  1915. ]
  1916. },
  1917. {
  1918. 'name' => 'Limit',
  1919. 'tests' => [
  1920. {
  1921. 'num' => 1,
  1922. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
  1923. b = order a by $0, $1;
  1924. c = filter b by $0 > 'a'; -- break the sort/limit optimization
  1925. d = limit c 100;
  1926. store d into ':OUTPATH:';\,
  1927. 'sortArgs' => ['-t', ' ', '+0', '-1'],
  1928. },
  1929. {
  1930. 'num' => 2,
  1931. 'ignore23' => 'The record limit pick is different in 23',
  1932. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
  1933. b = order a by $0, $1;
  1934. c = limit b 100;
  1935. store c into ':OUTPATH:';\,
  1936. 'sortArgs' => ['-t', ' ', '+0', '-1'],
  1937. },
  1938. {
  1939. # Make sure that limit higher than number of rows doesn't mess stuff up
  1940. 'num' => 3,
  1941. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
  1942. b = order a by $0, $1;
  1943. c = filter b by $1 < 1000;
  1944. d = limit c 100000;
  1945. store d into ':OUTPATH:';\,
  1946. },
  1947. {
  1948. 'num' => 4,
  1949. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
  1950. b = distinct a;
  1951. c = limit b 100;
  1952. store c into ':OUTPATH:';\,
  1953. },
  1954. {
  1955. 'num' => 5,
  1956. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
  1957. b = load ':INPATH:/singlefile/votertab10k';
  1958. a1 = foreach a generate $0, $1;
  1959. b1 = foreach b generate $0, $1;
  1960. c = union a1, b1;
  1961. d = limit c 100;
  1962. store d into ':OUTPATH:';\,
  1963. },
  1964. {
  1965. 'num' => 6,
  1966. 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  1967. B = limit A 40;
  1968. C = filter B by age == 40;
  1969. D = group C by name;
  1970. E = foreach D generate group, COUNT(C);
  1971. store E into ':OUTPATH:';\,
  1972. },
  1973. {
  1974. 'num' => 7,
  1975. 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  1976. B = group A by name;
  1977. C = foreach B {
  1978. C1 = limit A 10;
  1979. generate group, COUNT(C1);
  1980. }
  1981. store C into ':OUTPATH:';\,
  1982. },
  1983. {
  1984. 'num' => 8,
  1985. 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  1986. B = group A by name;
  1987. C = foreach B {
  1988. C1 = filter A by age < 40;
  1989. C2 = limit C1 10;
  1990. generate group, COUNT(C2);
  1991. }
  1992. D = filter C by $1 > 0;
  1993. store D into ':OUTPATH:';\,
  1994. },
  1995. {
  1996. 'num' => 9,
  1997. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
  1998. b = order a by $0, $1;
  1999. c = limit b 1000/10;
  2000. store c into ':OUTPATH:';\,
  2001. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
  2002. b = order a by $0, $1;
  2003. c = limit b 100;
  2004. store c into ':OUTPATH:';\,
  2005. 'sortArgs' => ['-t', ' ', '-k1,2'],
  2006. },
  2007. {
  2008. 'num' => 10,
  2009. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2010. b = group a all;
  2011. c = foreach b generate COUNT(a) as count;
  2012. d = limit a c.count/10;
  2013. store d into ':OUTPATH:';\,
  2014. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2015. b = limit a 1000;
  2016. store b into ':OUTPATH:';\,
  2017. },
  2018. {
  2019. 'num' => 11,
  2020. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2021. b = group a all;
  2022. c = foreach b generate COUNT(a) as count;
  2023. d = load ':INPATH:/singlefile/votertab10k';
  2024. e = group d all;
  2025. f = foreach e generate COUNT(d) as count;
  2026. d = limit a c.count/10+f.count/10;
  2027. store d into ':OUTPATH:';\,
  2028. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2029. b = limit a 2000;
  2030. store b into ':OUTPATH:';\,
  2031. }
  2032. ]
  2033. },
  2034. {
  2035. 'name' => 'Split',
  2036. 'tests' => [
  2037. {
  2038. 'num' => 1,
  2039. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
  2040. split a into a1 if $0 > 'm', a2 if $0 <= 'm';
  2041. store a1 into ':OUTPATH:';\,
  2042. },
  2043. {
  2044. 'num' => 2,
  2045. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k';
  2046. split a into a1 if $0 > 'm', a2 if $0 <= 'm';
  2047. store a2 into ':OUTPATH:';\,
  2048. },
  2049. {
  2050. 'num' => 3,
  2051. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
  2052. split a into a1 if $0 > 'm', a2 if $0 <= 'm';
  2053. b = cogroup a1 by $1, a2 by $1;
  2054. c = foreach b generate flatten(a1), flatten(a2);
  2055. store c into ':OUTPATH:';\,
  2056. },
  2057. {
  2058. 'num' => 4,
  2059. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
  2060. split a into a1 if $0 > 'm', a2 if $0 <= 'm';
  2061. b = cogroup a1 by $1, a2 by $1;
  2062. c = foreach b generate flatten($1), flatten($2);
  2063. store c into ':OUTPATH:';\,
  2064. },
  2065. {
  2066. 'num' => 5,
  2067. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
  2068. split a into a1 if name > 'm', a2 if name <= 'm';
  2069. b = distinct a1;
  2070. store b into ':OUTPATH:';\,
  2071. },
  2072. {
  2073. 'num' => 6,
  2074. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
  2075. split a into a1 if age > 50, a2 if age <= 25;
  2076. b = order a2 by name;
  2077. store b into ':OUTPATH:';\,
  2078. 'sortArgs' => ['-t', ' ', '+0', '-1'],
  2079. },
  2080. {
  2081. 'num' => 7,
  2082. 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double);
  2083. split a into a1 if name > 'm', a2 if age < 50;
  2084. b = distinct a1;
  2085. store b into ':OUTPATH:';\,
  2086. },
  2087. {
  2088. 'num' => 8,
  2089. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2090. split a into a1 if age > 50, a2 if name < 'm';
  2091. b2 = foreach a2 generate name, 1;
  2092. b1 = foreach a1 generate name, 2;
  2093. c = cogroup b2 by name, b1 by name;
  2094. d = foreach c generate flatten(group), COUNT($1), COUNT($2);
  2095. store d into ':OUTPATH:';\,
  2096. },
  2097. {
  2098. 'num' => 9,
  2099. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2100. split a into a1 if age > 50, a2 if name < 'm';
  2101. b2 = distinct a2;
  2102. b1 = order a1 by name;
  2103. c = cogroup b2 by name, b1 by name;
  2104. d = foreach c generate flatten(group), COUNT($1), COUNT($2);
  2105. store d into ':OUTPATH:';\,
  2106. },
  2107. {
  2108. 'num' => 10,
  2109. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2110. split a into a1 if age > 50, a2 otherwise;
  2111. store a1 into ':OUTPATH:.1';
  2112. store a2 into ':OUTPATH:.2';\,
  2113. 'verify_pig_script' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2114. split a into a1 if age > 50, a2 if age<=50;
  2115. store a1 into ':OUTPATH:.1';
  2116. store a2 into ':OUTPATH:.2';\,
  2117. }
  2118. ]
  2119. },
  2120. {
  2121. 'name' => 'ImplicitSplit',
  2122. 'tests' => [
  2123. {
  2124. 'num' => 1,
  2125. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k';
  2126. b = filter a by $1 > 50;
  2127. c = filter a by $2 > 3.0;
  2128. d = cogroup b by $0, c by $0;
  2129. e = foreach d generate flatten(b), flatten(c);
  2130. store e into ':OUTPATH:';\,
  2131. },
  2132. {
  2133. 'num' => 2,
  2134. 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  2135. b = filter a by age > 50;
  2136. c = filter a by gpa > 3.0;
  2137. d = cogroup b by name, c by name;
  2138. e = foreach d generate flatten(b), flatten(c);
  2139. f = filter e by b::age < 75;
  2140. store f into ':OUTPATH:';\,
  2141. }
  2142. ]
  2143. },
  2144. {
  2145. 'name' => 'describe',
  2146. 'tests' => [
  2147. #JIRA[PIG-373]
  2148. {
  2149. 'num' => 1,
  2150. 'pig' => q\
  2151. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2152. describe A;
  2153. store A into ':OUTPATH:';\,
  2154. },
  2155. ],
  2156. },
  2157. {
  2158. 'name' => 'Sample',
  2159. 'tests' => [
  2160. {
  2161. 'num' => 1,
  2162. 'pig' => q\
  2163. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2164. S = sample A 2-1-1;
  2165. store S into ':OUTPATH:';\,
  2166. 'verify_pig_script' => q\
  2167. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2168. S = sample A 0;
  2169. store S into ':OUTPATH:';\,
  2170. },
  2171. {
  2172. 'num' => 2,
  2173. 'pig' => q\
  2174. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2175. B = group A all;
  2176. C = foreach B generate COUNT(A) as count;
  2177. D = group A all;
  2178. E = foreach D generate (double)COUNT(A) as count;
  2179. S = sample A E.count/C.count;
  2180. store S into ':OUTPATH:';\,
  2181. 'verify_pig_script' => q\
  2182. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2183. S = sample A 1;
  2184. store S into ':OUTPATH:';\,
  2185. },
  2186. ],
  2187. },
  2188. {
  2189. 'name' => 'MissingColumns',
  2190. 'tests' => [
  2191. {
  2192. 'num' => 1,
  2193. 'pig' => q\
  2194. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age: int, gpa: double, extra: chararray);
  2195. B = filter A by age > 50 or extra > 'm';
  2196. D = order B by age, extra;
  2197. store D into ':OUTPATH:';\,
  2198. 'sortArgs' => ['-t', ' ', '+1n', '-2'],
  2199. },
  2200. {
  2201. 'num' => 2,
  2202. 'pig' => q\
  2203. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage();
  2204. B = foreach A generate $0, $1 + 1, $3 + 1;
  2205. C = group B by ($0, $2);
  2206. D = foreach C generate flatten(group), COUNT($1);
  2207. store D into ':OUTPATH:';\,
  2208. },
  2209. {
  2210. 'num' => 3,
  2211. 'pig' => q\
  2212. A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: double);
  2213. B = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa, extra1, extra2);
  2214. C = join A by (name, age), B by (name, extra1);
  2215. store C into ':OUTPATH:';\,
  2216. # This pig query is expected to produce empty results. NOTE(review): this comment originally referred to a following SQL verification query, which is not present in this test entry.
  2217. }
  2218. ],
  2219. },
  2220. {
  2221. 'name' => 'Aliases',
  2222. # check access of a field using multiple valid aliases
  2223. 'tests' => [
  2224. {
  2225. # check that a free standing alias reference works
  2226. # when it is unambiguous
  2227. # check that a fully qualified alias reference works
  2228. # check that a partially qualified unambiguous alias reference works
  2229. 'num' => 1,
  2230. 'pig' => q\
  2231. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2232. b = group a by name;
  2233. c = foreach b generate flatten(a);
  2234. d = filter c by name != 'fred';
  2235. e = group d by name;
  2236. f = foreach e generate flatten(d);
  2237. g = foreach f generate name, d::a::name as dname, a::name as aname;
  2238. store g into ':OUTPATH:';\,
  2239. },
  2240. {
  2241. # check that the "group" alias is available
  2242. # after a flatten(group)
  2243. 'num' => 2,
  2244. 'pig' => q\
  2245. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2246. b = group a by name;
  2247. c = foreach b generate flatten(group), COUNT(a) as cnt;
  2248. d = foreach c generate group;
  2249. store d into ':OUTPATH:';\,
  2250. },
  2251. ],
  2252. },
  2253. {
  2254. 'name' => 'Lineage',
  2255. #test if the right cast function is picked
  2256. 'tests' => [
  2257. {
  2258. 'num' => 1,
  2259. 'pig' => q\
  2260. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2261. b = load ':INPATH:/singlefile/textdoc' using TextLoader() as (sentence);
  2262. c = cogroup a ALL, b ALL;
  2263. d = foreach c generate flatten(a), flatten(b);
  2264. e = foreach d generate name, flatten(TOKENIZE((chararray)sentence)) as sentence;
  2265. f = foreach e generate CONCAT((chararray)name, sentence);
  2266. store f into ':OUTPATH:';\,
  2267. },
  2268. {
  2269. 'num' => 2,
  2270. 'pig' => q\
  2271. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa: double);
  2272. b = load ':INPATH:/singlefile/textdoc' using TextLoader() as (sentence);
  2273. c = cross a, b;
  2274. d = foreach c generate name, flatten(TOKENIZE((chararray)sentence)) as sentence;
  2275. e = foreach d generate CONCAT((chararray)name, sentence);
  2276. store e into ':OUTPATH:';\,
  2277. },
  2278. {
  2279. 'num' => 3,
  2280. 'pig' => q\
  2281. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa: double);
  2282. b = foreach a generate age as student_age;
  2283. c = filter b by student_age > 50;
  2284. d = foreach c generate student_age + 10;
  2285. store d into ':OUTPATH:';\,
  2286. },
  2287. {
  2288. 'num' => 4,
  2289. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2290. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2291. b = filter a by name lt 'b';
  2292. c = foreach b generate org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, (int)age);
  2293. d = foreach c generate $0#'alice young';
  2294. split d into e if $0 is not null, f if $0 is null;
  2295. store e into ':OUTPATH:';\,
  2296. }
  2297. ],
  2298. },
  2299. {
  2300. 'name' => 'Casts',
  2301. 'tests' => [
  2302. {
  2303. # check that a cast of a value of type
  2304. # same as the result type of the cast works
  2305. # when the value is treated as a bytearray
  2306. 'num' => 1,
  2307. 'floatpostprocess' => 1,
  2308. 'delimiter' => ' ',
  2309. 'pig' => q\
  2310. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2311. b = foreach a generate name, age, gpa;
  2312. store b into ':OUTPATH:.intermediate' using BinStorage();
  2313. c = load ':OUTPATH:.intermediate' using BinStorage();
  2314. -- after this load, the fields are treated as bytearrays though
  2315. -- they are actually "typed", test that the implicit casts
  2316. -- introduced by the operations in the foreach below will work fine
  2317. d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1, $2 + 0.2;
  2318. store d into ':OUTPATH:';\,
  2319. 'notmq' => 1,
  2320. },
  2321. {
  2322. # check that a cast of a value of type
  2323. # same as the result type of the cast works
  2324. # when the value is treated as a bytearray
  2325. 'num' => 2,
  2326. 'floatpostprocess' => 1,
  2327. 'delimiter' => ' ',
  2328. 'pig' => q\
  2329. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float);
  2330. b = foreach a generate name, age, gpa;
  2331. store b into ':OUTPATH:.intermediate' using BinStorage();
  2332. c = load ':OUTPATH:.intermediate' using BinStorage();
  2333. -- after this load, the fields are treated as bytearrays though
  2334. -- they are actually "typed", test that the implicit casts
  2335. -- introduced by the operations in the foreach below will work fine
  2336. d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1L, $2 + 0.2f;
  2337. store d into ':OUTPATH:';\,
  2338. 'notmq' => 1,
  2339. },
  2340. {
  2341. #check that a cast of a value of type
  2342. #same as the result type of the cast works
  2343. #when the value is treated as a bytearray
  2344. 'num' => 3,
  2345. 'floatpostprocess' => 1,
  2346. 'delimiter' => ' ',
  2347. 'pig' => q\
  2348. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float);
  2349. b = group a by name;
  2350. c = foreach b generate a, (1,2,3), ['key1'#'value1','key2'#'value2'];
  2351. -- store the bag, tuple and map
  2352. store c into ':OUTPATH:.intermediate' using BinStorage();
  2353. d = load ':OUTPATH:.intermediate' using BinStorage() as (b:bag{t:tuple(x,y,z)}, t2:tuple(a,b,c), m:map[]);
  2354. -- after this load, the fields are treated as bytearrays though
  2355. -- they are actually "typed", test that the implicit casts
  2356. -- introduced by the operations in the foreach below will work fine
  2357. e = foreach d generate COUNT(b), t2.a, t2.b, t2.c, m#'key1', m#'key2';
  2358. store e into ':OUTPATH:';\,
  2359. 'notmq' => 1,
  2360. },
  2361. {
  2362. # check that a cast of a value of type
  2363. # same as the result type of the cast works
  2364. # when the value is treated as a bytearray
  2365. 'num' => 4,
  2366. 'floatpostprocess' => 1,
  2367. 'delimiter' => ' ',
  2368. 'pig' => q\
  2369. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2370. b = foreach a generate name, age, gpa;
  2371. store b into ':OUTPATH:.intermediate' using PigStorage();
  2372. c = load ':OUTPATH:.intermediate' using PigStorage();
  2373. -- after this load, the fields are treated as bytearrays though
  2374. -- they are actually "typed", test that the implicit casts
  2375. -- introduced by the operations in the foreach below will work fine
  2376. d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1, $2 + 0.2;
  2377. store d into ':OUTPATH:';\,
  2378. 'notmq' => 1,
  2379. },
  2380. {
  2381. # check that a cast of a value of type
  2382. # same as the result type of the cast works
  2383. # when the value is treated as a bytearray
  2384. 'num' => 5,
  2385. 'floatpostprocess' => 1,
  2386. 'delimiter' => ' ',
  2387. 'pig' => q\
  2388. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float);
  2389. b = foreach a generate name, age, gpa;
  2390. store b into ':OUTPATH:.intermediate' using PigStorage();
  2391. c = load ':OUTPATH:.intermediate' using PigStorage();
  2392. -- after this load, the fields are treated as bytearrays though
  2393. -- they are actually "typed", test that the implicit casts
  2394. -- introduced by the operations in the foreach below will work fine
  2395. d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1L, $2 + 0.2f;
  2396. store d into ':OUTPATH:';\,
  2397. 'notmq' => 1,
  2398. },
  2399. {
  2400. # check that a cast of a value of a complex type (bag, tuple, map)
  2401. # same as the result type of the cast works
  2402. # when the value is treated as a bytearray
  2403. 'num' => 6,
  2404. 'floatpostprocess' => 1,
  2405. 'delimiter' => ' ',
  2406. 'pig' => q\
  2407. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float);
  2408. b = group a by name;
  2409. c = foreach b generate a, (1,2,3), ['key1'#'value1','key2'#'value2'];
  2410. -- store the bag, tuple and map
  2411. store c into ':OUTPATH:.intermediate' using PigStorage();
  2412. d = load ':OUTPATH:.intermediate' using PigStorage() as (b:bag{t:tuple(x,y,z)}, t2:tuple(a,b,c), m:map[]);
  2413. -- after this load, the fields are treated as bytearrays though
  2414. -- they are actually "typed", test that the implicit casts
  2415. -- introduced by the operations in the foreach below will work fine
  2416. e = foreach d generate COUNT(b), t2.a, t2.b, t2.c, m#'key1', m#'key2';
  2417. store e into ':OUTPATH:';\,
  2418. 'notmq' => 1,
  2419. },
  2420. {
  2421. 'num' => 7,
  2422. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name, age, gpa, instate);
  2423. b = foreach a generate (boolean)instate;
  2424. c = filter b by instate == true;
  2425. store c into ':OUTPATH:';\,
  2426. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  2427. b = foreach a generate instate;
  2428. c = filter b by instate == 'true';
  2429. store c into ':OUTPATH:';\,
  2430. }
  2431. ],
  2432. },
  2433. {
  2434. 'name' => 'ClassResolution',
  2435. 'tests' => [
  2436. {
  2437. # check that Loader specified without a package
  2438. # name works if that package name is specified
  2439. # in udf.import.list
  2440. 'num' => 1,
  2441. 'floatpostprocess' => 1,
  2442. 'delimiter' => ' ',
  2443. 'java_params' => ['-Dudf.import.list=org.apache.pig.test.udf.storefunc'],
  2444. 'pig' => q\
  2445. register :FUNCPATH:/testudf.jar;
  2446. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double);
  2447. b = foreach a generate CONCAT('(', name), CONCAT((chararray)age, ' )');
  2448. store b into ':OUTPATH:.intermediate' using PigStorage(',');
  2449. c = load ':OUTPATH:.intermediate' using DumpLoader();
  2450. store c into ':OUTPATH:';\,
  2451. 'notmq' => 1,
  2452. },
  2453. ],
  2454. },
  2455. {
  2456. 'name' => 'MergeJoin',
  2457. 'tests' => [
  2458. # Simplest merge-join.
  2459. {
  2460. 'num' => 1,
  2461. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2462. b = load ':INPATH:/singlefile/votertab10k';
  2463. c = order a by $0;
  2464. d = order b by $0;
  2465. store c into ':OUTPATH:.intermediate1';
  2466. store d into ':OUTPATH:.intermediate2';
  2467. exec;
  2468. e = load ':OUTPATH:.intermediate1';
  2469. f = load ':OUTPATH:.intermediate2';
  2470. g = join e by $0, f by $0 using 'merge';
  2471. store g into ':OUTPATH:';\,
  2472. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2473. b = load ':INPATH:/singlefile/votertab10k';
  2474. g = join a by $0, b by $0;
  2475. store g into ':OUTPATH:';\,
  2476. 'notmq' => 1,
  2477. },
  2478. # Merge-join with left-side filter
  2479. {
  2480. 'num' => 2,
  2481. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2482. b = load ':INPATH:/singlefile/votertab10k';
  2483. c = order a by $0;
  2484. d = order b by $0;
  2485. store c into ':OUTPATH:.intermediate1';
  2486. store d into ':OUTPATH:.intermediate2';
  2487. exec;
  2488. e = load ':OUTPATH:.intermediate1';
  2489. h = filter e by $1 > 30;
  2490. f = load ':OUTPATH:.intermediate2';
  2491. g = join h by $0, f by $0 using 'merge';
  2492. store g into ':OUTPATH:';\,
  2493. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2494. b = load ':INPATH:/singlefile/votertab10k';
  2495. h = filter a by $1 > 30;
  2496. g = join h by $0, b by $0;
  2497. store g into ':OUTPATH:';\,
  2498. 'notmq' => 1,
  2499. },
  2500. # Merge-join with right-side filter
  2501. {
  2502. 'num' => 3,
  2503. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2504. b = load ':INPATH:/singlefile/votertab10k';
  2505. c = order a by $0;
  2506. d = order b by $0;
  2507. store c into ':OUTPATH:.intermediate1';
  2508. store d into ':OUTPATH:.intermediate2';
  2509. exec;
  2510. e = load ':OUTPATH:.intermediate1';
  2511. f = load ':OUTPATH:.intermediate2';
  2512. i = filter f by $2 != 'democrat';
  2513. g = join e by $0, i by $0 using 'merge';
  2514. store g into ':OUTPATH:';\,
  2515. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2516. b = load ':INPATH:/singlefile/votertab10k';
  2517. i = filter b by $2 != 'democrat';
  2518. g = join a by $0, i by $0;
  2519. store g into ':OUTPATH:';\,
  2520. 'notmq' => 1,
  2521. },
  2522. # Merge-join with schemas
  2523. {
  2524. 'num' => 4,
  2525. 'floatpostprocess' => 1,
  2526. 'delimiter' => ' ',
  2527. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2528. b = load ':INPATH:/singlefile/votertab10k';
  2529. c = order a by $0;
  2530. d = order b by $0;
  2531. store c into ':OUTPATH:.intermediate1';
  2532. store d into ':OUTPATH:.intermediate2';
  2533. exec;
  2534. e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float);
  2535. f = load ':OUTPATH:.intermediate2' as (name:chararray, age:int, reg:chararray, contrib:float);
  2536. g = join e by $0, f by $0 using 'merge';
  2537. store g into ':OUTPATH:';\,
  2538. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2539. b = load ':INPATH:/singlefile/votertab10k';
  2540. g = join a by $0, b by $0;
  2541. store g into ':OUTPATH:';\,
  2542. 'notmq' => 1,
  2543. },
  2544. # Merge-join with a multi-column (tuple) key expression
  2545. {
  2546. 'num' => 5,
  2547. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2548. b = load ':INPATH:/singlefile/votertab10k';
  2549. c = order a by $0,$1;
  2550. d = order b by $0,$1;
  2551. store c into ':OUTPATH:.intermediate1';
  2552. store d into ':OUTPATH:.intermediate2';
  2553. exec;
  2554. e = load ':OUTPATH:.intermediate1';
  2555. f = load ':OUTPATH:.intermediate2';
  2556. g = join e by ($0,$1), f by ($0,$1) using 'merge';
  2557. store g into ':OUTPATH:';\,
  2558. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2559. b = load ':INPATH:/singlefile/votertab10k';
  2560. g = join a by ($0,$1), b by ($0,$1);
  2561. store g into ':OUTPATH:';\,
  2562. 'notmq' => 1,
  2563. },
  2564. # Merge-join with an expression as the key. This expression ($1+10) guarantees ordering.
  2565. {
  2566. 'num' => 6,
  2567. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2568. b = load ':INPATH:/singlefile/votertab10k';
  2569. c = order a by $1;
  2570. d = order b by $1;
  2571. store c into ':OUTPATH:.intermediate1';
  2572. store d into ':OUTPATH:.intermediate2';
  2573. exec;
  2574. e = load ':OUTPATH:.intermediate1';
  2575. f = load ':OUTPATH:.intermediate2';
  2576. g = join e by ($1+10), f by ($1+10) using 'merge';
  2577. store g into ':OUTPATH:';\,
  2578. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2579. b = load ':INPATH:/singlefile/votertab10k';
  2580. g = join a by ($1+10), b by ($1+10) ;
  2581. store g into ':OUTPATH:';\,
  2582. 'notmq' => 1,
  2583. },
  2584. # Merge-join with nulls in keys and data.
  2585. {
  2586. 'num' => 7,
  2587. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
  2588. b = load ':INPATH:/singlefile/voternulltab10k';
  2589. c = order a by $0;
  2590. d = order b by $0;
  2591. store c into ':OUTPATH:.intermediate1';
  2592. store d into ':OUTPATH:.intermediate2';
  2593. exec;
  2594. e = load ':OUTPATH:.intermediate1';
  2595. f = load ':OUTPATH:.intermediate2';
  2596. g = join e by $0, f by $0 using 'merge';
  2597. store g into ':OUTPATH:';\,
  2598. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
  2599. b = load ':INPATH:/singlefile/voternulltab10k';
  2600. g = join a by $0, b by $0;
  2601. store g into ':OUTPATH:';\,
  2602. 'notmq' => 1,
  2603. },
  2604. # Merge-join with one file across multiple blocks
  2605. {
  2606. 'num' => 8,
  2607. 'execonly' => 'mapred', # since this join will run out of memory in local mode
  2608. 'floatpostprocess' => 1,
  2609. 'delimiter' => ' ',
  2610. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k';
  2611. b = load ':INPATH:/singlefile/studenttab20m';
  2612. h = filter b by $2 < 1.5;
  2613. c = order a by $0;
  2614. d = order h by $0 parallel 1;
  2615. store c into ':OUTPATH:.intermediate1';
  2616. store d into ':OUTPATH:.intermediate2';
  2617. exec;
  2618. e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, reg:chararray, contrib:float);
  2619. f = load ':OUTPATH:.intermediate2'as (name:chararray, age:int, gpa:float);
  2620. g = join e by $0, f by $0 using 'merge';
  2621. i = filter g by $2 == 'democrat' and $1 > 76;
  2622. store i into ':OUTPATH:';\,
  2623. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k';
  2624. b = load ':INPATH:/singlefile/studenttab20m';
  2625. h = filter b by $2 < 1.5;
  2626. g = join a by $0, h by $0;
  2627. i = filter g by $2 == 'democrat' and $1 > 76;
  2628. store i into ':OUTPATH:';\,
  2629. 'notmq' => 1,
  2630. },
  2631. # Merge-join with join on numeric key
  2632. {
  2633. 'num' => 9,
  2634. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2635. b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  2636. c = order a by age;
  2637. d = order b by age;
  2638. store c into ':OUTPATH:.intermediate1';
  2639. store d into ':OUTPATH:.intermediate2';
  2640. exec;
  2641. e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float);
  2642. f = load ':OUTPATH:.intermediate2' as (name:chararray, age:int, reg:chararray, contrib:float);
  2643. g = join e by age, f by age using 'merge';
  2644. store g into ':OUTPATH:';\,
  2645. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2646. b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  2647. g = join a by age, b by age;
  2648. store g into ':OUTPATH:';\,
  2649. 'notmq' => 1,
  2650. },
  2651. ]
  2652. },
  2653. {
  2654. 'name' => 'SkewedJoin',
  2655. 'floatpostprocess' => 1,
  2656. 'delimiter' => ' ',
  2657. 'tests' => [
  2658. {
  2659. 'num' => 1,
  2660. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
  2661. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2662. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2663. e = join a by name, b by name using 'skewed' parallel 8;
  2664. store e into ':OUTPATH:';\,
  2665. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2666. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2667. e = join a by name, b by name;
  2668. store e into ':OUTPATH:';\,
  2669. },
  2670. # basic join with no skewed keys
  2671. {
  2672. 'num' => 2,
  2673. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=10000'],
  2674. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age,
  2675. gpa);
  2676. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2677. e = join a by name, b by name using 'skewed';
  2678. store e into ':OUTPATH:';\,
  2679. 'verify_pig_script' =>q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age,
  2680. gpa);
  2681. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2682. e = join a by name, b by name ;
  2683. store e into ':OUTPATH:';\,
  2684. },
  2685. # join after filtering
  2686. {
  2687. 'num' => 3,
  2688. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=3'],
  2689. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age,
  2690. gpa);
  2691. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2692. c = filter a by age < 20;
  2693. d = filter b by age < 20;
  2694. e = join c by $0, d by $0 using 'skewed' parallel 8;
  2695. store e into ':OUTPATH:';\,
  2696. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age,
  2697. gpa);
  2698. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2699. c = filter a by age < 20;
  2700. d = filter b by age < 20;
  2701. e = join c by $0, d by $0 ;
  2702. store e into ':OUTPATH:';\,
  2703. },
  2704. # join by two columns
  2705. {
  2706. 'num' => 4,
  2707. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=3'],
  2708. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2709. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2710. c = filter a by age < 20;
  2711. d = filter b by age < 20;
  2712. e = join c by (name, age), d by (name, age) using 'skewed' parallel 8;
  2713. store e into ':OUTPATH:';\,
  2714. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2715. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2716. c = filter a by age < 20;
  2717. d = filter b by age < 20;
  2718. e = join c by (name, age), d by (name, age) ;
  2719. store e into ':OUTPATH:';\,
  2720. },
  2721. # join with add: the join keys are arithmetic expressions (age+10, age + 20)
  2722. {
  2723. 'num' => 5,
  2724. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=50'],
  2725. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double);
  2726. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2727. c = filter a by age < 20;
  2728. d = filter b by age < 20;
  2729. e = join c by age+10, d by age + 20 using 'skewed' parallel 10;
  2730. store e into ':OUTPATH:';\,
  2731. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double);
  2732. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2733. c = filter a by age < 20;
  2734. d = filter b by age < 20;
  2735. e = join c by age+10, d by age + 20 ;
  2736. store e into ':OUTPATH:';\,
  2737. },
  2738. # join with split
  2739. {
  2740. 'num' => 6,
  2741. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
  2742. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2743. b = filter a by $1 > 25;
  2744. c = join a by $0, b by $0 using 'skewed' parallel 7;
  2745. store c into ':OUTPATH:';\,
  2746. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2747. b = filter a by $1 > 25;
  2748. c = join a by $0, b by $0 ;
  2749. store c into ':OUTPATH:';\,
  2750. },
  2751. # join with UDF
  2752. {
  2753. 'num' => 7,
  2754. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=20'],
  2755. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2756. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2757. c = filter a by age < 20;
  2758. d = filter b by age < 20;
  2759. e = join c by SIZE(name), d by SIZE(name) using 'skewed' parallel 7;
  2760. store e into ':OUTPATH:';\,
  2761. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2762. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2763. c = filter a by age < 20;
  2764. d = filter b by age < 20;
  2765. e = join c by SIZE(name), d by SIZE(name) ;
  2766. store e into ':OUTPATH:';\,
  2767. },
  2768. # left outer join
  2769. {
  2770. 'num' => 8,
  2771. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
  2772. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2773. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2774. e = join a by name left outer, b by name using 'skewed' parallel 8;
  2775. store e into ':OUTPATH:';\,
  2776. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2777. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2778. e = join a by name left outer, b by name ;
  2779. store e into ':OUTPATH:';\,
  2780. },
  2781. # right outer join
  2782. {
  2783. 'num' => 9,
  2784. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
  2785. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2786. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2787. e = join a by name right outer, b by name using 'skewed' parallel 8;
  2788. store e into ':OUTPATH:';\,
  2789. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2790. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2791. e = join a by name right outer, b by name ;
  2792. store e into ':OUTPATH:';\,
  2793. },
  2794. # full outer join
  2795. {
  2796. 'num' => 10,
  2797. 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'],
  2798. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2799. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2800. e = join a by name full outer, b by name using 'skewed' parallel 8;
  2801. store e into ':OUTPATH:';\,
  2802. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  2803. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  2804. e = join a by name full outer, b by name ;
  2805. store e into ':OUTPATH:';\,
  2806. },
  2807. ]
  2808. },
  2809. {
  2810. 'name' => 'CollectedGroup',
  2811. 'tests' => [
  2812. # Simplest collected group.
  2813. {
  2814. 'num' => 1,
  2815. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2816. b = order a by $0;
  2817. store b into ':OUTPATH:.intermediate';
  2818. exec;
  2819. register :FUNCPATH:/testudf.jar;
  2820. c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader();
  2821. d = group c by $0 using 'collected';
  2822. e = foreach d generate group, COUNT(c);
  2823. store e into ':OUTPATH:';\,
  2824. 'notmq' => 1,
  2825. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2826. d = group a by $0 ;
  2827. e = foreach d generate group, COUNT(a);
  2828. store e into ':OUTPATH:';\,
  2829. },
  2830. # Collected group with filter
  2831. {
  2832. 'num' => 2,
  2833. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2834. b = order a by $0;
  2835. store b into ':OUTPATH:.intermediate';
  2836. exec;
  2837. register :FUNCPATH:/testudf.jar;
  2838. c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader();
  2839. d = filter c by $1 > 30;
  2840. e = group d by $0 using 'collected';
  2841. f = foreach e generate group, COUNT(d);
  2842. store f into ':OUTPATH:';\,
  2843. 'notmq' => 1,
  2844. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2845. d = filter a by $1 > 30;
  2846. e = group d by $0 ;
  2847. f = foreach e generate group, COUNT(d);
  2848. store f into ':OUTPATH:';\,
  2849. },
  2850. # Collected group with schemas
  2851. {
  2852. 'num' => 3,
  2853. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k';
  2854. b = order a by $0;
  2855. store b into ':OUTPATH:.intermediate';
  2856. exec;
  2857. register :FUNCPATH:/testudf.jar;
  2858. c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float);
  2859. d = group c by $0 using 'collected';
  2860. e = foreach d generate group, MAX(c.age);
  2861. store e into ':OUTPATH:';\,
  2862. 'notmq' => 1,
  2863. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2864. d = group a by $0 ;
  2865. e = foreach d generate group, MAX(a.$1);
  2866. store e into ':OUTPATH:';\,
  2867. },
  2868. # Collected group with multiple columns
  2869. {
  2870. 'num' => 4,
  2871. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2872. b = order a by name, age;
  2873. store b into ':OUTPATH:.intermediate';
  2874. exec;
  2875. register :FUNCPATH:/testudf.jar;
  2876. c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float);
  2877. d = group c by (name, age) using 'collected';
  2878. e = foreach d generate group.name, group.age, MIN(c.gpa);
  2879. store e into ':OUTPATH:';\,
  2880. 'notmq' => 1,
  2881. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2882. d = group a by (name, age) ;
  2883. e = foreach d generate group.name, group.age, MIN(a.gpa);
  2884. store e into ':OUTPATH:';\,
  2885. },
  2886. # Collected group with nulls in keys and data.
  2887. {
  2888. 'num' => 5,
  2889. 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
  2890. b = order a by $0;
  2891. store b into ':OUTPATH:.intermediate';
  2892. exec;
  2893. register :FUNCPATH:/testudf.jar;
  2894. c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float);
  2895. d = group c by $0 using 'collected';
  2896. e = foreach d generate group, SUM(c.$1);
  2897. store e into ':OUTPATH:';\,
  2898. 'notmq' => 1,
  2899. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float);
  2900. d = group a by $0 ;
  2901. e = foreach d generate group, SUM(a.$1);
  2902. store e into ':OUTPATH:';\,
  2903. },
  2904. # Collected group with numeric key
  2905. {
  2906. 'num' => 6,
  2907. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2908. b = order a by age;
  2909. store b into ':OUTPATH:.intermediate';
  2910. exec;
  2911. register :FUNCPATH:/testudf.jar;
  2912. c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float);
  2913. d = group c by age using 'collected';
  2914. e = foreach d generate group, AVG(c.gpa), COUNT(c.name);
  2915. store e into ':OUTPATH:';\,
  2916. 'notmq' => 1,
  2917. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2918. d = group a by age ;
  2919. e = foreach d generate group, AVG(a.gpa), COUNT(a.name);
  2920. store e into ':OUTPATH:';\,
  2921. },
  2922. ]
  2923. },
  2924. {
  2925. 'name' => 'SecondarySort',
  2926. 'tests' => [
  2927. {
  2928. # simple order by
  2929. 'num' => 1,
  2930. 'java_params' => ['-Dpig.accumulative.batchsize=5'],
  2931. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2932. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2933. b = group a by age parallel 10;
  2934. c = foreach b {
  2935. d = order a by name;
  2936. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d);
  2937. };
  2938. store c into ':OUTPATH:';\,
  2939. },
  2940. {
  2941. # order by desc
  2942. 'num' => 2,
  2943. 'java_params' => ['-Dpig.accumulative.batchsize=5'],
  2944. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2945. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  2946. b = group a by age parallel 10;
  2947. c = foreach b {
  2948. d = order a by name desc;
  2949. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d);
  2950. };
  2951. store c into ':OUTPATH:';\,
  2952. },
  2953. {
  2954. # order by float type
  2955. 'num' => 3,
  2956. 'java_params' => ['-Dpig.accumulative.batchsize=5'],
  2957. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2958. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2959. b = group a by age parallel 10;
  2960. c = foreach b {
  2961. d = order a by gpa;
  2962. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.gpa);
  2963. };
  2964. store c into ':OUTPATH:';\,
  2965. },
  2966. # order by string type
  2967. {
  2968. 'num' => 4,
  2969. 'java_params' => ['-Dpig.accumulative.batchsize=5'],
  2970. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2971. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2972. b = group a by age parallel 10;
  2973. c = foreach b {
  2974. d = order a by name;
  2975. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.name);
  2976. };
  2977. store c into ':OUTPATH:';\,
  2978. },
  2979. # simple distinct
  2980. {
  2981. 'num' => 5,
  2982. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  2983. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2984. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2985. b = group a by age parallel 10;
  2986. c = foreach b {
  2987. d = a.name;
  2988. e = distinct d;
  2989. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(e);
  2990. };
  2991. store c into ':OUTPATH:';\,
  2992. },
  2993. # distinct on tuple
  2994. {
  2995. 'num' => 6,
  2996. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  2997. 'pig' => q\register :FUNCPATH:/testudf.jar;
  2998. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  2999. b = group a by age parallel 10;
  3000. c = foreach b {
  3001. d = distinct a;
  3002. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d);
  3003. };
  3004. store c into ':OUTPATH:';\,
  3005. },
  3006. # sort by two columns
  3007. {
  3008. 'num' => 7,
  3009. 'java_params' => ['-Dpig.accumulative.batchsize=5'],
  3010. 'pig' => q\register :FUNCPATH:/testudf.jar;
  3011. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  3012. b = group a by age parallel 10;
  3013. c = foreach b {
  3014. d = order a by gpa, name desc;
  3015. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.gpa), org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.name);
  3016. };
  3017. store c into ':OUTPATH:';\,
  3018. },
  3019. # sort, distinct mix: order by name, then distinct on gpa
  3020. {
  3021. 'num' => 8,
  3022. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  3023. 'pig' => q\register :FUNCPATH:/testudf.jar;
  3024. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  3025. b = group a by age parallel 10;
  3026. c = foreach b {
  3027. d = order a by name;
  3028. e = d.gpa;
  3029. f = distinct e;
  3030. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(f);
  3031. };
  3032. store c into ':OUTPATH:';\,
  3033. },
  3034. # sort, distinct mix: order by gpa, then distinct on gpa
  3035. {
  3036. 'num' => 9,
  3037. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  3038. 'pig' => q\register :FUNCPATH:/testudf.jar;
  3039. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  3040. b = group a by age parallel 10;
  3041. c = foreach b {
  3042. d = order a by gpa;
  3043. e = d.gpa;
  3044. f = distinct e;
  3045. generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(f);
  3046. };
  3047. store c into ':OUTPATH:';\,
  3048. },
  3049. {
  3050. # secondary sort boolean
  3051. 'num' => 10,
  3052. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
  3053. b = group a by age;
  3054. c = foreach b {
  3055. d = order a by instate;
  3056. generate group, flatten(d);
  3057. };
  3058. store c into ':OUTPATH:';\,
  3059. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  3060. b = group a by age;
  3061. c = foreach b {
  3062. d = order a by instate;
  3063. generate group, flatten(d);
  3064. };
  3065. store c into ':OUTPATH:';\,
  3066. }
  3067. ]
  3068. },
  3069. {
  3070. 'name' => 'Accumulator',
  3071. 'tests' => [
  3072. {
  3073. 'num' => 1,
  3074. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  3075. 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa);
  3076. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  3077. e = cogroup a by name, b by name parallel 8;
  3078. f = foreach e generate group, SUM(a.age) as s;
  3079. g = filter f by s>0;
  3080. store g into ':OUTPATH:';\,
  3081. },
  3082. {
  3083. 'num' => 2,
  3084. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  3085. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
  3086. e = group a by name parallel 8;
  3087. f = foreach e generate group, COUNT(a), MAX(a.contributions), MIN(a.contributions) ;
  3088. store f into ':OUTPATH:';\,
  3089. },
  3090. {
  3091. 'num' => 3,
  3092. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  3093. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
  3094. e = group a by name parallel 8;
  3095. f = foreach e generate group, (MAX(a.contributions)-MIN(a.contributions))*COUNT(a) ;
  3096. store f into ':OUTPATH:';\,
  3097. },
  3098. {
  3099. 'num' => 4,
  3100. 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'],
  3101. 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions);
  3102. e = group a by name parallel 8;
  3103. f = foreach e { g = distinct a.age; generate group, COUNT(g);}
  3104. store f into ':OUTPATH:';\,
  3105. }
  3106. ]
  3107. },
  3108. {
  3109. 'name' => 'PruneColumns',
  3110. 'tests' => [
  3111. {
  3112. 'num' => 1,
  3113. 'execonly' => 'mapred', # studenttab20m not available in local mode
  3114. 'pig' => q\
  3115. a = load ':INPATH:/singlefile/studenttab20m' using PigStorage() as (name, age, gpa);
  3116. b = foreach a generate age;
  3117. store b into ':OUTPATH:';\,
  3118. }
  3119. ]
  3120. },
  3121. {
  3122. 'name' => 'Bzip',
  3123. 'tests' => [
  3124. {
  3125. # test reading and writing out files with .bz2 extension
  3126. 'num' => 1,
  3127. 'pig' => q\
  3128. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3129. store a into ':OUTPATH:.intermediate.bz2';
  3130. b = load ':OUTPATH:.intermediate.bz2';
  3131. store b into ':OUTPATH:';\,
  3132. 'notmq' => 1,
  3133. },
  3134. {
  3135. # test reading and writing with .bz extension
  3136. 'num' => 2,
  3137. 'pig' => q\
  3138. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3139. store a into ':OUTPATH:.intermediate.bz';
  3140. b = load ':OUTPATH:.intermediate.bz';
  3141. store b into ':OUTPATH:';\,
  3142. 'notmq' => 1,
  3143. },
  3144. ]
  3145. },
  3146. {
  3147. 'name' => 'Scalar',
  3148. 'tests' => [
  3149. {
  3150. # test scalar in foreach (most common)
  3151. 'num' => 1,
  3152. 'pig' => q\
  3153. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3154. b = group a all;
  3155. c = foreach b generate AVG(a.gpa) as avg, MAX(a.gpa) as max;
  3156. y = foreach a generate name, (gpa - c.avg) / c.max;
  3157. store y into ':OUTPATH:';\,
  3158. 'floatpostprocess' => 1,
  3159. 'delimiter' => ' ',
  3160. },
  3161. {
  3162. # test scalar in filter
  3163. 'num' => 2,
  3164. 'pig' => q\
  3165. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3166. b = group a all;
  3167. c = foreach b generate AVG(a.gpa) as avg;
  3168. y = filter a by gpa > c.avg;
  3169. store y into ':OUTPATH:';\,
  3170. },
  3171. {
  3172. # test scalar with two branches
  3173. 'num' => 3,
  3174. 'pig' => q\
  3175. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3176. b = group a all;
  3177. c = foreach b generate AVG(a.age) as avg;
  3178. x = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age, registration, contributions);
  3179. y = filter x by age > c.avg;
  3180. store y into ':OUTPATH:';\,
  3181. },
  3182. {
  3183. # test with scalar from two inputs
  3184. 'num' => 4,
  3185. 'pig' => q\
  3186. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3187. b = group a all;
  3188. c = foreach b generate AVG(a.age) as avg;
  3189. d = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age, registration, contributions);
  3190. e = group d all;
  3191. f = foreach e generate AVG(d.age) as avg;
  3192. y = foreach a generate age/c.avg, age/f.avg;
  3193. store y into ':OUTPATH:';\,
  3194. },
  3195. ]
  3196. },
  3197. {
  3198. 'name' => 'Scripting',
  3199. 'tests' => [
  3200. {
  3201. # test integer square
  3202. 'num' => 1,
  3203. 'ignore23' => 'MAPREDUCE-3700',
  3204. 'pig' => q\
  3205. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3206. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3207. b = foreach a generate myfuncs.square(age);
  3208. store b into ':OUTPATH:';\,
  3209. 'verify_pig_script' => q\
  3210. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3211. b = foreach a generate age * age;
  3212. store b into ':OUTPATH:';\,
  3213. },
  3214. {
  3215. # test string concat and referencing function without a namespace
  3216. 'num' => 2,
  3217. 'ignore23' => 'MAPREDUCE-3700',
  3218. 'pig' => q\
  3219. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython;
  3220. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa);
  3221. b = foreach a generate concat(name) as name;
  3222. store b into ':OUTPATH:';\,
  3223. 'verify_pig_script' => q\
  3224. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  3225. b = foreach a generate CONCAT(name, name);
  3226. store b into ':OUTPATH:';\,
  3227. },
  3228. {
  3229. # test long and double square, plus two references to the same UDF with different schemas
  3230. 'num' => 3,
  3231. 'ignore23' => 'MAPREDUCE-3700',
  3232. 'pig' => q\
  3233. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3234. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:double);
  3235. b = foreach a generate myfuncs.square(age), myfuncs.square(gpa);
  3236. store b into ':OUTPATH:';\,
  3237. 'verify_pig_script' => q\
  3238. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3239. b = foreach a generate age * age, gpa * gpa;
  3240. store b into ':OUTPATH:';\,
  3241. 'floatpostprocess' => 1,
  3242. 'delimiter' => ' ',
  3243. },
  3244. {
  3245. # test method with no schema decorator (ie, returns bytearray)
  3246. 'num' => 4,
  3247. 'ignore23' => 'MAPREDUCE-3700',
  3248. 'pig' => q\
  3249. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3250. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa);
  3251. b = foreach a generate myfuncs.byteconcat(name);
  3252. store b into ':OUTPATH:';\,
  3253. 'verify_pig_script' => q\
  3254. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3255. b = foreach a generate CONCAT(name, name);
  3256. store b into ':OUTPATH:';\,
  3257. },
  3258. {
  3259. # test complex types (map, tuple, bag) passed into and returned from a UDF
  3260. 'num' => 5,
  3261. 'ignore23' => 'MAPREDUCE-3700',
  3262. 'pig' => q\
  3263. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3264. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  3265. b = foreach a generate flatten(myfuncs.complexTypes(m, t, b)) as (mm, mt, mb);
  3266. c = foreach b generate mm#'name', mt.$0, mb.$0;
  3267. store c into ':OUTPATH:';\,
  3268. 'verify_pig_script' => q\
  3269. a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)});
  3270. b = foreach a generate SIZE(m#'name'), t.$2, b.$2;
  3271. store b into ':OUTPATH:';\,
  3272. },
  3273. {
  3274. # test null input and output
  3275. 'num' => 6,
  3276. 'ignore23' => 'MAPREDUCE-3700',
  3277. 'pig' => q\
  3278. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3279. a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double);
  3280. b = foreach a generate myfuncs.square(age);
  3281. store b into ':OUTPATH:';\,
  3282. 'verify_pig_script' => q\
  3283. a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double);
  3284. b = foreach a generate age * age;
  3285. store b into ':OUTPATH:';\,
  3286. },
  3287. {
  3288. # test functions that call other functions and include other files
  3289. 'num' => 7,
  3290. 'ignore23' => 'MAPREDUCE-3700',
  3291. 'pig' => q\
  3292. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3293. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3294. b = foreach a generate myfuncs.redirect(age);
  3295. store b into ':OUTPATH:';\,
  3296. 'verify_pig_script' => q\
  3297. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3298. b = foreach a generate age * age;
  3299. store b into ':OUTPATH:';\,
  3300. },
  3301. {
  3302. # test that functions with same names resolve correctly across name spaces
  3303. 'num' => 8,
  3304. 'ignore23' => 'MAPREDUCE-3700',
  3305. 'pig' => q\
  3306. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3307. register ':SCRIPTHOMEPATH:/python/morepythonudfs.py' using jython as morefuncs;
  3308. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3309. b = foreach a generate myfuncs.square(age), morefuncs.square(age);
  3310. store b into ':OUTPATH:';\,
  3311. 'verify_pig_script' => q\
  3312. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3313. b = foreach a generate age * age, age * age * age;
  3314. store b into ':OUTPATH:';\,
  3315. },
  3316. {
  3317. # test a UDF that takes a bag as input (myfuncs.count(a), verified against builtin COUNT)
  3318. 'num' => 9,
  3319. 'ignore23' => 'MAPREDUCE-3700',
  3320. 'pig' => q\
  3321. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3322. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3323. b = group a by name;
  3324. c = foreach b generate group, myfuncs.count(a);
  3325. store c into ':OUTPATH:';\,
  3326. 'verify_pig_script' => q\
  3327. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double);
  3328. b = group a by name;
  3329. c = foreach b generate group, COUNT(a);
  3330. store c into ':OUTPATH:';\,
  3331. },
  3332. {
  3333. # test a UDF that takes a boolean argument (myfuncs.adjustgpa(gpa, instate))
  3334. 'num' => 10,
  3335. 'ignore23' => 'MAPREDUCE-3700',
  3336. 'pig' => q\
  3337. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3338. a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean);
  3339. b = foreach a generate name, myfuncs.adjustgpa(gpa, instate);
  3340. store b into ':OUTPATH:';\,
  3341. 'verify_pig_script' => q\
  3342. a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  3343. b = foreach a generate name, (instate=='true'?gpa:gpa+1);
  3344. store b into ':OUTPATH:';\,
  3345. },
  3346. {
  3347. # test myfuncs.isretired(age), verified against (age>=60?1:0)
  3348. 'num' => 11,
  3349. 'ignore' => 1, # PIG-2596
  3350. 'pig' => q\
  3351. register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs;
  3352. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  3353. b = foreach a generate name, myfuncs.isretired(age);
  3354. store b into ':OUTPATH:';\,
  3355. 'verify_pig_script' => q\
  3356. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double);
  3357. b = foreach a generate name, (age>=60?1:0);
  3358. store b into ':OUTPATH:';\,
  3359. }
  3360. ]
  3361. },
  3362. {
  3363. 'name' => 'Native',
  3364. 'tests' => [
  3365. {
  3366. # test common
  3367. 'num' => 1,
  3368. 'pig' => q\
  3369. rmf table_testNativeMRJobSimple_input
  3370. rmf table_testNativeMRJobSimple_output
  3371. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3372. b = mapreduce ':MAPREDJARS:/hadoop-examples.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`;
  3373. store b into ':OUTPATH:';\,
  3374. 'notmq' => 1,
  3375. },
  3376. {
  3377. # test complex
  3378. 'num' => 2,
  3379. 'pig' => q\
  3380. rmf table_testNativeMRJobSimple_input
  3381. rmf table_testNativeMRJobSimple_output
  3382. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3383. b = foreach a generate name;
  3384. c = distinct b;
  3385. d = mapreduce ':MAPREDJARS:/hadoop-examples.jar' Store c into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`;
  3386. e = order d by name;
  3387. store e into ':OUTPATH:';\,
  3388. 'sortArgs' => ['-t', ' '],
  3389. 'notmq' => 1,
  3390. },
  3391. {
  3392. # test streaming
  3393. 'num' => 3,
  3394. 'pig' => q\
  3395. rmf table_testNativeMRJobSimple_input
  3396. rmf table_testNativeMRJobSimple_output
  3397. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3398. b = mapreduce ':MAPREDJARS:/hadoop-streaming.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `-input table_testNativeMRJobSimple_input -output table_testNativeMRJobSimple_output -mapper /bin/cat -reducer /usr/bin/wc`;
  3399. store b into ':OUTPATH:';\,
  3400. 'pig23' => q\
  3401. rmf table_testNativeMRJobSimple_input
  3402. rmf table_testNativeMRJobSimple_output
  3403. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3404. b = mapreduce ':MAPREDJARS:/hadoop-0.23.0-streaming.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `-input table_testNativeMRJobSimple_input -output table_testNativeMRJobSimple_output -mapper /bin/cat -reducer /usr/bin/wc`;
  3405. store b into ':OUTPATH:';\,
  3406. 'notmq' => 1,
  3407. },
  3408. ]
  3409. },
  3410. {
  3411. 'name' => 'Partitioner',
  3412. 'tests' => [
  3413. {
  3414. # test group with a custom partitioner (PARTITION BY ... parallel 2)
  3415. 'num' => 1,
  3416. 'execonly' => 'mapred', # original reason: "will run out of memory in local mode"; NOTE(review): it said "this join", but the script below only does a group - confirm the reason
  3417. 'pig' => q\register :FUNCPATH:/testudf.jar;
  3418. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa);
  3419. b = group a by age PARTITION BY org.apache.pig.test.utils.SimpleCustomPartitioner2 parallel 2;
  3420. c = foreach b generate group, COUNT(a);
  3421. store c into ':OUTPATH:';\,
  3422. },
  3423. ]
  3424. },
  3425. {
  3426. ####################################################################
  3427. # SUB : CastScalar
  3428. # FEATURE: adds functionality that allows casting elements of a single-tuple relation into a scalar value.
  3429. # JIRA: Pig-1434
  3430. #
  3431. # TEST ITEMS:
  3432. # 1 Test syntax
  3433. # 2 Test scalar for simple data type
  3434. # 3 Test scalar for complex data type: tuple, bag, map
  3435. # 4 Test implicit cast
  3436. # 5 Test explicit cast
  3437. # 6 Positional parameter
  3438. # 7 Cast within an aggregate function
  3439. # 8 Cast within an UDF function
  3440. # 9 Cast with a FOREACH
  3441. # 10 Cast with a FILTER
  3442. # 11 Cast with a SPLIT
  3443. # 12 Cast in a JOIN
  3444. # 13 Multiquery
  3445. # 14 Cast on a schema that cannot be inferred should result in bytearray
  3446. # 15 Replicated Join
  3447. # 16 Test operations such as R1 * (int)R1
  3448. # 17 CheckSingular(*)
  3449. # 18 missing field in scalar file
  3450. # 19 scalar referenced from an empty file
  3451. # 20 empty input directory
  3452. # 21 Single row vs Multiple Row
  3453. # 22 Cast on a multi-field tuple
  3454. # 23 Reference a non-scalar as a scalar
  3455. # 24 Test multiple loaders
  3456. 'name' => 'CastScalar',
  3457. 'tests' => [
  3458. {
  3459. # 2 Test scalar for simple data type
  3460. # 3 Test scalar for complex data type: tuple, bag, map
  3461. # 9 Cast with a FOREACH
  3462. #INPATH = /user/hadoopqa/pig/tests/data
  3463. 'num' => 1,
  3464. 'pig' => q#
  3465. a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  3466. b = group a all;
  3467. c = foreach b generate SUM(a.age) as total;
  3468. d = foreach a generate name, age+(double)c.total as d_sum;
  3469. e = order d by name, d_sum;
  3470. store d into ':OUTPATH:';
  3471. #,
  3472. # 6 Positional parameter
  3473. }, {
  3474. 'num' => 2,
  3475. 'pig' => q#
  3476. a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float);
  3477. b = group a all;
  3478. c = foreach b generate SUM(a.age) as total;
  3479. d = foreach a generate name, age+(double)c.$0 as d_sum;
  3480. e = order d by name, d_sum;
  3481. store d into ':OUTPATH:';
  3482. #,
  3483. # 2 Test scalar for simple data type
  3484. # 3 Test scalar for complex data type:map
  3485. # 9 Cast with a FOREACH
  3486. # 13 Multiquery
  3487. # 24 Test multiple loaders
  3488. #INPATH = /user/hadoopqa/pig/tests/data
  3489. }, {
  3490. # 4 Test implicit cast
  3491. # 10 Cast with a FILTER
  3492. #
  3493. # I set the benchmark to use "19" because pig truncates during cast and sql rounds up.
  3494. 'num' => 7,
  3495. 'pig' => q\
  3496. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3497. b = group a by name;
  3498. c = foreach b generate group, AVG(a.gpa)+20 as avg_gpa;
  3499. d = order c by avg_gpa;
  3500. simple_scalar = limit d 1;
  3501. f = filter a by age < (int) simple_scalar.avg_gpa;
  3502. g = order f by name, age, gpa;
  3503. store g into ':OUTPATH:';\,
  3504. }, {
  3505. # 5 Test explicit cast
  3506. # 10 Cast with a FILTER
  3507. 'num' => 8,
  3508. 'pig' => q\
  3509. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3510. b = group a by name;
  3511. c = foreach b generate group, AVG(a.age) AS average;
  3512. d = order c by average;
  3513. simple_scalar = limit d 1;
  3514. d = filter a by age > (int) simple_scalar.average;
  3515. e = foreach d generate name, age;
  3516. store e into ':OUTPATH:';
  3517. \,
  3518. }, {
  3519. # 5 Test explicit cast
  3520. # 6 Positional parameter
  3521. 'num' => 9,
  3522. 'pig' => q\
  3523. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3524. b = group a by name;
  3525. c = foreach b generate group, AVG(a.age) AS average;
  3526. d = order c by average;
  3527. simple_scalar = limit d 1;
  3528. d = filter a by age > (int) simple_scalar.$1;
  3529. e = foreach d generate name, age;
  3530. store e into ':OUTPATH:';
  3531. \,
  3532. }, {
  3533. # 4 Test implicit cast
  3534. # 6 Positional parameter
  3535. # 10 Cast with a FILTER
  3536. 'num' => 10,
  3537. 'pig' => q\
  3538. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3539. b = group a by name;
  3540. c = foreach b generate group, AVG(a.age) AS average;
  3541. d = order c by average;
  3542. simple_scalar = limit d 1;
  3543. d = filter a by age > simple_scalar.$1;
  3544. e = foreach d generate name, age;
  3545. store e into ':OUTPATH:';
  3546. \,
  3547. }, {
  3548. # 4 Test implicit cast
  3549. # 6 Positional parameter
  3550. # 11 Cast with a SPLIT
  3551. 'num' => 11,
  3552. 'pig' => q\
  3553. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3554. b = group a by name;
  3555. c = foreach b generate group, AVG(a.age) AS average;
  3556. d = order c by average;
  3557. simple_scalar = limit d 1;
  3558. split a into X1 if age > (int) simple_scalar.$1, X2 if age < 20;
  3559. split a into X3 if age > (int) simple_scalar.$1, X4 if age > 70;
  3560. store X1 into ':OUTPATH:.1';
  3561. store X2 into ':OUTPATH:.2';
  3562. store X3 into ':OUTPATH:.3';
  3563. store X4 into ':OUTPATH:.4';
  3564. \,
  3565. }, {
  3566. # 4 Test implicit cast
  3567. # 6 Positional parameter
  3568. # 12 Cast with a JOIN
  3569. 'num' => 12,
  3570. 'pig' => q\
  3571. a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa);
  3572. b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
  3573. c = filter a by age < 20;
  3574. d = filter b by age < 20;
  3575. simple_scalar = limit d 1;
  3576. e = join c by name, d by name;
  3577. f= filter e by c::age <(int)simple_scalar.age;
  3578. store f into ':OUTPATH:';\,
  3579. },
  3580. ]
  3581. },{
  3582. 'name' => 'udf_TOBAGandTOTUPLE',
  3583. 'sortResults' => 1,
  3584. 'floatpostprocess' => 1,
  3585. 'delimiter' => ' ',
  3586. 'tests' => [
  3587. {
  3588. # TEST : resulting schema for TOBAG/TOTUPLE with simple types
  3589. # TEST : resulting schema for TOBAG/TOTUPLE with positional parameters
  3590. # TEST : resulting schema for various projects using a combination of TOBAG/TOTUPLE and standard projections
  3591. # TEST : resulting schema for various projects using a combination of TOBAG/TOTUPLE using AS clause
  3592. 'num' => 1
  3593. ,'pig' => q?
  3594. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3595. B = limit A 10;
  3596. Gen1 = FOREACH B GENERATE $0, $1, $2 ;
  3597. GroupById = GROUP B BY id;
  3598. B1 = foreach B generate TOBAG( intnum1000, id, intnum5);
  3599. B3 = foreach B generate TOBAG( $0, $1, $2);
  3600. T1= foreach B generate TOTUPLE( intnum1000, id, intnum5);
  3601. T2= foreach B generate TOTUPLE( $0, $1, $2);
  3602. T3 = foreach B generate TOTUPLE( $0, $0, $0);
  3603. T4= foreach B generate TOBAG($0, $1, $2), TOTUPLE($3, $4, $5), $6, $7;
  3604. T5= foreach B generate $0, $1, TOTUPLE($2, $3, $4), TOBAG($5, $6), $7;
  3605. T6= foreach B generate $0, TOTUPLE($0, $0, $0), TOBAG($0, $0), $0 AS duplicate;
  3606. describe Gen1;
  3607. describe GroupById;
  3608. describe B1;
  3609. describe B3;
  3610. describe T1;
  3611. describe T2;
  3612. describe T3;
  3613. describe T4;
  3614. describe T5;
  3615. describe T6;
  3616. ?
  3617. ,'expected_out_regex' => 'B1: {{int}}' # NOTE: duplicate hash key - overridden, see the note on the last 'expected_out_regex' below
  3618. ,'expected_out_regex' => 'B3: {{int}}'
  3619. ,'expected_out_regex' => 'T1: {org.apache.pig.builtin.totuple_id_.*: (intnum1000: int,id: int,intnum5: int)}'
  3620. ,'expected_out_regex' => 'T2: {org.apache.pig.builtin.totuple_id_.*: (intnum1000: int,id: int,intnum5: int)}'
  3621. ,'expected_out_regex' => 'T3: {org.apache.pig.builtin.totuple_intnum1000.*: (intnum1000: int,intnum1000: int,intnum1000: int)}'
  3622. ,'expected_out_regex' => 'T4: {{int},org.apache.pig.builtin.totuple_intnum100.*: (intnum100: int,intnum: int,longnum: long),floatnum: float,doublenum: double}'
  3623. ,'expected_out_regex' => 'T5: {intnum1000: int,id: int,org.apache.pig.builtin.totuple_intnum100.*: (intnum5: int,intnum100: int,intnum: int).*{NULL}.*doublenum: double}'
  3624. ,'expected_out_regex' => "T6: {intnum1000: int,org.apache.pig.builtin.totuple_intnum1000.*: \\(intnum1000: int,intnum1000: int,intnum1000: int\\),{\\(int\\)},duplicate: int}" # NOTE: only this T6 pattern takes effect: a Perl hash literal keeps the last value given for a repeated key, so the B1..T5 patterns above are discarded. NOTE(review): T1..T5 leave '(' and ')' unescaped (regex groups), unlike this pattern - presumably they would not match as written; confirm before merging them into one regex
  3625. }, {
  3626. # TEST : bag of mixed data types
  3627. # TEST : Order
  3628. # TEST : positional parameters
  3629. 'num' => 2
  3630. ,'pig' => q?
  3631. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3632. C = foreach A generate TOBAG( id, floatnum, doublenum );
  3633. D = foreach A generate TOBAG( id, intnum);
  3634. E = foreach A generate TOBAG( (float) id,floatnum );
  3635. F = foreach A generate TOBAG( (long) id,longnum );
  3636. G = foreach A generate TOBAG( (double) id,doublenum );
  3637. describe C;
  3638. describe D;
  3639. describe E;
  3640. describe F;
  3641. describe G;
  3642. ?
  3643. ,'expected_out_regex' => 'C: {{NULL}}' # NOTE: duplicate hash key - overridden, see the note on the last 'expected_out_regex' below
  3644. ,'expected_out_regex' => 'D: {{int}}'
  3645. ,'expected_out_regex' => 'E: {{float}}'
  3646. ,'expected_out_regex' => 'F: {{long}}'
  3647. ,'expected_out_regex' => 'G: {{double}}' # NOTE: only this G pattern takes effect: a Perl hash literal keeps the last value given for a repeated key, so the C..F patterns above are discarded
  3648. }, {
  3649. # TEST : TOBAG/TOTUPLE with simple types
  3650. # TEST : TOBAG/TOTUPLE with positional parameters
  3651. # TEST : various projects using a combination of TOBAG/TOTUPLE and standard projections
  3652. # TEST : various projects using a combination of TOBAG/TOTUPLE using AS clause
  3653. 'num' => 3
  3654. ,'pig' => q?
  3655. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3656. B = limit A 10;
  3657. B1 = foreach B generate TOBAG( intnum1000, id, intnum5);
  3658. B2 = foreach B generate TOBAG( $0, $1, $2);
  3659. T1= foreach B generate TOTUPLE( intnum1000, id, intnum5);
  3660. T2= foreach B generate TOTUPLE( $0, $1, $2);
  3661. T3 = foreach B generate TOTUPLE( $0, $0, $0);
  3662. T4= foreach B generate TOBAG($0, $1, $2), TOTUPLE($3, $4, $5), $6, $7;
  3663. T5= foreach B generate $0, $1, TOTUPLE($2, $3, $4), TOBAG($5, $6), $7;
  3664. T6= foreach B generate $0, TOTUPLE($0, $0, $0), TOBAG($0, $0), $0 AS duplicate;
  3665. Gen1 = FOREACH B GENERATE $0, $1, $2 ;
  3666. GroupById = GROUP B BY id;
  3667. store Gen1 into ':OUTPATH:.1';
  3668. store GroupById into ':OUTPATH:.2';
  3669. store B1 into ':OUTPATH:.3';
  3670. store B2 into ':OUTPATH:.4';
  3671. store T1 into ':OUTPATH:.5';
  3672. store T2 into ':OUTPATH:.6';
  3673. store T3 into ':OUTPATH:.7';
  3674. store T4 into ':OUTPATH:.8';
  3675. ?
  3676. }, {
  3677. # TEST : cast for TOTUPLE/TOBAG
  3678. 'num' => 4
  3679. ,'pig' => q?
  3680. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3681. B= limit A 10;
  3682. C = foreach B generate $0, TOTUPLE((int) $0, (long) $0, (double) $0), TOBAG( (float) $0, (chararray) $0), $0;
  3683. store C into ':OUTPATH:';
  3684. ?
  3685. ,'expected_err_regex' => 'ERROR 1108: Duplicate schema alias'
  3686. ,'rc' => 6
  3687. }, {
  3688. # TEST : cast for TOTUPLE/TOBAG
  3689. 'num' => 5
  3690. ,'pig' => q?
  3691. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3692. B= limit A 1;
  3693. C = foreach B generate $0, TOTUPLE((int) $0);
  3694. D = foreach B generate $0, TOTUPLE((long) $0);
  3695. E = foreach B generate $0, TOTUPLE((double) $0);
  3696. F = foreach B generate $0, TOTUPLE((float) $0);
  3697. G = foreach B generate $0, TOTUPLE((chararray) $0);
  3698. store B into ':OUTPATH:.1';
  3699. store C into ':OUTPATH:.2';
  3700. store D into ':OUTPATH:.3';
  3701. store E into ':OUTPATH:.4';
  3702. store F into ':OUTPATH:.5';
  3703. store G into ':OUTPATH:.6';
  3704. ?
  3705. }, {
  3706. #TEST more complicated nested functions such as TOTUPLE(TOBAG()) (relations tint, bint)
  3707. #TEST more complicated nested functions such as TOBAG(TOTUPLE()) (relation tinb)
  3708. #TEST more complicated nested functions such as TOTUPLE(TOTUPLE()) (relation tint)
  3709. #TEST more complicated nested functions such as TOBAG(TOBAG()) (relation binb)
  3710. 'num' => 6
  3711. ,'pig' => q?
  3712. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3713. B = limit A 10;
  3714. tint = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) );
  3715. bint = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) );
  3716. binb = foreach B generate TOBAG( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) );
  3717. tinb = foreach B generate TOBAG( TOTUPLE( $1, $2, $3),TOTUPLE($3, $4, $5) );
  3718. store B into ':OUTPATH:.1';
  3719. store tint into ':OUTPATH:.2';
  3720. store bint into ':OUTPATH:.3';
  3721. store binb into ':OUTPATH:.4';
  3722. store tinb into ':OUTPATH:.5';
  3723. ?
  3724. }, {
  3725. #TEST arithmetic operation in TOTUPLE and TOBAG
  3726. #TEST aggregate function - NOT IMPLEMENTED
  3727. #TEST tuple with 50+ items
  3728. #TEST with null
  3729. 'num' => 7
  3730. ,'pig' => q?
  3731. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3732. B = limit A 10;
  3733. B1= foreach B generate TOTUPLE( $1, $2, $3);
  3734. T1= foreach B generate TOTUPLE( $1, $2, $3);
  3735. R1= foreach B generate TOTUPLE( $1, $0+1, $0+2, $0+3),TOBAG($0+4, $0+1 );
  3736. R2= foreach B generate TOTUPLE( $0, $1, $2, $3, $4, $5, $6, $7, (int) 8, (int) 9 , $1, $2, $3, $4, $5, $6, $7, (int) 19, (int) 20, $0, $1, $2, $3, $4, $5, $6, $7 , (int) 29, (int) 30, $0, $1, $2, $3, $4, $5, $6, $7, (int) 39, (int) 40 , $1, $2, $3, $4, $5, $6, $7, (int) 19, (int) 20, $0, $1, $2, $3, $4, $5, $5, $7 );
  3737. R3= foreach B generate $0, TOTUPLE(0,0,0), TOBAG( 0, 0 );
  3738. R4= foreach B generate $0, TOTUPLE(null, id, null), TOBAG( id, null, id,null );
  3739. describe R1;
  3740. describe R2;
  3741. describe R3;
  3742. describe R4;
  3743. store B into ':OUTPATH:.1';
  3744. store B1 into ':OUTPATH:.2';
  3745. store R1 into ':OUTPATH:.3';
  3746. store R2 into ':OUTPATH:.4';
  3747. store R3 into ':OUTPATH:.5';
  3748. store R4 into ':OUTPATH:.6';
  3749. ?
  3750. }, {
  3751. # TEST more TOTUPLE and TOBAG nested combinations
  3752. 'num' => 8
  3753. ,'pig' => q?
  3754. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3755. B = limit A 10;
  3756. C = foreach B generate TOBAG( $0, $1, $2);
  3757. T1= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) );
  3758. T2= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) );
  3759. T3= foreach B generate TOBAG( TOTUPLE( $1, $2, $3), TOTUPLE($4,$5), TOTUPLE($6,$7));
  3760. store B into ':OUTPATH:.1';
  3761. store C into ':OUTPATH:.2';
  3762. store T1 into ':OUTPATH:.3';
  3763. store T2 into ':OUTPATH:.4';
  3764. store T3 into ':OUTPATH:.5';
  3765. ?
  3766. }, {
  3767. #TEST negative test case: out of bounds positional parameter
  3768. # EVERYTHING IS CORRECT
  3769. 'num' => 9
  3770. ,'pig' => q?
  3771. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3772. B = limit A 10;
  3773. C = foreach B generate $0, $1, TOTUPLE($2, $998, $4), TOBAG($5, $6), $7;
  3774. ?
  3775. ,'expected_err_regex' => 'Out of bound access.*non-existent column: 998'
  3776. }, {
  3777. #TEST negative test case: out of bounds positional parameter
  3778. # EVERYTHING IS CORRECT
  3779. 'num' => 10
  3780. ,'pig' => q?
  3781. A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double);
  3782. B = limit A 10;
  3783. C = foreach B generate $0, $1, TOBAG($5, $999), $7;
  3784. ?
  3785. ,'expected_err_regex' => 'Out of bound access.*non-existent column: 999'
  3786. },
  3787. ] # end of tests
  3788. },{
  3789. 'name' => 'ToStuffSyntaxSugar',
  3790. 'tests' => [
  3791. {
  3792. #TEST TOTUPLE syntax sugar
  3793. 'num' => 1,
  3794. 'pig' => q\
  3795. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3796. B = foreach A generate (name, age);
  3797. store B into ':OUTPATH:';\,
  3798. 'verify_pig_script' => q\
  3799. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3800. B = foreach A generate TOTUPLE(name, age);
  3801. store B into ':OUTPATH:';\,
  3802. }, {
  3803. #TEST TOBAG syntax sugar
  3804. 'num' => 2,
  3805. 'pig' => q\
  3806. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3807. B = foreach A generate {name, age};
  3808. store B into ':OUTPATH:';\,
  3809. 'verify_pig_script' => q\
  3810. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3811. B = foreach A generate TOBAG(name, age);
  3812. store B into ':OUTPATH:';\,
  3813. }, {
  3814. #TEST TOMAP syntax sugar
  3815. 'num' => 3,
  3816. 'pig' => q\
  3817. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3818. B = foreach A generate [name, age];
  3819. store B into ':OUTPATH:';\,
  3820. 'verify_pig_script' => q\
  3821. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3822. B = foreach A generate TOMAP(name, age);
  3823. store B into ':OUTPATH:';\,
  3824. }, {
  3825. #TEST verify single element inside parenthesis does NOT call TOTUPLE
  3826. 'num' => 4,
  3827. 'pig' => q\
  3828. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3829. B = foreach A generate (age) + 1;
  3830. store B into ':OUTPATH:';\,
  3831. 'verify_pig_script' => q\
  3832. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3833. B = foreach A generate (age + 1);
  3834. store B into ':OUTPATH:';\,
  3835. }
  3836. ] # end of tests
  3837. },{
  3838. 'name' => 'MergeOperator',
  3839. 'tests' => [
  3840. {
  3841. # Test Union using merge where schema is identical | A&B have identical schema
  3842. 'num' => 1,
  3843. 'pig' => q\
  3844. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3845. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3846. C = union onschema A, B;
  3847. store C into ':OUTPATH:';\,
  3848. 'verify_pig_script' => q\
  3849. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3850. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3851. C = union A, B;
  3852. store C into ':OUTPATH:';\,
  3853. },{
  3854. # Test union onschema with type promotions, int->long and float->double. NOTE(review): the verify script builds D with the casts but stores C - confirm this is intended.
  3855. 'num' => 2,
  3856. 'floatpostprocess' => 1,
  3857. 'delimiter' => ' ',
  3858. 'pig' => q\
  3859. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3860. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa:double);
  3861. C = union onschema A, B;
  3862. store C into ':OUTPATH:';\,
  3863. 'verify_pig_script' => q\
  3864. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3865. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3866. C = union A, B;
  3867. D = foreach C generate name, (long)age, (double)gpa;
  3868. store C into ':OUTPATH:';\,
  3869. },{
  3870. # Test Union using merge with type promotions, int->float
  3871. 'num' => 3,
  3872. 'floatpostprocess' => 1,
  3873. 'delimiter' => ' ',
  3874. 'pig' => q\
  3875. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3876. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float);
  3877. C = union onschema A, B;
  3878. store C into ':OUTPATH:';\,
  3879. 'verify_pig_script' => q\
  3880. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float);
  3881. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float);
  3882. C = union A, B;
  3883. D = foreach C generate name, (float)age, gpa;
  3884. store C into ':OUTPATH:';\,
  3885. },{
  3886. # Test Union using merge with type promotions, int->double
  3887. 'num' => 4,
  3888. 'floatpostprocess' => 1,
  3889. 'delimiter' => ' ',
  3890. 'pig' => q\
  3891. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3892. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float);
  3893. C = union onschema A, B;
  3894. store C into ':OUTPATH:';\,
  3895. 'verify_pig_script' => q\
  3896. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float);
  3897. B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float);
  3898. C = union A, B;
  3899. D = foreach C generate name, (double)age, gpa;
  3900. store C into ':OUTPATH:';\,
  3901. },{
  3902. # Test union onschema where the two schemas overlap only partially (name and age are shared)
  3903. 'num' => 5,
  3904. 'pig' => q\
  3905. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3906. B = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:float);
  3907. C = union onschema A, B;
  3908. store C into ':OUTPATH:';\,
  3909. 'verify_pig_script' => q\
  3910. register :FUNCPATH:/testudf.jar;
  3911. define Nil org.apache.pig.test.udf.evalfunc.Nil();
  3912. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3913. B = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:float);
  3914. C = foreach A generate name, age, (chararray)gpa, Nil(), Nil();
  3915. D = foreach B generate name, age, Nil(), registration, (chararray)contributions;
  3916. E = union C, D;
  3917. store E into ':OUTPATH:';\,
  3918. },
  3919. {
  3920. # Test union onschema where the two schemas have no fields in common
  3921. 'num' => 6,
  3922. 'pig' => q\
  3923. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3924. B = load ':INPATH:/singlefile/textdoc' as (line:chararray);
  3925. C = union onschema A, B;
  3926. store C into ':OUTPATH:';\,
  3927. 'verify_pig_script' => q\
  3928. register :FUNCPATH:/testudf.jar;
  3929. define Nil org.apache.pig.test.udf.evalfunc.Nil();
  3930. A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float);
  3931. B = load ':INPATH:/singlefile/textdoc' as (line:chararray);
  3932. C = foreach A generate name, (chararray)age, (chararray)gpa, Nil(name);
  3933. D = foreach B generate Nil(line), Nil(line), Nil(line), line;
  3934. E = union C, D;
  3935. store E into ':OUTPATH:';\,
  3936. },
  3937. {
  3938. # Test union onschema with identical schemas that include a boolean field (the verify script reads instate as chararray)
  3939. 'num' => 7,
  3940. 'pig' => q\
  3941. a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
  3942. b = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
  3943. C = union onschema a, b;
  3944. store C into ':OUTPATH:';\,
  3945. 'verify_pig_script' => q\
  3946. a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  3947. b = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  3948. C = union a, b;
  3949. store C into ':OUTPATH:';\,
  3950. }
  3951. ]
  3952. },
  3953. {
  3954. # Test a UDF that uses the distributed cache (the UDF is defined with ':INPATH:/singlefile/votertab10k#foodle')
  3955. 'name' => 'UdfDistributedCache',
  3956. 'tests' => [
  3957. {
  3958. 'num' => 1,
  3959. 'execonly' => 'mapred', # since distributed cache is not supported in local mode
  3960. 'pig' => q?
  3961. register :FUNCPATH:/testudf.jar;
  3962. define udfdc org.apache.pig.test.udf.evalfunc.Udfcachetest(':INPATH:/singlefile/votertab10k#foodle');
  3963. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3964. b = limit a 1;
  3965. c = foreach b generate udfdc(age);
  3966. STORE c into ':OUTPATH:';?,
  3967. 'verify_pig_script' => q?
  3968. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3969. b = limit a 1;
  3970. c = foreach b generate 'tom van buren', 68, 'socialist', 390.19;
  3971. STORE c into ':OUTPATH:';?,
  3972. },
  3973. ]
  3974. }, {
  3975. 'name' => 'MonitoredUDF',
  3976. 'tests' => [
  3977. {
  3978. 'num' => 1,
  3979. 'ignore23' => 'guava version of Pig is higher than hadoop 23',
  3980. 'pig' => q?register :FUNCPATH:/testudf.jar;
  3981. define gm org.apache.pig.test.udf.evalfunc.GoodMonitored();
  3982. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3983. b = foreach a generate gm(name);
  3984. store b into ':OUTPATH:';?,
  3985. 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3986. b = foreach a generate 'fred';
  3987. store b into ':OUTPATH:';?,
  3988. },{
  3989. 'num' => 2,
  3990. 'pig' => q?register :FUNCPATH:/testudf.jar;
  3991. define bad org.apache.pig.test.udf.evalfunc.BadMonitored();
  3992. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3993. b = limit a 1;
  3994. c = foreach b generate bad(name);
  3995. store b into ':OUTPATH:';?,
  3996. 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  3997. b = limit a 1;
  3998. c = foreach b generate '';
  3999. store b into ':OUTPATH:';?,
  4000. },{
  4001. 'num' => 3,
  4002. 'pig' => q?register :FUNCPATH:/testudf.jar;
  4003. define bad org.apache.pig.test.udf.evalfunc.BadMonitored();
  4004. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  4005. b = limit a 1;
  4006. c = foreach b generate bad(name);
  4007. store b into ':OUTPATH:';?,
  4008. 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  4009. b = limit a 1;
  4010. c = foreach b generate 'barney';
  4011. store b into ':OUTPATH:';?,
  4012. }
  4013. ],
  4014. },{
  4015. 'name' => 'MergeSparseJoin',
  4016. 'tests' => [
  4017. # Simplest merge-sparse-join.
  4018. {
  4019. 'num' => 1,
  4020. 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
  4021. a = load ':INPATH:/singlefile/studenttab10k';
  4022. b = load ':INPATH:/singlefile/votertab10k';
  4023. c = order a by $0;
  4024. d = order b by $0;
  4025. store c into ':OUTPATH:.intermediate1';
  4026. store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4027. exec;
  4028. e = load ':OUTPATH:.intermediate1';
  4029. f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4030. g = join e by $0, f by $0 using 'merge-sparse';
  4031. store g into ':OUTPATH:';\,
  4032. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  4033. b = load ':INPATH:/singlefile/votertab10k';
  4034. g = join a by $0, b by $0;
  4035. store g into ':OUTPATH:';\,
  4036. 'notmq' => 1,
  4037. },
  4038. # Merge-sparse-join with left-side filter
  4039. {
  4040. 'num' => 2,
  4041. 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
  4042. a = load ':INPATH:/singlefile/studenttab10k';
  4043. b = load ':INPATH:/singlefile/votertab10k';
  4044. c = order a by $0;
  4045. d = order b by $0;
  4046. store c into ':OUTPATH:.intermediate1';
  4047. store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4048. exec;
  4049. e = load ':OUTPATH:.intermediate1';
  4050. h = filter e by $1 > 30;
  4051. f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4052. g = join h by $0, f by $0 using 'merge-sparse';
  4053. store g into ':OUTPATH:';\,
  4054. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  4055. b = load ':INPATH:/singlefile/votertab10k';
  4056. h = filter a by $1 > 30;
  4057. g = join h by $0, b by $0;
  4058. store g into ':OUTPATH:';\,
  4059. 'notmq' => 1,
  4060. },
  4061. # Merge-sparse-join with right-side filter
  4062. {
  4063. 'num' => 3,
  4064. 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
  4065. a = load ':INPATH:/singlefile/studenttab10k';
  4066. b = load ':INPATH:/singlefile/votertab10k';
  4067. c = order a by $0;
  4068. d = order b by $0;
  4069. store c into ':OUTPATH:.intermediate1';
  4070. store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4071. exec;
  4072. e = load ':OUTPATH:.intermediate1';
  4073. f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4074. i = filter f by $2 != 'democrat';
  4075. g = join e by $0, i by $0 using 'merge-sparse';
  4076. store g into ':OUTPATH:';\,
  4077. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  4078. b = load ':INPATH:/singlefile/votertab10k';
  4079. i = filter b by $2 != 'democrat';
  4080. g = join a by $0, i by $0;
  4081. store g into ':OUTPATH:';\,
  4082. 'notmq' => 1,
  4083. },
  4084. # Merge-sparse-join with a composite key ($0,$1)
  4085. {
  4086. 'num' => 4,
  4087. 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
  4088. a = load ':INPATH:/singlefile/studenttab10k';
  4089. b = load ':INPATH:/singlefile/votertab10k';
  4090. c = order a by $0,$1;
  4091. d = order b by $0,$1;
  4092. store c into ':OUTPATH:.intermediate1';
  4093. store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0,1');
  4094. exec;
  4095. e = load ':OUTPATH:.intermediate1';
  4096. f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0,1');
  4097. g = join e by ($0,$1), f by ($0,$1) using 'merge-sparse';
  4098. store g into ':OUTPATH:';\,
  4099. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k';
  4100. b = load ':INPATH:/singlefile/votertab10k';
  4101. g = join a by ($0,$1), b by ($0,$1);
  4102. store g into ':OUTPATH:';\,
  4103. 'notmq' => 1,
  4104. },
  4105. # Merge-sparse-join with nulls in keys and data.
  4106. {
  4107. 'num' => 5,
  4108. 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
  4109. a = load ':INPATH:/singlefile/studentnulltab10k';
  4110. b = load ':INPATH:/singlefile/voternulltab10k';
  4111. c = order a by $0;
  4112. d = order b by $0;
  4113. store c into ':OUTPATH:.intermediate1';
  4114. store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4115. exec;
  4116. e = load ':OUTPATH:.intermediate1';
  4117. f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4118. g = join e by $0, f by $0 using 'merge-sparse';
  4119. store g into ':OUTPATH:';\,
  4120. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k';
  4121. b = load ':INPATH:/singlefile/voternulltab10k';
  4122. g = join a by $0, b by $0;
  4123. store g into ':OUTPATH:';\,
  4124. 'notmq' => 1,
  4125. },
  4126. # Merge-sparse-join with join on numeric key
  4127. {
  4128. 'num' => 6,
  4129. 'pig' => q\register :PIGPATH:/contrib/piggybank/java/piggybank.jar
  4130. a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  4131. b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  4132. c = order a by age;
  4133. d = order b by age;
  4134. store c into ':OUTPATH:.intermediate1';
  4135. store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0');
  4136. exec;
  4137. e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float);
  4138. f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0') as (name:chararray, age:int, reg:chararray, contrib:float);
  4139. g = join e by age, f by age using 'merge-sparse';
  4140. store g into ':OUTPATH:';\,
  4141. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
  4142. b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  4143. g = join a by age, b by age;
  4144. store g into ':OUTPATH:';\,
  4145. 'notmq' => 1,
  4146. }
  4147. ],
  4148. },{
  4149. 'name' => 'BugFix',
  4150. 'tests' => [
  4151. {
  4152. # PIG-2286
  4153. 'num' => 1,
  4154. 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double, gpa:double);
  4155. B = group A all;
  4156. C = foreach B generate group, COR(A.age, A.gpa);
  4157. store C into ':OUTPATH:';?,
  4158. 'verify_pig_script' => q?set pig.exec.nocombiner true
  4159. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double);
  4160. B = group A all;
  4161. C = foreach B generate group, COR(A.age, A.gpa);
  4162. store C into ':OUTPATH:';?,
  4163. }, {
  4164. # PIG-2286, with 3 inputs to COR
  4165. 'num' => 2,
  4166. 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double);
  4167. B = foreach A generate age, gpa, gpa*gpa as gpa2;
  4168. C = group B all;
  4169. D = foreach C generate group, COR(B.age, B.gpa, B.gpa2);
  4170. store D into ':OUTPATH:';?,
  4171. 'verify_pig_script' => q?set pig.exec.nocombiner true
  4172. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double);
  4173. B = foreach A generate age, gpa, gpa*gpa as gpa2;
  4174. C = group B all;
  4175. D = foreach C generate group, COR(B.age, B.gpa, B.gpa2);
  4176. store D into ':OUTPATH:';?,
  4177. }, {
  4178. # PIG-2385
  4179. 'num' => 3,
  4180. 'pig_params' => ['-M'],
  4181. 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
  4182. Z = group A all;
  4183. Z1 = foreach Z generate AVG(A.gpa) as avg;
  4184. B = foreach A generate name, age, gpa-Z1.avg as diff;
  4185. STORE B INTO ':OUTPATH:.1';
  4186. C = DISTINCT B ;
  4187. store C into ':OUTPATH:.2';?,
  4188. 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
  4189. Z = group A all;
  4190. Z1 = foreach Z generate AVG(A.gpa) as avg;
  4191. B = cross A, Z1;
  4192. B1 = foreach B generate name, age, gpa-Z1.avg as diff;
  4193. STORE B1 INTO ':OUTPATH:.1';
  4194. C = DISTINCT B1 ;
  4195. store C into ':OUTPATH:.2';?,
  4196. }, {
  4197. # PIG-2576
  4198. 'num' => 4,
  4199. 'execonly' => 'mapred',
  4200. 'pig' => q?register :FUNCPATH:/testudf.jar;
  4201. define printconf org.apache.pig.test.udf.evalfunc.UdfContextFrontend('dummy');
  4202. a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa);
  4203. b = limit a 1;
  4204. c = foreach b generate printconf(name);
  4205. store c into ':OUTPATH:';
  4206. fs -ls;
  4207. ?,
  4208. 'rc' => 0,
  4209. 'not_expected_out_regex' => "checkJobConf: conf is null: false",
  4210. 'expected_out_regex' => "checkJobConf: conf is null: true",
  4211. }
  4212. ],
  4213. },{
  4214. 'name' => 'Bloom',
  4215. 'execonly' => 'mapred', # distributed cache does not work in local mode
  4216. 'tests' => [
  4217. {
  4218. 'num' => 1,
  4219. 'pig' => "define bb BuildBloom('Hash.JENKINS_HASH', 'fixed', '128', '3');
  4220. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4221. B = filter A by name == 'alice allen';
  4222. C = group B all;
  4223. D = foreach C generate bb(B.name);
  4224. store D into ':HDFSTMP:/mybloom_1';
  4225. exec;
  4226. define bloom Bloom(':HDFSTMP:/mybloom_1');
  4227. E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4228. F = filter E by bloom(name);
  4229. store F into ':OUTPATH:';",
  4230. 'notmq' => 1,
  4231. 'verify_pig_script' => "
  4232. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double);
  4233. B = filter A by name == 'alice allen';
  4234. store B into ':OUTPATH:';",
  4235. }, {
  4236. 'num' => 2,
  4237. 'pig' => "define bb BuildBloom('Hash.MURMUR_HASH', 'fixed', '128', '3');
  4238. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4239. B = filter A by name == 'alice allen';
  4240. C = group B all;
  4241. D = foreach C generate bb(B.name);
  4242. store D into ':HDFSTMP:/mybloom_2';
  4243. exec;
  4244. define bloom Bloom(':HDFSTMP:/mybloom_2');
  4245. E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4246. F = filter E by bloom(name);
  4247. G = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  4248. H = join F by name, G by name;
  4249. store H into ':OUTPATH:';",
  4250. 'notmq' => 1,
  4251. 'verify_pig_script' => "
  4252. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double);
  4253. B = filter A by name == 'alice allen';
  4254. C = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  4255. D = join B by name, C by name;
  4256. store D into ':OUTPATH:';",
  4257. },{
  4258. 'num' => 3,
  4259. 'pig' => "define bb BuildBloom('Hash.JENKINS_HASH', '1', '0.0001');
  4260. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4261. B = filter A by name == 'alice allen';
  4262. C = group B all;
  4263. D = foreach C generate bb(B.name);
  4264. store D into ':HDFSTMP:/mybloom_3';
  4265. exec;
  4266. define bloom Bloom(':HDFSTMP:/mybloom_3');
  4267. E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4268. F = filter E by bloom(name);
  4269. G = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  4270. H = join G by name, F by name using 'repl';
  4271. store H into ':OUTPATH:';",
  4272. 'notmq' => 1,
  4273. 'verify_pig_script' => "
  4274. A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double);
  4275. B = filter A by name == 'alice allen';
  4276. C = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float);
  4277. D = join C by name, B by name;
  4278. store D into ':OUTPATH:';",
  4279. }
  4280. ],
  4281. },{
  4282. 'name' => 'UDFContext',
  4283. 'tests' => [
  4284. {
  4285. # See PIG-2338
  4286. 'num' => 1,
  4287. 'pig' => q?register :FUNCPATH:/testudf.jar
  4288. a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
  4289. b = foreach a generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(a0);
  4290. c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray);
  4291. d = foreach c generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(c0);
  4292. e = union b, d;
  4293. store e into ':OUTPATH:';?,
  4294. 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
  4295. b = foreach a generate '{a0: bytearray}';
  4296. c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray);
  4297. d = foreach c generate '{c0: chararray}';
  4298. e = union b, d;
  4299. store e into ':OUTPATH:';?,
  4300. }
  4301. ],
  4302. },{
  4303. 'name' => 'UDFContextAuto',
  4304. 'tests' => [
  4305. {
  4306. # See PIG-2337
  4307. 'num' => 1,
  4308. 'pig' => q?register :FUNCPATH:/testudf.jar
  4309. a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
  4310. b = foreach a generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(a0);
  4311. c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray);
  4312. d = foreach c generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(c0);
  4313. e = union b, d;
  4314. store e into ':OUTPATH:';?,
  4315. 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
  4316. b = foreach a generate '{a0: bytearray}';
  4317. c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray);
  4318. d = foreach c generate '{c0: chararray}';
  4319. e = union b, d;
  4320. store e into ':OUTPATH:';?,
  4321. }
  4322. ],
  4323. },{
  4324. 'name' => 'JsonLoaderStorage',
  4325. 'tests' => [
  4326. {
  4327. 'num' => 1,
  4328. 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4329. store A into ':OUTPATH:.intermediate' using JsonStorage();
  4330. exec
  4331. A = LOAD ':OUTPATH:.intermediate' using JsonLoader();
  4332. store A into ':OUTPATH:';?,
  4333. 'notmq' => 1,
  4334. 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
  4335. store A into ':OUTPATH:';?,
  4336. }, {
  4337. 'num' => 2,
  4338. 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double);
  4339. store A into ':OUTPATH:.intermediate1' using JsonStorage();
  4340. B = LOAD ':INPATH:/singlefile/votertab10k' AS (name:chararray, age:int, registration:chararray, contributions:double);
  4341. store B into ':OUTPATH:.intermediate2' using JsonStorage();
  4342. exec
  4343. A = LOAD ':OUTPATH:.intermediate1' using JsonLoader();
  4344. B = LOAD ':OUTPATH:.intermediate2' using JsonLoader();
  4345. C = JOIN A by name, B by name;
  4346. store C into ':OUTPATH:';?,
  4347. 'notmq' => 1,
  4348. 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double);
  4349. B = LOAD ':INPATH:/singlefile/votertab10k' AS (name:chararray, age:int, registration:chararray, contributions:double);
  4350. C = JOIN A by name, B by name;
  4351. store C into ':OUTPATH:';?,
  4352. }, {
  4353. 'num' => 3,
  4354. 'ignore' => 1, # PIG-2594
  4355. 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean);
  4356. store a into ':OUTPATH:.intermediate' using JsonStorage();
  4357. exec
  4358. B = LOAD ':OUTPATH:.intermediate' using JsonLoader();
  4359. store B into ':OUTPATH:';\,
  4360. 'notmq' => 1,
  4361. 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray);
  4362. store a into ':OUTPATH:';\,
  4363. }
  4364. ],
  4365. },{
  4366. 'name' => 'STRSPLIT',
  4367. 'tests' => [
  4368. {
  4369. # See PIG-2311
  4370. 'num' => 1,
  4371. 'pig' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
  4372. b= filter a by NOT (a0 is null);
  4373. c= foreach b generate STRSPLIT(a0);
  4374. store c into ':OUTPATH:';?,
  4375. 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0);
  4376. b= filter a by NOT (a0 is null);
  4377. b= foreach b generate (chararray)a0 as a0 ;
  4378. c= foreach b generate STRSPLIT(a0);
  4379. store c into ':OUTPATH:';?,
  4380. }
  4381. ],
  4382. },
  4383. {
  4384. 'name' => 'Tokenize',
  4385. 'tests' => [
  4386. {
  4387. 'num' => 1,
  4388. 'pig' => q\
  4389. A = LOAD ':INPATH:/singlefile/studenttab10k';
  4390. B = foreach A generate TOKENIZE($0);
  4391. store B into ':OUTPATH:';\,
  4392. },
  4393. {
  4394. 'num' => 2,
  4395. 'pig' => q\
  4396. A = LOAD ':INPATH:/singlefile/studenttab10k';
  4397. B = foreach A generate TOKENIZE($1,'9');
  4398. store B into ':OUTPATH:';\,
  4399. 'verify_pig_script' => q\
  4400. A = LOAD ':INPATH:/singlefile/studenttab10k';
  4401. -- TOKENIZE has tokens hardcoded so have to replace the '9' with
  4402. -- one of the hardcoded tokens
  4403. B = foreach A generate TOKENIZE(REPLACE($1, '9', ','));
  4404. store B into ':OUTPATH:';\,
  4405. }
  4406. ]
  4407. }, {
  4408. 'name' => 'Realias',
  4409. 'tests' => [
  4410. {
  4411. 'num' => 1,
  4412. 'pig' => q\
  4413. A = LOAD ':INPATH:/singlefile/studenttab10k';
  4414. B = A;
  4415. store B into ':OUTPATH:';\,
  4416. 'verify_pig_script' => q\
  4417. A = LOAD ':INPATH:/singlefile/studenttab10k';
  4418. store A into ':OUTPATH:';\,
  4419. }
  4420. ]
  4421. },
  4422. {
  4423. 'name' => 'NestedForEach',
  4424. 'tests' => [
  4425. {
  4426. 'num' => 1,
  4427. 'pig' => q\
  4428. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4429. B = group A by name;
  4430. C = foreach B {
  4431. C1 = foreach A generate UPPER(name), age+1 as age, gpa;
  4432. generate C1;
  4433. }
  4434. D = foreach C generate flatten(C1);
  4435. store D into ':OUTPATH:';\,
  4436. 'verify_pig_script' => q\
  4437. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4438. B = foreach A generate UPPER(name), age+1, gpa;
  4439. store B into ':OUTPATH:';\,
  4440. },
  4441. {
  4442. 'num' => 2,
  4443. 'pig' => q\
  4444. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);;
  4445. B = group A by name;
  4446. C = foreach B {
  4447. C1 = A.age;
  4448. C2 = filter C1 by age>=30;
  4449. C3 = foreach C2 generate age+1 as age;
  4450. C4 = order C3 by age desc;
  4451. generate C4;
  4452. }
  4453. D = foreach C generate flatten(C4);
  4454. store D into ':OUTPATH:';\,
  4455. 'verify_pig_script' => q\
  4456. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4457. B = filter A by age>=30;
  4458. C = foreach B generate age+1 as age;
  4459. D = order C by age desc;
  4460. store D into ':OUTPATH:';\,
  4461. }
  4462. ]
  4463. },
  4464. {
  4465. 'name' => 'NestedCross',
  4466. 'tests' => [
  4467. {
  4468. 'num' => 1,
  4469. 'pig' => q\
  4470. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4471. B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
  4472. C = cogroup A by name, B by name;
  4473. D = foreach C {
  4474. C1 = cross A, B;
  4475. generate flatten(C1);
  4476. }
  4477. store D into ':OUTPATH:';\,
  4478. 'verify_pig_script' => q\
  4479. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4480. B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
  4481. C = JOIN A by name, B by name;
  4482. store C into ':OUTPATH:';\,
  4483. },
  4484. {
  4485. 'num' => 2,
  4486. 'pig' => q\
  4487. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4488. B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
  4489. C = cogroup A by name, B by name;
  4490. D = foreach C {
  4491. C1 = filter A by gpa > 4;
  4492. C2 = filter B by contributions > 500;
  4493. C3 = cross C1, C2;
  4494. C4 = foreach C3 generate CONCAT(CONCAT(gpa, '_'), contributions);
  4495. generate flatten(C4);
  4496. }
  4497. store D into ':OUTPATH:';\,
  4498. 'verify_pig_script' => q\
  4499. A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
  4500. B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double);
  4501. C = filter A by gpa > 4;
  4502. D = filter B by contributions > 500;
  4503. E = JOIN C by name, D by name;
  4504. F = foreach E generate CONCAT(CONCAT(gpa, '_'), contributions);
  4505. store F into ':OUTPATH:';\,
  4506. }
  4507. ]
  4508. }
  4509. ],
  4510. },
  4511. ;