PageRenderTime 50ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 1ms

/py/testdir_single_jvm/test_model_management.py

https://github.com/dbfree/h2o
Python | 600 lines | 482 code | 75 blank | 43 comment | 21 complexity | 6c082cce572a1252cd1bfcc8f02dd8c0 MD5 | raw file
Possible License(s): Apache-2.0
  1. import unittest, time, sys, os
  2. sys.path.extend(['.','..','py'])
  3. import h2o, h2o_cmd, h2o_hosts, h2o_import as h2i, h2o_exec
  4. import h2o_glm, h2o_gbm, h2o_rf # TODO: DeepLearning
  5. class ModelManagementTestCase(unittest.TestCase):
  6. tear_down_cloud = True
  7. # tear_down_cloud = False
  def tearDown(self):
      """Run h2o's sandbox error check after every test method."""
      h2o.check_sandbox_for_errors()
  @classmethod
  def setUpClass(cls):
      """Bring up the H2O cloud used by every test in this class.

      When h2o decides we are on localhost a 5-node cloud is built with
      h2o.build_cloud(); otherwise h2o_hosts.build_cloud_with_hosts(1) is
      used. Afterwards h2o.beta_features is switched on (FVec).

      Side effects: sets the module-level global ``localhost``.
      """
      global localhost
      cloud_size = 5

      if h2o.clone_cloud_json is not None:
          print("NOTE: Connecting to existing cloud, and leaving the cloud running afterwards: " + os.path.abspath(h2o.clone_cloud_json))

      localhost = h2o.decide_if_localhost()
      if localhost:
          print("Calling h2o.build_cloud(" + str(cloud_size) + "). . .")
          h2o.build_cloud(cloud_size)
      else:
          # Announce *before* the call, like the localhost branch does, so the
          # log line precedes whatever the cloud build itself prints.
          print("Calling h2o_hosts.build_cloud_with_hosts(1). . .")
          h2o_hosts.build_cloud_with_hosts(1)

      # USE FVec!
      h2o.beta_features = True
  @classmethod
  def tearDownClass(cls):
      """Shut the cloud down, or just check the sandbox when the cloud was cloned.

      With a cloned cloud (h2o.clone_cloud_json set) the cloud is left alone
      and only the sandbox error check runs. Otherwise the cloud is torn
      down, unless ModelManagementTestCase.tear_down_cloud is False.
      """
      if h2o.clone_cloud_json is not None:
          h2o.check_sandbox_for_errors(sandboxIgnoreErrors=False, python_test_name="test_model_management")
          return

      if ModelManagementTestCase.tear_down_cloud:
          h2o.tear_down_cloud()
  # Class-wide flag: becomes True once the shared frames and models exist.
  already_set_up = False

  def setUp(self):
      """Lazy setup of the common frames and models used by the test cases.

      The frames are imported and the models built only the first time a
      test runs; later calls see the class-level flag and do nothing.
      """
      if not ModelManagementTestCase.already_set_up:
          self.create_models(self.import_frames())
          ModelManagementTestCase.already_set_up = True
  def import_frame(self, target_key, bucket, csvFilename, csvPathname, expected_rows, expected_cols):
      """Upload and parse one file into an H2O frame, then verify its dimensions.

      Args:
          target_key: hex key requested for the parsed frame.
          bucket: bucket name handed to h2i.import_parse.
          csvFilename: file name; joined to csvPathname with '/'.
          csvPathname: directory part of the path inside the bucket.
          expected_rows: row count the inspected frame must report.
          expected_cols: column count the inspected frame must report.

      Returns:
          The 'destination_key' reported by the parse result.

      Raises:
          AssertionError: if the inspected row or column count differs from
              the expected value.
      """
      path = csvPathname + '/' + csvFilename
      # schema='put' uploads the file
      parse_result = h2i.import_parse(bucket=bucket, path=path, hex_key=target_key, schema='put')
      destination_key = parse_result['destination_key']

      inspect_result = h2o_cmd.runInspect(None, destination_key)
      h2o_cmd.infoFromInspect(inspect_result, csvPathname)
      actual_rows = inspect_result['numRows']
      actual_cols = inspect_result['numCols']

      print('loaded frame "' + target_key + '" from path: ' + path)
      print('rows:  ' + str(actual_rows))
      print('cols:  ' + str(actual_cols))

      # This is an instance method, so the TestCase assertions are available;
      # unlike a bare `assert` they are not stripped when running with -O.
      self.assertEqual(expected_rows, actual_rows, "Expected " + str(expected_rows) + " but got " + str(actual_rows) + " for path: " + path)
      self.assertEqual(expected_cols, actual_cols, "Expected " + str(expected_cols) + " but got " + str(actual_cols) + " for path: " + path)

      # TODO: other info we could check, e.g. h2o_cmd.columnInfoFromInspect(destination_key,
      # exceptionOnMissingValues=True) and h2o_cmd.runSummary / h2o_cmd.infoFromSummary.
      return destination_key
  # TODO: generalize by passing in the exec2 expression
  def create_new_boolean(self, frame, old_col_name, new_col_name):
      """Append the column ``old_col_name == 1`` to `frame` and name it `new_col_name`.

      Args:
          frame: key of the frame to modify in place.
          old_col_name: existing column compared against 1.
          new_col_name: name given to the appended column.
      """
      first_node = h2o.nodes[0]

      # NOTE: 1-based column indexing!
      # Column count before the change; the result is not used afterwards.
      _unused_result, _unused_count = h2o_exec.exec_expr(execExpr='ncol(' + frame + ')')

      # Write the comparison result one past the current last column.
      append_expr = "{0}[, ncol({0}) + 1] = ({0}${1} == 1)".format(frame, old_col_name)
      _unused_result, _unused_value = h2o_exec.exec_expr(execExpr=append_expr)

      # The column count now equals the (1-based) index of the new column.
      _unused_result, raw_count = h2o_exec.exec_expr(execExpr="ncol({0})".format(frame))
      new_col_index = int(raw_count)

      first_node.set_column_names(source=frame, cols='C' + str(new_col_index), comma_separated_list=new_col_name)
  def import_frames(self):
      """Import the shared frames, verify them on every node, and add boolean responses.

      Steps:
        1. Import prostate, airlines train/test and the UUID test file.
        2. On every node, check the frames are listed and that the 'id'
           (hash) of each airlines frame is the same as on the other nodes.
        3. Append the recoded boolean response column to both airlines
           frames and check that their hashes changed.

      Returns:
          tuple: (prostate_hex, airlines_train_hex, airlines_test_hex).
      """
      node = h2o.nodes[0]

      prostate_hex = self.import_frame('prostate.hex', 'smalldata', 'prostate.csv', 'logreg', 380, 9)
      airlines_train_hex = self.import_frame('airlines_train.hex', 'smalldata', 'AirlinesTrain.csv.zip', 'airlines', 24421, 12)
      airlines_test_hex = self.import_frame('airlines_test.hex', 'smalldata', 'AirlinesTest.csv.zip', 'airlines', 2691, 12)
      # Imported so the frame exists in the cloud; its key is not part of the return value.
      self.import_frame('has_uuid.hex', 'smalldata', 'test_all_raw_top10rows.csv', 'test', 12, 89)

      # get the hashes
      print("Checking " + str(len(h2o.nodes)) + " nodes for frames: ")
      for a_node in h2o.nodes:
          print(" " + a_node.http_addr + ":" + str(a_node.port))

      # None means "no node inspected yet".
      test_hash_before = None
      train_hash_before = None
      for a_node in h2o.nodes:
          frames = a_node.frames()
          self.assertKeysExist(frames, 'frames', ['airlines_train.hex'])
          self.assertKeysExist(frames, 'frames', ['airlines_test.hex'])
          self.assertKeysExist(frames, 'frames/airlines_test.hex', ['id'])
          self.assertKeysExist(frames, 'frames', ['has_uuid.hex'])

          # Make sure we have the same checksums everywhere:
          test_hash = frames['frames']['airlines_test.hex']['id']
          if test_hash_before is not None:
              self.assertEqual(test_hash, test_hash_before, "Same hash on every node for airlines_test.hex")
          test_hash_before = test_hash

          train_hash = frames['frames']['airlines_train.hex']['id']
          if train_hash_before is not None:
              self.assertEqual(train_hash, train_hash_before, "Same hash on every node for airlines_train.hex")
          train_hash_before = train_hash

      # Reject the two placeholder-looking hash values.
      self.assertNotEqual("ffffffffffffffff", test_hash_before)
      self.assertNotEqual("ffffffffffffffff", train_hash_before)
      self.assertNotEqual("0", test_hash_before)
      self.assertNotEqual("0", train_hash_before)

      # Add new proper boolean response columns
      self.create_new_boolean('airlines_train.hex', 'IsDepDelayed_REC', 'IsDepDelayed_REC_recoded')
      self.create_new_boolean('airlines_test.hex', 'IsDepDelayed_REC', 'IsDepDelayed_REC_recoded')

      # get the hashes and ensure they've changed
      frames = node.frames()
      self.assertKeysExist(frames, 'frames', ['airlines_train.hex'])
      self.assertKeysExist(frames, 'frames', ['airlines_test.hex'])
      self.assertKeysExist(frames, 'frames/airlines_test.hex', ['id'])

      train_hash_after = frames['frames']['airlines_train.hex']['id']
      test_hash_after = frames['frames']['airlines_test.hex']['id']

      self.assertNotEqual(train_hash_before, train_hash_after, "Expected airlines_train hash to change. . . Before and after were both: " + train_hash_after)
      self.assertNotEqual(test_hash_before, test_hash_after, "Expected airlines_test hash to change. . . Before and after were both: " + test_hash_after)

      print("airlines_train hash before:  " + str(train_hash_before) + " , after:  " + str(train_hash_after))
      print("airlines_test hash before:  " + str(test_hash_before) + " , after:  " + str(test_hash_after))

      return (prostate_hex, airlines_train_hex, airlines_test_hex)
  def create_models(self, frame_keys):
      """Build the shared models and verify that every node reports all of them.

      Args:
          frame_keys: (prostate_hex, airlines_train_hex, airlines_test_hex),
              as returned by import_frames().

      Each recipe below prints its banner and announcement, then (unless it
      is disabled) builds the model on node 0. GLM models additionally go
      through h2o_glm.simpleCheckGLM. Finally every node is asked for its
      model list: the count must match and every model must be 'DONE'.
      """
      prostate_hex, airlines_train_hex, airlines_test_hex = frame_keys
      for frame_key in (prostate_hex, airlines_train_hex, airlines_test_hex):
          self.assertIsNotNone(frame_key)

      node = h2o.nodes[0]

      # One recipe per model, in build order:
      #   (banner, announcement, node method name or None, training frame, parameters)
      # A method name of None means "announce but do not build".
      recipes = [
          # R equivalent: h2o.glm.FV(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, family = "binomial", alpha=0.05, lambda=1.0e-2, standardize=FALSE, nfolds=0)
          ("##############################################################",
           "Generating AirlinesTrain GLM2 binary classification model. . .",
           'GLM', airlines_train_hex,
           {
               'destination_key': 'glm_AirlinesTrain_binary_1',
               'response': 'IsDepDelayed',
               'ignored_cols': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
               'family': 'binomial',
               'alpha': 0.5,
               'standardize': 0,
               'lambda': 1.0e-2,
               'n_folds': 0,
               'use_all_factor_levels': 1,
           }),
          # R equivalent: h2o.gbm(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, n.trees=3, interaction.depth=1, distribution="multinomial", n.minobsinnode=2, shrinkage=.1)
          # TODO: what about minobsinnode and shrinkage?!
          ("####################################################################",
           "Generating AirlinesTrain simple GBM binary classification model. . .",
           'gbm', airlines_train_hex,
           {
               'destination_key': 'gbm_AirlinesTrain_binary_1',
               'response': 'IsDepDelayed',
               'ignored_cols_by_name': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
               'ntrees': 3,
               'max_depth': 1,
               'classification': 1,
           }),
          # R equivalent: h2o.gbm(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, n.trees=50, interaction.depth=5, distribution="multinomial", n.minobsinnode=2, shrinkage=.1)
          # TODO: what about minobsinnode and shrinkage?!
          ("#####################################################################",
           "Generating AirlinesTrain complex GBM binary classification model. . .",
           'gbm', airlines_train_hex,
           {
               'destination_key': 'gbm_AirlinesTrain_binary_2',
               'response': 'IsDepDelayed',
               'ignored_cols_by_name': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
               'ntrees': 50,
               'max_depth': 5,
               'classification': 1,
           }),
          # R equivalent: h2o.randomForest.FV(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, ntree=5, depth=2)
          ("####################################################################",
           "Generating AirlinesTrain simple DRF binary classification model. . .",
           'random_forest', airlines_train_hex,
           {
               'destination_key': 'rf_AirlinesTrain_binary_1',
               'response': 'IsDepDelayed',
               'ignored_cols_by_name': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
               'ntrees': 5,
               'max_depth': 2,
               'classification': 1,
           }),
          # R equivalent: h2o.randomForest.FV(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, ntree=50, depth=10)
          ("#####################################################################",
           "Generating AirlinesTrain complex DRF binary classification model. . .",
           'random_forest', airlines_train_hex,
           {
               'destination_key': 'rf_AirlinesTrain_binary_2',
               'response': 'IsDepDelayed',
               'ignored_cols_by_name': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
               'ntrees': 50,
               'max_depth': 10,
               'classification': 1,
           }),
          # what is the R binding?
          # TODO: put back (node.speedrf); fails to complete in multinode
          ("#####################################################################",
           "Generating AirlinesTrain complex SpeeDRF binary classification model. . .",
           None, airlines_train_hex,
           {
               'destination_key': 'speedrf_AirlinesTrain_binary_1',
               'response': 'IsDepDelayed',
               'ignored_cols_by_name': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
               'ntrees': 50,
               'max_depth': 10,
               'classification': 1,
           }),
          # R equivalent: h2o.deeplearning(y = "IsDepDelayed", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, classification=TRUE, hidden=c(10, 10))
          ("######################################################################",
           "Generating AirlinesTrain DeepLearning binary classification model. . .",
           'deep_learning', airlines_train_hex,
           {
               'destination_key': 'dl_AirlinesTrain_binary_1',
               'response': 'IsDepDelayed',
               'ignored_cols': 'IsDepDelayed_REC, IsDepDelayed_REC_recoded',
               'hidden': [10, 10],
               'classification': 1,
               'variable_importances': 1,
           }),
          # R equivalent: h2o.glm.FV(y = "IsDepDelayed_REC", x = c("Origin", "Dest", "fDayofMonth", "fYear", "UniqueCarrier", "fDayOfWeek", "fMonth", "DepTime", "ArrTime", "Distance"), data = airlines_train.hex, family = "binomial", alpha=0.05, lambda=1.0e-2, standardize=FALSE, nfolds=0)
          ("##############################################################################################",
           "Generating AirlinesTrain GLM2 binary classification model with different response column. . .",
           'GLM', airlines_train_hex,
           {
               'destination_key': 'glm_AirlinesTrain_binary_A',
               'response': 'IsDepDelayed_REC_recoded',
               'ignored_cols': 'IsDepDelayed, IsDepDelayed_REC',
               'family': 'binomial',
               'alpha': 0.5,
               'standardize': 0,
               'lambda': 1.0e-2,
               'n_folds': 0,
               'use_all_factor_levels': 1,
           }),
          # R equivalent: h2o.glm.FV(y = "CAPSULE", x = c("AGE","RACE","PSA","DCAPS"), data = prostate.hex, family = "binomial", nfolds = 0, alpha = 0.5)
          ("#########################################################",
           "Generating Prostate GLM2 binary classification model. . .",
           'GLM', prostate_hex,
           {
               'destination_key': 'glm_Prostate_binary_1',
               'response': 'CAPSULE',
               'ignored_cols': None,
               'family': 'binomial',
               'alpha': 0.5,
               'n_folds': 0,
               'use_all_factor_levels': 0,  # should get warning about variable importances!
           }),
          # R equivalent: h2o.randomForest.FV(y = "CAPSULE", x = c("AGE","RACE","DCAPS"), data = prostate.hex, ntree=10, depth=5)
          ("###############################################################",
           "Generating Prostate simple DRF binary classification model. . .",
           'random_forest', prostate_hex,
           {
               'destination_key': 'rf_Prostate_binary_1',
               'response': 'CAPSULE',
               'ignored_cols_by_name': None,
               'ntrees': 10,
               'max_depth': 5,
               'classification': 1,
           }),
          # TODO: put back (node.speedrf); fails to complete in multinode
          ("#####################################################################",
           "Generating Prostate complex SpeeDRF binary classification model. . .",
           None, prostate_hex,
           {
               'destination_key': 'speedrf_Prostate_binary_1',
               'response': 'CAPSULE',
               'ignored_cols_by_name': None,
               'ntrees': 50,
               'max_depth': 10,
               'classification': 1,
           }),
          # R equivalent: h2o.glm.FV(y = "AGE", x = c("CAPSULE","RACE","PSA","DCAPS"), data = prostate.hex, family = "gaussian", nfolds = 0, alpha = 0.5)
          ("##############################################",
           "Generating Prostate GLM2 regression model. . .",
           'GLM', prostate_hex,
           {
               'destination_key': 'glm_Prostate_regression_1',
               'response': 'AGE',
               'ignored_cols': None,
               'family': 'gaussian',
               'alpha': 0.5,
               'n_folds': 0,
               'use_all_factor_levels': 1,
           }),
      ]

      num_models = 0
      for banner, announcement, builder_name, training_frame, params in recipes:
          print(banner)
          print(announcement)
          if builder_name is None:
              continue

          built = getattr(node, builder_name)(training_frame, **params)
          num_models += 1
          if builder_name == 'GLM':
              h2o_glm.simpleCheckGLM(self, built, None, **params)

      # We were getting different results for each node. Bad, bad bad. . .
      print("Checking " + str(len(h2o.nodes)) + " nodes for models: ")
      for a_node in h2o.nodes:
          print(" " + a_node.http_addr + ":" + str(a_node.port))

      found_problem = False
      for a_node in h2o.nodes:
          node_models = a_node.models()['models']
          got = len(node_models)
          print(" ".join(["For node: " + a_node.http_addr + ":" + str(a_node.port) + " checking that we got ", str(num_models), " models. . ."]))

          if got != num_models:
              print("p00p, not enough. . .")
              found_problem = True
              print("Got these models: " + repr(node_models.keys()))
              print("Expected " + str(num_models) + ", got: " + str(got))

          for model_key, model_info in node_models.items():
              self.assertEqual(model_info['state'], 'DONE', "Expected state to be DONE for model: " + model_key)

      self.assertNotEqual(found_problem, True, "Missing models on at least one node.")
  315. class ApiTestCase(ModelManagementTestCase):
  def followPath(self, d, path_elems):
      """Descend into nested container `d` along `path_elems` and return what is there.

      Empty path elements are skipped. An element written as ``name[i]``
      looks up ``name`` and then takes item ``i`` of the result.

      Args:
          d: the dict to start from.
          path_elems: sequence of path element strings.

      Returns:
          The object reached after the last element (`d` itself when every
          element is empty).

      Raises:
          AssertionError: if an element's key is missing at the current level.
      """
      current = d
      for raw_elem in path_elems:
          if raw_elem == "":
              continue

          key = raw_elem
          index = -1
          if raw_elem.endswith("]"):
              open_at = raw_elem.find("[")
              index = int(raw_elem[open_at + 1:raw_elem.find("]")])
              key = raw_elem[:open_at]

          assert key in current, "Failed to find key: " + key + " in dict: " + repr(current)

          current = current[key]
          # -1 doubles as "no index given", so an explicit [-1] is not applied.
          if index != -1:
              current = current[index]
      return current
  329. def assertKeysExist(self, d, path, keys):
  330. path_elems = path.split("/")
  331. d = self.followPath(d, path_elems)
  332. for key in keys:
  333. assert key in d, "Failed to find key: " + key + " in dict: " + repr(d)
  334. def assertKeysDontExist(self, d, path, keys):
  335. path_elems = path.split("/")
  336. d = self.followPath(d, path_elems)
  337. for key in keys:
  338. assert key not in d, "Unexpectedly found key: " + key + " in dict: " + repr(d)
  def test_endpoints(self):
      """Check the key sets returned by /2/Frames and /2/Models for various options.

      Covers: plain listings, lookup by key, lookup with
      find_compatible_models / find_compatible_frames, and the warning
      reported for the GLM model built without use_all_factor_levels.
      """
      node = h2o.nodes[0]
      banner = "##############################################"

      # Key sets shared by the checks below.
      frame_keys = ['airlines_train.hex', 'airlines_test.hex', 'prostate.hex']
      airlines_models = ['glm_AirlinesTrain_binary_1', 'gbm_AirlinesTrain_binary_1', 'gbm_AirlinesTrain_binary_2', 'rf_AirlinesTrain_binary_1', 'rf_AirlinesTrain_binary_2', 'dl_AirlinesTrain_binary_1', 'glm_AirlinesTrain_binary_A']
      prostate_models = ['glm_Prostate_binary_1', 'rf_Prostate_binary_1', 'glm_Prostate_regression_1']
      all_models = airlines_models + prostate_models
      # Everything that must NOT show up next to airlines_test.hex / rf_Prostate_binary_1.
      not_airlines_test = all_models + ['airlines_train.hex', 'prostate.hex']
      not_rf_prostate = frame_keys + [key for key in all_models if key != 'rf_Prostate_binary_1']

      print(banner)
      print("Testing /2/Frames with various options. . .")
      print(banner)
      print("")

      print(banner)
      print("Testing /2/Frames list. . .")
      frames = node.frames()
      self.assertKeysExist(frames, 'frames', frame_keys)
      self.assertKeysDontExist(frames, 'frames', all_models)
      self.assertKeysDontExist(frames, '', ['models'])

      print(banner)
      print("Testing /2/Frames?key=airlines_test.hex. . .")
      frames = node.frames(key='airlines_test.hex')
      self.assertKeysExist(frames, 'frames', ['airlines_test.hex'])
      self.assertKeysDontExist(frames, 'frames', not_airlines_test)
      self.assertKeysDontExist(frames, '', ['models'])
      self.assertKeysExist(frames, 'frames/airlines_test.hex', ['creation_epoch_time_millis', 'id', 'key', 'column_names', 'compatible_models'])
      self.assertEqual(frames['frames']['airlines_test.hex']['id'], "fffffffffffff38d", msg="The airlines_test.hex frame hash should be deterministic. Expected fffffffffffff38d, got: " + frames['frames']['airlines_test.hex']['id'])
      self.assertEqual(frames['frames']['airlines_test.hex']['key'], "airlines_test.hex", msg="The airlines_test.hex key should be airlines_test.hex.")

      print(banner)
      print("Testing /2/Frames?key=airlines_test.hex&find_compatible_models=true. . .")
      frames = node.frames(key='airlines_test.hex', find_compatible_models=1)
      self.assertKeysExist(frames, 'frames', ['airlines_test.hex'])
      self.assertKeysDontExist(frames, 'frames', not_airlines_test)
      self.assertKeysExist(frames, '', ['models'])
      self.assertKeysExist(frames, 'models', airlines_models)
      self.assertKeysDontExist(frames, 'models', prostate_models + frame_keys)

      # The original banner here repeated "/2/Frames"; this section tests /2/Models.
      print(banner)
      print("Testing /2/Models with various options. . .")
      print(banner)
      print("")

      print(banner)
      print("Testing /2/Models list. . .")
      models = node.models()
      self.assertKeysExist(models, 'models', all_models)
      self.assertKeysExist(models, 'models/glm_AirlinesTrain_binary_1', ['id', 'key', 'creation_epoch_time_millis', 'model_category', 'state', 'input_column_names', 'response_column_name', 'critical_parameters', 'secondary_parameters', 'expert_parameters', 'compatible_frames', 'warnings'])
      self.assertEqual(0, len(models['models']['glm_AirlinesTrain_binary_1']['warnings']), msg="Expect no warnings for glm_AirlinesTrain_binary_1.")
      self.assertEqual(models['models']['glm_AirlinesTrain_binary_1']['key'], 'glm_AirlinesTrain_binary_1', "key should equal our key: " + "glm_AirlinesTrain_binary_1")
      self.assertKeysDontExist(models, 'models', frame_keys)
      self.assertKeysDontExist(models, '', ['frames'])

      print(banner)
      print("Testing /2/Models?key=rf_Prostate_binary_1. . .")
      models = node.models(key='rf_Prostate_binary_1')
      self.assertKeysExist(models, 'models', ['rf_Prostate_binary_1'])
      self.assertKeysExist(models, 'models/rf_Prostate_binary_1', ['warnings'])
      self.assertEqual(0, len(models['models']['rf_Prostate_binary_1']['warnings']), msg="Expect no warnings for rf_Prostate_binary_1.")
      self.assertKeysDontExist(models, 'models', not_rf_prostate)
      self.assertKeysDontExist(models, '', ['frames'])

      print(banner)
      print("Testing /2/Models?key=rf_Prostate_binary_1&find_compatible_frames=true. . .")
      models = node.models(key='rf_Prostate_binary_1', find_compatible_frames=1)
      self.assertKeysExist(models, 'models', ['rf_Prostate_binary_1'])
      self.assertKeysDontExist(models, 'models', not_rf_prostate)
      self.assertKeysExist(models, '', ['frames'])
      self.assertKeysExist(models, 'frames', ['prostate.hex'])
      self.assertKeysDontExist(models, 'frames', ['airlines_train.hex', 'airlines_test.hex'])

      print(banner)
      print("Testing /2/Models?key=glm_Prostate_binary_1 variable importance warning. . .")
      models = node.models(key='glm_Prostate_binary_1')
      self.assertKeysExist(models, 'models', ['glm_Prostate_binary_1'])
      self.assertKeysExist(models, 'models/glm_Prostate_binary_1', ['warnings'])
      self.assertEqual(1, len(models['models']['glm_Prostate_binary_1']['warnings']), msg="Expect one warning for glm_Prostate_binary_1.")
      self.assertTrue("use_all_factor_levels" in models['models']['glm_Prostate_binary_1']['warnings'][0], "Expect variable importances warning since we aren't using use_all_factor_levels.")
  def test_binary_classifiers(self):
      """Score compatible model/frame pairs through /2/Models and /2/Frames.

      Part 1 starts from airlines_train.hex, finds its compatible models,
      and scores every compatible frame of each model via /2/Models.
      Part 2 scores every model compatible with prostate.hex via /2/Frames.
      In both parts the returned metrics are checked for the expected keys.
      """
      node = h2o.nodes[0]
      banner = "##############################################"

      def check_category_metrics(scoring_result, model_category):
          # Compare with '==': identity ('is') against a string literal depends on
          # interning and is not a reliable equality test, so the original
          # `is 'Binomial'` checks could silently never run.
          if model_category == 'Binomial':
              self.assertKeysExist(scoring_result, 'metrics[0]', ['cm', 'auc'])  # TODO: HitRatio
              # TODO: look inside the auc and cm elements
          if model_category == 'Regression':
              self.assertKeysDontExist(scoring_result, 'metrics[0]', ['cm', 'auc'])  # TODO: HitRatio

      print(banner)
      print("Testing /2/Models with scoring. . .")
      print(banner)
      print("")

      print(banner)
      print("Scoring compatible frames for compatible models for /2/Models?key=airlines_train.hex&find_compatible_models=true. . .")
      frames = node.frames(key='airlines_train.hex', find_compatible_models=1)
      compatible_models = frames['frames']['airlines_train.hex']['compatible_models']

      # NOTE: we start with frame airlines_train.hex and find the compatible models.
      # Then for each of those models we find all the compatible frames (there are at least two)
      # and score them.
      for model_key in compatible_models:
          # find all compatible frames
          models = node.models(key=model_key, find_compatible_frames=1)
          compatible_frames = models['models'][model_key]['compatible_frames']
          self.assertKeysExist(models, 'models/' + model_key, ['training_duration_in_ms'])
          self.assertNotEqual(models['models'][model_key]['training_duration_in_ms'], 0, "Expected non-zero training time for model: " + model_key)

          for frame_key in compatible_frames:
              print("Scoring: /2/Models?key=" + model_key + "&score_frame=" + frame_key)
              scoring_result = node.models(key=model_key, score_frame=frame_key)

              self.assertKeysExist(scoring_result, '', ['metrics'])
              self.assertKeysExist(scoring_result, 'metrics[0]', ['model', 'frame', 'duration_in_ms'])
              self.assertKeysExist(scoring_result, 'metrics[0]/model', ['key', 'model_category', 'id', 'creation_epoch_time_millis'])
              # Here the category is read from the nested 'model' entry.
              model_category = scoring_result['metrics'][0]['model']['model_category']
              self.assertEqual(scoring_result['metrics'][0]['model']['key'], model_key, "Expected model key: " + model_key + " but got: " + scoring_result['metrics'][0]['model']['key'])
              self.assertEqual(scoring_result['metrics'][0]['frame']['key'], frame_key, "Expected frame key: " + frame_key + " but got: " + scoring_result['metrics'][0]['frame']['key'])
              check_category_metrics(scoring_result, model_category)

      print(banner)
      print("Testing /2/Frames with scoring. . .")
      print(banner)
      print("")

      print(banner)
      print("Scoring compatible models for /2/Frames?key=prostate.hex&find_compatible_models=true. . .")
      frames = node.frames(key='prostate.hex', find_compatible_models=1)
      compatible_models = frames['frames']['prostate.hex']['compatible_models']

      for model_key in compatible_models:
          print("Scoring: /2/Frames?key=prostate.hex&score_model=" + model_key)
          scoring_result = node.frames(key='prostate.hex', score_model=model_key)

          self.assertKeysExist(scoring_result, '', ['metrics'])
          self.assertKeysExist(scoring_result, 'metrics[0]', ['model_category'])
          # Here the category is read directly from the metrics entry.
          model_category = scoring_result['metrics'][0]['model_category']
          self.assertKeysExist(scoring_result, 'metrics[0]', ['model', 'frame', 'duration_in_ms'])
          self.assertEqual(scoring_result['metrics'][0]['model']['key'], model_key, "Expected model key: " + model_key + " but got: " + scoring_result['metrics'][0]['model']['key'])
          self.assertEqual(scoring_result['metrics'][0]['frame']['key'], 'prostate.hex', "Expected frame key: " + 'prostate.hex' + " but got: " + scoring_result['metrics'][0]['frame']['key'])
          check_category_metrics(scoring_result, model_category)
  def test_steam(self):
      """Run the Steam client test suite (`make test`) against node 0.

      The node address and port are exported through the STEAM_NODE_ADDR
      and STEAM_NODE_PORT environment variables before make is spawned.
      """
      print("----------------------------------------------------------")
      print(" Testing Steam... ")
      print("----------------------------------------------------------")

      # The client directory sits two levels above this file's directory.
      this_dir = os.path.dirname(os.path.realpath(__file__))
      client_dir = os.path.join(os.path.dirname(os.path.dirname(this_dir)), 'client')

      node0 = h2o.nodes[0]
      os.environ['STEAM_NODE_ADDR'] = node0.http_addr
      os.environ['STEAM_NODE_PORT'] = str(node0.port)

      # Run `make test -C path_to_h2o/client`.
      # Build the argument list directly: splitting a command string on
      # whitespace would break a client path that contains spaces.
      command = ['make', 'test', '-C', client_dir]

      # However, when `make test` fails, h2o.spawn_wait() fails hard without an exit code.
      # Further, if this is trapped in a try/except, the failed tests are not routed to stdout.
      (ps, outpath, errpath) = h2o.spawn_cmd('steam_tests', command)
      h2o.spawn_wait(ps, outpath, errpath, timeout=1000)

      print("----------------------------------------------------------")
      print(" Steam tests completed successfully! ")
      print("----------------------------------------------------------")
  # Script entry point: hand control to h2o's unittest runner.
  if __name__ == '__main__':
      h2o.unit_main()