PageRenderTime 59ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/py/h2o_glm.py

https://github.com/dbfree/h2o
Python | 547 lines | 335 code | 87 blank | 125 comment | 110 complexity | cd4cdde076d74ca47665b3c6fc5a2b0c MD5 | raw file
Possible License(s): Apache-2.0
  1. import h2o_cmd, h2o, h2o_util, h2o_gbm
  2. import re, random, math
  3. def pickRandGlmParams(paramDict, params):
  4. colX = 0
  5. randomGroupSize = random.randint(1,len(paramDict))
  6. for i in range(randomGroupSize):
  7. randomKey = random.choice(paramDict.keys())
  8. randomV = paramDict[randomKey]
  9. randomValue = random.choice(randomV)
  10. params[randomKey] = randomValue
  11. if (randomKey=='x'):
  12. colX = randomValue
  13. # force legal family/ink combos
  14. if 'family' in params and 'link' in params:
  15. if params['family'] is not None:
  16. if params['family'] == 'poisson':
  17. if params['link'] is not None and params['link'] not in ('identity', 'log', 'inverse', 'familyDefault'):
  18. params['link'] = None
  19. # only tweedie/tweedie is legal?
  20. if params['family'] == 'tweedie':
  21. if params['link'] is not None and params['link'] not in ('tweedie'):
  22. params['link'] = None
  23. if params['family'] == 'binomial':
  24. if params['link'] is not None and params['link'] not in ('logit', 'identity', 'log', 'inverse', 'familyDefault'):
  25. params['link'] = None
  26. if params['family'] == 'gaussian':
  27. if params['link'] is not None and params['link'] not in ('logit', 'identity', 'log', 'inverse', 'familyDefault'):
  28. params['link'] = None
  29. # case only used if binomial? binomial is default if no family
  30. # update: apparently case and case_mode always affect things
  31. # make sure the combo of case and case_mode makes sense
  32. # there needs to be some entries in both effective cases
  33. if ('case_mode' in params):
  34. if ('case' not in params) or (params['case'] is None):
  35. params['case'] = 1
  36. else:
  37. maxCase = max(paramDict['case'])
  38. minCase = min(paramDict['case'])
  39. if params['case_mode']=="<" and params['case']==minCase:
  40. params['case'] += 1
  41. elif params['case_mode']==">" and params['case']==maxCase:
  42. params['case'] -= 1
  43. elif params['case_mode']==">=" and params['case']==minCase:
  44. params['case'] += 1
  45. elif params['case_mode']=="<=" and params['case']==maxCase:
  46. params['case'] -= 1
  47. return colX
  48. def simpleCheckGLMScore(self, glmScore, family='gaussian', allowFailWarning=False, **kwargs):
  49. warnings = None
  50. if 'warnings' in glmScore:
  51. warnings = glmScore['warnings']
  52. # stop on failed
  53. x = re.compile("failed", re.IGNORECASE)
  54. # don't stop if fail to converge
  55. c = re.compile("converge", re.IGNORECASE)
  56. for w in warnings:
  57. print "\nwarning:", w
  58. if re.search(x,w) and not allowFailWarning:
  59. if re.search(c,w):
  60. # ignore the fail to converge warning now
  61. pass
  62. else:
  63. # stop on other 'fail' warnings (are there any? fail to solve?
  64. raise Exception(w)
  65. validation = glmScore['validation']
  66. validation['err'] = h2o_util.cleanseInfNan(validation['err'])
  67. validation['nullDev'] = h2o_util.cleanseInfNan(validation['nullDev'])
  68. validation['resDev'] = h2o_util.cleanseInfNan(validation['resDev'])
  69. print "%15s %s" % ("err:\t", validation['err'])
  70. print "%15s %s" % ("nullDev:\t", validation['nullDev'])
  71. print "%15s %s" % ("resDev:\t", validation['resDev'])
  72. # threshold only there if binomial?
  73. # auc only for binomial
  74. if family=="binomial":
  75. print "%15s %s" % ("auc:\t", validation['auc'])
  76. print "%15s %s" % ("threshold:\t", validation['threshold'])
  77. err = False
  78. if family=="poisson" or family=="gaussian":
  79. if 'aic' not in validation:
  80. print "aic is missing from the glm json response"
  81. err = True
  82. if math.isnan(validation['err']):
  83. print "Why is this err = 'nan'?? %6s %s" % ("err:\t", validation['err'])
  84. err = True
  85. if math.isnan(validation['resDev']):
  86. print "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev'])
  87. err = True
  88. if err:
  89. raise Exception ("How am I supposed to tell that any of these errors should be ignored?")
  90. # legal?
  91. if math.isnan(validation['nullDev']):
  92. ## emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", validation['nullDev'])
  93. ## raise Exception(emsg)
  94. pass
  95. def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False,
  96. prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, **kwargs):
  97. # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
  98. # h2o GLM will verboseprint the result and print errors.
  99. # so don't have to do that
  100. # different when cross validation is used? No trainingErrorDetails?
  101. GLMModel = glm['glm_model']
  102. if not GLMModel:
  103. raise Exception("GLMModel didn't exist in the glm response? %s" % h2o.dump_json(glm))
  104. warnings = None
  105. if 'warnings' in GLMModel and GLMModel['warnings']:
  106. warnings = GLMModel['warnings']
  107. # stop on failed
  108. x = re.compile("failed", re.IGNORECASE)
  109. # don't stop if fail to converge
  110. c = re.compile("converge", re.IGNORECASE)
  111. for w in warnings:
  112. print "\nwarning:", w
  113. if re.search(x,w) and not allowFailWarning:
  114. if re.search(c,w):
  115. # ignore the fail to converge warning now
  116. pass
  117. else:
  118. # stop on other 'fail' warnings (are there any? fail to solve?
  119. raise Exception(w)
  120. # for key, value in glm.iteritems(): print key
  121. # not in GLMGrid?
  122. # FIX! don't get GLMParams if it can't solve?
  123. GLMParams = GLMModel['glm']
  124. family = GLMParams["family"]
  125. # number of submodels = number of lambda
  126. # min of 2. lambda_max is first
  127. submodels = GLMModel['submodels']
  128. # since all our tests?? only use one lambda, the best_lamda_idx should = 1
  129. best_lambda_idx = GLMModel['best_lambda_idx']
  130. print "best_lambda_idx:", best_lambda_idx
  131. lambda_max = GLMModel['lambda_max']
  132. print "lambda_max:", lambda_max
  133. # currently lambda_max is not set by tomas. ..i.e.not valid
  134. if 1==0 and (lambda_max <= submodels[best_lambda_idx].lambda_value):
  135. raise Exception("lambda_max %s should always be > the lambda result %s we're checking" % (lambda_max, submodels[best_lambda_idx].lambda_value))
  136. # submodels0 = submodels[0]
  137. # submodels1 = submodels[-1] # hackery to make it work when there's just one
  138. if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
  139. raise Exception("best_lambda_idx: %s should point to one of lambdas (which has len %s)" % (best_lambda_idx, len(submodels)))
  140. if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
  141. raise Exception("best_lambda_idx: %s should point to one of submodels (which has len %s)" % (best_lambda_idx, len(submodels)))
  142. submodels1 = submodels[best_lambda_idx] # hackery to make it work when there's just one
  143. iterations = submodels1['iteration']
  144. print "GLMModel/iterations:", iterations
  145. # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
  146. if maxExpectedIterations is not None and iterations > maxExpectedIterations:
  147. raise Exception("Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations) )
  148. if 'validation' not in submodels1:
  149. raise Exception("Should be a 'validation' key in submodels1: %s" % h2o.dump_json(submodels1))
  150. validationsList = submodels1['validation']
  151. validations = validationsList
  152. # xval. compare what we asked for and what we got.
  153. n_folds = kwargs.setdefault('n_folds', None)
  154. print "GLMModel/validations"
  155. validations['null_deviance'] = h2o_util.cleanseInfNan(validations['null_deviance'])
  156. validations['residual_deviance'] = h2o_util.cleanseInfNan(validations['residual_deviance'])
  157. print "%15s %s" % ("null_deviance:\t", validations['null_deviance'])
  158. print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance'])
  159. # threshold only there if binomial?
  160. # auc only for binomial
  161. if family=="binomial":
  162. print "%15s %s" % ("auc:\t", validations['auc'])
  163. best_threshold = validations['best_threshold']
  164. thresholds = validations['thresholds']
  165. print "%15s %s" % ("best_threshold:\t", best_threshold)
  166. # have to look up the index for the cm, from the thresholds list
  167. best_index = None
  168. # FIX! best_threshold isn't necessarily in the list. jump out if >=
  169. for i,t in enumerate(thresholds):
  170. if t >= best_threshold: # ends up using next one if not present
  171. best_index = i
  172. break
  173. assert best_index!=None, "%s %s" % (best_threshold, thresholds)
  174. print "Now printing the right 'best_threshold' %s from '_cms" % best_threshold
  175. # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]
  176. submodels = glm['glm_model']['submodels']
  177. cms = submodels[0]['validation']['_cms']
  178. assert best_index<len(cms), "%s %s" % (best_index, len(cms))
  179. # if we want 0.5..rounds to int
  180. # mid = len(cms)/2
  181. # cm = cms[mid]
  182. cm = cms[best_index]
  183. print "cm:", h2o.dump_json(cm['_arr'])
  184. predErr = cm['_predErr']
  185. classErr = cm['_classErr']
  186. # compare to predErr
  187. pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);
  188. print "predErr:", predErr
  189. print "calculated pctWrong from cm:", pctWrong
  190. print "classErr:", classErr
  191. # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")
  192. print "\nTrain\n==========\n"
  193. print h2o_gbm.pp_cm(cm['_arr'])
  194. if family=="poisson" or family=="gaussian":
  195. print "%15s %s" % ("aic:\t", validations['aic'])
  196. coefficients_names = GLMModel['coefficients_names']
  197. # print "coefficients_names:", coefficients_names
  198. idxs = submodels1['idxs']
  199. print "idxs:", idxs
  200. coefficients_names = coefficients_names
  201. # always check both normalized and normal coefficients
  202. norm_beta = submodels1['norm_beta']
  203. # if norm_beta and len(coefficients_names)!=len(norm_beta):
  204. # print len(coefficients_names), len(norm_beta)
  205. # raise Exception("coefficients_names and normalized_norm_beta from h2o json not same length. coefficients_names: %s normalized_norm_beta: %s" % (coefficients_names, norm_beta))
  206. #
  207. beta = submodels1['beta']
  208. # print "beta:", beta
  209. # if len(coefficients_names)!=len(beta):
  210. # print len(coefficients_names), len(beta)
  211. # raise Exception("coefficients_names and beta from h2o json not same length. coefficients_names: %s beta: %s" % (coefficients_names, beta))
  212. # test wants to use normalized?
  213. if doNormalized:
  214. beta_used = norm_beta
  215. else:
  216. beta_used = beta
  217. coefficients = {}
  218. # create a dictionary with name, beta (including intercept) just like v1
  219. for i,b in zip(idxs, beta_used[:-1]):
  220. name = coefficients_names[i]
  221. coefficients[name] = b
  222. print "len(idxs)", len(idxs), "len(beta_used)", len(beta_used)
  223. print "coefficients:", coefficients
  224. print "beta:", beta
  225. print "norm_beta:", norm_beta
  226. coefficients['Intercept'] = beta_used[-1]
  227. print "len(coefficients_names)", len(coefficients_names)
  228. print "len(idxs)", len(idxs)
  229. print "idxs[-1]", idxs[-1]
  230. print "intercept demapping info:", \
  231. "coefficients_names[-i]:", coefficients_names[-1], \
  232. "idxs[-1]:", idxs[-1], \
  233. "coefficients_names[idxs[-1]]:", coefficients_names[idxs[-1]], \
  234. "beta_used[-1]:", beta_used[-1], \
  235. "coefficients['Intercept']", coefficients['Intercept']
  236. # last one is intercept
  237. interceptName = coefficients_names[idxs[-1]]
  238. if interceptName != "Intercept" or abs(beta_used[-1])<1e-26:
  239. raise Exception("'Intercept' should be last in coefficients_names and beta %s %s %s" %\
  240. (idxs[-1], beta_used[-1], "-"+interceptName+"-"))
  241. # idxs has the order for non-zero coefficients, it's shorter than beta_used and coefficients_names
  242. # new 5/28/14. glm can point to zero coefficients
  243. # for i in idxs:
  244. # if beta_used[i]==0.0:
  245. ## raise Exception("idxs shouldn't point to any 0 coefficients i: %s %s:" % (i, beta_used[i]))
  246. if len(idxs) > len(beta_used):
  247. raise Exception("idxs shouldn't be longer than beta_used %s %s" % (len(idxs), len(beta_used)))
  248. intercept = coefficients.pop('Intercept', None)
  249. # intercept demapping info: idxs[-1]: 54 coefficients_names[[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099
  250. # the last one shoudl be 'Intercept' ?
  251. coefficients_names.pop()
  252. # have to skip the output col! get it from kwargs
  253. # better always be there!
  254. y = kwargs['response']
  255. # the dict keys are column headers if they exist...how to order those? new: use the 'coefficients_names'
  256. # from the response
  257. # Tomas created 'coefficients_names which is the coefficient list in order.
  258. # Just use it to index coefficients! works for header or no-header cases
  259. # I guess now we won't print the "None" cases for dropped columns (constant columns!)
  260. # Because Tomas doesn't get everything in 'coefficients_names' if dropped by GLMQuery before
  261. # he gets it?
  262. def add_to_coefficient_list_and_string(c, cList, cString):
  263. if c in coefficients:
  264. cValue = coefficients[c]
  265. cValueString = "%s: %.5e " % (c, cValue)
  266. else:
  267. print "Warning: didn't see '" + c + "' in json coefficient response.",\
  268. "Inserting 'None' with assumption it was dropped due to constant column)"
  269. cValue = None
  270. cValueString = "%s: %s " % (c, cValue)
  271. cList.append(cValue)
  272. # we put each on newline for easy comparison to R..otherwise keep condensed
  273. if prettyPrint:
  274. cValueString = "H2O coefficient " + cValueString + "\n"
  275. # not mutable?
  276. return cString + cValueString
  277. # creating both a string for printing and a list of values
  278. cString = ""
  279. cList = []
  280. # print in order using col_names
  281. # coefficients_names is input only now..same for header or no header, or expanded enums
  282. for c in coefficients_names:
  283. cString = add_to_coefficient_list_and_string(c, cList, cString)
  284. if prettyPrint:
  285. print "\nH2O intercept:\t\t%.5e" % intercept
  286. print cString
  287. else:
  288. if not noPrint:
  289. print "\nintercept:", intercept, cString
  290. print "\nTotal # of coefficients:", len(coefficients_names)
  291. # pick out the coefficent for the column we enabled for enhanced checking. Can be None.
  292. # FIX! temporary hack to deal with disappearing/renaming columns in GLM
  293. if (not allowZeroCoeff) and (colX is not None):
  294. absXCoeff = abs(float(coefficients[str(colX)]))
  295. self.assertGreater(absXCoeff, 1e-26, (
  296. "abs. value of GLM coefficients['" + str(colX) + "'] is " +
  297. str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX)
  298. ))
  299. # intercept is buried in there too
  300. absIntercept = abs(float(intercept))
  301. self.assertGreater(absIntercept, 1e-26, (
  302. "abs. value of GLM coefficients['Intercept'] is " +
  303. str(absIntercept) + ", not >= 1e-26 for Intercept"
  304. ))
  305. # this is good if we just want min or max
  306. # maxCoeff = max(coefficients, key=coefficients.get)
  307. # for more, just invert the dictionary and ...
  308. if (len(coefficients)>0):
  309. maxKey = max([(abs(coefficients[x]),x) for x in coefficients])[1]
  310. print "H2O Largest abs. coefficient value:", maxKey, coefficients[maxKey]
  311. minKey = min([(abs(coefficients[x]),x) for x in coefficients])[1]
  312. print "H2O Smallest abs. coefficient value:", minKey, coefficients[minKey]
  313. else:
  314. print "Warning, no coefficients returned. Must be intercept only?"
  315. # many of the GLM tests aren't single column though.
  316. # quick and dirty check: if all the coefficients are zero,
  317. # something is broken
  318. # intercept is in there too, but this will get it okay
  319. # just sum the abs value up..look for greater than 0
  320. # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff?
  321. if (not allowZeroCoeff) and (len(coefficients)>1):
  322. s = 0.0
  323. for c in coefficients:
  324. v = coefficients[c]
  325. s += abs(float(v))
  326. self.assertGreater(s, 1e-26, (
  327. "sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26"
  328. ))
  329. print "submodels1, run_time (milliseconds):", submodels1['run_time']
  330. # shouldn't have any errors
  331. h2o.check_sandbox_for_errors()
  332. return (warnings, cList, intercept)
  333. # compare this glm to last one. since the files are concatenations,
  334. # the results should be similar? 10% of first is allowed delta
  335. def compareToFirstGlm(self, key, glm, firstglm):
  336. # if isinstance(firstglm[key], list):
  337. # in case it's not a list allready (err is a list)
  338. h2o.verboseprint("compareToFirstGlm key:", key)
  339. h2o.verboseprint("compareToFirstGlm glm[key]:", glm[key])
  340. # key could be a list or not. if a list, don't want to create list of that list
  341. # so use extend on an empty list. covers all cases?
  342. if type(glm[key]) is list:
  343. kList = glm[key]
  344. firstkList = firstglm[key]
  345. elif type(glm[key]) is dict:
  346. raise Exception("compareToFirstGLm: Not expecting dict for " + key)
  347. else:
  348. kList = [glm[key]]
  349. firstkList = [firstglm[key]]
  350. print "kbn:", kList, firstkList
  351. for k, firstk in zip(kList, firstkList):
  352. # delta must be a positive number ?
  353. delta = .1 * abs(float(firstk))
  354. msg = "Too large a delta (" + str(delta) + ") comparing current and first for: " + key
  355. self.assertAlmostEqual(float(k), float(firstk), delta=delta, msg=msg)
  356. self.assertGreaterEqual(abs(float(k)), 0.0, str(k) + " abs not >= 0.0 in current")
  357. def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, **kwargs):
  358. # "grid": {
  359. # "destination_keys": [
  360. # "GLMGridResults__8222a49156af52532a34fb3ce4304308_0",
  361. # "GLMGridResults__8222a49156af52532a34fb3ce4304308_1",
  362. # "GLMGridResults__8222a49156af52532a34fb3ce4304308_2"
  363. # ]
  364. # },
  365. destination_key = glmGridResult['grid']['destination_keys'][0]
  366. inspectGG = h2o.nodes[0].glm_view(destination_key)
  367. models = inspectGG['glm_model']['submodels']
  368. h2o.verboseprint("GLMGrid inspect GLMGrid model 0(best):", h2o.dump_json(models[0]))
  369. g = simpleCheckGLM(self, inspectGG, colX, allowFailWarning=allowFailWarning, **kwargs)
  370. # just to get some save_model testing
  371. for i,m in enumerate(glmGridResult['grid']['destination_keys']):
  372. print "Saving model", m, "to model"+str(i)
  373. h2o.nodes[0].save_model(model=m, path='model'+str(i), force=1)
  374. return g
  375. # This gives me a comma separated x string, for all the columns, with cols with
  376. # missing values, enums, and optionally matching a pattern, removed. useful for GLM
  377. # since it removes rows with any col with NA
  378. # get input from this.
  379. # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
  380. # h2o_cmd.columnInfoFromInspect(parseResult['destination_key',
  381. # exceptionOnMissingValues=False, timeoutSecs=300)
  382. def goodXFromColumnInfo(y,
  383. num_cols=None, missingValuesDict=None, constantValuesDict=None, enumSizeDict=None,
  384. colTypeDict=None, colNameDict=None, keepPattern=None, key=None,
  385. timeoutSecs=120, forRF=False, noPrint=False, returnStringX=True):
  386. y = str(y)
  387. # if we pass a key, means we want to get the info ourselves here
  388. if key is not None:
  389. (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
  390. h2o_cmd.columnInfoFromInspect(key, exceptionOnMissingValues=False,
  391. max_column_display=99999999, timeoutSecs=timeoutSecs)
  392. num_cols = len(colNameDict)
  393. # now remove any whose names don't match the required keepPattern
  394. if keepPattern is not None:
  395. keepX = re.compile(keepPattern)
  396. else:
  397. keepX = None
  398. x = range(num_cols)
  399. # need to walk over a copy, cause we change x
  400. xOrig = x[:]
  401. ignore_x = [] # for use by RF
  402. for k in xOrig:
  403. name = colNameDict[k]
  404. # remove it if it has the same name as the y output
  405. if str(k)== y: # if they pass the col index as y
  406. if not noPrint:
  407. print "Removing %d because name: %s matches output %s" % (k, str(k), y)
  408. x.remove(k)
  409. # rf doesn't want it in ignore list
  410. # ignore_x.append(k)
  411. elif name == y: # if they pass the name as y
  412. if not noPrint:
  413. print "Removing %d because name: %s matches output %s" % (k, name, y)
  414. x.remove(k)
  415. # rf doesn't want it in ignore list
  416. # ignore_x.append(k)
  417. elif keepX is not None and not keepX.match(name):
  418. if not noPrint:
  419. print "Removing %d because name: %s doesn't match desired keepPattern %s" % (k, name, keepPattern)
  420. x.remove(k)
  421. ignore_x.append(k)
  422. # missing values reports as constant also. so do missing first.
  423. # remove all cols with missing values
  424. # could change it against num_rows for a ratio
  425. elif k in missingValuesDict:
  426. value = missingValuesDict[k]
  427. if not noPrint:
  428. print "Removing %d with name: %s because it has %d missing values" % (k, name, value)
  429. x.remove(k)
  430. ignore_x.append(k)
  431. elif k in constantValuesDict:
  432. value = constantValuesDict[k]
  433. if not noPrint:
  434. print "Removing %d with name: %s because it has constant value: %s " % (k, name, str(value))
  435. x.remove(k)
  436. ignore_x.append(k)
  437. # this is extra pruning..
  438. # remove all cols with enums, if not already removed
  439. elif k in enumSizeDict:
  440. value = enumSizeDict[k]
  441. if not noPrint:
  442. print "Removing %d %s because it has enums of size: %d" % (k, name, value)
  443. x.remove(k)
  444. ignore_x.append(k)
  445. if not noPrint:
  446. print "x has", len(x), "cols"
  447. print "ignore_x has", len(ignore_x), "cols"
  448. # this is probably used in 'cols" in v2, which can take numbers
  449. if returnStringX:
  450. x = ",".join(map(str, x))
  451. ignore_x = ",".join(map(lambda x: "C" + str(x+1), ignore_x))
  452. if not noPrint:
  453. print "\nx:", x
  454. print "\nignore_x:", ignore_x
  455. if forRF:
  456. return ignore_x
  457. else:
  458. return x