
/tests/python/test_with_sklearn.py

https://gitlab.com/admin-github-cloud/xgboost

import numpy as np
import random
import xgboost as xgb
import testing as tm

rng = np.random.RandomState(1994)
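
# NOTE: these tests target the pre-0.18 scikit-learn API; the modules
# sklearn.cross_validation and sklearn.grid_search imported below were
# later merged into sklearn.model_selection.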


def test_binary_classification():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_digits
    from sklearn.cross_validation import KFold

    digits = load_digits(2)
    y = digits['target']
    X = digits['data']
    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf:
        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        assert err < 0.1


def test_multiclass_classification():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_iris
    from sklearn.cross_validation import KFold

    def check_pred(preds, labels):
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        assert err < 0.4

    iris = load_iris()
    y = iris['target']
    X = iris['data']
    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf:
        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        # test other params in XGBClassifier().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
        labels = y[test_index]
        check_pred(preds, labels)
        check_pred(preds2, labels)
        check_pred(preds3, labels)
        check_pred(preds4, labels)
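
# In the predict() calls above, ntree_limit=3 restricts prediction to the
# first three boosting rounds and ntree_limit=0 uses all trees, while
# output_margin=True returns the raw untransformed scores; the loose 0.4
# error bound presumably accommodates those truncated/raw predictions.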


def test_feature_importances():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_digits

    digits = load_digits(2)
    y = digits['target']
    X = digits['data']
    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
    exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0.,
                    0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0.,
                    0., 0., 0., 0.00833333, 0.25833333, 0., 0., 0., 0.,
                    0.03333334, 0.03333334, 0., 0.32499999, 0., 0., 0., 0.,
                    0.05, 0.06666667, 0., 0., 0., 0., 0., 0., 0., 0.04166667,
                    0., 0., 0., 0., 0., 0., 0., 0.00833333, 0., 0., 0., 0.,
                    0.], dtype=np.float32)
    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)

    # numeric columns
    import pandas as pd
    y = pd.Series(digits['target'])
    X = pd.DataFrame(digits['data'])
    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)

    # string columns, the feature order must be kept
    chars = list('abcdefghijklmnopqrstuvwxyz')
    X.columns = ["".join(random.sample(chars, 5)) for x in range(64)]
    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
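
# The expected importances sum to 1: in this version of the sklearn wrapper,
# feature_importances_ appears to be each feature's split count normalized by
# the total number of splits, so it should be invariant to how the columns
# are labelled -- which is exactly what the two pandas variants verify.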


def test_boston_housing_regression():
    tm._skip_if_no_sklearn()
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.cross_validation import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf:
        xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        # test other params in XGBRegressor().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
        labels = y[test_index]
        assert mean_squared_error(preds, labels) < 25
        assert mean_squared_error(preds2, labels) < 350
        assert mean_squared_error(preds3, labels) < 25
        assert mean_squared_error(preds4, labels) < 350
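
# The looser 350 MSE bound applies to the ntree_limit=3 predictions: with
# only 3 of the default 100 boosting rounds the model is far from converged,
# so it cannot meet the < 25 threshold asserted for the full ensemble.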


def test_parameter_tuning():
    tm._skip_if_no_sklearn()
    from sklearn.grid_search import GridSearchCV
    from sklearn.datasets import load_boston

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    xgb_model = xgb.XGBRegressor()
    clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                                   'n_estimators': [50, 100, 200]}, verbose=1)
    clf.fit(X, y)
    assert clf.best_score_ < 0.7
    assert clf.best_params_ == {'n_estimators': 100, 'max_depth': 4}
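
# GridSearchCV can drive XGBRegressor directly because the sklearn wrapper
# implements the standard estimator interface (get_params/set_params/fit/
# predict); with no explicit scoring, best_score_ should be the regressor's
# default score, i.e. R^2.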


def test_regression_with_custom_objective():
    tm._skip_if_no_sklearn()
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.cross_validation import KFold

    def objective_ls(y_true, y_pred):
        # Least-squares objective: for L = (y_pred - y_true)**2 / 2 the
        # first derivative is (y_pred - y_true) and the second is 1.
        grad = (y_pred - y_true)
        hess = np.ones(len(y_true))
        return grad, hess

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf:
        xgb_model = xgb.XGBRegressor(objective=objective_ls).fit(
            X[train_index], y[train_index]
        )
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        assert mean_squared_error(preds, labels) < 25

    # Test that the custom objective function is actually used
    class XGBCustomObjectiveException(Exception):
        pass

    def dummy_objective(y_true, y_pred):
        raise XGBCustomObjectiveException()

    xgb_model = xgb.XGBRegressor(objective=dummy_objective)
    np.testing.assert_raises(XGBCustomObjectiveException, xgb_model.fit, X, y)


def test_classification_with_custom_objective():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_digits
    from sklearn.cross_validation import KFold

    def logregobj(y_true, y_pred):
        # Logistic objective: map the raw margin through the sigmoid to get
        # p; for L = -[y*log(p) + (1-y)*log(1-p)] the gradient w.r.t. the
        # margin is (p - y) and the hessian is p * (1 - p).
        y_pred = 1.0 / (1.0 + np.exp(-y_pred))
        grad = y_pred - y_true
        hess = y_pred * (1.0 - y_pred)
        return grad, hess

    digits = load_digits(2)
    y = digits['target']
    X = digits['data']
    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf:
        xgb_model = xgb.XGBClassifier(objective=logregobj)
        xgb_model.fit(X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        assert err < 0.1

    # Test that the custom objective function is actually used
    class XGBCustomObjectiveException(Exception):
        pass

    def dummy_objective(y_true, y_preds):
        raise XGBCustomObjectiveException()

    xgb_model = xgb.XGBClassifier(objective=dummy_objective)
    np.testing.assert_raises(
        XGBCustomObjectiveException,
        xgb_model.fit,
        X, y
    )


def test_sklearn_api():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_iris
    from sklearn.cross_validation import train_test_split

    iris = load_iris()
    tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target,
                                              train_size=120)
    classifier = xgb.XGBClassifier()
    classifier.fit(tr_d, tr_l)
    preds = classifier.predict(te_d)
    labels = te_l
    # float() guards against integer division under Python 2
    err = sum(1 for p, l in zip(preds, labels) if p != l) / float(len(te_l))
    assert err < 0.2


def test_sklearn_plotting():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_iris

    iris = load_iris()
    classifier = xgb.XGBClassifier()
    classifier.fit(iris.data, iris.target)

    import matplotlib
    matplotlib.use('Agg')
    from matplotlib.axes import Axes
    from graphviz import Digraph

    ax = xgb.plot_importance(classifier)
    assert isinstance(ax, Axes)
    assert ax.get_title() == 'Feature importance'
    assert ax.get_xlabel() == 'F score'
    assert ax.get_ylabel() == 'Features'
    assert len(ax.patches) == 4

    g = xgb.to_graphviz(classifier, num_trees=0)
    assert isinstance(g, Digraph)

    ax = xgb.plot_tree(classifier, num_trees=0)
    assert isinstance(ax, Axes)
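
# matplotlib.use('Agg') selects a non-interactive backend so the plotting
# calls work on a headless test machine; it must run before pyplot is first
# imported, hence the deferred matplotlib imports above.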


def test_sklearn_nfolds_cv():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_digits
    from sklearn.cross_validation import StratifiedKFold

    digits = load_digits(3)
    X = digits['data']
    y = digits['target']
    dm = xgb.DMatrix(X, label=y)
    params = {
        'max_depth': 2,
        'eta': 1,
        'silent': 1,
        'objective': 'multi:softprob',
        'num_class': 3
    }
    seed = 2016
    nfolds = 5
    skf = StratifiedKFold(y, n_folds=nfolds, shuffle=True, random_state=seed)
    cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed)
    cv2 = xgb.cv(params, dm, num_boost_round=10, folds=skf, seed=seed)
    cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, stratified=True, seed=seed)
    assert cv1.shape[0] == cv2.shape[0] and cv2.shape[0] == cv3.shape[0]
    assert cv2.iloc[-1, 0] == cv3.iloc[-1, 0]
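
# cv2 passes an explicit StratifiedKFold while cv3 lets xgb.cv stratify
# internally; with the same seed the two should produce identical folds,
# which is why their final-round metrics are asserted to be equal.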


def test_split_value_histograms():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_digits

    digits_2class = load_digits(2)
    X = digits_2class['data']
    y = digits_2class['target']
    dm = xgb.DMatrix(X, label=y)
    params = {'max_depth': 6, 'eta': 0.01, 'silent': 1,
              'objective': 'binary:logistic'}
    gbdt = xgb.train(params, dm, num_boost_round=10)
    assert gbdt.get_split_value_histogram("not_there", as_pandas=True).shape[0] == 0
    assert gbdt.get_split_value_histogram("not_there", as_pandas=False).shape[0] == 0
    assert gbdt.get_split_value_histogram("f28", bins=0).shape[0] == 1
    assert gbdt.get_split_value_histogram("f28", bins=1).shape[0] == 1
    assert gbdt.get_split_value_histogram("f28", bins=2).shape[0] == 2
    assert gbdt.get_split_value_histogram("f28", bins=5).shape[0] == 2
    assert gbdt.get_split_value_histogram("f28", bins=None).shape[0] == 2
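
# An unknown feature name yields an empty histogram, and requesting more
# bins than there are distinct split values cannot add rows: feature f28
# evidently splits on only two distinct values in this model, so the
# histogram tops out at two rows however many bins are requested.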