
/tests/python/test_basic.py

https://gitlab.com/github-cloud-corporation/xgboost
# -*- coding: utf-8 -*-
import numpy as np
import xgboost as xgb
import unittest

dpath = 'demo/data/'
rng = np.random.RandomState(1994)
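
# (Editorial note, not in the original file) dpath is a relative path, so the
# tests below assume they are run from the repository root, where demo/data/
# holds the agaricus mushroom dataset used throughout.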

class TestBasic(unittest.TestCase):

    def test_basic(self):
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {'max_depth': 2, 'eta': 1, 'silent': 1,
                 'objective': 'binary:logistic'}
        # specify a validation set to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 2
        bst = xgb.train(param, dtrain, num_round, watchlist)
        # run prediction
        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1
        # save dmatrix into binary buffer
        dtest.save_binary('dtest.buffer')
        # save model
        bst.save_model('xgb.model')
        # load model and data back in
        bst2 = xgb.Booster(model_file='xgb.model')
        dtest2 = xgb.DMatrix('dtest.buffer')
        preds2 = bst2.predict(dtest2)
        # assert they are the same
        assert np.sum(np.abs(preds2 - preds)) == 0
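
    # (Editorial sketch, not part of the original file) the loop-based error
    # rate above could equivalently be computed with vectorised numpy:
    #     err = float(np.sum((preds > 0.5).astype(int) != labels)) / len(preds)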

    def test_record_results(self):
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {'max_depth': 2, 'eta': 1, 'silent': 1,
                 'objective': 'binary:logistic'}
        # specify a validation set to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 2
        result = {}
        res2 = {}
        xgb.train(param, dtrain, num_round, watchlist,
                  callbacks=[xgb.callback.record_evaluation(result)])
        xgb.train(param, dtrain, num_round, watchlist,
                  evals_result=res2)
        assert result['train']['error'][0] < 0.1
        assert res2 == result
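
    # (Editorial note, not in the original file) both the record_evaluation
    # callback and the evals_result argument fill a nested dict of the form
    # {'eval': {'error': [...]}, 'train': {'error': [...]}}, appending one
    # metric value per boosting round, which is why the two dicts compare equal.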

    def test_multiclass(self):
        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'num_class': 2}
        # specify a validation set to watch performance
        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 2
        bst = xgb.train(param, dtrain, num_round, watchlist)
        # run prediction
        preds = bst.predict(dtest)
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds))
                  if preds[i] != labels[i]) / float(len(preds))
        # error must be smaller than 10%
        assert err < 0.1
        # save dmatrix into binary buffer
        dtest.save_binary('dtest.buffer')
        # save model
        bst.save_model('xgb.model')
        # load model and data back in
        bst2 = xgb.Booster(model_file='xgb.model')
        dtest2 = xgb.DMatrix('dtest.buffer')
        preds2 = bst2.predict(dtest2)
        # assert they are the same
        assert np.sum(np.abs(preds2 - preds)) == 0
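
    # (Editorial note, not in the original file) unlike test_basic, the
    # predictions here are compared to the labels directly, i.e. the
    # multi-class setup yields class indices rather than probabilities
    # that need thresholding.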

    def test_dmatrix_init(self):
        data = np.random.randn(5, 5)
        # wrong number of feature names
        self.assertRaises(ValueError, xgb.DMatrix, data,
                          feature_names=list('abcdef'))
        # contains duplicates
        self.assertRaises(ValueError, xgb.DMatrix, data,
                          feature_names=['a', 'b', 'c', 'd', 'd'])
        # contains a forbidden symbol
        self.assertRaises(ValueError, xgb.DMatrix, data,
                          feature_names=['a', 'b', 'c', 'd', 'e<1'])

        dm = xgb.DMatrix(data)
        dm.feature_names = list('abcde')
        assert dm.feature_names == list('abcde')

        dm.feature_types = 'q'
        assert dm.feature_types == list('qqqqq')
        dm.feature_types = list('qiqiq')
        assert dm.feature_types == list('qiqiq')

        def incorrect_type_set():
            dm.feature_types = list('abcde')

        self.assertRaises(ValueError, incorrect_type_set)

        # reset to the default generated names and types
        dm.feature_names = None
        self.assertEqual(dm.feature_names, ['f0', 'f1', 'f2', 'f3', 'f4'])
        assert dm.feature_types is None
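
    # (Editorial note, not in the original file) the type codes exercised above
    # are 'q' (quantitative) and 'i' (indicator); assigning a single string
    # broadcasts it to every column, hence 'q' becoming ['q', 'q', 'q', 'q', 'q'].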

    def test_feature_names(self):
        data = np.random.randn(100, 5)
        target = np.array([0, 1] * 50)
        cases = [['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'],
                 [u'要因1', u'要因2', u'要因3', u'要因4', u'要因5']]
        for features in cases:
            dm = xgb.DMatrix(data, label=target,
                             feature_names=features)
            assert dm.feature_names == features
            assert dm.num_row() == 100
            assert dm.num_col() == 5

            params = {'objective': 'multi:softprob',
                      'eval_metric': 'mlogloss',
                      'eta': 0.3,
                      'num_class': 3}
            bst = xgb.train(params, dm, num_boost_round=10)
            scores = bst.get_fscore()
            assert list(sorted(k for k in scores)) == features

            dummy = np.random.randn(5, 5)
            dm = xgb.DMatrix(dummy, feature_names=features)
            bst.predict(dm)

            # a different set of feature names must raise an error
            dm = xgb.DMatrix(dummy, feature_names=list('abcde'))
            self.assertRaises(ValueError, bst.predict, dm)
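
    # (Editorial note, not in the original file) Booster.predict() validates
    # that the DMatrix carries the same feature names that were seen at
    # training time, which is what the final assertRaises checks.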

    def test_feature_importances(self):
        data = np.random.randn(100, 5)
        target = np.array([0, 1] * 50)
        features = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5']
        dm = xgb.DMatrix(data, label=target,
                         feature_names=features)
        params = {'objective': 'multi:softprob',
                  'eval_metric': 'mlogloss',
                  'eta': 0.3,
                  'num_class': 3}
        bst = xgb.train(params, dm, num_boost_round=10)

        # number of feature importances should equal number of features
        scores1 = bst.get_score()
        scores2 = bst.get_score(importance_type='weight')
        scores3 = bst.get_score(importance_type='cover')
        scores4 = bst.get_score(importance_type='gain')
        assert len(scores1) == len(features)
        assert len(scores2) == len(features)
        assert len(scores3) == len(features)
        assert len(scores4) == len(features)

        # check backwards compatibility of get_fscore
        fscores = bst.get_fscore()
        assert scores1 == fscores
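
    # (Editorial note, not in the original file) 'weight' counts how many splits
    # use a feature, 'gain' averages the loss reduction from those splits, and
    # 'cover' averages the number of samples they affect; get_fscore() is the
    # legacy spelling of get_score(importance_type='weight').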

    def test_load_file_invalid(self):
        # loading a model from a nonexistent path must raise XGBoostError,
        # including when the path contains non-ASCII characters
        self.assertRaises(xgb.core.XGBoostError, xgb.Booster,
                          model_file='incorrect_path')
        self.assertRaises(xgb.core.XGBoostError, xgb.Booster,
                          model_file=u'不正なパス')

    def test_dmatrix_numpy_init(self):
        data = np.random.randn(5, 5)
        dm = xgb.DMatrix(data)
        assert dm.num_row() == 5
        assert dm.num_col() == 5

        data = np.matrix([[1, 2], [3, 4]])
        dm = xgb.DMatrix(data)
        assert dm.num_row() == 2
        assert dm.num_col() == 2

        # 0d array
        self.assertRaises(ValueError, xgb.DMatrix, np.array(1))
        # 1d array
        self.assertRaises(ValueError, xgb.DMatrix, np.array([1, 2, 3]))
        # 3d array
        data = np.random.randn(5, 5, 5)
        self.assertRaises(ValueError, xgb.DMatrix, data)
        # object dtype
        data = np.array([['a', 'b'], ['c', 'd']])
        self.assertRaises(ValueError, xgb.DMatrix, data)

    def test_cv(self):
        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
        params = {'max_depth': 2, 'eta': 1, 'silent': 1,
                  'objective': 'binary:logistic'}

        # with as_pandas=False, cv returns a plain dict keyed by
        # '{train,test}-error-{mean,std}', hence the four entries
        cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False)
        assert isinstance(cv, dict)
        assert len(cv) == 4
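
For reference, the suite above can be run with any unittest-compatible runner (for example, python -m pytest tests/python/test_basic.py, or python -m unittest discover tests/python) from the repository root so that the relative demo/data/ paths resolve. The standalone sketch below reproduces the train / predict / save / reload flow that test_basic exercises; it is an illustration under the same assumptions (xgboost installed, commands issued from the repository root), not code taken from the repository.

# standalone sketch of the round trip exercised by test_basic
import numpy as np
import xgboost as xgb

dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
dtest = xgb.DMatrix('demo/data/agaricus.txt.test')
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}

# train for two rounds while watching both splits
bst = xgb.train(param, dtrain, num_boost_round=2,
                evals=[(dtest, 'eval'), (dtrain, 'train')])

preds = bst.predict(dtest)
labels = dtest.get_label()
print('error:', float(np.sum((preds > 0.5).astype(int) != labels)) / len(preds))

# persist and reload, then confirm the predictions survive the round trip
bst.save_model('xgb.model')
bst2 = xgb.Booster(model_file='xgb.model')
assert np.allclose(bst2.predict(dtest), preds)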