PageRenderTime 31ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/tests/python/test_with_pandas.py

https://gitlab.com/github-cloud-corporation/xgboost
Python | 176 lines | 134 code | 31 blank | 11 comment | 2 complexity | 48d89a3b200a50985bbcdb5422216a4c MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. import numpy as np
  3. import xgboost as xgb
  4. import testing as tm
  5. import unittest
# pandas is an optional dependency: a failed import is swallowed here so the
# module can still be loaded without it.
try:
    import pandas as pd
except ImportError:
    pass

# NOTE(review): presumably aborts/skips this module when pandas is missing
# (helper lives in the local `testing` module) -- confirm against testing.py.
tm._skip_if_no_pandas()

# Relative directory prefix for the demo data files loaded by the tests below.
dpath = 'demo/data/'

# NumPy random state with a fixed seed (1994).
rng = np.random.RandomState(1994)
  13. class TestPandas(unittest.TestCase):
  14. def test_pandas(self):
  15. df = pd.DataFrame([[1, 2., True], [2, 3., False]], columns=['a', 'b', 'c'])
  16. dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
  17. assert dm.feature_names == ['a', 'b', 'c']
  18. assert dm.feature_types == ['int', 'float', 'i']
  19. assert dm.num_row() == 2
  20. assert dm.num_col() == 3
  21. # overwrite feature_names and feature_types
  22. dm = xgb.DMatrix(df, label=pd.Series([1, 2]),
  23. feature_names=['x', 'y', 'z'], feature_types=['q', 'q', 'q'])
  24. assert dm.feature_names == ['x', 'y', 'z']
  25. assert dm.feature_types == ['q', 'q', 'q']
  26. assert dm.num_row() == 2
  27. assert dm.num_col() == 3
  28. # incorrect dtypes
  29. df = pd.DataFrame([[1, 2., 'x'], [2, 3., 'y']], columns=['a', 'b', 'c'])
  30. self.assertRaises(ValueError, xgb.DMatrix, df)
  31. # numeric columns
  32. df = pd.DataFrame([[1, 2., True], [2, 3., False]])
  33. dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
  34. assert dm.feature_names == ['0', '1', '2']
  35. assert dm.feature_types == ['int', 'float', 'i']
  36. assert dm.num_row() == 2
  37. assert dm.num_col() == 3
  38. df = pd.DataFrame([[1, 2., 1], [2, 3., 1]], columns=[4, 5, 6])
  39. dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
  40. assert dm.feature_names == ['4', '5', '6']
  41. assert dm.feature_types == ['int', 'float', 'int']
  42. assert dm.num_row() == 2
  43. assert dm.num_col() == 3
  44. df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
  45. dummies = pd.get_dummies(df)
  46. # B A_X A_Y A_Z
  47. # 0 1 1 0 0
  48. # 1 2 0 1 0
  49. # 2 3 0 0 1
  50. result, _, _ = xgb.core._maybe_pandas_data(dummies, None, None)
  51. exp = np.array([[1., 1., 0., 0.],
  52. [2., 0., 1., 0.],
  53. [3., 0., 0., 1.]])
  54. np.testing.assert_array_equal(result, exp)
  55. dm = xgb.DMatrix(dummies)
  56. assert dm.feature_names == ['B', 'A_X', 'A_Y', 'A_Z']
  57. assert dm.feature_types == ['int', 'float', 'float', 'float']
  58. assert dm.num_row() == 3
  59. assert dm.num_col() == 4
  60. df = pd.DataFrame({'A=1': [1, 2, 3], 'A=2': [4, 5, 6]})
  61. dm = xgb.DMatrix(df)
  62. assert dm.feature_names == ['A=1', 'A=2']
  63. assert dm.feature_types == ['int', 'int']
  64. assert dm.num_row() == 3
  65. assert dm.num_col() == 2
  66. def test_pandas_label(self):
  67. # label must be a single column
  68. df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
  69. self.assertRaises(ValueError, xgb.core._maybe_pandas_label, df)
  70. # label must be supported dtype
  71. df = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
  72. self.assertRaises(ValueError, xgb.core._maybe_pandas_label, df)
  73. df = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
  74. result = xgb.core._maybe_pandas_label(df)
  75. np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]], dtype=float))
  76. dm = xgb.DMatrix(np.random.randn(3, 2), label=df)
  77. assert dm.num_row() == 3
  78. assert dm.num_col() == 2
  79. def test_cv_as_pandas(self):
  80. dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
  81. params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
  82. import pandas as pd
  83. cv = xgb.cv(params, dm, num_boost_round=10, nfold=10)
  84. assert isinstance(cv, pd.DataFrame)
  85. exp = pd.Index([u'test-error-mean', u'test-error-std',
  86. u'train-error-mean', u'train-error-std'])
  87. assert cv.columns.equals(exp)
  88. # show progress log (result is the same as above)
  89. cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
  90. verbose_eval=True)
  91. assert isinstance(cv, pd.DataFrame)
  92. exp = pd.Index([u'test-error-mean', u'test-error-std',
  93. u'train-error-mean', u'train-error-std'])
  94. assert cv.columns.equals(exp)
  95. cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
  96. verbose_eval=True, show_stdv=False)
  97. assert isinstance(cv, pd.DataFrame)
  98. exp = pd.Index([u'test-error-mean', u'test-error-std',
  99. u'train-error-mean', u'train-error-std'])
  100. assert cv.columns.equals(exp)
  101. params = {'max_depth': 2, 'eta': 1, 'silent': 1,
  102. 'objective': 'binary:logistic', 'eval_metric': 'auc'}
  103. cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True)
  104. assert 'eval_metric' in params
  105. assert 'auc' in cv.columns[0]
  106. params = {'max_depth': 2, 'eta': 1, 'silent': 1,
  107. 'objective': 'binary:logistic', 'eval_metric': ['auc']}
  108. cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True)
  109. assert 'eval_metric' in params
  110. assert 'auc' in cv.columns[0]
  111. params = {'max_depth': 2, 'eta': 1, 'silent': 1,
  112. 'objective': 'binary:logistic', 'eval_metric': ['auc']}
  113. cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
  114. as_pandas=True, early_stopping_rounds=1)
  115. assert 'eval_metric' in params
  116. assert 'auc' in cv.columns[0]
  117. assert cv.shape[0] < 10
  118. params = {'max_depth': 2, 'eta': 1, 'silent': 1,
  119. 'objective': 'binary:logistic'}
  120. cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
  121. as_pandas=True, metrics='auc')
  122. assert 'auc' in cv.columns[0]
  123. params = {'max_depth': 2, 'eta': 1, 'silent': 1,
  124. 'objective': 'binary:logistic'}
  125. cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
  126. as_pandas=True, metrics=['auc'])
  127. assert 'auc' in cv.columns[0]
  128. params = {'max_depth': 2, 'eta': 1, 'silent': 1,
  129. 'objective': 'binary:logistic', 'eval_metric': ['auc']}
  130. cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
  131. as_pandas=True, metrics='error')
  132. assert 'eval_metric' in params
  133. assert 'auc' not in cv.columns[0]
  134. assert 'error' in cv.columns[0]
  135. cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
  136. as_pandas=True, metrics=['error'])
  137. assert 'eval_metric' in params
  138. assert 'auc' not in cv.columns[0]
  139. assert 'error' in cv.columns[0]
  140. params = list(params.items())
  141. cv = xgb.cv(params, dm, num_boost_round=10, nfold=10,
  142. as_pandas=True, metrics=['error'])
  143. assert isinstance(params, list)
  144. assert 'auc' not in cv.columns[0]
  145. assert 'error' in cv.columns[0]