PageRenderTime 59ms CodeModel.GetById 7ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/tests/frame/indexing/test_categorical.py

https://github.com/jreback/pandas
Python | 388 lines | 246 code | 74 blank | 68 comment | 22 complexity | 54d6ac3c967fe2128fb6c0dd86fc27f6 MD5 | raw file
  1. import numpy as np
  2. import pytest
  3. from pandas.core.dtypes.dtypes import CategoricalDtype
  4. import pandas as pd
  5. from pandas import Categorical, DataFrame, Index, Series
  6. import pandas._testing as tm
  7. class TestDataFrameIndexingCategorical:
  8. def test_assignment(self):
  9. # assignment
  10. df = DataFrame(
  11. {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")}
  12. )
  13. labels = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)])
  14. df = df.sort_values(by=["value"], ascending=True)
  15. s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels)
  16. d = s.values
  17. df["D"] = d
  18. str(df)
  19. result = df.dtypes
  20. expected = Series(
  21. [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)],
  22. index=["value", "D"],
  23. )
  24. tm.assert_series_equal(result, expected)
  25. df["E"] = s
  26. str(df)
  27. result = df.dtypes
  28. expected = Series(
  29. [
  30. np.dtype("int32"),
  31. CategoricalDtype(categories=labels, ordered=False),
  32. CategoricalDtype(categories=labels, ordered=False),
  33. ],
  34. index=["value", "D", "E"],
  35. )
  36. tm.assert_series_equal(result, expected)
  37. result1 = df["D"]
  38. result2 = df["E"]
  39. tm.assert_categorical_equal(result1._mgr._block.values, d)
  40. # sorting
  41. s.name = "E"
  42. tm.assert_series_equal(result2.sort_index(), s.sort_index())
  43. cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])
  44. df = DataFrame(Series(cat))
  def test_assigning_ops(self):
      """Exercise the setitem paths of a DataFrame with a categorical column.

      For ``iloc``, ``loc``, ``iat``, ``at`` and boolean-mask ``__setitem__``,
      and for values inside and outside the existing categories, these
      assignments are checked:

      - a single value                          -> ``exp_single_cats_value``
      - a complete row (mixed values)           -> ``exp_single_row``
      - multiple rows (mixed values)            -> ``exp_multi_row``
      - part of the column, from a Categorical  -> ``exp_parts_cats_col``
      - part of the column, from a plain list   -> ``exp_parts_cats_col``

      Assigning a value that is not an existing category, or a Categorical
      whose categories differ from the column's, is expected to raise
      ``ValueError``.
      """
      row_labels = list("hijklmn")

      def frame(cat_values, int_values, categories=("a", "b")):
          """Build a frame with a categorical ``cats`` and a ``values`` column."""
          cats = Categorical(list(cat_values), categories=list(categories))
          return DataFrame(
              {"cats": cats, "values": int_values}, index=Index(row_labels)
          )

      orig = frame("aaaaaaa", [1, 1, 1, 1, 1, 1, 1])

      # the expected values
      # changed single row
      exp_single_row = frame("aabaaaa", [1, 1, 2, 1, 1, 1, 1])
      # changed multiple rows
      exp_multi_row = frame("aabbaaa", [1, 1, 2, 2, 1, 1, 1])
      # changed part of the cats column
      exp_parts_cats_col = frame("aabbaaa", [1, 1, 1, 1, 1, 1, 1])
      # changed single value in cats col
      exp_single_cats_value = frame("aabaaaa", [1, 1, 1, 1, 1, 1, 1])

      msg1 = (
          "Cannot setitem on a Categorical with a new category, "
          "set the categories first"
      )
      msg2 = "Cannot set a Categorical with another, without identical categories"

      def assigned(accessor, key, value):
          """Return a copy of ``orig`` after ``copy.<accessor>[key] = value``."""
          df = orig.copy()
          getattr(df, accessor)[key] = value
          return df

      every_col = slice(None)
      row_mask = orig.index == "j"
      indexers = [
          # (accessor, single row, row slice, "cats" column)
          ("iloc", 2, slice(2, 4), 0),
          ("loc", "j", slice("j", "k"), "cats"),
          # same as above, but with the column label looked up by position
          ("loc", "j", slice("j", "k"), orig.columns[0]),
      ]
      for accessor, row, rows, col in indexers:
          # - assign a single value -> exp_single_cats_value
          result = assigned(accessor, (row, col), "b")
          tm.assert_frame_equal(result, exp_single_cats_value)

          result = assigned(accessor, (row_mask, col), "b")
          tm.assert_frame_equal(result, exp_single_cats_value)

          # - assign a single value not in the current categories set
          with pytest.raises(ValueError, match=msg1):
              assigned(accessor, (row, col), "c")

          # - assign a complete row (mixed values) -> exp_single_row
          result = assigned(accessor, (row, every_col), ["b", 2])
          tm.assert_frame_equal(result, exp_single_row)

          # - assign a complete row (mixed values) not in categories set
          with pytest.raises(ValueError, match=msg1):
              assigned(accessor, (row, every_col), ["c", 2])

          # - assign multiple rows (mixed values) -> exp_multi_row
          result = assigned(accessor, (rows, every_col), [["b", 2], ["b", 2]])
          tm.assert_frame_equal(result, exp_multi_row)

          with pytest.raises(ValueError, match=msg1):
              assigned(accessor, (rows, every_col), [["c", 2], ["c", 2]])

          # assign a part of a column with dtype == categorical ->
          # exp_parts_cats_col
          result = assigned(
              accessor, (rows, col), Categorical(["b", "b"], categories=["a", "b"])
          )
          tm.assert_frame_equal(result, exp_parts_cats_col)

          with pytest.raises(ValueError, match=msg2):
              # different categories -> not sure if this should fail or pass
              assigned(
                  accessor,
                  (rows, col),
                  Categorical(["b", "b"], categories=["a", "b", "c"]),
              )

          with pytest.raises(ValueError, match=msg2):
              # different values
              assigned(
                  accessor,
                  (rows, col),
                  Categorical(["c", "c"], categories=["a", "b", "c"]),
              )

          # assign a part of a column with dtype != categorical ->
          # exp_parts_cats_col
          result = assigned(accessor, (rows, col), ["b", "b"])
          tm.assert_frame_equal(result, exp_parts_cats_col)

          with pytest.raises(ValueError, match=msg1):
              assigned(accessor, (rows, col), ["c", "c"])

      # iat / at: scalar accessors
      for accessor, key in [("iat", (2, 0)), ("at", ("j", "cats"))]:
          # - assign a single value -> exp_single_cats_value
          result = assigned(accessor, key, "b")
          tm.assert_frame_equal(result, exp_single_cats_value)

          # - assign a single value not in the current categories set
          with pytest.raises(ValueError, match=msg1):
              assigned(accessor, key, "c")

      # fancy indexing
      df = frame("aaccaaa", [1, 1, 3, 3, 1, 1, 1], categories=("a", "b", "c"))

      exp_fancy = exp_multi_row.copy()
      # Reassign the column rather than using the ``inplace=True`` form of
      # ``set_categories``.
      exp_fancy["cats"] = exp_fancy["cats"].cat.set_categories(["a", "b", "c"])

      df[df["cats"] == "c"] = ["b", 2]
      # category c is kept in .categories
      tm.assert_frame_equal(df, exp_fancy)

      # Assigning a Category to parts of a int/... column uses the values of
      # the Categorical
      df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")})
      exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")})
      df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"])
      df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
      tm.assert_frame_equal(df, exp)
  def test_loc_setitem_single_row_categorical(self):
      """Set a whole column of a one-row frame to a Categorical via ``loc``.

      Regression test for GH 25495: after ``df.loc[:, "Alpha"] = <Categorical>``
      the column is expected to equal a Series built from that Categorical.
      """
      frame = DataFrame({"Alpha": ["a"], "Numeric": [0]})
      cat = Categorical(frame["Alpha"], categories=["a", "b", "c"])

      frame.loc[:, "Alpha"] = cat

      tm.assert_series_equal(
          frame["Alpha"],
          Series(cat, index=frame.index, name="Alpha"),
      )
  294. def test_loc_indexing_preserves_index_category_dtype(self):
  295. # GH 15166
  296. df = DataFrame(
  297. data=np.arange(2, 22, 2),
  298. index=pd.MultiIndex(
  299. levels=[pd.CategoricalIndex(["a", "b"]), range(10)],
  300. codes=[[0] * 5 + [1] * 5, range(10)],
  301. names=["Index1", "Index2"],
  302. ),
  303. )
  304. expected = pd.CategoricalIndex(
  305. ["a", "b"],
  306. categories=["a", "b"],
  307. ordered=False,
  308. name="Index1",
  309. dtype="category",
  310. )
  311. result = df.index.levels[0]
  312. tm.assert_index_equal(result, expected)
  313. result = df.loc[["a"]].index.levels[0]
  314. tm.assert_index_equal(result, expected)