PageRenderTime 278ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/tests/io/parser/dtypes/test_categorical.py

https://github.com/jreback/pandas
Python | 294 lines | 284 code | 5 blank | 5 comment | 0 complexity | a01ece0cb58bd67046276c4a5fbfe74e MD5 | raw file
  1. """
  2. Tests dtype specification during parsing
  3. for all of the parsers defined in parsers.py
  4. """
  5. from io import StringIO
  6. import os
  7. import numpy as np
  8. import pytest
  9. from pandas.core.dtypes.dtypes import CategoricalDtype
  10. import pandas as pd
  11. from pandas import Categorical, DataFrame, Timestamp
  12. import pandas._testing as tm
  13. @pytest.mark.parametrize(
  14. "dtype",
  15. [
  16. "category",
  17. CategoricalDtype(),
  18. {"a": "category", "b": "category", "c": CategoricalDtype()},
  19. ],
  20. )
  21. def test_categorical_dtype(all_parsers, dtype):
  22. # see gh-10153
  23. parser = all_parsers
  24. data = """a,b,c
  25. 1,a,3.4
  26. 1,a,3.4
  27. 2,b,4.5"""
  28. expected = DataFrame(
  29. {
  30. "a": Categorical(["1", "1", "2"]),
  31. "b": Categorical(["a", "a", "b"]),
  32. "c": Categorical(["3.4", "3.4", "4.5"]),
  33. }
  34. )
  35. actual = parser.read_csv(StringIO(data), dtype=dtype)
  36. tm.assert_frame_equal(actual, expected)
  37. @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
  38. def test_categorical_dtype_single(all_parsers, dtype):
  39. # see gh-10153
  40. parser = all_parsers
  41. data = """a,b,c
  42. 1,a,3.4
  43. 1,a,3.4
  44. 2,b,4.5"""
  45. expected = DataFrame(
  46. {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]}
  47. )
  48. actual = parser.read_csv(StringIO(data), dtype=dtype)
  49. tm.assert_frame_equal(actual, expected)
  50. def test_categorical_dtype_unsorted(all_parsers):
  51. # see gh-10153
  52. parser = all_parsers
  53. data = """a,b,c
  54. 1,b,3.4
  55. 1,b,3.4
  56. 2,a,4.5"""
  57. expected = DataFrame(
  58. {
  59. "a": Categorical(["1", "1", "2"]),
  60. "b": Categorical(["b", "b", "a"]),
  61. "c": Categorical(["3.4", "3.4", "4.5"]),
  62. }
  63. )
  64. actual = parser.read_csv(StringIO(data), dtype="category")
  65. tm.assert_frame_equal(actual, expected)
  66. def test_categorical_dtype_missing(all_parsers):
  67. # see gh-10153
  68. parser = all_parsers
  69. data = """a,b,c
  70. 1,b,3.4
  71. 1,nan,3.4
  72. 2,a,4.5"""
  73. expected = DataFrame(
  74. {
  75. "a": Categorical(["1", "1", "2"]),
  76. "b": Categorical(["b", np.nan, "a"]),
  77. "c": Categorical(["3.4", "3.4", "4.5"]),
  78. }
  79. )
  80. actual = parser.read_csv(StringIO(data), dtype="category")
  81. tm.assert_frame_equal(actual, expected)
  82. @pytest.mark.slow
  83. def test_categorical_dtype_high_cardinality_numeric(all_parsers):
  84. # see gh-18186
  85. parser = all_parsers
  86. data = np.sort([str(i) for i in range(524289)])
  87. expected = DataFrame({"a": Categorical(data, ordered=True)})
  88. actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
  89. actual["a"] = actual["a"].cat.reorder_categories(
  90. np.sort(actual.a.cat.categories), ordered=True
  91. )
  92. tm.assert_frame_equal(actual, expected)
  93. def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
  94. # see gh-10153
  95. pth = os.path.join(csv_dir_path, "utf16_ex.txt")
  96. parser = all_parsers
  97. encoding = "utf-16"
  98. sep = "\t"
  99. expected = parser.read_csv(pth, sep=sep, encoding=encoding)
  100. expected = expected.apply(Categorical)
  101. actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category")
  102. tm.assert_frame_equal(actual, expected)
  103. def test_categorical_dtype_chunksize_infer_categories(all_parsers):
  104. # see gh-10153
  105. parser = all_parsers
  106. data = """a,b
  107. 1,a
  108. 1,b
  109. 1,b
  110. 2,c"""
  111. expecteds = [
  112. DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
  113. DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
  114. ]
  115. with parser.read_csv(
  116. StringIO(data), dtype={"b": "category"}, chunksize=2
  117. ) as actuals:
  118. for actual, expected in zip(actuals, expecteds):
  119. tm.assert_frame_equal(actual, expected)
  120. def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
  121. # see gh-10153
  122. parser = all_parsers
  123. data = """a,b
  124. 1,a
  125. 1,b
  126. 1,b
  127. 2,c"""
  128. cats = ["a", "b", "c"]
  129. expecteds = [
  130. DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
  131. DataFrame(
  132. {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)},
  133. index=[2, 3],
  134. ),
  135. ]
  136. dtype = CategoricalDtype(cats)
  137. with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals:
  138. for actual, expected in zip(actuals, expecteds):
  139. tm.assert_frame_equal(actual, expected)
  140. def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
  141. # see gh-10153
  142. pth = os.path.join(csv_dir_path, "unicode_series.csv")
  143. parser = all_parsers
  144. encoding = "latin-1"
  145. expected = parser.read_csv(pth, header=None, encoding=encoding)
  146. expected[1] = Categorical(expected[1])
  147. actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"})
  148. tm.assert_frame_equal(actual, expected)
  149. @pytest.mark.parametrize("ordered", [False, True])
  150. @pytest.mark.parametrize(
  151. "categories",
  152. [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
  153. )
  154. def test_categorical_category_dtype(all_parsers, categories, ordered):
  155. parser = all_parsers
  156. data = """a,b
  157. 1,a
  158. 1,b
  159. 1,b
  160. 2,c"""
  161. expected = DataFrame(
  162. {
  163. "a": [1, 1, 1, 2],
  164. "b": Categorical(
  165. ["a", "b", "b", "c"], categories=categories, ordered=ordered
  166. ),
  167. }
  168. )
  169. dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)}
  170. result = parser.read_csv(StringIO(data), dtype=dtype)
  171. tm.assert_frame_equal(result, expected)
  172. def test_categorical_category_dtype_unsorted(all_parsers):
  173. parser = all_parsers
  174. data = """a,b
  175. 1,a
  176. 1,b
  177. 1,b
  178. 2,c"""
  179. dtype = CategoricalDtype(["c", "b", "a"])
  180. expected = DataFrame(
  181. {
  182. "a": [1, 1, 1, 2],
  183. "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]),
  184. }
  185. )
  186. result = parser.read_csv(StringIO(data), dtype={"b": dtype})
  187. tm.assert_frame_equal(result, expected)
  188. def test_categorical_coerces_numeric(all_parsers):
  189. parser = all_parsers
  190. dtype = {"b": CategoricalDtype([1, 2, 3])}
  191. data = "b\n1\n1\n2\n3"
  192. expected = DataFrame({"b": Categorical([1, 1, 2, 3])})
  193. result = parser.read_csv(StringIO(data), dtype=dtype)
  194. tm.assert_frame_equal(result, expected)
  195. def test_categorical_coerces_datetime(all_parsers):
  196. parser = all_parsers
  197. dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None)
  198. dtype = {"b": CategoricalDtype(dti)}
  199. data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
  200. expected = DataFrame({"b": Categorical(dtype["b"].categories)})
  201. result = parser.read_csv(StringIO(data), dtype=dtype)
  202. tm.assert_frame_equal(result, expected)
  203. def test_categorical_coerces_timestamp(all_parsers):
  204. parser = all_parsers
  205. dtype = {"b": CategoricalDtype([Timestamp("2014")])}
  206. data = "b\n2014-01-01\n2014-01-01T00:00:00"
  207. expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})
  208. result = parser.read_csv(StringIO(data), dtype=dtype)
  209. tm.assert_frame_equal(result, expected)
  210. def test_categorical_coerces_timedelta(all_parsers):
  211. parser = all_parsers
  212. dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))}
  213. data = "b\n1H\n2H\n3H"
  214. expected = DataFrame({"b": Categorical(dtype["b"].categories)})
  215. result = parser.read_csv(StringIO(data), dtype=dtype)
  216. tm.assert_frame_equal(result, expected)
  217. @pytest.mark.parametrize(
  218. "data",
  219. [
  220. "b\nTrue\nFalse\nNA\nFalse",
  221. "b\ntrue\nfalse\nNA\nfalse",
  222. "b\nTRUE\nFALSE\nNA\nFALSE",
  223. "b\nTrue\nFalse\nNA\nFALSE",
  224. ],
  225. )
  226. def test_categorical_dtype_coerces_boolean(all_parsers, data):
  227. # see gh-20498
  228. parser = all_parsers
  229. dtype = {"b": CategoricalDtype([False, True])}
  230. expected = DataFrame({"b": Categorical([True, False, None, False])})
  231. result = parser.read_csv(StringIO(data), dtype=dtype)
  232. tm.assert_frame_equal(result, expected)
  233. def test_categorical_unexpected_categories(all_parsers):
  234. parser = all_parsers
  235. dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}
  236. data = "b\nd\na\nc\nd" # Unexpected c
  237. expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])})
  238. result = parser.read_csv(StringIO(data), dtype=dtype)
  239. tm.assert_frame_equal(result, expected)