/pandas/tests/io/parser/dtypes/test_categorical.py
Python | 294 lines | 284 code | 5 blank | 5 comment | 0 complexity | a01ece0cb58bd67046276c4a5fbfe74e MD5 | raw file
- """
- Tests dtype specification during parsing
- for all of the parsers defined in parsers.py
- """
- from io import StringIO
- import os
- import numpy as np
- import pytest
- from pandas.core.dtypes.dtypes import CategoricalDtype
- import pandas as pd
- from pandas import Categorical, DataFrame, Timestamp
- import pandas._testing as tm
@pytest.mark.parametrize(
    "dtype",
    [
        "category",
        CategoricalDtype(),
        {"a": "category", "b": "category", "c": CategoricalDtype()},
    ],
)
def test_categorical_dtype(all_parsers, dtype):
    # see gh-10153
    #
    # Whichever way the categorical dtype is spelled (string alias, bare
    # CategoricalDtype, or per-column mapping), every column is expected
    # to come back as a categorical whose values are strings.
    csv_text = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    column_values = {
        "a": ["1", "1", "2"],
        "b": ["a", "a", "b"],
        "c": ["3.4", "3.4", "4.5"],
    }
    expected = DataFrame(
        {name: Categorical(values) for name, values in column_values.items()}
    )

    result = all_parsers.read_csv(StringIO(csv_text), dtype=dtype)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype):
    # see gh-10153
    #
    # Only column "b" is requested as categorical, addressed either by its
    # name or by the integer key 1; "a" and "c" keep their numeric values.
    csv_text = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected = DataFrame(
        {
            "a": [1, 1, 2],
            "b": Categorical(["a", "a", "b"]),
            "c": [3.4, 3.4, 4.5],
        }
    )

    result = all_parsers.read_csv(StringIO(csv_text), dtype=dtype)
    tm.assert_frame_equal(result, expected)
def test_categorical_dtype_unsorted(all_parsers):
    # see gh-10153
    #
    # Column "b" lists "b" before "a", i.e. its values are not in sorted
    # order in the input text.
    csv_text = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
    column_values = {
        "a": ["1", "1", "2"],
        "b": ["b", "b", "a"],
        "c": ["3.4", "3.4", "4.5"],
    }
    expected = DataFrame(
        {name: Categorical(values) for name, values in column_values.items()}
    )

    result = all_parsers.read_csv(StringIO(csv_text), dtype="category")
    tm.assert_frame_equal(result, expected)
def test_categorical_dtype_missing(all_parsers):
    # see gh-10153
    #
    # The literal text "nan" in column "b" is expected to surface as a
    # missing value (np.nan) in the resulting categorical.
    csv_text = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
    col_a = Categorical(["1", "1", "2"])
    col_b = Categorical(["b", np.nan, "a"])
    col_c = Categorical(["3.4", "3.4", "4.5"])
    expected = DataFrame({"a": col_a, "b": col_b, "c": col_c})

    result = all_parsers.read_csv(StringIO(csv_text), dtype="category")
    tm.assert_frame_equal(result, expected)
@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers):
    # see gh-18186
    #
    # 524289 == 2**19 + 1 distinct numeric strings, sorted lexically.
    values = np.sort([str(i) for i in range(524289)])
    expected = DataFrame({"a": Categorical(values, ordered=True)})

    csv_text = "a\n" + "\n".join(values)
    result = all_parsers.read_csv(StringIO(csv_text), dtype="category")

    # Put the parsed categories into the same sorted, ordered form as the
    # expected frame before comparing.
    sorted_categories = np.sort(result.a.cat.categories)
    result["a"] = result["a"].cat.reorder_categories(
        sorted_categories, ordered=True
    )
    tm.assert_frame_equal(result, expected)
def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
    # see gh-10153
    path = os.path.join(csv_dir_path, "utf16_ex.txt")
    read_kwargs = {"sep": "\t", "encoding": "utf-16"}

    # Baseline: parse without a dtype, then convert each column by hand.
    expected = all_parsers.read_csv(path, **read_kwargs).apply(Categorical)

    result = all_parsers.read_csv(path, dtype="category", **read_kwargs)
    tm.assert_frame_equal(result, expected)
def test_categorical_dtype_chunksize_infer_categories(all_parsers):
    # see gh-10153
    #
    # With chunksize=2 the four data rows arrive as two frames, and each
    # expected frame's categorical holds only the values seen in that chunk.
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
        DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
    ]
    with parser.read_csv(
        StringIO(data), dtype={"b": "category"}, chunksize=2
    ) as actuals:
        chunks = list(actuals)

    # zip() stops at the shorter input, so without this check the test
    # would pass even if the reader produced too few (or no) chunks.
    assert len(chunks) == len(expecteds)
    for actual, expected in zip(chunks, expecteds):
        tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
    # see gh-10153
    #
    # With an explicit CategoricalDtype, every chunk is expected to carry
    # the full category list, not just the values present in that chunk.
    parser = all_parsers
    data = """a,b
1,a
1,b
1,b
2,c"""
    cats = ["a", "b", "c"]
    expecteds = [
        DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
        DataFrame(
            {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)},
            index=[2, 3],
        ),
    ]
    dtype = CategoricalDtype(cats)
    with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals:
        chunks = list(actuals)

    # zip() stops at the shorter input, so without this check the test
    # would pass even if the reader produced too few (or no) chunks.
    assert len(chunks) == len(expecteds)
    for actual, expected in zip(chunks, expecteds):
        tm.assert_frame_equal(actual, expected)
def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
    # see gh-10153
    path = os.path.join(csv_dir_path, "unicode_series.csv")
    read_kwargs = {"header": None, "encoding": "latin-1"}

    # Baseline: parse without a dtype, then convert column 1 by hand.
    expected = all_parsers.read_csv(path, **read_kwargs)
    expected[1] = Categorical(expected[1])

    result = all_parsers.read_csv(path, dtype={1: "category"}, **read_kwargs)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize(
    "categories",
    [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
)
def test_categorical_category_dtype(all_parsers, categories, ordered):
    # The parsed column must carry exactly the categories and orderedness
    # of the CategoricalDtype that was passed in.
    csv_text = """a,b
1,a
1,b
1,b
2,c"""
    requested = CategoricalDtype(categories=categories, ordered=ordered)
    expected_b = Categorical(
        ["a", "b", "b", "c"], categories=categories, ordered=ordered
    )
    expected = DataFrame({"a": [1, 1, 1, 2], "b": expected_b})

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": requested})
    tm.assert_frame_equal(result, expected)
def test_categorical_category_dtype_unsorted(all_parsers):
    # The requested categories are listed in reverse order ("c", "b", "a");
    # the parsed column is expected to keep that order.
    csv_text = """a,b
1,a
1,b
1,b
2,c"""
    requested = CategoricalDtype(["c", "b", "a"])
    expected_b = Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"])
    expected = DataFrame({"a": [1, 1, 1, 2], "b": expected_b})

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": requested})
    tm.assert_frame_equal(result, expected)
def test_categorical_coerces_numeric(all_parsers):
    # The categories are integers, so the values read from text must end
    # up as integers in the resulting categorical.
    csv_text = "b\n1\n1\n2\n3"
    int_dtype = CategoricalDtype([1, 2, 3])
    expected = DataFrame({"b": Categorical([1, 1, 2, 3])})

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": int_dtype})
    tm.assert_frame_equal(result, expected)
def test_categorical_coerces_datetime(all_parsers):
    # The categories are a DatetimeIndex; the input rows are the same three
    # dates written as text, one per category and in the same order.
    csv_text = "b\n2017-01-01\n2018-01-01\n2019-01-01"
    dates = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None)
    date_dtype = CategoricalDtype(dates)
    expected = DataFrame({"b": Categorical(date_dtype.categories)})

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": date_dtype})
    tm.assert_frame_equal(result, expected)
def test_categorical_coerces_timestamp(all_parsers):
    # Both rows are different textual spellings of the single category,
    # Timestamp("2014").
    csv_text = "b\n2014-01-01\n2014-01-01T00:00:00"
    stamp_dtype = CategoricalDtype([Timestamp("2014")])
    expected_b = Categorical([Timestamp("2014")] * 2)
    expected = DataFrame({"b": expected_b})

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": stamp_dtype})
    tm.assert_frame_equal(result, expected)
def test_categorical_coerces_timedelta(all_parsers):
    # The categories are timedeltas; the input rows are the same three
    # durations written as text, one per category and in the same order.
    csv_text = "b\n1H\n2H\n3H"
    durations = pd.to_timedelta(["1H", "2H", "3H"])
    delta_dtype = CategoricalDtype(durations)
    expected = DataFrame({"b": Categorical(delta_dtype.categories)})

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": delta_dtype})
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "data",
    [
        "b\nTrue\nFalse\nNA\nFalse",
        "b\ntrue\nfalse\nNA\nfalse",
        "b\nTRUE\nFALSE\nNA\nFALSE",
        "b\nTrue\nFalse\nNA\nFALSE",
    ],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
    # see gh-20498
    #
    # The inputs spell the booleans in title, lower, upper and mixed case,
    # always with an "NA" entry in third position; all of them must yield
    # the same boolean categorical with a missing third value.
    bool_dtype = CategoricalDtype([False, True])
    expected = DataFrame({"b": Categorical([True, False, None, False])})

    result = all_parsers.read_csv(StringIO(data), dtype={"b": bool_dtype})
    tm.assert_frame_equal(result, expected)
def test_categorical_unexpected_categories(all_parsers):
    # The input contains "c", which is not among the declared categories.
    # The parsed column must equal a Categorical built from the same
    # values with the declared dtype.
    csv_text = "b\nd\na\nc\nd"  # Unexpected c
    declared = CategoricalDtype(["a", "b", "d", "e"])
    expected = DataFrame({"b": Categorical(list("dacd"), dtype=declared)})

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": declared})
    tm.assert_frame_equal(result, expected)