PageRenderTime 76ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/tests/reshape/concat/test_index.py

https://github.com/pydata/pandas
Python | 389 lines | 305 code | 62 blank | 22 comment | 11 complexity | 64c1ecb2c063989e86bb7cd8fbbaba13 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import numpy as np
  2. import pytest
  3. from pandas.errors import PerformanceWarning
  4. import pandas as pd
  5. from pandas import (
  6. DataFrame,
  7. Index,
  8. MultiIndex,
  9. Series,
  10. concat,
  11. )
  12. import pandas._testing as tm
  13. class TestIndexConcat:
  14. def test_concat_ignore_index(self, sort):
  15. frame1 = DataFrame(
  16. {"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]}
  17. )
  18. frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]})
  19. frame1.index = Index(["x", "y", "z"])
  20. frame2.index = Index(["x", "y", "q"])
  21. v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort)
  22. nan = np.nan
  23. expected = DataFrame(
  24. [
  25. [nan, nan, nan, 4.3],
  26. ["a", 1, 4.5, 5.2],
  27. ["b", 2, 3.2, 2.2],
  28. ["c", 3, 1.2, nan],
  29. ],
  30. index=Index(["q", "x", "y", "z"]),
  31. )
  32. if not sort:
  33. expected = expected.loc[["x", "y", "z", "q"]]
  34. tm.assert_frame_equal(v1, expected)
  35. @pytest.mark.parametrize(
  36. "name_in1,name_in2,name_in3,name_out",
  37. [
  38. ("idx", "idx", "idx", "idx"),
  39. ("idx", "idx", None, None),
  40. ("idx", None, None, None),
  41. ("idx1", "idx2", None, None),
  42. ("idx1", "idx1", "idx2", None),
  43. ("idx1", "idx2", "idx3", None),
  44. (None, None, None, None),
  45. ],
  46. )
  47. def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out):
  48. # GH13475
  49. indices = [
  50. Index(["a", "b", "c"], name=name_in1),
  51. Index(["b", "c", "d"], name=name_in2),
  52. Index(["c", "d", "e"], name=name_in3),
  53. ]
  54. frames = [
  55. DataFrame({c: [0, 1, 2]}, index=i) for i, c in zip(indices, ["x", "y", "z"])
  56. ]
  57. result = concat(frames, axis=1)
  58. exp_ind = Index(["a", "b", "c", "d", "e"], name=name_out)
  59. expected = DataFrame(
  60. {
  61. "x": [0, 1, 2, np.nan, np.nan],
  62. "y": [np.nan, 0, 1, 2, np.nan],
  63. "z": [np.nan, np.nan, 0, 1, 2],
  64. },
  65. index=exp_ind,
  66. )
  67. tm.assert_frame_equal(result, expected)
  68. def test_concat_rename_index(self):
  69. a = DataFrame(
  70. np.random.rand(3, 3),
  71. columns=list("ABC"),
  72. index=Index(list("abc"), name="index_a"),
  73. )
  74. b = DataFrame(
  75. np.random.rand(3, 3),
  76. columns=list("ABC"),
  77. index=Index(list("abc"), name="index_b"),
  78. )
  79. result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"])
  80. exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"])
  81. names = list(exp.index.names)
  82. names[1] = "lvl1"
  83. exp.index.set_names(names, inplace=True)
  84. tm.assert_frame_equal(result, exp)
  85. assert result.index.names == exp.index.names
  86. def test_concat_copy_index_series(self, axis):
  87. # GH 29879
  88. ser = Series([1, 2])
  89. comb = concat([ser, ser], axis=axis, copy=True)
  90. assert comb.index is not ser.index
  91. def test_concat_copy_index_frame(self, axis):
  92. # GH 29879
  93. df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
  94. comb = concat([df, df], axis=axis, copy=True)
  95. assert comb.index is not df.index
  96. assert comb.columns is not df.columns
  97. def test_default_index(self):
  98. # is_series and ignore_index
  99. s1 = Series([1, 2, 3], name="x")
  100. s2 = Series([4, 5, 6], name="y")
  101. res = concat([s1, s2], axis=1, ignore_index=True)
  102. assert isinstance(res.columns, pd.RangeIndex)
  103. exp = DataFrame([[1, 4], [2, 5], [3, 6]])
  104. # use check_index_type=True to check the result have
  105. # RangeIndex (default index)
  106. tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
  107. # is_series and all inputs have no names
  108. s1 = Series([1, 2, 3])
  109. s2 = Series([4, 5, 6])
  110. res = concat([s1, s2], axis=1, ignore_index=False)
  111. assert isinstance(res.columns, pd.RangeIndex)
  112. exp = DataFrame([[1, 4], [2, 5], [3, 6]])
  113. exp.columns = pd.RangeIndex(2)
  114. tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
  115. # is_dataframe and ignore_index
  116. df1 = DataFrame({"A": [1, 2], "B": [5, 6]})
  117. df2 = DataFrame({"A": [3, 4], "B": [7, 8]})
  118. res = concat([df1, df2], axis=0, ignore_index=True)
  119. exp = DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"])
  120. tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
  121. res = concat([df1, df2], axis=1, ignore_index=True)
  122. exp = DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]])
  123. tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True)
  124. def test_dups_index(self):
  125. # GH 4771
  126. # single dtypes
  127. df = DataFrame(
  128. np.random.randint(0, 10, size=40).reshape(10, 4),
  129. columns=["A", "A", "C", "C"],
  130. )
  131. result = concat([df, df], axis=1)
  132. tm.assert_frame_equal(result.iloc[:, :4], df)
  133. tm.assert_frame_equal(result.iloc[:, 4:], df)
  134. result = concat([df, df], axis=0)
  135. tm.assert_frame_equal(result.iloc[:10], df)
  136. tm.assert_frame_equal(result.iloc[10:], df)
  137. # multi dtypes
  138. df = concat(
  139. [
  140. DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
  141. DataFrame(
  142. np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"]
  143. ),
  144. ],
  145. axis=1,
  146. )
  147. result = concat([df, df], axis=1)
  148. tm.assert_frame_equal(result.iloc[:, :6], df)
  149. tm.assert_frame_equal(result.iloc[:, 6:], df)
  150. result = concat([df, df], axis=0)
  151. tm.assert_frame_equal(result.iloc[:10], df)
  152. tm.assert_frame_equal(result.iloc[10:], df)
  153. # append
  154. result = df.iloc[0:8, :]._append(df.iloc[8:])
  155. tm.assert_frame_equal(result, df)
  156. result = df.iloc[0:8, :]._append(df.iloc[8:9])._append(df.iloc[9:10])
  157. tm.assert_frame_equal(result, df)
  158. expected = concat([df, df], axis=0)
  159. result = df._append(df)
  160. tm.assert_frame_equal(result, expected)
  161. class TestMultiIndexConcat:
  162. def test_concat_multiindex_with_keys(self, multiindex_dataframe_random_data):
  163. frame = multiindex_dataframe_random_data
  164. index = frame.index
  165. result = concat([frame, frame], keys=[0, 1], names=["iteration"])
  166. assert result.index.names == ("iteration",) + index.names
  167. tm.assert_frame_equal(result.loc[0], frame)
  168. tm.assert_frame_equal(result.loc[1], frame)
  169. assert result.index.nlevels == 3
  170. def test_concat_multiindex_with_none_in_index_names(self):
  171. # GH 15787
  172. index = MultiIndex.from_product([[1], range(5)], names=["level1", None])
  173. df = DataFrame({"col": range(5)}, index=index, dtype=np.int32)
  174. result = concat([df, df], keys=[1, 2], names=["level2"])
  175. index = MultiIndex.from_product(
  176. [[1, 2], [1], range(5)], names=["level2", "level1", None]
  177. )
  178. expected = DataFrame({"col": list(range(5)) * 2}, index=index, dtype=np.int32)
  179. tm.assert_frame_equal(result, expected)
  180. result = concat([df, df[:2]], keys=[1, 2], names=["level2"])
  181. level2 = [1] * 5 + [2] * 2
  182. level1 = [1] * 7
  183. no_name = list(range(5)) + list(range(2))
  184. tuples = list(zip(level2, level1, no_name))
  185. index = MultiIndex.from_tuples(tuples, names=["level2", "level1", None])
  186. expected = DataFrame({"col": no_name}, index=index, dtype=np.int32)
  187. tm.assert_frame_equal(result, expected)
  188. def test_concat_multiindex_rangeindex(self):
  189. # GH13542
  190. # when multi-index levels are RangeIndex objects
  191. # there is a bug in concat with objects of len 1
  192. df = DataFrame(np.random.randn(9, 2))
  193. df.index = MultiIndex(
  194. levels=[pd.RangeIndex(3), pd.RangeIndex(3)],
  195. codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)],
  196. )
  197. res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]])
  198. exp = df.iloc[[2, 3, 4, 5], :]
  199. tm.assert_frame_equal(res, exp)
  200. def test_concat_multiindex_dfs_with_deepcopy(self):
  201. # GH 9967
  202. from copy import deepcopy
  203. example_multiindex1 = MultiIndex.from_product([["a"], ["b"]])
  204. example_dataframe1 = DataFrame([0], index=example_multiindex1)
  205. example_multiindex2 = MultiIndex.from_product([["a"], ["c"]])
  206. example_dataframe2 = DataFrame([1], index=example_multiindex2)
  207. example_dict = {"s1": example_dataframe1, "s2": example_dataframe2}
  208. expected_index = MultiIndex(
  209. levels=[["s1", "s2"], ["a"], ["b", "c"]],
  210. codes=[[0, 1], [0, 0], [0, 1]],
  211. names=["testname", None, None],
  212. )
  213. expected = DataFrame([[0], [1]], index=expected_index)
  214. result_copy = concat(deepcopy(example_dict), names=["testname"])
  215. tm.assert_frame_equal(result_copy, expected)
  216. result_no_copy = concat(example_dict, names=["testname"])
  217. tm.assert_frame_equal(result_no_copy, expected)
  218. @pytest.mark.parametrize(
  219. "mi1_list",
  220. [
  221. [["a"], range(2)],
  222. [["b"], np.arange(2.0, 4.0)],
  223. [["c"], ["A", "B"]],
  224. [["d"], pd.date_range(start="2017", end="2018", periods=2)],
  225. ],
  226. )
  227. @pytest.mark.parametrize(
  228. "mi2_list",
  229. [
  230. [["a"], range(2)],
  231. [["b"], np.arange(2.0, 4.0)],
  232. [["c"], ["A", "B"]],
  233. [["d"], pd.date_range(start="2017", end="2018", periods=2)],
  234. ],
  235. )
  236. def test_concat_with_various_multiindex_dtypes(
  237. self, mi1_list: list, mi2_list: list
  238. ):
  239. # GitHub #23478
  240. mi1 = MultiIndex.from_product(mi1_list)
  241. mi2 = MultiIndex.from_product(mi2_list)
  242. df1 = DataFrame(np.zeros((1, len(mi1))), columns=mi1)
  243. df2 = DataFrame(np.zeros((1, len(mi2))), columns=mi2)
  244. if mi1_list[0] == mi2_list[0]:
  245. expected_mi = MultiIndex(
  246. levels=[mi1_list[0], list(mi1_list[1])],
  247. codes=[[0, 0, 0, 0], [0, 1, 0, 1]],
  248. )
  249. else:
  250. expected_mi = MultiIndex(
  251. levels=[
  252. mi1_list[0] + mi2_list[0],
  253. list(mi1_list[1]) + list(mi2_list[1]),
  254. ],
  255. codes=[[0, 0, 1, 1], [0, 1, 2, 3]],
  256. )
  257. expected_df = DataFrame(np.zeros((1, len(expected_mi))), columns=expected_mi)
  258. with tm.assert_produces_warning(None):
  259. result_df = concat((df1, df2), axis=1)
  260. tm.assert_frame_equal(expected_df, result_df)
  261. def test_concat_multiindex_(self):
  262. # GitHub #44786
  263. df = DataFrame({"col": ["a", "b", "c"]}, index=["1", "2", "2"])
  264. df = concat([df], keys=["X"])
  265. iterables = [["X"], ["1", "2", "2"]]
  266. result_index = df.index
  267. expected_index = MultiIndex.from_product(iterables)
  268. tm.assert_index_equal(result_index, expected_index)
  269. result_df = df
  270. expected_df = DataFrame(
  271. {"col": ["a", "b", "c"]}, index=MultiIndex.from_product(iterables)
  272. )
  273. tm.assert_frame_equal(result_df, expected_df)
  274. def test_concat_with_key_not_unique(self):
  275. # GitHub #46519
  276. df1 = DataFrame({"name": [1]})
  277. df2 = DataFrame({"name": [2]})
  278. df3 = DataFrame({"name": [3]})
  279. df_a = concat([df1, df2, df3], keys=["x", "y", "x"])
  280. # the warning is caused by indexing unsorted multi-index
  281. with tm.assert_produces_warning(
  282. PerformanceWarning, match="indexing past lexsort depth"
  283. ):
  284. out_a = df_a.loc[("x", 0), :]
  285. df_b = DataFrame(
  286. {"name": [1, 2, 3]}, index=Index([("x", 0), ("y", 0), ("x", 0)])
  287. )
  288. with tm.assert_produces_warning(
  289. PerformanceWarning, match="indexing past lexsort depth"
  290. ):
  291. out_b = df_b.loc[("x", 0)]
  292. tm.assert_frame_equal(out_a, out_b)
  293. df1 = DataFrame({"name": ["a", "a", "b"]})
  294. df2 = DataFrame({"name": ["a", "b"]})
  295. df3 = DataFrame({"name": ["c", "d"]})
  296. df_a = concat([df1, df2, df3], keys=["x", "y", "x"])
  297. with tm.assert_produces_warning(
  298. PerformanceWarning, match="indexing past lexsort depth"
  299. ):
  300. out_a = df_a.loc[("x", 0), :]
  301. df_b = DataFrame(
  302. {
  303. "a": ["x", "x", "x", "y", "y", "x", "x"],
  304. "b": [0, 1, 2, 0, 1, 0, 1],
  305. "name": list("aababcd"),
  306. }
  307. ).set_index(["a", "b"])
  308. df_b.index.names = [None, None]
  309. with tm.assert_produces_warning(
  310. PerformanceWarning, match="indexing past lexsort depth"
  311. ):
  312. out_b = df_b.loc[("x", 0), :]
  313. tm.assert_frame_equal(out_a, out_b)
  314. def test_concat_with_duplicated_levels(self):
  315. # keyword levels should be unique
  316. df1 = DataFrame({"A": [1]}, index=["x"])
  317. df2 = DataFrame({"A": [1]}, index=["y"])
  318. msg = r"Level values not unique: \['x', 'y', 'y'\]"
  319. with pytest.raises(ValueError, match=msg):
  320. concat([df1, df2], keys=["x", "y"], levels=[["x", "y", "y"]])
  321. @pytest.mark.parametrize("levels", [[["x", "y"]], [["x", "y", "y"]]])
  322. def test_concat_with_levels_with_none_keys(self, levels):
  323. df1 = DataFrame({"A": [1]}, index=["x"])
  324. df2 = DataFrame({"A": [1]}, index=["y"])
  325. msg = "levels supported only when keys is not None"
  326. with pytest.raises(ValueError, match=msg):
  327. concat([df1, df2], levels=levels)