PageRenderTime 525ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/tests/indexes/multi/test_sorting.py

https://github.com/jreback/pandas
Python | 284 lines | 196 code | 65 blank | 23 comment | 10 complexity | ef67be54b8389d274bda2659fdc137ac MD5 | raw file
  1. import random
  2. import numpy as np
  3. import pytest
  4. from pandas.errors import PerformanceWarning, UnsortedIndexError
  5. from pandas import CategoricalIndex, DataFrame, Index, MultiIndex, RangeIndex
  6. import pandas._testing as tm
  7. from pandas.core.indexes.frozen import FrozenList
  8. def test_sortlevel(idx):
  9. tuples = list(idx)
  10. random.shuffle(tuples)
  11. index = MultiIndex.from_tuples(tuples)
  12. sorted_idx, _ = index.sortlevel(0)
  13. expected = MultiIndex.from_tuples(sorted(tuples))
  14. assert sorted_idx.equals(expected)
  15. sorted_idx, _ = index.sortlevel(0, ascending=False)
  16. assert sorted_idx.equals(expected[::-1])
  17. sorted_idx, _ = index.sortlevel(1)
  18. by1 = sorted(tuples, key=lambda x: (x[1], x[0]))
  19. expected = MultiIndex.from_tuples(by1)
  20. assert sorted_idx.equals(expected)
  21. sorted_idx, _ = index.sortlevel(1, ascending=False)
  22. assert sorted_idx.equals(expected[::-1])
  23. def test_sortlevel_not_sort_remaining():
  24. mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC"))
  25. sorted_idx, _ = mi.sortlevel("A", sort_remaining=False)
  26. assert sorted_idx.equals(mi)
  27. def test_sortlevel_deterministic():
  28. tuples = [
  29. ("bar", "one"),
  30. ("foo", "two"),
  31. ("qux", "two"),
  32. ("foo", "one"),
  33. ("baz", "two"),
  34. ("qux", "one"),
  35. ]
  36. index = MultiIndex.from_tuples(tuples)
  37. sorted_idx, _ = index.sortlevel(0)
  38. expected = MultiIndex.from_tuples(sorted(tuples))
  39. assert sorted_idx.equals(expected)
  40. sorted_idx, _ = index.sortlevel(0, ascending=False)
  41. assert sorted_idx.equals(expected[::-1])
  42. sorted_idx, _ = index.sortlevel(1)
  43. by1 = sorted(tuples, key=lambda x: (x[1], x[0]))
  44. expected = MultiIndex.from_tuples(by1)
  45. assert sorted_idx.equals(expected)
  46. sorted_idx, _ = index.sortlevel(1, ascending=False)
  47. assert sorted_idx.equals(expected[::-1])
  48. def test_numpy_argsort(idx):
  49. result = np.argsort(idx)
  50. expected = idx.argsort()
  51. tm.assert_numpy_array_equal(result, expected)
  52. # these are the only two types that perform
  53. # pandas compatibility input validation - the
  54. # rest already perform separate (or no) such
  55. # validation via their 'values' attribute as
  56. # defined in pandas.core.indexes/base.py - they
  57. # cannot be changed at the moment due to
  58. # backwards compatibility concerns
  59. if isinstance(type(idx), (CategoricalIndex, RangeIndex)):
  60. msg = "the 'axis' parameter is not supported"
  61. with pytest.raises(ValueError, match=msg):
  62. np.argsort(idx, axis=1)
  63. msg = "the 'kind' parameter is not supported"
  64. with pytest.raises(ValueError, match=msg):
  65. np.argsort(idx, kind="mergesort")
  66. msg = "the 'order' parameter is not supported"
  67. with pytest.raises(ValueError, match=msg):
  68. np.argsort(idx, order=("a", "b"))
  69. def test_unsortedindex():
  70. # GH 11897
  71. mi = MultiIndex.from_tuples(
  72. [("z", "a"), ("x", "a"), ("y", "b"), ("x", "b"), ("y", "a"), ("z", "b")],
  73. names=["one", "two"],
  74. )
  75. df = DataFrame([[i, 10 * i] for i in range(6)], index=mi, columns=["one", "two"])
  76. # GH 16734: not sorted, but no real slicing
  77. result = df.loc(axis=0)["z", "a"]
  78. expected = df.iloc[0]
  79. tm.assert_series_equal(result, expected)
  80. msg = (
  81. "MultiIndex slicing requires the index to be lexsorted: "
  82. r"slicing on levels \[1\], lexsort depth 0"
  83. )
  84. with pytest.raises(UnsortedIndexError, match=msg):
  85. df.loc(axis=0)["z", slice("a")]
  86. df.sort_index(inplace=True)
  87. assert len(df.loc(axis=0)["z", :]) == 2
  88. with pytest.raises(KeyError, match="'q'"):
  89. df.loc(axis=0)["q", :]
  90. def test_unsortedindex_doc_examples():
  91. # https://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex
  92. dfm = DataFrame(
  93. {"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)}
  94. )
  95. dfm = dfm.set_index(["jim", "joe"])
  96. with tm.assert_produces_warning(PerformanceWarning):
  97. dfm.loc[(1, "z")]
  98. msg = r"Key length \(2\) was greater than MultiIndex lexsort depth \(1\)"
  99. with pytest.raises(UnsortedIndexError, match=msg):
  100. dfm.loc[(0, "y"):(1, "z")]
  101. assert not dfm.index.is_lexsorted()
  102. assert dfm.index.lexsort_depth == 1
  103. # sort it
  104. dfm = dfm.sort_index()
  105. dfm.loc[(1, "z")]
  106. dfm.loc[(0, "y"):(1, "z")]
  107. assert dfm.index.is_lexsorted()
  108. assert dfm.index.lexsort_depth == 2
  109. def test_reconstruct_sort():
  110. # starts off lexsorted & monotonic
  111. mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
  112. assert mi.is_lexsorted()
  113. assert mi.is_monotonic
  114. recons = mi._sort_levels_monotonic()
  115. assert recons.is_lexsorted()
  116. assert recons.is_monotonic
  117. assert mi is recons
  118. assert mi.equals(recons)
  119. assert Index(mi.values).equals(Index(recons.values))
  120. # cannot convert to lexsorted
  121. mi = MultiIndex.from_tuples(
  122. [("z", "a"), ("x", "a"), ("y", "b"), ("x", "b"), ("y", "a"), ("z", "b")],
  123. names=["one", "two"],
  124. )
  125. assert not mi.is_lexsorted()
  126. assert not mi.is_monotonic
  127. recons = mi._sort_levels_monotonic()
  128. assert not recons.is_lexsorted()
  129. assert not recons.is_monotonic
  130. assert mi.equals(recons)
  131. assert Index(mi.values).equals(Index(recons.values))
  132. # cannot convert to lexsorted
  133. mi = MultiIndex(
  134. levels=[["b", "d", "a"], [1, 2, 3]],
  135. codes=[[0, 1, 0, 2], [2, 0, 0, 1]],
  136. names=["col1", "col2"],
  137. )
  138. assert not mi.is_lexsorted()
  139. assert not mi.is_monotonic
  140. recons = mi._sort_levels_monotonic()
  141. assert not recons.is_lexsorted()
  142. assert not recons.is_monotonic
  143. assert mi.equals(recons)
  144. assert Index(mi.values).equals(Index(recons.values))
  145. def test_reconstruct_remove_unused():
  146. # xref to GH 2770
  147. df = DataFrame(
  148. [["deleteMe", 1, 9], ["keepMe", 2, 9], ["keepMeToo", 3, 9]],
  149. columns=["first", "second", "third"],
  150. )
  151. df2 = df.set_index(["first", "second"], drop=False)
  152. df2 = df2[df2["first"] != "deleteMe"]
  153. # removed levels are there
  154. expected = MultiIndex(
  155. levels=[["deleteMe", "keepMe", "keepMeToo"], [1, 2, 3]],
  156. codes=[[1, 2], [1, 2]],
  157. names=["first", "second"],
  158. )
  159. result = df2.index
  160. tm.assert_index_equal(result, expected)
  161. expected = MultiIndex(
  162. levels=[["keepMe", "keepMeToo"], [2, 3]],
  163. codes=[[0, 1], [0, 1]],
  164. names=["first", "second"],
  165. )
  166. result = df2.index.remove_unused_levels()
  167. tm.assert_index_equal(result, expected)
  168. # idempotent
  169. result2 = result.remove_unused_levels()
  170. tm.assert_index_equal(result2, expected)
  171. assert result2.is_(result)
  172. @pytest.mark.parametrize(
  173. "first_type,second_type", [("int64", "int64"), ("datetime64[D]", "str")]
  174. )
  175. def test_remove_unused_levels_large(first_type, second_type):
  176. # GH16556
  177. # because tests should be deterministic (and this test in particular
  178. # checks that levels are removed, which is not the case for every
  179. # random input):
  180. rng = np.random.RandomState(4) # seed is arbitrary value that works
  181. size = 1 << 16
  182. df = DataFrame(
  183. {
  184. "first": rng.randint(0, 1 << 13, size).astype(first_type),
  185. "second": rng.randint(0, 1 << 10, size).astype(second_type),
  186. "third": rng.rand(size),
  187. }
  188. )
  189. df = df.groupby(["first", "second"]).sum()
  190. df = df[df.third < 0.1]
  191. result = df.index.remove_unused_levels()
  192. assert len(result.levels[0]) < len(df.index.levels[0])
  193. assert len(result.levels[1]) < len(df.index.levels[1])
  194. assert result.equals(df.index)
  195. expected = df.reset_index().set_index(["first", "second"]).index
  196. tm.assert_index_equal(result, expected)
  197. @pytest.mark.parametrize("level0", [["a", "d", "b"], ["a", "d", "b", "unused"]])
  198. @pytest.mark.parametrize(
  199. "level1", [["w", "x", "y", "z"], ["w", "x", "y", "z", "unused"]]
  200. )
  201. def test_remove_unused_nan(level0, level1):
  202. # GH 18417
  203. mi = MultiIndex(levels=[level0, level1], codes=[[0, 2, -1, 1, -1], [0, 1, 2, 3, 2]])
  204. result = mi.remove_unused_levels()
  205. tm.assert_index_equal(result, mi)
  206. for level in 0, 1:
  207. assert "unused" not in result.levels[level]
  208. def test_argsort(idx):
  209. result = idx.argsort()
  210. expected = idx.values.argsort()
  211. tm.assert_numpy_array_equal(result, expected)
  212. def test_remove_unused_levels_with_nan():
  213. # GH 37510
  214. idx = Index([(1, np.nan), (3, 4)]).rename(["id1", "id2"])
  215. idx = idx.set_levels(["a", np.nan], level="id1")
  216. idx = idx.remove_unused_levels()
  217. result = idx.levels
  218. expected = FrozenList([["a", np.nan], [4]])
  219. assert str(result) == str(expected)