PageRenderTime 55ms CodeModel.GetById 26ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/tools/tests/test_pivot.py

https://github.com/thouis/pandas
Python | 326 lines | 305 code | 19 blank | 2 comment | 4 complexity | 882426cd69b18bc6645d3ff696c8ccf1 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import unittest
  2. import numpy as np
  3. from pandas import DataFrame, Series
  4. from pandas.tools.merge import concat
  5. from pandas.tools.pivot import pivot_table, crosstab
  6. import pandas.util.testing as tm
  7. class TestPivotTable(unittest.TestCase):
  8. _multiprocess_can_split_ = True
  9. def setUp(self):
  10. self.data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo',
  11. 'bar', 'bar', 'bar', 'bar',
  12. 'foo', 'foo', 'foo'],
  13. 'B' : ['one', 'one', 'one', 'two',
  14. 'one', 'one', 'one', 'two',
  15. 'two', 'two', 'one'],
  16. 'C' : ['dull', 'dull', 'shiny', 'dull',
  17. 'dull', 'shiny', 'shiny', 'dull',
  18. 'shiny', 'shiny', 'shiny'],
  19. 'D' : np.random.randn(11),
  20. 'E' : np.random.randn(11),
  21. 'F' : np.random.randn(11)})
  22. def test_pivot_table(self):
  23. rows = ['A', 'B']
  24. cols= 'C'
  25. table = pivot_table(self.data, values='D', rows=rows, cols=cols)
  26. table2 = self.data.pivot_table(values='D', rows=rows, cols=cols)
  27. tm.assert_frame_equal(table, table2)
  28. # this works
  29. pivot_table(self.data, values='D', rows=rows)
  30. if len(rows) > 1:
  31. self.assertEqual(table.index.names, rows)
  32. else:
  33. self.assertEqual(table.index.name, rows[0])
  34. if len(cols) > 1:
  35. self.assertEqual(table.columns.names, cols)
  36. else:
  37. self.assertEqual(table.columns.name, cols[0])
  38. expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack()
  39. tm.assert_frame_equal(table, expected)
  40. def test_pass_array(self):
  41. result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C)
  42. expected = self.data.pivot_table('D', rows='A', cols='C')
  43. tm.assert_frame_equal(result, expected)
  44. def test_pass_function(self):
  45. result = self.data.pivot_table('D', rows=lambda x: x // 5,
  46. cols=self.data.C)
  47. expected = self.data.pivot_table('D', rows=self.data.index // 5,
  48. cols='C')
  49. tm.assert_frame_equal(result, expected)
  50. def test_pivot_table_multiple(self):
  51. rows = ['A', 'B']
  52. cols= 'C'
  53. table = pivot_table(self.data, rows=rows, cols=cols)
  54. expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack()
  55. tm.assert_frame_equal(table, expected)
  56. def test_pivot_multi_values(self):
  57. result = pivot_table(self.data, values=['D', 'E'],
  58. rows='A', cols=['B', 'C'], fill_value=0)
  59. expected = pivot_table(self.data.drop(['F'], axis=1),
  60. rows='A', cols=['B', 'C'], fill_value=0)
  61. tm.assert_frame_equal(result, expected)
  62. def test_pivot_multi_functions(self):
  63. f = lambda func: pivot_table(self.data, values=['D', 'E'],
  64. rows=['A', 'B'], cols='C',
  65. aggfunc=func)
  66. result = f([np.mean, np.std])
  67. means = f(np.mean)
  68. stds = f(np.std)
  69. expected = concat([means, stds], keys=['mean', 'std'], axis=1)
  70. tm.assert_frame_equal(result, expected)
  71. # margins not supported??
  72. f = lambda func: pivot_table(self.data, values=['D', 'E'],
  73. rows=['A', 'B'], cols='C',
  74. aggfunc=func, margins=True)
  75. result = f([np.mean, np.std])
  76. means = f(np.mean)
  77. stds = f(np.std)
  78. expected = concat([means, stds], keys=['mean', 'std'], axis=1)
  79. tm.assert_frame_equal(result, expected)
  80. def test_margins(self):
  81. def _check_output(res, col, rows=['A', 'B'], cols=['C']):
  82. cmarg = res['All'][:-1]
  83. exp = self.data.groupby(rows)[col].mean()
  84. tm.assert_series_equal(cmarg, exp)
  85. rmarg = res.xs(('All', ''))[:-1]
  86. exp = self.data.groupby(cols)[col].mean()
  87. tm.assert_series_equal(rmarg, exp)
  88. gmarg = res['All']['All', '']
  89. exp = self.data[col].mean()
  90. self.assertEqual(gmarg, exp)
  91. # column specified
  92. table = self.data.pivot_table('D', rows=['A', 'B'], cols='C',
  93. margins=True, aggfunc=np.mean)
  94. _check_output(table, 'D')
  95. # no column specified
  96. table = self.data.pivot_table(rows=['A', 'B'], cols='C',
  97. margins=True, aggfunc=np.mean)
  98. for valcol in table.columns.levels[0]:
  99. _check_output(table[valcol], valcol)
  100. # no col
  101. # to help with a buglet
  102. self.data.columns = [k * 2 for k in self.data.columns]
  103. table = self.data.pivot_table(rows=['AA', 'BB'], margins=True,
  104. aggfunc=np.mean)
  105. for valcol in table.columns:
  106. gmarg = table[valcol]['All', '']
  107. self.assertEqual(gmarg, self.data[valcol].mean())
  108. # this is OK
  109. table = self.data.pivot_table(rows=['AA', 'BB'], margins=True,
  110. aggfunc='mean')
  111. # no rows
  112. rtable = self.data.pivot_table(cols=['AA', 'BB'], margins=True,
  113. aggfunc=np.mean)
  114. self.assert_(isinstance(rtable, Series))
  115. for item in ['DD', 'EE', 'FF']:
  116. gmarg = table[item]['All', '']
  117. self.assertEqual(gmarg, self.data[item].mean())
  118. def test_pivot_integer_columns(self):
  119. # caused by upstream bug in unstack
  120. from pandas.util.compat import product
  121. import datetime
  122. import pandas
  123. d = datetime.date.min
  124. data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
  125. [d + datetime.timedelta(i) for i in xrange(20)], [1.0]))
  126. df = pandas.DataFrame(data)
  127. table = df.pivot_table(values=4, rows=[0,1,3],cols=[2])
  128. df2 = df.rename(columns=str)
  129. table2 = df2.pivot_table(values='4', rows=['0','1','3'], cols=['2'])
  130. tm.assert_frame_equal(table, table2)
  131. def test_pivot_no_level_overlap(self):
  132. # GH #1181
  133. data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2,
  134. 'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2,
  135. 'c': (['foo'] * 4 + ['bar'] * 4) * 2,
  136. 'value': np.random.randn(16)})
  137. table = data.pivot_table('value', rows='a', cols=['b', 'c'])
  138. grouped = data.groupby(['a', 'b', 'c'])['value'].mean()
  139. expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all')
  140. tm.assert_frame_equal(table, expected)
  141. def test_pivot_columns_lexsorted(self):
  142. import datetime
  143. import numpy as np
  144. import pandas
  145. n = 10000
  146. dtype = np.dtype([
  147. ("Index", object),
  148. ("Symbol", object),
  149. ("Year", int),
  150. ("Month", int),
  151. ("Day", int),
  152. ("Quantity", int),
  153. ("Price", float),
  154. ])
  155. products = np.array([
  156. ('SP500', 'ADBE'),
  157. ('SP500', 'NVDA'),
  158. ('SP500', 'ORCL'),
  159. ('NDQ100', 'AAPL'),
  160. ('NDQ100', 'MSFT'),
  161. ('NDQ100', 'GOOG'),
  162. ('FTSE', 'DGE.L'),
  163. ('FTSE', 'TSCO.L'),
  164. ('FTSE', 'GSK.L'),
  165. ], dtype=[('Index', object), ('Symbol', object)])
  166. items = np.empty(n, dtype=dtype)
  167. iproduct = np.random.randint(0, len(products), n)
  168. items['Index'] = products['Index'][iproduct]
  169. items['Symbol'] = products['Symbol'][iproduct]
  170. dr = pandas.date_range(datetime.date(2000, 1, 1),
  171. datetime.date(2010, 12, 31))
  172. dates = dr[np.random.randint(0, len(dr), n)]
  173. items['Year'] = dates.year
  174. items['Month'] = dates.month
  175. items['Day'] = dates.day
  176. items['Price'] = np.random.lognormal(4.0, 2.0, n)
  177. df = DataFrame(items)
  178. pivoted = df.pivot_table('Price', rows=['Month', 'Day'],
  179. cols=['Index', 'Symbol', 'Year'],
  180. aggfunc='mean')
  181. self.assert_(pivoted.columns.is_monotonic)
  182. class TestCrosstab(unittest.TestCase):
  183. def setUp(self):
  184. df = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo',
  185. 'bar', 'bar', 'bar', 'bar',
  186. 'foo', 'foo', 'foo'],
  187. 'B' : ['one', 'one', 'one', 'two',
  188. 'one', 'one', 'one', 'two',
  189. 'two', 'two', 'one'],
  190. 'C' : ['dull', 'dull', 'shiny', 'dull',
  191. 'dull', 'shiny', 'shiny', 'dull',
  192. 'shiny', 'shiny', 'shiny'],
  193. 'D' : np.random.randn(11),
  194. 'E' : np.random.randn(11),
  195. 'F' : np.random.randn(11)})
  196. self.df = df.append(df, ignore_index=True)
  197. def test_crosstab_single(self):
  198. df = self.df
  199. result = crosstab(df['A'], df['C'])
  200. expected = df.groupby(['A', 'C']).size().unstack()
  201. tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64))
  202. def test_crosstab_multiple(self):
  203. df = self.df
  204. result = crosstab(df['A'], [df['B'], df['C']])
  205. expected = df.groupby(['A', 'B', 'C']).size()
  206. expected = expected.unstack('B').unstack('C').fillna(0).astype(np.int64)
  207. tm.assert_frame_equal(result, expected)
  208. result = crosstab([df['B'], df['C']], df['A'])
  209. expected = df.groupby(['B', 'C', 'A']).size()
  210. expected = expected.unstack('A').fillna(0).astype(np.int64)
  211. tm.assert_frame_equal(result, expected)
  212. def test_crosstab_ndarray(self):
  213. a = np.random.randint(0, 5, size=100)
  214. b = np.random.randint(0, 3, size=100)
  215. c = np.random.randint(0, 10, size=100)
  216. df = DataFrame({'a': a, 'b': b, 'c': c})
  217. result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'))
  218. expected = crosstab(df['a'], [df['b'], df['c']])
  219. tm.assert_frame_equal(result, expected)
  220. result = crosstab([b, c], a, colnames=['a'], rownames=('b', 'c'))
  221. expected = crosstab([df['b'], df['c']], df['a'])
  222. tm.assert_frame_equal(result, expected)
  223. # assign arbitrary names
  224. result = crosstab(self.df['A'].values, self.df['C'].values)
  225. self.assertEqual(result.index.name, 'row_0')
  226. self.assertEqual(result.columns.name, 'col_0')
  227. def test_crosstab_margins(self):
  228. a = np.random.randint(0, 7, size=100)
  229. b = np.random.randint(0, 3, size=100)
  230. c = np.random.randint(0, 5, size=100)
  231. df = DataFrame({'a': a, 'b': b, 'c': c})
  232. result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'),
  233. margins=True)
  234. self.assertEqual(result.index.names, ['a'])
  235. self.assertEqual(result.columns.names, ['b', 'c'])
  236. all_cols = result['All', '']
  237. exp_cols = df.groupby(['a']).size().astype('i8')
  238. exp_cols = exp_cols.append(Series([len(df)], index=['All']))
  239. tm.assert_series_equal(all_cols, exp_cols)
  240. all_rows = result.ix['All']
  241. exp_rows = df.groupby(['b', 'c']).size().astype('i8')
  242. exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')]))
  243. exp_rows = exp_rows.reindex(all_rows.index)
  244. exp_rows = exp_rows.fillna(0).astype(np.int64)
  245. tm.assert_series_equal(all_rows, exp_rows)
  246. def test_crosstab_pass_values(self):
  247. a = np.random.randint(0, 7, size=100)
  248. b = np.random.randint(0, 3, size=100)
  249. c = np.random.randint(0, 5, size=100)
  250. values = np.random.randn(100)
  251. table = crosstab([a, b], c, values, aggfunc=np.sum,
  252. rownames=['foo', 'bar'], colnames=['baz'])
  253. df = DataFrame({'foo': a, 'bar': b, 'baz': c, 'values' : values})
  254. expected = df.pivot_table('values', rows=['foo', 'bar'], cols='baz',
  255. aggfunc=np.sum)
  256. tm.assert_frame_equal(table, expected)
  257. if __name__ == '__main__':
  258. import nose
  259. nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
  260. exit=False)