
/pandas/core/reshape.py

http://github.com/pydata/pandas
Possible License(s): BSD-3-Clause, Apache-2.0

# pylint: disable=E1101,E1103
# pylint: disable=W0703,W0622,W0613,W0201

from pandas.compat import range, zip
from pandas import compat
import itertools

import numpy as np

from pandas.core.series import Series
from pandas.core.frame import DataFrame

from pandas.core.categorical import Categorical
from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote,
                                isnull)
from pandas.core.groupby import (get_group_index, _compress_group_index,
                                 decons_group_index)
import pandas.core.common as com
import pandas.algos as algos

from pandas.core.index import MultiIndex, _get_na_value


class _Unstacker(object):
    """
    Helper class to unstack data / pivot with multi-level index

    Parameters
    ----------
    level : int or str, default last level
        Level to "unstack". Accepts a name for the level.

    Examples
    --------
    >>> import pandas as pd
    >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
    ...                                    ('two', 'a'), ('two', 'b')])
    >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
    >>> s
    one  a    1
         b    2
    two  a    3
         b    4
    dtype: float64

    >>> s.unstack(level=-1)
         a  b
    one  1  2
    two  3  4

    >>> s.unstack(level=0)
       one  two
    a    1    3
    b    2    4

    Returns
    -------
    unstacked : DataFrame
    """

    def __init__(self, values, index, level=-1, value_columns=None):
        if values.ndim == 1:
            values = values[:, np.newaxis]
        self.values = values
        self.value_columns = value_columns

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError('must pass column labels for multi-column data')

        self.index = index

        if isinstance(self.index, MultiIndex):
            if index._reference_duplicate_name(level):
                msg = ("Ambiguous reference to {0}. The index "
                       "names are not unique.".format(level))
                raise ValueError(msg)

        self.level = self.index._get_level_number(level)
        levels = index.levels
        labels = index.labels

        def _make_index(lev, lab):
            values = _make_index_array_level(lev.values, lab)
            i = lev._simple_new(values, lev.name,
                                freq=getattr(lev, 'freq', None),
                                tz=getattr(lev, 'tz', None))
            return i

        self.new_index_levels = [_make_index(lev, lab)
                                 for lev, lab in zip(levels, labels)]
        self.new_index_names = list(index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)

        self._make_sorted_values_labels()
        self._make_selectors()

    def _make_sorted_values_labels(self):
        v = self.level

        labs = list(self.index.labels)
        levs = list(self.index.levels)
        to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)

        # group_index = get_group_index(to_sort, sizes)
        # comp_index, obs_ids = _compress_group_index(group_index)

        ngroups = len(obs_ids)

        indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
        indexer = _ensure_platform_int(indexer)

        self.sorted_values = com.take_nd(self.values, indexer, axis=0)
        self.sorted_labels = [l.take(indexer) for l in to_sort]

    def _make_selectors(self):
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = [len(x) for x in new_levels]

        comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
        ngroups = len(obs_ids)

        comp_index = _ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level]
        self.full_shape = ngroups, stride

        selector = self.sorted_labels[-1] + stride * comp_index
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        if mask.sum() < len(self.index):
            raise ValueError('Index contains duplicate entries, '
                             'cannot reshape')

        self.group_index = comp_index
        self.mask = mask
        self.unique_groups = obs_ids
        self.compressor = comp_index.searchsorted(np.arange(ngroups))

    def get_result(self):
        # TODO: find a better way than this masking business

        values, value_mask = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # filter out missing levels
        if values.shape[1] > 0:
            col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
            # rare case, level values not observed
            if len(obs_ids) < self.full_shape[1]:
                inds = (value_mask.sum(0) > 0).nonzero()[0]
                values = com.take_nd(values, inds, axis=1)
                columns = columns[inds]

        # we might have a missing index
        if len(index) != values.shape[0]:
            mask = isnull(index)
            if mask.any():
                l = np.arange(len(index))
                values, orig_values = (np.empty((len(index), values.shape[1])),
                                       values)
                values.fill(np.nan)
                values_indexer = com._ensure_int64(l[~mask])
                for i, j in enumerate(values_indexer):
                    values[j] = orig_values[i]
            else:
                index = index.take(self.unique_groups)

        return DataFrame(values, index=index, columns=columns)

    def get_new_values(self):
        values = self.values

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)

        # if our mask is all True, then we can use our existing dtype
        if self.mask.all():
            dtype = values.dtype
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            dtype, fill_value = _maybe_promote(values.dtype)
            new_values = np.empty(result_shape, dtype=dtype)
            new_values.fill(fill_value)

        new_mask = np.zeros(result_shape, dtype=bool)

        # is there a simpler / faster way of doing this?
        for i in range(values.shape[1]):
            chunk = new_values[:, i * width: (i + 1) * width]
            mask_chunk = new_mask[:, i * width: (i + 1) * width]

            chunk.flat[self.mask] = self.sorted_values[:, i]
            mask_chunk.flat[self.mask] = True

        return new_values, new_mask

    def get_new_columns(self):
        if self.value_columns is None:
            return self.removed_level

        stride = len(self.removed_level)
        width = len(self.value_columns)
        propagator = np.repeat(np.arange(width), stride)
        if isinstance(self.value_columns, MultiIndex):
            new_levels = self.value_columns.levels + (self.removed_level,)
            new_names = self.value_columns.names + (self.removed_name,)

            new_labels = [lab.take(propagator)
                          for lab in self.value_columns.labels]
            new_labels.append(np.tile(np.arange(stride), width))
        else:
            new_levels = [self.value_columns, self.removed_level]
            new_names = [self.value_columns.name, self.removed_name]

            new_labels = []
            new_labels.append(propagator)
            new_labels.append(np.tile(np.arange(stride), width))

        return MultiIndex(levels=new_levels, labels=new_labels,
                          names=new_names, verify_integrity=False)

    def get_new_index(self):
        result_labels = []
        for cur in self.sorted_labels[:-1]:
            labels = cur.take(self.compressor)
            labels = _make_index_array_level(labels, cur)
            result_labels.append(labels)

        # construct the new index
        if len(self.new_index_levels) == 1:
            new_index = self.new_index_levels[0]
            new_index.name = self.new_index_names[0]
        else:
            new_index = MultiIndex(levels=self.new_index_levels,
                                   labels=result_labels,
                                   names=self.new_index_names,
                                   verify_integrity=False)

        return new_index


def _make_index_array_level(lev, lab):
    """ create the combined index array, preserving nans, return an array """
    mask = lab == -1
    if not mask.any():
        return lev

    l = np.arange(len(lab))
    mask_labels = np.empty(len(mask[mask]), dtype=object)
    mask_labels.fill(_get_na_value(lev.dtype.type))
    mask_indexer = com._ensure_int64(l[mask])

    labels = lev
    labels_indexer = com._ensure_int64(l[~mask])

    new_labels = np.empty(tuple([len(lab)]), dtype=object)
    new_labels[labels_indexer] = labels
    new_labels[mask_indexer] = mask_labels

    return new_labels


def _unstack_multiple(data, clocs):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    clabels = [index.labels[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rlabels = [index.labels[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(clabels, shape)

    comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
    recons_labels = decons_group_index(obs_ids, shape)

    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                             labels=rlabels + [comp_ids],
                             names=rnames + ['__placeholder__'],
                             verify_integrity=False)

    if isinstance(data, Series):
        dummy = Series(data.values, index=dummy_index)
        unstacked = dummy.unstack('__placeholder__')
        new_levels = clevels
        new_names = cnames
        new_labels = recons_labels
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                clocs = [val if i > val else val - 1 for val in clocs]

            return result

        dummy = DataFrame(data.values, index=dummy_index,
                          columns=data.columns)

        unstacked = dummy.unstack('__placeholder__')
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_labels = [unstcols.labels[0]]
        for rec in recons_labels:
            new_labels.append(rec.take(unstcols.labels[-1]))

    new_columns = MultiIndex(levels=new_levels, labels=new_labels,
                             names=new_names, verify_integrity=False)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked


def pivot(self, index=None, columns=None, values=None):
    """
    See DataFrame.pivot
    """
    if values is None:
        indexed = self.set_index([index, columns])
        return indexed.unstack(columns)
    else:
        indexed = Series(self[values].values,
                         index=MultiIndex.from_arrays([self[index],
                                                       self[columns]]))
        return indexed.unstack(columns)
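

# Illustrative sketch of the `values` path above. `pivot` is bound as the
# DataFrame.pivot method; the frame and column names below are assumed, not
# taken from this module.
#
# >>> df = DataFrame({'idx': ['one', 'one', 'two', 'two'],
# ...                 'col': ['a', 'b', 'a', 'b'],
# ...                 'val': [1, 2, 3, 4]})
# >>> df.pivot(index='idx', columns='col', values='val')
# col  a  b
# idx
# one  1  2
# two  3  4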


def pivot_simple(index, columns, values):
    """
    Produce 'pivot' table based on 3 columns of this DataFrame.
    Uses unique values from index / columns and fills with values.

    Parameters
    ----------
    index : ndarray
        Labels to use to make new frame's index
    columns : ndarray
        Labels to use to make new frame's columns
    values : ndarray
        Values to use for populating new frame's values

    Notes
    -----
    Obviously, all 3 of the input arguments must have the same length

    Returns
    -------
    DataFrame
    """
    if (len(index) != len(columns)) or (len(columns) != len(values)):
        raise AssertionError('Length of index, columns, and values must be the'
                             ' same')

    if len(index) == 0:
        return DataFrame(index=[])

    hindex = MultiIndex.from_arrays([index, columns])
    series = Series(values.ravel(), index=hindex)
    series = series.sortlevel(0)
    return series.unstack()
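

# Minimal sketch of pivot_simple with assumed ndarray inputs; internally it
# routes through MultiIndex / Series.unstack as shown above.
#
# >>> idx = np.array(['one', 'one', 'two', 'two'])
# >>> cols = np.array(['a', 'b', 'a', 'b'])
# >>> vals = np.array([1., 2., 3., 4.])
# >>> pivot_simple(idx, cols, vals)
#      a  b
# one  1  2
# two  3  4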


def _slow_pivot(index, columns, values):
    """
    Produce 'pivot' table based on 3 columns of this DataFrame.
    Uses unique values from index / columns and fills with values.

    Parameters
    ----------
    index : string or object
        Column name to use to make new frame's index
    columns : string or object
        Column name to use to make new frame's columns
    values : string or object
        Column name to use for populating new frame's values

    Could benefit from some Cython here.
    """
    tree = {}
    for i, (idx, col) in enumerate(zip(index, columns)):
        if col not in tree:
            tree[col] = {}
        branch = tree[col]
        branch[idx] = values[i]

    return DataFrame(tree)
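

# Sketch of the tree built above, for assumed inputs: each column label keys
# a dict of {row label: value}, and missing (row, column) pairs become NaN
# when the dict-of-dicts is handed to DataFrame.
#
# >>> _slow_pivot(['one', 'one', 'two'], ['a', 'b', 'a'], [1, 2, 3])
# # tree == {'a': {'one': 1, 'two': 3}, 'b': {'one': 2}}
#      a    b
# one  1    2
# two  3  NaN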


def unstack(obj, level):
    if isinstance(level, (tuple, list)):
        return _unstack_multiple(obj, level)

    if isinstance(obj, DataFrame):
        if isinstance(obj.index, MultiIndex):
            return _unstack_frame(obj, level)
        else:
            return obj.T.stack(dropna=False)
    else:
        unstacker = _Unstacker(obj.values, obj.index, level=level)
        return unstacker.get_result()
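

# Informal dispatch sketch: list/tuple levels go to _unstack_multiple, a
# MultiIndex-ed DataFrame to _unstack_frame, a flat DataFrame round-trips
# through obj.T.stack(dropna=False), and a Series (assumed two-level index
# below) takes the _Unstacker path:
#
# >>> index = MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
# ...                                 ('two', 'a'), ('two', 'b')])
# >>> s = Series([1., 2., 3., 4.], index=index)
# >>> unstack(s, -1)          # same result as s.unstack(level=-1)
#      a  b
# one  1  2
# two  3  4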


def _unstack_frame(obj, level):
    from pandas.core.internals import BlockManager, make_block

    if obj._is_mixed_type:
        unstacker = _Unstacker(np.empty(obj.shape, dtype=bool),  # dummy
                               obj.index, level=level,
                               value_columns=obj.columns)
        new_columns = unstacker.get_new_columns()
        new_index = unstacker.get_new_index()
        new_axes = [new_columns, new_index]

        new_blocks = []
        mask_blocks = []
        for blk in obj._data.blocks:
            blk_items = obj._data.items[blk.mgr_locs.indexer]
            bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
                                    value_columns=blk_items)
            new_items = bunstacker.get_new_columns()
            new_placement = new_columns.get_indexer(new_items)
            new_values, mask = bunstacker.get_new_values()

            mblk = make_block(mask.T, placement=new_placement)
            mask_blocks.append(mblk)

            newb = make_block(new_values.T, placement=new_placement)
            new_blocks.append(newb)

        result = DataFrame(BlockManager(new_blocks, new_axes))
        mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
        return result.ix[:, mask_frame.sum(0) > 0]
    else:
        unstacker = _Unstacker(obj.values, obj.index, level=level,
                               value_columns=obj.columns)
        return unstacker.get_result()


def get_compressed_ids(labels, sizes):
    # no overflow
    if com._long_prod(sizes) < 2 ** 63:
        group_index = get_group_index(labels, sizes)
        comp_index, obs_ids = _compress_group_index(group_index)
    else:
        n = len(labels[0])
        mask = np.zeros(n, dtype=bool)
        for v in labels:
            mask |= v < 0

        while com._long_prod(sizes) >= 2 ** 63:
            i = len(sizes)
            while com._long_prod(sizes[:i]) >= 2 ** 63:
                i -= 1

            rem_index, rem_ids = get_compressed_ids(labels[:i],
                                                    sizes[:i])
            sizes = [len(rem_ids)] + sizes[i:]
            labels = [rem_index] + labels[i:]

        return get_compressed_ids(labels, sizes)

    return comp_index, obs_ids
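

# Worked example (assumed labels/sizes) of the no-overflow branch above: with
# sizes = [2, 3], the flat group id is label0 * 3 + label1, and
# _compress_group_index then maps the observed ids onto a dense range.
#
# >>> labels = [np.array([0, 0, 1]), np.array([0, 2, 2])]
# >>> get_compressed_ids(labels, [2, 3])   # flat ids are 0, 2, 5
# (array([0, 1, 2]), array([0, 2, 5]))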


def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index

    Returns
    -------
    stacked : Series
    """
    N, K = frame.shape
    if isinstance(frame.columns, MultiIndex):
        if frame.columns._reference_duplicate_name(level):
            msg = ("Ambiguous reference to {0}. The column "
                   "names are not unique.".format(level))
            raise ValueError(msg)

    if isinstance(level, int) and level < 0:
        level += frame.columns.nlevels

    level = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level=level, dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_levels.append(frame.columns)

        new_labels = [lab.repeat(K) for lab in frame.index.labels]
        new_labels.append(np.tile(np.arange(K), N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)

        new_index = MultiIndex(levels=new_levels, labels=new_labels,
                               names=new_names, verify_integrity=False)
    else:
        ilabels = np.arange(N).repeat(K)
        clabels = np.tile(np.arange(K), N).ravel()
        new_index = MultiIndex(levels=[frame.index, frame.columns],
                               labels=[ilabels, clabels],
                               names=[frame.index.name, frame.columns.name],
                               verify_integrity=False)

    new_values = frame.values.ravel()
    if dropna:
        mask = notnull(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return Series(new_values, index=new_index)
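

# Minimal sketch of the flat-index branch, with an assumed 2x2 frame: row and
# column labels are crossed into a two-level MultiIndex and the values are
# raveled row-major.
#
# >>> df = DataFrame([[1, 2], [3, 4]], index=['one', 'two'],
# ...                columns=['a', 'b'])
# >>> stack(df)
# one  a    1
#      b    2
# two  a    3
#      b    4
# dtype: int64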


def _stack_multi_columns(frame, level=-1, dropna=True):
    this = frame.copy()

    # this makes life much simpler
    if level != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level, frame.columns.nlevels - 1):
            roll_columns = roll_columns.swaplevel(i, i + 1)
        this.columns = roll_columns

    if not this.columns.is_lexsorted():
        this = this.sortlevel(0, axis=1)

    # tuple list excluding level for grouping columns
    if len(frame.columns.levels) > 2:
        tuples = list(zip(*[
            lev.values.take(lab) for lev, lab in
            zip(this.columns.levels[:-1], this.columns.labels[:-1])
        ]))
        unique_groups = [key for key, _ in itertools.groupby(tuples)]
        new_names = this.columns.names[:-1]
        new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
    else:
        new_columns = unique_groups = this.columns.levels[0]

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    levsize = len(level_vals)
    drop_cols = []
    for key in unique_groups:
        loc = this.columns.get_loc(key)
        slice_len = loc.stop - loc.start
        # can make more efficient?

        if slice_len == 0:
            drop_cols.append(key)
            continue
        elif slice_len != levsize:
            chunk = this.ix[:, this.columns[loc]]
            chunk.columns = level_vals.take(chunk.columns.labels[-1])
            value_slice = chunk.reindex(columns=level_vals).values
        else:
            if frame._is_mixed_type:
                value_slice = this.ix[:, this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        new_data[key] = value_slice.ravel()

    if len(drop_cols) > 0:
        new_columns = new_columns - drop_cols

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_labels = [lab.repeat(levsize) for lab in this.index.labels]
    else:
        new_levels = [this.index]
        new_labels = [np.arange(N).repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(frame.columns.levels[level])
    new_labels.append(np.tile(np.arange(levsize), N))
    new_names.append(frame.columns.names[level])

    new_index = MultiIndex(levels=new_levels, labels=new_labels,
                           names=new_names, verify_integrity=False)

    result = DataFrame(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how='all')

    return result
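

# Sketch (assumed 1x4 frame) of the grouping above: stacking the innermost
# column level groups columns by the remaining level(s), so each group
# contributes a length-levsize slice per row.
#
# >>> cols = MultiIndex.from_tuples([('A', 'x'), ('A', 'y'),
# ...                                ('B', 'x'), ('B', 'y')])
# >>> df = DataFrame([[1, 2, 3, 4]], columns=cols)
# >>> stack(df)            # dispatches to _stack_multi_columns
#      A  B
# 0 x  1  3
#   y  2  4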


def melt(frame, id_vars=None, value_vars=None,
         var_name=None, value_name='value', col_level=None):
    """
    "Unpivots" a DataFrame from wide format to long format, optionally leaving
    identifier variables set.

    This function is useful to massage a DataFrame into a format where one
    or more columns are identifier variables (`id_vars`), while all other
    columns, considered measured variables (`value_vars`), are "unpivoted" to
    the row axis, leaving just two non-identifier columns, 'variable' and
    'value'.

    Parameters
    ----------
    frame : DataFrame
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
        are not set as `id_vars`.
    var_name : scalar
        Name to use for the 'variable' column. If None it uses
        ``frame.columns.name`` or 'variable'.
    value_name : scalar, default 'value'
        Name to use for the 'value' column.
    col_level : int or string, optional
        If columns are a MultiIndex then use this level to melt.

    See also
    --------
    pivot_table
    DataFrame.pivot

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
    ...                    'B': {0: 1, 1: 3, 2: 5},
    ...                    'C': {0: 2, 1: 4, 2: 6}})

    >>> df
       A  B  C
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> pd.melt(df, id_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5
    3  a        C      2
    4  b        C      4
    5  c        C      6

    The names of 'variable' and 'value' columns can be customized:

    >>> pd.melt(df, id_vars=['A'], value_vars=['B'],
    ...         var_name='myVarname', value_name='myValname')
       A myVarname  myValname
    0  a         B          1
    1  b         B          3
    2  c         B          5

    If you have multi-index columns:

    >>> df.columns = [list('ABC'), list('DEF')]
    >>> df
       A  B  C
       D  E  F
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> pd.melt(df, col_level=0, id_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> pd.melt(df, id_vars=[('A', 'D')], value_vars=[('B', 'E')])
      (A, D) variable_0 variable_1  value
    0      a          B          E      1
    1      b          B          E      3
    2      c          B          E      5

    """
    # TODO: what about the existing index?
    if id_vars is not None:
        if not isinstance(id_vars, (tuple, list, np.ndarray)):
            id_vars = [id_vars]
        else:
            id_vars = list(id_vars)
    else:
        id_vars = []

    if value_vars is not None:
        if not isinstance(value_vars, (tuple, list, np.ndarray)):
            value_vars = [value_vars]
        frame = frame.ix[:, id_vars + value_vars]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = ['variable_%s' % i for i in
                            range(len(frame.columns.names))]
        else:
            var_name = [frame.columns.name if frame.columns.name is not None
                        else 'variable']
    if isinstance(var_name, compat.string_types):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        mdata[col] = np.tile(frame.pop(col).values, K)

    mcolumns = id_vars + var_name + [value_name]

    mdata[value_name] = frame.values.ravel('F')
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(frame.columns.get_level_values(i)).repeat(N)

    return DataFrame(mdata, columns=mcolumns)


def lreshape(data, groups, dropna=True, label=None):
    """
    Reshape wide-format data to long. Generalized inverse of DataFrame.pivot

    Parameters
    ----------
    data : DataFrame
    groups : dict
        {new_name : list_of_columns}
    dropna : boolean, default True

    Examples
    --------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2008], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2008   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team   hr  year
    0  Red Sox  514  2007
    1  Yankees  573  2008
    2  Red Sox  545  2008
    3  Yankees  526  2008

    Returns
    -------
    reshaped : DataFrame
    """
    if isinstance(groups, dict):
        keys = list(groups.keys())
        values = list(groups.values())
    else:
        keys, values = zip(*groups)

    all_cols = list(set.union(*[set(x) for x in values]))
    id_cols = list(data.columns.diff(all_cols))

    K = len(values[0])

    for seq in values:
        if len(seq) != K:
            raise ValueError('All column lists must be same length')

    mdata = {}
    pivot_cols = []

    for target, names in zip(keys, values):
        mdata[target] = com._concat_compat([data[col].values for col in names])
        pivot_cols.append(target)

    for col in id_cols:
        mdata[col] = np.tile(data[col].values, K)

    if dropna:
        mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for c in pivot_cols:
            mask &= notnull(mdata[c])
        if not mask.all():
            mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata))

    return DataFrame(mdata, columns=id_cols + pivot_cols)


def wide_to_long(df, stubnames, i, j):
    """
    Wide panel to long format. Less flexible but more user-friendly than melt.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame
    stubnames : list
        A list of stub names. The wide format variables are assumed to
        start with the stub names.
    i : str
        The name of the id variable.
    j : str
        The name of the subobservation variable.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable as well as
        variables for i and j.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X"     : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> wide_to_long(df, ["A", "B"], i="id", j="year")
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    Notes
    -----
    All extra variables are treated as extra id variables. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.
    """
    def get_var_names(df, regex):
        return df.filter(regex=regex).columns.tolist()

    def melt_stub(df, stub, i, j):
        varnames = get_var_names(df, "^" + stub)
        newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub,
                     var_name=j)
        newdf_j = newdf[j].str.replace(stub, "")
        try:
            newdf_j = newdf_j.astype(int)
        except ValueError:
            pass
        newdf[j] = newdf_j
        return newdf

    id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames))
    if i not in id_vars:
        id_vars += [i]

    stub = stubnames.pop(0)
    newdf = melt_stub(df, stub, id_vars, j)

    for stub in stubnames:
        new = melt_stub(df, stub, id_vars, j)
        newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
    return newdf.set_index([i, j])


def convert_dummies(data, cat_variables, prefix_sep='_'):
    """
    Compute DataFrame with specified columns converted to dummy variables (0 /
    1). Result columns will be prefixed with the column name, then the level
    name, e.g. 'A_foo' for column A and level foo

    Parameters
    ----------
    data : DataFrame
    cat_variables : list-like
        Must be column names in the DataFrame
    prefix_sep : string, default '_'
        String to use to separate column name from dummy level

    Returns
    -------
    dummies : DataFrame
    """
    result = data.drop(cat_variables, axis=1)
    for variable in cat_variables:
        dummies = get_dummies(data[variable], prefix=variable,
                              prefix_sep=prefix_sep)
        result = result.join(dummies)
    return result
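

# Usage sketch with an assumed frame: each listed column is dropped and
# replaced by prefixed indicator columns via get_dummies.
#
# >>> df = DataFrame({'A': ['a', 'b', 'a'], 'B': [1, 2, 3]})
# >>> convert_dummies(df, ['A'])
#    B  A_a  A_b
# 0  1    1    0
# 1  2    0    1
# 2  3    1    0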


def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like or Series
    prefix : string, default None
        String to append DataFrame column names
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.

    Returns
    -------
    dummies : DataFrame

    Examples
    --------
    >>> import pandas as pd
    >>> s = pd.Series(list('abca'))

    >>> get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    See also ``Series.str.get_dummies``.
    """
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data))
    levels = cat.levels

    # if all NaN
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    number_of_cols = len(levels)
    if dummy_na:
        number_of_cols += 1

    dummy_mat = np.eye(number_of_cols).take(cat.labels, axis=0)

    if dummy_na:
        levels = np.append(cat.levels, np.nan)
    else:
        # reset NaN GH4446
        dummy_mat[cat.labels == -1] = 0

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v)
                      for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    return DataFrame(dummy_mat, index=index, columns=dummy_cols)


def make_axis_dummies(frame, axis='minor', transform=None):
    """
    Construct 1-0 dummy variables corresponding to designated axis
    labels

    Parameters
    ----------
    frame : DataFrame
    axis : {'major', 'minor'}, default 'minor'
    transform : function, default None
        Function to apply to axis labels first. For example, to
        get "day of week" dummies in a time series regression
        you might call::

            make_axis_dummies(panel, axis='major',
                              transform=lambda d: d.weekday())

    Returns
    -------
    dummies : DataFrame
        Column names taken from chosen axis
    """
    numbers = {
        'major': 0,
        'minor': 1
    }
    num = numbers.get(axis, axis)

    items = frame.index.levels[num]
    labels = frame.index.labels[num]
    if transform is not None:
        mapped_items = items.map(transform)
        cat = Categorical.from_array(mapped_items.take(labels))
        labels = cat.labels
        items = cat.levels

    values = np.eye(len(items), dtype=float)
    values = values.take(labels, axis=0)

    return DataFrame(values, columns=items, index=frame.index)
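

# Sketch for an assumed long-format frame whose index is a (major, minor)
# MultiIndex: axis='minor' one-hot encodes the second index level.
#
# >>> idx = MultiIndex.from_tuples([('d1', 'A'), ('d1', 'B'), ('d2', 'A')])
# >>> frame = DataFrame({'x': [1., 2., 3.]}, index=idx)
# >>> make_axis_dummies(frame, axis='minor')
#       A  B
# d1 A  1  0
#    B  0  1
# d2 A  1  0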


def block2d_to_blocknd(values, placement, shape, labels, ref_items):
    """ pivot to the labels shape """
    from pandas.core.internals import make_block

    panel_shape = (len(placement),) + shape

    # TODO: lexsort depth needs to be 2!!

    # Create observation selection vector using major and minor
    # labels, for converting to panel format.
    selector = factor_indexer(shape[1:], labels)
    mask = np.zeros(np.prod(shape), dtype=bool)
    mask.put(selector, True)

    if mask.all():
        pvalues = np.empty(panel_shape, dtype=values.dtype)
    else:
        dtype, fill_value = _maybe_promote(values.dtype)
        pvalues = np.empty(panel_shape, dtype=dtype)
        pvalues.fill(fill_value)

    for i in range(len(placement)):
        pvalues[i].flat[mask] = values[:, i]

    return make_block(pvalues, placement=placement)


def factor_indexer(shape, labels):
    """ given a tuple of shape and a list of Categorical labels, return the
    expanded label indexer
    """
    mult = np.array(shape)[::-1].cumprod()[::-1]
    return com._ensure_platform_int(
        np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T)
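

# Worked example (assumed shape/labels), matching the call in
# block2d_to_blocknd where shape excludes the leading items dimension:
# mult for shape (4,) is [4], so each (major, minor) pair maps to
# major * 4 + minor in the flattened selection vector.
#
# >>> factor_indexer((4,), [np.array([0, 1]), np.array([2, 3])])
# array([2, 7])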