
/pandas/core/reshape.py

http://github.com/wesm/pandas
Possible License(s): BSD-3-Clause, Apache-2.0
# pylint: disable=E1101,E1103
# pylint: disable=W0703,W0622,W0613,W0201
from pandas.compat import range, zip
from pandas import compat
import itertools

import numpy as np

from pandas.types.common import _ensure_platform_int, is_list_like
from pandas.types.cast import _maybe_promote
from pandas.types.missing import notnull
import pandas.types.concat as _concat

from pandas.core.series import Series
from pandas.core.frame import DataFrame

from pandas.core.sparse import SparseDataFrame, SparseSeries
from pandas.sparse.array import SparseArray
from pandas._sparse import IntIndex

from pandas.core.categorical import Categorical
from pandas.core.groupby import get_group_index, _compress_group_index
import pandas.core.algorithms as algos
import pandas.algos as _algos

from pandas.core.index import MultiIndex, _get_na_value


class _Unstacker(object):
    """
    Helper class to unstack data / pivot with multi-level index

    Parameters
    ----------
    level : int or str, default last level
        Level to "unstack". Accepts a name for the level.

    Examples
    --------
    >>> import pandas as pd
    >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
    ...                                    ('two', 'a'), ('two', 'b')])
    >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
    >>> s
    one  a    1
         b    2
    two  a    3
         b    4
    dtype: float64

    >>> s.unstack(level=-1)
         a  b
    one  1  2
    two  3  4

    >>> s.unstack(level=0)
       one  two
    a  1    2
    b  3    4

    Returns
    -------
    unstacked : DataFrame
    """
    def __init__(self, values, index, level=-1, value_columns=None,
                 fill_value=None):

        self.is_categorical = None
        if values.ndim == 1:
            if isinstance(values, Categorical):
                self.is_categorical = values
                values = np.array(values)
            values = values[:, np.newaxis]
        self.values = values
        self.value_columns = value_columns
        self.fill_value = fill_value

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError('must pass column labels for multi-column data')

        self.index = index

        if isinstance(self.index, MultiIndex):
            if index._reference_duplicate_name(level):
                msg = ("Ambiguous reference to {0}. The index "
                       "names are not unique.".format(level))
                raise ValueError(msg)

        self.level = self.index._get_level_number(level)

        # when index includes `nan`, need to lift levels/strides by 1
        self.lift = 1 if -1 in self.index.labels[self.level] else 0

        self.new_index_levels = list(index.levels)
        self.new_index_names = list(index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)

        self._make_sorted_values_labels()
        self._make_selectors()
    def _make_sorted_values_labels(self):
        v = self.level

        labs = list(self.index.labels)
        levs = list(self.index.levels)
        to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = _algos.groupsort_indexer(comp_index, ngroups)[0]
        indexer = _ensure_platform_int(indexer)

        self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
        self.sorted_labels = [l.take(indexer) for l in to_sort]
    def _make_selectors(self):
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = [len(x) for x in new_levels]

        comp_index, obs_ids = get_compressed_ids(remaining_labels,
                                                 level_sizes)
        ngroups = len(obs_ids)

        comp_index = _ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level] + self.lift
        self.full_shape = ngroups, stride

        selector = self.sorted_labels[-1] + stride * comp_index + self.lift
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        if mask.sum() < len(self.index):
            raise ValueError('Index contains duplicate entries, '
                             'cannot reshape')

        self.group_index = comp_index
        self.mask = mask
        self.unique_groups = obs_ids
        self.compressor = comp_index.searchsorted(np.arange(ngroups))
    def get_result(self):
        # TODO: find a better way than this masking business

        values, value_mask = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # filter out missing levels
        if values.shape[1] > 0:
            col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
            # rare case, level values not observed
            if len(obs_ids) < self.full_shape[1]:
                inds = (value_mask.sum(0) > 0).nonzero()[0]
                values = algos.take_nd(values, inds, axis=1)
                columns = columns[inds]

        # may need to coerce categoricals here
        if self.is_categorical is not None:
            categories = self.is_categorical.categories
            ordered = self.is_categorical.ordered
            values = [Categorical.from_array(values[:, i],
                                             categories=categories,
                                             ordered=ordered)
                      for i in range(values.shape[-1])]

        return DataFrame(values, index=index, columns=columns)
    def get_new_values(self):
        values = self.values

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)

        # if our mask is all True, then we can use our existing dtype
        if self.mask.all():
            dtype = values.dtype
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            dtype, fill_value = _maybe_promote(values.dtype, self.fill_value)
            new_values = np.empty(result_shape, dtype=dtype)
            new_values.fill(fill_value)

        new_mask = np.zeros(result_shape, dtype=bool)

        # is there a simpler / faster way of doing this?
        for i in range(values.shape[1]):
            chunk = new_values[:, i * width:(i + 1) * width]
            mask_chunk = new_mask[:, i * width:(i + 1) * width]

            chunk.flat[self.mask] = self.sorted_values[:, i]
            mask_chunk.flat[self.mask] = True

        return new_values, new_mask
    def get_new_columns(self):
        if self.value_columns is None:
            if self.lift == 0:
                return self.removed_level

            lev = self.removed_level
            return lev.insert(0, _get_na_value(lev.dtype.type))

        stride = len(self.removed_level) + self.lift
        width = len(self.value_columns)
        propagator = np.repeat(np.arange(width), stride)
        if isinstance(self.value_columns, MultiIndex):
            new_levels = self.value_columns.levels + (self.removed_level,)
            new_names = self.value_columns.names + (self.removed_name,)

            new_labels = [lab.take(propagator)
                          for lab in self.value_columns.labels]
        else:
            new_levels = [self.value_columns, self.removed_level]
            new_names = [self.value_columns.name, self.removed_name]
            new_labels = [propagator]

        new_labels.append(np.tile(np.arange(stride) - self.lift, width))
        return MultiIndex(levels=new_levels, labels=new_labels,
                          names=new_names, verify_integrity=False)
    def get_new_index(self):
        result_labels = [lab.take(self.compressor)
                         for lab in self.sorted_labels[:-1]]

        # construct the new index
        if len(self.new_index_levels) == 1:
            lev, lab = self.new_index_levels[0], result_labels[0]
            if (lab == -1).any():
                lev = lev.insert(len(lev), _get_na_value(lev.dtype.type))
            return lev.take(lab)

        return MultiIndex(levels=self.new_index_levels,
                          labels=result_labels,
                          names=self.new_index_names,
                          verify_integrity=False)


def _unstack_multiple(data, clocs):
    from pandas.core.groupby import decons_obs_group_ids

    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    clabels = [index.labels[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rlabels = [index.labels[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(clabels, shape, sort=False, xnull=False)

    comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
    recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels,
                                         xnull=False)

    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                             labels=rlabels + [comp_ids],
                             names=rnames + ['__placeholder__'],
                             verify_integrity=False)

    if isinstance(data, Series):
        dummy = Series(data.values, index=dummy_index)
        unstacked = dummy.unstack('__placeholder__')
        new_levels = clevels
        new_names = cnames
        new_labels = recons_labels
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                clocs = [v if i > v else v - 1 for v in clocs]

            return result

        dummy = DataFrame(data.values, index=dummy_index,
                          columns=data.columns)

        unstacked = dummy.unstack('__placeholder__')
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_labels = [unstcols.labels[0]]
        for rec in recons_labels:
            new_labels.append(rec.take(unstcols.labels[-1]))

    new_columns = MultiIndex(levels=new_levels, labels=new_labels,
                             names=new_names, verify_integrity=False)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked


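# Illustrative sketch (not part of the original module): passing a list of
# levels to Series.unstack routes through _unstack_multiple above, moving
# several index levels into the columns at once. The helper below is a
# hypothetical demo function added only for illustration.
def _demo_unstack_multiple():  # pragma: no cover
    import pandas as pd
    idx = pd.MultiIndex.from_product([['x', 'y'], ['a', 'b'], [1, 2]],
                                     names=['first', 'second', 'third'])
    s = pd.Series(np.arange(8), index=idx)
    # move two index levels into the columns in a single call
    wide = s.unstack(['second', 'third'])
    assert wide.shape == (2, 4)
    return wide

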
def pivot(self, index=None, columns=None, values=None):
    """
    See DataFrame.pivot
    """
    if values is None:
        cols = [columns] if index is None else [index, columns]
        append = index is None
        indexed = self.set_index(cols, append=append)
        return indexed.unstack(columns)
    else:
        if index is None:
            index = self.index
        else:
            index = self[index]
        indexed = Series(self[values].values,
                         index=MultiIndex.from_arrays([index,
                                                       self[columns]]))
        return indexed.unstack(columns)


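# Illustrative sketch (not part of the original module): pivot() above backs
# DataFrame.pivot. The hypothetical helper below shows the index / columns /
# values triple being reshaped into a wide frame.
def _demo_pivot():  # pragma: no cover
    import pandas as pd
    df = pd.DataFrame({'date': ['d1', 'd1', 'd2', 'd2'],
                       'var': ['A', 'B', 'A', 'B'],
                       'val': [1, 2, 3, 4]})
    wide = df.pivot(index='date', columns='var', values='val')
    # one row per date, one column per variable
    assert list(wide.columns) == ['A', 'B']
    return wide

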
def pivot_simple(index, columns, values):
    """
    Produce 'pivot' table based on 3 columns of this DataFrame.
    Uses unique values from index / columns and fills with values.

    Parameters
    ----------
    index : ndarray
        Labels to use to make new frame's index
    columns : ndarray
        Labels to use to make new frame's columns
    values : ndarray
        Values to use for populating new frame's values

    Notes
    -----
    All 3 of the input arguments must have the same length

    Returns
    -------
    DataFrame
    """
    if (len(index) != len(columns)) or (len(columns) != len(values)):
        raise AssertionError('Length of index, columns, and values must be '
                             'the same')

    if len(index) == 0:
        return DataFrame(index=[])

    hindex = MultiIndex.from_arrays([index, columns])
    series = Series(values.ravel(), index=hindex)
    series = series.sortlevel(0)
    return series.unstack()


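# Illustrative sketch (not part of the original module): unlike pivot(),
# pivot_simple takes three aligned ndarrays rather than column names. The
# helper below is hypothetical and only demonstrates the call shape.
def _demo_pivot_simple():  # pragma: no cover
    index = np.array(['one', 'one', 'two', 'two'])
    columns = np.array(['a', 'b', 'a', 'b'])
    values = np.array([1.0, 2.0, 3.0, 4.0])
    wide = pivot_simple(index, columns, values)
    # rows 'one'/'two', columns 'a'/'b', filled from `values`
    assert wide.loc['two', 'b'] == 4.0
    return wide

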
def _slow_pivot(index, columns, values):
    """
    Produce 'pivot' table based on 3 columns of this DataFrame.
    Uses unique values from index / columns and fills with values.

    Parameters
    ----------
    index : string or object
        Column name to use to make new frame's index
    columns : string or object
        Column name to use to make new frame's columns
    values : string or object
        Column name to use for populating new frame's values

    Could benefit from some Cython here.
    """
    tree = {}
    for i, (idx, col) in enumerate(zip(index, columns)):
        if col not in tree:
            tree[col] = {}
        branch = tree[col]
        branch[idx] = values[i]

    return DataFrame(tree)


def unstack(obj, level, fill_value=None):
    if isinstance(level, (tuple, list)):
        return _unstack_multiple(obj, level)

    if isinstance(obj, DataFrame):
        if isinstance(obj.index, MultiIndex):
            return _unstack_frame(obj, level, fill_value=fill_value)
        else:
            return obj.T.stack(dropna=False)
    else:
        unstacker = _Unstacker(obj.values, obj.index, level=level,
                               fill_value=fill_value)
        return unstacker.get_result()


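# Illustrative sketch (not part of the original module): per the dispatch
# above, a Series with a MultiIndex goes straight through _Unstacker, and
# fill_value replaces the holes left by unobserved label combinations. The
# helper below is hypothetical.
def _demo_unstack_fill_value():  # pragma: no cover
    import pandas as pd
    idx = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
                                     ('two', 'a')])
    s = pd.Series([1, 2, 3], index=idx)
    # ('two', 'b') is missing; fill_value=0 avoids NaN (and float upcasting)
    wide = s.unstack(fill_value=0)
    assert wide.loc['two', 'b'] == 0
    return wide

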
def _unstack_frame(obj, level, fill_value=None):
    from pandas.core.internals import BlockManager, make_block

    if obj._is_mixed_type:
        unstacker = _Unstacker(np.empty(obj.shape, dtype=bool),  # dummy
                               obj.index, level=level,
                               value_columns=obj.columns)
        new_columns = unstacker.get_new_columns()
        new_index = unstacker.get_new_index()
        new_axes = [new_columns, new_index]

        new_blocks = []
        mask_blocks = []
        for blk in obj._data.blocks:
            blk_items = obj._data.items[blk.mgr_locs.indexer]
            bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
                                    value_columns=blk_items,
                                    fill_value=fill_value)
            new_items = bunstacker.get_new_columns()
            new_placement = new_columns.get_indexer(new_items)
            new_values, mask = bunstacker.get_new_values()

            mblk = make_block(mask.T, placement=new_placement)
            mask_blocks.append(mblk)

            newb = make_block(new_values.T, placement=new_placement)
            new_blocks.append(newb)

        result = DataFrame(BlockManager(new_blocks, new_axes))
        mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
        return result.ix[:, mask_frame.sum(0) > 0]
    else:
        unstacker = _Unstacker(obj.values, obj.index, level=level,
                               value_columns=obj.columns,
                               fill_value=fill_value)
        return unstacker.get_result()


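# Illustrative sketch (not part of the original module): a frame with mixed
# dtypes takes the block-wise branch above, unstacking each homogeneous
# block separately and reassembling the result. The helper below is
# hypothetical.
def _demo_unstack_mixed_type():  # pragma: no cover
    import pandas as pd
    idx = pd.MultiIndex.from_product([['one', 'two'], ['a', 'b']])
    df = pd.DataFrame({'ints': [1, 2, 3, 4],
                       'strs': ['w', 'x', 'y', 'z']}, index=idx)
    wide = df.unstack()
    # columns become a MultiIndex of (original column, inner index label)
    assert ('strs', 'b') in wide.columns
    return wide

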
def get_compressed_ids(labels, sizes):
    from pandas.core.groupby import get_group_index

    ids = get_group_index(labels, sizes, sort=True, xnull=False)
    return _compress_group_index(ids, sort=True)


def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index

    Returns
    -------
    stacked : Series
    """
    def factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        cat = Categorical(index, ordered=True)
        return cat.categories, cat.codes

    N, K = frame.shape
    if isinstance(frame.columns, MultiIndex):
        if frame.columns._reference_duplicate_name(level):
            msg = ("Ambiguous reference to {0}. The column "
                   "names are not unique.".format(level))
            raise ValueError(msg)

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level_num=level_num,
                                    dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_labels = [lab.repeat(K) for lab in frame.index.labels]

        clev, clab = factorize(frame.columns)
        new_levels.append(clev)
        new_labels.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)

        new_index = MultiIndex(levels=new_levels, labels=new_labels,
                               names=new_names, verify_integrity=False)
    else:
        levels, (ilab, clab) = zip(*map(factorize, (frame.index,
                                                    frame.columns)))
        labels = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(levels=levels, labels=labels,
                               names=[frame.index.name, frame.columns.name],
                               verify_integrity=False)

    new_values = frame.values.ravel()
    if dropna:
        mask = notnull(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return Series(new_values, index=new_index)


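# Illustrative sketch (not part of the original module): stack() collapses
# the columns into a new innermost index level, ravelling the values in
# row-major order. The helper below is hypothetical.
def _demo_stack():  # pragma: no cover
    import pandas as pd
    df = pd.DataFrame([[1, 2], [3, 4]], index=['one', 'two'],
                      columns=['a', 'b'])
    s = df.stack()
    # the result is a Series indexed by (row label, column label)
    assert s[('two', 'a')] == 3
    # stack and unstack are inverses here
    assert s.unstack().equals(df)
    return s

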
def stack_multiple(frame, level, dropna=True):
    # If all passed levels match up to column names, no
    # ambiguity about what to do
    if all(lev in frame.columns.names for lev in level):
        result = frame
        for lev in level:
            result = stack(result, lev, dropna=dropna)

    # Otherwise, level numbers may change as each successive level is stacked
    elif all(isinstance(lev, int) for lev in level):
        # As each stack is done, the level numbers decrease, so we need
        # to account for that when level is a sequence of ints
        result = frame
        # _get_level_number() checks level numbers are in range and converts
        # negative numbers to positive
        level = [frame.columns._get_level_number(lev) for lev in level]

        # Can't iterate directly through level as we might need to change
        # values as we go
        for index in range(len(level)):
            lev = level[index]
            result = stack(result, lev, dropna=dropna)
            # Decrement all level numbers greater than current, as these
            # have now shifted down by one
            updated_level = []
            for other in level:
                if other > lev:
                    updated_level.append(other - 1)
                else:
                    updated_level.append(other)
            level = updated_level

    else:
        raise ValueError("level should contain all level names or all level "
                         "numbers, not a mixture of the two.")

    return result


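# Illustrative sketch (not part of the original module): the renumbering
# loop above matters because stacking column level 0 shifts what used to be
# level 2 down to level 1. The hypothetical helper below stacks two column
# levels by number and by name and checks the results agree.
def _demo_stack_multiple():  # pragma: no cover
    import pandas as pd
    cols = pd.MultiIndex.from_product([['x', 'y'], ['a', 'b']],
                                      names=['outer', 'inner'])
    df = pd.DataFrame([[1, 2, 3, 4]], columns=cols)
    by_number = df.stack([0, 1])
    by_name = df.stack(['outer', 'inner'])
    assert by_number.equals(by_name)
    return by_name

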
def _stack_multi_columns(frame, level_num=-1, dropna=True):
    def _convert_level_number(level_num, columns):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel:

        We generally want to convert the level number into a level name,
        except when columns do not have names, in which case we must leave
        as a level number
        """
        if level_num in columns.names:
            return columns.names[level_num]
        else:
            if columns.names[level_num] is None:
                return level_num
            else:
                return columns.names[level_num]

    this = frame.copy()

    # this makes life much simpler
    if level_num != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level_num, frame.columns.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = roll_columns

    if not this.columns.is_lexsorted():
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, this.columns)
        this = this.sortlevel(level_to_sort, axis=1)

    # tuple list excluding level for grouping columns
    if len(frame.columns.levels) > 2:
        tuples = list(zip(*[lev.take(lab)
                            for lev, lab in zip(this.columns.levels[:-1],
                                                this.columns.labels[:-1])]))
        unique_groups = [key for key, _ in itertools.groupby(tuples)]
        new_names = this.columns.names[:-1]
        new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
    else:
        new_columns = unique_groups = this.columns.levels[0]

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    level_labels = sorted(set(this.columns.labels[-1]))
    level_vals_used = level_vals[level_labels]
    levsize = len(level_labels)
    drop_cols = []
    for key in unique_groups:
        loc = this.columns.get_loc(key)
        slice_len = loc.stop - loc.start
        # can make more efficient?

        if slice_len == 0:
            drop_cols.append(key)
            continue
        elif slice_len != levsize:
            chunk = this.ix[:, this.columns[loc]]
            chunk.columns = level_vals.take(chunk.columns.labels[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            if frame._is_mixed_type:
                value_slice = this.ix[:, this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        new_data[key] = value_slice.ravel()

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_labels = [lab.repeat(levsize) for lab in this.index.labels]
    else:
        new_levels = [this.index]
        new_labels = [np.arange(N).repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(frame.columns.levels[level_num])
    new_labels.append(np.tile(level_labels, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(levels=new_levels, labels=new_labels,
                           names=new_names, verify_integrity=False)

    result = DataFrame(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how='all')

    return result


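# Illustrative sketch (not part of the original module): the swaplevel
# rolling above is what lets a *middle* column level be stacked. The
# hypothetical helper below stacks level 'outer' of a two-level column
# index and leaves 'inner' in the columns.
def _demo_stack_multi_columns():  # pragma: no cover
    import pandas as pd
    cols = pd.MultiIndex.from_product([['x', 'y'], ['a', 'b']],
                                      names=['outer', 'inner'])
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=cols)
    tall = df.stack('outer')
    # 'outer' moved into the row index; 'inner' remains as columns
    assert list(tall.columns) == ['a', 'b']
    return tall

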
def melt(frame, id_vars=None, value_vars=None, var_name=None,
         value_name='value', col_level=None):
    """
    "Unpivots" a DataFrame from wide format to long format, optionally
    leaving identifier variables set.

    This function is useful to massage a DataFrame into a format where one
    or more columns are identifier variables (`id_vars`), while all other
    columns, considered measured variables (`value_vars`), are "unpivoted" to
    the row axis, leaving just two non-identifier columns, 'variable' and
    'value'.

    Parameters
    ----------
    frame : DataFrame
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
        are not set as `id_vars`.
    var_name : scalar
        Name to use for the 'variable' column. If None it uses
        ``frame.columns.name`` or 'variable'.
    value_name : scalar, default 'value'
        Name to use for the 'value' column.
    col_level : int or string, optional
        If columns are a MultiIndex then use this level to melt.

    See also
    --------
    pivot_table
    DataFrame.pivot

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
    ...                    'B': {0: 1, 1: 3, 2: 5},
    ...                    'C': {0: 2, 1: 4, 2: 6}})
    >>> df
       A  B  C
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> pd.melt(df, id_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5
    3  a        C      2
    4  b        C      4
    5  c        C      6

    The names of 'variable' and 'value' columns can be customized:

    >>> pd.melt(df, id_vars=['A'], value_vars=['B'],
    ...         var_name='myVarname', value_name='myValname')
       A myVarname  myValname
    0  a         B          1
    1  b         B          3
    2  c         B          5

    If you have multi-index columns:

    >>> df.columns = [list('ABC'), list('DEF')]
    >>> df
       A  B  C
       D  E  F
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> pd.melt(df, col_level=0, id_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> pd.melt(df, id_vars=[('A', 'D')], value_vars=[('B', 'E')])
      (A, D) variable_0 variable_1  value
    0      a          B          E      1
    1      b          B          E      3
    2      c          B          E      5

    """
    # TODO: what about the existing index?
    if id_vars is not None:
        if not isinstance(id_vars, (tuple, list, np.ndarray)):
            id_vars = [id_vars]
        else:
            id_vars = list(id_vars)
    else:
        id_vars = []

    if value_vars is not None:
        if not isinstance(value_vars, (tuple, list, np.ndarray)):
            value_vars = [value_vars]
        frame = frame.ix[:, id_vars + value_vars]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = ['variable_%s' % i
                            for i in range(len(frame.columns.names))]
        else:
            var_name = [frame.columns.name if frame.columns.name is not None
                        else 'variable']
    if isinstance(var_name, compat.string_types):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        mdata[col] = np.tile(frame.pop(col).values, K)

    mcolumns = id_vars + var_name + [value_name]

    mdata[value_name] = frame.values.ravel('F')
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(frame.columns
                                   .get_level_values(i)).repeat(N)

    return DataFrame(mdata, columns=mcolumns)


def lreshape(data, groups, dropna=True, label=None):
    """
    Reshape long-format data to wide. Generalized inverse of DataFrame.pivot

    Parameters
    ----------
    data : DataFrame
    groups : dict
        {new_name : list_of_columns}
    dropna : boolean, default True

    Examples
    --------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
    ...                      'team': ['Red Sox', 'Yankees'],
    ...                      'year1': [2007, 2008], 'year2': [2008, 2008]})
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2008   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
          team   hr  year
    0  Red Sox  514  2007
    1  Yankees  573  2008
    2  Red Sox  545  2008
    3  Yankees  526  2008

    Returns
    -------
    reshaped : DataFrame
    """
    if isinstance(groups, dict):
        keys = list(groups.keys())
        values = list(groups.values())
    else:
        keys, values = zip(*groups)

    all_cols = list(set.union(*[set(x) for x in values]))
    id_cols = list(data.columns.difference(all_cols))

    K = len(values[0])

    for seq in values:
        if len(seq) != K:
            raise ValueError('All column lists must be same length')

    mdata = {}
    pivot_cols = []

    for target, names in zip(keys, values):
        to_concat = [data[col].values for col in names]
        mdata[target] = _concat._concat_compat(to_concat)
        pivot_cols.append(target)

    for col in id_cols:
        mdata[col] = np.tile(data[col].values, K)

    if dropna:
        mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for c in pivot_cols:
            mask &= notnull(mdata[c])
        if not mask.all():
            mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata))

    return DataFrame(mdata, columns=id_cols + pivot_cols)


def wide_to_long(df, stubnames, i, j):
    """
    Wide panel to long format. Less flexible but more user-friendly than
    melt.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame
    stubnames : list
        A list of stub names. The wide format variables are assumed to
        start with the stub names.
    i : str
        The name of the id variable.
    j : str
        The name of the subobservation variable.

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable as well as
        variables for i and j.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> np.random.seed(123)
    >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
    ...                    "A1980" : {0 : "d", 1 : "e", 2 : "f"},
    ...                    "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
    ...                    "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
    ...                    "X"     : dict(zip(range(3), np.random.randn(3)))
    ...                   })
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> wide_to_long(df, ["A", "B"], i="id", j="year")
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    Notes
    -----
    All extra variables are treated as extra id variables. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.
    """
    def get_var_names(df, regex):
        return df.filter(regex=regex).columns.tolist()

    def melt_stub(df, stub, i, j):
        varnames = get_var_names(df, "^" + stub)
        newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub,
                     var_name=j)
        newdf_j = newdf[j].str.replace(stub, "")
        try:
            newdf_j = newdf_j.astype(int)
        except ValueError:
            pass
        newdf[j] = newdf_j
        return newdf

    id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames))
    if i not in id_vars:
        id_vars += [i]

    newdf = melt_stub(df, stubnames[0], id_vars, j)

    for stub in stubnames[1:]:
        new = melt_stub(df, stub, id_vars, j)
        newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
    return newdf.set_index([i, j])


def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
                columns=None, sparse=False, drop_first=False):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like, Series, or DataFrame
    prefix : string, list of strings, or dict of strings, default None
        String to append to DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object` or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy columns should be sparse or not. Returns
        SparseDataFrame if `data` is a Series or if all columns are
        included. Otherwise returns a DataFrame with some SparseBlocks.

        .. versionadded:: 0.16.1
    drop_first : bool, default False
        Whether to get k-1 dummies out of n categorical levels by removing
        the first level.

        .. versionadded:: 0.18.0

    Returns
    -------
    dummies : DataFrame or SparseDataFrame

    Examples
    --------
    >>> import pandas as pd
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> pd.get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1       1       0       0       1       0
    1  2       0       1       1       0       0
    2  3       1       0       0       0       1

    >>> pd.get_dummies(pd.Series(list('abcaa')))
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0
    4  1  0  0

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
       b  c
    0  0  0
    1  1  0
    2  0  1
    3  0  0
    4  0  0

    See Also
    --------
    Series.str.get_dummies
    """
    from pandas.tools.merge import concat
    from itertools import cycle

    if isinstance(data, DataFrame):
        # determine columns being encoded

        if columns is None:
            columns_to_encode = data.select_dtypes(
                include=['object', 'category']).columns
        else:
            columns_to_encode = columns

        # validate prefixes and separator to avoid silently dropping cols
        def check_len(item, name):
            length_msg = ("Length of '{0}' ({1}) did not match the length "
                          "of the columns being encoded ({2}).")

            if is_list_like(item):
                if not len(item) == len(columns_to_encode):
                    raise ValueError(length_msg.format(
                        name, len(item), len(columns_to_encode)))

        check_len(prefix, 'prefix')
        check_len(prefix_sep, 'prefix_sep')

        if isinstance(prefix, compat.string_types):
            prefix = cycle([prefix])
        if isinstance(prefix, dict):
            prefix = [prefix[col] for col in columns_to_encode]

        if prefix is None:
            prefix = columns_to_encode

        # validate separators
        if isinstance(prefix_sep, compat.string_types):
            prefix_sep = cycle([prefix_sep])
        elif isinstance(prefix_sep, dict):
            prefix_sep = [prefix_sep[col] for col in columns_to_encode]

        if set(columns_to_encode) == set(data.columns):
            with_dummies = []
        else:
            with_dummies = [data.drop(columns_to_encode, axis=1)]

        for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):

            dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
                                    dummy_na=dummy_na, sparse=sparse,
                                    drop_first=drop_first)
            with_dummies.append(dummy)
        result = concat(with_dummies, axis=1)
    else:
        result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
                                 sparse=sparse, drop_first=drop_first)
    return result


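# Illustrative sketch (not part of the original module): per the validation
# above, `prefix` may also be a dict keyed by encoded column, and the
# separator can be customized. The helper below is hypothetical.
def _demo_get_dummies_options():  # pragma: no cover
    import pandas as pd
    df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c']})
    dummies = get_dummies(df, prefix={'A': 'col1', 'B': 'col2'},
                          prefix_sep='.')
    # dummy column names are built as <prefix><sep><level>
    assert 'col1.a' in dummies.columns
    return dummies

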
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False, drop_first=False):
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data), ordered=True)
    levels = cat.categories

    def get_empty_Frame(data, sparse):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = cat.codes.copy()
    if dummy_na:
        codes[codes == -1] = len(cat.categories)
        levels = np.append(cat.categories, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs)),
                               sparse_index=IntIndex(N, ixs), fill_value=0)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        return SparseDataFrame(sparse_series, index=index,
                               columns=dummy_cols)

    else:
        dummy_mat = np.eye(number_of_cols).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)


def make_axis_dummies(frame, axis='minor', transform=None):
    """
    Construct 1-0 dummy variables corresponding to designated axis
    labels

    Parameters
    ----------
    frame : DataFrame
    axis : {'major', 'minor'}, default 'minor'
    transform : function, default None
        Function to apply to axis labels first. For example, to
        get "day of week" dummies in a time series regression
        you might call::

            make_axis_dummies(panel, axis='major',
                              transform=lambda d: d.weekday())

    Returns
    -------
    dummies : DataFrame
        Column names taken from chosen axis
    """
    numbers = {'major': 0, 'minor': 1}
    num = numbers.get(axis, axis)

    items = frame.index.levels[num]
    labels = frame.index.labels[num]
    if transform is not None:
        mapped_items = items.map(transform)
        cat = Categorical.from_array(mapped_items.take(labels),
                                     ordered=True)
        labels = cat.codes
        items = cat.categories

    values = np.eye(len(items), dtype=float)
    values = values.take(labels, axis=0)

    return DataFrame(values, columns=items, index=frame.index)
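

# Illustrative sketch (not part of the original module): make_axis_dummies
# expects a frame whose index is a two-level MultiIndex (the long-format
# representation of a panel). The hypothetical helper below builds
# indicator columns for the 'minor' level.
def _demo_make_axis_dummies():  # pragma: no cover
    import pandas as pd
    idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b']],
                                     names=['major', 'minor'])
    frame = pd.DataFrame({'y': [1.0, 2.0, 3.0, 4.0]}, index=idx)
    dummies = make_axis_dummies(frame, axis='minor')
    # one indicator column per minor-axis label, aligned with frame.index
    assert list(dummies.columns) == ['a', 'b']
    return dummies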