
/pandas/tools/merge.py

http://github.com/pydata/pandas
  1. """
  2. SQL-style merge routines
  3. """
  4. import types
  5. import numpy as np
  6. from pandas.compat import range, long, lrange, lzip, zip
  7. import pandas.compat as compat
  8. from pandas.core.categorical import Categorical
  9. from pandas.core.frame import DataFrame, _merge_doc
  10. from pandas.core.generic import NDFrame
  11. from pandas.core.groupby import get_group_index
  12. from pandas.core.series import Series
  13. from pandas.core.index import (Index, MultiIndex, _get_combined_index,
  14. _ensure_index, _get_consensus_names,
  15. _all_indexes_same)
  16. from pandas.core.internals import (items_overlap_with_suffix,
  17. concatenate_block_managers)
  18. from pandas.util.decorators import Appender, Substitution
  19. from pandas.core.common import ABCSeries
  20. from pandas.io.parsers import TextFileReader
  21. import pandas.core.common as com
  22. import pandas.lib as lib
  23. import pandas.algos as algos
  24. import pandas.hashtable as _hash


@Substitution('\nleft : DataFrame')
@Appender(_merge_doc, indents=0)
def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
          left_index=False, right_index=False, sort=False,
          suffixes=('_x', '_y'), copy=True):
    op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
                         right_on=right_on, left_index=left_index,
                         right_index=right_index, sort=sort,
                         suffixes=suffixes, copy=copy)
    return op.get_result()

if __debug__:
    merge.__doc__ = _merge_doc % '\nleft : DataFrame'
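

# Illustrative only (not part of the original module): a doctest-style sketch
# of a basic column merge, assuming two small frames sharing a 'key' column.
#
# >>> left = DataFrame({'key': ['a', 'b', 'c'], 'lval': [1, 2, 3]})
# >>> right = DataFrame({'key': ['b', 'c', 'd'], 'rval': [4, 5, 6]})
# >>> merge(left, right, on='key', how='inner')
#   key  lval  rval
# 0   b     2     4
# 1   c     3     5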


class MergeError(Exception):
    pass


def ordered_merge(left, right, on=None, left_by=None, right_by=None,
                  left_on=None, right_on=None,
                  fill_method=None, suffixes=('_x', '_y')):
    """Perform merge with optional filling/interpolation designed for ordered
    data like time series data. Optionally perform group-wise merge (see
    examples)

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    fill_method : {'ffill', None}, default None
        Interpolation method for data
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors
        per left_on docs
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively

    Examples
    --------
    >>> A                      >>> B
          key  lvalue group        key  rvalue
    0   a       1     a        0     b       1
    1   c       2     a        1     c       2
    2   e       3     a        2     d       3
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> ordered_merge(A, B, fill_method='ffill', left_by='group')
      key  lvalue group  rvalue
    0   a       1     a     NaN
    1   b       1     a       1
    2   c       2     a       2
    3   d       2     a       3
    4   e       3     a       3
    5   a       1     b     NaN
    6   b       1     b       1
    7   c       2     b       2
    8   d       2     b       3
    9   e       3     b       3

    Returns
    -------
    merged : DataFrame
    """
    def _merger(x, y):
        op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on,
                           # left_index=left_index, right_index=right_index,
                           suffixes=suffixes, fill_method=fill_method)
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError('Can only group either left or right frames')
    elif left_by is not None:
        if not isinstance(left_by, (list, tuple)):
            left_by = [left_by]
        pieces = []
        for key, xpiece in left.groupby(left_by):
            merged = _merger(xpiece, right)
            for k in left_by:
                # May have passed ndarray
                try:
                    if k in merged:
                        merged[k] = key
                except:
                    pass
            pieces.append(merged)
        return concat(pieces, ignore_index=True)
    elif right_by is not None:
        if not isinstance(right_by, (list, tuple)):
            right_by = [right_by]
        pieces = []
        for key, ypiece in right.groupby(right_by):
            merged = _merger(left, ypiece)
            for k in right_by:
                try:
                    if k in merged:
                        merged[k] = key
                except:
                    pass
            pieces.append(merged)
        return concat(pieces, ignore_index=True)
    else:
        return _merger(left, right)
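

# Illustrative only: constructing the frames from the ordered_merge docstring
# example above. Within each 'group', the keys are outer-joined and
# 'lvalue'/'rvalue' are forward-filled, which is where the NaN in the first
# row of each group comes from.
#
# >>> A = DataFrame({'key': list('aceace'), 'lvalue': [1, 2, 3, 1, 2, 3],
# ...                'group': list('aaabbb')})
# >>> B = DataFrame({'key': list('bcd'), 'rvalue': [1, 2, 3]})
# >>> merged = ordered_merge(A, B, fill_method='ffill', left_by='group')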


# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation(object):
    """
    Perform a database (SQL) merge operation between two DataFrame objects
    using either columns as keys or their row indexes
    """

    def __init__(self, left, right, how='inner', on=None,
                 left_on=None, right_on=None, axis=1,
                 left_index=False, right_index=False, sort=True,
                 suffixes=('_x', '_y'), copy=True):
        self.left = self.orig_left = left
        self.right = self.orig_right = right
        self.how = how
        self.axis = axis

        self.on = com._maybe_make_list(on)
        self.left_on = com._maybe_make_list(left_on)
        self.right_on = com._maybe_make_list(right_on)

        self.copy = copy
        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        # note this function has side effects
        (self.left_join_keys,
         self.right_join_keys,
         self.join_names) = self._get_merge_keys()

    def get_result(self):
        join_index, left_indexer, right_indexer = self._get_join_info()

        ldata, rdata = self.left._data, self.right._data
        lsuf, rsuf = self.suffixes

        llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf,
                                                     rdata.items, rsuf)

        lindexers = {1: left_indexer} if left_indexer is not None else {}
        rindexers = {1: right_indexer} if right_indexer is not None else {}

        result_data = concatenate_block_managers(
            [(ldata, lindexers), (rdata, rindexers)],
            axes=[llabels.append(rlabels), join_index],
            concat_axis=0, copy=self.copy)

        result = DataFrame(result_data).__finalize__(self, method='merge')

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        return result
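
    # ldata/rdata above are BlockManagers: axis 0 holds the column labels and
    # axis 1 the row index, so the {1: indexer} dicts reindex rows while
    # concat_axis=0 appends the two column sets side by side.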

    def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
        # insert group keys
        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            if name in result:
                key_col = result[name]

                if left_indexer is not None and right_indexer is not None:

                    if name in self.left:
                        na_indexer = (left_indexer == -1).nonzero()[0]
                        if len(na_indexer) == 0:
                            continue

                        right_na_indexer = right_indexer.take(na_indexer)
                        key_col.put(
                            na_indexer, com.take_1d(self.right_join_keys[i],
                                                    right_na_indexer))
                    elif name in self.right:
                        na_indexer = (right_indexer == -1).nonzero()[0]
                        if len(na_indexer) == 0:
                            continue

                        left_na_indexer = left_indexer.take(na_indexer)
                        key_col.put(na_indexer,
                                    com.take_1d(self.left_join_keys[i],
                                                left_na_indexer))

            elif left_indexer is not None:
                if name is None:
                    name = 'key_%d' % i

                # a faster way?
                key_col = com.take_1d(self.left_join_keys[i], left_indexer)
                na_indexer = (left_indexer == -1).nonzero()[0]
                right_na_indexer = right_indexer.take(na_indexer)
                key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
                                                    right_na_indexer))
                result.insert(i, name, key_col)

    def _get_join_info(self):
        left_ax = self.left._data.axes[self.axis]
        right_ax = self.right._data.axes[self.axis]
        if self.left_index and self.right_index:
            join_index, left_indexer, right_indexer = \
                left_ax.join(right_ax, how=self.how, return_indexers=True)
        elif self.right_index and self.how == 'left':
            join_index, left_indexer, right_indexer = \
                _left_join_on_index(left_ax, right_ax, self.left_join_keys,
                                    sort=self.sort)
        elif self.left_index and self.how == 'right':
            join_index, right_indexer, left_indexer = \
                _left_join_on_index(right_ax, left_ax, self.right_join_keys,
                                    sort=self.sort)
        else:
            (left_indexer,
             right_indexer) = _get_join_indexers(self.left_join_keys,
                                                 self.right_join_keys,
                                                 sort=self.sort,
                                                 how=self.how)

            if self.right_index:
                join_index = self.left.index.take(left_indexer)
            elif self.left_index:
                join_index = self.right.index.take(right_indexer)
            else:
                join_index = Index(np.arange(len(left_indexer)))

        return join_index, left_indexer, right_indexer
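
    # _get_join_info chooses among three strategies: a pure index-on-index
    # join delegates to Index.join; a left/right join against one frame's
    # index goes through _left_join_on_index; everything else factorizes the
    # key arrays and calls the join kernels via _get_join_indexers.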

    def _get_merge_data(self):
        """
        Handles overlapping column names etc.
        """
        ldata, rdata = self.left._data, self.right._data
        lsuf, rsuf = self.suffixes
        llabels, rlabels = items_overlap_with_suffix(
            ldata.items, lsuf, rdata.items, rsuf)

        if not llabels.equals(ldata.items):
            ldata = ldata.copy(deep=False)
            ldata.set_axis(0, llabels)

        if not rlabels.equals(rdata.items):
            rdata = rdata.copy(deep=False)
            rdata.set_axis(0, rlabels)

        return ldata, rdata

    def _get_merge_keys(self):
        """
        Note: has side effects (copy/delete key columns)

        Parameters
        ----------
        left
        right
        on

        Returns
        -------
        left_keys, right_keys
        """
        self._validate_specification()

        left_keys = []
        right_keys = []
        join_names = []
        right_drop = []
        left_drop = []
        left, right = self.left, self.right

        is_lkey = lambda x: isinstance(x, (np.ndarray, ABCSeries)) and len(x) == len(left)
        is_rkey = lambda x: isinstance(x, (np.ndarray, ABCSeries)) and len(x) == len(right)

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    left_keys.append(lk)
                    if is_rkey(rk):
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        right_keys.append(right[rk].values)
                        join_names.append(rk)
                else:
                    if not is_rkey(rk):
                        right_keys.append(right[rk].values)
                        if lk == rk:
                            # avoid key upcast in corner case (length-0)
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        right_keys.append(rk)
                    left_keys.append(left[lk].values)
                    join_names.append(lk)
        elif _any(self.left_on):
            for k in self.left_on:
                if is_lkey(k):
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    left_keys.append(left[k].values)
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                right_keys = [lev.values.take(lab)
                              for lev, lab in zip(self.right.index.levels,
                                                  self.right.index.labels)]
            else:
                right_keys = [self.right.index.values]
        elif _any(self.right_on):
            for k in self.right_on:
                if is_rkey(k):
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    right_keys.append(right[k].values)
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [lev.values.take(lab)
                             for lev, lab in zip(self.left.index.levels,
                                                 self.left.index.labels)]
            else:
                left_keys = [self.left.index.values]

        if left_drop:
            self.left = self.left.drop(left_drop, axis=1)

        if right_drop:
            self.right = self.right.drop(right_drop, axis=1)

        return left_keys, right_keys, join_names

    def _validate_specification(self):
        # Hm, any way to make this logic less complicated??
        if (self.on is None and self.left_on is None
                and self.right_on is None):

            if self.left_index and self.right_index:
                self.left_on, self.right_on = (), ()
            elif self.left_index:
                if self.right_on is None:
                    raise MergeError('Must pass right_on or right_index=True')
            elif self.right_index:
                if self.left_on is None:
                    raise MergeError('Must pass left_on or left_index=True')
            else:
                if not self.left.columns.is_unique:
                    raise MergeError("Left data columns not unique: %s"
                                     % repr(self.left.columns))

                if not self.right.columns.is_unique:
                    raise MergeError("Right data columns not unique: %s"
                                     % repr(self.right.columns))

                # use the common columns
                common_cols = self.left.columns.intersection(
                    self.right.columns)
                if len(common_cols) == 0:
                    raise MergeError('No common columns to perform merge on')
                self.left_on = self.right_on = common_cols

        elif self.on is not None:
            if self.left_on is not None or self.right_on is not None:
                raise MergeError('Can only pass on OR left_on and '
                                 'right_on')
            self.left_on = self.right_on = self.on
        elif self.left_on is not None:
            n = len(self.left_on)
            if self.right_index:
                if len(self.left_on) != self.right.index.nlevels:
                    raise ValueError('len(left_on) must equal the number '
                                     'of levels in the index of "right"')
                self.right_on = [None] * n
        elif self.right_on is not None:
            n = len(self.right_on)
            if self.left_index:
                if len(self.right_on) != self.left.index.nlevels:
                    raise ValueError('len(right_on) must equal the number '
                                     'of levels in the index of "left"')
                self.left_on = [None] * n

        if len(self.right_on) != len(self.left_on):
            raise ValueError("len(right_on) must equal len(left_on)")
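
    # Valid specifications accepted above include on='key', left_on/right_on
    # pairs (column names or same-length arrays), and left_index/right_index;
    # passing `on` together with left_on/right_on raises MergeError.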


def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
    """
    Parameters
    ----------
    left_keys : list of arrays, one per join key
    right_keys : list of arrays, one per join key
    sort : boolean, default False
    how : {'inner', 'left', 'right', 'outer'}, default 'inner'

    Returns
    -------
    left_indexer, right_indexer
    """
    if len(left_keys) != len(right_keys):
        raise AssertionError('left_keys and right_keys must be the same '
                             'length')

    left_labels = []
    right_labels = []
    group_sizes = []

    for lk, rk in zip(left_keys, right_keys):
        llab, rlab, count = _factorize_keys(lk, rk, sort=sort)

        left_labels.append(llab)
        right_labels.append(rlab)
        group_sizes.append(count)

    max_groups = long(1)
    for x in group_sizes:
        max_groups *= long(x)

    if max_groups > 2 ** 63:  # pragma: no cover
        left_group_key, right_group_key, max_groups = \
            _factorize_keys(lib.fast_zip(left_labels),
                            lib.fast_zip(right_labels))
    else:
        left_group_key = get_group_index(left_labels, group_sizes)
        right_group_key = get_group_index(right_labels, group_sizes)

        left_group_key, right_group_key, max_groups = \
            _factorize_keys(left_group_key, right_group_key, sort=sort)

    join_func = _join_functions[how]
    return join_func(left_group_key, right_group_key, max_groups)
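

# Illustrative only: with multiple key columns, each column is factorized
# independently and the per-column labels are combined into one integer
# group key, e.g.
#
# >>> lkey = np.array(['a', 'a', 'b'], dtype=object)
# >>> rkey = np.array(['b', 'a'], dtype=object)
# >>> llab, rlab, count = _factorize_keys(lkey, rkey)
# >>> llab, rlab, count  # labels into the shared set of uniques
# (array([0, 0, 1]), array([1, 0]), 2)
#
# get_group_index then treats the label tuples as digits of a mixed-radix
# number, so equal composite keys map to equal integers before joining.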


class _OrderedMerge(_MergeOperation):

    def __init__(self, left, right, on=None, by=None, left_on=None,
                 right_on=None, axis=1, left_index=False, right_index=False,
                 suffixes=('_x', '_y'), copy=True,
                 fill_method=None):
        self.fill_method = fill_method

        _MergeOperation.__init__(self, left, right, on=on, left_on=left_on,
                                 right_on=right_on, axis=axis,
                                 left_index=left_index,
                                 right_index=right_index,
                                 how='outer', suffixes=suffixes,
                                 sort=True  # sorts when factorizing
                                 )

    def get_result(self):
        join_index, left_indexer, right_indexer = self._get_join_info()

        # this is a bit kludgy
        ldata, rdata = self.left._data, self.right._data
        lsuf, rsuf = self.suffixes
        llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf,
                                                     rdata.items, rsuf)

        if self.fill_method == 'ffill':
            left_join_indexer = algos.ffill_indexer(left_indexer)
            right_join_indexer = algos.ffill_indexer(right_indexer)
        else:
            left_join_indexer = left_indexer
            right_join_indexer = right_indexer

        lindexers = ({1: left_join_indexer}
                     if left_join_indexer is not None else {})
        rindexers = ({1: right_join_indexer}
                     if right_join_indexer is not None else {})

        result_data = concatenate_block_managers(
            [(ldata, lindexers), (rdata, rindexers)],
            axes=[llabels.append(rlabels), join_index],
            concat_axis=0, copy=self.copy)

        result = DataFrame(result_data)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        return result


def _get_multiindex_indexer(join_keys, index, sort=False):
    shape = []
    labels = []
    for level, key in zip(index.levels, join_keys):
        llab, rlab, count = _factorize_keys(level, key, sort=False)
        labels.append(rlab)
        shape.append(count)

    left_group_key = get_group_index(labels, shape)
    right_group_key = get_group_index(index.labels, shape)

    left_group_key, right_group_key, max_groups = \
        _factorize_keys(left_group_key, right_group_key,
                        sort=False)

    left_indexer, right_indexer = \
        algos.left_outer_join(com._ensure_int64(left_group_key),
                              com._ensure_int64(right_group_key),
                              max_groups, sort=False)
    return left_indexer, right_indexer


def _get_single_indexer(join_key, index, sort=False):
    left_key, right_key, count = _factorize_keys(join_key, index, sort=sort)

    left_indexer, right_indexer = \
        algos.left_outer_join(com._ensure_int64(left_key),
                              com._ensure_int64(right_key),
                              count, sort=sort)

    return left_indexer, right_indexer


def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
    join_index = left_ax
    left_indexer = None

    if len(join_keys) > 1:
        if not ((isinstance(right_ax, MultiIndex) and
                 len(join_keys) == right_ax.nlevels)):
            raise AssertionError("If more than one join key is given then "
                                 "'right_ax' must be a MultiIndex and the "
                                 "number of join keys must be the number of "
                                 "levels in right_ax")

        left_tmp, right_indexer = \
            _get_multiindex_indexer(join_keys, right_ax,
                                    sort=sort)
        if sort:
            left_indexer = left_tmp
            join_index = left_ax.take(left_indexer)
    else:
        jkey = join_keys[0]
        if sort:
            left_indexer, right_indexer = \
                _get_single_indexer(jkey, right_ax, sort=sort)
            join_index = left_ax.take(left_indexer)
        else:
            right_indexer = right_ax.get_indexer(jkey)

    return join_index, left_indexer, right_indexer


def _right_outer_join(x, y, max_groups):
    right_indexer, left_indexer = algos.left_outer_join(y, x, max_groups)
    return left_indexer, right_indexer

_join_functions = {
    'inner': algos.inner_join,
    'left': algos.left_outer_join,
    'right': _right_outer_join,
    'outer': algos.full_outer_join,
}
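
# There is no dedicated right-join kernel: _right_outer_join above calls
# algos.left_outer_join with the operands swapped and flips the returned
# indexers back, so how='right' is simply how='left' seen from the other side.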


def _factorize_keys(lk, rk, sort=True):
    if com._is_int_or_datetime_dtype(lk) and com._is_int_or_datetime_dtype(rk):
        klass = _hash.Int64Factorizer
        lk = com._ensure_int64(lk)
        rk = com._ensure_int64(rk)
    else:
        klass = _hash.Factorizer
        lk = com._ensure_object(lk)
        rk = com._ensure_object(rk)

    rizer = klass(max(len(lk), len(rk)))

    llab = rizer.factorize(lk)
    rlab = rizer.factorize(rk)

    count = rizer.get_count()

    if sort:
        uniques = rizer.uniques.to_array()
        llab, rlab = _sort_labels(uniques, llab, rlab)

    # NA group
    lmask = llab == -1
    lany = lmask.any()
    rmask = rlab == -1
    rany = rmask.any()

    if lany or rany:
        if lany:
            np.putmask(llab, lmask, count)
        if rany:
            np.putmask(rlab, rmask, count)
        count += 1

    return llab, rlab, count
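

# Worked example (illustrative): for object keys lk = ['x', nan] and
# rk = [nan, 'x'], factorizing yields llab = [0, -1], rlab = [-1, 0] and
# count = 1; the NA-group fix-up above then rewrites each -1 to 1 and returns
# count = 2, so missing keys match one another during the join.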


def _sort_labels(uniques, left, right):
    if not isinstance(uniques, np.ndarray):
        # tuplesafe
        uniques = Index(uniques).values

    sorter = uniques.argsort()

    reverse_indexer = np.empty(len(sorter), dtype=np.int64)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    new_left = reverse_indexer.take(com._ensure_platform_int(left))
    np.putmask(new_left, left == -1, -1)

    new_right = reverse_indexer.take(com._ensure_platform_int(right))
    np.putmask(new_right, right == -1, -1)

    return new_left, new_right
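

# Worked example (illustrative): with uniques = ['b', 'a'], sorter = [1, 0]
# and reverse_indexer = [1, 0], so old label 0 ('b') becomes 1 and old label
# 1 ('a') becomes 0, while -1 (missing) is preserved by the putmask calls.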


#----------------------------------------------------------------------
# Concatenate DataFrame objects

def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
           keys=None, levels=None, names=None, verify_integrity=False):
    """
    Concatenate pandas objects along a particular axis with optional set logic
    along the other axes. Can also add a layer of hierarchical indexing on the
    concatenation axis, which may be useful if the labels are the same (or
    overlapping) on the passed axis number

    Parameters
    ----------
    objs : list or dict of Series, DataFrame, or Panel objects
        If a dict is passed, the sorted keys will be used as the `keys`
        argument, unless it is passed, in which case the values will be
        selected (see below). Any None objects will be dropped silently unless
        they are all None in which case an Exception will be raised
    axis : {0, 1, ...}, default 0
        The axis to concatenate along
    join : {'inner', 'outer'}, default 'outer'
        How to handle indexes on other axis(es)
    join_axes : list of Index objects
        Specific indexes to use for the other n - 1 axes instead of performing
        inner/outer set logic
    verify_integrity : boolean, default False
        Check whether the new concatenated axis contains duplicates. This can
        be very expensive relative to the actual data concatenation
    keys : sequence, default None
        If multiple levels passed, should contain tuples. Construct
        hierarchical index using the passed keys as the outermost level
    levels : list of sequences, default None
        Specific levels (unique values) to use for constructing a
        MultiIndex. Otherwise they will be inferred from the keys
    names : list, default None
        Names for the levels in the resulting hierarchical index
    ignore_index : boolean, default False
        If True, do not use the index values along the concatenation axis. The
        resulting axis will be labeled 0, ..., n - 1. This is useful if you
        are concatenating objects where the concatenation axis does not have
        meaningful indexing information. Note that the index values on the
        other axes are still respected in the join.

    Notes
    -----
    The keys, levels, and names arguments are all optional

    Returns
    -------
    concatenated : type of objects
    """
    op = _Concatenator(objs, axis=axis, join_axes=join_axes,
                       ignore_index=ignore_index, join=join,
                       keys=keys, levels=levels, names=names,
                       verify_integrity=verify_integrity)
    return op.get_result()
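

# Illustrative only (not part of the original module): a doctest-style sketch
# of `concat` with hierarchical keys, assuming two small frames with the same
# single column.
#
# >>> df1 = DataFrame({'a': [1, 2]})
# >>> df2 = DataFrame({'a': [3, 4]})
# >>> concat([df1, df2], keys=['one', 'two'])
#        a
# one 0  1
#     1  2
# two 0  3
#     1  4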


class _Concatenator(object):
    """
    Orchestrates a concatenation operation for BlockManagers
    """

    def __init__(self, objs, axis=0, join='outer', join_axes=None,
                 keys=None, levels=None, names=None,
                 ignore_index=False, verify_integrity=False):
        if not isinstance(objs, (list, tuple, types.GeneratorType, dict,
                                 TextFileReader)):
            raise TypeError('first argument must be a list-like of pandas '
                            'objects, you passed an object of type '
                            '"{0}"'.format(type(objs).__name__))

        if join == 'outer':
            self.intersect = False
        elif join == 'inner':
            self.intersect = True
        else:  # pragma: no cover
            raise ValueError('Only can inner (intersect) or outer (union) '
                             'join the other axis')

        if isinstance(objs, dict):
            if keys is None:
                keys = sorted(objs)
            objs = [objs[k] for k in keys]

        if keys is None:
            objs = [obj for obj in objs if obj is not None]
        else:
            # #1649
            clean_keys = []
            clean_objs = []
            for k, v in zip(keys, objs):
                if v is None:
                    continue
                clean_keys.append(k)
                clean_objs.append(v)
            objs = clean_objs
            keys = clean_keys

        if len(objs) == 0:
            raise Exception('All objects passed were None')

        # consolidate data & figure out what our result ndim is going to be
        ndims = set()
        for obj in objs:
            if not isinstance(obj, NDFrame):
                raise TypeError("cannot concatenate a non-NDFrame object")

            # consolidate
            obj.consolidate(inplace=True)
            ndims.add(obj.ndim)

        # get the sample
        # want the highest ndim that we have, and must be non-empty
        # unless all objs are empty
        sample = None
        if len(ndims) > 1:
            max_ndim = max(ndims)
            for obj in objs:
                if obj.ndim == max_ndim and np.sum(obj.shape):
                    sample = obj
                    break
        else:
            # filter out the empties
            # if we have no multi-index possibilities
            df = DataFrame([obj.shape for obj in objs]).sum(1)
            non_empties = df[df != 0]
            if (len(non_empties) and (keys is None and names is None and
                                      levels is None and join_axes is None)):
                objs = [objs[i] for i in non_empties.index]
                sample = objs[0]

        if sample is None:
            sample = objs[0]
        self.objs = objs

        # Need to flip BlockManager axis in the DataFrame special case
        self._is_frame = isinstance(sample, DataFrame)
        if self._is_frame:
            axis = 1 if axis == 0 else 0

        self._is_series = isinstance(sample, ABCSeries)
        if not 0 <= axis <= sample.ndim:
            raise AssertionError("axis must be between 0 and {0}, "
                                 "input was {1}".format(sample.ndim, axis))

        # if we have mixed ndims, then convert to highest ndim
        # creating column numbers as needed
        if len(ndims) > 1:
            current_column = 0
            max_ndim = sample.ndim
            self.objs, objs = [], self.objs
            for obj in objs:

                ndim = obj.ndim
                if ndim == max_ndim:
                    pass
                elif ndim != max_ndim - 1:
                    raise ValueError("cannot concatenate unaligned mixed "
                                     "dimensional NDFrame objects")
                else:
                    name = getattr(obj, 'name', None)
                    if ignore_index or name is None:
                        name = current_column
                        current_column += 1

                    # doing a row-wise concatenation so need everything
                    # to line up
                    if self._is_frame and axis == 1:
                        name = 0
                    obj = sample._constructor({name: obj})

                self.objs.append(obj)

        # note: this is the BlockManager axis (since DataFrame is transposed)
        self.axis = axis
        self.join_axes = join_axes
        self.keys = keys
        self.names = names
        self.levels = levels

        self.ignore_index = ignore_index
        self.verify_integrity = verify_integrity

        self.new_axes = self._get_new_axes()

    def get_result(self):
        if self._is_series:
            if self.axis == 0:
                new_data = com._concat_compat([x.get_values()
                                               for x in self.objs])
                name = com._consensus_name_attr(self.objs)
                return (Series(new_data, index=self.new_axes[0], name=name)
                        .__finalize__(self, method='concat'))
            else:
                data = dict(zip(range(len(self.objs)), self.objs))
                index, columns = self.new_axes
                tmpdf = DataFrame(data, index=index)
                if columns is not None:
                    tmpdf.columns = columns
                return tmpdf.__finalize__(self, method='concat')
        else:
            mgrs_indexers = []
            for obj in self.objs:
                mgr = obj._data
                indexers = {}
                for ax, new_labels in enumerate(self.new_axes):
                    if ax == self.axis:
                        # Suppress reindexing on concat axis
                        continue

                    obj_labels = mgr.axes[ax]
                    if not new_labels.equals(obj_labels):
                        indexers[ax] = obj_labels.reindex(new_labels)[1]

                mgrs_indexers.append((obj._data, indexers))

            new_data = concatenate_block_managers(
                mgrs_indexers, self.new_axes, concat_axis=self.axis,
                copy=True)
            return (self.objs[0]._from_axes(new_data, self.new_axes)
                    .__finalize__(self, method='concat'))

    def _get_result_dim(self):
        if self._is_series and self.axis == 1:
            return 2
        else:
            return self.objs[0].ndim

    def _get_new_axes(self):
        ndim = self._get_result_dim()
        new_axes = [None] * ndim

        if self.join_axes is None:
            for i in range(ndim):
                if i == self.axis:
                    continue
                new_axes[i] = self._get_comb_axis(i)
        else:
            if len(self.join_axes) != ndim - 1:
                raise AssertionError("length of join_axes must be "
                                     "equal to {0}".format(ndim - 1))

            # ufff...
            indices = lrange(ndim)
            indices.remove(self.axis)

            for i, ax in zip(indices, self.join_axes):
                new_axes[i] = ax

        new_axes[self.axis] = self._get_concat_axis()
        return new_axes

    def _get_comb_axis(self, i):
        if self._is_series:
            all_indexes = [x.index for x in self.objs]
        else:
            try:
                all_indexes = [x._data.axes[i] for x in self.objs]
            except IndexError:
                types = [type(x).__name__ for x in self.objs]
                raise TypeError("Cannot concatenate list of %s" % types)

        return _get_combined_index(all_indexes, intersect=self.intersect)

    def _get_concat_axis(self):
        """
        Return index to be used along concatenation axis.
        """
        if self._is_series:
            if self.axis == 0:
                indexes = [x.index for x in self.objs]
            elif self.ignore_index:
                idx = Index(np.arange(len(self.objs)))
                idx.is_unique = True  # arange is always unique
                return idx
            elif self.keys is None:
                names = []
                for x in self.objs:
                    if not isinstance(x, Series):
                        raise TypeError("Cannot concatenate type 'Series' "
                                        "with object of type "
                                        "%r" % type(x).__name__)
                    if x.name is not None:
                        names.append(x.name)
                    else:
                        idx = Index(np.arange(len(self.objs)))
                        idx.is_unique = True
                        return idx
                return Index(names)
            else:
                return _ensure_index(self.keys)
        else:
            indexes = [x._data.axes[self.axis] for x in self.objs]

        if self.ignore_index:
            idx = Index(np.arange(sum(len(i) for i in indexes)))
            idx.is_unique = True
            return idx

        if self.keys is None:
            concat_axis = _concat_indexes(indexes)
        else:
            concat_axis = _make_concat_multiindex(indexes, self.keys,
                                                  self.levels, self.names)

        self._maybe_check_integrity(concat_axis)

        return concat_axis

    def _maybe_check_integrity(self, concat_index):
        if self.verify_integrity:
            if not concat_index.is_unique:
                overlap = concat_index.get_duplicates()
                raise ValueError('Indexes have overlapping values: %s'
                                 % str(overlap))


def _concat_indexes(indexes):
    return indexes[0].append(indexes[1:])


def _make_concat_multiindex(indexes, keys, levels=None, names=None):
    if ((levels is None and isinstance(keys[0], tuple)) or
            (levels is not None and len(levels) > 1)):
        zipped = lzip(*keys)
        if names is None:
            names = [None] * len(zipped)

        if levels is None:
            levels = [Categorical.from_array(zp).levels for zp in zipped]
        else:
            levels = [_ensure_index(x) for x in levels]
    else:
        zipped = [keys]
        if names is None:
            names = [None]

        if levels is None:
            levels = [_ensure_index(keys)]
        else:
            levels = [_ensure_index(x) for x in levels]

    if not _all_indexes_same(indexes):
        label_list = []

        # things are potentially different sizes, so compute the exact labels
        # for each level and pass those to MultiIndex.from_arrays
        for hlevel, level in zip(zipped, levels):
            to_concat = []
            for key, index in zip(hlevel, indexes):
                try:
                    i = level.get_loc(key)
                except KeyError:
                    raise ValueError('Key %s not in level %s'
                                     % (str(key), str(level)))

                to_concat.append(np.repeat(i, len(index)))
            label_list.append(np.concatenate(to_concat))

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            levels.extend(concat_index.levels)
            label_list.extend(concat_index.labels)
        else:
            factor = Categorical.from_array(concat_index)
            levels.append(factor.levels)
            label_list.append(factor.labels)

        if len(names) == len(levels):
            names = list(names)
        else:
            # make sure that all of the passed indices have the same nlevels
            if not len(set(i.nlevels for i in indexes)) == 1:
                raise AssertionError("Cannot concat indices that do"
                                     " not have the same number of levels")

            # also copies
            names = names + _get_consensus_names(indexes)

        return MultiIndex(levels=levels, labels=label_list, names=names,
                          verify_integrity=False)

    new_index = indexes[0]
    n = len(new_index)
    kpieces = len(indexes)

    # also copies
    new_names = list(names)
    new_levels = list(levels)

    # construct labels
    new_labels = []

    # do something a bit more speedy
    for hlevel, level in zip(zipped, levels):
        hlevel = _ensure_index(hlevel)
        mapped = level.get_indexer(hlevel)

        mask = mapped == -1
        if mask.any():
            raise ValueError('Values not found in passed level: %s'
                             % str(hlevel[mask]))

        new_labels.append(np.repeat(mapped, n))

    if isinstance(new_index, MultiIndex):
        new_levels.extend(new_index.levels)
        new_labels.extend([np.tile(lab, kpieces)
                           for lab in new_index.labels])
    else:
        new_levels.append(new_index)
        new_labels.append(np.tile(np.arange(n), kpieces))

    if len(new_names) < len(new_levels):
        new_names.extend(new_index.names)

    return MultiIndex(levels=new_levels, labels=new_labels, names=new_names,
                      verify_integrity=False)
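

# Illustrative only: in the fast path above (all indexes equal), for
# indexes = [Index([0, 1])] * 2 and keys = ['x', 'y'], the outer level labels
# are np.repeat([0, 1], 2) and the inner labels np.tile([0, 1], 2), giving
# the MultiIndex [('x', 0), ('x', 1), ('y', 0), ('y', 1)].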


def _should_fill(lname, rname):
    if (not isinstance(lname, compat.string_types) or
            not isinstance(rname, compat.string_types)):
        return True
    return lname == rname


def _any(x):
    return x is not None and len(x) > 0 and any([y is not None for y in x])