/pandas/tools/tests/test_merge.py
Python | 2158 lines | 1849 code | 260 blank | 49 comment | 24 complexity | 9d08c743c46fdbf243d91045fac3b860 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated, but you can click here to view the full file
- # pylint: disable=E1103
- import nose
- from datetime import datetime
- from numpy.random import randn
- from numpy import nan
- import numpy as np
- import random
- import pandas as pd
- from pandas.compat import range, lrange, lzip, zip, StringIO
- from pandas import compat, _np_version_under1p7
- from pandas.tseries.index import DatetimeIndex
- from pandas.tools.merge import merge, concat, ordered_merge, MergeError
- from pandas.util.testing import (assert_frame_equal, assert_series_equal,
- assert_almost_equal, rands,
- makeCustomDataframe as mkdf,
- assertRaisesRegexp)
- from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range, read_table, read_csv
- import pandas.algos as algos
- import pandas.util.testing as tm
a_ = np.array

N = 50
NGROUPS = 8

JOIN_TYPES = ['inner', 'outer', 'left', 'right']


def get_test_data(ngroups=NGROUPS, n=N):
    """Return ``n`` shuffled group labels drawn from ``range(ngroups)``.

    Labels are tiled as evenly as possible; when ``ngroups`` does not
    divide ``n``, the lowest labels pad out the remainder so the result
    always has exactly ``n`` entries.  Uses the (seedable) stdlib
    ``random`` module for the shuffle.
    """
    # list(range(...)) replaces the Python-2 compat shim ``lrange`` --
    # identical behavior, no dependency on pandas.compat
    unique_groups = list(range(ngroups))
    arr = np.asarray(np.tile(unique_groups, n // ngroups))

    if len(arr) < n:
        # pad with the first few labels to reach exactly n entries
        arr = np.asarray(list(arr) + unique_groups[:n - len(arr)])

    random.shuffle(arr)
    return arr
class TestMerge(tm.TestCase):
    """Tests for pandas merge/join on flat (non-hierarchical) keys."""

    _multiprocess_can_split_ = True

    def setUp(self):
        """Build the shared fixtures used throughout this class.

        - self.df / self.df2: frames with overlapping integer group keys
          ('key1', 'key2') for column-on-column merges
        - self.target / self.source: frames for index-on-column joins
          (source is indexed by target's string column 'C')
        - self.left / self.right: small frames with duplicate string keys
        """
        # aggregate multiple columns
        self.df = DataFrame({'key1': get_test_data(),
                             'key2': get_test_data(),
                             'data1': np.random.randn(N),
                             'data2': np.random.randn(N)})

        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]

        self.df2 = DataFrame({'key1': get_test_data(n=N // 5),
                              'key2': get_test_data(ngroups=NGROUPS // 2,
                                                    n=N // 5),
                              'value': np.random.randn(N // 5)})

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']},
                                index=data['C'])

        self.left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                               'v1': np.random.randn(7)})
        self.right = DataFrame({'v2': np.random.randn(4)},
                               index=['d', 'b', 'c', 'a'])
    def test_cython_left_outer_join(self):
        """Drive the low-level Cython left-outer-join kernel directly."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = algos.left_outer_join(left, right, max_group)

        # expected takers index into the mergesorted inputs; -1 marks a
        # left row with no match on the right
        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5, -1, -1])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls)
        self.assert_numpy_array_equal(rs, exp_rs)

    def test_cython_right_outer_join(self):
        """Right outer join == left outer join with the operands swapped."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = algos.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        #            0        1        1        1
        exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
                     #        2        2     4
                     6, 7, 8, 6, 7, 8, -1])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3,
                     4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls)
        self.assert_numpy_array_equal(rs, exp_rs)

    def test_cython_inner_join(self):
        """Inner join kernel: unmatched rows on either side are dropped."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = algos.inner_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls)
        self.assert_numpy_array_equal(rs, exp_rs)
    def test_left_outer_join(self):
        """Left merge on one and on both keys, checked via _check_join."""
        joined_key2 = merge(self.df, self.df2, on='key2')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')

        joined_both = merge(self.df, self.df2)
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='left')

    def test_right_outer_join(self):
        """Right merge on one and on both keys."""
        joined_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')

        joined_both = merge(self.df, self.df2, how='right')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='right')

    def test_full_outer_join(self):
        """Outer merge on one and on both keys."""
        joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')

        joined_both = merge(self.df, self.df2, how='outer')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='outer')

    def test_inner_join(self):
        """Inner merge on one and on both keys."""
        joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')

        joined_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='inner')

    def test_handle_overlap(self):
        """Overlapping non-key columns get the user-supplied suffixes."""
        joined = merge(self.df, self.df2, on='key2',
                       suffixes=['.foo', '.bar'])

        self.assertIn('key1.foo', joined)
        self.assertIn('key1.bar', joined)

    def test_handle_overlap_arbitrary_key(self):
        """Suffixes also apply when joining on differently-named keys."""
        joined = merge(self.df, self.df2,
                       left_on='key2', right_on='key1',
                       suffixes=['.foo', '.bar'])
        self.assertIn('key1.foo', joined)
        self.assertIn('key2.bar', joined)

    def test_merge_common(self):
        """With no 'on' argument, merge uses the shared columns."""
        joined = merge(self.df, self.df2)
        exp = merge(self.df, self.df2, on=['key1', 'key2'])
        tm.assert_frame_equal(joined, exp)
    def test_join_on(self):
        """join(on=...) maps a target column onto the source's index."""
        target = self.target
        source = self.source

        merged = target.join(source, on='C')
        self.assert_numpy_array_equal(merged['MergedA'], target['A'])
        self.assert_numpy_array_equal(merged['MergedD'], target['D'])

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'],
                              'value': [0, 0, 1, 1, 2]})
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
                         columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2],
                         columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        self.assertTrue(np.isnan(joined['two']['c']))
        self.assertTrue(np.isnan(joined['three']['c']))

        # merge column not present
        self.assertRaises(Exception, target.join, source, on='E')

        # overlap: 'A' exists in both target and (modified) source
        source_copy = source.copy()
        source_copy['A'] = 0
        self.assertRaises(Exception, target.join, source_copy, on='A')
    def test_join_on_fails_with_different_right_index(self):
        """right_index=True against a MultiIndex of the wrong width raises."""
        with tm.assertRaises(ValueError):
            df = DataFrame({'a': tm.choice(['m', 'f'], size=3),
                            'b': np.random.randn(3)})
            df2 = DataFrame({'a': tm.choice(['m', 'f'], size=10),
                             'b': np.random.randn(10)},
                            index=tm.makeCustomIndex(10, 2))
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        """left_index=True against a mismatched key column raises."""
        with tm.assertRaises(ValueError):
            df = DataFrame({'a': tm.choice(['m', 'f'], size=3),
                            'b': np.random.randn(3)},
                           index=tm.makeCustomIndex(10, 2))
            df2 = DataFrame({'a': tm.choice(['m', 'f'], size=10),
                             'b': np.random.randn(10)})
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        """Different numbers of left/right key columns raise ValueError."""
        with tm.assertRaises(ValueError):
            df = DataFrame({'a': tm.choice(['m', 'f'], size=3),
                            'b': np.random.randn(3)})
            df2 = DataFrame({'a': tm.choice(['m', 'f'], size=10),
                             'b': np.random.randn(10)},
                            index=tm.makeCustomIndex(10, 2))
            merge(df, df2, right_on='a', left_on=['a', 'b'])
    def test_join_on_pass_vector(self):
        """A Series passed as 'on' behaves like naming the column."""
        expected = self.target.join(self.source, on='C')
        del expected['C']

        join_col = self.target.pop('C')
        result = self.target.join(self.source, on=join_col)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        """Joining an empty source keeps its columns, all-NaN."""
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on='C')
        for col in self.source:
            self.assertIn(col, merged)
            self.assertTrue(merged[col].isnull().all())

        merged2 = self.target.join(self.source.reindex([]), on='C',
                                   how='inner')
        self.assertTrue(merged2.columns.equals(merged.columns))
        self.assertEqual(len(merged2), 0)

    def test_join_on_inner(self):
        """Inner join-on == left join-on filtered to matched rows."""
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notnull()]
        self.assert_numpy_array_equal(joined['key'], expected['key'])
        self.assert_numpy_array_equal(joined['value'], expected['value'])
        self.assertTrue(joined.index.equals(expected.index))

    def test_join_on_singlekey_list(self):
        """on=['key'] is equivalent to on='key'."""
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        """Joining a Series == joining a one-column DataFrame."""
        result = self.target.join(self.source['MergedA'], on='C')
        expected = self.target.join(self.source[['MergedA']], on='C')
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({'a': [1, 1]})
        ds = Series([2], index=[1], name='b')
        result = df.join(ds, on='a')
        expected = DataFrame({'a': [1, 1],
                              'b': [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)
    def test_join_index_mixed(self):
        """Index joins on frames with mixed dtypes match a by-hand join."""
        df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(10),
                        columns=['A', 'B', 'C', 'D'])
        self.assertEqual(df1['B'].dtype, np.int64)
        self.assertEqual(df1['D'].dtype, np.bool_)

        df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(0, 10, 2),
                        columns=['A', 'B', 'C', 'D'])

        # overlap
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
                            'A_two', 'B_two', 'C_two', 'D_two']
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        # every join type, in both directions
        for kind in JOIN_TYPES:
            joined = df1.join(df2, how=kind)
            expected = _join_by_hand(df1, df2, how=kind)
            assert_frame_equal(joined, expected)

            joined = df2.join(df1, how=kind)
            expected = _join_by_hand(df2, df1, how=kind)
            assert_frame_equal(joined, expected)
    def test_join_empty_bug(self):
        """Smoke test: outer-joining onto an empty frame must not raise."""
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        """Smoke test: join works on unconsolidated (multi-block) frames."""
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        a.join(d)
        d.join(a)
    def test_join_multiindex(self):
        """Outer join of two MultiIndexed frames preserves index names."""
        index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6), index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6), index=index2,
                        columns=['var Y'])

        df1 = df1.sortlevel(0)
        df2 = df2.sortlevel(0)

        joined = df1.join(df2, how='outer')
        # NOTE: _tuple_index is a private MultiIndex attribute giving the
        # index as tuples; used here to build the expected union by hand
        ex_index = index1._tuple_index + index2._tuple_index
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)

        # same check after sorting on the second level
        df1 = df1.sortlevel(1)
        df2 = df2.sortlevel(1)

        joined = df1.join(df2, how='outer').sortlevel(0)
        ex_index = index1._tuple_index + index2._tuple_index
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)
    def test_join_inner_multiindex(self):
        """Inner join-on with two key columns against a MultiIndex."""
        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
                'qux', 'snap']
        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
                'three', 'one']

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2,
                          'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        # expected: the equivalent column-on-column merge after resetting
        # the MultiIndex into ordinary columns
        expected = merge(data, to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'], how='inner',
                         sort=False)

        expected2 = merge(to_join, data,
                          right_on=['key1', 'key2'], left_index=True,
                          how='inner', sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(to_join, data, right_on=['key1', 'key2'],
                          left_index=True, how='inner', sort=False)

        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        self.assertTrue(joined.index.is_monotonic)
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.ix[:, expected.columns])
    def test_join_hierarchical_mixed(self):
        """Merging a frame with hierarchical columns keeps tuple labels."""
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame(
            [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other_df.set_index('a', inplace=True)

        result = merge(new_df, other_df, left_index=True, right_index=True)
        self.assertTrue(('b', 'mean') in result)
        self.assertTrue('b' in result)

    def test_join_float64_float32(self):
        """Joins must preserve each column's original float precision."""
        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype = np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype = np.float32)
        joined = a.join(b)
        self.assertEqual(joined.dtypes['a'], 'float64')
        self.assertEqual(joined.dtypes['b'], 'float64')
        self.assertEqual(joined.dtypes['c'], 'float32')

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c })
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        self.assertEqual(rs.dtypes['a'], 'int64')
        self.assertEqual(rs.dtypes['b'], 'float64')
        self.assertEqual(rs.dtypes['c'], 'float32')
        self.assertEqual(rs.dtypes['md'], 'float32')

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)
    def test_join_many_non_unique_index(self):
        """Joining a list of frames on a non-unique MultiIndex matches
        chained column-on-column merges."""
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()

        # outer merge upcasts the keys to float (NaN-capable)
        result['a'] = result['a'].astype(np.float64)
        result['b'] = result['b'].astype(np.float64)

        assert_frame_equal(result, expected.ix[:, result.columns])

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame(
            {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.ix[:, result.columns])
    def test_merge_index_singlekey_right_vs_left(self):
        """left-merge(A, B) == right-merge(B, A) modulo column order."""
        left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                          'v1': np.random.randn(7)})
        right = DataFrame({'v2': np.random.randn(4)},
                          index=['d', 'b', 'c', 'a'])

        merged1 = merge(left, right, left_on='key',
                        right_index=True, how='left', sort=False)
        merged2 = merge(right, left, right_on='key',
                        left_index=True, how='right', sort=False)
        assert_frame_equal(merged1, merged2.ix[:, merged1.columns])

        merged1 = merge(left, right, left_on='key',
                        right_index=True, how='left', sort=True)
        merged2 = merge(right, left, right_on='key',
                        left_index=True, how='right', sort=True)
        assert_frame_equal(merged1, merged2.ix[:, merged1.columns])

    def test_merge_index_singlekey_inner(self):
        """Inner column-to-index merge matches a filtered join-on."""
        left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                          'v1': np.random.randn(7)})
        right = DataFrame({'v2': np.random.randn(4)},
                          index=['d', 'b', 'c', 'a'])

        # inner join
        result = merge(left, right, left_on='key', right_index=True,
                       how='inner')
        expected = left.join(right, on='key').ix[result.index]
        assert_frame_equal(result, expected)

        result = merge(right, left, right_on='key', left_index=True,
                       how='inner')
        expected = left.join(right, on='key').ix[result.index]
        assert_frame_equal(result, expected.ix[:, result.columns])
    def test_merge_misspecified(self):
        """Inconsistent key specifications must raise."""
        # only one side given as index
        self.assertRaises(Exception, merge, self.left, self.right,
                          left_index=True)
        self.assertRaises(Exception, merge, self.left, self.right,
                          right_index=True)

        # 'on' combined with 'left_on' is ambiguous
        self.assertRaises(Exception, merge, self.left, self.left,
                          left_on='key', on='key')

        # mismatched number of key columns
        self.assertRaises(Exception, merge, self.df, self.df2,
                          left_on=['key1'], right_on=['key1', 'key2'])

    def test_merge_overlap(self):
        """Self-merge: row count is sum of squared key counts and
        overlapping value columns get the default _x/_y suffixes."""
        merged = merge(self.left, self.left, on='key')
        exp_len = (self.left['key'].value_counts() ** 2).sum()
        self.assertEqual(len(merged), exp_len)
        self.assertIn('v1_x', merged)
        self.assertIn('v1_y', merged)
    def test_merge_different_column_key_names(self):
        """Outer merge on differently-named keys keeps both key columns,
        NaN where one side has no match."""
        left = DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
                          'value': [1, 2, 3, 4]})
        right = DataFrame({'rkey': ['foo', 'bar', 'qux', 'foo'],
                           'value': [5, 6, 7, 8]})

        merged = left.merge(right, left_on='lkey', right_on='rkey',
                            how='outer', sort=True)

        assert_almost_equal(merged['lkey'],
                            ['bar', 'baz', 'foo', 'foo', 'foo', 'foo', np.nan])
        assert_almost_equal(merged['rkey'],
                            ['bar', np.nan, 'foo', 'foo', 'foo', 'foo', 'qux'])
        assert_almost_equal(merged['value_x'], [2, 3, 1, 1, 4, 4, np.nan])
        assert_almost_equal(merged['value_y'], [6, np.nan, 5, 8, 5, 8, 7])
    def test_merge_copy(self):
        """copy=True: mutating the result leaves the inputs untouched."""
        left = DataFrame({'a': 0, 'b': 1}, index=lrange(10))
        right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10))

        merged = merge(left, right, left_index=True,
                       right_index=True, copy=True)

        merged['a'] = 6
        self.assertTrue((left['a'] == 0).all())

        merged['d'] = 'peekaboo'
        self.assertTrue((right['d'] == 'bar').all())

    def test_merge_nocopy(self):
        """copy=False: result shares data with the inputs, so mutating
        the result is visible in them."""
        left = DataFrame({'a': 0, 'b': 1}, index=lrange(10))
        right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10))

        merged = merge(left, right, left_index=True,
                       right_index=True, copy=False)

        merged['a'] = 6
        self.assertTrue((left['a'] == 6).all())

        merged['d'] = 'peekaboo'
        self.assertTrue((right['d'] == 'peekaboo').all())
    def test_join_sort(self):
        """sort=True orders rows by key; sort=False keeps left order."""
        left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                          'value': [1, 2, 3, 4]})
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        joined = left.join(right, on='key', sort=True)
        expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'],
                              'value': [2, 3, 1, 4],
                              'value2': ['a', 'b', 'c', 'c']},
                             index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on='key', sort=False)
        self.assert_numpy_array_equal(joined.index, lrange(4))
    def test_intelligently_handle_join_key(self):
        # #733, be a bit more 1337 about not returning unconsolidated DataFrame
        left = DataFrame({'key': [1, 1, 2, 2, 3],
                          'value': lrange(5)}, columns=['value', 'key'])
        right = DataFrame({'key': [1, 1, 2, 3, 4, 5],
                           'rvalue': lrange(6)})

        joined = merge(left, right, on='key', how='outer')
        expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5.],
                              'value': np.array([0, 0, 1, 1, 2, 3, 4,
                                                 np.nan, np.nan]),
                              'rvalue': np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])},
                             columns=['value', 'key', 'rvalue'])
        assert_frame_equal(joined, expected, check_dtype=False)

        self.assertTrue(joined._data.is_consolidated())

    def test_handle_join_key_pass_array(self):
        """Raw ndarrays may be passed as left_on/right_on join keys;
        keys with no column name materialize as 'key_0'."""
        left = DataFrame({'key': [1, 1, 2, 2, 3],
                          'value': lrange(5)}, columns=['value', 'key'])
        right = DataFrame({'rvalue': lrange(6)})
        key = np.array([1, 1, 2, 3, 4, 5])

        merged = merge(left, right, left_on='key', right_on=key, how='outer')
        merged2 = merge(right, left, left_on=key, right_on='key', how='outer')

        assert_series_equal(merged['key'], merged2['key'])
        self.assertTrue(merged['key'].notnull().all())
        self.assertTrue(merged2['key'].notnull().all())

        left = DataFrame({'value': lrange(5)}, columns=['value'])
        right = DataFrame({'rvalue': lrange(6)})
        lkey = np.array([1, 1, 2, 2, 3])
        rkey = np.array([1, 1, 2, 3, 4, 5])

        merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer')
        self.assert_numpy_array_equal(merged['key_0'],
                                      np.array([1, 1, 1, 1, 2, 2, 3, 4, 5]))

        left = DataFrame({'value': lrange(3)})
        right = DataFrame({'rvalue': lrange(6)})

        key = np.array([0, 1, 1, 2, 2, 3])
        merged = merge(left, right, left_index=True, right_on=key, how='outer')
        self.assert_numpy_array_equal(merged['key_0'], key)
    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        df.insert(5, 'dt', 'foo')

        grouped = df.groupby('id')
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix='_right')

    def test_no_overlap_more_informative_error(self):
        """No common columns and no keys given -> MergeError, not a
        generic exception."""
        dt = datetime.now()
        df1 = DataFrame({'x': ['a']}, index=[dt])

        df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt])
        self.assertRaises(MergeError, merge, df1, df2)
    def test_merge_non_unique_indexes(self):
        """Index joins with duplicate/non-monotonic datetime indexes
        agree with the equivalent merges (via _check_merge)."""
        dt = datetime(2012, 5, 1)
        dt2 = datetime(2012, 5, 2)
        dt3 = datetime(2012, 5, 3)
        dt4 = datetime(2012, 5, 4)

        df1 = DataFrame({'x': ['a']}, index=[dt])
        df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt])
        _check_merge(df1, df2)

        # Not monotonic
        df1 = DataFrame({'x': ['a', 'b', 'q']}, index=[dt2, dt, dt4])
        df2 = DataFrame({'y': ['c', 'd', 'e', 'f', 'g', 'h']},
                        index=[dt3, dt3, dt2, dt2, dt, dt])
        _check_merge(df1, df2)

        df1 = DataFrame({'x': ['a', 'b']}, index=[dt, dt])
        df2 = DataFrame({'y': ['c', 'd']}, index=[dt, dt])
        _check_merge(df1, df2)

    def test_merge_non_unique_index_many_to_many(self):
        """Many-to-many index join on duplicated datetime keys."""
        dt = datetime(2012, 5, 1)
        dt2 = datetime(2012, 5, 2)
        dt3 = datetime(2012, 5, 3)
        df1 = DataFrame({'x': ['a', 'b', 'c', 'd']},
                        index=[dt2, dt2, dt, dt])
        df2 = DataFrame({'y': ['e', 'f', 'g', ' h', 'i']},
                        index=[dt2, dt2, dt3, dt, dt])
        _check_merge(df1, df2)

    def test_left_merge_empty_dataframe(self):
        """Left merge with an empty right frame returns the left frame."""
        left = DataFrame({'key': [1], 'value': [2]})
        right = DataFrame({'key': []})

        result = merge(left, right, on='key', how='left')
        assert_frame_equal(result, left)

        result = merge(right, left, on='key', how='right')
        assert_frame_equal(result, left)
- def test_merge_nosort(self):
- # #2098, anything to do?
- from datetime import datetime
- d = {"var1": np.random.randint(0, 10, size=10),
- "var2": np.random.randint(0, 10, size=10),
- "var3": [datetime(2012, 1, 12), datetime(2011, 2, 4),
- datetime(
- 2010, 2, 3), datetime(2012, 1, 12),
- datetime(
- 2011, 2, 4), datetime(2012, 4, 3),
- datetime(
- 2012, 3, 4), datetime(2008, 5, 1),
- datetime(2010, 2, 3), datetime(2012, 2, 3)]}
- df = DataFrame.from_dict(d)
- var3 = df.var3.unique()
- var3.sort()
- new = DataFrame.from_dict({"var3": var3,
- "var8": np.random.random(7)})
- result = df.merge(new, on="var3", sort=False)
- exp = merge(df, new, on='var3', sort=False)
- assert_frame_equal(result, exp)
- self.assertTrue((df.var3.unique() == result.var3.unique()).all())
    def test_merge_nan_right(self):
        """Left join-on where the right frame misses keys -> NaN fill,
        with int columns upcast to float as needed."""
        df1 = DataFrame({"i1" : [0, 1], "i2" : [0, 1]})
        df2 = DataFrame({"i1" : [0], "i3" : [0]})
        result = df1.join(df2, on="i1", rsuffix="_")
        # the None-keyed column is a trick to pin the expected row order
        expected = DataFrame({'i1': {0: 0.0, 1: 1}, 'i2': {0: 0, 1: 1},
                              'i1_': {0: 0, 1: np.nan}, 'i3': {0: 0.0, 1: np.nan},
                              None: {0: 0, 1: 0}}).set_index(None).reset_index()[['i1', 'i2', 'i1_', 'i3']]
        assert_frame_equal(result, expected, check_dtype=False)

        df1 = DataFrame({"i1" : [0, 1], "i2" : [0.5, 1.5]})
        df2 = DataFrame({"i1" : [0], "i3" : [0.7]})
        result = df1.join(df2, rsuffix="_", on='i1')
        expected = DataFrame({'i1': {0: 0, 1: 1}, 'i1_': {0: 0.0, 1: nan},
                              'i2': {0: 0.5, 1: 1.5}, 'i3': {0: 0.69999999999999996,
                                                             1: nan}})[['i1', 'i2', 'i1_', 'i3']]
        assert_frame_equal(result, expected)
    def test_append_dtype_coerce(self):
        # GH 4993
        # appending with datetime will incorrectly convert datetime64
        import datetime as dt
        from pandas import NaT

        df1 = DataFrame(index=[1,2], data=[dt.datetime(2013,1,1,0,0),
                                           dt.datetime(2013,1,2,0,0)],
                        columns=['start_time'])
        df2 = DataFrame(index=[4,5], data=[[dt.datetime(2013,1,3,0,0),
                                            dt.datetime(2013,1,3,6,10)],
                                           [dt.datetime(2013,1,4,0,0),
                                            dt.datetime(2013,1,4,7,10)]],
                        columns=['start_time','end_time'])

        # NOTE(review): expected lists end_time before start_time --
        # append appears to sort the combined column labels; confirm
        expected = concat([
            Series([NaT,NaT,dt.datetime(2013,1,3,6,10),dt.datetime(2013,1,4,7,10)],name='end_time'),
            Series([dt.datetime(2013,1,1,0,0),dt.datetime(2013,1,2,0,0),dt.datetime(2013,1,3,0,0),dt.datetime(2013,1,4,0,0)],name='start_time'),
        ],axis=1)
        result = df1.append(df2,ignore_index=True)
        assert_frame_equal(result, expected)
    def test_join_append_timedeltas(self):
        """append and join must round-trip timedelta64 values, with NaT
        filling unmatched rows."""
        import datetime as dt
        from pandas import NaT

        # timedelta64 issues with join/merge
        # GH 5695

        tm._skip_if_not_numpy17_friendly()

        d = {'d': dt.datetime(2013, 11, 5, 5, 56), 't': dt.timedelta(0, 22500)}
        df = DataFrame(columns=list('dt'))
        df = df.append(d, ignore_index=True)
        result = df.append(d, ignore_index=True)
        expected = DataFrame({'d': [dt.datetime(2013, 11, 5, 5, 56),
                                    dt.datetime(2013, 11, 5, 5, 56) ],
                              't': [ dt.timedelta(0, 22500),
                                     dt.timedelta(0, 22500) ]})
        assert_frame_equal(result, expected)

        td = np.timedelta64(300000000)
        lhs = DataFrame(Series([td,td],index=["A","B"]))
        rhs = DataFrame(Series([td],index=["A"]))

        from pandas import NaT
        result = lhs.join(rhs,rsuffix='r', how="left")
        expected = DataFrame({ '0' : Series([td,td],index=list('AB')), '0r' : Series([td,NaT],index=list('AB')) })
        assert_frame_equal(result, expected)
    def test_overlapping_columns_error_message(self):
        # #2649
        # duplicate column labels on both sides should raise, not
        # silently produce ambiguous output
        df = DataFrame({'key': [1, 2, 3],
                        'v1': [4, 5, 6],
                        'v2': [7, 8, 9]})
        df2 = DataFrame({'key': [1, 2, 3],
                         'v1': [4, 5, 6],
                         'v2': [7, 8, 9]})

        df.columns = ['key', 'foo', 'foo']
        df2.columns = ['key', 'bar', 'bar']

        self.assertRaises(Exception, merge, df, df2)
- def _check_merge(x, y):
- for how in ['inner', 'left', 'outer']:
- result = x.join(y, how=how)
- expected = merge(x.reset_index(), y.reset_index(), how=how,
- sort=True)
- expected = expected.set_index('index')
- assert_frame_equal(result, expected, check_names=False) # TODO check_names on merge?
class TestMergeMulti(tm.TestCase):
    """Tests for merge/join on multiple keys and MultiIndexes."""

    def setUp(self):
        """Build a MultiIndexed frame (self.to_join) and a flat keyed
        frame (self.data) whose keys partially overlap the index —
        'snap'/'three' combinations have no match."""
        self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                        ['one', 'two', 'three']],
                                labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                        [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                                names=['first', 'second'])
        self.to_join = DataFrame(np.random.randn(10, 3), index=self.index,
                                 columns=['j_one', 'j_two', 'j_three'])

        # a little relevant example with NAs
        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
                'qux', 'snap']
        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
                'three', 'one']

        data = np.random.randn(len(key1))
        self.data = DataFrame({'key1': key1, 'key2': key2,
                               'data': data})
    def test_merge_on_multikey(self):
        """join(on=[two keys]) matches a by-hand indexer-based take."""
        joined = self.data.join(self.to_join, on=['key1', 'key2'])

        join_key = Index(lzip(self.data['key1'], self.data['key2']))
        indexer = self.to_join.index.get_indexer(join_key)
        ex_values = self.to_join.values.take(indexer, axis=0)
        ex_values[indexer == -1] = np.nan
        expected = self.data.join(DataFrame(ex_values,
                                            columns=self.to_join.columns))

        # TODO: columns aren't in the same order yet
        assert_frame_equal(joined, expected.ix[:, joined.columns])

    def test_merge_right_vs_left(self):
        # compare left vs right merge with multikey
        merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'],
                                  right_index=True, how='left')
        merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'],
                                     left_index=True, how='right')
        merged2 = merged2.ix[:, merged1.columns]
        assert_frame_equal(merged1, merged2)
    def test_compress_group_combinations(self):
        """Smoke test: huge key cardinality exercises the label
        compression code path without blowing up."""
        # ~ 40000000 possible unique groups
        key1 = np.array([rands(10) for _ in range(10000)], dtype='O')
        key1 = np.tile(key1, 2)
        key2 = key1[::-1]

        df = DataFrame({'key1': key1, 'key2': key2,
                        'value1': np.random.randn(20000)})

        df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2],
                         'value2': np.random.randn(10000)})

        # just to hit the label compression code path
        merged = merge(df, df2, how='outer')
    def test_left_join_index_preserve_order(self):
        """Multi-key left join-on keeps the left frame's row order."""
        left = DataFrame({'k1': [0, 1, 2] * 8,
                          'k2': ['foo', 'bar'] * 12,
                          'v': np.array(np.arange(24),dtype=np.int64) })

        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': [5, 7]}, index=index)

        result = left.join(right, on=['k1', 'k2'])

        expected = left.copy()
        expected['v2'] = np.nan
        expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
        expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7

        tm.assert_frame_equal(result, expected)

        # test join with multi dtypes blocks
        left = DataFrame({'k1': [0, 1, 2] * 8,
                          'k2': ['foo', 'bar'] * 12,
                          'k3' : np.array([0, 1, 2]*8, dtype=np.float32),
                          'v': np.array(np.arange(24),dtype=np.int32) })

        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': [5, 7]}, index=index)

        result = left.join(right, on=['k1', 'k2'])

        expected = left.copy()
        expected['v2'] = np.nan
        expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
        expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7

        tm.assert_frame_equal(result, expected)

        # do a right join for an extra test
        joined = merge(right, left, left_index=True,
                       right_on=['k1', 'k2'], how='right')
        tm.assert_frame_equal(joined.ix[:, expected.columns], expected)
    def test_join_multi_dtypes(self):
        """Multi-key join-on across int/float key dtype combinations;
        int value columns upcast to float64 when NaNs are introduced."""
        # test with multi dtypes in the join index
        def _test(dtype1,dtype2):
            left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1),
                              'k2': ['foo', 'bar'] * 12,
                              'v': np.array(np.arange(24),dtype=np.int64) })

            index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
            right = DataFrame({'v2': np.array([5, 7], dtype=dtype2)}, index=index)

            result = left.join(right, on=['k1', 'k2'])

            expected = left.copy()
            if dtype2.kind == 'i':
                # integer v2 must become float64 to hold NaN
                dtype2 = np.dtype('float64')
            expected['v2'] = np.array(np.nan,dtype=dtype2)
            expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
            expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7

            tm.assert_frame_equal(result, expected)

        for d1 in [np.int64,np.int32,np.int16,np.int8,np.uint8]:
            for d2 in [np.int64,np.float64,np.float32,np.float16]:
                _test(np.dtype(d1),np.dtype(d2))
- def test_left_merge_na_buglet(self):
- left = DataFrame({'id': list('abcde'), 'v1': randn(5),
- 'v2': randn(5), 'dummy': list('abcde'),
- 'v3': randn(5)},
- columns=['id', 'v1', 'v2', 'dummy', 'v3'])
- right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan],
- 'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]})
- merged = merge(left, right, on='id', how='left')
- rdf = right.drop(['id'], axis=1)
- expected = left.join(rdf)
- tm.assert_frame_equal(merged, expected)
- def test_merge_na_keys(self):
- data = [[1950, "A", 1.5],
- [1950, "B", 1.5],
- [1955, "B", 1.5],
- [1960, "B", np.nan],
- [1970, "B", 4.],
- [1950, "C", 4.],
- [1960, "C", np.nan],
- [1965, "C", 3.],
- [1970, "C", 4.]]
- frame = DataFrame(data, columns=["year", "panel", "data"])
- other_data = [[1960, 'A', np.nan],
- [1970, 'A', np.nan],
- [1955, 'A', np.nan],
- [1965, 'A', np.nan],
- [1965, 'B', np.nan],
- [1955, 'C', np.nan]]
- other = DataFrame(other_data, columns=['year', 'panel', 'data'])
- result = frame.merge(other, how='outer')
- expected = frame.fillna(-999).merge(other.fillna(-999), how='outer')
- expected = expected.replace(-999, np.nan)
- tm.assert_frame_equal(result, expected)
def test_int64_overflow_issues(self):
    """GH #2690 (combinatorial explosion): an outer merge keyed on many
    random float columns must complete and produce one output row per
    input row on each side, since random keys virtually never collide."""
    n = 1000
    df1 = DataFrame(np.random.randn(n, 7),
                    columns=list('ABCDEF') + ['G1'])
    df2 = DataFrame(np.random.randn(n, 7),
                    columns=list('ABCDEF') + ['G2'])

    # it works!
    result = merge(df1, df2, how='outer')
    self.assertTrue(len(result) == 2 * n)
def test_join_multi_levels(self):
    """GH 3662: joining a single-indexed frame with a two-level
    MultiIndexed frame broadcasts each single-index row across all
    matching rows of the first level."""
    household = DataFrame(
        dict(household_id=[1, 2, 3],
             male=[0, 1, 0],
             wealth=[196087.3, 316478.7, 294750]),
        columns=['household_id', 'male', 'wealth']
    ).set_index('household_id')

    portfolio = DataFrame(
        dict(household_id=[1, 2, 2, 3, 3, 3, 4],
             asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
                       "gb00b03mlx29", "lu0197800237", "nl0000289965",
                       np.nan],
             name=["ABN Amro", "Robeco", "Royal Dutch Shell",
                   "Royal Dutch Shell", "AAB Eastern Europe Equity Fund",
                   "Postbank BioTech Fonds", np.nan],
             share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
        columns=['household_id', 'asset_id', 'name', 'share']
    ).set_index(['household_id', 'asset_id'])

    result = household.join(portfolio, how='inner')
    expected = DataFrame(
        dict(male=[0, 1, 1, 0, 0, 0],
             wealth=[196087.3, 316478.7, 316478.7,
                     294750.0, 294750.0, 294750.0],
             name=['ABN Amro', 'Robeco', 'Royal Dutch Shell',
                   'Royal Dutch Shell',
                   'AAB Eastern Europe Equity Fund',
                   'Postbank BioTech Fonds'],
             share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25],
             household_id=[1, 2, 2, 3, 3, 3],
             asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29',
                       'gb00b03mlx29', 'lu0197800237', 'nl0000289965']),
    ).set_index(['household_id', 'asset_id']).reindex(
        columns=['male', 'wealth', 'name', 'share'])
    # (the original asserted this twice in a row; once is enough)
    assert_frame_equal(result, expected)

    # equivalency: reset the indexes and merge on the shared column
    result2 = merge(household.reset_index(), portfolio.reset_index(),
                    on=['household_id'], how='inner'
                    ).set_index(['household_id', 'asset_id'])
    assert_frame_equal(result2, expected)

    # outer join additionally keeps household 4, whose asset_id is NaN
    result = household.join(portfolio, how='outer')
    expected = concat(
        [expected,
         DataFrame(dict(share=[1.00]),
                   index=MultiIndex.from_tuples(
                       [(4, np.nan)],
                       names=['household_id', 'asset_id']))],
        axis=0).reindex(columns=expected.columns)
    assert_frame_equal(result, expected)

    # invalid cases
    household.index.name = 'foo'

    def f():
        household.join(portfolio, how='inner')
    self.assertRaises(ValueError, f)

    portfolio2 = portfolio.copy()
    # BUG FIX: set_names returns a new Index; the original call discarded
    # the result, so the rename never happened.  Rebind the index so the
    # mismatched-level-names case is actually exercised.
    portfolio2.index = portfolio2.index.set_names(['household_id', 'foo'])

    def f():
        portfolio2.join(portfolio, how='inner')
    self.assertRaises(ValueError, f)
def test_join_multi_levels2(self):
    """GH 6360: joining two MultiIndexed frames that share only part of
    their index levels raises NotImplementedError; the equivalent result
    comes from resetting the indexes and merging on the shared column."""
    household = DataFrame(
        dict(household_id=[1, 2, 2, 3, 3, 3, 4],
             asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
                       "gb00b03mlx29", "lu0197800237", "nl0000289965",
                       np.nan],
             share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
        columns=['household_id', 'asset_id', 'share']
    ).set_index(['household_id', 'asset_id'])

    log_return = DataFrame(dict(
        asset_id=["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                  "lu0197800237", "lu0197800237"],
        t=[233, 234, 235, 180, 181],
        log_return=[.09604978, -.06524096, .03532373, .03025441, .036997]
    )).set_index(["asset_id", "t"])

    expected = DataFrame(dict(
        household_id=[2, 2, 2, 3, 3, 3, 3, 3],
        asset_id=["gb00b03mlx29"] * 6 + ["lu0197800237"] * 2,
        t=[233, 234, 235, 233, 234, 235, 180, 181],
        share=[0.6] * 3 + [0.15] * 3 + [0.6] * 2,
        log_return=[.09604978, -.06524096, .03532373,
                    .09604978, -.06524096, .03532373,
                    .03025441, .036997]
    )).set_index(["household_id", "asset_id", "t"]).reindex(
        columns=['share', 'log_return'])

    # direct join on partially-overlapping MultiIndexes is unsupported
    def do_inner():
        household.join(log_return, how='inner')
    self.assertRaises(NotImplementedError, do_inner)

    # this is the equivalency
    result = merge(household.reset_index(), log_return.reset_index(),
                   on=['asset_id'], how='inner'
                   ).set_index(['household_id', 'asset_id', 't'])
    assert_frame_equal(result, expected)

    # outer-join analogue (never compared: the join below raises before
    # this frame could be used)
    expected = DataFrame(dict(
        household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
        asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
                  "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                  "gb00b03mlx29", "gb00b03mlx29", "lu0197800237",
                  "lu0197800237", "nl0000289965", None],
        t=[None, None, 233, 234, 235, 233, 234, 235, 180, 181,
           None, None],
        share=[1.0, 0.4, 0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6,
               0.25, 1.0],
        log_return=[None, None, .09604978, -.06524096, .03532373,
                    .09604978, -.06524096, .03532373, .03025441, .036997,
                    None, None]
    )).set_index(["household_id", "asset_id", "t"])

    def do_outer():
        household.join(log_return, how='outer')
    self.assertRaises(NotImplementedError, do_outer)
def _check_join(left, right, result, join_col, how='left',
                lsuffix='_x', rsuffix='_y'):
    """Sanity-check a join result group-by-group against its inputs."""
    # the join keys themselves should never be null in the result
    for col in join_col:
        assert(result[col].notnull().all())

    left_grouped = left.groupby(join_col)
    right_grouped = right.groupby(join_col)

    def _check_side(grouped, joined_part, columns, keep_hows, key):
        # one side's slice of the result group must match the original
        # frame's group for that key; a missing key is only legal for
        # joins that don't keep this side's keys, and then the slice
        # must be entirely NA outside the key columns
        try:
            orig_group = grouped.get_group(key)
        except KeyError:
            if how in keep_hows:
                raise AssertionError('key %s should not have been in the join'
                                     % str(key))
            _assert_all_na(joined_part, columns, join_col)
        else:
            _assert_same_contents(joined_part, orig_group)

    for group_key, group in result.groupby(join_col):
        l_joined = _restrict_to_columns(group, left.columns, lsuffix)
        r_joined = _restrict_to_columns(group, right.columns, rsuffix)

        _check_side(left_grouped, l_joined, left.columns,
                    ('left', 'inner'), group_key)
        _check_side(right_grouped, r_joined, right.columns,
                    ('right', 'inner'), group_key)
- def _restrict_to_columns(group, columns, suffix):
- found = [c for c in group.columns
- if c in columns or c.replace(suffix, '') in columns]
- # filter
- group = group.ix[:, found]
- # get rid of suffixes, if any
- group = group.rename(columns=lambda x: x.replace(suffix, ''))
- # put in the right order...
- group = group.ix[:, columns]
- return group
- def _assert_same_contents(join_chunk, source):
- NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly...
- jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
- svalues = source.fillna(NA_SENTINEL).drop_duplicates().values
- rows = set(tuple(row) for row in jvalues)
- assert(len(rows) == len(source))
- assert(all(tuple(row) in rows for row in svalues))
- def _assert_all_na(join_chunk, source_columns, join_col):
- for c in source_columns:
- if c in join_col:
- continue
- assert(join_chunk[c].isnull().all())
- def _join_by_hand(a, b, how='left'):
- join_index = a.index.join(b.index, how=how)
- a_re = a.reindex(join_index)
- b_re = b.reindex(join_index)
- result_columns = a.columns.append(b.columns)
- for col, s in compat.iteritems(b_re):
- a_re[col] = s
- return a_re.reindex(columns=result_columns)
- class TestConcatenate(tm.TestCase):
- _multiprocess_can_split_ = True
def setUp(self):
    """Build a plain float frame plus a mixed-dtype copy that carries a
    constant object column."""
    base = DataFrame(tm.getSeriesData())
    self.frame = base

    mixed = base.copy()
    mixed['foo'] = 'bar'
    self.mixed_frame = mixed
- def test_append(self):
- begin_index = self.frame.index[:5]
- end_index = self.frame.index[5:]
- begin_frame = self.frame.reindex(begin_index)
- end_frame = self.frame.reindex(end_index)
- appended = begin_frame.append(end_frame)
- …
Large files files are truncated, but you can click here to view the full file