PageRenderTime 77ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 1ms

/pandas/tools/tests/test_merge.py

http://github.com/pydata/pandas
Python | 2158 lines | 1849 code | 260 blank | 49 comment | 24 complexity | 9d08c743c46fdbf243d91045fac3b860 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. # pylint: disable=E1103
  2. import nose
  3. from datetime import datetime
  4. from numpy.random import randn
  5. from numpy import nan
  6. import numpy as np
  7. import random
  8. import pandas as pd
  9. from pandas.compat import range, lrange, lzip, zip, StringIO
  10. from pandas import compat, _np_version_under1p7
  11. from pandas.tseries.index import DatetimeIndex
  12. from pandas.tools.merge import merge, concat, ordered_merge, MergeError
  13. from pandas.util.testing import (assert_frame_equal, assert_series_equal,
  14. assert_almost_equal, rands,
  15. makeCustomDataframe as mkdf,
  16. assertRaisesRegexp)
  17. from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range, read_table, read_csv
  18. import pandas.algos as algos
  19. import pandas.util.testing as tm
# Shorthand: a_ builds the small ndarrays of expected join indexers below.
a_ = np.array
# Size parameters for the randomly generated fixture frames.
N = 50
NGROUPS = 8
# Every join variant exercised by the generic round-trip tests.
JOIN_TYPES = ['inner', 'outer', 'left', 'right']
  24. def get_test_data(ngroups=NGROUPS, n=N):
  25. unique_groups = lrange(ngroups)
  26. arr = np.asarray(np.tile(unique_groups, n // ngroups))
  27. if len(arr) < n:
  28. arr = np.asarray(list(arr) + unique_groups[:n - len(arr)])
  29. random.shuffle(arr)
  30. return arr
class TestMerge(tm.TestCase):
    # End-to-end tests of pandas merge/join: the cython join indexers,
    # key-based merges, index joins, dtype preservation, and error cases.
    _multiprocess_can_split_ = True

    def setUp(self):
        # aggregate multiple columns
        self.df = DataFrame({'key1': get_test_data(),
                             'key2': get_test_data(),
                             'data1': np.random.randn(N),
                             'data2': np.random.randn(N)})
        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]
        self.df2 = DataFrame({'key1': get_test_data(n=N // 5),
                              'key2': get_test_data(ngroups=NGROUPS // 2,
                                                    n=N // 5),
                              'value': np.random.randn(N // 5)})
        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)
        # Join on string value
        self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']},
                                index=data['C'])
        self.left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                               'v1': np.random.randn(7)})
        self.right = DataFrame({'v2': np.random.randn(4)},
                               index=['d', 'b', 'c', 'a'])

    def test_cython_left_outer_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5
        ls, rs = algos.left_outer_join(left, right, max_group)
        # expected positions: stable argsort of keys + hand-built take maps
        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')
        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5, -1, -1])
        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1
        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1
        self.assert_numpy_array_equal(ls, exp_ls)
        self.assert_numpy_array_equal(rs, exp_rs)

    def test_cython_right_outer_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5
        # right outer join is a left outer join with the operands swapped
        rs, ls = algos.left_outer_join(right, left, max_group)
        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')
        #            0        1        1        1
        exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
                     #            2        2        4
                     6, 7, 8, 6, 7, 8, -1])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3,
                     4, 4, 4, 5, 5, 5, 6])
        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1
        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1
        self.assert_numpy_array_equal(ls, exp_ls)
        self.assert_numpy_array_equal(rs, exp_rs)

    def test_cython_inner_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5
        ls, rs = algos.inner_join(left, right, max_group)
        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')
        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5])
        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1
        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1
        self.assert_numpy_array_equal(ls, exp_ls)
        self.assert_numpy_array_equal(rs, exp_rs)

    def test_left_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')
        joined_both = merge(self.df, self.df2)
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='left')

    def test_right_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')
        joined_both = merge(self.df, self.df2, how='right')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='right')

    def test_full_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')
        joined_both = merge(self.df, self.df2, how='outer')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='outer')

    def test_inner_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')
        joined_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='inner')

    def test_handle_overlap(self):
        # overlapping non-key columns get the caller-supplied suffixes
        joined = merge(self.df, self.df2, on='key2',
                       suffixes=['.foo', '.bar'])
        self.assertIn('key1.foo', joined)
        self.assertIn('key1.bar', joined)

    def test_handle_overlap_arbitrary_key(self):
        joined = merge(self.df, self.df2,
                       left_on='key2', right_on='key1',
                       suffixes=['.foo', '.bar'])
        self.assertIn('key1.foo', joined)
        self.assertIn('key2.bar', joined)

    def test_merge_common(self):
        # with no keys given, merge uses the intersection of column names
        joined = merge(self.df, self.df2)
        exp = merge(self.df, self.df2, on=['key1', 'key2'])
        tm.assert_frame_equal(joined, exp)

    def test_join_on(self):
        target = self.target
        source = self.source
        merged = target.join(source, on='C')
        self.assert_numpy_array_equal(merged['MergedA'], target['A'])
        self.assert_numpy_array_equal(merged['MergedD'], target['D'])
        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'],
                              'value': [0, 0, 1, 1, 2]})
        assert_frame_equal(joined, expected)
        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
                         columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2],
                         columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        self.assertTrue(np.isnan(joined['two']['c']))
        self.assertTrue(np.isnan(joined['three']['c']))
        # merge column not present
        self.assertRaises(Exception, target.join, source, on='E')
        # overlap
        source_copy = source.copy()
        source_copy['A'] = 0
        self.assertRaises(Exception, target.join, source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        with tm.assertRaises(ValueError):
            df = DataFrame({'a': tm.choice(['m', 'f'], size=3),
                            'b': np.random.randn(3)})
            df2 = DataFrame({'a': tm.choice(['m', 'f'], size=10),
                             'b': np.random.randn(10)},
                            index=tm.makeCustomIndex(10, 2))
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        with tm.assertRaises(ValueError):
            df = DataFrame({'a': tm.choice(['m', 'f'], size=3),
                            'b': np.random.randn(3)},
                           index=tm.makeCustomIndex(10, 2))
            df2 = DataFrame({'a': tm.choice(['m', 'f'], size=10),
                             'b': np.random.randn(10)})
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        with tm.assertRaises(ValueError):
            df = DataFrame({'a': tm.choice(['m', 'f'], size=3),
                            'b': np.random.randn(3)})
            df2 = DataFrame({'a': tm.choice(['m', 'f'], size=10),
                             'b': np.random.randn(10)},
                            index=tm.makeCustomIndex(10, 2))
            merge(df, df2, right_on='a', left_on=['a', 'b'])

    def test_join_on_pass_vector(self):
        # 'on' may be an array/Series instead of a column label
        expected = self.target.join(self.source, on='C')
        del expected['C']
        join_col = self.target.pop('C')
        result = self.target.join(self.source, on=join_col)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on='C')
        for col in self.source:
            self.assertIn(col, merged)
            self.assertTrue(merged[col].isnull().all())
        merged2 = self.target.join(self.source.reindex([]), on='C',
                                   how='inner')
        self.assertTrue(merged2.columns.equals(merged.columns))
        self.assertEqual(len(merged2), 0)

    def test_join_on_inner(self):
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])
        joined = df.join(df2, on='key', how='inner')
        # inner join == left join restricted to rows that matched
        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notnull()]
        self.assert_numpy_array_equal(joined['key'], expected['key'])
        self.assert_numpy_array_equal(joined['value'], expected['value'])
        self.assertTrue(joined.index.equals(expected.index))

    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')
        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        # joining a Series behaves like joining a one-column frame
        result = self.target.join(self.source['MergedA'], on='C')
        expected = self.target.join(self.source[['MergedA']], on='C')
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({'a': [1, 1]})
        ds = Series([2], index=[1], name='b')
        result = df.join(ds, on='a')
        expected = DataFrame({'a': [1, 1],
                              'b': [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self):
        df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(10),
                        columns=['A', 'B', 'C', 'D'])
        self.assertEqual(df1['B'].dtype, np.int64)
        self.assertEqual(df1['D'].dtype, np.bool_)
        df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(0, 10, 2),
                        columns=['A', 'B', 'C', 'D'])
        # overlap
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
                            'A_two', 'B_two', 'C_two', 'D_two']
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'
        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.
        for kind in JOIN_TYPES:
            joined = df1.join(df2, how=kind)
            expected = _join_by_hand(df1, df2, how=kind)
            assert_frame_equal(joined, expected)
            joined = df2.join(df1, how=kind)
            expected = _join_by_hand(df2, df1, how=kind)
            assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c
        d = DataFrame(randn(30, 1), columns=['q'])
        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])
        index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])
        df1 = DataFrame(data=np.random.randn(6), index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6), index=index2,
                        columns=['var Y'])
        df1 = df1.sortlevel(0)
        df2 = df2.sortlevel(0)
        joined = df1.join(df2, how='outer')
        ex_index = index1._tuple_index + index2._tuple_index
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)
        # repeat with frames sorted by the second level
        df1 = df1.sortlevel(1)
        df2 = df2.sortlevel(1)
        joined = df1.join(df2, how='outer').sortlevel(0)
        ex_index = index1._tuple_index + index2._tuple_index
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)

    def test_join_inner_multiindex(self):
        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
                'qux', 'snap']
        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
                'three', 'one']
        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2,
                          'data': data})
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['j_one', 'j_two', 'j_three'])
        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data, to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'], how='inner',
                         sort=False)
        expected2 = merge(to_join, data,
                          right_on=['key1', 'key2'], left_index=True,
                          how='inner', sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))
        expected2 = merge(to_join, data, right_on=['key1', 'key2'],
                          left_index=True, how='inner', sort=False)
        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index
        self.assertTrue(joined.index.is_monotonic)
        assert_frame_equal(joined, expected)
        # _assert_same_contents(expected, expected2.ix[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # joining a MultiIndex-columned frame with a flat-columned one
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame(
            [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other_df.set_index('a', inplace=True)
        result = merge(new_df, other_df, left_index=True, right_index=True)
        self.assertTrue(('b', 'mean') in result)
        self.assertTrue('b' in result)

    def test_join_float64_float32(self):
        # narrower float dtypes must survive the join unchanged
        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = a.join(b)
        self.assertEqual(joined.dtypes['a'], 'float64')
        self.assertEqual(joined.dtypes['b'], 'float64')
        self.assertEqual(joined.dtypes['c'], 'float32')
        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        self.assertEqual(rs.dtypes['a'], 'int64')
        self.assertEqual(rs.dtypes['b'], 'float64')
        self.assertEqual(rs.dtypes['c'], 'float32')
        self.assertEqual(rs.dtypes['md'], 'float32')
        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='outer')
        # expected built by merging pairwise on the key columns
        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')
        result = result.reset_index()
        result['a'] = result['a'].astype(np.float64)
        result['b'] = result['b'].astype(np.float64)
        assert_frame_equal(result, expected.ix[:, result.columns])
        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame(
            {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')
        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')
        result = result.reset_index()
        assert_frame_equal(result, expected.ix[:, result.columns])

    def test_merge_index_singlekey_right_vs_left(self):
        left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                          'v1': np.random.randn(7)})
        right = DataFrame({'v2': np.random.randn(4)},
                          index=['d', 'b', 'c', 'a'])
        # left-join-on-column must mirror right-join-on-index
        merged1 = merge(left, right, left_on='key',
                        right_index=True, how='left', sort=False)
        merged2 = merge(right, left, right_on='key',
                        left_index=True, how='right', sort=False)
        assert_frame_equal(merged1, merged2.ix[:, merged1.columns])
        merged1 = merge(left, right, left_on='key',
                        right_index=True, how='left', sort=True)
        merged2 = merge(right, left, right_on='key',
                        left_index=True, how='right', sort=True)
        assert_frame_equal(merged1, merged2.ix[:, merged1.columns])

    def test_merge_index_singlekey_inner(self):
        left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                          'v1': np.random.randn(7)})
        right = DataFrame({'v2': np.random.randn(4)},
                          index=['d', 'b', 'c', 'a'])
        # inner join
        result = merge(left, right, left_on='key', right_index=True,
                       how='inner')
        expected = left.join(right, on='key').ix[result.index]
        assert_frame_equal(result, expected)
        result = merge(right, left, right_on='key', left_index=True,
                       how='inner')
        expected = left.join(right, on='key').ix[result.index]
        assert_frame_equal(result, expected.ix[:, result.columns])

    def test_merge_misspecified(self):
        # contradictory or incomplete key specifications must raise
        self.assertRaises(Exception, merge, self.left, self.right,
                          left_index=True)
        self.assertRaises(Exception, merge, self.left, self.right,
                          right_index=True)
        self.assertRaises(Exception, merge, self.left, self.left,
                          left_on='key', on='key')
        self.assertRaises(Exception, merge, self.df, self.df2,
                          left_on=['key1'], right_on=['key1', 'key2'])

    def test_merge_overlap(self):
        merged = merge(self.left, self.left, on='key')
        # self-merge row count is the sum of squared key multiplicities
        exp_len = (self.left['key'].value_counts() ** 2).sum()
        self.assertEqual(len(merged), exp_len)
        self.assertIn('v1_x', merged)
        self.assertIn('v1_y', merged)

    def test_merge_different_column_key_names(self):
        left = DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
                          'value': [1, 2, 3, 4]})
        right = DataFrame({'rkey': ['foo', 'bar', 'qux', 'foo'],
                           'value': [5, 6, 7, 8]})
        merged = left.merge(right, left_on='lkey', right_on='rkey',
                            how='outer', sort=True)
        assert_almost_equal(merged['lkey'],
                            ['bar', 'baz', 'foo', 'foo', 'foo', 'foo', np.nan])
        assert_almost_equal(merged['rkey'],
                            ['bar', np.nan, 'foo', 'foo', 'foo', 'foo', 'qux'])
        assert_almost_equal(merged['value_x'], [2, 3, 1, 1, 4, 4, np.nan])
        assert_almost_equal(merged['value_y'], [6, np.nan, 5, 8, 5, 8, 7])

    def test_merge_copy(self):
        # copy=True: mutating the result must not touch the inputs
        left = DataFrame({'a': 0, 'b': 1}, index=lrange(10))
        right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10))
        merged = merge(left, right, left_index=True,
                       right_index=True, copy=True)
        merged['a'] = 6
        self.assertTrue((left['a'] == 0).all())
        merged['d'] = 'peekaboo'
        self.assertTrue((right['d'] == 'bar').all())

    def test_merge_nocopy(self):
        # copy=False: result shares data with the inputs
        left = DataFrame({'a': 0, 'b': 1}, index=lrange(10))
        right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10))
        merged = merge(left, right, left_index=True,
                       right_index=True, copy=False)
        merged['a'] = 6
        self.assertTrue((left['a'] == 6).all())
        merged['d'] = 'peekaboo'
        self.assertTrue((right['d'] == 'peekaboo').all())

    def test_join_sort(self):
        left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                          'value': [1, 2, 3, 4]})
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])
        joined = left.join(right, on='key', sort=True)
        expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'],
                              'value': [2, 3, 1, 4],
                              'value2': ['a', 'b', 'c', 'c']},
                             index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)
        # smoke test
        joined = left.join(right, on='key', sort=False)
        self.assert_numpy_array_equal(joined.index, lrange(4))

    def test_intelligently_handle_join_key(self):
        # #733, be a bit more 1337 about not returning unconsolidated DataFrame
        left = DataFrame({'key': [1, 1, 2, 2, 3],
                          'value': lrange(5)}, columns=['value', 'key'])
        right = DataFrame({'key': [1, 1, 2, 3, 4, 5],
                           'rvalue': lrange(6)})
        joined = merge(left, right, on='key', how='outer')
        expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5.],
                              'value': np.array([0, 0, 1, 1, 2, 3, 4,
                                                 np.nan, np.nan]),
                              'rvalue': np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])},
                             columns=['value', 'key', 'rvalue'])
        assert_frame_equal(joined, expected, check_dtype=False)
        self.assertTrue(joined._data.is_consolidated())

    def test_handle_join_key_pass_array(self):
        left = DataFrame({'key': [1, 1, 2, 2, 3],
                          'value': lrange(5)}, columns=['value', 'key'])
        right = DataFrame({'rvalue': lrange(6)})
        key = np.array([1, 1, 2, 3, 4, 5])
        merged = merge(left, right, left_on='key', right_on=key, how='outer')
        merged2 = merge(right, left, left_on=key, right_on='key', how='outer')
        assert_series_equal(merged['key'], merged2['key'])
        self.assertTrue(merged['key'].notnull().all())
        self.assertTrue(merged2['key'].notnull().all())
        left = DataFrame({'value': lrange(5)}, columns=['value'])
        right = DataFrame({'rvalue': lrange(6)})
        lkey = np.array([1, 1, 2, 2, 3])
        rkey = np.array([1, 1, 2, 3, 4, 5])
        merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer')
        # anonymous array keys materialize as a generated 'key_0' column
        self.assert_numpy_array_equal(merged['key_0'],
                                      np.array([1, 1, 1, 1, 2, 2, 3, 4, 5]))
        left = DataFrame({'value': lrange(3)})
        right = DataFrame({'rvalue': lrange(6)})
        key = np.array([0, 1, 1, 2, 2, 3])
        merged = merge(left, right, left_index=True, right_on=key, how='outer')
        self.assert_numpy_array_equal(merged['key_0'], key)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        df.insert(5, 'dt', 'foo')
        grouped = df.groupby('id')
        mn = grouped.mean()
        cn = grouped.count()
        # it works!
        mn.join(cn, rsuffix='_right')

    def test_no_overlap_more_informative_error(self):
        dt = datetime.now()
        df1 = DataFrame({'x': ['a']}, index=[dt])
        df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt])
        self.assertRaises(MergeError, merge, df1, df2)

    def test_merge_non_unique_indexes(self):
        dt = datetime(2012, 5, 1)
        dt2 = datetime(2012, 5, 2)
        dt3 = datetime(2012, 5, 3)
        dt4 = datetime(2012, 5, 4)
        df1 = DataFrame({'x': ['a']}, index=[dt])
        df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt])
        _check_merge(df1, df2)
        # Not monotonic
        df1 = DataFrame({'x': ['a', 'b', 'q']}, index=[dt2, dt, dt4])
        df2 = DataFrame({'y': ['c', 'd', 'e', 'f', 'g', 'h']},
                        index=[dt3, dt3, dt2, dt2, dt, dt])
        _check_merge(df1, df2)
        df1 = DataFrame({'x': ['a', 'b']}, index=[dt, dt])
        df2 = DataFrame({'y': ['c', 'd']}, index=[dt, dt])
        _check_merge(df1, df2)

    def test_merge_non_unique_index_many_to_many(self):
        dt = datetime(2012, 5, 1)
        dt2 = datetime(2012, 5, 2)
        dt3 = datetime(2012, 5, 3)
        df1 = DataFrame({'x': ['a', 'b', 'c', 'd']},
                        index=[dt2, dt2, dt, dt])
        df2 = DataFrame({'y': ['e', 'f', 'g', ' h', 'i']},
                        index=[dt2, dt2, dt3, dt, dt])
        _check_merge(df1, df2)

    def test_left_merge_empty_dataframe(self):
        left = DataFrame({'key': [1], 'value': [2]})
        right = DataFrame({'key': []})
        result = merge(left, right, on='key', how='left')
        assert_frame_equal(result, left)
        result = merge(right, left, on='key', how='right')
        assert_frame_equal(result, left)

    def test_merge_nosort(self):
        # #2098, anything to do?
        from datetime import datetime
        d = {"var1": np.random.randint(0, 10, size=10),
             "var2": np.random.randint(0, 10, size=10),
             "var3": [datetime(2012, 1, 12), datetime(2011, 2, 4),
                      datetime(2010, 2, 3), datetime(2012, 1, 12),
                      datetime(2011, 2, 4), datetime(2012, 4, 3),
                      datetime(2012, 3, 4), datetime(2008, 5, 1),
                      datetime(2010, 2, 3), datetime(2012, 2, 3)]}
        df = DataFrame.from_dict(d)
        var3 = df.var3.unique()
        var3.sort()
        new = DataFrame.from_dict({"var3": var3,
                                   "var8": np.random.random(7)})
        result = df.merge(new, on="var3", sort=False)
        exp = merge(df, new, on='var3', sort=False)
        assert_frame_equal(result, exp)
        # sort=False must preserve the left frame's key order
        self.assertTrue((df.var3.unique() == result.var3.unique()).all())

    def test_merge_nan_right(self):
        df1 = DataFrame({"i1": [0, 1], "i2": [0, 1]})
        df2 = DataFrame({"i1": [0], "i3": [0]})
        result = df1.join(df2, on="i1", rsuffix="_")
        expected = DataFrame({'i1': {0: 0.0, 1: 1}, 'i2': {0: 0, 1: 1},
                              'i1_': {0: 0, 1: np.nan}, 'i3': {0: 0.0, 1: np.nan},
                              None: {0: 0, 1: 0}}).set_index(None).reset_index()[['i1', 'i2', 'i1_', 'i3']]
        assert_frame_equal(result, expected, check_dtype=False)
        df1 = DataFrame({"i1": [0, 1], "i2": [0.5, 1.5]})
        df2 = DataFrame({"i1": [0], "i3": [0.7]})
        result = df1.join(df2, rsuffix="_", on='i1')
        expected = DataFrame({'i1': {0: 0, 1: 1}, 'i1_': {0: 0.0, 1: nan},
                              'i2': {0: 0.5, 1: 1.5}, 'i3': {0: 0.69999999999999996,
                                                             1: nan}})[['i1', 'i2', 'i1_', 'i3']]
        assert_frame_equal(result, expected)

    def test_append_dtype_coerce(self):
        # GH 4993
        # appending with datetime will incorrectly convert datetime64
        import datetime as dt
        from pandas import NaT
        df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0),
                                            dt.datetime(2013, 1, 2, 0, 0)],
                        columns=['start_time'])
        df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0),
                                             dt.datetime(2013, 1, 3, 6, 10)],
                                            [dt.datetime(2013, 1, 4, 0, 0),
                                             dt.datetime(2013, 1, 4, 7, 10)]],
                        columns=['start_time', 'end_time'])
        expected = concat([
            Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10), dt.datetime(2013, 1, 4, 7, 10)], name='end_time'),
            Series([dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0), dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 4, 0, 0)], name='start_time'),
        ], axis=1)
        result = df1.append(df2, ignore_index=True)
        assert_frame_equal(result, expected)

    def test_join_append_timedeltas(self):
        import datetime as dt
        from pandas import NaT
        # timedelta64 issues with join/merge
        # GH 5695
        tm._skip_if_not_numpy17_friendly()
        d = {'d': dt.datetime(2013, 11, 5, 5, 56), 't': dt.timedelta(0, 22500)}
        df = DataFrame(columns=list('dt'))
        df = df.append(d, ignore_index=True)
        result = df.append(d, ignore_index=True)
        expected = DataFrame({'d': [dt.datetime(2013, 11, 5, 5, 56),
                                    dt.datetime(2013, 11, 5, 5, 56)],
                              't': [dt.timedelta(0, 22500),
                                    dt.timedelta(0, 22500)]})
        assert_frame_equal(result, expected)
        td = np.timedelta64(300000000)
        lhs = DataFrame(Series([td, td], index=["A", "B"]))
        rhs = DataFrame(Series([td], index=["A"]))
        from pandas import NaT
        result = lhs.join(rhs, rsuffix='r', how="left")
        expected = DataFrame({'0': Series([td, td], index=list('AB')), '0r': Series([td, NaT], index=list('AB'))})
        assert_frame_equal(result, expected)

    def test_overlapping_columns_error_message(self):
        # #2649
        df = DataFrame({'key': [1, 2, 3],
                        'v1': [4, 5, 6],
                        'v2': [7, 8, 9]})
        df2 = DataFrame({'key': [1, 2, 3],
                         'v1': [4, 5, 6],
                         'v2': [7, 8, 9]})
        # duplicate column labels on both sides cannot be disambiguated
        df.columns = ['key', 'foo', 'foo']
        df2.columns = ['key', 'bar', 'bar']
        self.assertRaises(Exception, merge, df, df2)
  663. def _check_merge(x, y):
  664. for how in ['inner', 'left', 'outer']:
  665. result = x.join(y, how=how)
  666. expected = merge(x.reset_index(), y.reset_index(), how=how,
  667. sort=True)
  668. expected = expected.set_index('index')
  669. assert_frame_equal(result, expected, check_names=False) # TODO check_names on merge?
  670. class TestMergeMulti(tm.TestCase):
    def setUp(self):
        # Hierarchical join target: 4 first-level x 3 second-level labels.
        self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                        ['one', 'two', 'three']],
                                labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                        [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                                names=['first', 'second'])
        self.to_join = DataFrame(np.random.randn(10, 3), index=self.index,
                                 columns=['j_one', 'j_two', 'j_three'])
        # a little relevant example with NAs
        # ('snap', 'one') is absent from self.index, so joins produce NaNs
        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
                'qux', 'snap']
        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
                'three', 'one']
        data = np.random.randn(len(key1))
        self.data = DataFrame({'key1': key1, 'key2': key2,
                               'data': data})
  687. def test_merge_on_multikey(self):
  688. joined = self.data.join(self.to_join, on=['key1', 'key2'])
  689. join_key = Index(lzip(self.data['key1'], self.data['key2']))
  690. indexer = self.to_join.index.get_indexer(join_key)
  691. ex_values = self.to_join.values.take(indexer, axis=0)
  692. ex_values[indexer == -1] = np.nan
  693. expected = self.data.join(DataFrame(ex_values,
  694. columns=self.to_join.columns))
  695. # TODO: columns aren't in the same order yet
  696. assert_frame_equal(joined, expected.ix[:, joined.columns])
  697. def test_merge_right_vs_left(self):
  698. # compare left vs right merge with multikey
  699. merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'],
  700. right_index=True, how='left')
  701. merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'],
  702. left_index=True, how='right')
  703. merged2 = merged2.ix[:, merged1.columns]
  704. assert_frame_equal(merged1, merged2)
  705. def test_compress_group_combinations(self):
  706. # ~ 40000000 possible unique groups
  707. key1 = np.array([rands(10) for _ in range(10000)], dtype='O')
  708. key1 = np.tile(key1, 2)
  709. key2 = key1[::-1]
  710. df = DataFrame({'key1': key1, 'key2': key2,
  711. 'value1': np.random.randn(20000)})
  712. df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2],
  713. 'value2': np.random.randn(10000)})
  714. # just to hit the label compression code path
  715. merged = merge(df, df2, how='outer')
    def test_left_join_index_preserve_order(self):
        """Joining on columns against a (partially matching) MultiIndex must
        keep the left frame's row order, filling non-matches with NaN."""

        left = DataFrame({'k1': [0, 1, 2] * 8,
                          'k2': ['foo', 'bar'] * 12,
                          'v': np.array(np.arange(24), dtype=np.int64)})

        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': [5, 7]}, index=index)

        result = left.join(right, on=['k1', 'k2'])

        expected = left.copy()
        expected['v2'] = np.nan
        # only the two (k1, k2) pairs present in `right` pick up a value
        expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
        expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7

        tm.assert_frame_equal(result, expected)

        # test join with multi dtypes blocks (float32 key, int32 values)
        left = DataFrame({'k1': [0, 1, 2] * 8,
                          'k2': ['foo', 'bar'] * 12,
                          'k3': np.array([0, 1, 2] * 8, dtype=np.float32),
                          'v': np.array(np.arange(24), dtype=np.int32)})

        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': [5, 7]}, index=index)

        result = left.join(right, on=['k1', 'k2'])

        expected = left.copy()
        expected['v2'] = np.nan
        expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
        expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7

        tm.assert_frame_equal(result, expected)

        # do a right join for an extra test
        joined = merge(right, left, left_index=True,
                       right_on=['k1', 'k2'], how='right')
        tm.assert_frame_equal(joined.ix[:, expected.columns], expected)
  745. def test_join_multi_dtypes(self):
  746. # test with multi dtypes in the join index
  747. def _test(dtype1,dtype2):
  748. left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1),
  749. 'k2': ['foo', 'bar'] * 12,
  750. 'v': np.array(np.arange(24),dtype=np.int64) })
  751. index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
  752. right = DataFrame({'v2': np.array([5, 7], dtype=dtype2)}, index=index)
  753. result = left.join(right, on=['k1', 'k2'])
  754. expected = left.copy()
  755. if dtype2.kind == 'i':
  756. dtype2 = np.dtype('float64')
  757. expected['v2'] = np.array(np.nan,dtype=dtype2)
  758. expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
  759. expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7
  760. tm.assert_frame_equal(result, expected)
  761. for d1 in [np.int64,np.int32,np.int16,np.int8,np.uint8]:
  762. for d2 in [np.int64,np.float64,np.float32,np.float16]:
  763. _test(np.dtype(d1),np.dtype(d2))
  764. def test_left_merge_na_buglet(self):
  765. left = DataFrame({'id': list('abcde'), 'v1': randn(5),
  766. 'v2': randn(5), 'dummy': list('abcde'),
  767. 'v3': randn(5)},
  768. columns=['id', 'v1', 'v2', 'dummy', 'v3'])
  769. right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan],
  770. 'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]})
  771. merged = merge(left, right, on='id', how='left')
  772. rdf = right.drop(['id'], axis=1)
  773. expected = left.join(rdf)
  774. tm.assert_frame_equal(merged, expected)
  775. def test_merge_na_keys(self):
  776. data = [[1950, "A", 1.5],
  777. [1950, "B", 1.5],
  778. [1955, "B", 1.5],
  779. [1960, "B", np.nan],
  780. [1970, "B", 4.],
  781. [1950, "C", 4.],
  782. [1960, "C", np.nan],
  783. [1965, "C", 3.],
  784. [1970, "C", 4.]]
  785. frame = DataFrame(data, columns=["year", "panel", "data"])
  786. other_data = [[1960, 'A', np.nan],
  787. [1970, 'A', np.nan],
  788. [1955, 'A', np.nan],
  789. [1965, 'A', np.nan],
  790. [1965, 'B', np.nan],
  791. [1955, 'C', np.nan]]
  792. other = DataFrame(other_data, columns=['year', 'panel', 'data'])
  793. result = frame.merge(other, how='outer')
  794. expected = frame.fillna(-999).merge(other.fillna(-999), how='outer')
  795. expected = expected.replace(-999, np.nan)
  796. tm.assert_frame_equal(result, expected)
  797. def test_int64_overflow_issues(self):
  798. # #2690, combinatorial explosion
  799. df1 = DataFrame(np.random.randn(1000, 7),
  800. columns=list('ABCDEF') + ['G1'])
  801. df2 = DataFrame(np.random.randn(1000, 7),
  802. columns=list('ABCDEF') + ['G2'])
  803. # it works!
  804. result = merge(df1, df2, how='outer')
  805. self.assertTrue(len(result) == 2000)
  806. def test_join_multi_levels(self):
  807. # GH 3662
  808. # merge multi-levels
  809. household = DataFrame(dict(household_id = [1,2,3],
  810. male = [0,1,0],
  811. wealth = [196087.3,316478.7,294750]),
  812. columns = ['household_id','male','wealth']).set_index('household_id')
  813. portfolio = DataFrame(dict(household_id = [1,2,2,3,3,3,4],
  814. asset_id = ["nl0000301109","nl0000289783","gb00b03mlx29","gb00b03mlx29","lu0197800237","nl0000289965",np.nan],
  815. name = ["ABN Amro","Robeco","Royal Dutch Shell","Royal Dutch Shell","AAB Eastern Europe Equity Fund","Postbank BioTech Fonds",np.nan],
  816. share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]),
  817. columns = ['household_id','asset_id','name','share']).set_index(['household_id','asset_id'])
  818. result = household.join(portfolio, how='inner')
  819. expected = DataFrame(dict(male = [0,1,1,0,0,0],
  820. wealth = [ 196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0 ],
  821. name = ['ABN Amro','Robeco','Royal Dutch Shell','Royal Dutch Shell','AAB Eastern Europe Equity Fund','Postbank BioTech Fonds'],
  822. share = [1.00,0.40,0.60,0.15,0.60,0.25],
  823. household_id = [1,2,2,3,3,3],
  824. asset_id = ['nl0000301109','nl0000289783','gb00b03mlx29','gb00b03mlx29','lu0197800237','nl0000289965']),
  825. ).set_index(['household_id','asset_id']).reindex(columns=['male','wealth','name','share'])
  826. assert_frame_equal(result,expected)
  827. assert_frame_equal(result,expected)
  828. # equivalency
  829. result2 = merge(household.reset_index(),portfolio.reset_index(),on=['household_id'],how='inner').set_index(['household_id','asset_id'])
  830. assert_frame_equal(result2,expected)
  831. result = household.join(portfolio, how='outer')
  832. expected = concat([expected,DataFrame(dict(share = [1.00]),
  833. index=MultiIndex.from_tuples([(4,np.nan)],
  834. names=['household_id','asset_id']))],
  835. axis=0).reindex(columns=expected.columns)
  836. assert_frame_equal(result,expected)
  837. # invalid cases
  838. household.index.name = 'foo'
  839. def f():
  840. household.join(portfolio, how='inner')
  841. self.assertRaises(ValueError, f)
  842. portfolio2 = portfolio.copy()
  843. portfolio2.index.set_names(['household_id','foo'])
  844. def f():
  845. portfolio2.join(portfolio, how='inner')
  846. self.assertRaises(ValueError, f)
    def test_join_multi_levels2(self):
        # some more advanced merges
        # GH6360

        # holdings per (household_id, asset_id)
        household = DataFrame(dict(
            household_id=[1, 2, 2, 3, 3, 3, 4],
            asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29",
                      "gb00b03mlx29", "lu0197800237", "nl0000289965",
                      np.nan],
            share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]),
            columns=['household_id', 'asset_id', 'share']
        ).set_index(['household_id', 'asset_id'])

        # returns per (asset_id, t)
        log_return = DataFrame(dict(
            asset_id=["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                      "lu0197800237", "lu0197800237"],
            t=[233, 234, 235, 180, 181],
            log_return=[.09604978, -.06524096, .03532373, .03025441, .036997]
        )).set_index(["asset_id", "t"])

        expected = DataFrame(dict(
            household_id=[2, 2, 2, 3, 3, 3, 3, 3],
            asset_id=["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                      "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                      "lu0197800237", "lu0197800237"],
            t=[233, 234, 235, 233, 234, 235, 180, 181],
            share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
            log_return=[.09604978, -.06524096, .03532373, .09604978,
                        -.06524096, .03532373, .03025441, .036997]
        )).set_index(["household_id", "asset_id", "t"]).reindex(
            columns=['share', 'log_return'])

        # joining two MultiIndexes on a partial level overlap is not
        # supported through DataFrame.join...
        def f():
            household.join(log_return, how='inner')
        self.assertRaises(NotImplementedError, f)

        # this is the equivalency
        result = merge(household.reset_index(), log_return.reset_index(),
                       on=['asset_id'], how='inner').set_index(
                           ['household_id', 'asset_id', 't'])
        assert_frame_equal(result, expected)

        expected = DataFrame(dict(
            household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
            asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29",
                      "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29",
                      "gb00b03mlx29", "gb00b03mlx29", "lu0197800237",
                      "lu0197800237", "nl0000289965", None],
            t=[None, None, 233, 234, 235, 233, 234, 235, 180, 181, None,
               None],
            share=[1.0, 0.4, 0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6,
                   0.25, 1.0],
            log_return=[None, None, .09604978, -.06524096, .03532373,
                        .09604978, -.06524096, .03532373, .03025441,
                        .036997, None, None]
        )).set_index(["household_id", "asset_id", "t"])

        # ...and neither is the outer variant
        def f():
            household.join(log_return, how='outer')
        self.assertRaises(NotImplementedError, f)
  882. def _check_join(left, right, result, join_col, how='left',
  883. lsuffix='_x', rsuffix='_y'):
  884. # some smoke tests
  885. for c in join_col:
  886. assert(result[c].notnull().all())
  887. left_grouped = left.groupby(join_col)
  888. right_grouped = right.groupby(join_col)
  889. for group_key, group in result.groupby(join_col):
  890. l_joined = _restrict_to_columns(group, left.columns, lsuffix)
  891. r_joined = _restrict_to_columns(group, right.columns, rsuffix)
  892. try:
  893. lgroup = left_grouped.get_group(group_key)
  894. except KeyError:
  895. if how in ('left', 'inner'):
  896. raise AssertionError('key %s should not have been in the join'
  897. % str(group_key))
  898. _assert_all_na(l_joined, left.columns, join_col)
  899. else:
  900. _assert_same_contents(l_joined, lgroup)
  901. try:
  902. rgroup = right_grouped.get_group(group_key)
  903. except KeyError:
  904. if how in ('right', 'inner'):
  905. raise AssertionError('key %s should not have been in the join'
  906. % str(group_key))
  907. _assert_all_na(r_joined, right.columns, join_col)
  908. else:
  909. _assert_same_contents(r_joined, rgroup)
  910. def _restrict_to_columns(group, columns, suffix):
  911. found = [c for c in group.columns
  912. if c in columns or c.replace(suffix, '') in columns]
  913. # filter
  914. group = group.ix[:, found]
  915. # get rid of suffixes, if any
  916. group = group.rename(columns=lambda x: x.replace(suffix, ''))
  917. # put in the right order...
  918. group = group.ix[:, columns]
  919. return group
  920. def _assert_same_contents(join_chunk, source):
  921. NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly...
  922. jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
  923. svalues = source.fillna(NA_SENTINEL).drop_duplicates().values
  924. rows = set(tuple(row) for row in jvalues)
  925. assert(len(rows) == len(source))
  926. assert(all(tuple(row) in rows for row in svalues))
  927. def _assert_all_na(join_chunk, source_columns, join_col):
  928. for c in source_columns:
  929. if c in join_col:
  930. continue
  931. assert(join_chunk[c].isnull().all())
  932. def _join_by_hand(a, b, how='left'):
  933. join_index = a.index.join(b.index, how=how)
  934. a_re = a.reindex(join_index)
  935. b_re = b.reindex(join_index)
  936. result_columns = a.columns.append(b.columns)
  937. for col, s in compat.iteritems(b_re):
  938. a_re[col] = s
  939. return a_re.reindex(columns=result_columns)
  940. class TestConcatenate(tm.TestCase):
  941. _multiprocess_can_split_ = True
  942. def setUp(self):
  943. self.frame = DataFrame(tm.getSeriesData())
  944. self.mixed_frame = self.frame.copy()
  945. self.mixed_frame['foo'] = 'bar'
    def test_append(self):
        """Exercise DataFrame.append: plain splits, missing columns, mixed
        dtypes, empty frames, integrity checking and brand-new columns."""
        begin_index = self.frame.index[:5]
        end_index = self.frame.index[5:]

        begin_frame = self.frame.reindex(begin_index)
        end_frame = self.frame.reindex(end_index)

        # appending the two halves reassembles the column
        appended = begin_frame.append(end_frame)
        assert_almost_equal(appended['A'], self.frame['A'])

        # a column missing from one side must still appear in the result
        del end_frame['A']
        partial_appended = begin_frame.append(end_frame)
        self.assertIn('A', partial_appended)

        partial_appended = end_frame.append(begin_frame)
        self.assertIn('A', partial_appended)

        # mixed type handling
        appended = self.mixed_frame[:5].append(self.mixed_frame[5:])
        assert_frame_equal(appended, self.mixed_frame)

        # what to test here
        mixed_appended = self.mixed_frame[:5].append(self.frame[5:])
        mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:])

        # all equal except 'foo' column
        assert_frame_equal(
            mixed_appended.reindex(columns=['A', 'B', 'C', 'D']),
            mixed_appended2.reindex(columns=['A', 'B', 'C', 'D']))

        # append empty: must be a no-op, but still return a new object
        empty = DataFrame({})

        appended = self.frame.append(empty)
        assert_frame_equal(self.frame, appended)
        self.assertIsNot(appended, self.frame)

        appended = empty.append(self.frame)
        assert_frame_equal(self.frame, appended)
        self.assertIsNot(appended, self.frame)

        # overlap: duplicate index values fail under verify_integrity
        self.assertRaises(ValueError, self.frame.append, self.frame,
                          verify_integrity=True)

        # new columns
        # GH 6129: appending a Series row carrying a new label 'c'
        df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}})
        row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z')
        expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5},
                              'b': {'x': 3, 'y': 4, 'z': 6},
                              'c': {'z': 7}})
        result = df.append(row)
        assert_frame_equal(result, expected)
  986. def test_append_length0_frame(self):
  987. df = DataFrame(columns=['A', 'B', 'C'])
  988. df3 = DataFrame(index=[0, 1], columns=['A', 'B'])
  989. df5 = df.append(df3)
  990. expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C'])
  991. assert_frame_equal(df5, expected)
  992. def test_append_records(self):
  993. arr1 = np.zeros((2,), dtype=('i4,f4,a10'))
  994. arr1[:] = [(1, 2., 'Hello'), (2, 3., "World")]
  995. arr2 = np.zeros((3,), dtype=('i4,f4,a10'))
  996. arr2[:] = [(3, 4., 'foo'),
  997. (5, 6., "bar"),
  998. (7., 8., 'baz')]
  999. df1 = DataFrame(arr1)
  1000. df2 = DataFrame(arr2)
  1001. result = df1.append(df2, ignore_index=True)
  1002. expected = DataFrame(np.concatenate((arr1, arr2)))
  1003. assert_frame_equal(result, expected)
  1004. def test_append_different_columns(self):
  1005. df = DataFrame({'bools': np.random.randn(10) > 0,
  1006. 'ints': np.random.randint(0, 10, 10),
  1007. 'floats': np.random.randn(10),
  1008. 'strings': ['foo', 'bar'] * 5})
  1009. a = df[:5].ix[:, ['bools', 'ints', 'floats']]
  1010. b = df[5:].ix[:, ['strings', 'ints', 'floats']]
  1011. appended = a.append(b)
  1012. self.assertTrue(isnull(appended['strings'][0:4]).all())
  1013. self.assertTrue(isnull(appended['bools'][5:]).all())
  1014. def test_append_many(self):
  1015. chunks = [self.frame[:5], self.frame[5:10],
  1016. self.frame[10:15], self.frame[15:]]
  1017. result = chunks[0].append(chunks[1:])
  1018. tm.assert_frame_equal(result, self.frame)
  1019. chunks[-1]['foo'] = 'bar'
  1020. result = chunks[0].append(chunks[1:])
  1021. tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame)
  1022. self.assertTrue((result['foo'][15:] == 'bar').all())
  1023. self.assertTrue(result['foo'][:15].isnull().all())
  1024. def test_append_preserve_index_name(self):
  1025. # #980
  1026. df1 = DataFrame(data=None, columns=['A', 'B', 'C'])
  1027. df1 = df1.set_index(['A'])
  1028. df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]],
  1029. columns=['A', 'B', 'C'])
  1030. df2 = df2.set_index(['A'])
  1031. result = df1.append(df2)
  1032. self.assertEqual(result.index.name, 'A')
  1033. def test_join_many(self):
  1034. df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
  1035. df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]
  1036. joined = df_list[0].join(df_list[1:])
  1037. tm.assert_frame_equal(joined, df)
  1038. df_list = [df[['a', 'b']][:-2],
  1039. df[['c', 'd']][2:], df[['e', 'f']][1:9]]
  1040. def _check_diff_index(df_list, result, exp_index):
  1041. reindexed = [x.reindex(exp_index) for x in df_list]
  1042. expected = reindexed[0].join(reindexed[1:])
  1043. tm.assert_frame_equal(result, expected)
  1044. # different join types
  1045. joined = df_list[0].join(df_list[1:], how='outer')
  1046. _check_diff_index(df_list, joined, df.index)
  1047. joined = df_list[0].join(df_list[1:])
  1048. _check_diff_index(df_list, joined, df_list[0].index)
  1049. joined = df_list[0].join(df_list[1:], how='inner')
  1050. _check_diff_index(df_list, joined, df.index[2:8])
  1051. self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a')
  1052. def test_join_many_mixed(self):
  1053. df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
  1054. df['key'] = ['foo', 'bar'] * 4
  1055. df1 = df.ix[:, ['A', 'B']]
  1056. df2 = df.ix[:, ['C', 'D']]
  1057. df3 = df.ix[:, ['key']]
  1058. result = df1.join([df2, df3])
  1059. assert_frame_equal(result, df)
  1060. def test_append_missing_column_proper_upcast(self):
  1061. df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')})
  1062. df2 = DataFrame({'B': np.array([True, False, True, False],
  1063. dtype=bool)})
  1064. appended = df1.append(df2, ignore_index=True)
  1065. self.assertEqual(appended['A'].dtype, 'f8')
  1066. self.assertEqual(appended['B'].dtype, 'O')
  1067. def test_concat_with_group_keys(self):
  1068. df = DataFrame(np.random.randn(4, 3))
  1069. df2 = DataFrame(np.random.randn(4, 4))
  1070. # axis=0
  1071. df = DataFrame(np.random.randn(3, 4))
  1072. df2 = DataFrame(np.random.randn(4, 4))
  1073. result = concat([df, df2], keys=[0, 1])
  1074. exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1],
  1075. [0, 1, 2, 0, 1, 2, 3]])
  1076. expected = DataFrame(np.r_[df.values, df2.values],
  1077. index=exp_index)
  1078. tm.assert_frame_equal(result, expected)
  1079. result = concat([df, df], keys=[0, 1])
  1080. exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
  1081. [0, 1, 2, 0, 1, 2]])
  1082. expected = DataFrame(np.r_[df.values, df.values],
  1083. index=exp_index2)
  1084. tm.assert_frame_equal(result, expected)
  1085. # axis=1
  1086. df = DataFrame(np.random.randn(4, 3))
  1087. df2 = DataFrame(np.random.randn(4, 4))
  1088. result = concat([df, df2], keys=[0, 1], axis=1)
  1089. expected = DataFrame(np.c_[df.values, df2.values],
  1090. columns=exp_index)
  1091. tm.assert_frame_equal(result, expected)
  1092. result = concat([df, df], keys=[0, 1], axis=1)
  1093. expected = DataFrame(np.c_[df.values, df.values],
  1094. columns=exp_index2)
  1095. tm.assert_frame_equal(result, expected)
  1096. def test_concat_keys_specific_levels(self):
  1097. df = DataFrame(np.random.randn(10, 4))
  1098. pieces = [df.ix[:, [0, 1]], df.ix[:, [2]], df.ix[:, [3]]]
  1099. level = ['three', 'two', 'one', 'zero']
  1100. result = concat(pieces, axis=1, keys=['one', 'two', 'three'],
  1101. levels=[level],
  1102. names=['group_key'])
  1103. self.assert_numpy_array_equal(result.columns.levels[0], level)
  1104. self.assertEqual(result.columns.names[0], 'group_key')
  1105. def test_concat_dataframe_keys_bug(self):
  1106. t1 = DataFrame({'value': Series([1, 2, 3],
  1107. index=Index(['a', 'b', 'c'], name='id'))})
  1108. t2 = DataFrame({'value': Series([7, 8],
  1109. index=Index(['a', 'b'], name='id'))})
  1110. # it works
  1111. result = concat([t1, t2], axis=1, keys=['t1', 't2'])
  1112. self.assertEqual(list(result.columns), [('t1', 'value'),
  1113. ('t2', 'value')])
  1114. def test_concat_dict(self):
  1115. frames = {'foo': DataFrame(np.random.randn(4, 3)),
  1116. 'bar': DataFrame(np.random.randn(4, 3)),
  1117. 'baz': DataFrame(np.random.randn(4, 3)),
  1118. 'qux': DataFrame(np.random.randn(4, 3))}
  1119. sorted_keys = sorted(frames)
  1120. result = concat(frames)
  1121. expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys)
  1122. tm.assert_frame_equal(result, expected)
  1123. result = concat(frames, axis=1)
  1124. expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys,
  1125. axis=1)
  1126. tm.assert_frame_equal(result, expected)
  1127. keys = ['baz', 'foo', 'bar']
  1128. result = concat(frames, keys=keys)
  1129. expected = concat([frames[k] for k in keys], keys=keys)
  1130. tm.assert_frame_equal(result, expected)
  1131. def test_concat_ignore_index(self):
  1132. frame1 = DataFrame({"test1": ["a", "b", "c"],
  1133. "test2": [1, 2, 3],
  1134. "test3": [4.5, 3.2, 1.2]})
  1135. frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]})
  1136. frame1.index = Index(["x", "y", "z"])
  1137. frame2.index = Index(["x", "y", "q"])
  1138. v1 = concat([frame1, frame2], axis=1, ignore_index=True)
  1139. nan = np.nan
  1140. expected = DataFrame([[nan, nan, nan, 4.3],
  1141. ['a', 1, 4.5, 5.2],
  1142. ['b', 2, 3.2, 2.2],
  1143. ['c', 3, 1.2, nan]],
  1144. index=Index(["q", "x", "y", "z"]))
  1145. tm.assert_frame_equal(v1, expected)
  1146. def test_concat_multiindex_with_keys(self):
  1147. index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
  1148. ['one', 'two', 'three']],
  1149. labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
  1150. [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
  1151. names=['first', 'second'])
  1152. frame = DataFrame(np.random.randn(10, 3), index=index,
  1153. columns=Index(['A', 'B', 'C'], name='exp'))
  1154. result = concat([frame, frame], keys=[0, 1], names=['iteration'])
  1155. self.assertEqual(result.index.names, ('iteration',) + index.names)
  1156. tm.assert_frame_equal(result.ix[0], frame)
  1157. tm.assert_frame_equal(result.ix[1], frame)
  1158. self.assertEqual(result.index.nlevels, 3)
  1159. def test_concat_multiindex_with_tz(self):
  1160. # GH 6606
  1161. df = DataFrame({'dt': [datetime(2014, 1, 1),
  1162. datetime(2014, 1, 2),
  1163. datetime(2014, 1, 3)],
  1164. 'b': ['A', 'B', 'C'],
  1165. 'c': [1, 2, 3], 'd': [4, 5, 6]})
  1166. df['dt'] = df['dt'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
  1167. df = df.set_index(['dt', 'b'])
  1168. exp_idx1 = pd.DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03'] * 2,
  1169. tz='US/Pacific', name='dt')
  1170. exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b')
  1171. exp_idx = pd.MultiIndex.from_arrays([exp_idx1, exp_idx2])
  1172. expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2},
  1173. index=exp_idx, columns=['c', 'd'])
  1174. result = concat([df, df])
  1175. tm.assert_frame_equal(result, expected)
  1176. def test_concat_keys_and_levels(self):
  1177. df = DataFrame(np.random.randn(1, 3))
  1178. df2 = DataFrame(np.random.randn(1, 4))
  1179. levels = [['foo', 'baz'], ['one', 'two']]
  1180. names = ['first', 'second']
  1181. result = concat([df, df2, df, df2],
  1182. keys=[('foo', 'one'), ('foo', 'two'),
  1183. ('baz', 'one'), ('baz', 'two')],
  1184. levels=levels,
  1185. names=names)
  1186. expected = concat([df, df2, df, df2])
  1187. exp_index = MultiIndex(levels=levels + [[0]],
  1188. labels=[[0, 0, 1, 1], [0, 1, 0, 1],
  1189. [0, 0, 0, 0]],
  1190. names=names + [None])
  1191. expected.index = exp_index
  1192. assert_frame_equal(result, expected)
  1193. # no names
  1194. result = concat([df, df2, df, df2],
  1195. keys=[('foo', 'one'), ('foo', 'two'),
  1196. ('baz', 'one'), ('baz', 'two')],
  1197. levels=levels)
  1198. self.assertEqual(result.index.names, (None,) * 3)
  1199. # no levels
  1200. result = concat([df, df2, df, df2],
  1201. keys=[('foo', 'one'), ('foo', 'two'),
  1202. ('baz', 'one'), ('baz', 'two')],
  1203. names=['first', 'second'])
  1204. self.assertEqual(result.index.names, ('first', 'second') + (None,))
  1205. self.assert_numpy_array_equal(result.index.levels[0], ['baz', 'foo'])
  1206. def test_concat_keys_levels_no_overlap(self):
  1207. # GH #1406
  1208. df = DataFrame(np.random.randn(1, 3), index=['a'])
  1209. df2 = DataFrame(np.random.randn(1, 4), index=['b'])
  1210. self.assertRaises(ValueError, concat, [df, df],
  1211. keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
  1212. self.assertRaises(ValueError, concat, [df, df2],
  1213. keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
  1214. def test_concat_rename_index(self):
  1215. a = DataFrame(np.random.rand(3, 3),
  1216. columns=list('ABC'),
  1217. index=Index(list('abc'), name='index_a'))
  1218. b = DataFrame(np.random.rand(3, 3),
  1219. columns=list('ABC'),
  1220. index=Index(list('abc'), name='index_b'))
  1221. result = concat([a, b], keys=['key0', 'key1'],
  1222. names=['lvl0', 'lvl1'])
  1223. exp = concat([a, b], keys=['key0', 'key1'], names=['lvl0'])
  1224. names = list(exp.index.names)
  1225. names[1] = 'lvl1'
  1226. exp.index.set_names(names, inplace=True)
  1227. tm.assert_frame_equal(result, exp)
  1228. self.assertEqual(result.index.names, exp.index.names)
  1229. def test_crossed_dtypes_weird_corner(self):
  1230. columns = ['A', 'B', 'C', 'D']
  1231. df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='f8'),
  1232. 'B': np.array([1, 2, 3, 4], dtype='i8'),
  1233. 'C': np.array([1, 2, 3, 4], dtype='f8'),
  1234. 'D': np.array([1, 2, 3, 4], dtype='i8')},
  1235. columns=columns)
  1236. df2 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8'),
  1237. 'B': np.array([1, 2, 3, 4], dtype='f8'),
  1238. 'C': np.array([1, 2, 3, 4], dtype='i8'),
  1239. 'D': np.array([1, 2, 3, 4], dtype='f8')},
  1240. columns=columns)
  1241. appended = df1.append(df2, ignore_index=True)
  1242. expected = DataFrame(np.concatenate([df1.values, df2.values], axis=0),
  1243. columns=columns)
  1244. tm.assert_frame_equal(appended, expected)
  1245. df = DataFrame(np.random.randn(1, 3), index=['a'])
  1246. df2 = DataFrame(np.random.randn(1, 4), index=['b'])
  1247. result = concat(
  1248. [df, df2], keys=['one', 'two'], names=['first', 'second'])
  1249. self.assertEqual(result.index.names, ('first', 'second'))
  1250. def test_dups_index(self):
  1251. # GH 4771
  1252. # single dtypes
  1253. df = DataFrame(np.random.randint(0,10,size=40).reshape(10,4),columns=['A','A','C','C'])
  1254. result = concat([df,df],axis=1)
  1255. assert_frame_equal(result.iloc[:,:4],df)
  1256. assert_frame_equal(result.iloc[:,4:],df)
  1257. result = concat([df,df],axis=0)
  1258. assert_frame_equal(result.iloc[:10],df)
  1259. assert_frame_equal(result.iloc[10:],df)
  1260. # multi dtypes
  1261. df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
  1262. DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])],
  1263. axis=1)
  1264. result = concat([df,df],axis=1)
  1265. assert_frame_equal(result.iloc[:,:6],df)
  1266. assert_frame_equal(result.iloc[:,6:],df)
  1267. result = concat([df,df],axis=0)
  1268. assert_frame_equal(result.iloc[:10],df)
  1269. assert_frame_equal(result.iloc[10:],df)
  1270. # append
  1271. result = df.iloc[0:8,:].append(df.iloc[8:])
  1272. assert_frame_equal(result, df)
  1273. result = df.iloc[0:8,:].append(df.iloc[8:9]).append(df.iloc[9:10])
  1274. assert_frame_equal(result, df)
  1275. expected = concat([df,df],axis=0)
  1276. result = df.append(df)
  1277. assert_frame_equal(result, expected)
  1278. def test_join_dups(self):
  1279. # joining dups
  1280. df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']),
  1281. DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])],
  1282. axis=1)
  1283. expected = concat([df,df],axis=1)
  1284. result = df.join(df,rsuffix='_2')
  1285. result.columns = expected.columns
  1286. assert_frame_equal(result, expected)
  1287. # GH 4975, invalid join on dups
  1288. w = DataFrame(np.random.randn(4,2), columns=["x", "y"])
  1289. x = DataFrame(np.random.randn(4,2), columns=["x", "y"])
  1290. y = DataFrame(np.random.randn(4,2), columns=["x", "y"])
  1291. z = DataFrame(np.random.randn(4,2), columns=["x", "y"])
  1292. dta = x.merge(y, left_index=True, right_index=True).merge(z, left_index=True, right_index=True, how="outer")
  1293. dta = dta.merge(w, left_index=True, right_index=True)
  1294. expected = concat([x,y,z,w],axis=1)
  1295. expected.columns=['x_x','y_x','x_y','y_y','x_x','y_x','x_y','y_y']
  1296. assert_frame_equal(dta,expected)
  1297. def test_handle_empty_objects(self):
  1298. df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))
  1299. baz = df[:5]
  1300. baz['foo'] = 'bar'
  1301. empty = df[5:5]
  1302. frames = [baz, empty, empty, df[5:]]
  1303. concatted = concat(frames, axis=0)
  1304. expected = df.ix[:, ['a', 'b', 'c', 'd', 'foo']]
  1305. expected['foo'] = expected['foo'].astype('O')
  1306. expected['foo'][:5] = 'bar'
  1307. tm.assert_frame_equal(concatted, expected)
  1308. # empty as first element with time series
  1309. # GH3259
  1310. df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s'))
  1311. empty = DataFrame()
  1312. result = concat([df,empty],axis=1)
  1313. assert_frame_equal(result, df)
  1314. result = concat([empty,df],axis=1)
  1315. assert_frame_equal(result, df)
  1316. result = concat([df,empty])
  1317. assert_frame_equal(result, df)
  1318. result = concat([empty,df])
  1319. assert_frame_equal(result, df)
    def test_concat_mixed_objs(self):
        # concat mixed series/frames
        # G2385

        # axis 1
        index = date_range('01-Jan-2013', periods=10, freq='H')
        arr = np.arange(10, dtype='int64')
        s1 = Series(arr, index=index)
        s2 = Series(arr, index=index)
        df = DataFrame(arr.reshape(-1, 1), index=index)

        # frame + frame keeps the duplicated column label
        expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2),
                             index=index, columns=[0, 0])
        result = concat([df, df], axis=1)
        assert_frame_equal(result, expected)

        # unnamed series get consecutive integer column labels
        expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2),
                             index=index, columns=[0, 1])
        result = concat([s1, s2], axis=1)
        assert_frame_equal(result, expected)

        expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
                             index=index, columns=[0, 1, 2])
        result = concat([s1, s2, s1], axis=1)
        assert_frame_equal(result, expected)

        expected = DataFrame(np.repeat(arr, 5).reshape(-1, 5),
                             index=index, columns=[0, 0, 1, 2, 3])
        result = concat([s1, df, s2, s2, s1], axis=1)
        assert_frame_equal(result, expected)

        # with names: a named series contributes its name as the label
        s1.name = 'foo'
        expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
                             index=index, columns=['foo', 0, 0])
        result = concat([s1, df, s2], axis=1)
        assert_frame_equal(result, expected)

        s2.name = 'bar'
        expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
                             index=index, columns=['foo', 0, 'bar'])
        result = concat([s1, df, s2], axis=1)
        assert_frame_equal(result, expected)

        # ignore index: names/labels are discarded entirely
        expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
                             index=index, columns=[0, 1, 2])
        result = concat([s1, df, s2], axis=1, ignore_index=True)
        assert_frame_equal(result, expected)

        # axis 0
        expected = DataFrame(np.tile(arr, 3).reshape(-1, 1),
                             index=index.tolist() * 3, columns=[0])
        result = concat([s1, df, s2])
        assert_frame_equal(result, expected)

        expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0])
        result = concat([s1, df, s2], ignore_index=True)
        assert_frame_equal(result, expected)

        # invalid concatenate of mixed dims
        panel = tm.makePanel()
        self.assertRaises(ValueError, lambda: concat([panel, s1], axis=1))
    def test_panel_join(self):
        """Panel.join of two overlapping slices of a common source
        panel, covering left/right/inner/outer joins."""
        panel = tm.makePanel()
        tm.add_nans(panel)

        p1 = panel.ix[:2, :10, :3]
        p2 = panel.ix[2:, 5:, 2:]

        # left join
        result = p1.join(p2)
        expected = p1.copy()
        expected['ItemC'] = p2['ItemC']
        tm.assert_panel_equal(result, expected)

        # right join
        result = p1.join(p2, how='right')
        expected = p2.copy()
        expected['ItemA'] = p1['ItemA']
        expected['ItemB'] = p1['ItemB']
        # match the item ordering produced by the join
        expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
        tm.assert_panel_equal(result, expected)

        # inner join
        result = p1.join(p2, how='inner')
        expected = panel.ix[:, 5:10, 2:3]
        tm.assert_panel_equal(result, expected)

        # outer join
        result = p1.join(p2, how='outer')
        expected = p1.reindex(major=panel.major_axis,
                              minor=panel.minor_axis)
        expected = expected.join(p2.reindex(major=panel.major_axis,
                                            minor=panel.minor_axis))
        tm.assert_panel_equal(result, expected)
    def test_panel_join_overlap(self):
        """Panel.join with overlapping items: lsuffix/rsuffix are
        applied only to the items present in both operands."""
        panel = tm.makePanel()
        tm.add_nans(panel)

        p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']]
        p2 = panel.ix[['ItemB', 'ItemC']]

        # Expected index is
        #
        # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
        joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
        p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1')
        p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2')
        no_overlap = panel.ix[['ItemA']]
        expected = no_overlap.join(p1_suf.join(p2_suf))
        tm.assert_panel_equal(joined, expected)
    def test_panel_join_many(self):
        """Panel.join against a list of panels (inner/outer), plus
        error cases: suffixes and how='right' are rejected for
        multi-panel joins."""
        # NOTE(review): temporarily bumps the tm.K module global so
        # makePanel produces 10 items, then restores it to 4
        tm.K = 10
        panel = tm.makePanel()
        tm.K = 4

        panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]]

        # disjoint item splits should round-trip to the original
        joined = panels[0].join(panels[1:])
        tm.assert_panel_equal(joined, panel)

        panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]]

        data_dict = {}
        for p in panels:
            data_dict.update(compat.iteritems(p))

        joined = panels[0].join(panels[1:], how='inner')
        expected = Panel.from_dict(data_dict, intersect=True)
        tm.assert_panel_equal(joined, expected)

        joined = panels[0].join(panels[1:], how='outer')
        expected = Panel.from_dict(data_dict, intersect=False)
        tm.assert_panel_equal(joined, expected)

        # edge cases
        self.assertRaises(ValueError, panels[0].join, panels[1:],
                          how='outer', lsuffix='foo', rsuffix='bar')
        self.assertRaises(ValueError, panels[0].join, panels[1:],
                          how='right')
    def test_panel_concat_other_axes(self):
        """concat of panel slices along the major (axis=1) and minor
        (axis=2) axes reassembles the original panel; mixed dtypes
        upcast to object."""
        panel = tm.makePanel()

        p1 = panel.ix[:, :5, :]
        p2 = panel.ix[:, 5:, :]

        result = concat([p1, p2], axis=1)
        tm.assert_panel_equal(result, panel)

        p1 = panel.ix[:, :, :2]
        p2 = panel.ix[:, :, 2:]

        result = concat([p1, p2], axis=2)
        tm.assert_panel_equal(result, panel)

        # if things are a bit misbehaved
        p1 = panel.ix[:2, :, :2]
        p2 = panel.ix[:, :, 2:]
        p1['ItemC'] = 'baz'

        result = concat([p1, p2], axis=2)

        expected = panel.copy()
        # mixing string data into ItemC forces object dtype
        expected['ItemC'] = expected['ItemC'].astype('O')
        expected.ix['ItemC', :, :2] = 'baz'
        tm.assert_panel_equal(result, expected)
    def test_panel_concat_buglet(self):
        # #2257: concat along axis=1 with fully renamed (disjoint)
        # major/minor labels must pass verify_integrity
        def make_panel():
            # build a small 3-item panel of random frames
            index = 5
            cols = 3

            def df():
                return DataFrame(np.random.randn(index, cols),
                                 index=["I%s" % i for i in range(index)],
                                 columns=["C%s" % i for i in range(cols)])
            return Panel(dict([("Item%s" % x, df()) for x in ['A', 'B', 'C']]))

        panel1 = make_panel()
        panel2 = make_panel()

        panel2 = panel2.rename_axis(dict([(x, "%s_1" % x)
                                          for x in panel2.major_axis]),
                                    axis=1)

        # rename both major and minor labels so they do not overlap
        # with panel1's
        panel3 = panel2.rename_axis(lambda x: '%s_1' % x, axis=1)
        panel3 = panel3.rename_axis(lambda x: '%s_1' % x, axis=2)

        # it works!
        concat([panel1, panel3], axis=1, verify_integrity=True)
  1466. def test_panel4d_concat(self):
  1467. p4d = tm.makePanel4D()
  1468. p1 = p4d.ix[:, :, :5, :]
  1469. p2 = p4d.ix[:, :, 5:, :]
  1470. result = concat([p1, p2], axis=2)
  1471. tm.assert_panel4d_equal(result, p4d)
  1472. p1 = p4d.ix[:, :, :, :2]
  1473. p2 = p4d.ix[:, :, :, 2:]
  1474. result = concat([p1, p2], axis=3)
  1475. tm.assert_panel4d_equal(result, p4d)
    def test_panel4d_concat_mixed_type(self):
        """concat of Panel4Ds with mismatched labels/dtypes (axis=3)."""
        p4d = tm.makePanel4D()

        # if things are a bit misbehaved
        p1 = p4d.ix[:, :2, :, :2]
        p2 = p4d.ix[:, :, :, 2:]
        p1['L5'] = 'baz'

        result = concat([p1, p2], axis=3)

        # build the expectation by giving p2 an explicit NaN 'L5' label
        # (AFTER the first concat) so both operands share labels, then
        # align to the result's label order
        p2['L5'] = np.nan
        expected = concat([p1, p2], axis=3)
        expected = expected.ix[result.labels]

        tm.assert_panel4d_equal(result, expected)
    def test_concat_series(self):
        """concat of series pieces round-trips the original (name
        included) and builds a MultiIndex when keys are given."""
        ts = tm.makeTimeSeries()
        ts.name = 'foo'

        pieces = [ts[:5], ts[5:15], ts[15:]]

        result = concat(pieces)
        tm.assert_series_equal(result, ts)
        self.assertEqual(result.name, ts.name)

        result = concat(pieces, keys=[0, 1, 2])
        expected = ts.copy()

        # normalize the index dtype to datetime64[ns] before using it
        # as a MultiIndex level (expected's index is replaced below,
        # so mutating ts here is safe)
        ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]'))

        exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]),
                      np.arange(len(ts))]
        exp_index = MultiIndex(levels=[[0, 1, 2], ts.index],
                               labels=exp_labels)
        expected.index = exp_index
        tm.assert_series_equal(result, expected)
  1503. def test_concat_series_axis1(self):
  1504. ts = tm.makeTimeSeries()
  1505. pieces = [ts[:-2], ts[2:], ts[2:-2]]
  1506. result = concat(pieces, axis=1)
  1507. expected = DataFrame(pieces).T
  1508. assert_frame_equal(result, expected)
  1509. result = concat(pieces, keys=['A', 'B', 'C'], axis=1)
  1510. expected = DataFrame(pieces, index=['A', 'B', 'C']).T
  1511. assert_frame_equal(result, expected)
  1512. # preserve series names, #2489
  1513. s = Series(randn(5), name='A')
  1514. s2 = Series(randn(5), name='B')
  1515. result = concat([s, s2], axis=1)
  1516. expected = DataFrame({'A': s, 'B': s2})
  1517. assert_frame_equal(result, expected)
  1518. s2.name = None
  1519. result = concat([s, s2], axis=1)
  1520. self.assertTrue(np.array_equal(result.columns, lrange(2)))
  1521. # must reindex, #2603
  1522. s = Series(randn(3), index=['c', 'a', 'b'], name='A')
  1523. s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B')
  1524. result = concat([s, s2], axis=1)
  1525. expected = DataFrame({'A': s, 'B': s2})
  1526. assert_frame_equal(result, expected)
  1527. def test_concat_single_with_key(self):
  1528. df = DataFrame(np.random.randn(10, 4))
  1529. result = concat([df], keys=['foo'])
  1530. expected = concat([df, df], keys=['foo', 'bar'])
  1531. tm.assert_frame_equal(result, expected[:10])
  1532. def test_concat_exclude_none(self):
  1533. df = DataFrame(np.random.randn(10, 4))
  1534. pieces = [df[:5], None, None, df[5:]]
  1535. result = concat(pieces)
  1536. tm.assert_frame_equal(result, df)
  1537. self.assertRaises(Exception, concat, [None, None])
  1538. def test_concat_datetime64_block(self):
  1539. from pandas.tseries.index import date_range
  1540. rng = date_range('1/1/2000', periods=10)
  1541. df = DataFrame({'time': rng})
  1542. result = concat([df, df])
  1543. self.assertTrue((result.iloc[:10]['time'] == rng).all())
  1544. self.assertTrue((result.iloc[10:]['time'] == rng).all())
  1545. def test_concat_timedelta64_block(self):
  1546. # not friendly for < 1.7
  1547. tm._skip_if_not_numpy17_friendly()
  1548. from pandas import to_timedelta
  1549. rng = to_timedelta(np.arange(10),unit='s')
  1550. df = DataFrame({'time': rng})
  1551. result = concat([df, df])
  1552. self.assertTrue((result.iloc[:10]['time'] == rng).all())
  1553. self.assertTrue((result.iloc[10:]['time'] == rng).all())
  1554. def test_concat_keys_with_none(self):
  1555. # #1649
  1556. df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]])
  1557. result = concat(dict(a=None, b=df0, c=df0[:2], d=df0[:1], e=df0))
  1558. expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0))
  1559. tm.assert_frame_equal(result, expected)
  1560. result = concat([None, df0, df0[:2], df0[:1], df0],
  1561. keys=['a', 'b', 'c', 'd', 'e'])
  1562. expected = concat([df0, df0[:2], df0[:1], df0],
  1563. keys=['b', 'c', 'd', 'e'])
  1564. tm.assert_frame_equal(result, expected)
  1565. def test_concat_bug_1719(self):
  1566. ts1 = tm.makeTimeSeries()
  1567. ts2 = tm.makeTimeSeries()[::2]
  1568. ## to join with union
  1569. ## these two are of different length!
  1570. left = concat([ts1, ts2], join='outer', axis=1)
  1571. right = concat([ts2, ts1], join='outer', axis=1)
  1572. self.assertEqual(len(left), len(right))
  1573. def test_concat_bug_2972(self):
  1574. ts0 = Series(np.zeros(5))
  1575. ts1 = Series(np.ones(5))
  1576. ts0.name = ts1.name = 'same name'
  1577. result = concat([ts0, ts1], axis=1)
  1578. expected = DataFrame({0: ts0, 1: ts1})
  1579. expected.columns=['same name', 'same name']
  1580. assert_frame_equal(result, expected)
    def test_concat_bug_3602(self):
        """Axis-1 concat keeps duplicate column names from distinct
        inputs (GH 3602)."""
        # GH 3602, duplicate columns
        df1 = DataFrame({'firmNo' : [0,0,0,0], 'stringvar' : ['rrr', 'rrr', 'rrr', 'rrr'], 'prc' : [6,6,6,6] })
        df2 = DataFrame({'misc' : [1,2,3,4], 'prc' : [6,6,6,6], 'C' : [9,10,11,12]})
        expected = DataFrame([[0,6,'rrr',9,1,6],
                              [0,6,'rrr',10,2,6],
                              [0,6,'rrr',11,3,6],
                              [0,6,'rrr',12,4,6]])
        # 'prc' appears in both inputs and must survive twice
        expected.columns = ['firmNo','prc','stringvar','C','misc','prc']

        result = concat([df1,df2],axis=1)
        assert_frame_equal(result,expected)
  1592. def test_concat_series_axis1_same_names_ignore_index(self):
  1593. dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1]
  1594. s1 = Series(randn(len(dates)), index=dates, name='value')
  1595. s2 = Series(randn(len(dates)), index=dates, name='value')
  1596. result = concat([s1, s2], axis=1, ignore_index=True)
  1597. self.assertTrue(np.array_equal(result.columns, [0, 1]))
  1598. def test_concat_invalid(self):
  1599. # trying to concat a ndframe with a non-ndframe
  1600. df1 = mkdf(10, 2)
  1601. for obj in [1, dict(), [1, 2], (1, 2) ]:
  1602. self.assertRaises(TypeError, lambda x: concat([ df1, obj ]))
    def test_concat_invalid_first_argument(self):
        """concat's first argument must be an iterable of NDFrames:
        two frames passed positionally raise TypeError, while
        generators and text readers (GH6583) are accepted."""
        df1 = mkdf(10, 2)
        df2 = mkdf(10, 2)
        self.assertRaises(TypeError, concat, df1, df2)

        # generator ok though
        concat(DataFrame(np.random.rand(5,5)) for _ in range(3))

        # text reader ok
        # GH6583
        data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

        reader = read_csv(StringIO(data), chunksize=1)
        result = concat(reader, ignore_index=True)
        expected = read_csv(StringIO(data))
        assert_frame_equal(result,expected)
class TestOrderedMerge(tm.TestCase):
    """Tests for pandas.tools.merge.ordered_merge."""

    def setUp(self):
        # small left/right frames with partially overlapping keys
        self.left = DataFrame({'key': ['a', 'c', 'e'],
                               'lvalue': [1, 2., 3]})

        self.right = DataFrame({'key': ['b', 'c', 'd', 'f'],
                                'rvalue': [1, 2, 3., 4]})

    # GH #813

    def test_basic(self):
        """Result covers the union of keys; NaN where a side is absent."""
        result = ordered_merge(self.left, self.right, on='key')
        expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
                              'lvalue': [1, nan, 2, nan, 3, nan],
                              'rvalue': [nan, 1, 2, 3, nan, 4]})

        assert_frame_equal(result, expected)

    def test_ffill(self):
        """fill_method='ffill' forward-fills each side's values."""
        result = ordered_merge(
            self.left, self.right, on='key', fill_method='ffill')
        expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'],
                              'lvalue': [1., 1, 2, 2, 3, 3.],
                              'rvalue': [nan, 1, 2, 3, 3, 4]})
        assert_frame_equal(result, expected)

    def test_multigroup(self):
        """left_by/right_by repeat the ordered merge per group."""
        left = concat([self.left, self.left], ignore_index=True)
        # right = concat([self.right, self.right], ignore_index=True)

        left['group'] = ['a'] * 3 + ['b'] * 3
        # right['group'] = ['a'] * 4 + ['b'] * 4

        result = ordered_merge(left, self.right, on='key', left_by='group',
                               fill_method='ffill')
        expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2,
                              'lvalue': [1., 1, 2, 2, 3, 3.] * 2,
                              'rvalue': [nan, 1, 2, 3, 3, 4] * 2})
        expected['group'] = ['a'] * 6 + ['b'] * 6

        assert_frame_equal(result, expected.ix[:, result.columns])

        # swapping sides with right_by must give the same data
        result2 = ordered_merge(self.right, left, on='key', right_by='group',
                                fill_method='ffill')
        assert_frame_equal(result, result2.ix[:, result.columns])

        # the group column itself should never be null
        result = ordered_merge(left, self.right, on='key', left_by='group')
        self.assertTrue(result['group'].notnull().all())
if __name__ == '__main__':
    # Run this module's tests under nose when executed directly,
    # dropping into pdb on errors/failures.
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)