PageRenderTime 31ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 1ms

/pandas/tools/tests/test_merge.py

http://github.com/pydata/pandas
Python | 2158 lines | 1849 code | 260 blank | 49 comment | 24 complexity | 9d08c743c46fdbf243d91045fac3b860 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0

Large files are truncated, but you can click here to view the full file

  1. # pylint: disable=E1103
  2. import nose
  3. from datetime import datetime
  4. from numpy.random import randn
  5. from numpy import nan
  6. import numpy as np
  7. import random
  8. import pandas as pd
  9. from pandas.compat import range, lrange, lzip, zip, StringIO
  10. from pandas import compat, _np_version_under1p7
  11. from pandas.tseries.index import DatetimeIndex
  12. from pandas.tools.merge import merge, concat, ordered_merge, MergeError
  13. from pandas.util.testing import (assert_frame_equal, assert_series_equal,
  14. assert_almost_equal, rands,
  15. makeCustomDataframe as mkdf,
  16. assertRaisesRegexp)
  17. from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range, read_table, read_csv
  18. import pandas.algos as algos
  19. import pandas.util.testing as tm
# shorthand used when building the expected indexer arrays in the cython join tests
a_ = np.array
# sizes for the randomly generated fixture data
N = 50
NGROUPS = 8
# every join flavor exercised by the parametrised join tests
JOIN_TYPES = ['inner', 'outer', 'left', 'right']
  24. def get_test_data(ngroups=NGROUPS, n=N):
  25. unique_groups = lrange(ngroups)
  26. arr = np.asarray(np.tile(unique_groups, n // ngroups))
  27. if len(arr) < n:
  28. arr = np.asarray(list(arr) + unique_groups[:n - len(arr)])
  29. random.shuffle(arr)
  30. return arr
class TestMerge(tm.TestCase):

    # allow the test runner to split this class across worker processes
    _multiprocess_can_split_ = True

    def setUp(self):
        """Build the shared merge/join fixtures used throughout the class."""
        # aggregate multiple columns
        self.df = DataFrame({'key1': get_test_data(),
                             'key2': get_test_data(),
                             'data1': np.random.randn(N),
                             'data2': np.random.randn(N)})

        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]

        self.df2 = DataFrame({'key1': get_test_data(n=N // 5),
                              'key2': get_test_data(ngroups=NGROUPS // 2,
                                                    n=N // 5),
                              'value': np.random.randn(N // 5)})

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']},
                                index=data['C'])

        self.left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                               'v1': np.random.randn(7)})
        self.right = DataFrame({'v2': np.random.randn(4)},
                               index=['d', 'b', 'c', 'a'])
    def test_cython_left_outer_join(self):
        """Check algos.left_outer_join against hand-built expected indexers."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = algos.left_outer_join(left, right, max_group)

        # expected: stable sort order of each side expanded to the cartesian
        # product within each key group; -1 marks a row with no match
        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5, -1, -1])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls)
        self.assert_numpy_array_equal(rs, exp_rs)

    def test_cython_right_outer_join(self):
        """A right outer join is left_outer_join with the operands swapped."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = algos.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        #            0        1        1        1
        exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
                     #            2        2        4
                     6, 7, 8, 6, 7, 8, -1])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3,
                     4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls)
        self.assert_numpy_array_equal(rs, exp_rs)

    def test_cython_inner_join(self):
        """An inner join drops unmatched rows from both sides."""
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = algos.inner_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5,
                     6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3,
                     4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        self.assert_numpy_array_equal(ls, exp_ls)
        self.assert_numpy_array_equal(rs, exp_rs)
    def test_left_outer_join(self):
        """merge defaults to how='left'-compatible checks via _check_join."""
        joined_key2 = merge(self.df, self.df2, on='key2')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')

        joined_both = merge(self.df, self.df2)
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='left')

    def test_right_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')

        joined_both = merge(self.df, self.df2, how='right')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='right')

    def test_full_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')

        joined_both = merge(self.df, self.df2, how='outer')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='outer')

    def test_inner_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')

        joined_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='inner')
    def test_handle_overlap(self):
        """Overlapping non-key columns get the caller-supplied suffixes."""
        joined = merge(self.df, self.df2, on='key2',
                       suffixes=['.foo', '.bar'])

        self.assertIn('key1.foo', joined)
        self.assertIn('key1.bar', joined)

    def test_handle_overlap_arbitrary_key(self):
        """Suffixing also applies when joining on differently named keys."""
        joined = merge(self.df, self.df2,
                       left_on='key2', right_on='key1',
                       suffixes=['.foo', '.bar'])

        self.assertIn('key1.foo', joined)
        self.assertIn('key2.bar', joined)

    def test_merge_common(self):
        """merge with no 'on' defaults to the intersection of column names."""
        joined = merge(self.df, self.df2)
        exp = merge(self.df, self.df2, on=['key1', 'key2'])
        tm.assert_frame_equal(joined, exp)
    def test_join_on(self):
        """DataFrame.join with on= maps a left column to the right index."""
        target = self.target
        source = self.source

        merged = target.join(source, on='C')
        self.assert_numpy_array_equal(merged['MergedA'], target['A'])
        self.assert_numpy_array_equal(merged['MergedD'], target['D'])

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'],
                              'value': [0, 0, 1, 1, 2]})
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2],
                         columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2],
                         columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        self.assertTrue(np.isnan(joined['two']['c']))
        self.assertTrue(np.isnan(joined['three']['c']))

        # merge column not present
        self.assertRaises(Exception, target.join, source, on='E')

        # overlapping columns
        source_copy = source.copy()
        source_copy['A'] = 0
        self.assertRaises(Exception, target.join, source_copy, on='A')
  176. def test_join_on_fails_with_different_right_index(self):
  177. with tm.assertRaises(ValueError):
  178. df = DataFrame({'a': tm.choice(['m', 'f'], size=3),
  179. 'b': np.random.randn(3)})
  180. df2 = DataFrame({'a': tm.choice(['m', 'f'], size=10),
  181. 'b': np.random.randn(10)},
  182. index=tm.makeCustomIndex(10, 2))
  183. merge(df, df2, left_on='a', right_index=True)
  184. def test_join_on_fails_with_different_left_index(self):
  185. with tm.assertRaises(ValueError):
  186. df = DataFrame({'a': tm.choice(['m', 'f'], size=3),
  187. 'b': np.random.randn(3)},
  188. index=tm.makeCustomIndex(10, 2))
  189. df2 = DataFrame({'a': tm.choice(['m', 'f'], size=10),
  190. 'b': np.random.randn(10)})
  191. merge(df, df2, right_on='b', left_index=True)
  192. def test_join_on_fails_with_different_column_counts(self):
  193. with tm.assertRaises(ValueError):
  194. df = DataFrame({'a': tm.choice(['m', 'f'], size=3),
  195. 'b': np.random.randn(3)})
  196. df2 = DataFrame({'a': tm.choice(['m', 'f'], size=10),
  197. 'b': np.random.randn(10)},
  198. index=tm.makeCustomIndex(10, 2))
  199. merge(df, df2, right_on='a', left_on=['a', 'b'])
    def test_join_on_pass_vector(self):
        """'on' may be an array/Series of key values, not only a column name."""
        expected = self.target.join(self.source, on='C')
        del expected['C']

        join_col = self.target.pop('C')
        result = self.target.join(self.source, on=join_col)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge: right side is empty, all joined columns are NaN
        merged = self.target.join(self.source.reindex([]), on='C')
        for col in self.source:
            self.assertIn(col, merged)
            self.assertTrue(merged[col].isnull().all())

        merged2 = self.target.join(self.source.reindex([]), on='C',
                                   how='inner')
        self.assertTrue(merged2.columns.equals(merged.columns))
        self.assertEqual(len(merged2), 0)

    def test_join_on_inner(self):
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        # inner join should be the left join restricted to matched rows
        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notnull()]
        self.assert_numpy_array_equal(joined['key'], expected['key'])
        self.assert_numpy_array_equal(joined['value'], expected['value'])
        self.assertTrue(joined.index.equals(expected.index))

    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases: a one-element key list behaves like a scalar key
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')
        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        """Joining on a Series should match joining on a one-column frame."""
        result = self.target.join(self.source['MergedA'], on='C')
        expected = self.target.join(self.source[['MergedA']], on='C')
        assert_frame_equal(result, expected)
    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({'a': [1, 1]})
        ds = Series([2], index=[1], name='b')
        result = df.join(ds, on='a')
        expected = DataFrame({'a': [1, 1],
                              'b': [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self):
        """Index joins across mixed-dtype blocks preserve dtypes."""
        df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(10),
                        columns=['A', 'B', 'C', 'D'])
        self.assertEqual(df1['B'].dtype, np.int64)
        self.assertEqual(df1['D'].dtype, np.bool_)

        df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(0, 10, 2),
                        columns=['A', 'B', 'C', 'D'])

        # overlapping column names need suffixes
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = ['A_one', 'B_one', 'C_one', 'D_one',
                            'A_two', 'B_two', 'C_two', 'D_two']
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        for kind in JOIN_TYPES:
            joined = df1.join(df2, how=kind)
            expected = _join_by_hand(df1, df2, how=kind)
            assert_frame_equal(joined, expected)

            joined = df2.join(df1, how=kind)
            expected = _join_by_hand(df2, df1, how=kind)
            assert_frame_equal(joined, expected)
    def test_join_empty_bug(self):
        # smoke test: this generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        # GH #331: joining frames with unconsolidated internal blocks
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        a.join(d)
        d.join(a)
    def test_join_multiindex(self):
        """Outer joins of MultiIndexed frames keep level names, any sort order."""
        index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'],
                                         [1, 2, 3, 1, 2, 3]],
                                        names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6), index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6), index=index2,
                        columns=['var Y'])

        df1 = df1.sortlevel(0)
        df2 = df2.sortlevel(0)

        joined = df1.join(df2, how='outer')

        # expected: reindex both frames onto the union of the tuple indexes
        ex_index = index1._tuple_index + index2._tuple_index
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)

        # repeat with the frames sorted on the second level
        df1 = df1.sortlevel(1)
        df2 = df2.sortlevel(1)

        joined = df1.join(df2, how='outer').sortlevel(0)
        ex_index = index1._tuple_index + index2._tuple_index
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        self.assertEqual(joined.index.names, index1.names)
    def test_join_inner_multiindex(self):
        """Inner join of key columns onto a MultiIndex matches column merge."""
        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
                'qux', 'snap']
        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
                'three', 'one']

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2,
                          'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data, to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'], how='inner',
                         sort=False)

        expected2 = merge(to_join, data,
                          right_on=['key1', 'key2'], left_index=True,
                          how='inner', sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(to_join, data, right_on=['key1', 'key2'],
                          left_index=True, how='inner', sort=False)

        # drop the helper key columns before comparing
        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        self.assertTrue(joined.index.is_monotonic)
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.ix[:, expected.columns])
    def test_join_hierarchical_mixed(self):
        """Merging a MultiIndex-columned frame with a flat one works."""
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame(
            [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other_df.set_index('a', inplace=True)

        result = merge(new_df, other_df, left_index=True, right_index=True)
        self.assertTrue(('b', 'mean') in result)
        self.assertTrue('b' in result)

    def test_join_float64_float32(self):
        """Joins must preserve each column's original float width."""
        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype = np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype = np.float32)
        joined = a.join(b)
        self.assertEqual(joined.dtypes['a'], 'float64')
        self.assertEqual(joined.dtypes['b'], 'float64')
        self.assertEqual(joined.dtypes['c'], 'float32')

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c })
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        self.assertEqual(rs.dtypes['a'], 'int64')
        self.assertEqual(rs.dtypes['b'], 'float64')
        self.assertEqual(rs.dtypes['c'], 'float32')
        self.assertEqual(rs.dtypes['md'], 'float32')

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)
    def test_join_many_non_unique_index(self):
        """Joining a list of frames on a non-unique MultiIndex matches chained merges."""
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        # reference result: chain two binary column merges
        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()

        # outer join coerces the int keys to float once NaNs can appear
        result['a'] = result['a'].astype(np.float64)
        result['b'] = result['b'].astype(np.float64)

        assert_frame_equal(result, expected.ix[:, result.columns])

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame(
            {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.ix[:, result.columns])
    def test_merge_index_singlekey_right_vs_left(self):
        """Column-on-index merges are symmetric between left and right sides."""
        left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                          'v1': np.random.randn(7)})
        right = DataFrame({'v2': np.random.randn(4)},
                          index=['d', 'b', 'c', 'a'])

        merged1 = merge(left, right, left_on='key',
                        right_index=True, how='left', sort=False)
        merged2 = merge(right, left, right_on='key',
                        left_index=True, how='right', sort=False)
        assert_frame_equal(merged1, merged2.ix[:, merged1.columns])

        # same comparison with key sorting enabled
        merged1 = merge(left, right, left_on='key',
                        right_index=True, how='left', sort=True)
        merged2 = merge(right, left, right_on='key',
                        left_index=True, how='right', sort=True)
        assert_frame_equal(merged1, merged2.ix[:, merged1.columns])

    def test_merge_index_singlekey_inner(self):
        left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                          'v1': np.random.randn(7)})
        right = DataFrame({'v2': np.random.randn(4)},
                          index=['d', 'b', 'c', 'a'])

        # inner join against the equivalent DataFrame.join result
        result = merge(left, right, left_on='key', right_index=True,
                       how='inner')
        expected = left.join(right, on='key').ix[result.index]
        assert_frame_equal(result, expected)

        result = merge(right, left, right_on='key', left_index=True,
                       how='inner')
        expected = left.join(right, on='key').ix[result.index]
        assert_frame_equal(result, expected.ix[:, result.columns])
    def test_merge_misspecified(self):
        """Inconsistent or one-sided key specifications must raise."""
        self.assertRaises(Exception, merge, self.left, self.right,
                          left_index=True)
        self.assertRaises(Exception, merge, self.left, self.right,
                          right_index=True)

        # cannot combine 'on' with 'left_on'/'right_on'
        self.assertRaises(Exception, merge, self.left, self.left,
                          left_on='key', on='key')

        # key lists of different lengths
        self.assertRaises(Exception, merge, self.df, self.df2,
                          left_on=['key1'], right_on=['key1', 'key2'])

    def test_merge_overlap(self):
        """Self-merge suffixes overlapping value columns with _x/_y."""
        merged = merge(self.left, self.left, on='key')
        exp_len = (self.left['key'].value_counts() ** 2).sum()
        self.assertEqual(len(merged), exp_len)
        self.assertIn('v1_x', merged)
        self.assertIn('v1_y', merged)
    def test_merge_different_column_key_names(self):
        """Outer merge on differently named keys keeps both key columns."""
        left = DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
                          'value': [1, 2, 3, 4]})
        right = DataFrame({'rkey': ['foo', 'bar', 'qux', 'foo'],
                           'value': [5, 6, 7, 8]})

        merged = left.merge(right, left_on='lkey', right_on='rkey',
                            how='outer', sort=True)

        assert_almost_equal(merged['lkey'],
                            ['bar', 'baz', 'foo', 'foo', 'foo', 'foo', np.nan])
        assert_almost_equal(merged['rkey'],
                            ['bar', np.nan, 'foo', 'foo', 'foo', 'foo', 'qux'])
        assert_almost_equal(merged['value_x'], [2, 3, 1, 1, 4, 4, np.nan])
        assert_almost_equal(merged['value_y'], [6, np.nan, 5, 8, 5, 8, 7])

    def test_merge_copy(self):
        """copy=True: mutating the result must not touch the inputs."""
        left = DataFrame({'a': 0, 'b': 1}, index=lrange(10))
        right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10))

        merged = merge(left, right, left_index=True,
                       right_index=True, copy=True)

        merged['a'] = 6
        self.assertTrue((left['a'] == 0).all())

        merged['d'] = 'peekaboo'
        self.assertTrue((right['d'] == 'bar').all())

    def test_merge_nocopy(self):
        """copy=False: the result shares data with the inputs."""
        left = DataFrame({'a': 0, 'b': 1}, index=lrange(10))
        right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10))

        merged = merge(left, right, left_index=True,
                       right_index=True, copy=False)

        merged['a'] = 6
        self.assertTrue((left['a'] == 6).all())

        merged['d'] = 'peekaboo'
        self.assertTrue((right['d'] == 'peekaboo').all())
    def test_join_sort(self):
        """sort=True orders the result by key; sort=False keeps left order."""
        left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'],
                          'value': [1, 2, 3, 4]})
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        joined = left.join(right, on='key', sort=True)
        expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'],
                              'value': [2, 3, 1, 4],
                              'value2': ['a', 'b', 'c', 'c']},
                             index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on='key', sort=False)
        self.assert_numpy_array_equal(joined.index, lrange(4))

    def test_intelligently_handle_join_key(self):
        # #733, be a bit more 1337 about not returning unconsolidated DataFrame
        left = DataFrame({'key': [1, 1, 2, 2, 3],
                          'value': lrange(5)}, columns=['value', 'key'])
        right = DataFrame({'key': [1, 1, 2, 3, 4, 5],
                           'rvalue': lrange(6)})

        joined = merge(left, right, on='key', how='outer')
        expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5.],
                              'value': np.array([0, 0, 1, 1, 2, 3, 4,
                                                 np.nan, np.nan]),
                              'rvalue': np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])},
                             columns=['value', 'key', 'rvalue'])
        assert_frame_equal(joined, expected, check_dtype=False)

        self.assertTrue(joined._data.is_consolidated())
    def test_handle_join_key_pass_array(self):
        """Key arrays passed directly to left_on/right_on behave like columns."""
        left = DataFrame({'key': [1, 1, 2, 2, 3],
                          'value': lrange(5)}, columns=['value', 'key'])
        right = DataFrame({'rvalue': lrange(6)})
        key = np.array([1, 1, 2, 3, 4, 5])

        merged = merge(left, right, left_on='key', right_on=key, how='outer')
        merged2 = merge(right, left, left_on=key, right_on='key', how='outer')

        assert_series_equal(merged['key'], merged2['key'])
        self.assertTrue(merged['key'].notnull().all())
        self.assertTrue(merged2['key'].notnull().all())

        left = DataFrame({'value': lrange(5)}, columns=['value'])
        right = DataFrame({'rvalue': lrange(6)})
        lkey = np.array([1, 1, 2, 2, 3])
        rkey = np.array([1, 1, 2, 3, 4, 5])

        # anonymous key arrays surface in the result as 'key_0'
        merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer')
        self.assert_numpy_array_equal(merged['key_0'],
                                      np.array([1, 1, 1, 1, 2, 2, 3, 4, 5]))

        left = DataFrame({'value': lrange(3)})
        right = DataFrame({'rvalue': lrange(6)})

        key = np.array([0, 1, 1, 2, 2, 3])
        merged = merge(left, right, left_index=True, right_on=key, how='outer')
        self.assert_numpy_array_equal(merged['key_0'], key)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        df.insert(5, 'dt', 'foo')

        grouped = df.groupby('id')
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix='_right')
    def test_no_overlap_more_informative_error(self):
        """merge with no common columns and no keys raises MergeError."""
        dt = datetime.now()
        df1 = DataFrame({'x': ['a']}, index=[dt])
        df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt])
        self.assertRaises(MergeError, merge, df1, df2)

    def test_merge_non_unique_indexes(self):
        """Index joins with duplicate index values round-trip via _check_merge."""
        dt = datetime(2012, 5, 1)
        dt2 = datetime(2012, 5, 2)
        dt3 = datetime(2012, 5, 3)
        dt4 = datetime(2012, 5, 4)

        df1 = DataFrame({'x': ['a']}, index=[dt])
        df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt])
        _check_merge(df1, df2)

        # Not monotonic
        df1 = DataFrame({'x': ['a', 'b', 'q']}, index=[dt2, dt, dt4])
        df2 = DataFrame({'y': ['c', 'd', 'e', 'f', 'g', 'h']},
                        index=[dt3, dt3, dt2, dt2, dt, dt])
        _check_merge(df1, df2)

        df1 = DataFrame({'x': ['a', 'b']}, index=[dt, dt])
        df2 = DataFrame({'y': ['c', 'd']}, index=[dt, dt])
        _check_merge(df1, df2)

    def test_merge_non_unique_index_many_to_many(self):
        dt = datetime(2012, 5, 1)
        dt2 = datetime(2012, 5, 2)
        dt3 = datetime(2012, 5, 3)
        df1 = DataFrame({'x': ['a', 'b', 'c', 'd']},
                        index=[dt2, dt2, dt, dt])
        df2 = DataFrame({'y': ['e', 'f', 'g', ' h', 'i']},
                        index=[dt2, dt2, dt3, dt, dt])
        _check_merge(df1, df2)
    def test_left_merge_empty_dataframe(self):
        """Left/right merges against an empty frame return the non-empty side."""
        left = DataFrame({'key': [1], 'value': [2]})
        right = DataFrame({'key': []})

        result = merge(left, right, on='key', how='left')
        assert_frame_equal(result, left)

        result = merge(right, left, on='key', how='right')
        assert_frame_equal(result, left)
  574. def test_merge_nosort(self):
  575. # #2098, anything to do?
  576. from datetime import datetime
  577. d = {"var1": np.random.randint(0, 10, size=10),
  578. "var2": np.random.randint(0, 10, size=10),
  579. "var3": [datetime(2012, 1, 12), datetime(2011, 2, 4),
  580. datetime(
  581. 2010, 2, 3), datetime(2012, 1, 12),
  582. datetime(
  583. 2011, 2, 4), datetime(2012, 4, 3),
  584. datetime(
  585. 2012, 3, 4), datetime(2008, 5, 1),
  586. datetime(2010, 2, 3), datetime(2012, 2, 3)]}
  587. df = DataFrame.from_dict(d)
  588. var3 = df.var3.unique()
  589. var3.sort()
  590. new = DataFrame.from_dict({"var3": var3,
  591. "var8": np.random.random(7)})
  592. result = df.merge(new, on="var3", sort=False)
  593. exp = merge(df, new, on='var3', sort=False)
  594. assert_frame_equal(result, exp)
  595. self.assertTrue((df.var3.unique() == result.var3.unique()).all())
    def test_merge_nan_right(self):
        """Unmatched right rows introduce NaN without corrupting other columns."""
        df1 = DataFrame({"i1" : [0, 1], "i2" : [0, 1]})
        df2 = DataFrame({"i1" : [0], "i3" : [0]})
        result = df1.join(df2, on="i1", rsuffix="_")
        expected = DataFrame({'i1': {0: 0.0, 1: 1}, 'i2': {0: 0, 1: 1},
                              'i1_': {0: 0, 1: np.nan}, 'i3': {0: 0.0, 1: np.nan},
                              None: {0: 0, 1: 0}}).set_index(None).reset_index()[['i1', 'i2', 'i1_', 'i3']]
        assert_frame_equal(result, expected, check_dtype=False)

        df1 = DataFrame({"i1" : [0, 1], "i2" : [0.5, 1.5]})
        df2 = DataFrame({"i1" : [0], "i3" : [0.7]})
        result = df1.join(df2, rsuffix="_", on='i1')
        expected = DataFrame({'i1': {0: 0, 1: 1}, 'i1_': {0: 0.0, 1: nan},
                              'i2': {0: 0.5, 1: 1.5}, 'i3': {0: 0.69999999999999996,
                                                             1: nan}})[['i1', 'i2', 'i1_', 'i3']]
        assert_frame_equal(result, expected)
    def test_append_dtype_coerce(self):
        # GH 4993
        # appending with datetime will incorrectly convert datetime64
        import datetime as dt
        from pandas import NaT

        df1 = DataFrame(index=[1,2], data=[dt.datetime(2013,1,1,0,0),
                                           dt.datetime(2013,1,2,0,0)],
                        columns=['start_time'])
        df2 = DataFrame(index=[4,5], data=[[dt.datetime(2013,1,3,0,0),
                                            dt.datetime(2013,1,3,6,10)],
                                           [dt.datetime(2013,1,4,0,0),
                                            dt.datetime(2013,1,4,7,10)]],
                        columns=['start_time','end_time'])

        # rows missing 'end_time' are filled with NaT, not coerced
        expected = concat([
            Series([NaT,NaT,dt.datetime(2013,1,3,6,10),dt.datetime(2013,1,4,7,10)],name='end_time'),
            Series([dt.datetime(2013,1,1,0,0),dt.datetime(2013,1,2,0,0),dt.datetime(2013,1,3,0,0),dt.datetime(2013,1,4,0,0)],name='start_time'),
            ],axis=1)
        result = df1.append(df2,ignore_index=True)
        assert_frame_equal(result, expected)
    def test_join_append_timedeltas(self):
        """timedelta64 columns survive append and left join (GH 5695)."""
        import datetime as dt
        from pandas import NaT

        # timedelta64 issues with join/merge
        # GH 5695
        tm._skip_if_not_numpy17_friendly()

        d = {'d': dt.datetime(2013, 11, 5, 5, 56), 't': dt.timedelta(0, 22500)}
        df = DataFrame(columns=list('dt'))
        df = df.append(d, ignore_index=True)
        result = df.append(d, ignore_index=True)
        expected = DataFrame({'d': [dt.datetime(2013, 11, 5, 5, 56),
                                    dt.datetime(2013, 11, 5, 5, 56) ],
                              't': [ dt.timedelta(0, 22500),
                                     dt.timedelta(0, 22500) ]})
        assert_frame_equal(result, expected)

        td = np.timedelta64(300000000)
        lhs = DataFrame(Series([td,td],index=["A","B"]))
        rhs = DataFrame(Series([td],index=["A"]))

        from pandas import NaT
        # unmatched timedelta rows become NaT in the joined column
        result = lhs.join(rhs,rsuffix='r', how="left")
        expected = DataFrame({ '0' : Series([td,td],index=list('AB')), '0r' : Series([td,NaT],index=list('AB')) })
        assert_frame_equal(result, expected)
    def test_overlapping_columns_error_message(self):
        """Duplicate column names on both sides should raise, not misalign."""
        # #2649
        df = DataFrame({'key': [1, 2, 3],
                        'v1': [4, 5, 6],
                        'v2': [7, 8, 9]})
        df2 = DataFrame({'key': [1, 2, 3],
                         'v1': [4, 5, 6],
                         'v2': [7, 8, 9]})

        df.columns = ['key', 'foo', 'foo']
        df2.columns = ['key', 'bar', 'bar']

        self.assertRaises(Exception, merge, df, df2)
  663. def _check_merge(x, y):
  664. for how in ['inner', 'left', 'outer']:
  665. result = x.join(y, how=how)
  666. expected = merge(x.reset_index(), y.reset_index(), how=how,
  667. sort=True)
  668. expected = expected.set_index('index')
  669. assert_frame_equal(result, expected, check_names=False) # TODO check_names on merge?
class TestMergeMulti(tm.TestCase):

    def setUp(self):
        """Fixtures for multi-key join/merge tests."""
        self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                        ['one', 'two', 'three']],
                                labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                        [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                                names=['first', 'second'])
        self.to_join = DataFrame(np.random.randn(10, 3), index=self.index,
                                 columns=['j_one', 'j_two', 'j_three'])

        # a little relevant example with NAs
        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
                'qux', 'snap']
        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
                'three', 'one']

        data = np.random.randn(len(key1))
        self.data = DataFrame({'key1': key1, 'key2': key2,
                               'data': data})
  687. def test_merge_on_multikey(self):
  688. joined = self.data.join(self.to_join, on=['key1', 'key2'])
  689. join_key = Index(lzip(self.data['key1'], self.data['key2']))
  690. indexer = self.to_join.index.get_indexer(join_key)
  691. ex_values = self.to_join.values.take(indexer, axis=0)
  692. ex_values[indexer == -1] = np.nan
  693. expected = self.data.join(DataFrame(ex_values,
  694. columns=self.to_join.columns))
  695. # TODO: columns aren't in the same order yet
  696. assert_frame_equal(joined, expected.ix[:, joined.columns])
  697. def test_merge_right_vs_left(self):
  698. # compare left vs right merge with multikey
  699. merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'],
  700. right_index=True, how='left')
  701. merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'],
  702. left_index=True, how='right')
  703. merged2 = merged2.ix[:, merged1.columns]
  704. assert_frame_equal(merged1, merged2)
  705. def test_compress_group_combinations(self):
  706. # ~ 40000000 possible unique groups
  707. key1 = np.array([rands(10) for _ in range(10000)], dtype='O')
  708. key1 = np.tile(key1, 2)
  709. key2 = key1[::-1]
  710. df = DataFrame({'key1': key1, 'key2': key2,
  711. 'value1': np.random.randn(20000)})
  712. df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2],
  713. 'value2': np.random.randn(10000)})
  714. # just to hit the label compression code path
  715. merged = merge(df, df2, how='outer')
  716. def test_left_join_index_preserve_order(self):
  717. left = DataFrame({'k1': [0, 1, 2] * 8,
  718. 'k2': ['foo', 'bar'] * 12,
  719. 'v': np.array(np.arange(24),dtype=np.int64) })
  720. index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
  721. right = DataFrame({'v2': [5, 7]}, index=index)
  722. result = left.join(right, on=['k1', 'k2'])
  723. expected = left.copy()
  724. expected['v2'] = np.nan
  725. expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
  726. expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7
  727. tm.assert_frame_equal(result, expected)
  728. # test join with multi dtypes blocks
  729. left = DataFrame({'k1': [0, 1, 2] * 8,
  730. 'k2': ['foo', 'bar'] * 12,
  731. 'k3' : np.array([0, 1, 2]*8, dtype=np.float32),
  732. 'v': np.array(np.arange(24),dtype=np.int32) })
  733. index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
  734. right = DataFrame({'v2': [5, 7]}, index=index)
  735. result = left.join(right, on=['k1', 'k2'])
  736. expected = left.copy()
  737. expected['v2'] = np.nan
  738. expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
  739. expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7
  740. tm.assert_frame_equal(result, expected)
  741. # do a right join for an extra test
  742. joined = merge(right, left, left_index=True,
  743. right_on=['k1', 'k2'], how='right')
  744. tm.assert_frame_equal(joined.ix[:, expected.columns], expected)
  745. def test_join_multi_dtypes(self):
  746. # test with multi dtypes in the join index
  747. def _test(dtype1,dtype2):
  748. left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1),
  749. 'k2': ['foo', 'bar'] * 12,
  750. 'v': np.array(np.arange(24),dtype=np.int64) })
  751. index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
  752. right = DataFrame({'v2': np.array([5, 7], dtype=dtype2)}, index=index)
  753. result = left.join(right, on=['k1', 'k2'])
  754. expected = left.copy()
  755. if dtype2.kind == 'i':
  756. dtype2 = np.dtype('float64')
  757. expected['v2'] = np.array(np.nan,dtype=dtype2)
  758. expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
  759. expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7
  760. tm.assert_frame_equal(result, expected)
  761. for d1 in [np.int64,np.int32,np.int16,np.int8,np.uint8]:
  762. for d2 in [np.int64,np.float64,np.float32,np.float16]:
  763. _test(np.dtype(d1),np.dtype(d2))
  764. def test_left_merge_na_buglet(self):
  765. left = DataFrame({'id': list('abcde'), 'v1': randn(5),
  766. 'v2': randn(5), 'dummy': list('abcde'),
  767. 'v3': randn(5)},
  768. columns=['id', 'v1', 'v2', 'dummy', 'v3'])
  769. right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan],
  770. 'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]})
  771. merged = merge(left, right, on='id', how='left')
  772. rdf = right.drop(['id'], axis=1)
  773. expected = left.join(rdf)
  774. tm.assert_frame_equal(merged, expected)
  775. def test_merge_na_keys(self):
  776. data = [[1950, "A", 1.5],
  777. [1950, "B", 1.5],
  778. [1955, "B", 1.5],
  779. [1960, "B", np.nan],
  780. [1970, "B", 4.],
  781. [1950, "C", 4.],
  782. [1960, "C", np.nan],
  783. [1965, "C", 3.],
  784. [1970, "C", 4.]]
  785. frame = DataFrame(data, columns=["year", "panel", "data"])
  786. other_data = [[1960, 'A', np.nan],
  787. [1970, 'A', np.nan],
  788. [1955, 'A', np.nan],
  789. [1965, 'A', np.nan],
  790. [1965, 'B', np.nan],
  791. [1955, 'C', np.nan]]
  792. other = DataFrame(other_data, columns=['year', 'panel', 'data'])
  793. result = frame.merge(other, how='outer')
  794. expected = frame.fillna(-999).merge(other.fillna(-999), how='outer')
  795. expected = expected.replace(-999, np.nan)
  796. tm.assert_frame_equal(result, expected)
  797. def test_int64_overflow_issues(self):
  798. # #2690, combinatorial explosion
  799. df1 = DataFrame(np.random.randn(1000, 7),
  800. columns=list('ABCDEF') + ['G1'])
  801. df2 = DataFrame(np.random.randn(1000, 7),
  802. columns=list('ABCDEF') + ['G2'])
  803. # it works!
  804. result = merge(df1, df2, how='outer')
  805. self.assertTrue(len(result) == 2000)
  806. def test_join_multi_levels(self):
  807. # GH 3662
  808. # merge multi-levels
  809. household = DataFrame(dict(household_id = [1,2,3],
  810. male = [0,1,0],
  811. wealth = [196087.3,316478.7,294750]),
  812. columns = ['household_id','male','wealth']).set_index('household_id')
  813. portfolio = DataFrame(dict(household_id = [1,2,2,3,3,3,4],
  814. asset_id = ["nl0000301109","nl0000289783","gb00b03mlx29","gb00b03mlx29","lu0197800237","nl0000289965",np.nan],
  815. name = ["ABN Amro","Robeco","Royal Dutch Shell","Royal Dutch Shell","AAB Eastern Europe Equity Fund","Postbank BioTech Fonds",np.nan],
  816. share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]),
  817. columns = ['household_id','asset_id','name','share']).set_index(['household_id','asset_id'])
  818. result = household.join(portfolio, how='inner')
  819. expected = DataFrame(dict(male = [0,1,1,0,0,0],
  820. wealth = [ 196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0 ],
  821. name = ['ABN Amro','Robeco','Royal Dutch Shell','Royal Dutch Shell','AAB Eastern Europe Equity Fund','Postbank BioTech Fonds'],
  822. share = [1.00,0.40,0.60,0.15,0.60,0.25],
  823. household_id = [1,2,2,3,3,3],
  824. asset_id = ['nl0000301109','nl0000289783','gb00b03mlx29','gb00b03mlx29','lu0197800237','nl0000289965']),
  825. ).set_index(['household_id','asset_id']).reindex(columns=['male','wealth','name','share'])
  826. assert_frame_equal(result,expected)
  827. assert_frame_equal(result,expected)
  828. # equivalency
  829. result2 = merge(household.reset_index(),portfolio.reset_index(),on=['household_id'],how='inner').set_index(['household_id','asset_id'])
  830. assert_frame_equal(result2,expected)
  831. result = household.join(portfolio, how='outer')
  832. expected = concat([expected,DataFrame(dict(share = [1.00]),
  833. index=MultiIndex.from_tuples([(4,np.nan)],
  834. names=['household_id','asset_id']))],
  835. axis=0).reindex(columns=expected.columns)
  836. assert_frame_equal(result,expected)
  837. # invalid cases
  838. household.index.name = 'foo'
  839. def f():
  840. household.join(portfolio, how='inner')
  841. self.assertRaises(ValueError, f)
  842. portfolio2 = portfolio.copy()
  843. portfolio2.index.set_names(['household_id','foo'])
  844. def f():
  845. portfolio2.join(portfolio, how='inner')
  846. self.assertRaises(ValueError, f)
  847. def test_join_multi_levels2(self):
  848. # some more advanced merges
  849. # GH6360
  850. household = DataFrame(dict(household_id = [1,2,2,3,3,3,4],
  851. asset_id = ["nl0000301109","nl0000301109","gb00b03mlx29","gb00b03mlx29","lu0197800237","nl0000289965",np.nan],
  852. share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]),
  853. columns = ['household_id','asset_id','share']).set_index(['household_id','asset_id'])
  854. log_return = DataFrame(dict(
  855. asset_id = ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237"],
  856. t = [233, 234, 235, 180, 181],
  857. log_return = [.09604978, -.06524096, .03532373, .03025441, .036997]
  858. )).set_index(["asset_id","t"])
  859. expected = DataFrame(dict(
  860. household_id = [2, 2, 2, 3, 3, 3, 3, 3],
  861. asset_id = ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237"],
  862. t = [233, 234, 235, 233, 234, 235, 180, 181],
  863. share = [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6],
  864. log_return = [.09604978, -.06524096, .03532373, .09604978, -.06524096, .03532373, .03025441, .036997]
  865. )).set_index(["household_id", "asset_id", "t"]).reindex(columns=['share','log_return'])
  866. def f():
  867. household.join(log_return, how='inner')
  868. self.assertRaises(NotImplementedError, f)
  869. # this is the equivalency
  870. result = merge(household.reset_index(),log_return.reset_index(),on=['asset_id'],how='inner').set_index(['household_id','asset_id','t'])
  871. assert_frame_equal(result,expected)
  872. expected = DataFrame(dict(
  873. household_id = [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4],
  874. asset_id = ["nl0000301109", "nl0000289783", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237", "nl0000289965", None],
  875. t = [None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None],
  876. share = [1.0, 0.4, 0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6, 0.25, 1.0],
  877. log_return = [None, None, .09604978, -.06524096, .03532373, .09604978, -.06524096, .03532373, .03025441, .036997, None, None]
  878. )).set_index(["household_id", "asset_id", "t"])
  879. def f():
  880. household.join(log_return, how='outer')
  881. self.assertRaises(NotImplementedError, f)
  882. def _check_join(left, right, result, join_col, how='left',
  883. lsuffix='_x', rsuffix='_y'):
  884. # some smoke tests
  885. for c in join_col:
  886. assert(result[c].notnull().all())
  887. left_grouped = left.groupby(join_col)
  888. right_grouped = right.groupby(join_col)
  889. for group_key, group in result.groupby(join_col):
  890. l_joined = _restrict_to_columns(group, left.columns, lsuffix)
  891. r_joined = _restrict_to_columns(group, right.columns, rsuffix)
  892. try:
  893. lgroup = left_grouped.get_group(group_key)
  894. except KeyError:
  895. if how in ('left', 'inner'):
  896. raise AssertionError('key %s should not have been in the join'
  897. % str(group_key))
  898. _assert_all_na(l_joined, left.columns, join_col)
  899. else:
  900. _assert_same_contents(l_joined, lgroup)
  901. try:
  902. rgroup = right_grouped.get_group(group_key)
  903. except KeyError:
  904. if how in ('right', 'inner'):
  905. raise AssertionError('key %s should not have been in the join'
  906. % str(group_key))
  907. _assert_all_na(r_joined, right.columns, join_col)
  908. else:
  909. _assert_same_contents(r_joined, rgroup)
  910. def _restrict_to_columns(group, columns, suffix):
  911. found = [c for c in group.columns
  912. if c in columns or c.replace(suffix, '') in columns]
  913. # filter
  914. group = group.ix[:, found]
  915. # get rid of suffixes, if any
  916. group = group.rename(columns=lambda x: x.replace(suffix, ''))
  917. # put in the right order...
  918. group = group.ix[:, columns]
  919. return group
  920. def _assert_same_contents(join_chunk, source):
  921. NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly...
  922. jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
  923. svalues = source.fillna(NA_SENTINEL).drop_duplicates().values
  924. rows = set(tuple(row) for row in jvalues)
  925. assert(len(rows) == len(source))
  926. assert(all(tuple(row) in rows for row in svalues))
  927. def _assert_all_na(join_chunk, source_columns, join_col):
  928. for c in source_columns:
  929. if c in join_col:
  930. continue
  931. assert(join_chunk[c].isnull().all())
  932. def _join_by_hand(a, b, how='left'):
  933. join_index = a.index.join(b.index, how=how)
  934. a_re = a.reindex(join_index)
  935. b_re = b.reindex(join_index)
  936. result_columns = a.columns.append(b.columns)
  937. for col, s in compat.iteritems(b_re):
  938. a_re[col] = s
  939. return a_re.reindex(columns=result_columns)
  940. class TestConcatenate(tm.TestCase):
  941. _multiprocess_can_split_ = True
  942. def setUp(self):
  943. self.frame = DataFrame(tm.getSeriesData())
  944. self.mixed_frame = self.frame.copy()
  945. self.mixed_frame['foo'] = 'bar'
  946. def test_append(self):
  947. begin_index = self.frame.index[:5]
  948. end_index = self.frame.index[5:]
  949. begin_frame = self.frame.reindex(begin_index)
  950. end_frame = self.frame.reindex(end_index)
  951. appended = begin_frame.append(end_frame)

Large files files are truncated, but you can click here to view the full file