
/pandas/tools/tests/test_merge_asof.py

http://github.com/wesm/pandas
Python
Possible License(s): BSD-3-Clause, Apache-2.0
import nose
import os

import numpy as np
import pandas as pd
from pandas import (merge_asof, read_csv,
                    to_datetime, Timedelta)
from pandas.tools.merge import MergeError
from pandas.util import testing as tm
from pandas.util.testing import assert_frame_equal

class TestAsOfMerge(tm.TestCase):
    _multiprocess_can_split_ = True

    def read_data(self, name, dedupe=False):
        path = os.path.join(tm.get_data_path(), name)
        x = read_csv(path)
        if dedupe:
            x = (x.drop_duplicates(['time', 'ticker'], keep='last')
                  .reset_index(drop=True))
        x.time = to_datetime(x.time)
        return x

    def setUp(self):
        self.trades = self.read_data('trades.csv')
        self.quotes = self.read_data('quotes.csv', dedupe=True)
        self.asof = self.read_data('asof.csv')
        self.tolerance = self.read_data('tolerance.csv')
        self.allow_exact_matches = self.read_data('allow_exact_matches.csv')
        self.allow_exact_matches_and_tolerance = self.read_data(
            'allow_exact_matches_and_tolerance.csv')
    def test_examples1(self):
        """ doc-string examples """

        left = pd.DataFrame({'a': [1, 5, 10],
                             'left_val': ['a', 'b', 'c']})
        right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
                              'right_val': [1, 2, 3, 6, 7]})

        pd.merge_asof(left, right, on='a')
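        # A backward asof merge takes, for each left row, the last right row
        # whose 'a' is <= the left 'a'.  As a hand-check (not asserted here):
        # a=1 picks right_val 1, a=5 picks 3, and a=10 picks 7.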
    def test_examples2(self):
        """ doc-string examples """

        trades = pd.DataFrame({
            'time': pd.to_datetime(['20160525 13:30:00.023',
                                    '20160525 13:30:00.038',
                                    '20160525 13:30:00.048',
                                    '20160525 13:30:00.048',
                                    '20160525 13:30:00.048']),
            'ticker': ['MSFT', 'MSFT',
                       'GOOG', 'GOOG', 'AAPL'],
            'price': [51.95, 51.95,
                      720.77, 720.92, 98.00],
            'quantity': [75, 155,
                         100, 100, 100]},
            columns=['time', 'ticker', 'price', 'quantity'])

        quotes = pd.DataFrame({
            'time': pd.to_datetime(['20160525 13:30:00.023',
                                    '20160525 13:30:00.023',
                                    '20160525 13:30:00.030',
                                    '20160525 13:30:00.041',
                                    '20160525 13:30:00.048',
                                    '20160525 13:30:00.049',
                                    '20160525 13:30:00.072',
                                    '20160525 13:30:00.075']),
            'ticker': ['GOOG', 'MSFT', 'MSFT',
                       'MSFT', 'GOOG', 'AAPL', 'GOOG',
                       'MSFT'],
            'bid': [720.50, 51.95, 51.97, 51.99,
                    720.50, 97.99, 720.50, 52.01],
            'ask': [720.93, 51.96, 51.98, 52.00,
                    720.93, 98.01, 720.88, 52.03]},
            columns=['time', 'ticker', 'bid', 'ask'])

        pd.merge_asof(trades, quotes,
                      on='time',
                      by='ticker')

        pd.merge_asof(trades, quotes,
                      on='time',
                      by='ticker',
                      tolerance=pd.Timedelta('2ms'))

        pd.merge_asof(trades, quotes,
                      on='time',
                      by='ticker',
                      tolerance=pd.Timedelta('10ms'),
                      allow_exact_matches=False)
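        # With by='ticker', each trade only matches quotes for the same
        # ticker.  tolerance bounds how far back the matched quote may be,
        # and allow_exact_matches=False requires it to be strictly earlier.
        # E.g. the 13:30:00.023 MSFT trade matches the 13:30:00.023 MSFT
        # quote by default, but gets no match once exact matches are
        # disallowed (there is no earlier MSFT quote within 10ms).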
    def test_basic(self):
        expected = self.asof
        trades = self.trades
        quotes = self.quotes

        result = merge_asof(trades, quotes,
                            on='time',
                            by='ticker')
        assert_frame_equal(result, expected)

    def test_basic_categorical(self):
        expected = self.asof
        trades = self.trades.copy()
        trades.ticker = trades.ticker.astype('category')
        quotes = self.quotes.copy()
        quotes.ticker = quotes.ticker.astype('category')

        result = merge_asof(trades, quotes,
                            on='time',
                            by='ticker')
        assert_frame_equal(result, expected)
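        # Casting the 'by' column to categorical should not change the
        # result: matching still happens on the underlying ticker values,
        # so the output is compared against the same 'asof.csv' expectation.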
    def test_missing_right_by(self):
        expected = self.asof
        trades = self.trades
        quotes = self.quotes

        q = quotes[quotes.ticker != 'MSFT']
        result = merge_asof(trades, q,
                            on='time',
                            by='ticker')
        expected.loc[expected.ticker == 'MSFT', ['bid', 'ask']] = np.nan
        assert_frame_equal(result, expected)

    def test_basic2(self):
        expected = self.read_data('asof2.csv')
        trades = self.read_data('trades2.csv')
        quotes = self.read_data('quotes2.csv', dedupe=True)

        result = merge_asof(trades, quotes,
                            on='time',
                            by='ticker')
        assert_frame_equal(result, expected)

    def test_basic_no_by(self):
        f = lambda x: x[x.ticker == 'MSFT'].drop('ticker', axis=1) \
            .reset_index(drop=True)

        # just use a single ticker
        expected = f(self.asof)
        trades = f(self.trades)
        quotes = f(self.quotes)

        result = merge_asof(trades, quotes,
                            on='time')
        assert_frame_equal(result, expected)

    def test_valid_join_keys(self):
        trades = self.trades
        quotes = self.quotes

        with self.assertRaises(MergeError):
            merge_asof(trades, quotes,
                       left_on='time',
                       right_on='bid',
                       by='ticker')

        with self.assertRaises(MergeError):
            merge_asof(trades, quotes,
                       on=['time', 'ticker'],
                       by='ticker')

        with self.assertRaises(MergeError):
            merge_asof(trades, quotes,
                       by='ticker')

    def test_with_duplicates(self):
        q = pd.concat([self.quotes, self.quotes]).sort_values(
            ['time', 'ticker']).reset_index(drop=True)
        result = merge_asof(self.trades, q,
                            on='time',
                            by='ticker')
        expected = self.read_data('asof.csv')
        assert_frame_equal(result, expected)

    def test_with_duplicates_no_on(self):
        df1 = pd.DataFrame({'key': [1, 1, 3],
                            'left_val': [1, 2, 3]})
        df2 = pd.DataFrame({'key': [1, 2, 2],
                            'right_val': [1, 2, 3]})
        result = merge_asof(df1, df2, on='key')
        expected = pd.DataFrame({'key': [1, 1, 3],
                                 'left_val': [1, 2, 3],
                                 'right_val': [1, 1, 3]})
        assert_frame_equal(result, expected)
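        # Why right_val is [1, 1, 3]: each left key takes the last right row
        # with key <= the left key, so both key=1 rows match right_val 1 and
        # key=3 matches the later of the two key=2 rows, right_val 3.
        # Duplicate right keys are resolved in favour of the last row.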
    def test_valid_allow_exact_matches(self):
        trades = self.trades
        quotes = self.quotes

        with self.assertRaises(MergeError):
            merge_asof(trades, quotes,
                       on='time',
                       by='ticker',
                       allow_exact_matches='foo')

    def test_valid_tolerance(self):
        trades = self.trades
        quotes = self.quotes

        # dti
        merge_asof(trades, quotes,
                   on='time',
                   by='ticker',
                   tolerance=Timedelta('1s'))

        # integer
        merge_asof(trades.reset_index(), quotes.reset_index(),
                   on='index',
                   by='ticker',
                   tolerance=1)

        # incompat
        with self.assertRaises(MergeError):
            merge_asof(trades, quotes,
                       on='time',
                       by='ticker',
                       tolerance=1)

        # invalid
        with self.assertRaises(MergeError):
            merge_asof(trades.reset_index(), quotes.reset_index(),
                       on='index',
                       by='ticker',
                       tolerance=1.0)

        # invalid negative
        with self.assertRaises(MergeError):
            merge_asof(trades, quotes,
                       on='time',
                       by='ticker',
                       tolerance=-Timedelta('1s'))

        with self.assertRaises(MergeError):
            merge_asof(trades.reset_index(), quotes.reset_index(),
                       on='index',
                       by='ticker',
                       tolerance=-1)
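        # In short: tolerance must be compatible with the dtype of the 'on'
        # key (a Timedelta for a datetime key, an integer for an integer
        # key) and must be non-negative; anything else raises MergeError.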
    def test_non_sorted(self):
        trades = self.trades.sort_values('time', ascending=False)
        quotes = self.quotes.sort_values('time', ascending=False)

        # both frames must already be sorted on the 'on' key (time)
        self.assertFalse(trades.time.is_monotonic)
        self.assertFalse(quotes.time.is_monotonic)
        with self.assertRaises(ValueError):
            merge_asof(trades, quotes,
                       on='time',
                       by='ticker')

        trades = self.trades.sort_values('time')
        self.assertTrue(trades.time.is_monotonic)
        self.assertFalse(quotes.time.is_monotonic)
        with self.assertRaises(ValueError):
            merge_asof(trades, quotes,
                       on='time',
                       by='ticker')

        quotes = self.quotes.sort_values('time')
        self.assertTrue(trades.time.is_monotonic)
        self.assertTrue(quotes.time.is_monotonic)

        # ok, though has dupes
        merge_asof(trades, self.quotes,
                   on='time',
                   by='ticker')
    def test_tolerance(self):
        trades = self.trades
        quotes = self.quotes

        result = merge_asof(trades, quotes,
                            on='time',
                            by='ticker',
                            tolerance=Timedelta('1day'))
        expected = self.tolerance
        assert_frame_equal(result, expected)

    def test_allow_exact_matches(self):
        result = merge_asof(self.trades, self.quotes,
                            on='time',
                            by='ticker',
                            allow_exact_matches=False)
        expected = self.allow_exact_matches
        assert_frame_equal(result, expected)

    def test_allow_exact_matches_and_tolerance(self):
        result = merge_asof(self.trades, self.quotes,
                            on='time',
                            by='ticker',
                            tolerance=Timedelta('100ms'),
                            allow_exact_matches=False)
        expected = self.allow_exact_matches_and_tolerance
        assert_frame_equal(result, expected)
    def test_allow_exact_matches_and_tolerance2(self):
        # GH 13695
        df1 = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
            'username': ['bob']})
        df2 = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.000',
                                    '2016-07-15 13:30:00.030']),
            'version': [1, 2]})

        result = pd.merge_asof(df1, df2, on='time')
        expected = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
            'username': ['bob'],
            'version': [2]})
        assert_frame_equal(result, expected)

        result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False)
        expected = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
            'username': ['bob'],
            'version': [1]})
        assert_frame_equal(result, expected)

        result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False,
                               tolerance=pd.Timedelta('10ms'))
        expected = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
            'username': ['bob'],
            'version': [np.nan]})
        assert_frame_equal(result, expected)
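        # Walk-through: the left row at .030 matches version 2 by default
        # (exact match allowed), version 1 when exact matches are excluded
        # (the .000 row is the nearest strictly-earlier one), and nothing
        # once a 10ms tolerance is added, since .000 is 30ms back.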
    def test_allow_exact_matches_and_tolerance3(self):
        # GH 13709
        df1 = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030',
                                    '2016-07-15 13:30:00.030']),
            'username': ['bob', 'charlie']})
        df2 = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.000',
                                    '2016-07-15 13:30:00.030']),
            'version': [1, 2]})

        result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False,
                               tolerance=pd.Timedelta('10ms'))
        expected = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030',
                                    '2016-07-15 13:30:00.030']),
            'username': ['bob', 'charlie'],
            'version': [np.nan, np.nan]})
        assert_frame_equal(result, expected)
    def test_by_int(self):
        # the 'by' matching is specialized by dtype, so check an integer key
        df1 = pd.DataFrame({
            'time': pd.to_datetime(['20160525 13:30:00.020',
                                    '20160525 13:30:00.030',
                                    '20160525 13:30:00.040',
                                    '20160525 13:30:00.050',
                                    '20160525 13:30:00.060']),
            'key': [1, 2, 1, 3, 2],
            'value1': [1.1, 1.2, 1.3, 1.4, 1.5]},
            columns=['time', 'key', 'value1'])

        df2 = pd.DataFrame({
            'time': pd.to_datetime(['20160525 13:30:00.015',
                                    '20160525 13:30:00.020',
                                    '20160525 13:30:00.025',
                                    '20160525 13:30:00.035',
                                    '20160525 13:30:00.040',
                                    '20160525 13:30:00.055',
                                    '20160525 13:30:00.060',
                                    '20160525 13:30:00.065']),
            'key': [2, 1, 1, 3, 2, 1, 2, 3],
            'value2': [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8]},
            columns=['time', 'key', 'value2'])

        result = pd.merge_asof(df1, df2, on='time', by='key')

        expected = pd.DataFrame({
            'time': pd.to_datetime(['20160525 13:30:00.020',
                                    '20160525 13:30:00.030',
                                    '20160525 13:30:00.040',
                                    '20160525 13:30:00.050',
                                    '20160525 13:30:00.060']),
            'key': [1, 2, 1, 3, 2],
            'value1': [1.1, 1.2, 1.3, 1.4, 1.5],
            'value2': [2.2, 2.1, 2.3, 2.4, 2.7]},
            columns=['time', 'key', 'value1', 'value2'])

        assert_frame_equal(result, expected)
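        # Matching is done within each 'key' group: e.g. the first left row
        # (key=1, time=.020) picks the latest key=1 row at or before .020,
        # which is the .020 row with value2=2.2.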
    def test_on_float(self):
        # mimics how to determine the minimum-price variation
        df1 = pd.DataFrame({
            'price': [5.01, 0.0023, 25.13, 340.05, 30.78, 1040.90, 0.0078],
            'symbol': list("ABCDEFG")},
            columns=['symbol', 'price'])

        df2 = pd.DataFrame({
            'price': [0.0, 1.0, 100.0],
            'mpv': [0.0001, 0.01, 0.05]},
            columns=['price', 'mpv'])

        df1 = df1.sort_values('price').reset_index(drop=True)

        result = pd.merge_asof(df1, df2, on='price')

        expected = pd.DataFrame({
            'symbol': list("BGACEDF"),
            'price': [0.0023, 0.0078, 5.01, 25.13, 30.78, 340.05, 1040.90],
            'mpv': [0.0001, 0.0001, 0.01, 0.01, 0.01, 0.05, 0.05]},
            columns=['symbol', 'price', 'mpv'])

        assert_frame_equal(result, expected)
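        # df2 acts as a tiered lookup table: each price picks the mpv of the
        # highest tier at or below it, e.g. 0.0023 falls in the [0.0, 1.0)
        # tier (mpv=0.0001) and 340.05 in the >=100.0 tier (mpv=0.05).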
    def test_on_float_by_int(self):
        # type specialize both "by" and "on" parameters
        df1 = pd.DataFrame({
            'symbol': list("AAABBBCCC"),
            'exch': [1, 2, 3, 1, 2, 3, 1, 2, 3],
            'price': [3.26, 3.2599, 3.2598, 12.58, 12.59,
                      12.5, 378.15, 378.2, 378.25]},
            columns=['symbol', 'exch', 'price'])

        df2 = pd.DataFrame({
            'exch': [1, 1, 1, 2, 2, 2, 3, 3, 3],
            'price': [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0],
            'mpv': [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0]},
            columns=['exch', 'price', 'mpv'])

        df1 = df1.sort_values('price').reset_index(drop=True)
        df2 = df2.sort_values('price').reset_index(drop=True)

        result = pd.merge_asof(df1, df2, on='price', by='exch')

        expected = pd.DataFrame({
            'symbol': list("AAABBBCCC"),
            'exch': [3, 2, 1, 3, 1, 2, 1, 2, 3],
            'price': [3.2598, 3.2599, 3.26, 12.5, 12.58,
                      12.59, 378.15, 378.2, 378.25],
            'mpv': [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25]},
            columns=['symbol', 'exch', 'price', 'mpv'])

        assert_frame_equal(result, expected)


if __name__ == '__main__':
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)