/scripts/bench_join.py

https://github.com/dieterv77/pandas-1
Python | 211 lines | 145 code | 56 blank | 10 comment | 7 complexity | ec3d69835aa35f0b05f6e7bf71cc4198 MD5 | raw file
  1. from pandas.compat import range, lrange
  2. import numpy as np
  3. import pandas.lib as lib
  4. from pandas import *
  5. from copy import deepcopy
  6. import time
# ---------------------------------------------------------------------------
# Module-level benchmark fixtures.
# Everything in this section runs at import time and allocates several
# arrays of length n.
# ---------------------------------------------------------------------------
n = 1000000  # number of keys / rows in each join operand
K = 1  # number of value columns in the 2-D operands and frames
pct_overlap = 0.2  # b's key range starts at n * pct_overlap
# Integer join keys: a is 0..n-1, b is a range of the same length shifted
# upwards by n * pct_overlap.
a = np.arange(n, dtype=np.int64)
b = np.arange(n * pct_overlap, n * (1 + pct_overlap), dtype=np.int64)
# Date-based keys. Not referenced by any function visible in this file.
dr1 = DateRange('1/1/2000', periods=n, offset=datetools.Minute())
dr2 = DateRange(
    dr1[int(pct_overlap * n)], periods=n, offset=datetools.Minute(2))
# Object-dtype copies of the integer keys.
aobj = a.astype(object)
bobj = b.astype(object)
# Random payloads: 1-D (length n) and 2-D (n x K) for each side.
# NOTE: the order of these four draws determines which random numbers each
# array receives when the global NumPy seed is fixed.
av = np.random.randn(n)
bv = np.random.randn(n)
avf = np.random.randn(n, K)
bvf = np.random.randn(n, K)
# pandas containers built from the arrays above; the two frames use
# disjoint column labels (0..K-1 and K..2K-1).
a_series = Series(av, index=a)
b_series = Series(bv, index=b)
a_frame = DataFrame(avf, index=a, columns=lrange(K))
b_frame = DataFrame(bvf, index=b, columns=lrange(K, 2 * K))
  25. def do_left_join(a, b, av, bv):
  26. out = np.empty((len(a), 2))
  27. lib.left_join_1d(a, b, av, bv, out)
  28. return out
  29. def do_outer_join(a, b, av, bv):
  30. result_index, aindexer, bindexer = lib.outer_join_indexer(a, b)
  31. result = np.empty((2, len(result_index)))
  32. lib.take_1d(av, aindexer, result[0])
  33. lib.take_1d(bv, bindexer, result[1])
  34. return result_index, result
  35. def do_inner_join(a, b, av, bv):
  36. result_index, aindexer, bindexer = lib.inner_join_indexer(a, b)
  37. result = np.empty((2, len(result_index)))
  38. lib.take_1d(av, aindexer, result[0])
  39. lib.take_1d(bv, bindexer, result[1])
  40. return result_index, result
# Profiling / debugging helpers.  Neither `prof` nor `set_trace` is used by
# the code visible in this file; presumably they are kept at module level
# for interactive sessions -- confirm before removing.
from line_profiler import LineProfiler
prof = LineProfiler()
from pandas.util.testing import set_trace
  44. def do_left_join_python(a, b, av, bv):
  45. indexer, mask = lib.ordered_left_join_int64(a, b)
  46. n, ak = av.shape
  47. _, bk = bv.shape
  48. result_width = ak + bk
  49. result = np.empty((result_width, n), dtype=np.float64)
  50. result[:ak] = av.T
  51. bchunk = result[ak:]
  52. _take_multi(bv.T, indexer, bchunk)
  53. np.putmask(bchunk, np.tile(mask, bk), np.nan)
  54. return result
  55. def _take_multi(data, indexer, out):
  56. if not data.flags.c_contiguous:
  57. data = data.copy()
  58. for i in range(data.shape[0]):
  59. data[i].take(indexer, out=out[i])
  60. def do_left_join_multi(a, b, av, bv):
  61. n, ak = av.shape
  62. _, bk = bv.shape
  63. result = np.empty((n, ak + bk), dtype=np.float64)
  64. lib.left_join_2d(a, b, av, bv, result)
  65. return result
  66. def do_outer_join_multi(a, b, av, bv):
  67. n, ak = av.shape
  68. _, bk = bv.shape
  69. result_index, rindexer, lindexer = lib.outer_join_indexer(a, b)
  70. result = np.empty((len(result_index), ak + bk), dtype=np.float64)
  71. lib.take_join_contiguous(av, bv, lindexer, rindexer, result)
  72. # result = np.empty((ak + bk, len(result_index)), dtype=np.float64)
  73. # lib.take_axis0(av, rindexer, out=result[:ak].T)
  74. # lib.take_axis0(bv, lindexer, out=result[ak:].T)
  75. return result_index, result
  76. def do_inner_join_multi(a, b, av, bv):
  77. n, ak = av.shape
  78. _, bk = bv.shape
  79. result_index, rindexer, lindexer = lib.inner_join_indexer(a, b)
  80. result = np.empty((len(result_index), ak + bk), dtype=np.float64)
  81. lib.take_join_contiguous(av, bv, lindexer, rindexer, result)
  82. # result = np.empty((ak + bk, len(result_index)), dtype=np.float64)
  83. # lib.take_axis0(av, rindexer, out=result[:ak].T)
  84. # lib.take_axis0(bv, lindexer, out=result[ak:].T)
  85. return result_index, result
  86. def do_left_join_multi_v2(a, b, av, bv):
  87. indexer, mask = lib.ordered_left_join_int64(a, b)
  88. bv_taken = bv.take(indexer, axis=0)
  89. np.putmask(bv_taken, mask.repeat(bv.shape[1]), np.nan)
  90. return np.concatenate((av, bv_taken), axis=1)
  91. def do_left_join_series(a, b):
  92. return b.reindex(a.index)
  93. def do_left_join_frame(a, b):
  94. a.index._indexMap = None
  95. b.index._indexMap = None
  96. return a.join(b, how='left')
# Commented-out snippet: calls lib.inner_join_indexer on two short int64
# arrays and prints the result.  Uncommenting it would rebind the
# module-level `a` and `b` defined earlier in this file.
# a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
# b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
# print(lib.inner_join_indexer(a, b))

# Preallocated 10 x 120000 float buffer (uninitialised).  Not referenced by
# any function visible in this file.
out = np.empty((10, 120000))
  101. def join(a, b, av, bv, how="left"):
  102. func_dict = {'left': do_left_join_multi,
  103. 'outer': do_outer_join_multi,
  104. 'inner': do_inner_join_multi}
  105. f = func_dict[how]
  106. return f(a, b, av, bv)
  107. def bench_python(n=100000, pct_overlap=0.20, K=1):
  108. import gc
  109. ns = [2, 3, 4, 5, 6]
  110. iterations = 200
  111. pct_overlap = 0.2
  112. kinds = ['outer', 'left', 'inner']
  113. all_results = {}
  114. for logn in ns:
  115. n = 10 ** logn
  116. a = np.arange(n, dtype=np.int64)
  117. b = np.arange(n * pct_overlap, n * pct_overlap + n, dtype=np.int64)
  118. avf = np.random.randn(n, K)
  119. bvf = np.random.randn(n, K)
  120. a_frame = DataFrame(avf, index=a, columns=lrange(K))
  121. b_frame = DataFrame(bvf, index=b, columns=lrange(K, 2 * K))
  122. all_results[logn] = result = {}
  123. for kind in kinds:
  124. gc.disable()
  125. elapsed = 0
  126. _s = time.clock()
  127. for i in range(iterations):
  128. if i % 10 == 0:
  129. elapsed += time.clock() - _s
  130. gc.collect()
  131. _s = time.clock()
  132. a_frame.join(b_frame, how=kind)
  133. # join(a, b, avf, bvf, how=kind)
  134. elapsed += time.clock() - _s
  135. gc.enable()
  136. result[kind] = (elapsed / iterations) * 1000
  137. return DataFrame(all_results, index=kinds)
  138. def bench_xts(n=100000, pct_overlap=0.20):
  139. from pandas.rpy.common import r
  140. r('a <- 5')
  141. xrng = '1:%d' % n
  142. start = n * pct_overlap + 1
  143. end = n + start - 1
  144. yrng = '%d:%d' % (start, end)
  145. r('library(xts)')
  146. iterations = 500
  147. kinds = ['left', 'outer', 'inner']
  148. result = {}
  149. for kind in kinds:
  150. r('x <- xts(rnorm(%d), as.POSIXct(Sys.Date()) + %s)' % (n, xrng))
  151. r('y <- xts(rnorm(%d), as.POSIXct(Sys.Date()) + %s)' % (n, yrng))
  152. stmt = 'for (i in 1:%d) merge(x, y, join="%s")' % (iterations, kind)
  153. elapsed = r('as.list(system.time(%s, gcFirst=F))$elapsed' % stmt)[0]
  154. result[kind] = (elapsed / iterations) * 1000
  155. return Series(result)