PageRenderTime 30ms CodeModel.GetById 1ms RepoModel.GetById 1ms app.codeStats 0ms

/vb_suite/indexing.py

https://github.com/hoffstein/pandas
Python | 292 lines | 290 code | 2 blank | 0 comment | 0 complexity | 1ebdd74a6292235e215811bd3136b6a0 MD5 | raw file
  1. from vbench.benchmark import Benchmark
  2. from datetime import datetime
  3. SECTION = 'Indexing and scalar value access'
  4. common_setup = """from .pandas_vb_common import *
  5. """
  6. #----------------------------------------------------------------------
  7. # Series.__getitem__, get_value, __getitem__(slice)
  8. setup = common_setup + """
  9. tm.N = 1000
  10. ts = tm.makeTimeSeries()
  11. dt = ts.index[500]
  12. """
  13. statement = "ts[dt]"
  14. bm_getitem = Benchmark(statement, setup, ncalls=100000,
  15. name='time_series_getitem_scalar')
  16. setup = common_setup + """
  17. index = tm.makeStringIndex(1000)
  18. s = Series(np.random.rand(1000), index=index)
  19. idx = index[100]
  20. """
  21. statement = "s.get_value(idx)"
  22. bm_get_value = Benchmark(statement, setup,
  23. name='series_get_value',
  24. start_date=datetime(2011, 11, 12))
  25. setup = common_setup + """
  26. index = tm.makeStringIndex(1000000)
  27. s = Series(np.random.rand(1000000), index=index)
  28. """
  29. series_getitem_pos_slice = Benchmark("s[:800000]", setup,
  30. name="series_getitem_pos_slice")
  31. setup = common_setup + """
  32. index = tm.makeStringIndex(1000000)
  33. s = Series(np.random.rand(1000000), index=index)
  34. lbl = s.index[800000]
  35. """
  36. series_getitem_label_slice = Benchmark("s[:lbl]", setup,
  37. name="series_getitem_label_slice")
  38. #----------------------------------------------------------------------
  39. # DataFrame __getitem__
  40. setup = common_setup + """
  41. index = tm.makeStringIndex(1000)
  42. columns = tm.makeStringIndex(30)
  43. df = DataFrame(np.random.rand(1000, 30), index=index,
  44. columns=columns)
  45. idx = index[100]
  46. col = columns[10]
  47. """
  48. statement = "df[col][idx]"
  49. bm_df_getitem = Benchmark(statement, setup,
  50. name='dataframe_getitem_scalar')
  51. setup = common_setup + """
  52. try:
  53. klass = DataMatrix
  54. except:
  55. klass = DataFrame
  56. index = tm.makeStringIndex(1000)
  57. columns = tm.makeStringIndex(30)
  58. df = klass(np.random.rand(1000, 30), index=index, columns=columns)
  59. idx = index[100]
  60. col = columns[10]
  61. """
  62. statement = "df[col][idx]"
  63. bm_df_getitem2 = Benchmark(statement, setup,
  64. name='datamatrix_getitem_scalar')
  65. #----------------------------------------------------------------------
  66. # ix get scalar
  67. setup = common_setup + """
  68. index = tm.makeStringIndex(1000)
  69. columns = tm.makeStringIndex(30)
  70. df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns)
  71. idx = index[100]
  72. col = columns[10]
  73. """
  74. indexing_frame_get_value_ix = Benchmark("df.ix[idx,col]", setup,
  75. name='indexing_frame_get_value_ix',
  76. start_date=datetime(2011, 11, 12))
  77. indexing_frame_get_value = Benchmark("df.get_value(idx,col)", setup,
  78. name='indexing_frame_get_value',
  79. start_date=datetime(2011, 11, 12))
  80. setup = common_setup + """
  81. mi = MultiIndex.from_tuples([(x,y) for x in range(1000) for y in range(1000)])
  82. s = Series(np.random.randn(1000000), index=mi)
  83. """
  84. series_xs_mi_ix = Benchmark("s.ix[999]", setup,
  85. name='series_xs_mi_ix',
  86. start_date=datetime(2013, 1, 1))
  87. setup = common_setup + """
  88. mi = MultiIndex.from_tuples([(x,y) for x in range(1000) for y in range(1000)])
  89. s = Series(np.random.randn(1000000), index=mi)
  90. df = DataFrame(s)
  91. """
  92. frame_xs_mi_ix = Benchmark("df.ix[999]", setup,
  93. name='frame_xs_mi_ix',
  94. start_date=datetime(2013, 1, 1))
  95. #----------------------------------------------------------------------
  96. # Boolean DataFrame row selection
  97. setup = common_setup + """
  98. df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D'])
  99. indexer = df['B'] > 0
  100. obj_indexer = indexer.astype('O')
  101. """
  102. indexing_dataframe_boolean_rows = \
  103. Benchmark("df[indexer]", setup, name='indexing_dataframe_boolean_rows')
  104. indexing_dataframe_boolean_rows_object = \
  105. Benchmark("df[obj_indexer]", setup,
  106. name='indexing_dataframe_boolean_rows_object')
  107. setup = common_setup + """
  108. df = DataFrame(np.random.randn(50000, 100))
  109. df2 = DataFrame(np.random.randn(50000, 100))
  110. """
  111. indexing_dataframe_boolean = \
  112. Benchmark("df > df2", setup, name='indexing_dataframe_boolean',
  113. start_date=datetime(2012, 1, 1))
  114. setup = common_setup + """
  115. try:
  116. import pandas.computation.expressions as expr
  117. except:
  118. expr = None
  119. if expr is None:
  120. raise NotImplementedError
  121. df = DataFrame(np.random.randn(50000, 100))
  122. df2 = DataFrame(np.random.randn(50000, 100))
  123. expr.set_numexpr_threads(1)
  124. """
  125. indexing_dataframe_boolean_st = \
  126. Benchmark("df > df2", setup, name='indexing_dataframe_boolean_st',cleanup="expr.set_numexpr_threads()",
  127. start_date=datetime(2013, 2, 26))
  128. setup = common_setup + """
  129. try:
  130. import pandas.computation.expressions as expr
  131. except:
  132. expr = None
  133. if expr is None:
  134. raise NotImplementedError
  135. df = DataFrame(np.random.randn(50000, 100))
  136. df2 = DataFrame(np.random.randn(50000, 100))
  137. expr.set_use_numexpr(False)
  138. """
  139. indexing_dataframe_boolean_no_ne = \
  140. Benchmark("df > df2", setup, name='indexing_dataframe_boolean_no_ne',cleanup="expr.set_use_numexpr(True)",
  141. start_date=datetime(2013, 2, 26))
  142. #----------------------------------------------------------------------
  143. # MultiIndex sortlevel
  144. setup = common_setup + """
  145. a = np.repeat(np.arange(100), 1000)
  146. b = np.tile(np.arange(1000), 100)
  147. midx = MultiIndex.from_arrays([a, b])
  148. midx = midx.take(np.random.permutation(np.arange(100000)))
  149. """
  150. sort_level_zero = Benchmark("midx.sortlevel(0)", setup,
  151. start_date=datetime(2012, 1, 1))
  152. sort_level_one = Benchmark("midx.sortlevel(1)", setup,
  153. start_date=datetime(2012, 1, 1))
  154. #----------------------------------------------------------------------
  155. # Panel subset selection
  156. setup = common_setup + """
  157. p = Panel(np.random.randn(100, 100, 100))
  158. inds = range(0, 100, 10)
  159. """
  160. indexing_panel_subset = Benchmark('p.ix[inds, inds, inds]', setup,
  161. start_date=datetime(2012, 1, 1))
  162. #----------------------------------------------------------------------
  163. # Iloc
  164. setup = common_setup + """
  165. df = DataFrame({'A' : [0.1] * 3000, 'B' : [1] * 3000})
  166. idx = np.array(range(30)) * 99
  167. df2 = DataFrame({'A' : [0.1] * 1000, 'B' : [1] * 1000})
  168. df2 = concat([df2, 2*df2, 3*df2])
  169. """
  170. frame_iloc_dups = Benchmark('df2.iloc[idx]', setup,
  171. start_date=datetime(2013, 1, 1))
  172. frame_loc_dups = Benchmark('df2.loc[idx]', setup,
  173. start_date=datetime(2013, 1, 1))
  174. setup = common_setup + """
  175. df = DataFrame(dict( A = [ 'foo'] * 1000000))
  176. """
  177. frame_iloc_big = Benchmark('df.iloc[:100,0]', setup,
  178. start_date=datetime(2013, 1, 1))
  179. #----------------------------------------------------------------------
  180. # basic tests for [], .loc[], .iloc[] and .ix[]
  181. setup = common_setup + """
  182. s = Series(np.random.rand(1000000))
  183. """
  184. series_getitem_scalar = Benchmark("s[800000]", setup)
  185. series_getitem_slice = Benchmark("s[:800000]", setup)
  186. series_getitem_list_like = Benchmark("s[[800000]]", setup)
  187. series_getitem_array = Benchmark("s[np.arange(10000)]", setup)
  188. series_loc_scalar = Benchmark("s.loc[800000]", setup)
  189. series_loc_slice = Benchmark("s.loc[:800000]", setup)
  190. series_loc_list_like = Benchmark("s.loc[[800000]]", setup)
  191. series_loc_array = Benchmark("s.loc[np.arange(10000)]", setup)
  192. series_iloc_scalar = Benchmark("s.iloc[800000]", setup)
  193. series_iloc_slice = Benchmark("s.iloc[:800000]", setup)
  194. series_iloc_list_like = Benchmark("s.iloc[[800000]]", setup)
  195. series_iloc_array = Benchmark("s.iloc[np.arange(10000)]", setup)
  196. series_ix_scalar = Benchmark("s.ix[800000]", setup)
  197. series_ix_slice = Benchmark("s.ix[:800000]", setup)
  198. series_ix_list_like = Benchmark("s.ix[[800000]]", setup)
  199. series_ix_array = Benchmark("s.ix[np.arange(10000)]", setup)
  200. # multi-index slicing
  201. setup = common_setup + """
  202. np.random.seed(1234)
  203. idx=pd.IndexSlice
  204. n=100000
  205. mdt = pandas.DataFrame()
  206. mdt['A'] = np.random.choice(range(10000,45000,1000), n)
  207. mdt['B'] = np.random.choice(range(10,400), n)
  208. mdt['C'] = np.random.choice(range(1,150), n)
  209. mdt['D'] = np.random.choice(range(10000,45000), n)
  210. mdt['x'] = np.random.choice(range(400), n)
  211. mdt['y'] = np.random.choice(range(25), n)
  212. test_A = 25000
  213. test_B = 25
  214. test_C = 40
  215. test_D = 35000
  216. eps_A = 5000
  217. eps_B = 5
  218. eps_C = 5
  219. eps_D = 5000
  220. mdt2 = mdt.set_index(['A','B','C','D']).sortlevel()
  221. """
  222. multiindex_slicers = Benchmark('mdt2.loc[idx[test_A-eps_A:test_A+eps_A,test_B-eps_B:test_B+eps_B,test_C-eps_C:test_C+eps_C,test_D-eps_D:test_D+eps_D],:]', setup,
  223. start_date=datetime(2015, 1, 1))
  224. #----------------------------------------------------------------------
  225. # take
  226. setup = common_setup + """
  227. s = Series(np.random.rand(100000))
  228. ts = Series(np.random.rand(100000),
  229. index=date_range('2011-01-01', freq='S', periods=100000))
  230. indexer = [True, False, True, True, False] * 20000
  231. """
  232. series_take_intindex = Benchmark("s.take(indexer)", setup)
  233. series_take_dtindex = Benchmark("ts.take(indexer)", setup)