PageRenderTime 49ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/vb_suite/reindex.py

http://github.com/wesm/pandas
Python | 225 lines | 224 code | 1 blank | 0 comment | 0 complexity | 8bc28300c7cfd82e2b5009b3c7614fbe MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. from vbench.benchmark import Benchmark
  2. from datetime import datetime
  3. common_setup = """from .pandas_vb_common import *
  4. """
  5. #----------------------------------------------------------------------
  6. # DataFrame reindex columns
  7. setup = common_setup + """
  8. df = DataFrame(index=range(10000), data=np.random.rand(10000,30),
  9. columns=range(30))
  10. """
  11. statement = "df.reindex(columns=df.columns[1:5])"
  12. frame_reindex_columns = Benchmark(statement, setup)
  13. #----------------------------------------------------------------------
  14. setup = common_setup + """
  15. rng = DatetimeIndex(start='1/1/1970', periods=10000, freq=datetools.Minute())
  16. df = DataFrame(np.random.rand(10000, 10), index=rng,
  17. columns=range(10))
  18. df['foo'] = 'bar'
  19. rng2 = Index(rng[::2])
  20. """
  21. statement = "df.reindex(rng2)"
  22. dataframe_reindex = Benchmark(statement, setup)
  23. #----------------------------------------------------------------------
  24. # multiindex reindexing
  25. setup = common_setup + """
  26. N = 1000
  27. K = 20
  28. level1 = tm.makeStringIndex(N).values.repeat(K)
  29. level2 = np.tile(tm.makeStringIndex(K).values, N)
  30. index = MultiIndex.from_arrays([level1, level2])
  31. s1 = Series(np.random.randn(N * K), index=index)
  32. s2 = s1[::2]
  33. """
  34. statement = "s1.reindex(s2.index)"
  35. reindex_multi = Benchmark(statement, setup,
  36. name='reindex_multiindex',
  37. start_date=datetime(2011, 9, 1))
  38. #----------------------------------------------------------------------
  39. # Pad / backfill
  40. def pad(source_series, target_index):
  41. try:
  42. source_series.reindex(target_index, method='pad')
  43. except:
  44. source_series.reindex(target_index, fillMethod='pad')
  45. def backfill(source_series, target_index):
  46. try:
  47. source_series.reindex(target_index, method='backfill')
  48. except:
  49. source_series.reindex(target_index, fillMethod='backfill')
  50. setup = common_setup + """
  51. rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute())
  52. ts = Series(np.random.randn(len(rng)), index=rng)
  53. ts2 = ts[::2]
  54. ts3 = ts2.reindex(ts.index)
  55. ts4 = ts3.astype('float32')
  56. def pad(source_series, target_index):
  57. try:
  58. source_series.reindex(target_index, method='pad')
  59. except:
  60. source_series.reindex(target_index, fillMethod='pad')
  61. def backfill(source_series, target_index):
  62. try:
  63. source_series.reindex(target_index, method='backfill')
  64. except:
  65. source_series.reindex(target_index, fillMethod='backfill')
  66. """
  67. statement = "pad(ts2, ts.index)"
  68. reindex_daterange_pad = Benchmark(statement, setup,
  69. name="reindex_daterange_pad")
  70. statement = "backfill(ts2, ts.index)"
  71. reindex_daterange_backfill = Benchmark(statement, setup,
  72. name="reindex_daterange_backfill")
  73. reindex_fillna_pad = Benchmark("ts3.fillna(method='pad')", setup,
  74. name="reindex_fillna_pad",
  75. start_date=datetime(2011, 3, 1))
  76. reindex_fillna_pad_float32 = Benchmark("ts4.fillna(method='pad')", setup,
  77. name="reindex_fillna_pad_float32",
  78. start_date=datetime(2013, 1, 1))
  79. reindex_fillna_backfill = Benchmark("ts3.fillna(method='backfill')", setup,
  80. name="reindex_fillna_backfill",
  81. start_date=datetime(2011, 3, 1))
  82. reindex_fillna_backfill_float32 = Benchmark("ts4.fillna(method='backfill')", setup,
  83. name="reindex_fillna_backfill_float32",
  84. start_date=datetime(2013, 1, 1))
  85. #----------------------------------------------------------------------
  86. # align on level
  87. setup = common_setup + """
  88. index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)],
  89. labels=[np.arange(10).repeat(10000),
  90. np.tile(np.arange(100).repeat(100), 10),
  91. np.tile(np.tile(np.arange(100), 100), 10)])
  92. random.shuffle(index.values)
  93. df = DataFrame(np.random.randn(len(index), 4), index=index)
  94. df_level = DataFrame(np.random.randn(100, 4), index=index.levels[1])
  95. """
  96. reindex_frame_level_align = \
  97. Benchmark("df.align(df_level, level=1, copy=False)", setup,
  98. name='reindex_frame_level_align',
  99. start_date=datetime(2011, 12, 27))
  100. reindex_frame_level_reindex = \
  101. Benchmark("df_level.reindex(df.index, level=1)", setup,
  102. name='reindex_frame_level_reindex',
  103. start_date=datetime(2011, 12, 27))
  104. #----------------------------------------------------------------------
  105. # sort_index, drop_duplicates
  106. # pathological, but realistic
  107. setup = common_setup + """
  108. N = 10000
  109. K = 10
  110. key1 = tm.makeStringIndex(N).values.repeat(K)
  111. key2 = tm.makeStringIndex(N).values.repeat(K)
  112. df = DataFrame({'key1' : key1, 'key2' : key2,
  113. 'value' : np.random.randn(N * K)})
  114. col_array_list = list(df.values.T)
  115. """
  116. statement = "df.sort_index(by=['key1', 'key2'])"
  117. frame_sort_index_by_columns = Benchmark(statement, setup,
  118. start_date=datetime(2011, 11, 1))
  119. # drop_duplicates
  120. statement = "df.drop_duplicates(['key1', 'key2'])"
  121. frame_drop_duplicates = Benchmark(statement, setup,
  122. start_date=datetime(2011, 11, 15))
  123. statement = "df.drop_duplicates(['key1', 'key2'], inplace=True)"
  124. frame_drop_dup_inplace = Benchmark(statement, setup,
  125. start_date=datetime(2012, 5, 16))
  126. lib_fast_zip = Benchmark('lib.fast_zip(col_array_list)', setup,
  127. name='lib_fast_zip',
  128. start_date=datetime(2012, 1, 1))
  129. setup = setup + """
  130. df.ix[:10000, :] = np.nan
  131. """
  132. statement2 = "df.drop_duplicates(['key1', 'key2'])"
  133. frame_drop_duplicates_na = Benchmark(statement2, setup,
  134. start_date=datetime(2012, 5, 15))
  135. lib_fast_zip_fillna = Benchmark('lib.fast_zip_fillna(col_array_list)', setup,
  136. start_date=datetime(2012, 5, 15))
  137. statement2 = "df.drop_duplicates(['key1', 'key2'], inplace=True)"
  138. frame_drop_dup_na_inplace = Benchmark(statement2, setup,
  139. start_date=datetime(2012, 5, 16))
  140. setup = common_setup + """
  141. s = Series(np.random.randint(0, 1000, size=10000))
  142. s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10))
  143. """
  144. series_drop_duplicates_int = Benchmark('s.drop_duplicates()', setup,
  145. start_date=datetime(2012, 11, 27))
  146. series_drop_duplicates_string = \
  147. Benchmark('s2.drop_duplicates()', setup,
  148. start_date=datetime(2012, 11, 27))
  149. #----------------------------------------------------------------------
  150. # fillna, many columns
  151. setup = common_setup + """
  152. values = np.random.randn(1000, 1000)
  153. values[::2] = np.nan
  154. df = DataFrame(values)
  155. """
  156. frame_fillna_many_columns_pad = Benchmark("df.fillna(method='pad')",
  157. setup,
  158. start_date=datetime(2011, 3, 1))
  159. #----------------------------------------------------------------------
  160. # blog "pandas escaped the zoo"
  161. setup = common_setup + """
  162. n = 50000
  163. indices = tm.makeStringIndex(n)
  164. def sample(values, k):
  165. from random import shuffle
  166. sampler = np.arange(len(values))
  167. shuffle(sampler)
  168. return values.take(sampler[:k])
  169. subsample_size = 40000
  170. x = Series(np.random.randn(50000), indices)
  171. y = Series(np.random.randn(subsample_size),
  172. index=sample(indices, subsample_size))
  173. """
  174. series_align_irregular_string = Benchmark("x + y", setup,
  175. start_date=datetime(2010, 6, 1))