PageRenderTime 44ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/tseries/resample.py

http://github.com/pydata/pandas
Python | 448 lines | 373 code | 46 blank | 29 comment | 53 complexity | e2ef86f1423dc11e662e1513904a4005 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. from datetime import timedelta
  2. import numpy as np
  3. from pandas.core.groupby import BinGrouper, Grouper
  4. from pandas.tseries.frequencies import to_offset, is_subperiod, is_superperiod
  5. from pandas.tseries.index import DatetimeIndex, date_range
  6. from pandas.tseries.offsets import DateOffset, Tick, _delta_to_nanoseconds
  7. from pandas.tseries.period import PeriodIndex, period_range
  8. import pandas.tseries.tools as tools
  9. import pandas.core.common as com
  10. import pandas.compat as compat
  11. from pandas.lib import Timestamp
  12. import pandas.lib as lib
  13. import pandas.tslib as tslib
# Name of the aggregation TimeGrouper._agg_method falls back to when the
# grouper's `how` is falsy.
_DEFAULT_METHOD = 'mean'
  15. class TimeGrouper(Grouper):
  16. """
  17. Custom groupby class for time-interval grouping
  18. Parameters
  19. ----------
  20. freq : pandas date offset or offset alias for identifying bin edges
  21. closed : closed end of interval; left or right
  22. label : interval boundary to use for labeling; left or right
  23. nperiods : optional, integer
  24. convention : {'start', 'end', 'e', 's'}
  25. If axis is PeriodIndex
  26. Notes
  27. -----
  28. Use begin, end, nperiods to generate intervals that cannot be derived
  29. directly from the associated object
  30. """
  31. def __init__(self, freq='Min', closed=None, label=None, how='mean',
  32. nperiods=None, axis=0,
  33. fill_method=None, limit=None, loffset=None, kind=None,
  34. convention=None, base=0, **kwargs):
  35. freq = to_offset(freq)
  36. end_types = set(['M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W'])
  37. rule = freq.rule_code
  38. if (rule in end_types or
  39. ('-' in rule and rule[:rule.find('-')] in end_types)):
  40. if closed is None:
  41. closed = 'right'
  42. if label is None:
  43. label = 'right'
  44. else:
  45. if closed is None:
  46. closed = 'left'
  47. if label is None:
  48. label = 'left'
  49. self.closed = closed
  50. self.label = label
  51. self.nperiods = nperiods
  52. self.kind = kind
  53. self.convention = convention or 'E'
  54. self.convention = self.convention.lower()
  55. self.loffset = loffset
  56. self.how = how
  57. self.fill_method = fill_method
  58. self.limit = limit
  59. self.base = base
  60. # always sort time groupers
  61. kwargs['sort'] = True
  62. super(TimeGrouper, self).__init__(freq=freq, axis=axis, **kwargs)
  63. def resample(self, obj):
  64. self._set_grouper(obj, sort=True)
  65. ax = self.grouper
  66. if isinstance(ax, DatetimeIndex):
  67. rs = self._resample_timestamps()
  68. elif isinstance(ax, PeriodIndex):
  69. offset = to_offset(self.freq)
  70. if offset.n > 1:
  71. if self.kind == 'period': # pragma: no cover
  72. print('Warning: multiple of frequency -> timestamps')
  73. # Cannot have multiple of periods, convert to timestamp
  74. self.kind = 'timestamp'
  75. if self.kind is None or self.kind == 'period':
  76. rs = self._resample_periods()
  77. else:
  78. obj = self.obj.to_timestamp(how=self.convention)
  79. self._set_grouper(obj)
  80. rs = self._resample_timestamps()
  81. elif len(ax) == 0:
  82. return self.obj
  83. else: # pragma: no cover
  84. raise TypeError('Only valid with DatetimeIndex or PeriodIndex')
  85. rs_axis = rs._get_axis(self.axis)
  86. rs_axis.name = ax.name
  87. return rs
  88. def _get_grouper(self, obj):
  89. self._set_grouper(obj)
  90. return self._get_binner_for_resample()
  91. def _get_binner_for_resample(self):
  92. # create the BinGrouper
  93. # assume that self.set_grouper(obj) has already been called
  94. ax = self.ax
  95. if self.kind is None or self.kind == 'timestamp':
  96. self.binner, bins, binlabels = self._get_time_bins(ax)
  97. else:
  98. self.binner, bins, binlabels = self._get_time_period_bins(ax)
  99. self.grouper = BinGrouper(bins, binlabels)
  100. return self.binner, self.grouper, self.obj
  101. def _get_binner_for_grouping(self, obj):
  102. # return an ordering of the transformed group labels,
  103. # suitable for multi-grouping, e.g the labels for
  104. # the resampled intervals
  105. ax = self._set_grouper(obj)
  106. self._get_binner_for_resample()
  107. # create the grouper
  108. binner = self.binner
  109. l = []
  110. for key, group in self.grouper.get_iterator(ax):
  111. l.extend([key]*len(group))
  112. grouper = binner.__class__(l,freq=binner.freq,name=binner.name)
  113. # since we may have had to sort
  114. # may need to reorder groups here
  115. if self.indexer is not None:
  116. indexer = self.indexer.argsort(kind='quicksort')
  117. grouper = grouper.take(indexer)
  118. return grouper
  119. def _get_time_bins(self, ax):
  120. if not isinstance(ax, DatetimeIndex):
  121. raise TypeError('axis must be a DatetimeIndex, but got '
  122. 'an instance of %r' % type(ax).__name__)
  123. if len(ax) == 0:
  124. binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name)
  125. return binner, [], labels
  126. first, last = ax.min(), ax.max()
  127. first, last = _get_range_edges(first, last, self.freq, closed=self.closed,
  128. base=self.base)
  129. tz = ax.tz
  130. binner = labels = DatetimeIndex(freq=self.freq,
  131. start=first.replace(tzinfo=None),
  132. end=last.replace(tzinfo=None),
  133. tz=tz,
  134. name=ax.name)
  135. # a little hack
  136. trimmed = False
  137. if (len(binner) > 2 and binner[-2] == last and
  138. self.closed == 'right'):
  139. binner = binner[:-1]
  140. trimmed = True
  141. ax_values = ax.asi8
  142. binner, bin_edges = self._adjust_bin_edges(binner, ax_values)
  143. # general version, knowing nothing about relative frequencies
  144. bins = lib.generate_bins_dt64(ax_values, bin_edges, self.closed, hasnans=ax.hasnans)
  145. if self.closed == 'right':
  146. labels = binner
  147. if self.label == 'right':
  148. labels = labels[1:]
  149. elif not trimmed:
  150. labels = labels[:-1]
  151. else:
  152. if self.label == 'right':
  153. labels = labels[1:]
  154. elif not trimmed:
  155. labels = labels[:-1]
  156. if ax.hasnans:
  157. binner = binner.insert(0, tslib.NaT)
  158. labels = labels.insert(0, tslib.NaT)
  159. # if we end up with more labels than bins
  160. # adjust the labels
  161. # GH4076
  162. if len(bins) < len(labels):
  163. labels = labels[:len(bins)]
  164. return binner, bins, labels
  165. def _adjust_bin_edges(self, binner, ax_values):
  166. # Some hacks for > daily data, see #1471, #1458, #1483
  167. bin_edges = binner.asi8
  168. if self.freq != 'D' and is_superperiod(self.freq, 'D'):
  169. day_nanos = _delta_to_nanoseconds(timedelta(1))
  170. if self.closed == 'right':
  171. bin_edges = bin_edges + day_nanos - 1
  172. # intraday values on last day
  173. if bin_edges[-2] > ax_values.max():
  174. bin_edges = bin_edges[:-1]
  175. binner = binner[:-1]
  176. return binner, bin_edges
  177. def _get_time_period_bins(self, ax):
  178. if not isinstance(ax, DatetimeIndex):
  179. raise TypeError('axis must be a DatetimeIndex, but got '
  180. 'an instance of %r' % type(ax).__name__)
  181. if not len(ax):
  182. binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name)
  183. return binner, [], labels
  184. labels = binner = PeriodIndex(start=ax[0],
  185. end=ax[-1],
  186. freq=self.freq,
  187. name=ax.name)
  188. end_stamps = (labels + 1).asfreq(self.freq, 's').to_timestamp()
  189. if ax.tzinfo:
  190. end_stamps = end_stamps.tz_localize(ax.tzinfo)
  191. bins = ax.searchsorted(end_stamps, side='left')
  192. return binner, bins, labels
  193. @property
  194. def _agg_method(self):
  195. return self.how if self.how else _DEFAULT_METHOD
  196. def _resample_timestamps(self):
  197. # assumes set_grouper(obj) already called
  198. axlabels = self.ax
  199. self._get_binner_for_resample()
  200. grouper = self.grouper
  201. binner = self.binner
  202. obj = self.obj
  203. # Determine if we're downsampling
  204. if axlabels.freq is not None or axlabels.inferred_freq is not None:
  205. if len(grouper.binlabels) < len(axlabels) or self.how is not None:
  206. # downsample
  207. grouped = obj.groupby(grouper, axis=self.axis)
  208. result = grouped.aggregate(self._agg_method)
  209. # GH2073
  210. if self.fill_method is not None:
  211. result = result.fillna(method=self.fill_method,
  212. limit=self.limit)
  213. else:
  214. # upsampling shortcut
  215. if self.axis:
  216. raise AssertionError('axis must be 0')
  217. if self.closed == 'right':
  218. res_index = binner[1:]
  219. else:
  220. res_index = binner[:-1]
  221. # if we have the same frequency as our axis, then we are equal sampling
  222. # even if how is None
  223. if self.fill_method is None and self.limit is None and to_offset(
  224. axlabels.inferred_freq) == self.freq:
  225. result = obj.copy()
  226. result.index = res_index
  227. else:
  228. result = obj.reindex(res_index, method=self.fill_method,
  229. limit=self.limit)
  230. else:
  231. # Irregular data, have to use groupby
  232. grouped = obj.groupby(grouper, axis=self.axis)
  233. result = grouped.aggregate(self._agg_method)
  234. if self.fill_method is not None:
  235. result = result.fillna(method=self.fill_method,
  236. limit=self.limit)
  237. loffset = self.loffset
  238. if isinstance(loffset, compat.string_types):
  239. loffset = to_offset(self.loffset)
  240. if isinstance(loffset, (DateOffset, timedelta)):
  241. if (isinstance(result.index, DatetimeIndex)
  242. and len(result.index) > 0):
  243. result.index = result.index + loffset
  244. return result
  245. def _resample_periods(self):
  246. # assumes set_grouper(obj) already called
  247. axlabels = self.ax
  248. obj = self.obj
  249. if len(axlabels) == 0:
  250. new_index = PeriodIndex(data=[], freq=self.freq)
  251. return obj.reindex(new_index)
  252. else:
  253. start = axlabels[0].asfreq(self.freq, how=self.convention)
  254. end = axlabels[-1].asfreq(self.freq, how='end')
  255. new_index = period_range(start, end, freq=self.freq)
  256. # Start vs. end of period
  257. memb = axlabels.asfreq(self.freq, how=self.convention)
  258. if is_subperiod(axlabels.freq, self.freq) or self.how is not None:
  259. # Downsampling
  260. rng = np.arange(memb.values[0], memb.values[-1] + 1)
  261. bins = memb.searchsorted(rng, side='right')
  262. grouper = BinGrouper(bins, new_index)
  263. grouped = obj.groupby(grouper, axis=self.axis)
  264. return grouped.aggregate(self._agg_method)
  265. elif is_superperiod(axlabels.freq, self.freq):
  266. # Get the fill indexer
  267. indexer = memb.get_indexer(new_index, method=self.fill_method,
  268. limit=self.limit)
  269. return _take_new_index(obj, indexer, new_index, axis=self.axis)
  270. else:
  271. raise ValueError('Frequency %s cannot be resampled to %s'
  272. % (axlabels.freq, self.freq))
  273. def _take_new_index(obj, indexer, new_index, axis=0):
  274. from pandas.core.api import Series, DataFrame
  275. if isinstance(obj, Series):
  276. new_values = com.take_1d(obj.values, indexer)
  277. return Series(new_values, index=new_index, name=obj.name)
  278. elif isinstance(obj, DataFrame):
  279. if axis == 1:
  280. raise NotImplementedError
  281. return DataFrame(obj._data.reindex_indexer(
  282. new_axis=new_index, indexer=indexer, axis=1))
  283. else:
  284. raise NotImplementedError
  285. def _get_range_edges(first, last, offset, closed='left', base=0):
  286. if isinstance(offset, compat.string_types):
  287. offset = to_offset(offset)
  288. if isinstance(offset, Tick):
  289. day_nanos = _delta_to_nanoseconds(timedelta(1))
  290. # #1165
  291. if (day_nanos % offset.nanos) == 0:
  292. return _adjust_dates_anchored(first, last, offset,
  293. closed=closed, base=base)
  294. if not isinstance(offset, Tick): # and first.time() != last.time():
  295. # hack!
  296. first = tools.normalize_date(first)
  297. last = tools.normalize_date(last)
  298. if closed == 'left':
  299. first = Timestamp(offset.rollback(first))
  300. else:
  301. first = Timestamp(first - offset)
  302. last = Timestamp(last + offset)
  303. return first, last
  304. def _adjust_dates_anchored(first, last, offset, closed='right', base=0):
  305. from pandas.tseries.tools import normalize_date
  306. start_day_nanos = Timestamp(normalize_date(first)).value
  307. last_day_nanos = Timestamp(normalize_date(last)).value
  308. base_nanos = (base % offset.n) * offset.nanos // offset.n
  309. start_day_nanos += base_nanos
  310. last_day_nanos += base_nanos
  311. foffset = (first.value - start_day_nanos) % offset.nanos
  312. loffset = (last.value - last_day_nanos) % offset.nanos
  313. if closed == 'right':
  314. if foffset > 0:
  315. # roll back
  316. fresult = first.value - foffset
  317. else:
  318. fresult = first.value - offset.nanos
  319. if loffset > 0:
  320. # roll forward
  321. lresult = last.value + (offset.nanos - loffset)
  322. else:
  323. # already the end of the road
  324. lresult = last.value
  325. else: # closed == 'left'
  326. if foffset > 0:
  327. fresult = first.value - foffset
  328. else:
  329. # start of the road
  330. fresult = first.value
  331. if loffset > 0:
  332. # roll forward
  333. lresult = last.value + (offset.nanos - loffset)
  334. else:
  335. lresult = last.value + offset.nanos
  336. return (Timestamp(fresult, tz=first.tz),
  337. Timestamp(lresult, tz=last.tz))
  338. def asfreq(obj, freq, method=None, how=None, normalize=False):
  339. """
  340. Utility frequency conversion method for Series/DataFrame
  341. """
  342. if isinstance(obj.index, PeriodIndex):
  343. if method is not None:
  344. raise NotImplementedError
  345. if how is None:
  346. how = 'E'
  347. new_index = obj.index.asfreq(freq, how=how)
  348. new_obj = obj.copy()
  349. new_obj.index = new_index
  350. return new_obj
  351. else:
  352. if len(obj.index) == 0:
  353. return obj.copy()
  354. dti = date_range(obj.index[0], obj.index[-1], freq=freq)
  355. rs = obj.reindex(dti, method=method)
  356. if normalize:
  357. rs.index = rs.index.normalize()
  358. return rs