PageRenderTime 44ms CodeModel.GetById 8ms RepoModel.GetById 0ms app.codeStats 0ms

/pandasreg/extensions.py

https://github.com/abielr/pandasreg
Python | 265 lines | 244 code | 8 blank | 13 comment | 4 complexity | 66cf1fb1062cca710fa62b346e421ce7 MD5 | raw file
  1. import numpy as np
  2. import pandas as pd
  3. from dateutil.relativedelta import relativedelta
  4. from pandas.tseries import frequencies
  5. from pandas.tseries.offsets import DateOffset, CacheableOffset
  6. from pandas.tseries.frequencies import to_offset
  7. from pandas.tseries import offsets
  8. from datetime import datetime, timedelta
  9. from collections import defaultdict
  10. import os
  11. import glob
  12. import subprocess
  13. import uuid
  14. import pandas.lib as lib
  15. from pandasreg.rperiod import RPeriodIndex, RFrequency, RPeriod
  16. def trim(series):
  17. """Trim trailing and leading NaN values"""
  18. ix = np.where(np.isfinite(series))[0]
  19. if len(ix) == 0:
  20. return series[0:0]
  21. return series[ix[0]:(ix[-1]+1)]
  22. def fill(series):
  23. """Makes a series regularly spaced if it is not so already"""
  24. if series.index.is_full:
  25. return series
  26. ix = RPeriodIndex(start=series.index[0], end=series.index[-1],
  27. freq=series.index.freq)
  28. return series.reindex(ix)
  29. def _agg_first(x):
  30. ix = np.where(np.isfinite(x))[0]
  31. if len(ix) == 0:
  32. return np.nan
  33. return x[ix[0]]
  34. def _agg_last(x):
  35. ix = np.where(np.isfinite(x))[0]
  36. if len(ix) == 0:
  37. return np.nan
  38. return x[ix[-1]]
  39. def resample(input, freq, how=None):
  40. """Resample (convert) a time series to another frequency.
  41. This function is conceptually similar to the resample() method that pandas
  42. provides for Series and DataFrames. It is designed to work with a Series or
  43. DataFrame that uses an instance of RPeriodIndex. Behavior slightly different
  44. than the pandas function, and there are fewer arguments that can be set.
  45. Arguments:
  46. freq (str, RFrequency): Frequency to convert to.
  47. how: a string, which can be 'mean', 'sum', 'first', 'last', 'min',
  48. 'max', or a function.
  49. """
  50. # TODO: allow/disallow partial periods in aggregation
  51. # TODO: disaggregation performance is too slow, avoid groupby and transform
  52. # at least for base cases
  53. if not isinstance(input.index, RPeriodIndex):
  54. raise ValueError("Index must be of type RPeriodIndex")
  55. if isinstance(freq, basestring):
  56. freq = RFrequency.init(freq)
  57. if how is None:
  58. how = input.index.observed
  59. aggfuncs = {
  60. "sum": np.sum,
  61. "mean": np.mean,
  62. "first": _agg_first,
  63. "last": _agg_last,
  64. "min": np.min,
  65. "max": np.max
  66. }
  67. def disagg_start(x):
  68. x[0] = x[-1]
  69. x[-1] = np.NaN
  70. return x
  71. disaggfuncs = {
  72. "sum": lambda x: x.fillna(method="backfill")/len(x),
  73. "mean": lambda x: x.fillna(method="backfill"),
  74. "first": disagg_start,
  75. "last": lambda x: x,
  76. "min": lambda x: x.fillna(method="backfill"),
  77. "max": lambda x: x.fillna(method="backfill")
  78. }
  79. if input.index.freq < freq: # disaggregation
  80. start = input.index[0].asfreq(freq, how='S')
  81. end = input.index[-1].asfreq(freq)
  82. index = RPeriodIndex(start=start, end=end, freq=freq)
  83. if isinstance(input, pd.Series):
  84. s = pd.Series(input.values, index=input.index.asfreq(freq))
  85. elif isinstance(input, pd.DataFrame):
  86. s = pd.DataFrame(input.values, columns=input.columns,
  87. index=input.index.asfreq(freq))
  88. idx = np.empty(len(s.index)+1, dtype=int)
  89. idx[0] = start.ordinal-1
  90. idx[1:] = s.index.values
  91. lengths = np.diff(idx)
  92. groups = np.repeat(np.arange(len(lengths)), lengths)
  93. if isinstance(how, basestring):
  94. try:
  95. how = disaggfuncs[how]
  96. except KeyError:
  97. raise KeyError("Invalid disaggregation function '%s'" % how)
  98. s = s.reindex(index).groupby(groups).transform(how)
  99. s.index = index # otherwise index is Int64 when using DataFrame
  100. return s
  101. elif input.index.freq > freq: # aggregation
  102. start = input.index[0].asfreq(freq)
  103. end = input.index[-1].asfreq(freq)
  104. indexnew = RPeriodIndex(start=start, end=end, freq=freq)
  105. index =indexnew.asfreq(input.index.freq)
  106. nbins = end-start+1
  107. idx = np.empty(nbins+1, dtype=int)
  108. idx[0] = input.index[0].ordinal-1
  109. idx[1:] = index.values
  110. idx[-1] = input.index[-1].ordinal
  111. lengths = np.diff(idx)
  112. groups = np.repeat(np.arange(len(lengths)), lengths)
  113. if isinstance(how, basestring):
  114. try:
  115. how = aggfuncs[how]
  116. except KeyError:
  117. raise KeyError("Invalid aggregation function '%s'" % how)
  118. s = input.groupby(groups).agg(how)
  119. s.index = indexnew
  120. return s
  121. return input
  122. def overlay(series, replace=True):
  123. """
  124. Overlay a list of series on top of each other
  125. Example: overlay([s1,s2,s3])
  126. If series overlap, the series that came last in the input list will have
  127. precedence if replace=True. If replace=false, a series coming later in the
  128. list will replace only NA values in the existing list.
  129. """
  130. if not isinstance(series, list) and not isinstance(series, tuple):
  131. raise ValueError("series argument should be list or tuple")
  132. if len(series) == 0:
  133. return None
  134. if len(set([s.index.freq.freqstr for s in series])) > 1:
  135. raise ValueError("Can only overlay series with the same frequencies")
  136. start = min([s.index[0].ordinal for s in series])
  137. end = max([s.index[-1].ordinal for s in series])
  138. index = RPeriodIndex(start=start, end=end, freq=series[0].index.freq)
  139. new_series = pd.Series(np.empty(len(index)), index=index)
  140. new_series[:] = np.nan
  141. for s in series:
  142. if replace:
  143. new_series[s.index[0]:s.index[-1]] = s
  144. else:
  145. new_series[s.index[0]:s.index[-1]][np.isnan(new_series[s.index[0]:s.index[-1]])] = \
  146. s[np.isnan(new_series[s.index[0]:s.index[-1]])]
  147. return new_series
  148. def extend(input, extender, direction="forward", extender_type="index"):
  149. """
  150. Extend a series forward or backward using another series or an array.
  151. Arguments:
  152. direction (str): 'forward', 'backward'
  153. extender_type (str): 'index', 'pc' (percent change), 'pca' (percent
  154. change, annualized), or 'diff'. If type = pc or pca, the percent changes
  155. should be in decimal form, i.e. 4% = .04
  156. """
  157. # TODO make this makes nicely with DataFrame
  158. if not isinstance(input, pd.Series) and not isinstance(input, pd.DataFrame):
  159. raise ValueError("Input must be Series or DataFrame")
  160. input = trim(input)
  161. if not direction in ["forward", "backward"]:
  162. raise ValueError("direction must be 'forward' or 'backward'")
  163. if not extender_type in ["index","pc","pca","diff"]:
  164. raise ValueError("extender_type must be 'index', 'pc', 'pca', or 'diff'")
  165. if isinstance(extender, list):
  166. extender = np.array(extender)
  167. if (isinstance(extender, pd.Series) and
  168. isinstance(extender.index, RPeriodIndex) and
  169. input.index.freq != input.index.freq):
  170. raise ValueError("Series and extender series must have same frequency")
  171. if extender_type == "index":
  172. if direction == "forward":
  173. if not input.index[-1] in extender.index:
  174. return input
  175. tmp = extender/extender[input.index[-1]]*input[-1]
  176. return overlay((input, tmp[input.index[-1]+1:]))
  177. elif direction == "backward":
  178. if not input.index[0] in extender.index:
  179. return input
  180. tmp = extender/extender[input.index[0]]*input[0]
  181. return overlay((input, tmp[:input.index[0]]))
  182. elif extender_type == "pc":
  183. index = RPeriodIndex(start=extender.index[0]-1, periods=len(extender)+1,
  184. freq=extender.index.freq)
  185. tmp = pd.Series(np.empty(len(index), dtype=input.dtype), index)
  186. tmp[0] = 100
  187. tmp[1:] = 1+extender
  188. tmp = np.cumprod(tmp)
  189. return extend(input, tmp, direction=direction, extender_type='index')
  190. elif extender_type == "pca":
  191. if (not isinstance(extender, pd.Series) or
  192. not isinstance(extender.index, RPeriodIndex)):
  193. raise ValueError("Can only use extender_type = pca with a time \
  194. series that has an RPeriodIndex")
  195. index = RPeriodIndex(start=extender.index[0]-1, periods=len(extender)+1,
  196. freq=extender.index.freq)
  197. tmp = pd.Series(np.empty(len(index), dtype=input.dtype), index)
  198. tmp[0] = 100
  199. tmp[1:] = (1+extender)**(1.0/input.index.freq.periodicity)
  200. tmp = np.cumprod(tmp)
  201. return extend(input, tmp, direction=direction, extender_type='index')
  202. elif extender_type == "diff":
  203. if direction == "forward":
  204. if not input.index[-1]+1 in extender.index:
  205. return input
  206. tmp = np.cumsum(extender[input.index[-1]+1:])+input[-1]
  207. return overlay((input, tmp[input.index[-1]+1:]))
  208. elif direction == "backward":
  209. if not input.index[0]-1 in extender.index:
  210. return input
  211. tmp = (np.cumsum(-extender[:input.index[0]-1][::-1])+input[0])[::-1]
  212. return overlay((input, tmp[:input.index[0]-1]))