PageRenderTime 776ms CodeModel.GetById 1ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/stats/moments.py

https://github.com/smc77/pandas
Python | 450 lines | 437 code | 7 blank | 6 comment | 2 complexity | dfbc72fa3a82a326cc2a03a8c8ea0d1c MD5 | raw file
  1. """
  2. Provides rolling statistical moments and related descriptive
  3. statistics implemented in Cython
  4. """
  5. from __future__ import division
  6. from functools import wraps
  7. from numpy import NaN
  8. import numpy as np
  9. from pandas.core.api import DataFrame, Series, notnull
  10. import pandas._tseries as _tseries
  11. from pandas.util.decorators import Substitution, Appender
  12. __all__ = ['rolling_count', 'rolling_max', 'rolling_min',
  13. 'rolling_sum', 'rolling_mean', 'rolling_std', 'rolling_cov',
  14. 'rolling_corr', 'rolling_var', 'rolling_skew', 'rolling_kurt',
  15. 'rolling_quantile', 'rolling_median', 'rolling_apply',
  16. 'rolling_corr_pairwise',
  17. 'ewma', 'ewmvar', 'ewmstd', 'ewmvol', 'ewmcorr', 'ewmcov']
  18. #-------------------------------------------------------------------------------
  19. # Docs
  20. _doc_template = """
  21. %s
  22. Parameters
  23. ----------
  24. %s
  25. window : Number of observations used for calculating statistic
  26. min_periods : int
  27. Minimum number of observations in window required to have a value
  28. time_rule : {None, 'WEEKDAY', 'EOM', 'W@MON', ...}, default=None
  29. Name of time rule to conform to before computing statistic
  30. Returns
  31. -------
  32. %s
  33. """
  34. _ewm_doc = r"""%s
  35. Parameters
  36. ----------
  37. %s
  38. com : float. optional
  39. Center of mass: \alpha = com / (1 + com),
  40. span : float, optional
  41. Specify decay in terms of span, \alpha = 2 / (span + 1)
  42. min_periods : int, default 0
  43. Number of observations in sample to require (only affects
  44. beginning)
  45. time_rule : {None, 'WEEKDAY', 'EOM', 'W@MON', ...}, default None
  46. Name of time rule to conform to before computing statistic
  47. %s
  48. Notes
  49. -----
  50. Either center of mass or span must be specified
  51. EWMA is sometimes specified using a "span" parameter s, we have have that the
  52. decay parameter \alpha is related to the span as :math:`\alpha = 1 - 2 / (s + 1)
  53. = c / (1 + c)`
  54. where c is the center of mass. Given a span, the associated center of mass is
  55. :math:`c = (s - 1) / 2`
  56. So a "20-day EWMA" would have center 9.5.
  57. Returns
  58. -------
  59. y : type of input argument
  60. """
  61. _type_of_input = "y : type of input argument"
  62. _flex_retval = """y : type depends on inputs
  63. DataFrame / DataFrame -> DataFrame (matches on columns)
  64. DataFrame / Series -> Computes result for each column
  65. Series / Series -> Series"""
  66. _unary_arg = "arg : Series, DataFrame"
  67. _binary_arg_flex = """arg1 : Series, DataFrame, or ndarray
  68. arg2 : Series, DataFrame, or ndarray"""
  69. _binary_arg = """arg1 : Series, DataFrame, or ndarray
  70. arg2 : Series, DataFrame, or ndarray"""
  71. _bias_doc = r"""bias : boolean, default False
  72. Use a standard estimation bias correction
  73. """
  74. def rolling_count(arg, window, time_rule=None):
  75. """
  76. Rolling count of number of non-NaN observations inside provided window.
  77. Parameters
  78. ----------
  79. arg : DataFrame or numpy ndarray-like
  80. window : Number of observations used for calculating statistic
  81. Returns
  82. -------
  83. rolling_count : type of caller
  84. """
  85. arg = _conv_timerule(arg, time_rule)
  86. window = min(window, len(arg))
  87. return_hook, values = _process_data_structure(arg, kill_inf=False)
  88. converted = np.isfinite(values).astype(float)
  89. result = rolling_sum(converted, window, min_periods=1,
  90. time_rule=time_rule)
  91. # putmask here?
  92. result[np.isnan(result)] = 0
  93. return return_hook(result)
  94. @Substitution("Unbiased moving covariance", _binary_arg_flex, _flex_retval)
  95. @Appender(_doc_template)
  96. def rolling_cov(arg1, arg2, window, min_periods=None, time_rule=None):
  97. def _get_cov(X, Y):
  98. mean = lambda x: rolling_mean(x, window, min_periods, time_rule)
  99. count = rolling_count(X + Y, window, time_rule)
  100. bias_adj = count / (count - 1)
  101. return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj
  102. return _flex_binary_moment(arg1, arg2, _get_cov)
  103. @Substitution("Moving sample correlation", _binary_arg_flex, _flex_retval)
  104. @Appender(_doc_template)
  105. def rolling_corr(arg1, arg2, window, min_periods=None, time_rule=None):
  106. def _get_corr(a, b):
  107. num = rolling_cov(a, b, window, min_periods, time_rule)
  108. den = (rolling_std(a, window, min_periods, time_rule) *
  109. rolling_std(b, window, min_periods, time_rule))
  110. return num / den
  111. return _flex_binary_moment(arg1, arg2, _get_corr)
  112. def _flex_binary_moment(arg1, arg2, f):
  113. if isinstance(arg1, np.ndarray) and isinstance(arg2, np.ndarray):
  114. X, Y = _prep_binary(arg1, arg2)
  115. return f(X, Y)
  116. elif isinstance(arg1, DataFrame):
  117. results = {}
  118. if isinstance(arg2, DataFrame):
  119. X, Y = arg1.align(arg2, join='outer')
  120. X = X + 0 * Y
  121. Y = Y + 0 * X
  122. res_columns = arg1.columns.union(arg2.columns)
  123. for col in res_columns:
  124. if col in X and col in Y:
  125. results[col] = f(X[col], Y[col])
  126. else:
  127. res_columns = arg1.columns
  128. X, Y = arg1.align(arg2, axis=0, join='outer')
  129. results = {}
  130. for col in res_columns:
  131. results[col] = f(X[col], Y)
  132. return DataFrame(results, index=X.index, columns=res_columns)
  133. else:
  134. return _flex_binary_moment(arg2, arg1, f)
  135. def rolling_corr_pairwise(df, window, min_periods=None):
  136. """
  137. Computes pairwise rolling correlation matrices as Panel whose items are
  138. dates
  139. Parameters
  140. ----------
  141. df : DataFrame
  142. window : int
  143. min_periods : int, default None
  144. Returns
  145. -------
  146. correls : Panel
  147. """
  148. from pandas import Panel
  149. from collections import defaultdict
  150. all_results = defaultdict(dict)
  151. for i, k1 in enumerate(df.columns):
  152. for k2 in df.columns[i:]:
  153. corr = rolling_corr(df[k1], df[k2], window,
  154. min_periods=min_periods)
  155. all_results[k1][k2] = corr
  156. all_results[k2][k1] = corr
  157. return Panel.from_dict(all_results).swapaxes('items', 'major')
  158. def _rolling_moment(arg, window, func, minp, axis=0, time_rule=None):
  159. """
  160. Rolling statistical measure using supplied function. Designed to be
  161. used with passed-in Cython array-based functions.
  162. Parameters
  163. ----------
  164. arg : DataFrame or numpy ndarray-like
  165. window : Number of observations used for calculating statistic
  166. func : Cython function to compute rolling statistic on raw series
  167. minp : int
  168. Minimum number of observations required to have a value
  169. axis : int, default 0
  170. time_rule : string or DateOffset
  171. Time rule to conform to before computing result
  172. Returns
  173. -------
  174. y : type of input
  175. """
  176. arg = _conv_timerule(arg, time_rule)
  177. calc = lambda x: func(x, window, minp=minp)
  178. return_hook, values = _process_data_structure(arg)
  179. # actually calculate the moment. Faster way to do this?
  180. result = np.apply_along_axis(calc, axis, values)
  181. return return_hook(result)
  182. def _process_data_structure(arg, kill_inf=True):
  183. if isinstance(arg, DataFrame):
  184. return_hook = lambda v: type(arg)(v, index=arg.index,
  185. columns=arg.columns)
  186. values = arg.values
  187. elif isinstance(arg, Series):
  188. values = arg.values
  189. return_hook = lambda v: Series(v, arg.index)
  190. else:
  191. return_hook = lambda v: v
  192. values = arg
  193. if not issubclass(values.dtype.type, float):
  194. values = values.astype(float)
  195. if kill_inf:
  196. values = values.copy()
  197. values[np.isinf(values)] = np.NaN
  198. return return_hook, values
  199. #-------------------------------------------------------------------------------
  200. # Exponential moving moments
  201. def _get_center_of_mass(com, span):
  202. if span is not None:
  203. if com is not None:
  204. raise Exception("com and span are mutually exclusive")
  205. # convert span to center of mass
  206. com = (span - 1) / 2.
  207. elif com is None:
  208. raise Exception("Must pass either com or span")
  209. return float(com)
  210. @Substitution("Exponentially-weighted moving average", _unary_arg, "")
  211. @Appender(_ewm_doc)
  212. def ewma(arg, com=None, span=None, min_periods=0, time_rule=None):
  213. com = _get_center_of_mass(com, span)
  214. arg = _conv_timerule(arg, time_rule)
  215. def _ewma(v):
  216. result = _tseries.ewma(v, com)
  217. first_index = _first_valid_index(v)
  218. result[first_index : first_index + min_periods] = NaN
  219. return result
  220. return_hook, values = _process_data_structure(arg)
  221. output = np.apply_along_axis(_ewma, 0, values)
  222. return return_hook(output)
  223. def _first_valid_index(arr):
  224. # argmax scans from left
  225. return notnull(arr).argmax()
  226. @Substitution("Exponentially-weighted moving variance", _unary_arg, _bias_doc)
  227. @Appender(_ewm_doc)
  228. def ewmvar(arg, com=None, span=None, min_periods=0, bias=False,
  229. time_rule=None):
  230. com = _get_center_of_mass(com, span)
  231. arg = _conv_timerule(arg, time_rule)
  232. moment2nd = ewma(arg * arg, com=com, min_periods=min_periods)
  233. moment1st = ewma(arg, com=com, min_periods=min_periods)
  234. result = moment2nd - moment1st ** 2
  235. if not bias:
  236. result *= (1.0 + 2.0 * com) / (2.0 * com)
  237. return result
  238. @Substitution("Exponentially-weighted moving std", _unary_arg, _bias_doc)
  239. @Appender(_ewm_doc)
  240. def ewmstd(arg, com=None, span=None, min_periods=0, bias=False,
  241. time_rule=None):
  242. result = ewmvar(arg, com=com, span=span, time_rule=time_rule,
  243. min_periods=min_periods, bias=bias)
  244. return np.sqrt(result)
  245. ewmvol = ewmstd
  246. @Substitution("Exponentially-weighted moving covariance", _binary_arg, "")
  247. @Appender(_ewm_doc)
  248. def ewmcov(arg1, arg2, com=None, span=None, min_periods=0, bias=False,
  249. time_rule=None):
  250. X, Y = _prep_binary(arg1, arg2)
  251. X = _conv_timerule(X, time_rule)
  252. Y = _conv_timerule(Y, time_rule)
  253. mean = lambda x: ewma(x, com=com, span=span, min_periods=min_periods)
  254. result = (mean(X*Y) - mean(X) * mean(Y))
  255. com = _get_center_of_mass(com, span)
  256. if not bias:
  257. result *= (1.0 + 2.0 * com) / (2.0 * com)
  258. return result
  259. @Substitution("Exponentially-weighted moving " "correlation", _binary_arg, "")
  260. @Appender(_ewm_doc)
  261. def ewmcorr(arg1, arg2, com=None, span=None, min_periods=0,
  262. time_rule=None):
  263. X, Y = _prep_binary(arg1, arg2)
  264. X = _conv_timerule(X, time_rule)
  265. Y = _conv_timerule(Y, time_rule)
  266. mean = lambda x: ewma(x, com=com, span=span, min_periods=min_periods)
  267. var = lambda x: ewmvar(x, com=com, span=span, min_periods=min_periods,
  268. bias=True)
  269. return (mean(X*Y) - mean(X)*mean(Y)) / np.sqrt(var(X) * var(Y))
  270. def _prep_binary(arg1, arg2):
  271. if not isinstance(arg2, type(arg1)):
  272. raise Exception('Input arrays must be of the same type!')
  273. # mask out values, this also makes a common index...
  274. X = arg1 + 0 * arg2
  275. Y = arg2 + 0 * arg1
  276. return X, Y
  277. #-------------------------------------------------------------------------------
  278. # Python interface to Cython functions
  279. def _conv_timerule(arg, time_rule):
  280. types = (DataFrame, Series)
  281. if time_rule is not None and isinstance(arg, types):
  282. # Conform to whatever frequency needed.
  283. arg = arg.asfreq(time_rule)
  284. return arg
  285. def _require_min_periods(p):
  286. def _check_func(minp, window):
  287. if minp is None:
  288. return window
  289. else:
  290. return max(p, minp)
  291. return _check_func
  292. def _use_window(minp, window):
  293. if minp is None:
  294. return window
  295. else:
  296. return minp
  297. def _rolling_func(func, desc, check_minp=_use_window):
  298. @Substitution(desc, _unary_arg, _type_of_input)
  299. @Appender(_doc_template)
  300. @wraps(func)
  301. def f(arg, window, min_periods=None, time_rule=None):
  302. def call_cython(arg, window, minp):
  303. minp = check_minp(minp, window)
  304. return func(arg, window, minp)
  305. return _rolling_moment(arg, window, call_cython, min_periods,
  306. time_rule=time_rule)
  307. return f
  308. rolling_max = _rolling_func(_tseries.roll_max, 'Moving maximum')
  309. rolling_min = _rolling_func(_tseries.roll_min, 'Moving minimum')
  310. rolling_sum = _rolling_func(_tseries.roll_sum, 'Moving sum')
  311. rolling_mean = _rolling_func(_tseries.roll_mean, 'Moving mean')
  312. rolling_median = _rolling_func(_tseries.roll_median_cython, 'Moving median')
  313. _ts_std = lambda *a, **kw: np.sqrt(_tseries.roll_var(*a, **kw))
  314. rolling_std = _rolling_func(_ts_std, 'Unbiased moving standard deviation',
  315. check_minp=_require_min_periods(2))
  316. rolling_var = _rolling_func(_tseries.roll_var, 'Unbiased moving variance',
  317. check_minp=_require_min_periods(2))
  318. rolling_skew = _rolling_func(_tseries.roll_skew, 'Unbiased moving skewness',
  319. check_minp=_require_min_periods(3))
  320. rolling_kurt = _rolling_func(_tseries.roll_kurt, 'Unbiased moving kurtosis',
  321. check_minp=_require_min_periods(4))
  322. def rolling_quantile(arg, window, quantile, min_periods=None, time_rule=None):
  323. """Moving quantile
  324. Parameters
  325. ----------
  326. arg : Series, DataFrame
  327. window : Number of observations used for calculating statistic
  328. quantile : 0 <= quantile <= 1
  329. min_periods : int
  330. Minimum number of observations in window required to have a value
  331. time_rule : {None, 'WEEKDAY', 'EOM', 'W@MON', ...}, default=None
  332. Name of time rule to conform to before computing statistic
  333. Returns
  334. -------
  335. y : type of input argument
  336. """
  337. def call_cython(arg, window, minp):
  338. minp = _use_window(minp, window)
  339. return _tseries.roll_quantile(arg, window, minp, quantile)
  340. return _rolling_moment(arg, window, call_cython, min_periods,
  341. time_rule=time_rule)
  342. def rolling_apply(arg, window, func, min_periods=None, time_rule=None):
  343. """Generic moving function application
  344. Parameters
  345. ----------
  346. arg : Series, DataFrame
  347. window : Number of observations used for calculating statistic
  348. func : function
  349. Must produce a single value from an ndarray input
  350. min_periods : int
  351. Minimum number of observations in window required to have a value
  352. time_rule : {None, 'WEEKDAY', 'EOM', 'W@MON', ...}, default=None
  353. Name of time rule to conform to before computing statistic
  354. Returns
  355. -------
  356. y : type of input argument
  357. """
  358. def call_cython(arg, window, minp):
  359. minp = _use_window(minp, window)
  360. return _tseries.roll_generic(arg, window, minp, func)
  361. return _rolling_moment(arg, window, call_cython, min_periods,
  362. time_rule=time_rule)