moments.py | searchcode

/pandas/stats/moments.py

https://github.com/smc77/pandas
Python | 450 lines | 437 code | 7 blank | 6 comment | 2 complexity | dfbc72fa3a82a326cc2a03a8c8ea0d1c MD5 | raw file

"""
Provides rolling statistical moments and related descriptive
statistics implemented in Cython
"""
from __future__ import division

from functools import wraps

from numpy import NaN
import numpy as np

from pandas.core.api import DataFrame, Series, notnull
import pandas._tseries as _tseries

from pandas.util.decorators import Substitution, Appender

__all__ = ['rolling_count', 'rolling_max', 'rolling_min',
           'rolling_sum', 'rolling_mean', 'rolling_std', 'rolling_cov',
           'rolling_corr', 'rolling_var', 'rolling_skew', 'rolling_kurt',
           'rolling_quantile', 'rolling_median', 'rolling_apply',
           'rolling_corr_pairwise',
           'ewma', 'ewmvar', 'ewmstd', 'ewmvol', 'ewmcorr', 'ewmcov']

#-------------------------------------------------------------------------------
# Docs

_doc_template = """
%s

Parameters
----------
%s
window : Number of observations used for calculating statistic
min_periods : int
    Minimum number of observations in window required to have a value
time_rule : {None, 'WEEKDAY', 'EOM', 'W@MON', ...}, default=None
    Name of time rule to conform to before computing statistic

Returns
-------
%s
"""


_ewm_doc = r"""%s

Parameters
----------
%s
com : float. optional
    Center of mass: \alpha = com / (1 + com),
span : float, optional
    Specify decay in terms of span, \alpha = 2 / (span + 1)
min_periods : int, default 0
    Number of observations in sample to require (only affects
    beginning)
time_rule : {None, 'WEEKDAY', 'EOM', 'W@MON', ...}, default None
    Name of time rule to conform to before computing statistic
%s
Notes
-----
Either center of mass or span must be specified

EWMA is sometimes specified using a "span" parameter s, we have have that the
decay parameter \alpha is related to the span as :math:`\alpha = 1 - 2 / (s + 1)
= c / (1 + c)`

where c is the center of mass. Given a span, the associated center of mass is
:math:`c = (s - 1) / 2`

So a "20-day EWMA" would have center 9.5.

Returns
-------
y : type of input argument
"""

_type_of_input = "y : type of input argument"

_flex_retval = """y : type depends on inputs
    DataFrame / DataFrame -> DataFrame (matches on columns)
    DataFrame / Series -> Computes result for each column
    Series / Series -> Series"""

_unary_arg = "arg : Series, DataFrame"

_binary_arg_flex = """arg1 : Series, DataFrame, or ndarray
arg2 : Series, DataFrame, or ndarray"""

_binary_arg = """arg1 : Series, DataFrame, or ndarray
arg2 : Series, DataFrame, or ndarray"""

_bias_doc = r"""bias : boolean, default False
    Use a standard estimation bias correction
"""
def rolling_count(arg, window, time_rule=None):
    """
    Rolling count of number of non-NaN observations inside provided window.

    Parameters
    ----------
    arg :  DataFrame or numpy ndarray-like
    window : Number of observations used for calculating statistic

    Returns
    -------
    rolling_count : type of caller
    """
    arg = _conv_timerule(arg, time_rule)
    window = min(window, len(arg))

    return_hook, values = _process_data_structure(arg, kill_inf=False)

    converted = np.isfinite(values).astype(float)
    result = rolling_sum(converted, window, min_periods=1,
                         time_rule=time_rule)

    # putmask here?
    result[np.isnan(result)] = 0

    return return_hook(result)

@Substitution("Unbiased moving covariance", _binary_arg_flex, _flex_retval)
@Appender(_doc_template)
def rolling_cov(arg1, arg2, window, min_periods=None, time_rule=None):
    def _get_cov(X, Y):
        mean = lambda x: rolling_mean(x, window, min_periods, time_rule)
        count = rolling_count(X + Y, window, time_rule)
        bias_adj = count / (count - 1)
        return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj
    return _flex_binary_moment(arg1, arg2, _get_cov)

@Substitution("Moving sample correlation", _binary_arg_flex, _flex_retval)
@Appender(_doc_template)
def rolling_corr(arg1, arg2, window, min_periods=None, time_rule=None):
    def _get_corr(a, b):
        num = rolling_cov(a, b, window, min_periods, time_rule)
        den  = (rolling_std(a, window, min_periods, time_rule) *
                rolling_std(b, window, min_periods, time_rule))
        return num / den
    return _flex_binary_moment(arg1, arg2, _get_corr)

def _flex_binary_moment(arg1, arg2, f):
    if isinstance(arg1, np.ndarray) and isinstance(arg2, np.ndarray):
        X, Y = _prep_binary(arg1, arg2)
        return f(X, Y)
    elif isinstance(arg1, DataFrame):
        results = {}
        if isinstance(arg2, DataFrame):
            X, Y = arg1.align(arg2, join='outer')
            X = X + 0 * Y
            Y = Y + 0 * X
            res_columns = arg1.columns.union(arg2.columns)
            for col in res_columns:
                if col in X and col in Y:
                    results[col] = f(X[col], Y[col])
        else:
            res_columns = arg1.columns
            X, Y = arg1.align(arg2, axis=0, join='outer')
            results = {}

            for col in res_columns:
                results[col] = f(X[col], Y)

        return DataFrame(results, index=X.index, columns=res_columns)
    else:
        return _flex_binary_moment(arg2, arg1, f)

def rolling_corr_pairwise(df, window, min_periods=None):
    """
    Computes pairwise rolling correlation matrices as Panel whose items are
    dates

    Parameters
    ----------
    df : DataFrame
    window : int
    min_periods : int, default None

    Returns
    -------
    correls : Panel
    """
    from pandas import Panel
    from collections import defaultdict

    all_results = defaultdict(dict)

    for i, k1 in enumerate(df.columns):
        for k2 in df.columns[i:]:
            corr = rolling_corr(df[k1], df[k2], window,
                                min_periods=min_periods)
            all_results[k1][k2] = corr
            all_results[k2][k1] = corr

    return Panel.from_dict(all_results).swapaxes('items', 'major')

def _rolling_moment(arg, window, func, minp, axis=0, time_rule=None):
    """
    Rolling statistical measure using supplied function. Designed to be
    used with passed-in Cython array-based functions.

    Parameters
    ----------
    arg :  DataFrame or numpy ndarray-like
    window : Number of observations used for calculating statistic
    func : Cython function to compute rolling statistic on raw series
    minp : int
        Minimum number of observations required to have a value
    axis : int, default 0
    time_rule : string or DateOffset
        Time rule to conform to before computing result

    Returns
    -------
    y : type of input
    """
    arg = _conv_timerule(arg, time_rule)
    calc = lambda x: func(x, window, minp=minp)
    return_hook, values = _process_data_structure(arg)
    # actually calculate the moment. Faster way to do this?
    result = np.apply_along_axis(calc, axis, values)

    return return_hook(result)

def _process_data_structure(arg, kill_inf=True):
    if isinstance(arg, DataFrame):
        return_hook = lambda v: type(arg)(v, index=arg.index,
                                          columns=arg.columns)
        values = arg.values
    elif isinstance(arg, Series):
        values = arg.values
        return_hook = lambda v: Series(v, arg.index)
    else:
        return_hook = lambda v: v
        values = arg

    if not issubclass(values.dtype.type, float):
        values = values.astype(float)

    if kill_inf:
        values = values.copy()
        values[np.isinf(values)] = np.NaN

    return return_hook, values

#-------------------------------------------------------------------------------
# Exponential moving moments

def _get_center_of_mass(com, span):
    if span is not None:
        if com is not None:
            raise Exception("com and span are mutually exclusive")

        # convert span to center of mass
        com = (span - 1) / 2.

    elif com is None:
        raise Exception("Must pass either com or span")

    return float(com)

@Substitution("Exponentially-weighted moving average", _unary_arg, "")
@Appender(_ewm_doc)
def ewma(arg, com=None, span=None, min_periods=0, time_rule=None):
    com = _get_center_of_mass(com, span)
    arg = _conv_timerule(arg, time_rule)

    def _ewma(v):
        result = _tseries.ewma(v, com)
        first_index = _first_valid_index(v)
        result[first_index : first_index + min_periods] = NaN
        return result

    return_hook, values = _process_data_structure(arg)
    output = np.apply_along_axis(_ewma, 0, values)
    return return_hook(output)

def _first_valid_index(arr):
    # argmax scans from left
    return notnull(arr).argmax()

@Substitution("Exponentially-weighted moving variance", _unary_arg, _bias_doc)
@Appender(_ewm_doc)
def ewmvar(arg, com=None, span=None, min_periods=0, bias=False,
           time_rule=None):
    com = _get_center_of_mass(com, span)
    arg = _conv_timerule(arg, time_rule)
    moment2nd = ewma(arg * arg, com=com, min_periods=min_periods)
    moment1st = ewma(arg, com=com, min_periods=min_periods)

    result = moment2nd - moment1st ** 2
    if not bias:
        result *= (1.0 + 2.0 * com) / (2.0 * com)

    return result

@Substitution("Exponentially-weighted moving std", _unary_arg, _bias_doc)
@Appender(_ewm_doc)
def ewmstd(arg, com=None, span=None, min_periods=0, bias=False,
           time_rule=None):
    result = ewmvar(arg, com=com, span=span, time_rule=time_rule,
                    min_periods=min_periods, bias=bias)
    return np.sqrt(result)

ewmvol = ewmstd

@Substitution("Exponentially-weighted moving covariance", _binary_arg, "")
@Appender(_ewm_doc)
def ewmcov(arg1, arg2, com=None, span=None, min_periods=0, bias=False,
           time_rule=None):
    X, Y = _prep_binary(arg1, arg2)

    X = _conv_timerule(X, time_rule)
    Y = _conv_timerule(Y, time_rule)

    mean = lambda x: ewma(x, com=com, span=span, min_periods=min_periods)

    result = (mean(X*Y) - mean(X) * mean(Y))
    com = _get_center_of_mass(com, span)
    if not bias:
        result *= (1.0 + 2.0 * com) / (2.0 * com)

    return result

@Substitution("Exponentially-weighted moving " "correlation", _binary_arg, "")
@Appender(_ewm_doc)
def ewmcorr(arg1, arg2, com=None, span=None, min_periods=0,
            time_rule=None):
    X, Y = _prep_binary(arg1, arg2)

    X = _conv_timerule(X, time_rule)
    Y = _conv_timerule(Y, time_rule)

    mean = lambda x: ewma(x, com=com, span=span, min_periods=min_periods)
    var = lambda x: ewmvar(x, com=com, span=span, min_periods=min_periods,
                           bias=True)
    return (mean(X*Y) - mean(X)*mean(Y)) / np.sqrt(var(X) * var(Y))

def _prep_binary(arg1, arg2):
    if not isinstance(arg2, type(arg1)):
        raise Exception('Input arrays must be of the same type!')

    # mask out values, this also makes a common index...
    X = arg1 + 0 * arg2
    Y = arg2 + 0 * arg1

    return X, Y

#-------------------------------------------------------------------------------
# Python interface to Cython functions

def _conv_timerule(arg, time_rule):
    types = (DataFrame, Series)
    if time_rule is not None and isinstance(arg, types):
        # Conform to whatever frequency needed.
        arg = arg.asfreq(time_rule)

    return arg

def _require_min_periods(p):
    def _check_func(minp, window):
        if minp is None:
            return window
        else:
            return max(p, minp)
    return _check_func

def _use_window(minp, window):
    if minp is None:
        return window
    else:
        return minp

def _rolling_func(func, desc, check_minp=_use_window):
    @Substitution(desc, _unary_arg, _type_of_input)
    @Appender(_doc_template)
    @wraps(func)
    def f(arg, window, min_periods=None, time_rule=None):
        def call_cython(arg, window, minp):
            minp = check_minp(minp, window)
            return func(arg, window, minp)
        return _rolling_moment(arg, window, call_cython, min_periods,
                               time_rule=time_rule)

    return f

rolling_max = _rolling_func(_tseries.roll_max, 'Moving maximum')
rolling_min = _rolling_func(_tseries.roll_min, 'Moving minimum')
rolling_sum = _rolling_func(_tseries.roll_sum, 'Moving sum')
rolling_mean = _rolling_func(_tseries.roll_mean, 'Moving mean')
rolling_median = _rolling_func(_tseries.roll_median_cython, 'Moving median')

_ts_std = lambda *a, **kw: np.sqrt(_tseries.roll_var(*a, **kw))
rolling_std = _rolling_func(_ts_std, 'Unbiased moving standard deviation',
                            check_minp=_require_min_periods(2))
rolling_var = _rolling_func(_tseries.roll_var, 'Unbiased moving variance',
                            check_minp=_require_min_periods(2))
rolling_skew = _rolling_func(_tseries.roll_skew, 'Unbiased moving skewness',
                             check_minp=_require_min_periods(3))
rolling_kurt = _rolling_func(_tseries.roll_kurt, 'Unbiased moving kurtosis',
                             check_minp=_require_min_periods(4))

def rolling_quantile(arg, window, quantile, min_periods=None, time_rule=None):
    """Moving quantile

    Parameters
    ----------
    arg : Series, DataFrame
    window : Number of observations used for calculating statistic
    quantile : 0 <= quantile <= 1
    min_periods : int
        Minimum number of observations in window required to have a value
    time_rule : {None, 'WEEKDAY', 'EOM', 'W@MON', ...}, default=None
        Name of time rule to conform to before computing statistic

    Returns
    -------
    y : type of input argument
    """

    def call_cython(arg, window, minp):
        minp = _use_window(minp, window)
        return _tseries.roll_quantile(arg, window, minp, quantile)
    return _rolling_moment(arg, window, call_cython, min_periods,
                           time_rule=time_rule)

def rolling_apply(arg, window, func, min_periods=None, time_rule=None):
    """Generic moving function application

    Parameters
    ----------
    arg : Series, DataFrame
    window : Number of observations used for calculating statistic
    func : function
        Must produce a single value from an ndarray input
    min_periods : int
        Minimum number of observations in window required to have a value
    time_rule : {None, 'WEEKDAY', 'EOM', 'W@MON', ...}, default=None
        Name of time rule to conform to before computing statistic

    Returns
    -------
    y : type of input argument
    """
    def call_cython(arg, window, minp):
        minp = _use_window(minp, window)
        return _tseries.roll_generic(arg, window, minp, func)
    return _rolling_moment(arg, window, call_cython, min_periods,
                           time_rule=time_rule)