extensions.py | searchcode

/pandasreg/extensions.py

https://github.com/abielr/pandasreg · Python · 265 lines · 244 code · 8 blank · 13 comment · 4 complexity · 66cf1fb1062cca710fa62b346e421ce7 MD5 · raw file

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from pandas.tseries import frequencies
from pandas.tseries.offsets import DateOffset, CacheableOffset
from pandas.tseries.frequencies import to_offset
from pandas.tseries import offsets
from datetime import datetime, timedelta
from collections import defaultdict
import os
import glob
import subprocess
import uuid
import pandas.lib as lib

from pandasreg.rperiod import RPeriodIndex, RFrequency, RPeriod

def trim(series):
    """Trim trailing and leading NaN values"""

    ix = np.where(np.isfinite(series))[0]
    if len(ix) == 0:
        return series[0:0]
    return series[ix[0]:(ix[-1]+1)]

def fill(series):
    """Makes a series regularly spaced if it is not so already"""

    if series.index.is_full:
        return series
    ix = RPeriodIndex(start=series.index[0], end=series.index[-1], 
        freq=series.index.freq)
    return series.reindex(ix)

def _agg_first(x):
    ix = np.where(np.isfinite(x))[0]
    if len(ix) == 0:
        return np.nan
    return x[ix[0]]

def _agg_last(x):
    ix = np.where(np.isfinite(x))[0]
    if len(ix) == 0:
        return np.nan
    return x[ix[-1]]

def resample(input, freq, how=None):
    """Resample (convert) a time series to another frequency.

    This function is conceptually similar to the resample() method that pandas
    provides for Series and DataFrames. It is designed to work with a Series or
    DataFrame that uses an instance of RPeriodIndex. Behavior slightly different
    than the pandas function, and there are fewer arguments that can be set.

    Arguments:

        freq (str, RFrequency): Frequency to convert to.

        how: a string, which can be 'mean', 'sum', 'first', 'last', 'min',
        'max', or a function.

    """

    # TODO: allow/disallow partial periods in aggregation
    # TODO: disaggregation performance is too slow, avoid groupby and transform
    #   at least for base cases

    if not isinstance(input.index, RPeriodIndex):
        raise ValueError("Index must be of type RPeriodIndex")

    if isinstance(freq, basestring):
        freq = RFrequency.init(freq)

    if how is None:
        how = input.index.observed

    aggfuncs = {
        "sum": np.sum,
        "mean": np.mean,
        "first": _agg_first,
        "last": _agg_last,
        "min": np.min,
        "max": np.max
    }

    def disagg_start(x):
        x[0] = x[-1]
        x[-1] = np.NaN
        return x

    disaggfuncs = {
        "sum": lambda x: x.fillna(method="backfill")/len(x),
        "mean": lambda x: x.fillna(method="backfill"),
        "first": disagg_start,
        "last": lambda x: x,
        "min": lambda x: x.fillna(method="backfill"),
        "max": lambda x: x.fillna(method="backfill")
    }

    if input.index.freq < freq: # disaggregation
        start = input.index[0].asfreq(freq, how='S')
        end = input.index[-1].asfreq(freq)
        index = RPeriodIndex(start=start, end=end, freq=freq)
        if isinstance(input, pd.Series):
            s = pd.Series(input.values, index=input.index.asfreq(freq))
        elif isinstance(input, pd.DataFrame):
            s = pd.DataFrame(input.values, columns=input.columns, 
                index=input.index.asfreq(freq))

        idx = np.empty(len(s.index)+1, dtype=int)
        idx[0] = start.ordinal-1
        idx[1:] = s.index.values
        lengths = np.diff(idx)
        groups = np.repeat(np.arange(len(lengths)), lengths)

        if isinstance(how, basestring):
            try:
                how = disaggfuncs[how]
            except KeyError:
                raise KeyError("Invalid disaggregation function '%s'" % how)

        s = s.reindex(index).groupby(groups).transform(how)
        s.index = index # otherwise index is Int64 when using DataFrame
        return s

    elif input.index.freq > freq: # aggregation
        start = input.index[0].asfreq(freq)
        end = input.index[-1].asfreq(freq)
        indexnew = RPeriodIndex(start=start, end=end, freq=freq)
        index =indexnew.asfreq(input.index.freq)
        nbins = end-start+1
        idx = np.empty(nbins+1, dtype=int)
        idx[0] = input.index[0].ordinal-1
        idx[1:] = index.values
        idx[-1] = input.index[-1].ordinal
        lengths = np.diff(idx)
        groups = np.repeat(np.arange(len(lengths)), lengths)

        if isinstance(how, basestring):
            try:
                how = aggfuncs[how]
            except KeyError:
                raise KeyError("Invalid aggregation function '%s'" % how)

        s = input.groupby(groups).agg(how)
        s.index = indexnew
        return s

    return input

def overlay(series, replace=True):
    """
    Overlay a list of series on top of each other

    Example: overlay([s1,s2,s3])
    If series overlap, the series that came last in the input list will have
    precedence if replace=True. If replace=false, a series coming later in the 
    list will replace only NA values in the existing list.
    """

    if not isinstance(series, list) and not isinstance(series, tuple):
        raise ValueError("series argument should be list or tuple")

    if len(series) == 0:
        return None

    if len(set([s.index.freq.freqstr for s in series])) > 1:
        raise ValueError("Can only overlay series with the same frequencies")

    start = min([s.index[0].ordinal for s in series])
    end = max([s.index[-1].ordinal for s in series])

    index = RPeriodIndex(start=start, end=end, freq=series[0].index.freq)
    new_series = pd.Series(np.empty(len(index)), index=index)
    new_series[:] = np.nan

    for s in series:
        if replace:
            new_series[s.index[0]:s.index[-1]] = s
        else:
            new_series[s.index[0]:s.index[-1]][np.isnan(new_series[s.index[0]:s.index[-1]])] = \
            s[np.isnan(new_series[s.index[0]:s.index[-1]])]

    return new_series

def extend(input, extender, direction="forward", extender_type="index"):
    """
    Extend a series forward or backward using another series or an array.

    Arguments:

        direction (str): 'forward', 'backward'

        extender_type (str): 'index', 'pc' (percent change), 'pca' (percent
        change, annualized), or 'diff'. If type = pc or pca, the percent changes
        should be in decimal form, i.e. 4% = .04

    """

    # TODO make this makes nicely with DataFrame

    if not isinstance(input, pd.Series) and not isinstance(input, pd.DataFrame):
        raise ValueError("Input must be Series or DataFrame")

    input = trim(input)

    if not direction in ["forward", "backward"]:
        raise ValueError("direction must be 'forward' or 'backward'")

    if not extender_type in ["index","pc","pca","diff"]:
        raise ValueError("extender_type must be 'index', 'pc', 'pca', or 'diff'")

    if isinstance(extender, list):
        extender = np.array(extender)

    if (isinstance(extender, pd.Series) and
        isinstance(extender.index, RPeriodIndex) and
        input.index.freq != input.index.freq):
        raise ValueError("Series and extender series must have same frequency")

    if extender_type == "index":
        if direction == "forward":
            if not input.index[-1] in extender.index:
                return input
            tmp = extender/extender[input.index[-1]]*input[-1]
            return overlay((input, tmp[input.index[-1]+1:]))
        elif direction == "backward":
            if not input.index[0] in extender.index:
                return input
            tmp = extender/extender[input.index[0]]*input[0]
            return overlay((input, tmp[:input.index[0]]))

    elif extender_type == "pc":
        index = RPeriodIndex(start=extender.index[0]-1, periods=len(extender)+1,
            freq=extender.index.freq)
        tmp = pd.Series(np.empty(len(index), dtype=input.dtype), index)
        tmp[0] = 100
        tmp[1:] = 1+extender
        tmp = np.cumprod(tmp)
        return extend(input, tmp, direction=direction, extender_type='index')

    elif extender_type == "pca":
        if (not isinstance(extender, pd.Series) or 
            not isinstance(extender.index, RPeriodIndex)):
            raise ValueError("Can only use extender_type = pca with a time \
                series that has an RPeriodIndex")
        index = RPeriodIndex(start=extender.index[0]-1, periods=len(extender)+1,
            freq=extender.index.freq)
        tmp = pd.Series(np.empty(len(index), dtype=input.dtype), index)
        tmp[0] = 100
        tmp[1:] = (1+extender)**(1.0/input.index.freq.periodicity)
        tmp = np.cumprod(tmp)
        return extend(input, tmp, direction=direction, extender_type='index')

    elif extender_type == "diff":
        if direction == "forward":
            if not input.index[-1]+1 in extender.index:
                return input
            tmp = np.cumsum(extender[input.index[-1]+1:])+input[-1]
            return overlay((input, tmp[input.index[-1]+1:]))
        elif direction == "backward":
            if not input.index[0]-1 in extender.index:
                return input
            tmp = (np.cumsum(-extender[:input.index[0]-1][::-1])+input[0])[::-1]
            return overlay((input, tmp[:input.index[0]-1]))
Tech Fingerprint

Alerts (13)

'isinstance(' Overuse may indicate design issues; consider polymorphism
68 71 104 106 116 139 161 202 213 217 243 244
'def' Ensure functions have docstrings for documentation
86