PageRenderTime 21ms CodeModel.GetById 11ms app.highlight 7ms RepoModel.GetById 1ms app.codeStats 1ms

/statsmodels/tools/data.py

http://github.com/statsmodels/statsmodels
Python | 126 lines | 73 code | 30 blank | 23 comment | 24 complexity | 54ae2348c2c35112755d2c605eb0adac MD5 | raw file
  1"""
  2Compatibility tools for various data structure inputs
  3"""
  4import numpy as np
  5import pandas as pd
  6
  7
  8def _check_period_index(x, freq="M"):
  9    from pandas import PeriodIndex, DatetimeIndex
 10    if not isinstance(x.index, (DatetimeIndex, PeriodIndex)):
 11        raise ValueError("The index must be a DatetimeIndex or PeriodIndex")
 12
 13    if x.index.freq is not None:
 14        inferred_freq = x.index.freqstr
 15    else:
 16        inferred_freq = pd.infer_freq(x.index)
 17    if not inferred_freq.startswith(freq):
 18        raise ValueError("Expected frequency {}. Got {}".format(inferred_freq,
 19                                                                freq))
 20
 21
 22def is_data_frame(obj):
 23    return isinstance(obj, pd.DataFrame)
 24
 25
 26def is_design_matrix(obj):
 27    from patsy import DesignMatrix
 28    return isinstance(obj, DesignMatrix)
 29
 30
 31def _is_structured_ndarray(obj):
 32    return isinstance(obj, np.ndarray) and obj.dtype.names is not None
 33
 34
 35def interpret_data(data, colnames=None, rownames=None):
 36    """
 37    Convert passed data structure to form required by estimation classes
 38
 39    Parameters
 40    ----------
 41    data : array_like
 42    colnames : sequence or None
 43        May be part of data structure
 44    rownames : sequence or None
 45
 46    Returns
 47    -------
 48    (values, colnames, rownames) : (homogeneous ndarray, list)
 49    """
 50    if isinstance(data, np.ndarray):
 51        if _is_structured_ndarray(data):
 52            import warnings
 53            from statsmodels.tools.sm_exceptions import recarray_warning
 54            warnings.warn(recarray_warning, FutureWarning)
 55            if colnames is None:
 56                colnames = data.dtype.names
 57            values = struct_to_ndarray(data)
 58        else:
 59            values = data
 60
 61        if colnames is None:
 62            colnames = ['Y_%d' % i for i in range(values.shape[1])]
 63    elif is_data_frame(data):
 64        # XXX: hack
 65        data = data.dropna()
 66        values = data.values
 67        colnames = data.columns
 68        rownames = data.index
 69    else:  # pragma: no cover
 70        raise TypeError('Cannot handle input type {typ}'
 71                        .format(typ=type(data).__name__))
 72
 73    if not isinstance(colnames, list):
 74        colnames = list(colnames)
 75
 76    # sanity check
 77    if len(colnames) != values.shape[1]:
 78        raise ValueError('length of colnames does not match number '
 79                         'of columns in data')
 80
 81    if rownames is not None and len(rownames) != len(values):
 82        raise ValueError('length of rownames does not match number '
 83                         'of rows in data')
 84
 85    return values, colnames, rownames
 86
 87
 88def struct_to_ndarray(arr):
 89    return arr.view((float, (len(arr.dtype.names),)), type=np.ndarray)
 90
 91
 92def _is_using_ndarray_type(endog, exog):
 93    return (type(endog) is np.ndarray and
 94            (type(exog) is np.ndarray or exog is None))
 95
 96
 97def _is_using_ndarray(endog, exog):
 98    return (isinstance(endog, np.ndarray) and
 99            (isinstance(exog, np.ndarray) or exog is None))
100
101
def _is_using_pandas(endog, exog):
    """
    Return True if endog or exog is an instance of one of the classes in
    ``statsmodels.compat.pandas.data_klasses``
    """
    from statsmodels.compat.pandas import data_klasses
    return any(isinstance(candidate, data_klasses)
               for candidate in (endog, exog))
105
106
def _is_array_like(endog, exog):
    """
    Return True if both endog and exog can be converted with np.asarray

    Parameters
    ----------
    endog : object
    exog : object

    Returns
    -------
    bool
        False if ``np.asarray`` raises an ordinary exception for either
        input, True otherwise.
    """
    try:  # do it like this in case of mixed types, ie., ndarray and list
        np.asarray(endog)
        np.asarray(exog)
    except Exception:
        # only conversion failures mean "not array-like"; KeyboardInterrupt
        # and SystemExit must keep propagating
        return False
    return True
114
115
def _is_using_patsy(endog, exog):
    """
    Return True if endog is a patsy design matrix and exog is either a
    patsy design matrix or None
    """
    # we get this when a structured array is passed through a formula
    if not is_design_matrix(endog):
        return False
    exog_ok = is_design_matrix(exog) or exog is None
    return exog_ok
120
121
def _is_recarray(data):
    """
    Returns true if data is a recarray
    """
    # np.recarray is the public name of the class; the np.core namespace is
    # private and accessing it is deprecated as of NumPy 2.0
    return isinstance(data, np.recarray)