/statsmodels/tools/data.py
Python | 118 lines | 65 code | 30 blank | 23 comment | 19 complexity | bf0bf9c8197460bc9c9d1ca53d93e9d8 MD5 | raw file
Possible License(s): BSD-3-Clause
- """
- Compatibility tools for various data structure inputs
- """
- import numpy as np
- import pandas as pd
- def _check_period_index(x, freq="M"):
- from pandas import PeriodIndex, DatetimeIndex
- if not isinstance(x.index, (DatetimeIndex, PeriodIndex)):
- raise ValueError("The index must be a DatetimeIndex or PeriodIndex")
- if x.index.freq is not None:
- inferred_freq = x.index.freqstr
- else:
- inferred_freq = pd.infer_freq(x.index)
- if not inferred_freq.startswith(freq):
- raise ValueError("Expected frequency {}. Got {}".format(inferred_freq,
- freq))
def is_data_frame(obj):
    """
    Returns true if obj is a pandas DataFrame
    """
    frame_type = pd.DataFrame
    return isinstance(obj, frame_type)
def is_design_matrix(obj):
    """
    Returns true if obj is a patsy DesignMatrix
    """
    # patsy is imported lazily, only when this check is actually invoked
    import patsy

    return isinstance(obj, patsy.DesignMatrix)
- def _is_structured_ndarray(obj):
- return isinstance(obj, np.ndarray) and obj.dtype.names is not None
def interpret_data(data, colnames=None, rownames=None):
    """
    Convert passed data structure to form required by estimation classes

    Parameters
    ----------
    data : ndarray or DataFrame
        Any other type raises TypeError.
    colnames : sequence or None
        Used only for ndarray input; defaults to 'Y_0', 'Y_1', ... there.
        For a DataFrame the columns of the frame are used instead.
    rownames : sequence or None
        Used only for ndarray input. For a DataFrame the index of the
        frame (after dropping rows with missing values) is used instead.

    Returns
    -------
    values : ndarray
    colnames : list
    rownames : sequence or None
    """
    from_ndarray = isinstance(data, np.ndarray)

    # reject unsupported containers up front
    if not from_ndarray and not is_data_frame(data):  # pragma: no cover
        message = 'Cannot handle input type {typ}'
        raise TypeError(message.format(typ=type(data).__name__))

    if from_ndarray:
        values = np.asarray(data)
        if colnames is None:
            n_cols = values.shape[1]
            colnames = ['Y_%d' % pos for pos in range(n_cols)]
    else:
        # XXX: hack -- rows containing missing values are discarded
        cleaned = data.dropna()
        values = cleaned.values
        colnames = cleaned.columns
        rownames = cleaned.index

    if not isinstance(colnames, list):
        colnames = list(colnames)

    # sanity checks on the label lengths
    n_labels = len(colnames)
    if n_labels != values.shape[1]:
        raise ValueError('length of colnames does not match number '
                         'of columns in data')

    if rownames is not None:
        if len(rownames) != len(values):
            raise ValueError('length of rownames does not match number '
                             'of rows in data')

    return values, colnames, rownames
def struct_to_ndarray(arr):
    """
    View a structured array as a plain ndarray of floats

    The memory of ``arr`` is reinterpreted with a float sub-array dtype
    holding one entry per named field.
    """
    n_fields = len(arr.dtype.names)
    row_dtype = (float, (n_fields,))
    return arr.view(row_dtype, type=np.ndarray)
- def _is_using_ndarray_type(endog, exog):
- return (type(endog) is np.ndarray and
- (type(exog) is np.ndarray or exog is None))
- def _is_using_ndarray(endog, exog):
- return (isinstance(endog, np.ndarray) and
- (isinstance(exog, np.ndarray) or exog is None))
def _is_using_pandas(endog, exog):
    """
    Returns true if either endog or exog is an instance of one of the
    classes in statsmodels.compat.pandas.data_klasses
    """
    from statsmodels.compat.pandas import data_klasses as klasses

    if isinstance(endog, klasses):
        return True
    return isinstance(exog, klasses)
- def _is_array_like(endog, exog):
- try: # do it like this in case of mixed types, ie., ndarray and list
- endog = np.asarray(endog)
- exog = np.asarray(exog)
- return True
- except:
- return False
def _is_using_patsy(endog, exog):
    """
    Returns true if endog is a patsy design matrix and exog is either a
    design matrix or None
    """
    # we get this when a structured array is passed through a formula
    if not is_design_matrix(endog):
        return False
    if is_design_matrix(exog):
        return True
    return exog is None
- def _is_recarray(data):
- """
- Returns true if data is a recarray
- """
- return isinstance(data, np.core.recarray)