/statsmodels/tools/tools.py
http://github.com/statsmodels/statsmodels · Python · 623 lines · 544 code · 9 blank · 70 comment · 14 complexity · 4aacc0c59f7f978d61d7db635dad564e MD5 · raw file
- """
- Utility functions models code
- """
- import numpy as np
- import pandas as pd
- import scipy.linalg
- from statsmodels.tools.data import _is_using_pandas
- from statsmodels.tools.validation import array_like
- def asstr2(s):
- if isinstance(s, str):
- return s
- elif isinstance(s, bytes):
- return s.decode('latin1')
- else:
- return str(s)
- def _make_dictnames(tmp_arr, offset=0):
- """
- Helper function to create a dictionary mapping a column number
- to the name in tmp_arr.
- """
- col_map = {}
- for i, col_name in enumerate(tmp_arr):
- col_map[i + offset] = col_name
- return col_map
- def drop_missing(Y, X=None, axis=1):
- """
- Returns views on the arrays Y and X where missing observations are dropped.
- Y : array_like
- X : array_like, optional
- axis : int
- Axis along which to look for missing observations. Default is 1, ie.,
- observations in rows.
- Returns
- -------
- Y : ndarray
- All Y where the
- X : ndarray
- Notes
- -----
- If either Y or X is 1d, it is reshaped to be 2d.
- """
- Y = np.asarray(Y)
- if Y.ndim == 1:
- Y = Y[:, None]
- if X is not None:
- X = np.array(X)
- if X.ndim == 1:
- X = X[:, None]
- keepidx = np.logical_and(~np.isnan(Y).any(axis),
- ~np.isnan(X).any(axis))
- return Y[keepidx], X[keepidx]
- else:
- keepidx = ~np.isnan(Y).any(axis)
- return Y[keepidx]
- # TODO: needs to better preserve dtype and be more flexible
- # ie., if you still have a string variable in your array you do not
- # want to cast it to float
- # TODO: add name validator (ie., bad names for datasets.grunfeld)
- def categorical(data, col=None, dictnames=False, drop=False):
- """
- Construct a dummy matrix from categorical variables
- .. deprecated:: 0.12
- Use pandas.get_dummies instead.
- Parameters
- ----------
- data : array_like
- An array, Series or DataFrame. This can be either a 1d vector of
- the categorical variable or a 2d array with the column specifying
- the categorical variable specified by the col argument.
- col : {str, int, None}
- If data is a DataFrame col must in a column of data. If data is a
- Series, col must be either the name of the Series or None. For arrays,
- `col` can be an int that is the (zero-based) column index
- number. `col` can only be None for a 1d array. The default is None.
- dictnames : bool, optional
- If True, a dictionary mapping the column number to the categorical
- name is returned. Used to have information about plain arrays.
- drop : bool
- Whether or not keep the categorical variable in the returned matrix.
- Returns
- -------
- dummy_matrix : array_like
- A matrix of dummy (indicator/binary) float variables for the
- categorical data.
- dictnames : dict[int, str], optional
- Mapping between column numbers and categorical names.
- Notes
- -----
- This returns a dummy variable for *each* distinct variable. If a
- a DaataFrame is provided, the names for the new variable is the
- old variable name - underscore - category name. So if the a variable
- 'vote' had answers as 'yes' or 'no' then the returned array would have to
- new variables-- 'vote_yes' and 'vote_no'. There is currently
- no name checking.
- Examples
- --------
- >>> import numpy as np
- >>> import statsmodels.api as sm
- Univariate examples
- >>> import string
- >>> string_var = [string.ascii_lowercase[0:5],
- ... string.ascii_lowercase[5:10],
- ... string.ascii_lowercase[10:15],
- ... string.ascii_lowercase[15:20],
- ... string.ascii_lowercase[20:25]]
- >>> string_var *= 5
- >>> string_var = np.asarray(sorted(string_var))
- >>> design = sm.tools.categorical(string_var, drop=True)
- Or for a numerical categorical variable
- >>> instr = np.floor(np.arange(10,60, step=2)/10)
- >>> design = sm.tools.categorical(instr, drop=True)
- With a structured array
- >>> num = np.random.randn(25,2)
- >>> struct_ar = np.zeros((25,1),
- ... dtype=[('var1', 'f4'),('var2', 'f4'),
- ... ('instrument','f4'),('str_instr','a5')])
- >>> struct_ar['var1'] = num[:,0][:,None]
- >>> struct_ar['var2'] = num[:,1][:,None]
- >>> struct_ar['instrument'] = instr[:,None]
- >>> struct_ar['str_instr'] = string_var[:,None]
- >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True)
- Or
- >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True)
- """
- import warnings
- warnings.warn(
- "categorical is deprecated. Use pandas Categorical to represent "
- "categorical data and can get_dummies to construct dummy arrays. "
- "It will be removed after release 0.13.",
- FutureWarning
- )
- # TODO: add a NameValidator function
- if isinstance(col, (list, tuple)):
- if len(col) == 1:
- col = col[0]
- else:
- raise ValueError("Can only convert one column at a time")
- if (not isinstance(data, (pd.DataFrame, pd.Series)) and
- not isinstance(col, (str, int)) and
- col is not None):
- raise TypeError('col must be a str, int or None')
- # Pull out a Series from a DataFrame if provided
- if isinstance(data, pd.DataFrame):
- if col is None:
- raise TypeError('col must be a str or int when using a DataFrame')
- elif col not in data:
- raise ValueError('Column \'{0}\' not found in data'.format(col))
- data = data[col]
- # Set col to None since we not have a Series
- col = None
- if isinstance(data, pd.Series):
- if col is not None and data.name != col:
- raise ValueError('data.name does not match col '
- '\'{0}\''.format(col))
- data_cat = pd.Categorical(data)
- dummies = pd.get_dummies(data_cat)
- col_map = {i: cat for i, cat in enumerate(data_cat.categories) if
- cat in dummies}
- if not drop:
- dummies.columns = list(dummies.columns)
- dummies = pd.concat([dummies, data], axis=1)
- if dictnames:
- return dummies, col_map
- return dummies
- # Catch array_like for an error
- elif not isinstance(data, np.ndarray):
- raise NotImplementedError("array_like objects are not supported")
- else:
- if isinstance(col, int):
- offset = data.shape[1] # need error catching here?
- tmp_arr = np.unique(data[:, col])
- tmp_dummy = (tmp_arr[:, np.newaxis] == data[:, col]).astype(float)
- tmp_dummy = tmp_dummy.swapaxes(1, 0)
- if drop is True:
- offset -= 1
- data = np.delete(data, col, axis=1).astype(float)
- data = np.column_stack((data, tmp_dummy))
- if dictnames is True:
- col_map = _make_dictnames(tmp_arr, offset)
- return data, col_map
- return data
- elif col is None and np.squeeze(data).ndim == 1:
- tmp_arr = np.unique(data)
- tmp_dummy = (tmp_arr[:, None] == data).astype(float)
- tmp_dummy = tmp_dummy.swapaxes(1, 0)
- if drop is True:
- if dictnames is True:
- col_map = _make_dictnames(tmp_arr)
- return tmp_dummy, col_map
- return tmp_dummy
- else:
- data = np.column_stack((data, tmp_dummy))
- if dictnames is True:
- col_map = _make_dictnames(tmp_arr, offset=1)
- return data, col_map
- return data
- else:
- raise IndexError("The index %s is not understood" % col)
- # TODO: add an axis argument to this for sysreg
- def add_constant(data, prepend=True, has_constant='skip'):
- """
- Add a column of ones to an array.
- Parameters
- ----------
- data : array_like
- A column-ordered design matrix.
- prepend : bool
- If true, the constant is in the first column. Else the constant is
- appended (last column).
- has_constant : str {'raise', 'add', 'skip'}
- Behavior if ``data`` already has a constant. The default will return
- data without adding another constant. If 'raise', will raise an
- error if any column has a constant value. Using 'add' will add a
- column of 1s if a constant column is present.
- Returns
- -------
- array_like
- The original values with a constant (column of ones) as the first or
- last column. Returned value type depends on input type.
- Notes
- -----
- When the input is a pandas Series or DataFrame, the added column's name
- is 'const'.
- """
- if _is_using_pandas(data, None):
- from statsmodels.tsa.tsatools import add_trend
- return add_trend(data, trend='c', prepend=prepend, has_constant=has_constant)
- # Special case for NumPy
- x = np.asarray(data)
- ndim = x.ndim
- if ndim == 1:
- x = x[:, None]
- elif x.ndim > 2:
- raise ValueError('Only implemented for 2-dimensional arrays')
- is_nonzero_const = np.ptp(x, axis=0) == 0
- is_nonzero_const &= np.all(x != 0.0, axis=0)
- if is_nonzero_const.any():
- if has_constant == 'skip':
- return x
- elif has_constant == 'raise':
- if ndim == 1:
- raise ValueError("data is constant.")
- else:
- columns = np.arange(x.shape[1])
- cols = ",".join([str(c) for c in columns[is_nonzero_const]])
- raise ValueError(f"Column(s) {cols} are constant.")
- x = [np.ones(x.shape[0]), x]
- x = x if prepend else x[::-1]
- return np.column_stack(x)
- def isestimable(c, d):
- """
- True if (Q, P) contrast `c` is estimable for (N, P) design `d`.
- From an Q x P contrast matrix `C` and an N x P design matrix `D`, checks if
- the contrast `C` is estimable by looking at the rank of ``vstack([C,D])``
- and verifying it is the same as the rank of `D`.
- Parameters
- ----------
- c : array_like
- A contrast matrix with shape (Q, P). If 1 dimensional assume shape is
- (1, P).
- d : array_like
- The design matrix, (N, P).
- Returns
- -------
- bool
- True if the contrast `c` is estimable on design `d`.
- Examples
- --------
- >>> d = np.array([[1, 1, 1, 0, 0, 0],
- ... [0, 0, 0, 1, 1, 1],
- ... [1, 1, 1, 1, 1, 1]]).T
- >>> isestimable([1, 0, 0], d)
- False
- >>> isestimable([1, -1, 0], d)
- True
- """
- c = array_like(c, 'c', maxdim=2)
- d = array_like(d, 'd', ndim=2)
- c = c[None, :] if c.ndim == 1 else c
- if c.shape[1] != d.shape[1]:
- raise ValueError('Contrast should have %d columns' % d.shape[1])
- new = np.vstack([c, d])
- if np.linalg.matrix_rank(new) != np.linalg.matrix_rank(d):
- return False
- return True
- def pinv_extended(x, rcond=1e-15):
- """
- Return the pinv of an array X as well as the singular values
- used in computation.
- Code adapted from numpy.
- """
- x = np.asarray(x)
- x = x.conjugate()
- u, s, vt = np.linalg.svd(x, False)
- s_orig = np.copy(s)
- m = u.shape[0]
- n = vt.shape[1]
- cutoff = rcond * np.maximum.reduce(s)
- for i in range(min(n, m)):
- if s[i] > cutoff:
- s[i] = 1./s[i]
- else:
- s[i] = 0.
- res = np.dot(np.transpose(vt), np.multiply(s[:, np.core.newaxis],
- np.transpose(u)))
- return res, s_orig
- def recipr(x):
- """
- Reciprocal of an array with entries less than or equal to 0 set to 0.
- Parameters
- ----------
- x : array_like
- The input array.
- Returns
- -------
- ndarray
- The array with 0-filled reciprocals.
- """
- x = np.asarray(x)
- out = np.zeros_like(x, dtype=np.float64)
- nans = np.isnan(x.flat)
- pos = ~nans
- pos[pos] = pos[pos] & (x.flat[pos] > 0)
- out.flat[pos] = 1.0 / x.flat[pos]
- out.flat[nans] = np.nan
- return out
- def recipr0(x):
- """
- Reciprocal of an array with entries less than 0 set to 0.
- Parameters
- ----------
- x : array_like
- The input array.
- Returns
- -------
- ndarray
- The array with 0-filled reciprocals.
- """
- x = np.asarray(x)
- out = np.zeros_like(x, dtype=np.float64)
- nans = np.isnan(x.flat)
- non_zero = ~nans
- non_zero[non_zero] = non_zero[non_zero] & (x.flat[non_zero] != 0)
- out.flat[non_zero] = 1.0 / x.flat[non_zero]
- out.flat[nans] = np.nan
- return out
- def clean0(matrix):
- """
- Erase columns of zeros: can save some time in pseudoinverse.
- Parameters
- ----------
- matrix : ndarray
- The array to clean.
- Returns
- -------
- ndarray
- The cleaned array.
- """
- colsum = np.add.reduce(matrix**2, 0)
- val = [matrix[:, i] for i in np.flatnonzero(colsum)]
- return np.array(np.transpose(val))
- def fullrank(x, r=None):
- """
- Return an array whose column span is the same as x.
- Parameters
- ----------
- x : ndarray
- The array to adjust, 2d.
- r : int, optional
- The rank of x. If not provided, determined by `np.linalg.matrix_rank`.
- Returns
- -------
- ndarray
- The array adjusted to have full rank.
- Notes
- -----
- If the rank of x is known it can be specified as r -- no check
- is made to ensure that this really is the rank of x.
- """
- if r is None:
- r = np.linalg.matrix_rank(x)
- v, d, u = np.linalg.svd(x, full_matrices=False)
- order = np.argsort(d)
- order = order[::-1]
- value = []
- for i in range(r):
- value.append(v[:, order[i]])
- return np.asarray(np.transpose(value)).astype(np.float64)
- def unsqueeze(data, axis, oldshape):
- """
- Unsqueeze a collapsed array.
- Parameters
- ----------
- data : ndarray
- The data to unsqueeze.
- axis : int
- The axis to unsqueeze.
- oldshape : tuple[int]
- The original shape before the squeeze or reduce operation.
- Returns
- -------
- ndarray
- The unsqueezed array.
- Examples
- --------
- >>> from numpy import mean
- >>> from numpy.random import standard_normal
- >>> x = standard_normal((3,4,5))
- >>> m = mean(x, axis=1)
- >>> m.shape
- (3, 5)
- >>> m = unsqueeze(m, 1, x.shape)
- >>> m.shape
- (3, 1, 5)
- >>>
- """
- newshape = list(oldshape)
- newshape[axis] = 1
- return data.reshape(newshape)
- def nan_dot(A, B):
- """
- Returns np.dot(left_matrix, right_matrix) with the convention that
- nan * 0 = 0 and nan * x = nan if x != 0.
- Parameters
- ----------
- A, B : ndarray
- """
- # Find out who should be nan due to nan * nonzero
- should_be_nan_1 = np.dot(np.isnan(A), (B != 0))
- should_be_nan_2 = np.dot((A != 0), np.isnan(B))
- should_be_nan = should_be_nan_1 + should_be_nan_2
- # Multiply after setting all nan to 0
- # This is what happens if there were no nan * nonzero conflicts
- C = np.dot(np.nan_to_num(A), np.nan_to_num(B))
- C[should_be_nan] = np.nan
- return C
- def maybe_unwrap_results(results):
- """
- Gets raw results back from wrapped results.
- Can be used in plotting functions or other post-estimation type
- routines.
- """
- return getattr(results, '_results', results)
- class Bunch(dict):
- """
- Returns a dict-like object with keys accessible via attribute lookup.
- Parameters
- ----------
- *args
- Arguments passed to dict constructor, tuples (key, value).
- **kwargs
- Keyword agument passed to dict constructor, key=value.
- """
- def __init__(self, *args, **kwargs):
- super(Bunch, self).__init__(*args, **kwargs)
- self.__dict__ = self
- def _ensure_2d(x, ndarray=False):
- """
- Parameters
- ----------
- x : ndarray, Series, DataFrame or None
- Input to verify dimensions, and to transform as necesary
- ndarray : bool
- Flag indicating whether to always return a NumPy array. Setting False
- will return an pandas DataFrame when the input is a Series or a
- DataFrame.
- Returns
- -------
- out : ndarray, DataFrame or None
- array or DataFrame with 2 dimensiona. One dimensional arrays are
- returned as nobs by 1. None is returned if x is None.
- names : list of str or None
- list containing variables names when the input is a pandas datatype.
- Returns None if the input is an ndarray.
- Notes
- -----
- Accepts None for simplicity
- """
- if x is None:
- return x
- is_pandas = _is_using_pandas(x, None)
- if x.ndim == 2:
- if is_pandas:
- return x, x.columns
- else:
- return x, None
- elif x.ndim > 2:
- raise ValueError('x mst be 1 or 2-dimensional.')
- name = x.name if is_pandas else None
- if ndarray:
- return np.asarray(x)[:, None], name
- else:
- return pd.DataFrame(x), name
- def matrix_rank(m, tol=None, method="qr"):
- """
- Matrix rank calculation using QR or SVD
- Parameters
- ----------
- m : array_like
- A 2-d array-like object to test
- tol : float, optional
- The tolerance to use when testing the matrix rank. If not provided
- an appropriate value is selected.
- method : {"ip", "qr", "svd"}
- The method used. "ip" uses the inner-product of a normalized version
- of m and then computes the rank using NumPy's matrix_rank.
- "qr" uses a QR decomposition and is the default. "svd" defers to
- NumPy's matrix_rank.
- Returns
- -------
- int
- The rank of m.
- Notes
- -----
- When using a QR factorization, the rank is determined by the number of
- elements on the leading diagonal of the R matrix that are above tol
- in absolute value.
- """
- m = array_like(m, "m", ndim=2)
- if method == "ip":
- m = m[:, np.any(m != 0, axis=0)]
- m = m / np.sqrt((m ** 2).sum(0))
- m = m.T @ m
- return np.linalg.matrix_rank(m, tol=tol, hermitian=True)
- elif method == "qr":
- r, = scipy.linalg.qr(m, mode="r")
- abs_diag = np.abs(np.diag(r))
- if tol is None:
- tol = abs_diag[0] * m.shape[1] * np.finfo(float).eps
- return int((abs_diag > tol).sum())
- else:
- return np.linalg.matrix_rank(m, tol=tol)