PageRenderTime 136ms CodeModel.GetById 111ms app.highlight 19ms RepoModel.GetById 1ms app.codeStats 0ms

/statsmodels/tools/tools.py

http://github.com/statsmodels/statsmodels
Python | 629 lines | 545 code | 10 blank | 74 comment | 14 complexity | d28c1a4a1ea521d0880f0c27bb412bfc MD5 | raw file
  1"""
  2Utility functions models code
  3"""
  4import numpy as np
  5import numpy.lib.recfunctions as nprf
  6import pandas as pd
  7
  8from statsmodels.compat.python import lzip, lmap
  9
 10from statsmodels.tools.data import _is_using_pandas, _is_recarray
 11from statsmodels.tools.validation import array_like
 12
 13
 14def asstr2(s):
 15    if isinstance(s, str):
 16        return s
 17    elif isinstance(s, bytes):
 18        return s.decode('latin1')
 19    else:
 20        return str(s)
 21
 22
 23def _make_dictnames(tmp_arr, offset=0):
 24    """
 25    Helper function to create a dictionary mapping a column number
 26    to the name in tmp_arr.
 27    """
 28    col_map = {}
 29    for i, col_name in enumerate(tmp_arr):
 30        col_map[i + offset] = col_name
 31    return col_map
 32
 33
 34def drop_missing(Y, X=None, axis=1):
 35    """
 36    Returns views on the arrays Y and X where missing observations are dropped.
 37
 38    Y : array_like
 39    X : array_like, optional
 40    axis : int
 41        Axis along which to look for missing observations.  Default is 1, ie.,
 42        observations in rows.
 43
 44    Returns
 45    -------
 46    Y : ndarray
 47        All Y where the
 48    X : ndarray
 49
 50    Notes
 51    -----
 52    If either Y or X is 1d, it is reshaped to be 2d.
 53    """
 54    Y = np.asarray(Y)
 55    if Y.ndim == 1:
 56        Y = Y[:, None]
 57    if X is not None:
 58        X = np.array(X)
 59        if X.ndim == 1:
 60            X = X[:, None]
 61        keepidx = np.logical_and(~np.isnan(Y).any(axis),
 62                                 ~np.isnan(X).any(axis))
 63        return Y[keepidx], X[keepidx]
 64    else:
 65        keepidx = ~np.isnan(Y).any(axis)
 66        return Y[keepidx]
 67
 68
 69# TODO: needs to better preserve dtype and be more flexible
 70# ie., if you still have a string variable in your array you do not
 71# want to cast it to float
 72# TODO: add name validator (ie., bad names for datasets.grunfeld)
 73def categorical(data, col=None, dictnames=False, drop=False):
 74    """
 75    Construct a dummy matrix from categorical variables
 76
 77    Parameters
 78    ----------
 79    data : array_like
 80        A structured array, recarray, array, Series or DataFrame.  This can be
 81        either a 1d vector of the categorical variable or a 2d array with
 82        the column specifying the categorical variable specified by the col
 83        argument.
 84    col : {str, int, None}
 85        If data is a DataFrame col must in a column of data. If data is a
 86        Series, col must be either the name of the Series or None. If data is a
 87        structured array or a recarray, `col` can be a string that is the name
 88        of the column that contains the variable.  For all other
 89        arrays `col` can be an int that is the (zero-based) column index
 90        number.  `col` can only be None for a 1d array.  The default is None.
 91    dictnames : bool, optional
 92        If True, a dictionary mapping the column number to the categorical
 93        name is returned.  Used to have information about plain arrays.
 94    drop : bool
 95        Whether or not keep the categorical variable in the returned matrix.
 96
 97    Returns
 98    -------
 99    dummy_matrix : array_like
100        A matrix of dummy (indicator/binary) float variables for the
101        categorical data.
102    dictnames :  dict[int, str], optional
103        Mapping between column numbers and categorical names.
104
105    Notes
106    -----
107    This returns a dummy variable for *each* distinct variable.  If a
108    a structured or recarray is provided, the names for the new variable is the
109    old variable name - underscore - category name.  So if the a variable
110    'vote' had answers as 'yes' or 'no' then the returned array would have to
111    new variables-- 'vote_yes' and 'vote_no'.  There is currently
112    no name checking.
113
114    Examples
115    --------
116    >>> import numpy as np
117    >>> import statsmodels.api as sm
118
119    Univariate examples
120
121    >>> import string
122    >>> string_var = [string.ascii_lowercase[0:5],
123    ...               string.ascii_lowercase[5:10],
124    ...               string.ascii_lowercase[10:15],
125    ...               string.ascii_lowercase[15:20],
126    ...               string.ascii_lowercase[20:25]]
127    >>> string_var *= 5
128    >>> string_var = np.asarray(sorted(string_var))
129    >>> design = sm.tools.categorical(string_var, drop=True)
130
131    Or for a numerical categorical variable
132
133    >>> instr = np.floor(np.arange(10,60, step=2)/10)
134    >>> design = sm.tools.categorical(instr, drop=True)
135
136    With a structured array
137
138    >>> num = np.random.randn(25,2)
139    >>> struct_ar = np.zeros((25,1),
140    ...                      dtype=[('var1', 'f4'),('var2', 'f4'),
141    ...                             ('instrument','f4'),('str_instr','a5')])
142    >>> struct_ar['var1'] = num[:,0][:,None]
143    >>> struct_ar['var2'] = num[:,1][:,None]
144    >>> struct_ar['instrument'] = instr[:,None]
145    >>> struct_ar['str_instr'] = string_var[:,None]
146    >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True)
147
148    Or
149
150    >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True)
151    """
152    # TODO: add a NameValidator function
153    if isinstance(col, (list, tuple)):
154        if len(col) == 1:
155            col = col[0]
156        else:
157            raise ValueError("Can only convert one column at a time")
158    if (not isinstance(data, (pd.DataFrame, pd.Series)) and
159            not isinstance(col, (str, int)) and
160            col is not None):
161        raise TypeError('col must be a str, int or None')
162
163    # Pull out a Series from a DataFrame if provided
164    if isinstance(data, pd.DataFrame):
165        if col is None:
166            raise TypeError('col must be a str or int when using a DataFrame')
167        elif col not in data:
168            raise ValueError('Column \'{0}\' not found in data'.format(col))
169        data = data[col]
170        # Set col to None since we not have a Series
171        col = None
172
173    if isinstance(data, pd.Series):
174        if col is not None and data.name != col:
175            raise ValueError('data.name does not match col '
176                             '\'{0}\''.format(col))
177        data_cat = pd.Categorical(data)
178        dummies = pd.get_dummies(data_cat)
179        col_map = {i: cat for i, cat in enumerate(data_cat.categories) if
180                   cat in dummies}
181        if not drop:
182            dummies.columns = list(dummies.columns)
183            dummies = pd.concat([dummies, data], 1)
184        if dictnames:
185            return dummies, col_map
186        return dummies
187    # catch recarrays and structured arrays
188    elif data.dtype.names or data.__class__ is np.recarray:
189        # deprecated: remove path after 0.12
190        import warnings
191        from statsmodels.tools.sm_exceptions import recarray_warning
192        warnings.warn(recarray_warning, FutureWarning)
193        if not col and np.squeeze(data).ndim > 1:
194            raise IndexError("col is None and the input array is not 1d")
195        if isinstance(col, int):
196            col = data.dtype.names[col]
197        if col is None and data.dtype.names and len(data.dtype.names) == 1:
198            col = data.dtype.names[0]
199
200        tmp_arr = np.unique(data[col])
201
202        # if the cols are shape (#,) vs (#,1) need to add an axis and flip
203        _swap = True
204        if data[col].ndim == 1:
205            tmp_arr = tmp_arr[:, None]
206            _swap = False
207        tmp_dummy = (tmp_arr == data[col]).astype(float)
208        if _swap:
209            tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)
210
211        if not tmp_arr.dtype.names:  # how do we get to this code path?
212            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr)]
213        elif tmp_arr.dtype.names:
214            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr.tolist())]
215
216        # prepend the varname and underscore, if col is numeric attribute
217        # lookup is lost for recarrays...
218        if col is None:
219            try:
220                col = data.dtype.names[0]
221            except:
222                col = 'var'
223        # TODO: the above needs to be made robust because there could be many
224        # var_yes, var_no varaibles for instance.
225        tmp_arr = [col + '_' + item for item in tmp_arr]
226        # TODO: test this for rec and structured arrays!!!
227
228        if drop is True:
229            if len(data.dtype) <= 1:
230                if tmp_dummy.shape[0] < tmp_dummy.shape[1]:
231                    tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)
232                dt = lzip(tmp_arr, [tmp_dummy.dtype.str]*len(tmp_arr))
233                # preserve array type
234                return np.array(lmap(tuple, tmp_dummy.tolist()),
235                                dtype=dt).view(type(data))
236
237            data = nprf.drop_fields(data, col, usemask=False,
238                                    asrecarray=type(data) is np.recarray)
239        data = nprf.append_fields(data, tmp_arr, data=tmp_dummy,
240                                  usemask=False,
241                                  asrecarray=type(data) is np.recarray)
242        return data
243
244    # Catch array_like for an error
245    elif not isinstance(data, np.ndarray):
246        raise NotImplementedError("array_like objects are not supported")
247    else:
248        if isinstance(col, int):
249            offset = data.shape[1]          # need error catching here?
250            tmp_arr = np.unique(data[:, col])
251            tmp_dummy = (tmp_arr[:, np.newaxis] == data[:, col]).astype(float)
252            tmp_dummy = tmp_dummy.swapaxes(1, 0)
253            if drop is True:
254                offset -= 1
255                data = np.delete(data, col, axis=1).astype(float)
256            data = np.column_stack((data, tmp_dummy))
257            if dictnames is True:
258                col_map = _make_dictnames(tmp_arr, offset)
259                return data, col_map
260            return data
261        elif col is None and np.squeeze(data).ndim == 1:
262            tmp_arr = np.unique(data)
263            tmp_dummy = (tmp_arr[:, None] == data).astype(float)
264            tmp_dummy = tmp_dummy.swapaxes(1, 0)
265            if drop is True:
266                if dictnames is True:
267                    col_map = _make_dictnames(tmp_arr)
268                    return tmp_dummy, col_map
269                return tmp_dummy
270            else:
271                data = np.column_stack((data, tmp_dummy))
272                if dictnames is True:
273                    col_map = _make_dictnames(tmp_arr, offset=1)
274                    return data, col_map
275                return data
276        else:
277            raise IndexError("The index %s is not understood" % col)
278
279
280# TODO: add an axis argument to this for sysreg
281def add_constant(data, prepend=True, has_constant='skip'):
282    """
283    Add a column of ones to an array.
284
285    Parameters
286    ----------
287    data : array_like
288        A column-ordered design matrix.
289    prepend : bool
290        If true, the constant is in the first column.  Else the constant is
291        appended (last column).
292    has_constant : str {'raise', 'add', 'skip'}
293        Behavior if ``data`` already has a constant. The default will return
294        data without adding another constant. If 'raise', will raise an
295        error if any column has a constant value. Using 'add' will add a
296        column of 1s if a constant column is present.
297
298    Returns
299    -------
300    array_like
301        The original values with a constant (column of ones) as the first or
302        last column. Returned value type depends on input type.
303
304    Notes
305    -----
306    When the input is recarray or a pandas Series or DataFrame, the added
307    column's name is 'const'.
308    """
309    if _is_using_pandas(data, None) or _is_recarray(data):
310        if _is_recarray(data):
311            # deprecated: remove recarray support after 0.12
312            import warnings
313            from statsmodels.tools.sm_exceptions import recarray_warning
314            warnings.warn(recarray_warning, FutureWarning)
315        from statsmodels.tsa.tsatools import add_trend
316        return add_trend(data, trend='c', prepend=prepend, has_constant=has_constant)
317
318    # Special case for NumPy
319    x = np.asanyarray(data)
320    if x.ndim == 1:
321        x = x[:, None]
322    elif x.ndim > 2:
323        raise ValueError('Only implemented for 2-dimensional arrays')
324
325    is_nonzero_const = np.ptp(x, axis=0) == 0
326    is_nonzero_const &= np.all(x != 0.0, axis=0)
327    if is_nonzero_const.any():
328        if has_constant == 'skip':
329            return x
330        elif has_constant == 'raise':
331            raise ValueError("data already contains a constant")
332
333    x = [np.ones(x.shape[0]), x]
334    x = x if prepend else x[::-1]
335    return np.column_stack(x)
336
337
338def isestimable(c, d):
339    """
340    True if (Q, P) contrast `c` is estimable for (N, P) design `d`.
341
342    From an Q x P contrast matrix `C` and an N x P design matrix `D`, checks if
343    the contrast `C` is estimable by looking at the rank of ``vstack([C,D])``
344    and verifying it is the same as the rank of `D`.
345
346    Parameters
347    ----------
348    c : array_like
349        A contrast matrix with shape (Q, P). If 1 dimensional assume shape is
350        (1, P).
351    d : array_like
352        The design matrix, (N, P).
353
354    Returns
355    -------
356    bool
357        True if the contrast `c` is estimable on design `d`.
358
359    Examples
360    --------
361    >>> d = np.array([[1, 1, 1, 0, 0, 0],
362    ...               [0, 0, 0, 1, 1, 1],
363    ...               [1, 1, 1, 1, 1, 1]]).T
364    >>> isestimable([1, 0, 0], d)
365    False
366    >>> isestimable([1, -1, 0], d)
367    True
368    """
369    c = array_like(c, 'c', maxdim=2)
370    d = array_like(d, 'd', ndim=2)
371    c = c[None, :] if c.ndim == 1 else c
372    if c.shape[1] != d.shape[1]:
373        raise ValueError('Contrast should have %d columns' % d.shape[1])
374    new = np.vstack([c, d])
375    if np.linalg.matrix_rank(new) != np.linalg.matrix_rank(d):
376        return False
377    return True
378
379
380def pinv_extended(x, rcond=1e-15):
381    """
382    Return the pinv of an array X as well as the singular values
383    used in computation.
384
385    Code adapted from numpy.
386    """
387    x = np.asarray(x)
388    x = x.conjugate()
389    u, s, vt = np.linalg.svd(x, False)
390    s_orig = np.copy(s)
391    m = u.shape[0]
392    n = vt.shape[1]
393    cutoff = rcond * np.maximum.reduce(s)
394    for i in range(min(n, m)):
395        if s[i] > cutoff:
396            s[i] = 1./s[i]
397        else:
398            s[i] = 0.
399    res = np.dot(np.transpose(vt), np.multiply(s[:, np.core.newaxis],
400                                               np.transpose(u)))
401    return res, s_orig
402
403
404def recipr(x):
405    """
406    Reciprocal of an array with entries less than or equal to 0 set to 0.
407
408    Parameters
409    ----------
410    x : array_like
411        The input array.
412
413    Returns
414    -------
415    ndarray
416        The array with 0-filled reciprocals.
417    """
418    x = np.asarray(x)
419    out = np.zeros_like(x, dtype=np.float64)
420    nans = np.isnan(x.flat)
421    pos = ~nans
422    pos[pos] = pos[pos] & (x.flat[pos] > 0)
423    out.flat[pos] = 1.0 / x.flat[pos]
424    out.flat[nans] = np.nan
425    return out
426
427
428def recipr0(x):
429    """
430    Reciprocal of an array with entries less than 0 set to 0.
431
432    Parameters
433    ----------
434    x : array_like
435        The input array.
436
437    Returns
438    -------
439    ndarray
440        The array with 0-filled reciprocals.
441    """
442    x = np.asarray(x)
443    out = np.zeros_like(x, dtype=np.float64)
444    nans = np.isnan(x.flat)
445    non_zero = ~nans
446    non_zero[non_zero] = non_zero[non_zero] & (x.flat[non_zero] != 0)
447    out.flat[non_zero] = 1.0 / x.flat[non_zero]
448    out.flat[nans] = np.nan
449    return out
450
451
452def clean0(matrix):
453    """
454    Erase columns of zeros: can save some time in pseudoinverse.
455
456    Parameters
457    ----------
458    matrix : ndarray
459        The array to clean.
460
461    Returns
462    -------
463    ndarray
464        The cleaned array.
465    """
466    colsum = np.add.reduce(matrix**2, 0)
467    val = [matrix[:, i] for i in np.flatnonzero(colsum)]
468    return np.array(np.transpose(val))
469
470
471def fullrank(x, r=None):
472    """
473    Return an array whose column span is the same as x.
474
475    Parameters
476    ----------
477    x : ndarray
478        The array to adjust, 2d.
479    r : int, optional
480        The rank of x. If not provided, determined by `np.linalg.matrix_rank`.
481
482    Returns
483    -------
484    ndarray
485        The array adjusted to have full rank.
486
487    Notes
488    -----
489    If the rank of x is known it can be specified as r -- no check
490    is made to ensure that this really is the rank of x.
491    """
492    if r is None:
493        r = np.linalg.matrix_rank(x)
494
495    v, d, u = np.linalg.svd(x, full_matrices=False)
496    order = np.argsort(d)
497    order = order[::-1]
498    value = []
499    for i in range(r):
500        value.append(v[:, order[i]])
501    return np.asarray(np.transpose(value)).astype(np.float64)
502
503
504def unsqueeze(data, axis, oldshape):
505    """
506    Unsqueeze a collapsed array.
507
508    Parameters
509    ----------
510    data : ndarray
511        The data to unsqueeze.
512    axis : int
513        The axis to unsqueeze.
514    oldshape : tuple[int]
515        The original shape before the squeeze or reduce operation.
516
517    Returns
518    -------
519    ndarray
520        The unsqueezed array.
521
522    Examples
523    --------
524    >>> from numpy import mean
525    >>> from numpy.random import standard_normal
526    >>> x = standard_normal((3,4,5))
527    >>> m = mean(x, axis=1)
528    >>> m.shape
529    (3, 5)
530    >>> m = unsqueeze(m, 1, x.shape)
531    >>> m.shape
532    (3, 1, 5)
533    >>>
534    """
535    newshape = list(oldshape)
536    newshape[axis] = 1
537    return data.reshape(newshape)
538
539
540def nan_dot(A, B):
541    """
542    Returns np.dot(left_matrix, right_matrix) with the convention that
543    nan * 0 = 0 and nan * x = nan if x != 0.
544
545    Parameters
546    ----------
547    A, B : ndarray
548    """
549    # Find out who should be nan due to nan * nonzero
550    should_be_nan_1 = np.dot(np.isnan(A), (B != 0))
551    should_be_nan_2 = np.dot((A != 0), np.isnan(B))
552    should_be_nan = should_be_nan_1 + should_be_nan_2
553
554    # Multiply after setting all nan to 0
555    # This is what happens if there were no nan * nonzero conflicts
556    C = np.dot(np.nan_to_num(A), np.nan_to_num(B))
557
558    C[should_be_nan] = np.nan
559
560    return C
561
562
563def maybe_unwrap_results(results):
564    """
565    Gets raw results back from wrapped results.
566
567    Can be used in plotting functions or other post-estimation type
568    routines.
569    """
570    return getattr(results, '_results', results)
571
572
573class Bunch(dict):
574    """
575    Returns a dict-like object with keys accessible via attribute lookup.
576
577    Parameters
578    ----------
579    *args
580        Arguments passed to dict constructor, tuples (key, value).
581    **kwargs
582        Keyword agument passed to dict constructor, key=value.
583    """
584    def __init__(self, *args, **kwargs):
585        super(Bunch, self).__init__(*args, **kwargs)
586        self.__dict__ = self
587
588
589def _ensure_2d(x, ndarray=False):
590    """
591
592    Parameters
593    ----------
594    x : ndarray, Series, DataFrame or None
595        Input to verify dimensions, and to transform as necesary
596    ndarray : bool
597        Flag indicating whether to always return a NumPy array. Setting False
598        will return an pandas DataFrame when the input is a Series or a
599        DataFrame.
600
601    Returns
602    -------
603    out : ndarray, DataFrame or None
604        array or DataFrame with 2 dimensiona.  One dimensional arrays are
605        returned as nobs by 1. None is returned if x is None.
606    names : list of str or None
607        list containing variables names when the input is a pandas datatype.
608        Returns None if the input is an ndarray.
609
610    Notes
611    -----
612    Accepts None for simplicity
613    """
614    if x is None:
615        return x
616    is_pandas = _is_using_pandas(x, None)
617    if x.ndim == 2:
618        if is_pandas:
619            return x, x.columns
620        else:
621            return x, None
622    elif x.ndim > 2:
623        raise ValueError('x mst be 1 or 2-dimensional.')
624
625    name = x.name if is_pandas else None
626    if ndarray:
627        return np.asarray(x)[:, None], name
628    else:
629        return pd.DataFrame(x), name