/statsmodels/tools/tools.py

http://github.com/statsmodels/statsmodels · Python · 623 lines · 544 code · 9 blank · 70 comment · 14 complexity · 4aacc0c59f7f978d61d7db635dad564e MD5 · raw file

  1. """
  2. Utility functions models code
  3. """
  4. import numpy as np
  5. import pandas as pd
  6. import scipy.linalg
  7. from statsmodels.tools.data import _is_using_pandas
  8. from statsmodels.tools.validation import array_like
  9. def asstr2(s):
  10. if isinstance(s, str):
  11. return s
  12. elif isinstance(s, bytes):
  13. return s.decode('latin1')
  14. else:
  15. return str(s)
  16. def _make_dictnames(tmp_arr, offset=0):
  17. """
  18. Helper function to create a dictionary mapping a column number
  19. to the name in tmp_arr.
  20. """
  21. col_map = {}
  22. for i, col_name in enumerate(tmp_arr):
  23. col_map[i + offset] = col_name
  24. return col_map
  25. def drop_missing(Y, X=None, axis=1):
  26. """
  27. Returns views on the arrays Y and X where missing observations are dropped.
  28. Y : array_like
  29. X : array_like, optional
  30. axis : int
  31. Axis along which to look for missing observations. Default is 1, ie.,
  32. observations in rows.
  33. Returns
  34. -------
  35. Y : ndarray
  36. All Y where the
  37. X : ndarray
  38. Notes
  39. -----
  40. If either Y or X is 1d, it is reshaped to be 2d.
  41. """
  42. Y = np.asarray(Y)
  43. if Y.ndim == 1:
  44. Y = Y[:, None]
  45. if X is not None:
  46. X = np.array(X)
  47. if X.ndim == 1:
  48. X = X[:, None]
  49. keepidx = np.logical_and(~np.isnan(Y).any(axis),
  50. ~np.isnan(X).any(axis))
  51. return Y[keepidx], X[keepidx]
  52. else:
  53. keepidx = ~np.isnan(Y).any(axis)
  54. return Y[keepidx]
  55. # TODO: needs to better preserve dtype and be more flexible
  56. # ie., if you still have a string variable in your array you do not
  57. # want to cast it to float
  58. # TODO: add name validator (ie., bad names for datasets.grunfeld)
  59. def categorical(data, col=None, dictnames=False, drop=False):
  60. """
  61. Construct a dummy matrix from categorical variables
  62. .. deprecated:: 0.12
  63. Use pandas.get_dummies instead.
  64. Parameters
  65. ----------
  66. data : array_like
  67. An array, Series or DataFrame. This can be either a 1d vector of
  68. the categorical variable or a 2d array with the column specifying
  69. the categorical variable specified by the col argument.
  70. col : {str, int, None}
  71. If data is a DataFrame col must in a column of data. If data is a
  72. Series, col must be either the name of the Series or None. For arrays,
  73. `col` can be an int that is the (zero-based) column index
  74. number. `col` can only be None for a 1d array. The default is None.
  75. dictnames : bool, optional
  76. If True, a dictionary mapping the column number to the categorical
  77. name is returned. Used to have information about plain arrays.
  78. drop : bool
  79. Whether or not keep the categorical variable in the returned matrix.
  80. Returns
  81. -------
  82. dummy_matrix : array_like
  83. A matrix of dummy (indicator/binary) float variables for the
  84. categorical data.
  85. dictnames : dict[int, str], optional
  86. Mapping between column numbers and categorical names.
  87. Notes
  88. -----
  89. This returns a dummy variable for *each* distinct variable. If a
  90. a DaataFrame is provided, the names for the new variable is the
  91. old variable name - underscore - category name. So if the a variable
  92. 'vote' had answers as 'yes' or 'no' then the returned array would have to
  93. new variables-- 'vote_yes' and 'vote_no'. There is currently
  94. no name checking.
  95. Examples
  96. --------
  97. >>> import numpy as np
  98. >>> import statsmodels.api as sm
  99. Univariate examples
  100. >>> import string
  101. >>> string_var = [string.ascii_lowercase[0:5],
  102. ... string.ascii_lowercase[5:10],
  103. ... string.ascii_lowercase[10:15],
  104. ... string.ascii_lowercase[15:20],
  105. ... string.ascii_lowercase[20:25]]
  106. >>> string_var *= 5
  107. >>> string_var = np.asarray(sorted(string_var))
  108. >>> design = sm.tools.categorical(string_var, drop=True)
  109. Or for a numerical categorical variable
  110. >>> instr = np.floor(np.arange(10,60, step=2)/10)
  111. >>> design = sm.tools.categorical(instr, drop=True)
  112. With a structured array
  113. >>> num = np.random.randn(25,2)
  114. >>> struct_ar = np.zeros((25,1),
  115. ... dtype=[('var1', 'f4'),('var2', 'f4'),
  116. ... ('instrument','f4'),('str_instr','a5')])
  117. >>> struct_ar['var1'] = num[:,0][:,None]
  118. >>> struct_ar['var2'] = num[:,1][:,None]
  119. >>> struct_ar['instrument'] = instr[:,None]
  120. >>> struct_ar['str_instr'] = string_var[:,None]
  121. >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True)
  122. Or
  123. >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True)
  124. """
  125. import warnings
  126. warnings.warn(
  127. "categorical is deprecated. Use pandas Categorical to represent "
  128. "categorical data and can get_dummies to construct dummy arrays. "
  129. "It will be removed after release 0.13.",
  130. FutureWarning
  131. )
  132. # TODO: add a NameValidator function
  133. if isinstance(col, (list, tuple)):
  134. if len(col) == 1:
  135. col = col[0]
  136. else:
  137. raise ValueError("Can only convert one column at a time")
  138. if (not isinstance(data, (pd.DataFrame, pd.Series)) and
  139. not isinstance(col, (str, int)) and
  140. col is not None):
  141. raise TypeError('col must be a str, int or None')
  142. # Pull out a Series from a DataFrame if provided
  143. if isinstance(data, pd.DataFrame):
  144. if col is None:
  145. raise TypeError('col must be a str or int when using a DataFrame')
  146. elif col not in data:
  147. raise ValueError('Column \'{0}\' not found in data'.format(col))
  148. data = data[col]
  149. # Set col to None since we not have a Series
  150. col = None
  151. if isinstance(data, pd.Series):
  152. if col is not None and data.name != col:
  153. raise ValueError('data.name does not match col '
  154. '\'{0}\''.format(col))
  155. data_cat = pd.Categorical(data)
  156. dummies = pd.get_dummies(data_cat)
  157. col_map = {i: cat for i, cat in enumerate(data_cat.categories) if
  158. cat in dummies}
  159. if not drop:
  160. dummies.columns = list(dummies.columns)
  161. dummies = pd.concat([dummies, data], axis=1)
  162. if dictnames:
  163. return dummies, col_map
  164. return dummies
  165. # Catch array_like for an error
  166. elif not isinstance(data, np.ndarray):
  167. raise NotImplementedError("array_like objects are not supported")
  168. else:
  169. if isinstance(col, int):
  170. offset = data.shape[1] # need error catching here?
  171. tmp_arr = np.unique(data[:, col])
  172. tmp_dummy = (tmp_arr[:, np.newaxis] == data[:, col]).astype(float)
  173. tmp_dummy = tmp_dummy.swapaxes(1, 0)
  174. if drop is True:
  175. offset -= 1
  176. data = np.delete(data, col, axis=1).astype(float)
  177. data = np.column_stack((data, tmp_dummy))
  178. if dictnames is True:
  179. col_map = _make_dictnames(tmp_arr, offset)
  180. return data, col_map
  181. return data
  182. elif col is None and np.squeeze(data).ndim == 1:
  183. tmp_arr = np.unique(data)
  184. tmp_dummy = (tmp_arr[:, None] == data).astype(float)
  185. tmp_dummy = tmp_dummy.swapaxes(1, 0)
  186. if drop is True:
  187. if dictnames is True:
  188. col_map = _make_dictnames(tmp_arr)
  189. return tmp_dummy, col_map
  190. return tmp_dummy
  191. else:
  192. data = np.column_stack((data, tmp_dummy))
  193. if dictnames is True:
  194. col_map = _make_dictnames(tmp_arr, offset=1)
  195. return data, col_map
  196. return data
  197. else:
  198. raise IndexError("The index %s is not understood" % col)
  199. # TODO: add an axis argument to this for sysreg
  200. def add_constant(data, prepend=True, has_constant='skip'):
  201. """
  202. Add a column of ones to an array.
  203. Parameters
  204. ----------
  205. data : array_like
  206. A column-ordered design matrix.
  207. prepend : bool
  208. If true, the constant is in the first column. Else the constant is
  209. appended (last column).
  210. has_constant : str {'raise', 'add', 'skip'}
  211. Behavior if ``data`` already has a constant. The default will return
  212. data without adding another constant. If 'raise', will raise an
  213. error if any column has a constant value. Using 'add' will add a
  214. column of 1s if a constant column is present.
  215. Returns
  216. -------
  217. array_like
  218. The original values with a constant (column of ones) as the first or
  219. last column. Returned value type depends on input type.
  220. Notes
  221. -----
  222. When the input is a pandas Series or DataFrame, the added column's name
  223. is 'const'.
  224. """
  225. if _is_using_pandas(data, None):
  226. from statsmodels.tsa.tsatools import add_trend
  227. return add_trend(data, trend='c', prepend=prepend, has_constant=has_constant)
  228. # Special case for NumPy
  229. x = np.asarray(data)
  230. ndim = x.ndim
  231. if ndim == 1:
  232. x = x[:, None]
  233. elif x.ndim > 2:
  234. raise ValueError('Only implemented for 2-dimensional arrays')
  235. is_nonzero_const = np.ptp(x, axis=0) == 0
  236. is_nonzero_const &= np.all(x != 0.0, axis=0)
  237. if is_nonzero_const.any():
  238. if has_constant == 'skip':
  239. return x
  240. elif has_constant == 'raise':
  241. if ndim == 1:
  242. raise ValueError("data is constant.")
  243. else:
  244. columns = np.arange(x.shape[1])
  245. cols = ",".join([str(c) for c in columns[is_nonzero_const]])
  246. raise ValueError(f"Column(s) {cols} are constant.")
  247. x = [np.ones(x.shape[0]), x]
  248. x = x if prepend else x[::-1]
  249. return np.column_stack(x)
  250. def isestimable(c, d):
  251. """
  252. True if (Q, P) contrast `c` is estimable for (N, P) design `d`.
  253. From an Q x P contrast matrix `C` and an N x P design matrix `D`, checks if
  254. the contrast `C` is estimable by looking at the rank of ``vstack([C,D])``
  255. and verifying it is the same as the rank of `D`.
  256. Parameters
  257. ----------
  258. c : array_like
  259. A contrast matrix with shape (Q, P). If 1 dimensional assume shape is
  260. (1, P).
  261. d : array_like
  262. The design matrix, (N, P).
  263. Returns
  264. -------
  265. bool
  266. True if the contrast `c` is estimable on design `d`.
  267. Examples
  268. --------
  269. >>> d = np.array([[1, 1, 1, 0, 0, 0],
  270. ... [0, 0, 0, 1, 1, 1],
  271. ... [1, 1, 1, 1, 1, 1]]).T
  272. >>> isestimable([1, 0, 0], d)
  273. False
  274. >>> isestimable([1, -1, 0], d)
  275. True
  276. """
  277. c = array_like(c, 'c', maxdim=2)
  278. d = array_like(d, 'd', ndim=2)
  279. c = c[None, :] if c.ndim == 1 else c
  280. if c.shape[1] != d.shape[1]:
  281. raise ValueError('Contrast should have %d columns' % d.shape[1])
  282. new = np.vstack([c, d])
  283. if np.linalg.matrix_rank(new) != np.linalg.matrix_rank(d):
  284. return False
  285. return True
  286. def pinv_extended(x, rcond=1e-15):
  287. """
  288. Return the pinv of an array X as well as the singular values
  289. used in computation.
  290. Code adapted from numpy.
  291. """
  292. x = np.asarray(x)
  293. x = x.conjugate()
  294. u, s, vt = np.linalg.svd(x, False)
  295. s_orig = np.copy(s)
  296. m = u.shape[0]
  297. n = vt.shape[1]
  298. cutoff = rcond * np.maximum.reduce(s)
  299. for i in range(min(n, m)):
  300. if s[i] > cutoff:
  301. s[i] = 1./s[i]
  302. else:
  303. s[i] = 0.
  304. res = np.dot(np.transpose(vt), np.multiply(s[:, np.core.newaxis],
  305. np.transpose(u)))
  306. return res, s_orig
  307. def recipr(x):
  308. """
  309. Reciprocal of an array with entries less than or equal to 0 set to 0.
  310. Parameters
  311. ----------
  312. x : array_like
  313. The input array.
  314. Returns
  315. -------
  316. ndarray
  317. The array with 0-filled reciprocals.
  318. """
  319. x = np.asarray(x)
  320. out = np.zeros_like(x, dtype=np.float64)
  321. nans = np.isnan(x.flat)
  322. pos = ~nans
  323. pos[pos] = pos[pos] & (x.flat[pos] > 0)
  324. out.flat[pos] = 1.0 / x.flat[pos]
  325. out.flat[nans] = np.nan
  326. return out
  327. def recipr0(x):
  328. """
  329. Reciprocal of an array with entries less than 0 set to 0.
  330. Parameters
  331. ----------
  332. x : array_like
  333. The input array.
  334. Returns
  335. -------
  336. ndarray
  337. The array with 0-filled reciprocals.
  338. """
  339. x = np.asarray(x)
  340. out = np.zeros_like(x, dtype=np.float64)
  341. nans = np.isnan(x.flat)
  342. non_zero = ~nans
  343. non_zero[non_zero] = non_zero[non_zero] & (x.flat[non_zero] != 0)
  344. out.flat[non_zero] = 1.0 / x.flat[non_zero]
  345. out.flat[nans] = np.nan
  346. return out
  347. def clean0(matrix):
  348. """
  349. Erase columns of zeros: can save some time in pseudoinverse.
  350. Parameters
  351. ----------
  352. matrix : ndarray
  353. The array to clean.
  354. Returns
  355. -------
  356. ndarray
  357. The cleaned array.
  358. """
  359. colsum = np.add.reduce(matrix**2, 0)
  360. val = [matrix[:, i] for i in np.flatnonzero(colsum)]
  361. return np.array(np.transpose(val))
  362. def fullrank(x, r=None):
  363. """
  364. Return an array whose column span is the same as x.
  365. Parameters
  366. ----------
  367. x : ndarray
  368. The array to adjust, 2d.
  369. r : int, optional
  370. The rank of x. If not provided, determined by `np.linalg.matrix_rank`.
  371. Returns
  372. -------
  373. ndarray
  374. The array adjusted to have full rank.
  375. Notes
  376. -----
  377. If the rank of x is known it can be specified as r -- no check
  378. is made to ensure that this really is the rank of x.
  379. """
  380. if r is None:
  381. r = np.linalg.matrix_rank(x)
  382. v, d, u = np.linalg.svd(x, full_matrices=False)
  383. order = np.argsort(d)
  384. order = order[::-1]
  385. value = []
  386. for i in range(r):
  387. value.append(v[:, order[i]])
  388. return np.asarray(np.transpose(value)).astype(np.float64)
  389. def unsqueeze(data, axis, oldshape):
  390. """
  391. Unsqueeze a collapsed array.
  392. Parameters
  393. ----------
  394. data : ndarray
  395. The data to unsqueeze.
  396. axis : int
  397. The axis to unsqueeze.
  398. oldshape : tuple[int]
  399. The original shape before the squeeze or reduce operation.
  400. Returns
  401. -------
  402. ndarray
  403. The unsqueezed array.
  404. Examples
  405. --------
  406. >>> from numpy import mean
  407. >>> from numpy.random import standard_normal
  408. >>> x = standard_normal((3,4,5))
  409. >>> m = mean(x, axis=1)
  410. >>> m.shape
  411. (3, 5)
  412. >>> m = unsqueeze(m, 1, x.shape)
  413. >>> m.shape
  414. (3, 1, 5)
  415. >>>
  416. """
  417. newshape = list(oldshape)
  418. newshape[axis] = 1
  419. return data.reshape(newshape)
  420. def nan_dot(A, B):
  421. """
  422. Returns np.dot(left_matrix, right_matrix) with the convention that
  423. nan * 0 = 0 and nan * x = nan if x != 0.
  424. Parameters
  425. ----------
  426. A, B : ndarray
  427. """
  428. # Find out who should be nan due to nan * nonzero
  429. should_be_nan_1 = np.dot(np.isnan(A), (B != 0))
  430. should_be_nan_2 = np.dot((A != 0), np.isnan(B))
  431. should_be_nan = should_be_nan_1 + should_be_nan_2
  432. # Multiply after setting all nan to 0
  433. # This is what happens if there were no nan * nonzero conflicts
  434. C = np.dot(np.nan_to_num(A), np.nan_to_num(B))
  435. C[should_be_nan] = np.nan
  436. return C
  437. def maybe_unwrap_results(results):
  438. """
  439. Gets raw results back from wrapped results.
  440. Can be used in plotting functions or other post-estimation type
  441. routines.
  442. """
  443. return getattr(results, '_results', results)
  444. class Bunch(dict):
  445. """
  446. Returns a dict-like object with keys accessible via attribute lookup.
  447. Parameters
  448. ----------
  449. *args
  450. Arguments passed to dict constructor, tuples (key, value).
  451. **kwargs
  452. Keyword agument passed to dict constructor, key=value.
  453. """
  454. def __init__(self, *args, **kwargs):
  455. super(Bunch, self).__init__(*args, **kwargs)
  456. self.__dict__ = self
  457. def _ensure_2d(x, ndarray=False):
  458. """
  459. Parameters
  460. ----------
  461. x : ndarray, Series, DataFrame or None
  462. Input to verify dimensions, and to transform as necesary
  463. ndarray : bool
  464. Flag indicating whether to always return a NumPy array. Setting False
  465. will return an pandas DataFrame when the input is a Series or a
  466. DataFrame.
  467. Returns
  468. -------
  469. out : ndarray, DataFrame or None
  470. array or DataFrame with 2 dimensiona. One dimensional arrays are
  471. returned as nobs by 1. None is returned if x is None.
  472. names : list of str or None
  473. list containing variables names when the input is a pandas datatype.
  474. Returns None if the input is an ndarray.
  475. Notes
  476. -----
  477. Accepts None for simplicity
  478. """
  479. if x is None:
  480. return x
  481. is_pandas = _is_using_pandas(x, None)
  482. if x.ndim == 2:
  483. if is_pandas:
  484. return x, x.columns
  485. else:
  486. return x, None
  487. elif x.ndim > 2:
  488. raise ValueError('x mst be 1 or 2-dimensional.')
  489. name = x.name if is_pandas else None
  490. if ndarray:
  491. return np.asarray(x)[:, None], name
  492. else:
  493. return pd.DataFrame(x), name
  494. def matrix_rank(m, tol=None, method="qr"):
  495. """
  496. Matrix rank calculation using QR or SVD
  497. Parameters
  498. ----------
  499. m : array_like
  500. A 2-d array-like object to test
  501. tol : float, optional
  502. The tolerance to use when testing the matrix rank. If not provided
  503. an appropriate value is selected.
  504. method : {"ip", "qr", "svd"}
  505. The method used. "ip" uses the inner-product of a normalized version
  506. of m and then computes the rank using NumPy's matrix_rank.
  507. "qr" uses a QR decomposition and is the default. "svd" defers to
  508. NumPy's matrix_rank.
  509. Returns
  510. -------
  511. int
  512. The rank of m.
  513. Notes
  514. -----
  515. When using a QR factorization, the rank is determined by the number of
  516. elements on the leading diagonal of the R matrix that are above tol
  517. in absolute value.
  518. """
  519. m = array_like(m, "m", ndim=2)
  520. if method == "ip":
  521. m = m[:, np.any(m != 0, axis=0)]
  522. m = m / np.sqrt((m ** 2).sum(0))
  523. m = m.T @ m
  524. return np.linalg.matrix_rank(m, tol=tol, hermitian=True)
  525. elif method == "qr":
  526. r, = scipy.linalg.qr(m, mode="r")
  527. abs_diag = np.abs(np.diag(r))
  528. if tol is None:
  529. tol = abs_diag[0] * m.shape[1] * np.finfo(float).eps
  530. return int((abs_diag > tol).sum())
  531. else:
  532. return np.linalg.matrix_rank(m, tol=tol)