/statsmodels/tools/data.py

http://github.com/statsmodels/statsmodels · Python · 118 lines · 65 code · 30 blank · 23 comment · 19 complexity · bf0bf9c8197460bc9c9d1ca53d93e9d8 MD5 · raw file

  1. """
  2. Compatibility tools for various data structure inputs
  3. """
  4. import numpy as np
  5. import pandas as pd
  6. def _check_period_index(x, freq="M"):
  7. from pandas import PeriodIndex, DatetimeIndex
  8. if not isinstance(x.index, (DatetimeIndex, PeriodIndex)):
  9. raise ValueError("The index must be a DatetimeIndex or PeriodIndex")
  10. if x.index.freq is not None:
  11. inferred_freq = x.index.freqstr
  12. else:
  13. inferred_freq = pd.infer_freq(x.index)
  14. if not inferred_freq.startswith(freq):
  15. raise ValueError("Expected frequency {}. Got {}".format(inferred_freq,
  16. freq))
  17. def is_data_frame(obj):
  18. return isinstance(obj, pd.DataFrame)
  19. def is_design_matrix(obj):
  20. from patsy import DesignMatrix
  21. return isinstance(obj, DesignMatrix)
  22. def _is_structured_ndarray(obj):
  23. return isinstance(obj, np.ndarray) and obj.dtype.names is not None
  24. def interpret_data(data, colnames=None, rownames=None):
  25. """
  26. Convert passed data structure to form required by estimation classes
  27. Parameters
  28. ----------
  29. data : array_like
  30. colnames : sequence or None
  31. May be part of data structure
  32. rownames : sequence or None
  33. Returns
  34. -------
  35. (values, colnames, rownames) : (homogeneous ndarray, list)
  36. """
  37. if isinstance(data, np.ndarray):
  38. values = np.asarray(data)
  39. if colnames is None:
  40. colnames = ['Y_%d' % i for i in range(values.shape[1])]
  41. elif is_data_frame(data):
  42. # XXX: hack
  43. data = data.dropna()
  44. values = data.values
  45. colnames = data.columns
  46. rownames = data.index
  47. else: # pragma: no cover
  48. raise TypeError('Cannot handle input type {typ}'
  49. .format(typ=type(data).__name__))
  50. if not isinstance(colnames, list):
  51. colnames = list(colnames)
  52. # sanity check
  53. if len(colnames) != values.shape[1]:
  54. raise ValueError('length of colnames does not match number '
  55. 'of columns in data')
  56. if rownames is not None and len(rownames) != len(values):
  57. raise ValueError('length of rownames does not match number '
  58. 'of rows in data')
  59. return values, colnames, rownames
  60. def struct_to_ndarray(arr):
  61. return arr.view((float, (len(arr.dtype.names),)), type=np.ndarray)
  62. def _is_using_ndarray_type(endog, exog):
  63. return (type(endog) is np.ndarray and
  64. (type(exog) is np.ndarray or exog is None))
  65. def _is_using_ndarray(endog, exog):
  66. return (isinstance(endog, np.ndarray) and
  67. (isinstance(exog, np.ndarray) or exog is None))
  68. def _is_using_pandas(endog, exog):
  69. from statsmodels.compat.pandas import data_klasses as klasses
  70. return (isinstance(endog, klasses) or isinstance(exog, klasses))
  71. def _is_array_like(endog, exog):
  72. try: # do it like this in case of mixed types, ie., ndarray and list
  73. endog = np.asarray(endog)
  74. exog = np.asarray(exog)
  75. return True
  76. except:
  77. return False
  78. def _is_using_patsy(endog, exog):
  79. # we get this when a structured array is passed through a formula
  80. return (is_design_matrix(endog) and
  81. (is_design_matrix(exog) or exog is None))
  82. def _is_recarray(data):
  83. """
  84. Returns true if data is a recarray
  85. """
  86. return isinstance(data, np.core.recarray)