PageRenderTime 53ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/compute_genes_psi_id/pymodules/python2.7/lib/python/statsmodels-0.5.0-py2.7-linux-x86_64.egg/statsmodels/stats/anova.py

https://gitlab.com/pooja043/Globus_Docker_2
Python | 388 lines | 377 code | 3 blank | 8 comment | 7 complexity | 57456a0eefaf23bd2c1b036e95d77589 MD5 | raw file
  1. import numpy as np
  2. from scipy import stats
  3. from pandas import DataFrame, Index
  4. from statsmodels.formula.formulatools import (_remove_intercept_patsy,
  5. _has_intercept, _intercept_idx)
  6. def _get_covariance(model, robust):
  7. if robust is None:
  8. return model.cov_params()
  9. elif robust == "hc0":
  10. se = model.HC0_se
  11. return model.cov_HC0
  12. elif robust == "hc1":
  13. se = model.HC1_se
  14. return model.cov_HC1
  15. elif robust == "hc2":
  16. se = model.HC2_se
  17. return model.cov_HC2
  18. elif robust == "hc3":
  19. se = model.HC3_se
  20. return model.cov_HC3
  21. else: # pragma: no cover
  22. raise ValueError("robust options %s not understood" % robust)
  23. #NOTE: these need to take into account weights !
  24. def anova_single(model, **kwargs):
  25. """
  26. ANOVA table for one fitted linear model.
  27. Parameters
  28. ----------
  29. model : fitted linear model results instance
  30. A fitted linear model
  31. typ : int or str {1,2,3} or {"I","II","III"}
  32. Type of sum of squares to use.
  33. **kwargs**
  34. scale : float
  35. Estimate of variance, If None, will be estimated from the largest
  36. model. Default is None.
  37. test : str {"F", "Chisq", "Cp"} or None
  38. Test statistics to provide. Default is "F".
  39. Notes
  40. -----
  41. Use of this function is discouraged. Use anova_lm instead.
  42. """
  43. test = kwargs.get("test", "F")
  44. scale = kwargs.get("scale", None)
  45. typ = kwargs.get("typ", 1)
  46. robust = kwargs.get("robust", None)
  47. if robust:
  48. robust = robust.lower()
  49. endog = model.model.endog
  50. exog = model.model.exog
  51. nobs = exog.shape[0]
  52. response_name = model.model.endog_names
  53. design_info = model.model.data.orig_exog.design_info
  54. exog_names = model.model.exog_names
  55. # +1 for resids
  56. n_rows = (len(design_info.terms) - _has_intercept(design_info) + 1)
  57. pr_test = "PR(>%s)" % test
  58. names = ['df', 'sum_sq', 'mean_sq', test, pr_test]
  59. table = DataFrame(np.zeros((n_rows, 5)), columns = names)
  60. if typ in [1,"I"]:
  61. return anova1_lm_single(model, endog, exog, nobs, design_info, table,
  62. n_rows, test, pr_test, robust)
  63. elif typ in [2, "II"]:
  64. return anova2_lm_single(model, design_info, n_rows, test, pr_test,
  65. robust)
  66. elif typ in [3, "III"]:
  67. return anova3_lm_single(model, design_info, n_rows, test, pr_test,
  68. robust)
  69. elif typ in [4, "IV"]:
  70. raise NotImplemented("Type IV not yet implemented")
  71. else: # pragma: no cover
  72. raise ValueError("Type %s not understood" % str(typ))
  73. def anova1_lm_single(model, endog, exog, nobs, design_info, table, n_rows, test,
  74. pr_test, robust):
  75. """
  76. ANOVA table for one fitted linear model.
  77. Parameters
  78. ----------
  79. model : fitted linear model results instance
  80. A fitted linear model
  81. **kwargs**
  82. scale : float
  83. Estimate of variance, If None, will be estimated from the largest
  84. model. Default is None.
  85. test : str {"F", "Chisq", "Cp"} or None
  86. Test statistics to provide. Default is "F".
  87. Notes
  88. -----
  89. Use of this function is discouraged. Use anova_lm instead.
  90. """
  91. #maybe we should rethink using pinv > qr in OLS/linear models?
  92. effects = getattr(model, 'effects', None)
  93. if effects is None:
  94. q,r = np.linalg.qr(exog)
  95. effects = np.dot(q.T, endog)
  96. arr = np.zeros((len(design_info.terms), len(design_info.column_names)))
  97. slices = [design_info.slice(name) for name in design_info.term_names]
  98. for i,slice_ in enumerate(slices):
  99. arr[i, slice_] = 1
  100. sum_sq = np.dot(arr, effects**2)
  101. #NOTE: assumes intercept is first column
  102. idx = _intercept_idx(design_info)
  103. sum_sq = sum_sq[~idx]
  104. term_names = np.array(design_info.term_names) # want boolean indexing
  105. term_names = term_names[~idx]
  106. index = term_names.tolist()
  107. table.index = Index(index + ['Residual'])
  108. table.ix[index, ['df', 'sum_sq']] = np.c_[arr[~idx].sum(1), sum_sq]
  109. if test == 'F':
  110. table[:n_rows][test] = ((table['sum_sq']/table['df'])/
  111. (model.ssr/model.df_resid))
  112. table[:n_rows][pr_test] = stats.f.sf(table["F"], table["df"],
  113. model.df_resid)
  114. # fill in residual
  115. table.ix[['Residual'], ['sum_sq','df', test, pr_test]] = (model.ssr,
  116. model.df_resid,
  117. np.nan, np.nan)
  118. table['mean_sq'] = table['sum_sq'] / table['df']
  119. return table
  120. #NOTE: the below is not agnostic about formula...
  121. def anova2_lm_single(model, design_info, n_rows, test, pr_test, robust):
  122. """
  123. ANOVA type II table for one fitted linear model.
  124. Parameters
  125. ----------
  126. model : fitted linear model results instance
  127. A fitted linear model
  128. **kwargs**
  129. scale : float
  130. Estimate of variance, If None, will be estimated from the largest
  131. model. Default is None.
  132. test : str {"F", "Chisq", "Cp"} or None
  133. Test statistics to provide. Default is "F".
  134. Notes
  135. -----
  136. Use of this function is discouraged. Use anova_lm instead.
  137. Type II
  138. Sum of Squares compares marginal contribution of terms. Thus, it is
  139. not particularly useful for models with significant interaction terms.
  140. """
  141. terms_info = design_info.terms[:] # copy
  142. terms_info = _remove_intercept_patsy(terms_info)
  143. names = ['sum_sq', 'df', test, pr_test]
  144. table = DataFrame(np.zeros((n_rows, 4)), columns = names)
  145. cov = _get_covariance(model, None)
  146. robust_cov = _get_covariance(model, robust)
  147. col_order = []
  148. index = []
  149. for i, term in enumerate(terms_info):
  150. # grab all varaibles except interaction effects that contain term
  151. # need two hypotheses matrices L1 is most restrictive, ie., term==0
  152. # L2 is everything except term==0
  153. cols = design_info.slice(term)
  154. L1 = range(cols.start, cols.stop)
  155. L2 = []
  156. term_set = set(term.factors)
  157. for t in terms_info: # for the term you have
  158. other_set = set(t.factors)
  159. if term_set.issubset(other_set) and not term_set == other_set:
  160. col = design_info.slice(t)
  161. # on a higher order term containing current `term`
  162. L1.extend(range(col.start, col.stop))
  163. L2.extend(range(col.start, col.stop))
  164. L1 = np.eye(model.model.exog.shape[1])[L1]
  165. L2 = np.eye(model.model.exog.shape[1])[L2]
  166. if L2.size:
  167. LVL = np.dot(np.dot(L1,robust_cov),L2.T)
  168. from scipy import linalg
  169. orth_compl,_ = linalg.qr(LVL)
  170. r = L1.shape[0] - L2.shape[0]
  171. # L1|2
  172. # use the non-unique orthogonal completion since L12 is rank r
  173. L12 = np.dot(orth_compl[:,-r:].T, L1)
  174. else:
  175. L12 = L1
  176. r = L1.shape[0]
  177. #from IPython.core.debugger import Pdb; Pdb().set_trace()
  178. if test == 'F':
  179. f = model.f_test(L12, cov_p=robust_cov)
  180. table.ix[i][test] = test_value = f.fvalue
  181. table.ix[i][pr_test] = f.pvalue
  182. # need to back out SSR from f_test
  183. table.ix[i]['df'] = r
  184. col_order.append(cols.start)
  185. index.append(term.name())
  186. table.index = Index(index + ['Residual'])
  187. table = table.ix[np.argsort(col_order + [model.model.exog.shape[1]+1])]
  188. # back out sum of squares from f_test
  189. ssr = table[test] * table['df'] * model.ssr/model.df_resid
  190. table['sum_sq'] = ssr
  191. # fill in residual
  192. table.ix['Residual'][['sum_sq','df', test, pr_test]] = (model.ssr,
  193. model.df_resid,
  194. np.nan, np.nan)
  195. return table
  196. def anova3_lm_single(model, design_info, n_rows, test, pr_test, robust):
  197. n_rows += _has_intercept(design_info)
  198. terms_info = design_info.terms
  199. names = ['sum_sq', 'df', test, pr_test]
  200. table = DataFrame(np.zeros((n_rows, 4)), columns = names)
  201. cov = _get_covariance(model, robust)
  202. col_order = []
  203. index = []
  204. for i, term in enumerate(terms_info):
  205. # grab term, hypothesis is that term == 0
  206. cols = design_info.slice(term)
  207. L1 = np.eye(model.model.exog.shape[1])[cols]
  208. L12 = L1
  209. r = L1.shape[0]
  210. if test == 'F':
  211. f = model.f_test(L12, cov_p=cov)
  212. table.ix[i][test] = test_value = f.fvalue
  213. table.ix[i][pr_test] = f.pvalue
  214. # need to back out SSR from f_test
  215. table.ix[i]['df'] = r
  216. #col_order.append(cols.start)
  217. index.append(term.name())
  218. table.index = Index(index + ['Residual'])
  219. #NOTE: Don't need to sort because terms are an ordered dict now
  220. #table = table.ix[np.argsort(col_order + [model.model.exog.shape[1]+1])]
  221. # back out sum of squares from f_test
  222. ssr = table[test] * table['df'] * model.ssr/model.df_resid
  223. table['sum_sq'] = ssr
  224. # fill in residual
  225. table.ix['Residual'][['sum_sq','df', test, pr_test]] = (model.ssr,
  226. model.df_resid,
  227. np.nan, np.nan)
  228. return table
  229. def anova_lm(*args, **kwargs):
  230. """
  231. ANOVA table for one or more fitted linear models.
  232. Parameters
  233. ----------
  234. args : fitted linear model results instance
  235. One or more fitted linear models
  236. scale : float
  237. Estimate of variance, If None, will be estimated from the largest
  238. model. Default is None.
  239. test : str {"F", "Chisq", "Cp"} or None
  240. Test statistics to provide. Default is "F".
  241. typ : str or int {"I","II","III"} or {1,2,3}
  242. The type of ANOVA test to perform. See notes.
  243. robust : {None, "hc0", "hc1", "hc2", "hc3"}
  244. Use heteroscedasticity-corrected coefficient covariance matrix.
  245. If robust covariance is desired, it is recommended to use `hc3`.
  246. Returns
  247. -------
  248. anova : DataFrame
  249. A DataFrame containing.
  250. Notes
  251. -----
  252. Model statistics are given in the order of args. Models must have
  253. been fit using the formula api.
  254. See Also
  255. --------
  256. model_results.compare_f_test, model_results.compare_lm_test
  257. Examples
  258. --------
  259. >>> import statsmodels.api as sm
  260. >>> from statsmodels.formula.api import ols
  261. >>> moore = sm.datasets.get_rdataset("Moore", "car",
  262. ... cache=True) # load data
  263. >>> data = moore.data
  264. >>> data = data.rename(columns={"partner.status" :
  265. ... "partner_status"}) # make name pythonic
  266. >>> moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
  267. ... data=data).fit()
  268. >>> table = sm.stats.anova_lm(moore_lm, typ=2) # Type 2 ANOVA DataFrame
  269. >>> print table
  270. """
  271. typ = kwargs.get('typ', 1)
  272. ### Farm Out Single model ANOVA Type I, II, III, and IV ###
  273. if len(args) == 1:
  274. model = args[0]
  275. return anova_single(model, **kwargs)
  276. try:
  277. assert typ in [1,"I"]
  278. except:
  279. raise ValueError("Multiple models only supported for type I. "
  280. "Got type %s" % str(typ))
  281. ### COMPUTE ANOVA TYPE I ###
  282. # if given a single model
  283. if len(args) == 1:
  284. return anova_single(*args, **kwargs)
  285. # received multiple fitted models
  286. test = kwargs.get("test", "F")
  287. scale = kwargs.get("scale", None)
  288. n_models = len(args)
  289. model_formula = []
  290. pr_test = "Pr(>%s)" % test
  291. names = ['df_resid', 'ssr', 'df_diff', 'ss_diff', test, pr_test]
  292. table = DataFrame(np.zeros((n_models, 6)), columns = names)
  293. if not scale: # assume biggest model is last
  294. scale = args[-1].scale
  295. table["ssr"] = map(getattr, args, ["ssr"]*n_models)
  296. table["df_resid"] = map(getattr, args, ["df_resid"]*n_models)
  297. table.ix[1:]["df_diff"] = np.diff(map(getattr, args, ["df_model"]*n_models))
  298. table["ss_diff"] = -table["ssr"].diff()
  299. if test == "F":
  300. table["F"] = table["ss_diff"] / table["df_diff"] / scale
  301. table[pr_test] = stats.f.sf(table["F"], table["df_diff"],
  302. table["df_resid"])
  303. # for earlier scipy - stats.f.sf(np.nan, 10, 2) -> 0 not nan
  304. table[pr_test][table['F'].isnull()] = np.nan
  305. return table
  306. if __name__ == "__main__":
  307. import pandas
  308. from statsmodels.formula.api import ols
  309. # in R
  310. #library(car)
  311. #write.csv(Moore, "moore.csv", row.names=FALSE)
  312. moore = pandas.read_table('moore.csv', delimiter=",", skiprows=1,
  313. names=['partner_status','conformity',
  314. 'fcategory','fscore'])
  315. moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
  316. data=moore).fit()
  317. mooreB = ols('conformity ~ C(partner_status, Sum)', data=moore).fit()
  318. # for each term you just want to test vs the model without its
  319. # higher-order terms
  320. # using Monette-Fox slides and Marden class notes for linear algebra /
  321. # orthogonal complement
  322. # https://netfiles.uiuc.edu/jimarden/www/Classes/STAT324/
  323. table = anova_lm(moore_lm, typ=2)