PageRenderTime 54ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/statsmodels/iolib/summary2.py

https://github.com/danielballan/statsmodels
Python | 541 lines | 447 code | 23 blank | 71 comment | 14 complexity | 9024e54d764e0a7e981a5df870ed754a MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import numpy as np
  2. import pandas as pd
  3. import datetime
  4. import copy
  5. #import collections # OrderedDict requires python >= 2.7
  6. from statsmodels.compatnp.collections import OrderedDict
  7. import StringIO
  8. import textwrap
  9. from table import SimpleTable
  10. from tableformatting import fmt_latex, fmt_txt
  11. class Summary(object):
  12. def __init__(self):
  13. self.tables = []
  14. self.settings = []
  15. self.extra_txt = []
  16. self.title = None
  17. def __str__(self):
  18. return self.as_text()
  19. def __repr__(self):
  20. return str(type(self)) + '\n"""\n' + self.__str__() + '\n"""'
  21. def _repr_html_(self):
  22. '''Display as HTML in IPython notebook.'''
  23. return self.as_html()
  24. def add_df(self, df, index=True, header=True, float_format='%.4f',
  25. align='r'):
  26. '''Add the contents of a DataFrame to summary table
  27. Parameters
  28. ----------
  29. df : DataFrame
  30. header: bool
  31. Reproduce the DataFrame column labels in summary table
  32. index: bool
  33. Reproduce the DataFrame row labels in summary table
  34. float_format: string
  35. Formatting to float data columns
  36. align : string
  37. Data alignment (l/c/r)
  38. '''
  39. settings = {'index':index, 'header':header,
  40. 'float_format':float_format, 'align':align}
  41. self.tables.append(df)
  42. self.settings.append(settings)
  43. def add_array(self, array, align='r', float_format="%.4f"):
  44. '''Add the contents of a Numpy array to summary table
  45. Parameters
  46. ----------
  47. array : numpy array (2D)
  48. float_format: string
  49. Formatting to array if type is float
  50. align : string
  51. Data alignment (l/c/r)
  52. '''
  53. table = pd.DataFrame(array)
  54. self.add_df(table, index=False, header=False,
  55. float_format=float_format, align=align)
  56. def add_dict(self, d, ncols=2, align='l', float_format="%.4f"):
  57. '''Add the contents of a Dict to summary table
  58. Parameters
  59. ----------
  60. d : dict
  61. Keys and values are automatically coerced to strings with str().
  62. Users are encouraged to format them before using add_dict.
  63. ncols: int
  64. Number of columns of the output table
  65. align : string
  66. Data alignment (l/c/r)
  67. '''
  68. keys = [_formatter(x, float_format) for x in d.keys()]
  69. vals = [_formatter(x, float_format) for x in d.values()]
  70. data = np.array(zip(keys, vals))
  71. if data.shape[0] % ncols != 0:
  72. pad = ncols - (data.shape[0] % ncols)
  73. data = np.vstack([data, np.array(pad * [['','']])])
  74. data = np.split(data, ncols)
  75. data = reduce(lambda x,y: np.hstack([x,y]), data)
  76. self.add_array(data, align=align)
  77. def add_text(self, string):
  78. '''Append a note to the bottom of the summary table. In ASCII tables,
  79. the note will be wrapped to table width. Notes are not indendented.
  80. '''
  81. self.extra_txt.append(string)
  82. def add_title(self, title=None, results=None):
  83. '''Insert a title on top of the summary table. If a string is provided
  84. in the title argument, that string is printed. If no title string is
  85. provided but a results instance is provided, statsmodels attempts
  86. to construct a useful title automatically.
  87. '''
  88. if type(title) == str:
  89. self.title = title
  90. else:
  91. try:
  92. model = results.model.__class__.__name__
  93. if model in _model_types:
  94. model = _model_types[model]
  95. self.title = 'Results: ' + model
  96. except:
  97. self.title = ''
  98. def add_base(self, results, alpha=0.05, float_format="%.4f", title=None,
  99. xname=None, yname=None):
  100. '''Try to construct a basic summary instance.
  101. Parameters
  102. ----------
  103. results : Model results instance
  104. alpha : float
  105. significance level for the confidence intervals (optional)
  106. float_formatting: string
  107. Float formatting for summary of parameters (optional)
  108. title : string
  109. Title of the summary table (optional)
  110. xname : List of strings of length equal to the number of parameters
  111. Names of the independent variables (optional)
  112. yname : string
  113. Name of the dependent variable (optional)
  114. '''
  115. param = summary_params(results, alpha=alpha)
  116. info = summary_model(results)
  117. if xname != None:
  118. param.index = xname
  119. if yname != None:
  120. info['Dependent Variable:'] = yname
  121. self.add_dict(info, align='l')
  122. self.add_df(param, float_format=float_format)
  123. self.add_title(title=title, results=results)
  124. def as_text(self):
  125. '''Generate ASCII Summary Table
  126. '''
  127. tables = self.tables
  128. settings = self.settings
  129. title = self.title
  130. extra_txt = self.extra_txt
  131. pad_col, pad_index, widest = _measure_tables(tables, settings)
  132. rule_equal = widest * '='
  133. rule_dash = widest * '-'
  134. simple_tables = _simple_tables(tables, settings, pad_col, pad_index)
  135. tab = [x.as_text() for x in simple_tables]
  136. tab = '\n'.join(tab)
  137. tab = tab.split('\n')
  138. tab[0] = rule_equal
  139. tab.append(rule_equal)
  140. tab = '\n'.join(tab)
  141. if title != None:
  142. title = title
  143. if len(title) < widest:
  144. title = ' ' * int(widest/2 - len(title)/2) + title
  145. else:
  146. title = ''
  147. txt = [textwrap.wrap(x, widest) for x in extra_txt]
  148. txt = ['\n'.join(x) for x in txt]
  149. txt = '\n'.join(txt)
  150. out = '\n'.join([title, tab, txt])
  151. return out
  152. def as_html(self):
  153. '''Generate HTML Summary Table
  154. '''
  155. tables = self.tables
  156. settings = self.settings
  157. title = self.title
  158. simple_tables = _simple_tables(tables, settings)
  159. tab = [x.as_html() for x in simple_tables]
  160. tab = '\n'.join(tab)
  161. return tab
  162. def as_latex(self):
  163. '''Generate LaTeX Summary Table
  164. '''
  165. tables = self.tables
  166. settings = self.settings
  167. title = self.title
  168. if title != None:
  169. title = '\\caption{' + title + '} \\\\'
  170. else:
  171. title = '\\caption{}'
  172. simple_tables = _simple_tables(tables, settings)
  173. tab = [x.as_latex_tabular() for x in simple_tables]
  174. tab = '\n\\hline\n'.join(tab)
  175. out = '\\begin{table}', title, tab, '\\end{table}'
  176. out = '\n'.join(out)
  177. return out
  178. def _measure_tables(tables, settings):
  179. '''Compare width of ascii tables in a list and calculate padding values.
  180. We add space to each col_sep to get us as close as possible to the
  181. width of the largest table. Then, we add a few spaces to the first
  182. column to pad the rest.
  183. '''
  184. simple_tables = _simple_tables(tables, settings)
  185. tab = [x.as_text() for x in simple_tables]
  186. length = [len(x.splitlines()[0]) for x in tab]
  187. len_max = max(length)
  188. pad_sep = []
  189. pad_index = []
  190. for i in range(len(tab)):
  191. nsep = tables[i].shape[1] - 1
  192. pad = int((len_max - length[i]) / nsep)
  193. pad_sep.append(pad)
  194. len_new = length[i] + nsep * pad
  195. pad_index.append(len_max - len_new)
  196. return pad_sep, pad_index, max(length)
  197. # Useful stuff
  198. _model_types = {'OLS' : 'Ordinary least squares',
  199. 'GLS' : 'Generalized least squares',
  200. 'GLSAR' : 'Generalized least squares with AR(p)',
  201. 'WLS' : 'Weigthed least squares',
  202. 'RLM' : 'Robust linear model',
  203. 'NBin': 'Negative binomial model',
  204. 'GLM' : 'Generalized linear model'
  205. }
  206. def summary_model(results):
  207. '''Create a dict with information about the model
  208. '''
  209. def time_now(**kwrds):
  210. now = datetime.datetime.now()
  211. return now.strftime('%Y-%m-%d %H:%M')
  212. info = OrderedDict()
  213. info['Model:'] = lambda x: x.model.__class__.__name__
  214. info['Model Family:'] = lambda x: x.family.__class.__name__
  215. info['Link Function:'] = lambda x: x.family.link.__class__.__name__
  216. info['Dependent Variable:'] = lambda x: x.model.endog_names
  217. info['Date:'] = time_now()
  218. info['No. Observations:'] = lambda x: "%#6d" % x.nobs
  219. info['Df Model:'] = lambda x: "%#6d" % x.df_model
  220. info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid
  221. info['Converged:'] = lambda x: x.mle_retvals['converged']
  222. info['No. Iterations:'] = lambda x: x.mle_retvals['iterations']
  223. info['Method:'] = lambda x: x.method
  224. info['Norm:'] = lambda x: x.fit_options['norm']
  225. info['Scale Est.:'] = lambda x: x.fit_options['scale_est']
  226. info['Cov. Type:'] = lambda x: x.fit_options['cov']
  227. info['R-squared:'] = lambda x: "%#8.3f" % x.rsquared
  228. info['Adj. R-squared:'] = lambda x: "%#8.3f" % x.rsquared_adj
  229. info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared
  230. info['AIC:'] = lambda x: "%8.4f" % x.aic
  231. info['BIC:'] = lambda x: "%8.4f" % x.bic
  232. info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf
  233. info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull
  234. info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue
  235. info['Deviance:'] = lambda x: "%#8.5g" % x.deviance
  236. info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2
  237. info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue
  238. info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue
  239. info['Scale:'] = lambda x: "%#8.5g" % x.scale
  240. out = OrderedDict()
  241. for key in info.keys():
  242. try:
  243. out[key] = info[key](results)
  244. except:
  245. pass
  246. return out
  247. def summary_params(results, yname=None, xname=None, alpha=.05, use_t=True,
  248. skip_header=False, float_format="%.4f"):
  249. '''create a summary table of parameters from results instance
  250. Parameters
  251. ----------
  252. res : results instance
  253. some required information is directly taken from the result
  254. instance
  255. yname : string or None
  256. optional name for the endogenous variable, default is "y"
  257. xname : list of strings or None
  258. optional names for the exogenous variables, default is "var_xx"
  259. alpha : float
  260. significance level for the confidence intervals
  261. use_t : bool
  262. indicator whether the p-values are based on the Student-t
  263. distribution (if True) or on the normal distribution (if False)
  264. skip_headers : bool
  265. If false (default), then the header row is added. If true, then no
  266. header row is added.
  267. float_format : string
  268. float formatting options (e.g. ".3g")
  269. Returns
  270. -------
  271. params_table : SimpleTable instance
  272. '''
  273. if isinstance(results, tuple):
  274. results, params, std_err, tvalues, pvalues, conf_int = results
  275. else:
  276. params = results.params
  277. bse = results.bse
  278. tvalues = results.tvalues
  279. pvalues = results.pvalues
  280. conf_int = results.conf_int(alpha)
  281. data = np.array([params, bse, tvalues, pvalues]).T
  282. data = np.hstack([data, conf_int])
  283. data = pd.DataFrame(data)
  284. if use_t:
  285. data.columns = ['Coef.', 'Std.Err.', 't', 'P>|t|',
  286. '[' + str(alpha/2), str(1-alpha/2) + ']']
  287. else:
  288. data.columns = ['Coef.', 'Std.Err.', 'z', 'P>|z|',
  289. '[' + str(alpha/2), str(1-alpha/2) + ']']
  290. if not xname:
  291. data.index = results.model.exog_names
  292. else:
  293. data.index = xname
  294. return data
  295. # Vertical summary instance for multiple models
  296. def _col_params(result, float_format='%.4f', stars=True):
  297. '''Stack coefficients and standard errors in single column
  298. '''
  299. # Extract parameters
  300. res = summary_params(result)
  301. # Format float
  302. for col in res.columns[:2]:
  303. res[col] = res[col].apply(lambda x: float_format % x)
  304. # Std.Errors in parentheses
  305. res.ix[:,1] = '(' + res.ix[:,1] + ')'
  306. # Significance stars
  307. if stars:
  308. idx = res.ix[:,3] < .1
  309. res.ix[:,0][idx] = res.ix[:,0][idx] + '*'
  310. idx = res.ix[:,3] < .05
  311. res.ix[:,0][idx] = res.ix[:,0][idx] + '*'
  312. idx = res.ix[:,3] < .01
  313. res.ix[:,0][idx] = res.ix[:,0][idx] + '*'
  314. # Stack Coefs and Std.Errors
  315. res = res.ix[:,:2]
  316. res = res.stack()
  317. res = pd.DataFrame(res)
  318. res.columns = [str(result.model.endog_names)]
  319. return res
  320. def _col_info(result, info_dict=None):
  321. '''Stack model info in a column
  322. '''
  323. if info_dict == None:
  324. info_dict = {}
  325. out = []
  326. index = []
  327. for i in info_dict:
  328. if isinstance(info_dict[i], dict):
  329. # this is a specific model info_dict, but not for this result...
  330. continue
  331. try:
  332. out.append(info_dict[i](result))
  333. except:
  334. out.append('')
  335. index.append(i)
  336. out = pd.DataFrame({str(result.model.endog_names):out}, index=index)
  337. return out
  338. def _make_unique(list_of_names):
  339. if len(set(list_of_names)) == len(list_of_names):
  340. return list_of_names
  341. # pandas does not like it if multiple columns have the same names
  342. from collections import defaultdict
  343. name_counter = defaultdict(str)
  344. header = []
  345. for _name in list_of_names:
  346. name_counter[_name] += "I"
  347. header.append(_name+" " +name_counter[_name])
  348. return header
  349. def summary_col(results, float_format='%.4f', model_names=[], stars=False,
  350. info_dict=None, regressor_order=[]):
  351. '''Summarize multiple results instances side-by-side (coefs and SEs)
  352. Parameters
  353. ----------
  354. results : statsmodels results instance or list of result instances
  355. float_format : string
  356. float format for coefficients and standard errors
  357. Default : '%.4f'
  358. model_names : list of strings of length len(results) if the names are not
  359. unique, a roman number will be appended to all model names
  360. stars : bool
  361. print significance stars
  362. info_dict : dict
  363. dict of lambda functions to be applied to results instances to retrieve
  364. model info. To use specific information for different models, add a (nested)
  365. info_dict with model name as the key.
  366. Example: `info_dict = {"N":..., "R2": ..., "OLS":{"R2":...}}` would only show
  367. `R2` for OLS regression models, but additionally `N` for all other results.
  368. Default : None (use the info_dict specified in result.default_model_infos, if
  369. this property exists)
  370. regressor_order : list of strings
  371. list of names of the regressors in the desired order. All regressors
  372. not specified will be appended to the end of the list.
  373. '''
  374. if type(results) != list:
  375. results = [results]
  376. cols = [_col_params(x, stars=stars, float_format=float_format) for x in results]
  377. # Unique column names (pandas has problems merging otherwise)
  378. if model_names:
  379. colnames = _make_unique(model_names)
  380. else:
  381. colnames = _make_unique([x.columns[0] for x in cols])
  382. for i in range(len(cols)):
  383. cols[i].columns = [colnames[i]]
  384. merg = lambda x,y: x.merge(y, how='outer', right_index=True, left_index=True)
  385. summ = reduce(merg, cols)
  386. if regressor_order:
  387. varnames = summ.index.get_level_values(0).tolist()
  388. ordered = [x for x in regressor_order if x in varnames]
  389. unordered = [x for x in varnames if x not in regressor_order + ['']]
  390. order = ordered + list(np.unique(unordered))
  391. f = lambda idx: sum([[x + 'coef', x + 'stde'] for x in idx], [])
  392. summ.index = f(np.unique(varnames))
  393. summ = summ.reindex(f(order))
  394. summ.index = [x[:-4] for x in summ.index]
  395. idx = pd.Series(range(summ.shape[0])) %2 == 1
  396. summ.index = np.where(idx, '', summ.index.get_level_values(0))
  397. # add infos about the models.
  398. if info_dict:
  399. cols = [_col_info(x, info_dict.get(x.model.__class__.__name__, info_dict)) for x in results]
  400. else:
  401. cols = [_col_info(x, getattr(x, "default_model_infos", None)) for x in results]
  402. # use unique column names, otherwise the merge will not succeed
  403. for df , name in zip(cols, _make_unique([df.columns[0] for df in cols])):
  404. df.columns = [name]
  405. merg = lambda x,y: x.merge(y, how='outer', right_index=True, left_index=True)
  406. info = reduce(merg, cols)
  407. dat = pd.DataFrame(np.vstack([summ,info])) # pd.concat better, but error
  408. dat.columns = summ.columns
  409. dat.index = pd.Index(summ.index.tolist() + info.index.tolist())
  410. summ = dat
  411. summ = summ.fillna('')
  412. smry = Summary()
  413. smry.add_df(summ, header=True, align='l')
  414. smry.add_text('Standard errors in parentheses.')
  415. if stars:
  416. smry.add_text('* p<.1, ** p<.05, ***p<.01')
  417. return smry
  418. def _formatter(element, float_format='%.4f'):
  419. try:
  420. out = float_format % element
  421. except:
  422. out = str(element)
  423. return out.strip()
  424. def _df_to_simpletable(df, align='r', float_format="%.4f", header=True, index=True,
  425. table_dec_above='-', table_dec_below=None, header_dec_below='-',
  426. pad_col=0, pad_index=0):
  427. dat = df.copy()
  428. dat = dat.applymap(lambda x: _formatter(x, float_format))
  429. if header:
  430. headers = [str(x) for x in dat.columns.tolist()]
  431. else:
  432. headers = None
  433. if index:
  434. stubs = [str(x) + int(pad_index) * ' ' for x in dat.index.tolist()]
  435. else:
  436. dat.ix[:,0] = [str(x) + int(pad_index) * ' ' for x in dat.ix[:,0]]
  437. stubs = None
  438. st = SimpleTable(np.array(dat), headers=headers, stubs=stubs,
  439. ltx_fmt=fmt_latex, txt_fmt=fmt_txt)
  440. st.output_formats['latex']['data_aligns'] = align
  441. st.output_formats['txt']['data_aligns'] = align
  442. st.output_formats['txt']['table_dec_above'] = table_dec_above
  443. st.output_formats['txt']['table_dec_below'] = table_dec_below
  444. st.output_formats['txt']['header_dec_below'] = header_dec_below
  445. st.output_formats['txt']['colsep'] = ' ' * int(pad_col + 1)
  446. return st
  447. def _simple_tables(tables, settings, pad_col=None, pad_index=None):
  448. simple_tables = []
  449. float_format = '%.4f'
  450. if pad_col == None:
  451. pad_col = [0] * len(tables)
  452. if pad_index == None:
  453. pad_index = [0] * len(tables)
  454. for i,v in enumerate(tables):
  455. index = settings[i]['index']
  456. header = settings[i]['header']
  457. align = settings[i]['align']
  458. simple_tables.append(_df_to_simpletable(v, align=align,
  459. float_format=float_format, header=header, index=index,
  460. pad_col=pad_col[i], pad_index=pad_index[i]))
  461. return simple_tables