PageRenderTime 44ms CodeModel.GetById 9ms RepoModel.GetById 0ms app.codeStats 0ms

/pymc/glm/glm.py

http://github.com/jseabold/pymc
Python | 206 lines | 194 code | 2 blank | 10 comment | 0 complexity | 23fdd3a22a9caa1c907f4867a2506360 MD5 | raw file
Possible License(s): Apache-2.0
  1. import numpy as np
  2. from ..core import *
  3. from ..distributions import *
  4. from ..tuning.starting import find_MAP
  5. import patsy
  6. import theano
  7. import pandas as pd
  8. from collections import defaultdict
  9. from statsmodels.formula.api import glm as glm_sm
  10. import statsmodels.api as sm
  11. from pandas.tools.plotting import scatter_matrix
  12. from . import links
  13. from . import families
  14. def linear_component(formula, data, priors=None,
  15. intercept_prior=None,
  16. regressor_prior=None,
  17. init=True, init_vals=None, family=None,
  18. model=None):
  19. """Create linear model according to patsy specification.
  20. Parameters
  21. ----------
  22. formula : str
  23. Patsy linear model descriptor.
  24. data : array
  25. Labeled array (e.g. pandas DataFrame, recarray).
  26. priors : dict
  27. Mapping prior name to prior distribution.
  28. E.g. {'Intercept': Normal.dist(mu=0, sd=1)}
  29. intercept_prior : pymc distribution
  30. Prior to use for the intercept.
  31. Default: Normal.dist(mu=0, tau=1.0E-12)
  32. regressor_prior : pymc distribution
  33. Prior to use for all regressor(s).
  34. Default: Normal.dist(mu=0, tau=1.0E-12)
  35. init : bool
  36. Whether to set the starting values via statsmodels
  37. Default: True
  38. init_vals : dict
  39. Set starting values externally: parameter -> value
  40. Default: None
  41. family : statsmodels.family
  42. Link function to pass to statsmodels (init has to be True).
  43. See `statsmodels.api.families`
  44. Default: identity
  45. Output
  46. ------
  47. (y_est, coeffs) : Estimate for y, list of coefficients
  48. Example
  49. -------
  50. # Logistic regression
  51. y_est, coeffs = glm('male ~ height + weight',
  52. htwt_data,
  53. family=glm.families.Binomial(links=glm.link.Logit))
  54. y_data = Bernoulli('y', y_est, observed=data.male)
  55. """
  56. if intercept_prior is None:
  57. intercept_prior = Normal.dist(mu=0, tau=1.0E-12)
  58. if regressor_prior is None:
  59. regressor_prior = Normal.dist(mu=0, tau=1.0E-12)
  60. if priors is None:
  61. priors = defaultdict(None)
  62. # Build patsy design matrix and get regressor names.
  63. _, dmatrix = patsy.dmatrices(formula, data)
  64. reg_names = dmatrix.design_info.column_names
  65. if init_vals is None and init:
  66. init_vals = glm_sm(formula, data, family=family).fit().params
  67. else:
  68. init_vals = defaultdict(lambda: None)
  69. # Create individual coefficients
  70. model = modelcontext(model)
  71. coeffs = []
  72. if reg_names[0] == 'Intercept':
  73. prior = priors.get('Intercept', intercept_prior)
  74. coeff = model.Var(reg_names.pop(0), prior)
  75. coeff.tag.test_value = init_vals['Intercept']
  76. coeffs.append(coeff)
  77. for reg_name in reg_names:
  78. prior = priors.get(reg_name, regressor_prior)
  79. coeff = model.Var(reg_name, prior)
  80. coeff.tag.test_value = init_vals[reg_name]
  81. coeffs.append(coeff)
  82. y_est = theano.dot(np.asarray(dmatrix), theano.tensor.stack(*coeffs)).reshape((1, -1))
  83. return y_est, coeffs
  84. def glm(*args, **kwargs):
  85. """Create GLM after Patsy model specification string.
  86. Parameters
  87. ----------
  88. formula : str
  89. Patsy linear model descriptor.
  90. data : array
  91. Labeled array (e.g. pandas DataFrame, recarray).
  92. priors : dict
  93. Mapping prior name to prior distribution.
  94. E.g. {'Intercept': Normal.dist(mu=0, sd=1)}
  95. intercept_prior : pymc distribution
  96. Prior to use for the intercept.
  97. Default: Normal.dist(mu=0, tau=1.0E-12)
  98. regressor_prior : pymc distribution
  99. Prior to use for all regressor(s).
  100. Default: Normal.dist(mu=0, tau=1.0E-12)
  101. init : bool
  102. Whether initialize test values via statsmodels
  103. Default: True
  104. init_vals : dict
  105. Set starting values externally: parameter -> value
  106. Default: None
  107. find_MAP : bool
  108. Whether to call find_MAP on non-initialized nodes.
  109. family : statsmodels.family
  110. Distribution of likelihood, see pymc.glm.families
  111. (init has to be True).
  112. Output
  113. ------
  114. vars : List of created random variables (y_est, coefficients etc)
  115. Example
  116. -------
  117. # Logistic regression
  118. vars = glm('male ~ height + weight',
  119. data,
  120. family=glm.families.Binomial(link=glm.links.Logit))
  121. """
  122. model = modelcontext(kwargs.get('model'))
  123. family = kwargs.pop('family', families.Normal())
  124. call_find_map = kwargs.pop('find_MAP', True)
  125. formula = args[0]
  126. data = args[1]
  127. y_data = np.asarray(patsy.dmatrices(formula, data)[0]).T
  128. # Create GLM
  129. kwargs['family'] = family.create_statsmodel_family()
  130. y_est, coeffs = linear_component(*args, **kwargs)
  131. family.create_likelihood(y_est, y_data)
  132. # Find vars we have not initialized yet
  133. non_init_vars = set(model.vars).difference(set(coeffs))
  134. if len(non_init_vars) != 0 and call_find_map:
  135. start = find_MAP(vars=non_init_vars)
  136. for var in non_init_vars:
  137. var.tag.test_value = start[var.name]
  138. return [y_est] + coeffs + list(non_init_vars)
  139. def plot_posterior_predictive(trace, eval=None, lm=None, samples=30, **kwargs):
  140. """Plot posterior predictive of a linear model.
  141. :Arguments:
  142. trace : <array>
  143. Array of posterior samples with columns
  144. eval : <array>
  145. Array over which to evaluate lm
  146. lm : function <default: linear function>
  147. Function mapping parameters at different points
  148. to their respective outputs.
  149. input: point, sample
  150. output: estimated value
  151. samples : int <default=30>
  152. How many posterior samples to draw.
  153. Additional keyword arguments are passed to pylab.plot().
  154. """
  155. import matplotlib.pyplot as plt
  156. if lm is None:
  157. lm = lambda x, sample: sample['Intercept'] + sample['x'] * x
  158. if eval is None:
  159. eval = np.linspace(0, 1, 100)
  160. # Set default plotting arguments
  161. if 'lw' not in kwargs and 'linewidth' not in kwargs:
  162. kwargs['lw'] = .2
  163. if 'c' not in kwargs and 'color' not in kwargs:
  164. kwargs['c'] = 'k'
  165. for rand_loc in np.random.randint(0, len(trace), samples):
  166. rand_sample = trace[rand_loc]
  167. plt.plot(eval, lm(eval, rand_sample), **kwargs)
  168. # Make sure to not plot label multiple times
  169. kwargs.pop('label', None)
  170. plt.title('Posterior predictive')