glm.py | searchcode

/pymc/glm/glm.py

http://github.com/jseabold/pymc · Python · 206 lines · 194 code · 2 blank · 10 comment · 0 complexity · 23fdd3a22a9caa1c907f4867a2506360 MD5 · raw file

import numpy as np
from ..core import *
from ..distributions import *
from ..tuning.starting import find_MAP
import patsy
import theano
import pandas as pd
from collections import defaultdict
from statsmodels.formula.api import glm as glm_sm
import statsmodels.api as sm
from pandas.tools.plotting import scatter_matrix

from . import links
from . import families

def linear_component(formula, data, priors=None,
                     intercept_prior=None,
                     regressor_prior=None,
                     init=True, init_vals=None, family=None,
                     model=None):
    """Create linear model according to patsy specification.

    Parameters
    ----------
    formula : str
        Patsy linear model descriptor.
    data : array
        Labeled array (e.g. pandas DataFrame, recarray).
    priors : dict
        Mapping prior name to prior distribution.
        E.g. {'Intercept': Normal.dist(mu=0, sd=1)}
    intercept_prior : pymc distribution
        Prior to use for the intercept.
        Default: Normal.dist(mu=0, tau=1.0E-12)
    regressor_prior : pymc distribution
        Prior to use for all regressor(s).
        Default: Normal.dist(mu=0, tau=1.0E-12)
    init : bool
        Whether to set the starting values via statsmodels
        Default: True
    init_vals : dict
        Set starting values externally: parameter -> value
        Default: None
    family : statsmodels.family
        Link function to pass to statsmodels (init has to be True).
    See `statsmodels.api.families`
        Default: identity

    Output
    ------
    (y_est, coeffs) : Estimate for y, list of coefficients

    Example
    -------
    # Logistic regression
    y_est, coeffs = glm('male ~ height + weight',
                        htwt_data,
                        family=glm.families.Binomial(links=glm.link.Logit))
    y_data = Bernoulli('y', y_est, observed=data.male)
    """
    if intercept_prior is None:
        intercept_prior = Normal.dist(mu=0, tau=1.0E-12)
    if regressor_prior is None:
        regressor_prior = Normal.dist(mu=0, tau=1.0E-12)

    if priors is None:
        priors = defaultdict(None)

    # Build patsy design matrix and get regressor names.
    _, dmatrix = patsy.dmatrices(formula, data)
    reg_names = dmatrix.design_info.column_names

    if init_vals is None and init:
        init_vals = glm_sm(formula, data, family=family).fit().params
    else:
        init_vals = defaultdict(lambda: None)

    # Create individual coefficients
    model = modelcontext(model)
    coeffs = []

    if reg_names[0] == 'Intercept':
        prior = priors.get('Intercept', intercept_prior)
        coeff = model.Var(reg_names.pop(0), prior)
        coeff.tag.test_value = init_vals['Intercept']
        coeffs.append(coeff)

    for reg_name in reg_names:
        prior = priors.get(reg_name, regressor_prior)
        coeff = model.Var(reg_name, prior)
        coeff.tag.test_value = init_vals[reg_name]
        coeffs.append(coeff)

    y_est = theano.dot(np.asarray(dmatrix), theano.tensor.stack(*coeffs)).reshape((1, -1))

    return y_est, coeffs


def glm(*args, **kwargs):
    """Create GLM after Patsy model specification string.

    Parameters
    ----------
    formula : str
        Patsy linear model descriptor.
    data : array
        Labeled array (e.g. pandas DataFrame, recarray).
    priors : dict
        Mapping prior name to prior distribution.
        E.g. {'Intercept': Normal.dist(mu=0, sd=1)}
    intercept_prior : pymc distribution
        Prior to use for the intercept.
        Default: Normal.dist(mu=0, tau=1.0E-12)
    regressor_prior : pymc distribution
        Prior to use for all regressor(s).
        Default: Normal.dist(mu=0, tau=1.0E-12)
    init : bool
        Whether initialize test values via statsmodels
        Default: True
    init_vals : dict
        Set starting values externally: parameter -> value
        Default: None
    find_MAP : bool
        Whether to call find_MAP on non-initialized nodes.
    family : statsmodels.family
        Distribution of likelihood, see pymc.glm.families
        (init has to be True).

    Output
    ------
    vars : List of created random variables (y_est, coefficients etc)

    Example
    -------
    # Logistic regression
    vars = glm('male ~ height + weight',
               data,
               family=glm.families.Binomial(link=glm.links.Logit))
    """

    model = modelcontext(kwargs.get('model'))

    family = kwargs.pop('family', families.Normal())

    call_find_map = kwargs.pop('find_MAP', True)
    formula = args[0]
    data = args[1]
    y_data = np.asarray(patsy.dmatrices(formula, data)[0]).T

    # Create GLM
    kwargs['family'] = family.create_statsmodel_family()

    y_est, coeffs = linear_component(*args, **kwargs)
    family.create_likelihood(y_est, y_data)

    # Find vars we have not initialized yet
    non_init_vars = set(model.vars).difference(set(coeffs))
    if len(non_init_vars) != 0 and call_find_map:
        start = find_MAP(vars=non_init_vars)

        for var in non_init_vars:
            var.tag.test_value = start[var.name]

    return [y_est] + coeffs + list(non_init_vars)


def plot_posterior_predictive(trace, eval=None, lm=None, samples=30, **kwargs):
    """Plot posterior predictive of a linear model.

    :Arguments:
        trace : <array>
            Array of posterior samples with columns
        eval : <array>
            Array over which to evaluate lm
        lm : function <default: linear function>
            Function mapping parameters at different points
            to their respective outputs.
            input: point, sample
            output: estimated value
        samples : int <default=30>
            How many posterior samples to draw.

    Additional keyword arguments are passed to pylab.plot().

    """
    import matplotlib.pyplot as plt

    if lm is None:
        lm = lambda x, sample: sample['Intercept'] + sample['x'] * x

    if eval is None:
        eval = np.linspace(0, 1, 100)

    # Set default plotting arguments
    if 'lw' not in kwargs and 'linewidth' not in kwargs:
        kwargs['lw'] = .2
    if 'c' not in kwargs and 'color' not in kwargs:
        kwargs['c'] = 'k'

    for rand_loc in np.random.randint(0, len(trace), samples):
        rand_sample = trace[rand_loc]
        plt.plot(eval, lm(eval, rand_sample), **kwargs)
        # Make sure to not plot label multiple times
        kwargs.pop('label', None)

    plt.title('Posterior predictive')
Tech Fingerprint

Alerts (5)

'import *' Avoid to prevent namespace pollution; import specific names or use aliases
2 3
'def' Ensure functions have docstrings for documentation
16
'list(' Avoid unnecessary list conversions; use generators where possible
164
'lambda' Avoid complex 'lambda' functions; prefer named functions for clarity and debugging
189