PageRenderTime 26ms CodeModel.GetById 13ms app.highlight 10ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/multivariate_stats/pca.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 130 lines | 111 code | 18 blank | 1 comment | 57 complexity | ba6e78bf4d6a599a43b60035671d7d52 MD5 | raw file
  1#!/usr/bin/env python
  2
  3from galaxy import eggs
  4import sys, string
  5from rpy import *
  6import numpy
  7
  8def stop_err(msg):
  9    sys.stderr.write(msg)
 10    sys.exit()
 11
 12infile = sys.argv[1]
 13x_cols = sys.argv[2].split(',')
 14method = sys.argv[3]
 15outfile = sys.argv[4]
 16outfile2 = sys.argv[5]
 17
 18if method == 'svd':
 19    scale = center = "FALSE"
 20    if sys.argv[6] == 'both':
 21        scale = center = "TRUE"
 22    elif sys.argv[6] == 'center':
 23        center = "TRUE"
 24    elif sys.argv[6] == 'scale':
 25        scale = "TRUE"
 26    
 27fout = open(outfile,'w')
 28elems = []
 29for i, line in enumerate( file ( infile )):
 30    line = line.rstrip('\r\n')
 31    if len( line )>0 and not line.startswith( '#' ):
 32        elems = line.split( '\t' )
 33        break 
 34    if i == 30:
 35        break # Hopefully we'll never get here...
 36
 37if len( elems )<1:
 38    stop_err( "The data in your input dataset is either missing or not formatted properly." )
 39
 40x_vals = []
 41
 42for k,col in enumerate(x_cols):
 43    x_cols[k] = int(col)-1
 44    x_vals.append([])
 45
 46NA = 'NA'
 47skipped = 0
 48for ind,line in enumerate( file( infile )):
 49    if line and not line.startswith( '#' ):
 50        try:
 51            fields = line.strip().split("\t")
 52            valid_line = True
 53            for k,col in enumerate(x_cols):
 54                try:
 55                    xval = float(fields[col])
 56                except:
 57                    skipped += 1 
 58                    valid_line = False
 59                    break
 60            if valid_line:
 61                for k,col in enumerate(x_cols):
 62                    xval = float(fields[col])
 63                    x_vals[k].append(xval)
 64        except:
 65            skipped += 1
 66
 67x_vals1 = numpy.asarray(x_vals).transpose()
 68dat= r.list(array(x_vals1))
 69
 70set_default_mode(NO_CONVERSION)
 71try:
 72    if method == "cor":
 73        pc = r.princomp(r.na_exclude(dat), cor = r("TRUE"))
 74    elif method == "cov":
 75        pc = r.princomp(r.na_exclude(dat), cor = r("FALSE"))
 76    elif method=="svd":
 77        pc = r.prcomp(r.na_exclude(dat), center = r(center), scale = r(scale))
 78except RException, rex:
 79    stop_err("Encountered error while performing PCA on the input data: %s" %(rex))
 80
 81set_default_mode(BASIC_CONVERSION)
 82summary = r.summary(pc, loadings="TRUE")
 83ncomps = len(summary['sdev'])
 84
 85if type(summary['sdev']) == type({}):
 86    comps_unsorted = summary['sdev'].keys()
 87    comps=[]
 88    sd = summary['sdev'].values()
 89    for i in range(ncomps):
 90        sd[i] = summary['sdev'].values()[comps_unsorted.index('Comp.%s' %(i+1))]
 91        comps.append('Comp.%s' %(i+1))
 92elif type(summary['sdev']) == type([]):
 93    comps=[]
 94    for i in range(ncomps):
 95        comps.append('Comp.%s' %(i+1))
 96        sd = summary['sdev']
 97
 98print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
 99print >>fout, "#Std. deviation\t%s" %("\t".join(["%.4g" % el for el in sd]))
100total_var = 0
101vars = []
102for s in sd:
103    var = s*s
104    total_var += var
105    vars.append(var)
106for i,var in enumerate(vars):
107    vars[i] = vars[i]/total_var
108       
109print >>fout, "#Proportion of variance explained\t%s" %("\t".join(["%.4g" % el for el in vars]))
110
111print >>fout, "#Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
112xcolnames = ["c%d" %(el+1) for el in x_cols]
113if 'loadings' in summary: #in case of princomp
114    loadings = 'loadings'
115elif 'rotation' in summary: #in case of prcomp
116    loadings = 'rotation'
117for i,val in enumerate(summary[loadings]):
118    print >>fout, "%s\t%s" %(xcolnames[i], "\t".join(["%.4g" % el for el in val]))
119
120print >>fout, "#Scores\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
121if 'scores' in summary: #in case of princomp
122    scores = 'scores'
123elif 'x' in summary: #in case of prcomp
124    scores = 'x'
125for obs,sc in enumerate(summary[scores]):
126    print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in sc]))
127
128r.pdf( outfile2, 8, 8 )
129r.biplot(pc)
130r.dev_off()