/tools/rgenetics/rgQC.py
https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 1354 lines · 1132 code · 29 blank · 193 comment · 120 complexity · 60c9484f372fa7d2faad43140faf0631 MD5 · raw file
Large files are truncated click here to view the full file
- # oct 15 rpy replaced - temp fix until we get gnuplot working
- # rpy deprecated - replace with RRun
- # fixes to run functional test! oct1 2009
- # needed to expand our path with os.path.realpath to get newpath working with
- # all the fancy pdfnup stuff
- # and a fix to pruneld to write output to where it should be
- # smallish data in test-data/smallwga in various forms
- # python ../tools/rgenetics/rgQC.py -i smallwga -o smallwga -s smallwga/smallwga.html -p smallwga
- # child files are deprecated and broken as at july 19 2009
- # need to move them to the html file extrafiles path
- # found lots of corner cases with some illumina data where cnv markers were
- # included
- # and where affection status was all missing !
- # added links to tab files showing worst 1/keepfrac markers and subjects
- # ross lazarus january 2008
- #
- # added named parameters
- # to ensure no silly slippages if non required parameter in the most general case
- # some potentially useful things here reusable in complex scripts
- # with lots'o'html (TM)
- # aug 17 2007 rml
- #
- # added marker and subject and parenting april 14 rml
- # took a while to get the absolute paths right for all the file munging
- # as of april 16 seems to work..
- # getting galaxy to serve images in html reports is a little tricky
- # we don't want QC reports to be dozens of individual files, so need
- # to use the url /static/rg/... since galaxy's web server will happily serve images
- # from there
- # galaxy passes output files as relative paths
- # these have to be munged by rgQC.py before calling this
- # galaxy will pass in 2 file names - one for the log
- # and one for the final html report
- # of the form './database/files/dataset_66.dat'
- # we need to be working in that directory so our plink output files are there
- # so these have to be munged by rgQC.py before calling this
- # note no ped file passed so had to remove the -l option
- # for plinkParse.py that makes a heterozygosity report from the ped
- # file - needs fixing...
- # new: importing manhattan/qqplot plotter
- # def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir):
- # """ draw a qq for pvals and a manhattan plot if chrom/offset <> 0
- # contains some R scripts as text strings - we substitute defaults into the calls
- # to make them do our bidding - and save the resulting code for posterity
- # this can be called externally, I guess...for QC eg?
- # """
- #
- # rcmd = '%s%s' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey))
- # rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
- # return rlog,flist
-
- from optparse import OptionParser
- import sys,os,shutil, glob, math, subprocess, time, operator, random, tempfile, copy, string
- from os.path import abspath
- from rgutils import galhtmlprefix, galhtmlpostfix, RRun, timenow, plinke, rexe, runPlink, pruneLD
- import rgManQQ
- prog = os.path.split(sys.argv[0])[1]
- vers = '0.4 april 2009 rml'
- idjoiner = '_~_~_' # need something improbable..
- # many of these may need fixing for a new install
- myversion = vers
- keepfrac = 20 # fraction to keep after sorting by each interesting value
- missvals = {'0':'0','N':'N','-9':'-9','-':'-'} # fix me if these change!
- mogresize = "x300" # this controls the width for jpeg thumbnails
-
- def makePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='20',nup=3,height=10,width=8,rgbin=''):
- """
- marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
- 'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
- subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
- """
-
- def rHist(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=50):
- """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
- # generic histogram and vertical boxplot in a 3:1 layout
- # returns the graphic file name for inclusion in the web page
- # broken out here for reuse
- # ross lazarus march 2007
- """
- screenmat = (1,2,1,2) # create a 2x2 cabvas
- widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
- rpy.r.pdf( outfname, height , width )
- #rpy.r.layout(rpy.r.matrix(rpy.r.c(1,1,1,2), 1, 4, byrow = True)) # 3 to 1 vertical plot
- m = rpy.r.matrix((1,1,1,2),nrow=1,ncol=4,byrow=True)
- # in R, m = matrix(c(1,2),nrow=1,ncol=2,byrow=T)
- rpy.r("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))") # 4 to 1 vertical plot
- maint = 'QC for %s - %s' % (basename,title)
- rpy.r.hist(plotme,main=maint, xlab=xlabname,breaks=nbreaks,col="maroon",cex=0.8)
- rpy.r.boxplot(plotme,main='',col="maroon",outline=False)
- rpy.r.dev_off()
- def rCum(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=100):
- """
- Useful to see what various cutoffs yield - plot percentiles
- """
- n = len(plotme)
- maxveclen = 1000.0 # for reasonable pdf sizes!
- yvec = copy.copy(plotme)
- # arrives already in decending order of importance missingness or mendel count by subj or marker
- xvec = range(n)
- xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
- # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
- if n > maxveclen: # oversample part of the distribution
- always = min(1000,n/20) # oversample smaller of lowest few hundred items or 5%
- skip = int(n/maxveclen) # take 1 in skip to get about maxveclen points
- samplei = [i for i in range(n) if (i % skip == 0) or (i < always)] # always oversample first sorted values
- yvec = [yvec[i] for i in samplei] # always get first and last
- xvec = [xvec[i] for i in samplei] # always get first and last
- # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
- rpy.r.pdf( outfname, height , width )
- maint = 'QC for %s - %s' % (basename,title)
- rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
- rpy.r.plot(xvec,yvec,type='p',main=maint, ylab=xlabname, xlab='Sample Percentile',col="maroon",cex=0.8)
- rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
- rpy.r.dev_off()
- def rQQ(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
- """
- y is data for a qq plot and ends up on the x axis go figure
- if sampling, oversample low values - all the top 1% ?
- this version called with -log10 transformed hwe
- """
- nrows = len(plotme)
- fn = float(nrows)
- xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
- mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
- maxveclen = 3000
- yvec = copy.copy(plotme)
- if nrows > maxveclen:
- # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
- # oversample part of the distribution
- always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
- skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
- samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
- # always oversample first sorted (here lowest) values
- yvec = [yvec[i] for i in samplei] # always get first and last
- xvec = [xvec[i] for i in samplei] # and sample xvec same way
- maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
- else:
- maint='Log QQ Plot(n=%d)' % (nrows)
- mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
- ylab = '%s' % xlabname
- xlab = '-log10(Uniform 0-1)'
- # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
- rpy.r.pdf( outfname, height , width )
- rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
- rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
- rpy.r.points(mx,mx,type='l')
- rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
- rpy.r.dev_off()
- def rMultiQQ(plotme = [],nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
- """
- data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
- grid of qq plots to show departure from null at extremes of data quality
- Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
- and ends up on the x axis go figure
- if sampling, oversample low values - all the top 1% ?
- """
- data = copy.copy(plotme)
- nvals = len(data)
- stepsize = nvals/nsplits
- logstep = math.log10(stepsize) # so is 3 for steps of 1000
- quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
- data.sort(key=itertools.itemgetter(1)) # into x order
- rpy.r.pdf( outfname, height , width )
- rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
- yvec = [-math.log10(random.random()) for x in range(stepsize)]
- yvec.sort() # size of each step is expected range for xvec under null?!
- for rowstart in quints:
- rowend = rowstart + stepsize
- if nvals - rowend < stepsize: # finish last split
- rowend = nvals
- row = data[rowstart:rowend]
- row.sort(key=itertools.itemgetter(2)) # into y order
- for colstart in quints:
- colend = colstart + stepsize
- if nvals - colend < stepsize: # finish last split
- colend = nvals
- cell = row[colstart:colend]
- xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
- rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
- rpy.r.points(c(0,logstep),c(0,logstep),type='l')
- rpy.r.dev_off()
- def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
- """
- y is data for a qqnorm plot
- if sampling, oversample low values - all the top 1% ?
- """
- rangeunif = len(plotme)
- nunif = 1000
- maxveclen = 3000
- nrows = len(plotme)
- data = copy.copy(plotme)
- if nrows > maxveclen:
- # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
- # oversample part of the distribution
- always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
- skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
- samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
- # always oversample first sorted (here lowest) values
- yvec = [data[i] for i in samplei] # always get first and last
- maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
- else:
- yvec = data
- maint='Log QQ Plot(n=%d)' % (nrows)
- n = 1000
- ylab = '%s' % xlabname
- xlab = 'Normal'
- # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
- rpy.r.pdf( outfname, height , width )
- rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
- rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
- rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
- rpy.r.dev_off()
- def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
- """
- layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
- like the GAIN qc tools
- y is data for a qq plot and ends up on the x axis go figure
- if sampling, oversample low values - all the top 1% ?
- """
- rangeunif = len(plotme)
- nunif = 1000
- fn = float(rangeunif)
- xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
- skip = max(int(rangeunif/fn),1)
- # force include last points
- mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
- maxveclen = 2000
- nrows = len(plotme)
- data = copy.copy(plotme)
- data.sort() # low to high - oversample low values
- if nrows > maxveclen:
- # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
- # oversample part of the distribution
- always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
- skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
- samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
- # always oversample first sorted (here lowest) values
- yvec = [data[i] for i in samplei] # always get first and last
- xvec = [xvec[i] for i in samplei] # and sample xvec same way
- maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
- else:
- yvec = data
- maint='Log QQ Plot(n=%d)' % (nrows)
- n = 1000
- mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
- ylab = '%s' % xlabname
- xlab = '-log10(Uniform 0-1)'
- # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
- rpy.r.pdf( outfname, height , width )
- rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
- rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
- rpy.r.points(mx,mx,type='l')
- rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
- rpy.r.dev_off()
- fdsto,stofile = tempfile.mkstemp()
- sto = open(stofile,'w')
- import rpy # delay to avoid rpy stdout chatter replacing galaxy file blurb
- mog = 'mogrify'
- pdfnup = 'pdfnup'
- pdfjoin = 'pdfjoin'
- shead = subjects.pop(0) # get rid of head
- mhead = markers.pop(0)
- maf = mhead.index('maf')
- missfrac = mhead.index('missfrac')
- logphweall = mhead.index('logp_hwe_all')
- logphweunaff = mhead.index('logp_hwe_unaff')
- # check for at least some unaffected rml june 2009
- m_mendel = mhead.index('N_Mendel')
- fracmiss = shead.index('FracMiss')
- s_mendel = shead.index('Mendel_errors')
- s_het = shead.index('F_Stat')
- params = {}
- hweres = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
- and x[logphweunaff].upper() <> 'NA']
- if len(hweres) <> 0:
- xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
- # plot for each of these cols
- else: # try hwe all instead - maybe no affection status available
- xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
- ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
- oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
- qqplotme = [1,0,0,0,0,0,0] #
- qnplotme = [0,0,0,0,0,0,1] #
- nplots = len(xs)
- xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
- 'Marker Mendel Error Count', 'Missing Rate: Subjects',
- 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
- plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
- ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
- ordplotnames = ['%s_cum' % x for x in plotnames]
- ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
- outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
- ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
- datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
- titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
- "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
- html = []
- pdflist = []
- for n,column in enumerate(xs):
- dat = [float(x[column]) for x in datasources[n] if len(x) >= column
- and x[column][:2].upper() <> 'NA'] # plink gives both!
- if sum(dat) <> 0: # eg nada for mendel if case control?
- rHist(plotme=dat,outfname=outfnames[n],xlabname=xlabnames[n],
- title=titles[n],basename=basename,nbreaks=nbreaks)
- row = [titles[n],ploturls[n],outfnames[n] ]
- html.append(row)
- pdflist.append(outfnames[n])
- if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
- otitle = 'Ranked %s' % titles[n]
- dat.sort()
- if oreverseme[n]:
- dat.reverse()
- rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
- title=otitle,basename=basename,nbreaks=1000)
- row = [otitle,ordploturls[n],ordoutfnames[n]]
- html.append(row)
- pdflist.append(ordoutfnames[n])
- if qqplotme[n]: #
- otitle = 'LogQQ plot %s' % titles[n]
- dat.sort()
- dat.reverse()
- ofn = os.path.split(ordoutfnames[n])
- ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
- ofu = os.path.split(ordploturls[n])
- ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
- rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
- title=otitle,basename=basename)
- row = [otitle,ofu,ofn]
- html.append(row)
- pdflist.append(ofn)
- elif qnplotme[n]:
- otitle = 'F Statistic %s' % titles[n]
- dat.sort()
- dat.reverse()
- ofn = os.path.split(ordoutfnames[n])
- ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
- ofu = os.path.split(ordploturls[n])
- ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
- rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
- title=otitle,basename=basename)
- row = [otitle,ofu,ofn]
- html.append(row)
- pdflist.append(ofn)
- else:
- print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
- if nup>0:
- # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
- # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
- filestojoin = ' '.join(pdflist) # all the file names so far
- afname = '%s_All_Paged.pdf' % (basename)
- fullafname = os.path.join(newfpath,afname)
- expl = 'All %s QC Plots joined into a single pdf' % basename
- vcl = '%s %s --outfile %s ' % (pdfjoin,filestojoin, fullafname)
- # make single page pdf
- x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
- retval = x.wait()
- row = [expl,afname,fullafname]
- html.insert(0,row) # last rather than second
- nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
- fullnfname = os.path.join(newfpath,nfname)
- expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
- vcl = '%s %s --nup %dx%d --frame true --outfile %s' % (pdfnup,afname,nup,nup,fullnfname)
- # make thumbnail images
- x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
- retval = x.wait()
- row = [expl,nfname,fullnfname]
- html.insert(1,row) # this goes second
- vcl = '%s -format jpg -resize %s %s' % (mog, mogresize, os.path.join(newfpath,'*.pdf'))
- # make thumbnail images
- x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
- retval = x.wait()
- sto.close()
- cruft = open(stofile,'r').readlines()
- return html,cruft # elements for an ordered list of urls or whatever..
- def RmakePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='100',nup=3,height=8,width=10,rexe=''):
- """
- nice try but the R scripts are huge and take forever to run if there's a lot of data
- marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
- 'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
- subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
- """
- colour = "maroon"
-
- def rHist(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=nbreaks):
- """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
- # generic histogram and vertical boxplot in a 3:1 layout
- # returns the graphic file name for inclusion in the web page
- # broken out here for reuse
- # ross lazarus march 2007
- """
- R = []
- maint = 'QC for %s - %s' % (basename,title)
- screenmat = (1,2,1,2) # create a 2x2 canvas
- widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
- R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
- R.append("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))")
- R.append("plotme = read.table(file='%s',head=F,sep='\t')" % plotme)
- R.append('hist(plotme, main="%s",xlab="%s",breaks=%d,col="%s")' % (maint,xlabname,nbreaks,colour))
- R.append('boxplot(plotme,main="",col="%s",outline=F)' % (colour) )
- R.append('dev.off()')
- return R
-
- def rCum(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=100):
- """
- Useful to see what various cutoffs yield - plot percentiles
- """
- R = []
- n = len(plotme)
- R.append("plotme = read.table(file='%s',head=T,sep='\t')" % plotme)
- # arrives already in decending order of importance missingness or mendel count by subj or marker
- maint = 'QC for %s - %s' % (basename,title)
- R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
- R.append("par(lab=c(10,10,10))")
- R.append('plot(plotme$xvec,plotme$yvec,type="p",main="%s",ylab="%s",xlab="Sample Percentile",col="%s")' % (maint,xlabname,colour))
- R.append('dev.off()')
- return R
- def rQQ(plotme='', outfname='fname',title='title',xlabname='Sample',basename=''):
- """
- y is data for a qq plot and ends up on the x axis go figure
- if sampling, oversample low values - all the top 1% ?
- this version called with -log10 transformed hwe
- """
- R = []
- nrows = len(plotme)
- fn = float(nrows)
- xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
- #mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
- maxveclen = 3000
- yvec = copy.copy(plotme)
- if nrows > maxveclen:
- # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
- # oversample part of the distribution
- always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
- skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
- if skip < 2:
- skip = 2
- samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
- # always oversample first sorted (here lowest) values
- yvec = [yvec[i] for i in samplei] # always get first and last
- xvec = [xvec[i] for i in samplei] # and sample xvec same way
- maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
- else:
- maint='Log QQ Plot(n=%d)' % (nrows)
- mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
- ylab = '%s' % xlabname
- xlab = '-log10(Uniform 0-1)'
- # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
- x = ['%f' % x for x in xvec]
- R.append('xvec = c(%s)' % ','.join(x))
- y = ['%f' % x for x in yvec]
- R.append('yvec = c(%s)' % ','.join(y))
- R.append('mx = c(0,%f)' % (math.log10(fn)))
- R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
- R.append("par(lab=c(10,10,10))")
- R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
- R.append('points(mx,mx,type="l")')
- R.append('grid(col="lightgray",lty="dotted")')
- R.append('dev.off()')
- return R
- def rMultiQQ(plotme = '',nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
- """
- data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
- grid of qq plots to show departure from null at extremes of data quality
- Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
- and ends up on the x axis go figure
- if sampling, oversample low values - all the top 1% ?
- """
- data = copy.copy(plotme)
- nvals = len(data)
- stepsize = nvals/nsplits
- logstep = math.log10(stepsize) # so is 3 for steps of 1000
- R.append('mx = c(0,%f)' % logstep)
- quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
- data.sort(key=itertools.itemgetter(1)) # into x order
- #rpy.r.pdf( outfname, h , w )
- #rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
- R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
- yvec = [-math.log10(random.random()) for x in range(stepsize)]
- yvec.sort() # size of each step is expected range for xvec under null?!
- y = ['%f' % x for x in yvec]
- R.append('yvec = c(%s)' % ','.join(y))
- for rowstart in quints:
- rowend = rowstart + stepsize
- if nvals - rowend < stepsize: # finish last split
- rowend = nvals
- row = data[rowstart:rowend]
- row.sort(key=itertools.itemgetter(2)) # into y order
- for colstart in quints:
- colend = colstart + stepsize
- if nvals - colend < stepsize: # finish last split
- colend = nvals
- cell = row[colstart:colend]
- xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
- x = ['%f' % x for x in xvec]
- R.append('xvec = c(%s)' % ','.join(x))
- R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
- R.append('points(mx,mx,type="l")')
- R.append('grid(col="lightgray",lty="dotted")')
- #rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
- #rpy.r.points(c(0,logstep),c(0,logstep),type='l')
- R.append('dev.off()')
- #rpy.r.dev_off()
- return R
- def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
- """
- y is data for a qqnorm plot
- if sampling, oversample low values - all the top 1% ?
- """
- rangeunif = len(plotme)
- nunif = 1000
- maxveclen = 3000
- nrows = len(plotme)
- data = copy.copy(plotme)
- if nrows > maxveclen:
- # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
- # oversample part of the distribution
- always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
- skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
- samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
- # always oversample first sorted (here lowest) values
- yvec = [data[i] for i in samplei] # always get first and last
- maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
- else:
- yvec = data
- maint='Log QQ Plot(n=%d)' % (nrows)
- n = 1000
- ylab = '%s' % xlabname
- xlab = 'Normal'
- # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
- #rpy.r.pdf( outfname, h , w )
- #rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
- #rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
- #rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
- #rpy.r.dev_off()
- y = ['%f' % x for x in yvec]
- R.append('yvec = c(%s)' % ','.join(y))
- R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
- R.append("par(lab=c(10,10,10))")
- R.append('qqnorm(yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
- R.append('grid(col="lightgray",lty="dotted")')
- R.append('dev.off()')
- return R
- def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
- """
- layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
- like the GAIN qc tools
- y is data for a qq plot and ends up on the x axis go figure
- if sampling, oversample low values - all the top 1% ?
- """
- rangeunif = len(plotme)
- nunif = 1000
- fn = float(rangeunif)
- xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
- skip = max(int(rangeunif/fn),1)
- # force include last points
- mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
- maxveclen = 2000
- nrows = len(plotme)
- data = copy.copy(plotme)
- data.sort() # low to high - oversample low values
- if nrows > maxveclen:
- # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
- # oversample part of the distribution
- always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
- skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
- samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
- # always oversample first sorted (here lowest) values
- yvec = [data[i] for i in samplei] # always get first and last
- xvec = [xvec[i] for i in samplei] # and sample xvec same way
- maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
- else:
- yvec = data
- maint='Log QQ Plot(n=%d)' % (nrows)
- n = 1000
- mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
- ylab = '%s' % xlabname
- xlab = '-log10(Uniform 0-1)'
- R.append('mx = c(0,%f)' % (math.log10(fn)))
- x = ['%f' % x for x in xvec]
- R.append('xvec = c(%s)' % ','.join(x))
- y = ['%f' % x for x in yvec]
- R.append('yvec = c(%s)' % ','.join(y))
- R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
- R.append("par(lab=c(10,10,10))")
- R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
- R.append('points(mx,mx,type="l")')
- R.append('grid(col="lightgray",lty="dotted")')
- R.append('dev.off()')
- shead = subjects.pop(0) # get rid of head
- mhead = markers.pop(0)
- maf = mhead.index('maf')
- missfrac = mhead.index('missfrac')
- logphweall = mhead.index('logp_hwe_all')
- logphweunaff = mhead.index('logp_hwe_unaff')
- # check for at least some unaffected rml june 2009
- m_mendel = mhead.index('N_Mendel')
- fracmiss = shead.index('FracMiss')
- s_mendel = shead.index('Mendel_errors')
- s_het = shead.index('F_Stat')
- params = {}
- h = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
- and x[logphweunaff].upper() <> 'NA']
- if len(h) <> 0:
- xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
- # plot for each of these cols
- else: # try hwe all instead - maybe no affection status available
- xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
- ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
- oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
- qqplotme = [1,0,0,0,0,0,0] #
- qnplotme = [0,0,0,0,0,0,1] #
- nplots = len(xs)
- xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
- 'Marker Mendel Error Count', 'Missing Rate: Subjects',
- 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
- plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
- ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
- ordplotnames = ['%s_cum' % x for x in plotnames]
- ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
- outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
- ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
- datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
- titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
- "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
- html = []
- pdflist = []
- R = []
- for n,column in enumerate(xs):
- dfn = '%d_%s.txt' % (n,titles[n])
- dfilepath = os.path.join(newfpath,dfn)
- dat = [float(x[column]) for x in datasources[n] if len(x) >= column
- and x[column][:2].upper() <> 'NA'] # plink gives both!
- if sum(dat) <> 0: # eg nada for mendel if case control?
- plotme = file(dfilepath,'w')
- plotme.write('\n'.join(['%f' % x for x in dat])) # pass as a file - copout!
- tR = rHist(plotme=dfilepath,outfname=outfnames[n],xlabname=xlabnames[n],
- title=titles[n],basename=basename,nbreaks=nbreaks)
- R += tR
- row = [titles[n],ploturls[n],outfnames[n] ]
- html.append(row)
- pdflist.append(outfnames[n])
- if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
- otitle = 'Ranked %s' % titles[n]
- dat.sort()
- if oreverseme[n]:
- dat.reverse()
- ndat = len(dat)
- xvec = range(ndat)
- xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
- # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
- maxveclen = 1000.0 # for reasonable pdf sizes!
- if ndat > maxveclen: # oversample part of the distribution
- always = min(1000,ndat/20) # oversample smaller of lowest few hundred items or 5%
- skip = int(ndat/maxveclen) # take 1 in skip to get about maxveclen points
- samplei = [i for i in range(ndat) if (i % skip == 0) or (i < always)] # always oversample first sorted values
- yvec = [yvec[i] for i in samplei] # always get first and last
- xvec = [xvec[i] for i in samplei] # always get first and last
- plotme = file(dfilepath,'w')
- plotme.write('xvec\tyvec\n')
- plotme.write('\n'.join(['%f\t%f' % (xvec[i],y) for y in yvec])) # pass as a file - copout!
- tR = rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
- title=otitle,basename=basename,nbreaks=nbreaks)
- R += tR
- row = [otitle,ordploturls[n],ordoutfnames[n]]
- html.append(row)
- pdflist.append(ordoutfnames[n])
- if qqplotme[n]: #
- otitle = 'LogQQ plot %s' % titles[n]
- dat.sort()
- dat.reverse()
- ofn = os.path.split(ordoutfnames[n])
- ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
- ofu = os.path.split(ordploturls[n])
- ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
- tR = rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
- title=otitle,basename=basename)
- R += tR
- row = [otitle,ofu,ofn]
- html.append(row)
- pdflist.append(ofn)
- elif qnplotme[n]:
- otitle = 'F Statistic %s' % titles[n]
- dat.sort()
- dat.reverse()
- ofn = os.path.split(ordoutfnames[n])
- ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
- ofu = os.path.split(ordploturls[n])
- ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
- tR = rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
- title=otitle,basename=basename)
- R += tR
- row = [otitle,ofu,ofn]
- html.append(row)
- pdflist.append(ofn)
- else:
- print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
- rlog,flist = RRun(rcmd=R,title='makeQCplots',outdir=newfpath)
- if nup>0:
- # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
- # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
- filestojoin = ' '.join(pdflist) # all the file names so far
- afname = '%s_All_Paged.pdf' % (basename)
- fullafname = os.path.join(newfpath,afname)
- expl = 'All %s QC Plots joined into a single pdf' % basename
- vcl = 'pdfjoin %s --outfile %s ' % (filestojoin, fullafname)
- # make single page pdf
- x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
- retval = x.wait()
- row = [expl,afname,fullafname]
- html.insert(0,row) # last rather than second
- nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
- fullnfname = os.path.join(newfpath,nfname)
- expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
- vcl = 'pdfnup %s --nup %dx%d --frame true --outfile %s' % (afname,nup,nup,fullnfname)
- # make thumbnail images
- x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
- retval = x.wait()
- row = [expl,nfname,fullnfname]
- html.insert(1,row) # this goes second
- vcl = 'mogrify -format jpg -resize %s %s' % (mogresize, os.path.join(newfpath,'*.pdf'))
- # make thumbnail images
- x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
- retval = x.wait()
- return html # elements for an ordered list of urls or whatever..
- def countHet(bedf='fakeped_500000',linkageped=True,froot='fake500k',outfname="ahetf",logf='rgQC.log'):
- """
- NO LONGER USED - historical interest
- count het loci for each subject to look for outliers = ? contamination
- assume ped file is linkage format
- need to make a ped file from the bed file we were passed
- """
- vcl = [plinke,'--bfile',bedf,'--recode','--out','%s_recode' % froot] # write a recoded ped file from the real .bed file
- p=subprocess.Popen(' '.join(vcl),shell=True)
- retval = p.wait()
- f = open('%s_recode.recode.ped' % froot,'r')
- if not linkageped:
- head = f.next() # throw away header
- hets = [] # simple count of het loci per subject. Expect poisson?
- n = 1
- for l in f:
- n += 1
- ll = l.strip().split()
- if len(ll) > 6:
- iid = idjoiner.join(ll[:2]) # fam_iid
- gender = ll[4]
- alleles = ll[6:]
- nallele = len(alleles)
- nhet = 0
- for i in range(nallele/2):
- a1=alleles[2*i]
- a2=alleles[2*i+1]
- if alleles[2*i] <> alleles[2*i+1]: # must be het
- if not missvals.get(a1,None) and not missvals.get(a2,None):
- nhet += 1
- hets.append((nhet,iid,gender)) # for sorting later
- f.close()
- hets.sort()
- hets.reverse() # biggest nhet now on top
- f = open(outfname ,'w')
- res = ['%d\t%s\t%s' % (x) for x in hets] # I love list comprehensions
- res.insert(0,'nhetloci\tfamid_iid\tgender')
- res.append('')
- f.write('\n'.join(res))
- f.close()
- def subjectRep(froot='cleantest',outfname="srep",newfpath='.',logf = None):
- """by subject (missingness = .imiss, mendel = .imendel)
- assume replicates have an underscore in family id for
- hapmap testing
- For sorting, we need floats and integers
- """
- isexfile = '%s.sexcheck' % froot
- imissfile = '%s.imiss' % froot
- imendfile = '%s.imendel' % froot
- ihetfile = '%s.het' % froot
- logf.write('## subject reports starting at %s\n' % timenow())
- outfile = os.path.join(newfpath,outfname)
- idlist = []
- imissdict = {}
- imenddict = {}
- isexdict = {}
- ihetdict = {}
- Tops = {}
- Tnames = ['Ranked Subject Missing Genotype', 'Ranked Subject Mendel',
- 'Ranked Sex check', 'Ranked Inbreeding (het) F statistic']
- Tsorts = [2,3,6,8]
- Treverse = [True,True,True,False] # so first values are worser
- #rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
- ## FID IID MISS_PHENO N_MISS N_GENO F_MISS
- ## 1552042370_A 1552042370_A N 5480 549883 0.009966
- ## 1552042410_A 1552042410_A N 1638 549883 0.002979
-
- # ------------------missing--------------------
- # imiss has FID IID MISS_PHENO N_MISS F_MISS
- # we want F_MISS
- try:
- f = open(imissfile,'r')
- except:
- logf.write('# file %s is missing - talk about irony\n' % imissfile)
- f = None
- if f:
- for n,line in enumerate(f):
- ll = line.strip().split()
- if n == 0:
- head = [x.upper() for x in ll] # expect above
- fidpos = head.index('FID')
- iidpos = head.index('IID')
- fpos = head.index('F_MISS')
- elif len(ll) >= fpos: # full line
- fid = ll[fidpos]
- #if fid.find('_') == -1: # not replicate! - icondb ids have these...
- iid = ll[iidpos]
- fmiss = ll[fpos]
- id = '%s%s%s' % (fid,idjoiner,iid)
- imissdict[id] = fmiss
- idlist.append(id)
- f.close()
- logf.write('## imissfile %s contained %d ids\n' % (imissfile,len(idlist)))
- # ------------------mend-------------------
- # *.imendel has FID IID N
- # we want N
- gotmend = True
- try:
- f = open(imendfile,'r')
- except:
- gotmend = False
- for id in idlist:
- imenddict[id] = '0'
- if gotmend:
- for n,line in enumerate(f):
- ll = line.strip().split()
- if n == 0:
- head = [x.upper() for x in ll] # expect above
- npos = head.index('N')
- fidpos = head.index('FID')
- iidpos = head.index('IID')
- elif len(ll) >= npos: # full line
- fid = ll[fidpos]
- iid = ll[iidpos]
- id = '%s%s%s' % (fid,idjoiner,iid)
- nmend = ll[npos]
- imenddict[id] = nmend
- f.close()
- else:
- logf.write('## error No %s file - assuming not family data\n' % imendfile)
- # ------------------sex check------------------
- #[rerla@hg fresh]$ head /home/rerla/fresh/database/files/dataset_978_files/CAMP2007Dirty.sexcheck
- # sexcheck has FID IID PEDSEX SNPSEX STATUS F
- ##
- ## FID Family ID
- ## IID Individual ID
- ## PEDSEX Sex as determined in pedigree file (1=male, 2=female)
- ## SNPSEX Sex as determined by X chromosome
- ## STATUS Displays "PROBLEM" or "OK" for each individual
- ## F The actual X chromosome inbreeding (homozygosity) estimate
- ##
- ## A PROBLEM arises if the two sexes do not match, or if the SNP data or pedigree data are
- ## ambiguous with regard to sex.
- ## A male call is made if F is more than 0.8; a femle call is made if F is less than 0.2.
- try:
- f = open(isexfile,'r')
- got_sexcheck = 1
- except:
- got_sexcheck = 0
- if got_sexcheck:
- for n,line in enumerate(f):
- ll = line.strip().split()
- if n == 0:
- head = [x.upper() for x in ll] # expect above
- fidpos = head.index('FID')
- iidpos = head.index('IID')
- pedsexpos = head.index('PEDSEX')
- snpsexpos = head.index('SNPSEX')
- statuspos = head.index('STATUS')
- fpos = head.index('F')
- elif len(ll) >= fpos: # full line
- fid = ll[fidpos]
- iid = ll[iidpos]
- fest = ll[fpos]
- pedsex = ll[pedsexpos]
- snpsex = ll[snpsexpos]
- stat = ll[statuspos]
- id = '%s%s%s' % (fid,idjoiner,iid)
- isexdict[id] = (pedsex,snpsex,stat,fest)
- f.close()
- else:
- # this only happens if there are no subjects!
- logf.write('No %s file - assuming no sex errors' % isexfile)
- ##
- ## FID IID O(HOM) E(HOM) N(NM) F
- ## 457 2 490665 4.928e+05 722154 -0.009096
- ## 457 3 464519 4.85e+05 710986 -0.0908
- ## 1037 2 461632 4.856e+05 712025 -0.106
- ## 1037 1 491845 4.906e+05 719353 0.005577
- try:
- f = open(ihetfile,'r')
- except:
- f = None
- logf.write('## No %s file - did we run --het in plink?\n' % ihetfile)
- if f:
- for i,line in enumerate(f):
- ll = line.strip().split()
- if i == 0:
- head = [x.upper() for x in ll] # expect above
- fidpos = head.index('FID')
- iidpos = head.index('IID')
- fpos = head.index('F')
- n = 0
- elif len(ll) >= fpos: # full line
- fid = ll[fidpos]
- iid = ll[iidpos]
- fhet = ll[fpos]
- id = '%s%s%s' % (fid,idjoiner,iid)
- ihetdict[id] = fhet
- f.close() # now assemble and output result list
- rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','XHomEst','F_Stat']
- res = []
- fres = [] # floats for sorting
- for id in idlist: # for each snp in found order
- fid,iid = id.split(idjoiner) # recover keys
- f_missing = imissdict.get(id,'0.0')
- nmend = imenddict.get(id,'0')
- (pedsex,snpsex,status,fest) = isexdict.get(id,('0','0','0','0.0'))
- fhet = ihetdict.get(id,'0.0')
- res.append([fid,iid,f_missing,nmend,pedsex,snpsex,status,fest,fhet])
- try:
- ff_missing = float(f_missing)
- except:
- ff_missing = 0.0
- try:
- inmend = int(nmend)
- except:
- inmend = 0
- try:
- ffest = float(fest)
- except:
- fest = 0.0
- try:
- ffhet = float(fhet)
- except:
- ffhet = 0.0
- fres.append([fid,iid,ff_missing,inmend,pedsex,snpsex,status,ffest,ffhet])
- ntokeep = max(20,len(res)/keepfrac)
- for i,col in enumerate(Tsorts):
- fres.sort(key=operator.itemgetter(col))
- if Treverse[i]:
- fres.reverse()
- repname = Tnames[i]
- Tops[repname] = fres[0:ntokeep]
- Tops[repname] = [map(str,x) for x in Tops[repname]]
- Tops[repname].insert(0,rhead)
- res.sort()
- res.insert(0,rhead)
- logf.write('### writing %s report with %s' % ( outfile,res[0]))
- f = open(outfile,'w')
- f.write('\n'.join(['\t'.join(x) for x in res]))
- f.write('\n')
- f.close()
- return res,Tops
- def markerRep(froot='cleantest',outfname="mrep",newfpath='.',logf=None,maplist=None ):
- """by marker (hwe = .hwe, missingness=.lmiss, freq = .frq)
- keep a list of marker order but keep all stats in dicts
- write out a fake xls file for R or SAS etc
- kinda clunky, but..
- TODO: ensure stable if any file not found?
- """
- mapdict = {}
- if maplist <> None:
- rslist = [x[1] for x in maplist]
- offset = [(x[0],x[3]) for x in maplist]
- mapdict = dict(zip(rslist,offset))
- hwefile = '%s.hwe' % froot
- lmissfile = '%s.lmiss' % froot
- freqfile = '%s.frq' % froot
- lmendfile = '%s.lmendel' % froot
- outfile = os.path.join(newfpath,outfname)
- markerlist = []
- chromlist = []
- hwedict = {}
- lmissdict = {}
- freqdict = {}
- lmenddict = {}
- Tops = {}
- Tnames = ['Ranked Marker MAF', 'Ranked Marker Missing Genotype', 'Ranked Marker HWE', 'Ranked Marker Mendel']
- Tsorts = [3,6,10,11]
- Treverse = [False,True,True,True] # so first values are worse(r)
- #res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
- #rhead = ['snp','chrom','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
- # -------------------hwe--------------------------
- # hwe has SNP TEST GENO O(HET) E(HET) P_HWD
- # we want all hwe where P_HWD <> NA
- # ah changed in 1.04 to
- ## CHR SNP TEST A1 A2 GENO O(HET) E(HET) P
- ## 1 rs6671164 ALL 2 3 34/276/613 0.299 0.3032 0.6644
- ## 1 rs6671164 AFF 2 3…