PageRenderTime 142ms CodeModel.GetById 14ms app.highlight 115ms RepoModel.GetById 1ms app.codeStats 1ms

/tools/rgenetics/rgQC.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 1354 lines | 1284 code | 11 blank | 59 comment | 5 complexity | 60c9484f372fa7d2faad43140faf0631 MD5 | raw file

Large files files are truncated, but you can click here to view the full file

   1# oct 15 rpy replaced - temp fix until we get gnuplot working 
   2# rpy deprecated - replace with RRun
   3# fixes to run functional test! oct1 2009
   4# needed to expand our path with os.path.realpath to get newpath working with
   5# all the fancy pdfnup stuff
   6# and a fix to pruneld to write output to where it should be
   7# smallish data in test-data/smallwga in various forms
   8# python ../tools/rgenetics/rgQC.py -i smallwga -o smallwga -s smallwga/smallwga.html -p smallwga
   9# child files are deprecated and broken as at july 19 2009
  10# need to move them to the html file extrafiles path
  11# found lots of corner cases with some illumina data where cnv markers were
  12# included
  13# and where affection status was all missing !
  14# added links to tab files showing worst 1/keepfrac markers and subjects
  15# ross lazarus january 2008
  16#
  17# added named parameters
  18# to ensure no silly slippages if non required parameter in the most general case
  19# some potentially useful things here reusable in complex scripts
  20# with lots'o'html (TM)
  21# aug 17 2007 rml
  22#
  23# added marker and subject and parenting april 14 rml
  24# took a while to get the absolute paths right for all the file munging
  25# as of april 16 seems to work..
  26# getting galaxy to serve images in html reports is a little tricky
  27# we don't want QC reports to be dozens of individual files, so need
  28# to use the url /static/rg/... since galaxy's web server will happily serve images
  29# from there
  30# galaxy passes output files as relative paths
  31# these have to be munged by rgQC.py before calling this
  32# galaxy will pass in 2 file names - one for the log
  33# and one for the final html report
  34# of the form './database/files/dataset_66.dat'
  35# we need to be working in that directory so our plink output files are there
  36# so these have to be munged by rgQC.py before calling this
  37# note no ped file passed so had to remove the -l option
  38# for plinkParse.py that makes a heterozygosity report from the ped
  39# file - needs fixing...
  40# new: importing manhattan/qqplot plotter
  41# def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir):
  42#    """ draw a qq for pvals and a manhattan plot if chrom/offset <> 0
  43#    contains some R scripts as text strings - we substitute defaults into the calls
  44#    to make them do our bidding - and save the resulting code for posterity
  45#    this can be called externally, I guess...for QC eg?
  46#    """
  47#
  48#    rcmd = '%s%s' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey))
  49#    rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
  50#    return rlog,flist
  51  
  52
  53from optparse import OptionParser
  54
  55import sys,os,shutil, glob, math, subprocess, time, operator, random, tempfile, copy, string
  56from os.path import abspath
  57from rgutils import galhtmlprefix, galhtmlpostfix, RRun, timenow, plinke, rexe, runPlink, pruneLD
  58import rgManQQ
  59
  60prog = os.path.split(sys.argv[0])[1]
  61vers = '0.4 april 2009 rml'
  62idjoiner = '_~_~_' # need something improbable..
  63# many of these may need fixing for a new install
  64
  65myversion = vers
  66keepfrac = 20 # fraction to keep after sorting by each interesting value
  67
  68missvals = {'0':'0','N':'N','-9':'-9','-':'-'} # fix me if these change!
  69
  70mogresize = "x300" # this controls the width for jpeg thumbnails
  71
  72
  73
  74            
  75def makePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='20',nup=3,height=10,width=8,rgbin=''):
  76    """
  77    marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
  78    'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
  79    subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
  80    """
  81
  82        
  83    def rHist(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=50):
  84        """   rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
  85        # generic histogram and vertical boxplot in a 3:1 layout
  86        # returns the graphic file name for inclusion in the web page
  87        # broken out here for reuse
  88        # ross lazarus march 2007
  89        """
  90        screenmat = (1,2,1,2) # create a 2x2 cabvas
  91        widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
  92        rpy.r.pdf( outfname, height , width  )
  93        #rpy.r.layout(rpy.r.matrix(rpy.r.c(1,1,1,2), 1, 4, byrow = True)) # 3 to 1 vertical plot
  94        m = rpy.r.matrix((1,1,1,2),nrow=1,ncol=4,byrow=True)
  95        # in R, m = matrix(c(1,2),nrow=1,ncol=2,byrow=T)
  96        rpy.r("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))") # 4 to 1 vertical plot
  97        maint = 'QC for %s - %s' % (basename,title)
  98        rpy.r.hist(plotme,main=maint, xlab=xlabname,breaks=nbreaks,col="maroon",cex=0.8)
  99        rpy.r.boxplot(plotme,main='',col="maroon",outline=False)
 100        rpy.r.dev_off()
 101
 102    def rCum(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=100):
 103        """
 104        Useful to see what various cutoffs yield - plot percentiles
 105        """
 106        n = len(plotme)
 107        maxveclen = 1000.0 # for reasonable pdf sizes!
 108        yvec = copy.copy(plotme)
 109        # arrives already in decending order of importance missingness or mendel count by subj or marker
 110        xvec = range(n)
 111        xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
 112        # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
 113        if n > maxveclen: # oversample part of the distribution
 114            always = min(1000,n/20) # oversample smaller of lowest few hundred items or 5%
 115            skip = int(n/maxveclen) # take 1 in skip to get about maxveclen points
 116            samplei = [i for i in range(n) if (i % skip == 0) or (i < always)] # always oversample first sorted values
 117            yvec = [yvec[i] for i in samplei] # always get first and last
 118            xvec = [xvec[i] for i in samplei] # always get first and last
 119        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
 120        rpy.r.pdf( outfname, height , width  )
 121        maint = 'QC for %s - %s' % (basename,title)
 122        rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
 123        rpy.r.plot(xvec,yvec,type='p',main=maint, ylab=xlabname, xlab='Sample Percentile',col="maroon",cex=0.8)
 124        rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
 125        rpy.r.dev_off()
 126
 127    def rQQ(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
 128        """
 129        y is data for a qq plot and ends up on the x axis go figure
 130        if sampling, oversample low values - all the top 1% ?
 131        this version called with -log10 transformed hwe
 132        """
 133        nrows = len(plotme)
 134        fn = float(nrows)
 135        xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
 136        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
 137        maxveclen = 3000
 138        yvec = copy.copy(plotme)
 139        if nrows > maxveclen:
 140            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
 141            # oversample part of the distribution
 142            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
 143            skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
 144            samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
 145            # always oversample first sorted (here lowest) values
 146            yvec = [yvec[i] for i in samplei] # always get first and last
 147            xvec = [xvec[i] for i in samplei] # and sample xvec same way
 148            maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
 149        else:
 150            maint='Log QQ Plot(n=%d)' % (nrows)
 151        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
 152        ylab = '%s' % xlabname
 153        xlab = '-log10(Uniform 0-1)'
 154        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
 155        rpy.r.pdf( outfname, height , width  )
 156        rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
 157        rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
 158        rpy.r.points(mx,mx,type='l')
 159        rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
 160        rpy.r.dev_off()
 161
 162    def rMultiQQ(plotme = [],nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
 163        """
 164        data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
 165        grid of qq plots to show departure from null at extremes of data quality
 166        Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
 167        and ends up on the x axis go figure
 168        if sampling, oversample low values - all the top 1% ?
 169        """
 170        data = copy.copy(plotme)
 171        nvals = len(data)
 172        stepsize = nvals/nsplits
 173        logstep = math.log10(stepsize) # so is 3 for steps of 1000
 174        quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
 175        data.sort(key=itertools.itemgetter(1)) # into x order
 176        rpy.r.pdf( outfname, height , width  )
 177        rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
 178        yvec = [-math.log10(random.random()) for x in range(stepsize)]
 179        yvec.sort() # size of each step is expected range for xvec under null?!
 180        for rowstart in quints:
 181            rowend = rowstart + stepsize
 182            if nvals - rowend < stepsize: # finish last split
 183                rowend = nvals
 184            row = data[rowstart:rowend]
 185            row.sort(key=itertools.itemgetter(2)) # into y order
 186            for colstart in quints:
 187                colend = colstart + stepsize
 188                if nvals - colend < stepsize: # finish last split
 189                    colend = nvals
 190                cell = row[colstart:colend]
 191                xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
 192                rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
 193                rpy.r.points(c(0,logstep),c(0,logstep),type='l')
 194        rpy.r.dev_off()
 195
 196
 197    def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
 198        """
 199        y is data for a qqnorm plot
 200        if sampling, oversample low values - all the top 1% ?
 201        """
 202        rangeunif = len(plotme)
 203        nunif = 1000
 204        maxveclen = 3000
 205        nrows = len(plotme)
 206        data = copy.copy(plotme)
 207        if nrows > maxveclen:
 208            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
 209            # oversample part of the distribution
 210            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
 211            skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
 212            samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
 213            # always oversample first sorted (here lowest) values
 214            yvec = [data[i] for i in samplei] # always get first and last
 215            maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
 216        else:
 217            yvec = data
 218            maint='Log QQ Plot(n=%d)' % (nrows)
 219        n = 1000
 220        ylab = '%s' % xlabname
 221        xlab = 'Normal'
 222        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
 223        rpy.r.pdf( outfname, height , width  )
 224        rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
 225        rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
 226        rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
 227        rpy.r.dev_off()
 228
 229    def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
 230        """
 231        layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
 232        like the GAIN qc tools
 233        y is data for a qq plot and ends up on the x axis go figure
 234        if sampling, oversample low values - all the top 1% ?
 235        """
 236        rangeunif = len(plotme)
 237        nunif = 1000
 238        fn = float(rangeunif)
 239        xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
 240        skip = max(int(rangeunif/fn),1)
 241        # force include last points
 242        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
 243        maxveclen = 2000
 244        nrows = len(plotme)
 245        data = copy.copy(plotme)
 246        data.sort() # low to high - oversample low values
 247        if nrows > maxveclen:
 248            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
 249            # oversample part of the distribution
 250            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
 251            skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
 252            samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
 253            # always oversample first sorted (here lowest) values
 254            yvec = [data[i] for i in samplei] # always get first and last
 255            xvec = [xvec[i] for i in samplei] # and sample xvec same way
 256            maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
 257        else:
 258            yvec = data
 259            maint='Log QQ Plot(n=%d)' % (nrows)
 260        n = 1000
 261        mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
 262        ylab = '%s' % xlabname
 263        xlab = '-log10(Uniform 0-1)'
 264        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
 265        rpy.r.pdf( outfname, height , width  )
 266        rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
 267        rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
 268        rpy.r.points(mx,mx,type='l')
 269        rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
 270        rpy.r.dev_off()
 271
 272
 273    fdsto,stofile = tempfile.mkstemp()
 274    sto = open(stofile,'w')
 275    import rpy # delay to avoid rpy stdout chatter replacing galaxy file blurb
 276    mog = 'mogrify'
 277    pdfnup = 'pdfnup'
 278    pdfjoin = 'pdfjoin'
 279    shead = subjects.pop(0) # get rid of head
 280    mhead = markers.pop(0)
 281    maf = mhead.index('maf')
 282    missfrac = mhead.index('missfrac')
 283    logphweall = mhead.index('logp_hwe_all')
 284    logphweunaff = mhead.index('logp_hwe_unaff')
 285    # check for at least some unaffected rml june 2009
 286    m_mendel = mhead.index('N_Mendel')
 287    fracmiss = shead.index('FracMiss')
 288    s_mendel = shead.index('Mendel_errors')
 289    s_het = shead.index('F_Stat')
 290    params = {}
 291    hweres = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
 292         and x[logphweunaff].upper() <> 'NA']
 293    if len(hweres) <> 0:
 294        xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
 295        # plot for each of these cols
 296    else: # try hwe all instead - maybe no affection status available
 297        xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
 298    ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
 299    oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
 300    qqplotme = [1,0,0,0,0,0,0] #
 301    qnplotme = [0,0,0,0,0,0,1] #
 302    nplots = len(xs)
 303    xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
 304                 'Marker Mendel Error Count', 'Missing Rate: Subjects',
 305                 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
 306    plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
 307    ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
 308    ordplotnames = ['%s_cum' % x for x in plotnames]
 309    ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
 310    outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
 311    ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
 312    datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
 313    titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
 314        "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
 315    html = []
 316    pdflist = []
 317    for n,column in enumerate(xs):
 318        dat = [float(x[column]) for x in datasources[n] if len(x) >= column
 319               and x[column][:2].upper() <> 'NA'] # plink gives both!
 320        if sum(dat) <> 0: # eg nada for mendel if case control?
 321            rHist(plotme=dat,outfname=outfnames[n],xlabname=xlabnames[n],
 322              title=titles[n],basename=basename,nbreaks=nbreaks)
 323            row = [titles[n],ploturls[n],outfnames[n] ]
 324            html.append(row)
 325            pdflist.append(outfnames[n])
 326            if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
 327                otitle = 'Ranked %s' % titles[n]
 328                dat.sort()
 329                if oreverseme[n]:
 330                    dat.reverse()
 331                rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
 332                  title=otitle,basename=basename,nbreaks=1000)
 333                row = [otitle,ordploturls[n],ordoutfnames[n]]
 334                html.append(row)
 335                pdflist.append(ordoutfnames[n])
 336            if qqplotme[n]: #
 337                otitle = 'LogQQ plot %s' % titles[n]
 338                dat.sort()
 339                dat.reverse()
 340                ofn = os.path.split(ordoutfnames[n])
 341                ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
 342                ofu = os.path.split(ordploturls[n])
 343                ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
 344                rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
 345                  title=otitle,basename=basename)
 346                row = [otitle,ofu,ofn]
 347                html.append(row)
 348                pdflist.append(ofn)
 349            elif qnplotme[n]:
 350                otitle = 'F Statistic %s' % titles[n]
 351                dat.sort()
 352                dat.reverse()
 353                ofn = os.path.split(ordoutfnames[n])
 354                ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
 355                ofu = os.path.split(ordploturls[n])
 356                ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
 357                rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
 358                  title=otitle,basename=basename)
 359                row = [otitle,ofu,ofn]
 360                html.append(row)
 361                pdflist.append(ofn)
 362        else:
 363            print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
 364    if nup>0:
 365        # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
 366        # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
 367        filestojoin = ' '.join(pdflist) # all the file names so far
 368        afname = '%s_All_Paged.pdf' % (basename)
 369        fullafname = os.path.join(newfpath,afname)
 370        expl = 'All %s QC Plots joined into a single pdf' % basename
 371        vcl = '%s %s --outfile %s ' % (pdfjoin,filestojoin, fullafname)
 372        # make single page pdf
 373        x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
 374        retval = x.wait()
 375        row = [expl,afname,fullafname]
 376        html.insert(0,row) # last rather than second
 377        nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
 378        fullnfname = os.path.join(newfpath,nfname)
 379        expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
 380        vcl = '%s %s --nup %dx%d --frame true --outfile %s' % (pdfnup,afname,nup,nup,fullnfname)
 381        # make thumbnail images
 382        x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
 383        retval = x.wait()
 384        row = [expl,nfname,fullnfname]
 385        html.insert(1,row) # this goes second
 386    vcl = '%s -format jpg -resize %s %s' % (mog, mogresize, os.path.join(newfpath,'*.pdf'))
 387    # make thumbnail images
 388    x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
 389    retval = x.wait()
 390    sto.close()
 391    cruft = open(stofile,'r').readlines()
 392    return html,cruft # elements for an ordered list of urls or whatever..  
 393
 394
 395def RmakePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='100',nup=3,height=8,width=10,rexe=''):
 396    """
 397    nice try but the R scripts are huge and take forever to run if there's a lot of data
 398    marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
 399    'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
 400    subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
 401    """
 402    colour = "maroon"
 403        
 404    def rHist(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=nbreaks):
 405        """   rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
 406        # generic histogram and vertical boxplot in a 3:1 layout
 407        # returns the graphic file name for inclusion in the web page
 408        # broken out here for reuse
 409        # ross lazarus march 2007
 410        """
 411        R = []
 412        maint = 'QC for %s - %s' % (basename,title)
 413        screenmat = (1,2,1,2) # create a 2x2 canvas
 414        widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
 415        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
 416        R.append("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))")
 417        R.append("plotme = read.table(file='%s',head=F,sep='\t')" % plotme)
 418        R.append('hist(plotme, main="%s",xlab="%s",breaks=%d,col="%s")' % (maint,xlabname,nbreaks,colour))
 419        R.append('boxplot(plotme,main="",col="%s",outline=F)' % (colour) )
 420        R.append('dev.off()')
 421        return R
 422        
 423    def rCum(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=100):
 424        """
 425        Useful to see what various cutoffs yield - plot percentiles
 426        """
 427        R = []
 428        n = len(plotme)
 429        R.append("plotme = read.table(file='%s',head=T,sep='\t')" % plotme)
 430        # arrives already in decending order of importance missingness or mendel count by subj or marker
 431        maint = 'QC for %s - %s' % (basename,title)
 432        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
 433        R.append("par(lab=c(10,10,10))")
 434        R.append('plot(plotme$xvec,plotme$yvec,type="p",main="%s",ylab="%s",xlab="Sample Percentile",col="%s")' % (maint,xlabname,colour))
 435        R.append('dev.off()')
 436        return R
 437
 438    def rQQ(plotme='', outfname='fname',title='title',xlabname='Sample',basename=''):
 439        """
 440        y is data for a qq plot and ends up on the x axis go figure
 441        if sampling, oversample low values - all the top 1% ?
 442        this version called with -log10 transformed hwe
 443        """
 444        R = []
 445        nrows = len(plotme)
 446        fn = float(nrows)
 447        xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
 448        #mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
 449        maxveclen = 3000
 450        yvec = copy.copy(plotme)
 451        if nrows > maxveclen:
 452            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
 453            # oversample part of the distribution
 454            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
 455            skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
 456            if skip < 2:
 457                skip = 2
 458            samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
 459            # always oversample first sorted (here lowest) values
 460            yvec = [yvec[i] for i in samplei] # always get first and last
 461            xvec = [xvec[i] for i in samplei] # and sample xvec same way
 462            maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
 463        else:
 464            maint='Log QQ Plot(n=%d)' % (nrows)
 465        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
 466        ylab = '%s' % xlabname
 467        xlab = '-log10(Uniform 0-1)'
 468        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
 469        x = ['%f' % x for x in xvec]
 470        R.append('xvec = c(%s)' % ','.join(x))
 471        y = ['%f' % x for x in yvec]
 472        R.append('yvec = c(%s)' % ','.join(y))
 473        R.append('mx = c(0,%f)' % (math.log10(fn)))
 474        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
 475        R.append("par(lab=c(10,10,10))")
 476        R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
 477        R.append('points(mx,mx,type="l")')
 478        R.append('grid(col="lightgray",lty="dotted")')
 479        R.append('dev.off()')
 480        return R
 481
 482    def rMultiQQ(plotme = '',nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
 483        """
 484        data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
 485        grid of qq plots to show departure from null at extremes of data quality
 486        Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
 487        and ends up on the x axis go figure
 488        if sampling, oversample low values - all the top 1% ?
 489        """
 490        data = copy.copy(plotme)
 491        nvals = len(data)
 492        stepsize = nvals/nsplits
 493        logstep = math.log10(stepsize) # so is 3 for steps of 1000
 494        R.append('mx = c(0,%f)' % logstep)
 495        quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
 496        data.sort(key=itertools.itemgetter(1)) # into x order
 497        #rpy.r.pdf( outfname, h , w  )
 498        #rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
 499        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
 500        yvec = [-math.log10(random.random()) for x in range(stepsize)]
 501        yvec.sort() # size of each step is expected range for xvec under null?!
 502        y = ['%f' % x for x in yvec]
 503        R.append('yvec = c(%s)' % ','.join(y))
 504        for rowstart in quints:
 505            rowend = rowstart + stepsize
 506            if nvals - rowend < stepsize: # finish last split
 507                rowend = nvals
 508            row = data[rowstart:rowend]
 509            row.sort(key=itertools.itemgetter(2)) # into y order
 510            for colstart in quints:
 511                colend = colstart + stepsize
 512                if nvals - colend < stepsize: # finish last split
 513                    colend = nvals
 514                cell = row[colstart:colend]
 515                xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
 516                x = ['%f' % x for x in xvec]
 517                R.append('xvec = c(%s)' % ','.join(x))
 518                R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
 519                R.append('points(mx,mx,type="l")')
 520                R.append('grid(col="lightgray",lty="dotted")')
 521                #rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
 522                #rpy.r.points(c(0,logstep),c(0,logstep),type='l')
 523        R.append('dev.off()')
 524        #rpy.r.dev_off()
 525        return R
 526
 527
 528    def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
 529        """
 530        y is data for a qqnorm plot
 531        if sampling, oversample low values - all the top 1% ?
 532        """
 533        rangeunif = len(plotme)
 534        nunif = 1000
 535        maxveclen = 3000
 536        nrows = len(plotme)
 537        data = copy.copy(plotme)
 538        if nrows > maxveclen:
 539            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
 540            # oversample part of the distribution
 541            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
 542            skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
 543            samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
 544            # always oversample first sorted (here lowest) values
 545            yvec = [data[i] for i in samplei] # always get first and last
 546            maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
 547        else:
 548            yvec = data
 549            maint='Log QQ Plot(n=%d)' % (nrows)
 550        n = 1000
 551        ylab = '%s' % xlabname
 552        xlab = 'Normal'
 553        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
 554        #rpy.r.pdf( outfname, h , w  )
 555        #rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
 556        #rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
 557        #rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
 558        #rpy.r.dev_off()
 559        y = ['%f' % x for x in yvec]
 560        R.append('yvec = c(%s)' % ','.join(y))
 561        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
 562        R.append("par(lab=c(10,10,10))")
 563        R.append('qqnorm(yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
 564        R.append('grid(col="lightgray",lty="dotted")')
 565        R.append('dev.off()')
 566        return R
 567
 568    def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
 569        """
 570        layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
 571        like the GAIN qc tools
 572        y is data for a qq plot and ends up on the x axis go figure
 573        if sampling, oversample low values - all the top 1% ?
 574        """
 575        rangeunif = len(plotme)
 576        nunif = 1000
 577        fn = float(rangeunif)
 578        xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
 579        skip = max(int(rangeunif/fn),1)
 580        # force include last points
 581        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
 582        maxveclen = 2000
 583        nrows = len(plotme)
 584        data = copy.copy(plotme)
 585        data.sort() # low to high - oversample low values
 586        if nrows > maxveclen:
 587            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
 588            # oversample part of the distribution
 589            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
 590            skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
 591            samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
 592            # always oversample first sorted (here lowest) values
 593            yvec = [data[i] for i in samplei] # always get first and last
 594            xvec = [xvec[i] for i in samplei] # and sample xvec same way
 595            maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
 596        else:
 597            yvec = data
 598            maint='Log QQ Plot(n=%d)' % (nrows)
 599        n = 1000
 600        mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
 601        ylab = '%s' % xlabname
 602        xlab = '-log10(Uniform 0-1)'
 603        R.append('mx = c(0,%f)' % (math.log10(fn)))
 604        x = ['%f' % x for x in xvec]
 605        R.append('xvec = c(%s)' % ','.join(x))
 606        y = ['%f' % x for x in yvec]
 607        R.append('yvec = c(%s)' % ','.join(y))
 608        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
 609        R.append("par(lab=c(10,10,10))")
 610        R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
 611        R.append('points(mx,mx,type="l")')
 612        R.append('grid(col="lightgray",lty="dotted")')
 613        R.append('dev.off()')
 614
 615
 616    shead = subjects.pop(0) # get rid of head
 617    mhead = markers.pop(0)
 618    maf = mhead.index('maf')
 619    missfrac = mhead.index('missfrac')
 620    logphweall = mhead.index('logp_hwe_all')
 621    logphweunaff = mhead.index('logp_hwe_unaff')
 622    # check for at least some unaffected rml june 2009
 623    m_mendel = mhead.index('N_Mendel')
 624    fracmiss = shead.index('FracMiss')
 625    s_mendel = shead.index('Mendel_errors')
 626    s_het = shead.index('F_Stat')
 627    params = {}
 628    h = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
 629         and x[logphweunaff].upper() <> 'NA']
 630    if len(h) <> 0:
 631        xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
 632        # plot for each of these cols
 633    else: # try hwe all instead - maybe no affection status available
 634        xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
 635    ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
 636    oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
 637    qqplotme = [1,0,0,0,0,0,0] #
 638    qnplotme = [0,0,0,0,0,0,1] #
 639    nplots = len(xs)
 640    xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
 641                 'Marker Mendel Error Count', 'Missing Rate: Subjects',
 642                 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
 643    plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
 644    ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
 645    ordplotnames = ['%s_cum' % x for x in plotnames]
 646    ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
 647    outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
 648    ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
 649    datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
 650    titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
 651        "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
 652    html = []
 653    pdflist = []
 654    R = []
 655    for n,column in enumerate(xs):
 656        dfn = '%d_%s.txt' % (n,titles[n])
 657        dfilepath = os.path.join(newfpath,dfn)
 658        dat = [float(x[column]) for x in datasources[n] if len(x) >= column
 659               and x[column][:2].upper() <> 'NA'] # plink gives both!
 660        if sum(dat) <> 0: # eg nada for mendel if case control?
 661            plotme = file(dfilepath,'w')
 662            plotme.write('\n'.join(['%f' % x for x in dat])) # pass as a file - copout!
 663            tR = rHist(plotme=dfilepath,outfname=outfnames[n],xlabname=xlabnames[n],
 664              title=titles[n],basename=basename,nbreaks=nbreaks)
 665            R += tR
 666            row = [titles[n],ploturls[n],outfnames[n] ]
 667            html.append(row)
 668            pdflist.append(outfnames[n])
 669            if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
 670                otitle = 'Ranked %s' % titles[n]
 671                dat.sort()
 672                if oreverseme[n]:
 673                    dat.reverse()
 674                    ndat = len(dat)
 675                    xvec = range(ndat)
 676                    xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
 677                    # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
 678                    maxveclen = 1000.0 # for reasonable pdf sizes!
 679                    if ndat > maxveclen: # oversample part of the distribution
 680                        always = min(1000,ndat/20) # oversample smaller of lowest few hundred items or 5%
 681                        skip = int(ndat/maxveclen) # take 1 in skip to get about maxveclen points
 682                        samplei = [i for i in range(ndat) if (i % skip == 0) or (i < always)] # always oversample first sorted values
 683                        yvec = [yvec[i] for i in samplei] # always get first and last
 684                        xvec = [xvec[i] for i in samplei] # always get first and last
 685                        plotme = file(dfilepath,'w')
 686                        plotme.write('xvec\tyvec\n')
 687                        plotme.write('\n'.join(['%f\t%f' % (xvec[i],y) for y in yvec])) # pass as a file - copout!
 688                tR = rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
 689                  title=otitle,basename=basename,nbreaks=nbreaks)
 690                R += tR
 691                row = [otitle,ordploturls[n],ordoutfnames[n]]
 692                html.append(row)
 693                pdflist.append(ordoutfnames[n])
 694            if qqplotme[n]: #
 695                otitle = 'LogQQ plot %s' % titles[n]
 696                dat.sort()
 697                dat.reverse()
 698                ofn = os.path.split(ordoutfnames[n])
 699                ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
 700                ofu = os.path.split(ordploturls[n])
 701                ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
 702                tR = rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
 703                  title=otitle,basename=basename)
 704                R += tR
 705                row = [otitle,ofu,ofn]
 706                html.append(row)
 707                pdflist.append(ofn)
 708            elif qnplotme[n]:
 709                otitle = 'F Statistic %s' % titles[n]
 710                dat.sort()
 711                dat.reverse()
 712                ofn = os.path.split(ordoutfnames[n])
 713                ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
 714                ofu = os.path.split(ordploturls[n])
 715                ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
 716                tR = rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
 717                  title=otitle,basename=basename)
 718                R += tR
 719                row = [otitle,ofu,ofn]
 720                html.append(row)
 721                pdflist.append(ofn)
 722        else:
 723            print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
 724    rlog,flist = RRun(rcmd=R,title='makeQCplots',outdir=newfpath)
 725    if nup>0:
 726        # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
 727        # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
 728        filestojoin = ' '.join(pdflist) # all the file names so far
 729        afname = '%s_All_Paged.pdf' % (basename)
 730        fullafname = os.path.join(newfpath,afname)
 731        expl = 'All %s QC Plots joined into a single pdf' % basename
 732        vcl = 'pdfjoin %s --outfile %s ' % (filestojoin, fullafname)
 733        # make single page pdf
 734        x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
 735        retval = x.wait()
 736        row = [expl,afname,fullafname]
 737        html.insert(0,row) # last rather than second
 738        nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
 739        fullnfname = os.path.join(newfpath,nfname)
 740        expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
 741        vcl = 'pdfnup %s --nup %dx%d --frame true --outfile %s' % (afname,nup,nup,fullnfname)
 742        # make thumbnail images
 743        x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
 744        retval = x.wait()
 745        row = [expl,nfname,fullnfname]
 746        html.insert(1,row) # this goes second
 747    vcl = 'mogrify -format jpg -resize %s %s' % (mogresize, os.path.join(newfpath,'*.pdf'))
 748    # make thumbnail images
 749    x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
 750    retval = x.wait()
 751    return html # elements for an ordered list of urls or whatever..
 752
 753def countHet(bedf='fakeped_500000',linkageped=True,froot='fake500k',outfname="ahetf",logf='rgQC.log'):
 754    """
 755    NO LONGER USED - historical interest
 756    count het loci for each subject to look for outliers = ? contamination
 757    assume ped file is linkage format
 758    need to make a ped file from the bed file we were passed
 759    """
 760    vcl = [plinke,'--bfile',bedf,'--recode','--out','%s_recode' % froot] # write a recoded ped file from the real .bed file
 761    p=subprocess.Popen(' '.join(vcl),shell=True)
 762    retval = p.wait()
 763    f = open('%s_recode.recode.ped' % froot,'r')
 764    if not linkageped:
 765        head = f.next() # throw away header
 766    hets = [] # simple count of het loci per subject. Expect poisson?
 767    n = 1
 768    for l in f:
 769        n += 1
 770        ll = l.strip().split()
 771        if len(ll) > 6:
 772            iid = idjoiner.join(ll[:2]) # fam_iid
 773            gender = ll[4]
 774            alleles = ll[6:]
 775            nallele = len(alleles)
 776            nhet = 0
 777            for i in range(nallele/2):
 778                a1=alleles[2*i]
 779                a2=alleles[2*i+1]
 780                if alleles[2*i] <> alleles[2*i+1]: # must be het
 781                    if not missvals.get(a1,None) and not missvals.get(a2,None):
 782                        nhet += 1
 783            hets.append((nhet,iid,gender)) # for sorting later
 784    f.close()
 785    hets.sort()
 786    hets.reverse() # biggest nhet now on top
 787    f = open(outfname ,'w')
 788    res = ['%d\t%s\t%s' % (x) for x in hets] # I love list comprehensions
 789    res.insert(0,'nhetloci\tfamid_iid\tgender')
 790    res.append('')
 791    f.write('\n'.join(res))
 792    f.close()
 793
 794
 795
 796def subjectRep(froot='cleantest',outfname="srep",newfpath='.',logf = None):
 797    """by subject (missingness = .imiss, mendel = .imendel)
 798    assume replicates have an underscore in family id for
 799    hapmap testing
 800    For sorting, we need floats and integers
 801    """
 802    isexfile = '%s.sexcheck' % froot
 803    imissfile = '%s.imiss' % froot
 804    imendfile = '%s.imendel' % froot
 805    ihetfile = '%s.het' % froot
 806    logf.write('## subject reports starting at %s\n' % timenow())
 807    outfile = os.path.join(newfpath,outfname)
 808    idlist = []
 809    imissdict = {}
 810    imenddict = {}
 811    isexdict = {}
 812    ihetdict = {}
 813    Tops = {}
 814    Tnames = ['Ranked Subject Missing Genotype', 'Ranked Subject Mendel',
 815              'Ranked Sex check', 'Ranked Inbreeding (het) F statistic']
 816    Tsorts = [2,3,6,8]
 817    Treverse = [True,True,True,False] # so first values are worser
 818    #rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
 819    ##              FID            IID MISS_PHENO   N_MISS   N_GENO   F_MISS
 820    ##  1552042370_A   1552042370_A          N     5480   549883 0.009966
 821    ##  1552042410_A   1552042410_A          N     1638   549883 0.002979
 822 
 823    # ------------------missing--------------------
 824    # imiss has FID  IID MISS_PHENO N_MISS  F_MISS
 825    # we want F_MISS
 826    try:
 827        f = open(imissfile,'r')
 828    except:
 829        logf.write('# file %s is missing - talk about irony\n' % imissfile)
 830        f = None
 831    if f:
 832        for n,line in enumerate(f):
 833            ll = line.strip().split()
 834            if n == 0:
 835                head = [x.upper() for x in ll] # expect above                
 836                fidpos = head.index('FID')
 837                iidpos = head.index('IID')
 838                fpos = head.index('F_MISS')
 839            elif len(ll) >= fpos: # full line
 840                fid = ll[fidpos]
 841                #if fid.find('_') == -1: # not replicate! - icondb ids have these...
 842                iid = ll[iidpos]
 843                fmiss = ll[fpos]
 844                id = '%s%s%s' % (fid,idjoiner,iid)
 845                imissdict[id] = fmiss
 846                idlist.append(id)
 847        f.close()
 848    logf.write('## imissfile %s contained %d ids\n' % (imissfile,len(idlist)))
 849    # ------------------mend-------------------
 850    # *.imendel has FID  IID   N
 851    # we want N
 852    gotmend = True
 853    try:
 854        f = open(imendfile,'r')
 855    except:
 856        gotmend = False
 857        for id in idlist:
 858            imenddict[id] = '0'
 859    if gotmend:
 860        for n,line in enumerate(f):
 861            ll = line.strip().split()
 862            if n == 0:
 863                head = [x.upper() for x in ll] # expect above                
 864                npos = head.index('N')
 865                fidpos = head.index('FID')
 866                iidpos = head.index('IID')
 867            elif len(ll) >= npos: # full line
 868                fid = ll[fidpos]
 869                iid = ll[iidpos]
 870                id = '%s%s%s' % (fid,idjoiner,iid)
 871                nmend = ll[npos]
 872                imenddict[id] = nmend
 873        f.close()
 874    else:
 875        logf.write('## error No %s file - assuming not family data\n' % imendfile)
 876    # ------------------sex check------------------
 877    #[rerla@hg fresh]$ head /home/rerla/fresh/database/files/dataset_978_files/CAMP2007Dirty.sexcheck
 878    # sexcheck has FID IID PEDSEX SNPSEX STATUS F
 879    ##
 880    ##     FID     Family ID
 881    ##     IID     Individual ID
 882    ##     PEDSEX  Sex as determined in pedigree file (1=male, 2=female)
 883    ##     SNPSEX  Sex as determined by X chromosome
 884    ##     STATUS  Displays "PROBLEM" or "OK" for each individual
 885    ##     F       The actual X chromosome inbreeding (homozygosity) estimate
 886    ##
 887    ##    A PROBLEM arises if the two sexes do not match, or if the SNP data or pedigree data are
 888    ##    ambiguous with regard to sex.
 889    ##    A male call is made if F is more than 0.8; a femle call is made if F is less than 0.2.
 890    try:
 891        f = open(isexfile,'r')
 892        got_sexcheck = 1
 893    except:
 894        got_sexcheck = 0
 895    if got_sexcheck:
 896        for n,line in enumerate(f):
 897            ll = line.strip().split()
 898            if n == 0:
 899                head = [x.upper() for x in ll] # expect above                
 900                fidpos = head.index('FID')
 901                iidpos = head.index('IID')
 902                pedsexpos = head.index('PEDSEX')
 903                snpsexpos = head.index('SNPSEX')
 904                statuspos = head.index('STATUS')
 905                fpos = head.index('F')
 906            elif len(ll) >= fpos: # full line
 907                fid = ll[fidpos]
 908                iid = ll[iidpos]
 909                fest = ll[fpos]
 910                pedsex = ll[pedsexpos]
 911                snpsex = ll[snpsexpos]
 912                stat = ll[statuspos]
 913                id = '%s%s%s' % (fid,idjoiner,iid)
 914                isexdict[id] = (pedsex,snpsex,stat,fest)
 915        f.close()
 916    else:
 917        # this only happens if there are no subjects!
 918        logf.write('No %s file - assuming no sex errors' % isexfile)
 919    ##
 920    ##    FID  IID       O(HOM)       E(HOM)        N(NM)            F
 921    ##    457    2       490665    4.928e+05       722154    -0.009096
 922    ##    457    3       464519     4.85e+05       710986      -0.0908
 923    ##   1037    2       461632    4.856e+05       712025       -0.106
 924    ##   1037    1       491845    4.906e+05       719353     0.005577
 925    try:
 926        f = open(ihetfile,'r')
 927    except:
 928        f = None
 929        logf.write('## No %s file - did we run --het in plink?\n' % ihetfile)
 930    if f:
 931        for i,line in enumerate(f):
 932            ll = line.strip().split()
 933            if i == 0:
 934                head = [x.upper() for x in ll] # expect above                
 935                fidpos = head.index('FID')
 936                iidpos = head.index('IID')
 937                fpos = head.index('F')
 938                n = 0
 939            elif len(ll) >= fpos: # full line
 940                fid = ll[fidpos]            
 941                iid = ll[iidpos]
 942                fhet = ll[fpos]
 943                id = '%s%s%s' % (fid,idjoiner,iid)
 944                ihetdict[id] = fhet
 945        f.close()      # now assemble and output result list
 946    rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','XHomEst','F_Stat']
 947    res = []
 948    fres = [] # floats for sorting
 949    for id in idlist: # for each snp in found order
 950        fid,iid = id.split(idjoiner) # recover keys
 951        f_missing = imissdict.get(id,'0.0')
 952        nmend = imenddict.get(id,'0')
 953        (pedsex,snpsex,status,fest) = isexdict.get(id,('0','0','0','0.0'))
 954        fhet = ihetdict.get(id,'0.0')
 955        res.append([fid,iid,f_missing,nmend,pedsex,snpsex,status,fest,fhet])
 956        try:
 957            ff_missing = float(f_missing)
 958        except:
 959            ff_missing = 0.0
 960        try:
 961            inmend = int(nmend)
 962        except:
 963            inmend = 0
 964        try:
 965            ffest = float(fest)
 966        except:
 967            fest = 0.0
 968        try:
 969            ffhet = float(fhet)
 970        except:
 971            ffhet = 0.0
 972        fres.append([fid,iid,ff_missing,inmend,pedsex,snpsex,status,ffest,ffhet])
 973    ntokeep = max(20,len(res)/keepfrac)
 974    for i,col in enumerate(Tsorts):
 975        fres.sort(key=operator.itemgetter(col))
 976        if Treverse[i]:
 977            fres.reverse()
 978        repname = Tnames[i]
 979        Tops[repname] = fres[0:ntokeep]
 980        Tops[repname] = [map(str,x) for x in Tops[repname]]
 981        Tops[repname].insert(0,rhead)
 982    res.sort()
 983    res.insert(0,rhead)
 984    logf.write('### writing %s report with %s' % ( outfile,res[0]))  
 985    f = open(outfile,'w')
 986    f.write('\n'.join(['\t'.join(x) for x in res]))
 987    f.write('\n')
 988    f.close()
 989    return res,Tops
 990
 991def markerRep(froot='cleantest',outfname="mrep",newfpath='.',logf=None,maplist=None ):
 992    """by marker (hwe = .hwe, missingness=.lmiss, freq = .frq)
 993    keep a list of marker order but keep all stats in dicts
 994    write out a fake xls file for R or SAS etc
 995    kinda clunky, but..
 996    TODO: ensure stable if any file not found?
 997    """
 998    mapdict = {}
 999    if maplist <> None:
1000       rslist = [x[1] for x in maplist]
1001       offset = [(x[0],x[3]) for x in maplist]
1002       mapdict = dict(zip(rslist,offset))
1003    hwefile = '%s.hwe' % froot
1004    lmissfile = '%s.lmiss' % froot
1005    freqfile = '%s.frq' % froot
1006    lmendfile = '%s.lmendel' % froot
1007    outfile = os.path.join(newfpath,outfname)
1008    markerlist = []
1009    chromlist = []
1010    hwedict = {}
1011    lmissdict = {}
1012    freqdict = {}
1013    lmenddict = {}
1014    Tops = {}
1015    Tnames = ['Ranked Marker MAF', 'Ranked Marker Missing Genotype', 'Ranked Marker HWE', 'Ranked Marker Mendel']
1016    Tsorts = [3,6,10,11]
1017    Treverse = [False,True,True,True] # so first values are worse(r)
1018    #res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
1019    #rhead = ['snp','chrom','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
1020    # -------------------hwe--------------------------
1021    #    hwe has SNP TEST  GENO   O(HET)   E(HET) P_HWD
1022    # we want all hwe where P_HWD <> NA
1023    # ah changed in 1.04 to
1024    ##  CHR         SNP     TEST   A1   A2                 GENO   O(HET)   E(HET)            P 
1025    ##   1   rs6671164      ALL    2    3           34/276/613    0.299   0.3032       0.6644
1026    ##   1   rs6671164      AFF    2    3   …

Large files files are truncated, but you can click here to view the full file