
/tools/rgenetics/rgQC.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 1354 lines | 1284 code | 11 blank | 59 comment | 5 complexity | 60c9484f372fa7d2faad43140faf0631 MD5
  1. # oct 15 rpy replaced - temp fix until we get gnuplot working
  2. # rpy deprecated - replace with RRun
  3. # fixes to run functional test! oct1 2009
  4. # needed to expand our path with os.path.realpath to get newpath working with
  5. # all the fancy pdfnup stuff
  6. # and a fix to pruneld to write output to where it should be
  7. # smallish data in test-data/smallwga in various forms
  8. # python ../tools/rgenetics/rgQC.py -i smallwga -o smallwga -s smallwga/smallwga.html -p smallwga
  9. # child files are deprecated and broken as at july 19 2009
  10. # need to move them to the html file extrafiles path
  11. # found lots of corner cases with some illumina data where cnv markers were
  12. # included
  13. # and where affection status was all missing !
  14. # added links to tab files showing worst 1/keepfrac markers and subjects
  15. # ross lazarus january 2008
  16. #
  17. # added named parameters
  18. # to ensure no silly slippages if non required parameter in the most general case
  19. # some potentially useful things here reusable in complex scripts
  20. # with lots'o'html (TM)
  21. # aug 17 2007 rml
  22. #
  23. # added marker and subject and parenting april 14 rml
  24. # took a while to get the absolute paths right for all the file munging
  25. # as of april 16 seems to work..
  26. # getting galaxy to serve images in html reports is a little tricky
  27. # we don't want QC reports to be dozens of individual files, so need
  28. # to use the url /static/rg/... since galaxy's web server will happily serve images
  29. # from there
  30. # galaxy passes output files as relative paths
  31. # these have to be munged by rgQC.py before calling this
  32. # galaxy will pass in 2 file names - one for the log
  33. # and one for the final html report
  34. # of the form './database/files/dataset_66.dat'
  35. # we need to be working in that directory so our plink output files are there
  36. # so these have to be munged by rgQC.py before calling this
  37. # note no ped file passed so had to remove the -l option
  38. # for plinkParse.py that makes a heterozygosity report from the ped
  39. # file - needs fixing...
  40. # new: importing manhattan/qqplot plotter
  41. # def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir):
  42. # """ draw a qq for pvals and a manhattan plot if chrom/offset <> 0
  43. # contains some R scripts as text strings - we substitute defaults into the calls
  44. # to make them do our bidding - and save the resulting code for posterity
  45. # this can be called externally, I guess...for QC eg?
  46. # """
  47. #
  48. # rcmd = '%s%s' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey))
  49. # rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
  50. # return rlog,flist
  51. from optparse import OptionParser
  52. import sys,os,shutil, glob, math, subprocess, time, operator, random, tempfile, copy, string
  53. from os.path import abspath
  54. from rgutils import galhtmlprefix, galhtmlpostfix, RRun, timenow, plinke, rexe, runPlink, pruneLD
  55. import rgManQQ
  56. prog = os.path.split(sys.argv[0])[1]
  57. vers = '0.4 april 2009 rml'
  58. idjoiner = '_~_~_' # need something improbable..
  59. # many of these may need fixing for a new install
  60. myversion = vers
  61. keepfrac = 20 # fraction to keep after sorting by each interesting value
  62. missvals = {'0':'0','N':'N','-9':'-9','-':'-'} # fix me if these change!
  63. mogresize = "x300" # ImageMagick geometry - limits jpeg thumbnail height to 300 pixels
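# rough pipeline (see __main__ below): run plink summary stats, build per-marker and per-subject
# report tables (markerRep/subjectRep), draw the QC plots (makePlots), then assemble everything
# into a single Galaxy html report that links to all output files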
  64. def makePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='20',nup=3,height=10,width=8,rgbin=''):
  65. """
  66. marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
  67. 'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
  68. subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
  69. """
  70. def rHist(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=50):
  71. """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
  72. # generic histogram and vertical boxplot in a 3:1 layout
  73. # returns the graphic file name for inclusion in the web page
  74. # broken out here for reuse
  75. # ross lazarus march 2007
  76. """
  77. screenmat = (1,2,1,2) # create a 2x2 canvas
  78. widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
  79. rpy.r.pdf( outfname, height , width )
  80. #rpy.r.layout(rpy.r.matrix(rpy.r.c(1,1,1,2), 1, 4, byrow = True)) # 3 to 1 vertical plot
  81. m = rpy.r.matrix((1,1,1,2),nrow=1,ncol=4,byrow=True)
  82. # in R, m = matrix(c(1,2),nrow=1,ncol=2,byrow=T)
  83. rpy.r("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))") # 4 to 1 vertical plot
  84. maint = 'QC for %s - %s' % (basename,title)
  85. rpy.r.hist(plotme,main=maint, xlab=xlabname,breaks=nbreaks,col="maroon",cex=0.8)
  86. rpy.r.boxplot(plotme,main='',col="maroon",outline=False)
  87. rpy.r.dev_off()
  88. def rCum(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=100):
  89. """
  90. Useful to see what various cutoffs yield - plot percentiles
  91. """
  92. n = len(plotme)
  93. maxveclen = 1000.0 # for reasonable pdf sizes!
  94. yvec = copy.copy(plotme)
  95. # arrives already in descending order of importance - missingness or mendel count by subj or marker
  96. xvec = range(n)
  97. xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
  98. # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
  99. if n > maxveclen: # oversample part of the distribution
  100. always = min(1000,n/20) # oversample smaller of lowest few hundred items or 5%
  101. skip = int(n/maxveclen) # take 1 in skip to get about maxveclen points
  102. samplei = [i for i in range(n) if (i % skip == 0) or (i < always)] # always oversample first sorted values
  103. yvec = [yvec[i] for i in samplei] # always get first and last
  104. xvec = [xvec[i] for i in samplei] # always get first and last
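# e.g. for n = 500000 markers: skip = 500 keeps ~1000 evenly spaced points and the first
# (worst) 1000 values are always kept, so roughly 2000 points end up in the pdf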
  105. # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
  106. rpy.r.pdf( outfname, height , width )
  107. maint = 'QC for %s - %s' % (basename,title)
  108. rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
  109. rpy.r.plot(xvec,yvec,type='p',main=maint, ylab=xlabname, xlab='Sample Percentile',col="maroon",cex=0.8)
  110. rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
  111. rpy.r.dev_off()
  112. def rQQ(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
  113. """
  114. y is data for a qq plot and ends up on the x axis go figure
  115. if sampling, oversample low values - all the top 1% ?
  116. this version called with -log10 transformed hwe
  117. """
  118. nrows = len(plotme)
  119. fn = float(nrows)
  120. xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
  121. mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
  122. maxveclen = 3000
  123. yvec = copy.copy(plotme)
  124. if nrows > maxveclen:
  125. # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
  126. # oversample part of the distribution
  127. always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
  128. skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
  129. samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
  130. # always oversample first sorted (here lowest) values
  131. yvec = [yvec[i] for i in samplei] # always get first and last
  132. xvec = [xvec[i] for i in samplei] # and sample xvec same way
  133. maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
  134. else:
  135. maint='Log QQ Plot(n=%d)' % (nrows)
  136. mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
  137. ylab = '%s' % xlabname
  138. xlab = '-log10(Uniform 0-1)'
  139. # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
  140. rpy.r.pdf( outfname, height , width )
  141. rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
  142. rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
  143. rpy.r.points(mx,mx,type='l')
  144. rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
  145. rpy.r.dev_off()
  146. def rMultiQQ(plotme = [],nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
  147. """
  148. data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
  149. grid of qq plots to show departure from null at extremes of data quality
  150. Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
  151. and ends up on the x axis go figure
  152. if sampling, oversample low values - all the top 1% ?
  153. """
  154. data = copy.copy(plotme)
  155. nvals = len(data)
  156. stepsize = nvals/nsplits
  157. logstep = math.log10(stepsize) # so is 3 for steps of 1000
  158. quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
  159. data.sort(key=operator.itemgetter(1)) # into x order (itemgetter lives in operator, not itertools)
  160. rpy.r.pdf( outfname, height , width )
  161. rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
  162. yvec = [-math.log10(random.random()) for x in range(stepsize)]
  163. yvec.sort() # size of each step is expected range for xvec under null?!
  164. for rowstart in quints:
  165. rowend = rowstart + stepsize
  166. if nvals - rowend < stepsize: # finish last split
  167. rowend = nvals
  168. row = data[rowstart:rowend]
  169. row.sort(key=operator.itemgetter(2)) # into y order (itemgetter lives in operator, not itertools)
  170. for colstart in quints:
  171. colend = colstart + stepsize
  172. if nvals - colend < stepsize: # finish last split
  173. colend = nvals
  174. cell = row[colstart:colend]
  175. xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
  176. rpy.r.qqplot(xvec,yvec,xlab='-log10(Uniform 0-1)',ylab=xlabname,pch=19,col="maroon",cex=0.8)
  177. rpy.r.points([0,logstep],[0,logstep],type='l') # plain python lists - R's c() is not available here
  178. rpy.r.dev_off()
  179. def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
  180. """
  181. y is data for a qqnorm plot
  182. if sampling, oversample low values - all the top 1% ?
  183. """
  184. rangeunif = len(plotme)
  185. nunif = 1000
  186. maxveclen = 3000
  187. nrows = len(plotme)
  188. data = copy.copy(plotme)
  189. if nrows > maxveclen:
  190. # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
  191. # oversample part of the distribution
  192. always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
  193. skip = max(1,int((nrows-always)/float(maxveclen))) # take 1 in skip to get about maxveclen points (guard against skip=0)
  194. samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
  195. # always oversample first sorted (here lowest) values
  196. yvec = [data[i] for i in samplei] # always get first and last
  197. maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
  198. else:
  199. yvec = data
  200. maint='Log QQ Plot(n=%d)' % (nrows)
  201. n = 1000
  202. ylab = '%s' % xlabname
  203. xlab = 'Normal'
  204. # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
  205. rpy.r.pdf( outfname, height , width )
  206. rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
  207. rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
  208. rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
  209. rpy.r.dev_off()
  210. def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
  211. """
  212. layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
  213. like the GAIN qc tools
  214. y is data for a qq plot and ends up on the x axis go figure
  215. if sampling, oversample low values - all the top 1% ?
  216. """
  217. rangeunif = len(plotme)
  218. nunif = 1000
  219. fn = float(rangeunif)
  220. xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
  221. skip = max(int(rangeunif/fn),1)
  222. # force include last points
  223. mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
  224. maxveclen = 2000
  225. nrows = len(plotme)
  226. data = copy.copy(plotme)
  227. data.sort() # low to high - oversample low values
  228. if nrows > maxveclen:
  229. # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
  230. # oversample part of the distribution
  231. always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
  232. skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
  233. samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
  234. # always oversample first sorted (here lowest) values
  235. yvec = [data[i] for i in samplei] # always get first and last
  236. xvec = [xvec[i] for i in samplei] # and sample xvec same way
  237. maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
  238. else:
  239. yvec = data
  240. maint='Log QQ Plot(n=%d)' % (nrows)
  241. n = 1000
  242. mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
  243. ylab = '%s' % xlabname
  244. xlab = '-log10(Uniform 0-1)'
  245. # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
  246. rpy.r.pdf( outfname, height , width )
  247. rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
  248. rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
  249. rpy.r.points(mx,mx,type='l')
  250. rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
  251. rpy.r.dev_off()
  252. fdsto,stofile = tempfile.mkstemp()
  253. sto = open(stofile,'w')
  254. import rpy # delay to avoid rpy stdout chatter replacing galaxy file blurb
  255. mog = 'mogrify'
  256. pdfnup = 'pdfnup'
  257. pdfjoin = 'pdfjoin'
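# these external tools are assumed to be on the PATH - mogrify ships with ImageMagick,
# pdfnup and pdfjoin with the pdfjam wrappers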
  258. shead = subjects.pop(0) # get rid of head
  259. mhead = markers.pop(0)
  260. maf = mhead.index('maf')
  261. missfrac = mhead.index('missfrac')
  262. logphweall = mhead.index('logp_hwe_all')
  263. logphweunaff = mhead.index('logp_hwe_unaff')
  264. # check for at least some unaffected rml june 2009
  265. m_mendel = mhead.index('N_Mendel')
  266. fracmiss = shead.index('FracMiss')
  267. s_mendel = shead.index('Mendel_errors')
  268. s_het = shead.index('F_Stat')
  269. params = {}
  270. hweres = [float(x[logphweunaff]) for x in markers if len(x) > logphweunaff
  271. and x[logphweunaff].upper() <> 'NA']
  272. if len(hweres) <> 0:
  273. xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
  274. # plot for each of these cols
  275. else: # try hwe all instead - maybe no affection status available
  276. xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
  277. ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
  278. oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
  279. qqplotme = [1,0,0,0,0,0,0] #
  280. qnplotme = [0,0,0,0,0,0,1] #
  281. nplots = len(xs)
  282. xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
  283. 'Marker Mendel Error Count', 'Missing Rate: Subjects',
  284. 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
  285. plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
  286. ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
  287. ordplotnames = ['%s_cum' % x for x in plotnames]
  288. ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
  289. outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
  290. ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
  291. datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
  292. titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
  293. "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
  294. html = []
  295. pdflist = []
  296. for n,column in enumerate(xs):
  297. dat = [float(x[column]) for x in datasources[n] if len(x) >= column
  298. and x[column][:2].upper() <> 'NA'] # plink gives both!
  299. if sum(dat) <> 0: # eg nada for mendel if case control?
  300. rHist(plotme=dat,outfname=outfnames[n],xlabname=xlabnames[n],
  301. title=titles[n],basename=basename,nbreaks=nbreaks)
  302. row = [titles[n],ploturls[n],outfnames[n] ]
  303. html.append(row)
  304. pdflist.append(outfnames[n])
  305. if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
  306. otitle = 'Ranked %s' % titles[n]
  307. dat.sort()
  308. if oreverseme[n]:
  309. dat.reverse()
  310. rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
  311. title=otitle,basename=basename,nbreaks=1000)
  312. row = [otitle,ordploturls[n],ordoutfnames[n]]
  313. html.append(row)
  314. pdflist.append(ordoutfnames[n])
  315. if qqplotme[n]: #
  316. otitle = 'LogQQ plot %s' % titles[n]
  317. dat.sort()
  318. dat.reverse()
  319. ofn = os.path.split(ordoutfnames[n])
  320. ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
  321. ofu = os.path.split(ordploturls[n])
  322. ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
  323. rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
  324. title=otitle,basename=basename)
  325. row = [otitle,ofu,ofn]
  326. html.append(row)
  327. pdflist.append(ofn)
  328. elif qnplotme[n]:
  329. otitle = 'F Statistic %s' % titles[n]
  330. dat.sort()
  331. dat.reverse()
  332. ofn = os.path.split(ordoutfnames[n])
  333. ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
  334. ofu = os.path.split(ordploturls[n])
  335. ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
  336. rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
  337. title=otitle,basename=basename)
  338. row = [otitle,ofu,ofn]
  339. html.append(row)
  340. pdflist.append(ofn)
  341. else:
  342. print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
  343. if nup>0:
  344. # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
  345. # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
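# assembly order: pdfjoin concatenates all per-stat pdfs into one paged pdf, pdfnup lays
# that out nup x nup per page, then mogrify writes jpg thumbnails of every pdf for the html report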
  346. filestojoin = ' '.join(pdflist) # all the file names so far
  347. afname = '%s_All_Paged.pdf' % (basename)
  348. fullafname = os.path.join(newfpath,afname)
  349. expl = 'All %s QC Plots joined into a single pdf' % basename
  350. vcl = '%s %s --outfile %s ' % (pdfjoin,filestojoin, fullafname)
  351. # make single page pdf
  352. x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
  353. retval = x.wait()
  354. row = [expl,afname,fullafname]
  355. html.insert(0,row) # joined pdf goes first
  356. nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
  357. fullnfname = os.path.join(newfpath,nfname)
  358. expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
  359. vcl = '%s %s --nup %dx%d --frame true --outfile %s' % (pdfnup,afname,nup,nup,fullnfname)
  360. # make thumbnail images
  361. x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
  362. retval = x.wait()
  363. row = [expl,nfname,fullnfname]
  364. html.insert(1,row) # this goes second
  365. vcl = '%s -format jpg -resize %s %s' % (mog, mogresize, os.path.join(newfpath,'*.pdf'))
  366. # make thumbnail images
  367. x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
  368. retval = x.wait()
  369. sto.close()
  370. cruft = open(stofile,'r').readlines()
  371. return html,cruft # elements for an ordered list of urls or whatever..
  372. def RmakePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='100',nup=3,height=8,width=10,rexe=''):
  373. """
  374. nice try but the R scripts are huge and take forever to run if there's a lot of data
  375. marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
  376. 'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
  377. subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
  378. """
  379. colour = "maroon"
  380. def rHist(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=nbreaks):
  381. """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
  382. # generic histogram and vertical boxplot in a 3:1 layout
  383. # returns the graphic file name for inclusion in the web page
  384. # broken out here for reuse
  385. # ross lazarus march 2007
  386. """
  387. R = []
  388. maint = 'QC for %s - %s' % (basename,title)
  389. screenmat = (1,2,1,2) # create a 2x2 canvas
  390. widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
  391. R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
  392. R.append("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))")
  393. R.append("plotme = read.table(file='%s',head=F,sep='\t')" % plotme)
  394. R.append('hist(plotme$V1, main="%s",xlab="%s",breaks=%d,col="%s")' % (maint,xlabname,nbreaks,colour))
  395. R.append('boxplot(plotme$V1,main="",col="%s",outline=F)' % (colour) ) # read.table returns a data frame - plot its single column
  396. R.append('dev.off()')
  397. return R
  398. def rCum(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=100):
  399. """
  400. Useful to see what various cutoffs yield - plot percentiles
  401. """
  402. R = []
  403. n = len(plotme)
  404. R.append("plotme = read.table(file='%s',head=T,sep='\t')" % plotme)
  405. # arrives already in descending order of importance - missingness or mendel count by subj or marker
  406. maint = 'QC for %s - %s' % (basename,title)
  407. R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
  408. R.append("par(lab=c(10,10,10))")
  409. R.append('plot(plotme$xvec,plotme$yvec,type="p",main="%s",ylab="%s",xlab="Sample Percentile",col="%s")' % (maint,xlabname,colour))
  410. R.append('dev.off()')
  411. return R
  412. def rQQ(plotme='', outfname='fname',title='title',xlabname='Sample',basename=''):
  413. """
  414. y is data for a qq plot and ends up on the x axis go figure
  415. if sampling, oversample low values - all the top 1% ?
  416. this version called with -log10 transformed hwe
  417. """
  418. R = []
  419. nrows = len(plotme)
  420. fn = float(nrows)
  421. xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
  422. #mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
  423. maxveclen = 3000
  424. yvec = copy.copy(plotme)
  425. if nrows > maxveclen:
  426. # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
  427. # oversample part of the distribution
  428. always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
  429. skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
  430. if skip < 2:
  431. skip = 2
  432. samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
  433. # always oversample first sorted (here lowest) values
  434. yvec = [yvec[i] for i in samplei] # always get first and last
  435. xvec = [xvec[i] for i in samplei] # and sample xvec same way
  436. maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
  437. else:
  438. maint='Log QQ Plot(n=%d)' % (nrows)
  439. mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
  440. ylab = '%s' % xlabname
  441. xlab = '-log10(Uniform 0-1)'
  442. # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
  443. x = ['%f' % x for x in xvec]
  444. R.append('xvec = c(%s)' % ','.join(x))
  445. y = ['%f' % x for x in yvec]
  446. R.append('yvec = c(%s)' % ','.join(y))
  447. R.append('mx = c(0,%f)' % (math.log10(fn)))
  448. R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
  449. R.append("par(lab=c(10,10,10))")
  450. R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
  451. R.append('points(mx,mx,type="l")')
  452. R.append('grid(col="lightgray",lty="dotted")')
  453. R.append('dev.off()')
  454. return R
  455. def rMultiQQ(plotme = '',nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
  456. """
  457. data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
  458. grid of qq plots to show departure from null at extremes of data quality
  459. Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
  460. and ends up on the x axis go figure
  461. if sampling, oversample low values - all the top 1% ?
  462. """
  463. data = copy.copy(plotme)
  464. nvals = len(data)
  465. stepsize = nvals/nsplits
  466. logstep = math.log10(stepsize) # so is 3 for steps of 1000
  467. R = [] # accumulate R source lines (was missing)
R.append('mx = c(0,%f)' % logstep)
  468. quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
  469. data.sort(key=operator.itemgetter(1)) # into x order (itemgetter lives in operator, not itertools)
  470. #rpy.r.pdf( outfname, h , w )
  471. #rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
  472. R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
  473. yvec = [-math.log10(random.random()) for x in range(stepsize)]
  474. yvec.sort() # size of each step is expected range for xvec under null?!
  475. y = ['%f' % x for x in yvec]
  476. R.append('yvec = c(%s)' % ','.join(y))
  477. for rowstart in quints:
  478. rowend = rowstart + stepsize
  479. if nvals - rowend < stepsize: # finish last split
  480. rowend = nvals
  481. row = data[rowstart:rowend]
  482. row.sort(key=operator.itemgetter(2)) # into y order (itemgetter lives in operator, not itertools)
  483. for colstart in quints:
  484. colend = colstart + stepsize
  485. if nvals - colend < stepsize: # finish last split
  486. colend = nvals
  487. cell = row[colstart:colend]
  488. xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
  489. x = ['%f' % x for x in xvec]
  490. R.append('xvec = c(%s)' % ','.join(x))
  491. R.append('qqplot(xvec,yvec,xlab="-log10(Uniform 0-1)",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlabname,title,basename,colour))
  492. R.append('points(mx,mx,type="l")')
  493. R.append('grid(col="lightgray",lty="dotted")')
  494. #rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
  495. #rpy.r.points(c(0,logstep),c(0,logstep),type='l')
  496. R.append('dev.off()')
  497. #rpy.r.dev_off()
  498. return R
  499. def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
  500. """
  501. y is data for a qqnorm plot
  502. if sampling, oversample low values - all the top 1% ?
  503. """
  504. rangeunif = len(plotme)
  505. nunif = 1000
  506. maxveclen = 3000
  507. nrows = len(plotme)
  508. data = copy.copy(plotme)
  509. if nrows > maxveclen:
  510. # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
  511. # oversample part of the distribution
  512. always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
  513. skip = max(1,int((nrows-always)/float(maxveclen))) # take 1 in skip to get about maxveclen points (guard against skip=0)
  514. samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
  515. # always oversample first sorted (here lowest) values
  516. yvec = [data[i] for i in samplei] # always get first and last
  517. maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
  518. else:
  519. yvec = data
  520. maint='Log QQ Plot(n=%d)' % (nrows)
  521. n = 1000
  522. ylab = '%s' % xlabname
  523. xlab = 'Normal'
  524. # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
  525. #rpy.r.pdf( outfname, h , w )
  526. #rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
  527. #rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
  528. #rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
  529. #rpy.r.dev_off()
  530. R = [] # accumulate R source lines (was missing)
y = ['%f' % x for x in yvec]
  531. R.append('yvec = c(%s)' % ','.join(y))
  532. R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
  533. R.append("par(lab=c(10,10,10))")
  534. R.append('qqnorm(yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
  535. R.append('grid(col="lightgray",lty="dotted")')
  536. R.append('dev.off()')
  537. return R
  538. def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
  539. """
  540. layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
  541. like the GAIN qc tools
  542. y is data for a qq plot and ends up on the x axis go figure
  543. if sampling, oversample low values - all the top 1% ?
  544. """
  545. rangeunif = len(plotme)
  546. nunif = 1000
  547. fn = float(rangeunif)
  548. xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
  549. skip = max(int(rangeunif/fn),1)
  550. # force include last points
  551. mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
  552. maxveclen = 2000
  553. nrows = len(plotme)
  554. data = copy.copy(plotme)
  555. data.sort() # low to high - oversample low values
  556. if nrows > maxveclen:
  557. # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
  558. # oversample part of the distribution
  559. always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
  560. skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
  561. samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
  562. # always oversample first sorted (here lowest) values
  563. yvec = [data[i] for i in samplei] # always get first and last
  564. xvec = [xvec[i] for i in samplei] # and sample xvec same way
  565. maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
  566. else:
  567. yvec = data
  568. maint='Log QQ Plot(n=%d)' % (nrows)
  569. n = 1000
  570. mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
  571. ylab = '%s' % xlabname
  572. xlab = '-log10(Uniform 0-1)'
  573. R = [] # accumulate R source lines (was missing)
R.append('mx = c(0,%f)' % (math.log10(fn)))
  574. x = ['%f' % x for x in xvec]
  575. R.append('xvec = c(%s)' % ','.join(x))
  576. y = ['%f' % x for x in yvec]
  577. R.append('yvec = c(%s)' % ','.join(y))
  578. R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
  579. R.append("par(lab=c(10,10,10))")
  580. R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
  581. R.append('points(mx,mx,type="l")')
  582. R.append('grid(col="lightgray",lty="dotted")')
  583. R.append('dev.off()')
return R
  584. shead = subjects.pop(0) # get rid of head
  585. mhead = markers.pop(0)
  586. maf = mhead.index('maf')
  587. missfrac = mhead.index('missfrac')
  588. logphweall = mhead.index('logp_hwe_all')
  589. logphweunaff = mhead.index('logp_hwe_unaff')
  590. # check for at least some unaffected rml june 2009
  591. m_mendel = mhead.index('N_Mendel')
  592. fracmiss = shead.index('FracMiss')
  593. s_mendel = shead.index('Mendel_errors')
  594. s_het = shead.index('F_Stat')
  595. params = {}
  596. h = [float(x[logphweunaff]) for x in markers if len(x) > logphweunaff
  597. and x[logphweunaff].upper() <> 'NA']
  598. if len(h) <> 0:
  599. xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
  600. # plot for each of these cols
  601. else: # try hwe all instead - maybe no affection status available
  602. xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
  603. ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
  604. oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
  605. qqplotme = [1,0,0,0,0,0,0] #
  606. qnplotme = [0,0,0,0,0,0,1] #
  607. nplots = len(xs)
  608. xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
  609. 'Marker Mendel Error Count', 'Missing Rate: Subjects',
  610. 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
  611. plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
  612. ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
  613. ordplotnames = ['%s_cum' % x for x in plotnames]
  614. ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
  615. outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
  616. ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
  617. datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
  618. titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
  619. "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
  620. html = []
  621. pdflist = []
  622. R = []
  623. for n,column in enumerate(xs):
  624. dfn = '%d_%s.txt' % (n,titles[n])
  625. dfilepath = os.path.join(newfpath,dfn)
  626. dat = [float(x[column]) for x in datasources[n] if len(x) >= column
  627. and x[column][:2].upper() <> 'NA'] # plink gives both!
  628. if sum(dat) <> 0: # eg nada for mendel if case control?
  629. plotme = file(dfilepath,'w')
  630. plotme.write('\n'.join(['%f' % x for x in dat])) # pass as a file - copout!
plotme.close() # flush before R reads the file
  631. tR = rHist(plotme=dfilepath,outfname=outfnames[n],xlabname=xlabnames[n],
  632. title=titles[n],basename=basename,nbreaks=nbreaks)
  633. R += tR
  634. row = [titles[n],ploturls[n],outfnames[n] ]
  635. html.append(row)
  636. pdflist.append(outfnames[n])
  637. if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
  638. otitle = 'Ranked %s' % titles[n]
  639. dat.sort()
  640. if oreverseme[n]:
  641. dat.reverse()
  642. ndat = len(dat)
  643. xvec = range(ndat)
  644. xvec = [100.0*(ndat-x)/ndat for x in xvec] # convert to centile (ndat, not the enumerate counter n)
  645. # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
  646. maxveclen = 1000.0 # for reasonable pdf sizes!
yvec = dat # default - keep everything unless we need to subsample below
  647. if ndat > maxveclen: # oversample part of the distribution
  648. always = min(1000,ndat/20) # oversample smaller of lowest few hundred items or 5%
  649. skip = int(ndat/maxveclen) # take 1 in skip to get about maxveclen points
  650. samplei = [i for i in range(ndat) if (i % skip == 0) or (i < always)] # always oversample first sorted values
  651. yvec = [dat[i] for i in samplei] # sample the data values (yvec was previously undefined here)
  652. xvec = [xvec[i] for i in samplei] # always get first and last
  653. plotme = file(dfilepath,'w')
  654. plotme.write('xvec\tyvec\n')
  655. plotme.write('\n'.join(['%f\t%f' % (x,y) for x,y in zip(xvec,yvec)])) # pass as a file - copout!
plotme.close() # flush before R reads the file
  656. tR = rCum(plotme=dfilepath,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
  657. title=otitle,basename=basename,nbreaks=nbreaks)
  658. R += tR
  659. row = [otitle,ordploturls[n],ordoutfnames[n]]
  660. html.append(row)
  661. pdflist.append(ordoutfnames[n])
  662. if qqplotme[n]: #
  663. otitle = 'LogQQ plot %s' % titles[n]
  664. dat.sort()
  665. dat.reverse()
  666. ofn = os.path.split(ordoutfnames[n])
  667. ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
  668. ofu = os.path.split(ordploturls[n])
  669. ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
  670. tR = rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
  671. title=otitle,basename=basename)
  672. R += tR
  673. row = [otitle,ofu,ofn]
  674. html.append(row)
  675. pdflist.append(ofn)
  676. elif qnplotme[n]:
  677. otitle = 'F Statistic %s' % titles[n]
  678. dat.sort()
  679. dat.reverse()
  680. ofn = os.path.split(ordoutfnames[n])
  681. ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
  682. ofu = os.path.split(ordploturls[n])
  683. ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
  684. tR = rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
  685. title=otitle,basename=basename)
  686. R += tR
  687. row = [otitle,ofu,ofn]
  688. html.append(row)
  689. pdflist.append(ofn)
  690. else:
  691. print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
  692. rlog,flist = RRun(rcmd=R,title='makeQCplots',outdir=newfpath)
  693. if nup>0:
  694. # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
  695. # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
  696. filestojoin = ' '.join(pdflist) # all the file names so far
  697. afname = '%s_All_Paged.pdf' % (basename)
  698. fullafname = os.path.join(newfpath,afname)
  699. expl = 'All %s QC Plots joined into a single pdf' % basename
  700. vcl = 'pdfjoin %s --outfile %s ' % (filestojoin, fullafname)
  701. # make single page pdf
  702. x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
  703. retval = x.wait()
  704. row = [expl,afname,fullafname]
  705. html.insert(0,row) # joined pdf goes first
  706. nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
  707. fullnfname = os.path.join(newfpath,nfname)
  708. expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
  709. vcl = 'pdfnup %s --nup %dx%d --frame true --outfile %s' % (afname,nup,nup,fullnfname)
  710. # make thumbnail images
  711. x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
  712. retval = x.wait()
  713. row = [expl,nfname,fullnfname]
  714. html.insert(1,row) # this goes second
  715. vcl = 'mogrify -format jpg -resize %s %s' % (mogresize, os.path.join(newfpath,'*.pdf'))
  716. # make thumbnail images
  717. x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
  718. retval = x.wait()
  719. return html # elements for an ordered list of urls or whatever..
  720. def countHet(bedf='fakeped_500000',linkageped=True,froot='fake500k',outfname="ahetf",logf='rgQC.log'):
  721. """
  722. NO LONGER USED - historical interest
  723. count het loci for each subject to look for outliers = ? contamination
  724. assume ped file is linkage format
  725. need to make a ped file from the bed file we were passed
  726. """
  727. vcl = [plinke,'--bfile',bedf,'--recode','--out','%s_recode' % froot] # write a recoded ped file from the real .bed file
  728. p=subprocess.Popen(' '.join(vcl),shell=True)
  729. retval = p.wait()
  730. f = open('%s_recode.recode.ped' % froot,'r')
  731. if not linkageped:
  732. head = f.next() # throw away header
  733. hets = [] # simple count of het loci per subject. Expect poisson?
  734. n = 1
  735. for l in f:
  736. n += 1
  737. ll = l.strip().split()
  738. if len(ll) > 6:
  739. iid = idjoiner.join(ll[:2]) # fam_iid
  740. gender = ll[4]
  741. alleles = ll[6:]
  742. nallele = len(alleles)
  743. nhet = 0
  744. for i in range(nallele/2):
  745. a1=alleles[2*i]
  746. a2=alleles[2*i+1]
  747. if alleles[2*i] <> alleles[2*i+1]: # must be het
  748. if not missvals.get(a1,None) and not missvals.get(a2,None):
  749. nhet += 1
  750. hets.append((nhet,iid,gender)) # for sorting later
  751. f.close()
  752. hets.sort()
  753. hets.reverse() # biggest nhet now on top
  754. f = open(outfname ,'w')
  755. res = ['%d\t%s\t%s' % (x) for x in hets] # I love list comprehensions
  756. res.insert(0,'nhetloci\tfamid_iid\tgender')
  757. res.append('')
  758. f.write('\n'.join(res))
  759. f.close()
  760. def subjectRep(froot='cleantest',outfname="srep",newfpath='.',logf = None):
  761. """by subject (missingness = .imiss, mendel = .imendel)
  762. assume replicates have an underscore in family id for
  763. hapmap testing
  764. For sorting, we need floats and integers
  765. """
  766. isexfile = '%s.sexcheck' % froot
  767. imissfile = '%s.imiss' % froot
  768. imendfile = '%s.imendel' % froot
  769. ihetfile = '%s.het' % froot
  770. logf.write('## subject reports starting at %s\n' % timenow())
  771. outfile = os.path.join(newfpath,outfname)
  772. idlist = []
  773. imissdict = {}
  774. imenddict = {}
  775. isexdict = {}
  776. ihetdict = {}
  777. Tops = {}
  778. Tnames = ['Ranked Subject Missing Genotype', 'Ranked Subject Mendel',
  779. 'Ranked Sex check', 'Ranked Inbreeding (het) F statistic']
  780. Tsorts = [2,3,6,8]
  781. Treverse = [True,True,True,False] # so first values are worser
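# Tsorts indexes the fres rows built below: 2=FracMiss, 3=Mendel_errors, 6=Status, 8=F_Stat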
  782. #rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
  783. ## FID IID MISS_PHENO N_MISS N_GENO F_MISS
  784. ## 1552042370_A 1552042370_A N 5480 549883 0.009966
  785. ## 1552042410_A 1552042410_A N 1638 549883 0.002979
  786. # ------------------missing--------------------
  787. # imiss has FID IID MISS_PHENO N_MISS F_MISS
  788. # we want F_MISS
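# every per-subject plink output is keyed on famId + idjoiner + iId so the .imiss, .imendel,
# .sexcheck and .het values for one subject can be merged into a single report row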
  789. try:
  790. f = open(imissfile,'r')
  791. except:
  792. logf.write('# file %s is missing - talk about irony\n' % imissfile)
  793. f = None
  794. if f:
  795. for n,line in enumerate(f):
  796. ll = line.strip().split()
  797. if n == 0:
  798. head = [x.upper() for x in ll] # expect above
  799. fidpos = head.index('FID')
  800. iidpos = head.index('IID')
  801. fpos = head.index('F_MISS')
  802. elif len(ll) >= fpos: # full line
  803. fid = ll[fidpos]
  804. #if fid.find('_') == -1: # not replicate! - icondb ids have these...
  805. iid = ll[iidpos]
  806. fmiss = ll[fpos]
  807. id = '%s%s%s' % (fid,idjoiner,iid)
  808. imissdict[id] = fmiss
  809. idlist.append(id)
  810. f.close()
  811. logf.write('## imissfile %s contained %d ids\n' % (imissfile,len(idlist)))
  812. # ------------------mend-------------------
  813. # *.imendel has FID IID N
  814. # we want N
  815. gotmend = True
  816. try:
  817. f = open(imendfile,'r')
  818. except:
  819. gotmend = False
  820. for id in idlist:
  821. imenddict[id] = '0'
  822. if gotmend:
  823. for n,line in enumerate(f):
  824. ll = line.strip().split()
  825. if n == 0:
  826. head = [x.upper() for x in ll] # expect above
  827. npos = head.index('N')
  828. fidpos = head.index('FID')
  829. iidpos = head.index('IID')
  830. elif len(ll) >= npos: # full line
  831. fid = ll[fidpos]
  832. iid = ll[iidpos]
  833. id = '%s%s%s' % (fid,idjoiner,iid)
  834. nmend = ll[npos]
  835. imenddict[id] = nmend
  836. f.close()
  837. else:
  838. logf.write('## error No %s file - assuming not family data\n' % imendfile)
  839. # ------------------sex check------------------
  840. #[rerla@hg fresh]$ head /home/rerla/fresh/database/files/dataset_978_files/CAMP2007Dirty.sexcheck
  841. # sexcheck has FID IID PEDSEX SNPSEX STATUS F
  842. ##
  843. ## FID Family ID
  844. ## IID Individual ID
  845. ## PEDSEX Sex as determined in pedigree file (1=male, 2=female)
  846. ## SNPSEX Sex as determined by X chromosome
  847. ## STATUS Displays "PROBLEM" or "OK" for each individual
  848. ## F The actual X chromosome inbreeding (homozygosity) estimate
  849. ##
  850. ## A PROBLEM arises if the two sexes do not match, or if the SNP data or pedigree data are
  851. ## ambiguous with regard to sex.
  852. ## A male call is made if F is more than 0.8; a female call is made if F is less than 0.2.
  853. try:
  854. f = open(isexfile,'r')
  855. got_sexcheck = 1
  856. except:
  857. got_sexcheck = 0
  858. if got_sexcheck:
  859. for n,line in enumerate(f):
  860. ll = line.strip().split()
  861. if n == 0:
  862. head = [x.upper() for x in ll] # expect above
  863. fidpos = head.index('FID')
  864. iidpos = head.index('IID')
  865. pedsexpos = head.index('PEDSEX')
  866. snpsexpos = head.index('SNPSEX')
  867. statuspos = head.index('STATUS')
  868. fpos = head.index('F')
  869. elif len(ll) >= fpos: # full line
  870. fid = ll[fidpos]
  871. iid = ll[iidpos]
  872. fest = ll[fpos]
  873. pedsex = ll[pedsexpos]
  874. snpsex = ll[snpsexpos]
  875. stat = ll[statuspos]
  876. id = '%s%s%s' % (fid,idjoiner,iid)
  877. isexdict[id] = (pedsex,snpsex,stat,fest)
  878. f.close()
  879. else:
  880. # this only happens if there are no subjects!
  881. logf.write('No %s file - assuming no sex errors' % isexfile)
  882. ##
  883. ## FID IID O(HOM) E(HOM) N(NM) F
  884. ## 457 2 490665 4.928e+05 722154 -0.009096
  885. ## 457 3 464519 4.85e+05 710986 -0.0908
  886. ## 1037 2 461632 4.856e+05 712025 -0.106
  887. ## 1037 1 491845 4.906e+05 719353 0.005577
  888. try:
  889. f = open(ihetfile,'r')
  890. except:
  891. f = None
  892. logf.write('## No %s file - did we run --het in plink?\n' % ihetfile)
  893. if f:
  894. for i,line in enumerate(f):
  895. ll = line.strip().split()
  896. if i == 0:
  897. head = [x.upper() for x in ll] # expect above
  898. fidpos = head.index('FID')
  899. iidpos = head.index('IID')
  900. fpos = head.index('F')
  901. n = 0
  902. elif len(ll) >= fpos: # full line
  903. fid = ll[fidpos]
  904. iid = ll[iidpos]
  905. fhet = ll[fpos]
  906. id = '%s%s%s' % (fid,idjoiner,iid)
  907. ihetdict[id] = fhet
  908. f.close() # now assemble and output result list
  909. rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','XHomEst','F_Stat']
  910. res = []
  911. fres = [] # floats for sorting
  912. for id in idlist: # for each snp in found order
  913. fid,iid = id.split(idjoiner) # recover keys
  914. f_missing = imissdict.get(id,'0.0')
  915. nmend = imenddict.get(id,'0')
  916. (pedsex,snpsex,status,fest) = isexdict.get(id,('0','0','0','0.0'))
  917. fhet = ihetdict.get(id,'0.0')
  918. res.append([fid,iid,f_missing,nmend,pedsex,snpsex,status,fest,fhet])
  919. try:
  920. ff_missing = float(f_missing)
  921. except:
  922. ff_missing = 0.0
  923. try:
  924. inmend = int(nmend)
  925. except:
  926. inmend = 0
  927. try:
  928. ffest = float(fest)
  929. except:
  930. ffest = 0.0
  931. try:
  932. ffhet = float(fhet)
  933. except:
  934. ffhet = 0.0
  935. fres.append([fid,iid,ff_missing,inmend,pedsex,snpsex,status,ffest,ffhet])
  936. ntokeep = max(20,len(res)/keepfrac)
  937. for i,col in enumerate(Tsorts):
  938. fres.sort(key=operator.itemgetter(col))
  939. if Treverse[i]:
  940. fres.reverse()
  941. repname = Tnames[i]
  942. Tops[repname] = fres[0:ntokeep]
  943. Tops[repname] = [map(str,x) for x in Tops[repname]]
  944. Tops[repname].insert(0,rhead)
  945. res.sort()
  946. res.insert(0,rhead)
  947. logf.write('### writing %s report with %s' % ( outfile,res[0]))
  948. f = open(outfile,'w')
  949. f.write('\n'.join(['\t'.join(x) for x in res]))
  950. f.write('\n')
  951. f.close()
  952. return res,Tops
  953. def markerRep(froot='cleantest',outfname="mrep",newfpath='.',logf=None,maplist=None ):
  954. """by marker (hwe = .hwe, missingness=.lmiss, freq = .frq)
  955. keep a list of marker order but keep all stats in dicts
  956. write out a fake xls file for R or SAS etc
  957. kinda clunky, but..
  958. TODO: ensure stable if any file not found?
  959. """
  960. mapdict = {}
  961. if maplist <> None:
  962. rslist = [x[1] for x in maplist]
  963. offset = [(x[0],x[3]) for x in maplist]
  964. mapdict = dict(zip(rslist,offset))
  965. hwefile = '%s.hwe' % froot
  966. lmissfile = '%s.lmiss' % froot
  967. freqfile = '%s.frq' % froot
  968. lmendfile = '%s.lmendel' % froot
  969. outfile = os.path.join(newfpath,outfname)
  970. markerlist = []
  971. chromlist = []
  972. hwedict = {}
  973. lmissdict = {}
  974. freqdict = {}
  975. lmenddict = {}
  976. Tops = {}
  977. Tnames = ['Ranked Marker MAF', 'Ranked Marker Missing Genotype', 'Ranked Marker HWE', 'Ranked Marker Mendel']
  978. Tsorts = [3,6,10,11]
  979. Treverse = [False,True,True,True] # so first values are worse(r)
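# Tsorts indexes the res rows built below: 3=maf, 6=missfrac, 10=logp_hwe_unaff, 11=N_Mendel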
  980. #res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
  981. #rhead = ['snp','chrom','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
  982. # -------------------hwe--------------------------
  983. # hwe has SNP TEST GENO O(HET) E(HET) P_HWD
  984. # we want all hwe where P_HWD <> NA
  985. # ah changed in 1.04 to
  986. ## CHR SNP TEST A1 A2 GENO O(HET) E(HET) P
  987. ## 1 rs6671164 ALL 2 3 34/276/613 0.299 0.3032 0.6644
  988. ## 1 rs6671164 AFF 2 3 0/0/0 nan nan NA
  989. ## 1 rs6671164 UNAFF 2 3 34/276/613 0.299 0.3032 0.6644
  990. ## 1 rs4448553 ALL 2 3 8/176/748 0.1888 0.1848 0.5975
  991. ## 1 rs4448553 AFF 2 3 0/0/0 nan nan NA
  992. ## 1 rs4448553 UNAFF 2 3 8/176/748 0.1888 0.1848 0.5975
  993. ## 1 rs1990150 ALL 1 3 54/303/569 0.3272 0.3453 0.1067
  994. ## 1 rs1990150 AFF 1 3 0/0/0 nan nan NA
  995. ## 1 rs1990150 UNAFF 1 3 54/303/569 0.3272 0.3453 0.1067
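# plink writes three hwe rows per marker (ALL, AFF, UNAFF); every test is stored in hwedict[rs]
# and the report later pulls out the ALL and UNAFF p values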
  996. logf.write('## marker reports starting at %s\n' % timenow())
  997. try:
  998. f = open(hwefile,'r')
  999. except:
  1000. f = None
  1001. logf.write('## error - no hwefile %s found\n' % hwefile)
  1002. if f:
  1003. for i,line in enumerate(f):
  1004. ll = line.strip().split()
  1005. if i == 0: # head
  1006. head = [x.upper() for x in ll] # expect above
  1007. try:
  1008. testpos = head.index('TEST')
  1009. except:
  1010. testpos = 2 # patch for 1.04 which has b0rken headers - otherwise use head.index('TEST')
  1011. try:
  1012. ppos = head.index('P')
  1013. except:
  1014. ppos = 8 # patch - for head.index('P')
  1015. snppos = head.index('SNP')
  1016. chrpos = head.index('CHR')
  1017. logf.write('hwe header testpos=%d,ppos=%d,snppos=%d\n' % (testpos,ppos,snppos))
  1018. elif len(ll) >= ppos: # full line
  1019. ps = ll[ppos].upper()
  1020. rs = ll[snppos]
  1021. chrom = ll[chrpos]
  1022. test = ll[testpos]
  1023. if not hwedict.get(rs,None):
  1024. hwedict[rs] = {}
  1025. markerlist.append(rs)
  1026. chromlist.append(chrom) # one place to find it?
  1027. lpvals = 0
  1028. if ps.upper() <> 'NA' and ps.upper() <> 'NAN': # worth keeping
  1029. lpvals = '0'
  1030. if ps <> '1':
  1031. try:
  1032. pval = float(ps)
  1033. lpvals = '%f' % -math.log10(pval)
  1034. except:
  1035. pass
  1036. hwedict[rs][test] = (ps,lpvals)
  1037. else:
  1038. logf.write('short line #%d = %s\n' % (i,ll))
  1039. f.close()
  1040. # ------------------missing--------------------
  1041. """lmiss has
  1042. CHR SNP N_MISS N_GENO F_MISS
  1043. 1 rs12354060 0 73 0
  1044. 1 rs4345758 1 73 0.0137
  1045. 1 rs2691310 73 73 1
  1046. 1 rs2531266 73 73 1
  1047. # we want F_MISS"""
  1048. try:
  1049. f = open(lmissfile,'r')
  1050. except:
  1051. f = None
  1052. if f:
  1053. for i,line in enumerate(f):
  1054. ll = line.strip().split()
  1055. if i == 0:
  1056. head = [x.upper() for x in ll] # expect above
  1057. fracpos = head.index('F_MISS')
  1058. npos = head.index('N_MISS')
  1059. snppos = head.index('SNP')
  1060. elif len(ll) >= fracpos: # full line
  1061. rs = ll[snppos]
  1062. fracval = ll[fracpos]
  1063. lmissdict[rs] = fracval # for now, just that?
  1064. else:
  1065. lmissdict[rs] = 'NA'
  1066. f.close()
  1067. # ------------------freq-------------------
  1068. # frq has CHR SNP A1 A2 MAF NM
  1069. # we want maf
  1070. try:
  1071. f = open(freqfile,'r')
  1072. except:
  1073. f = None
  1074. if f:
  1075. for i,line in enumerate(f):
  1076. ll = line.strip().split()
  1077. if i == 0:
  1078. head = [x.upper() for x in ll] # expect above
  1079. mafpos = head.index('MAF')
  1080. a1pos = head.index('A1')
  1081. a2pos = head.index('A2')
  1082. snppos = head.index('SNP')
  1083. elif len(ll) >= mafpos: # full line
  1084. rs = ll[snppos]
  1085. a1 = ll[a1pos]
  1086. a2 = ll[a2pos]
  1087. maf = ll[mafpos]
  1088. freqdict[rs] = (maf,a1,a2)
  1089. f.close()
  1090. # ------------------mend-------------------
  1091. # lmend has CHR SNP N
  1092. # we want N
  1093. gotmend = True
  1094. try:
  1095. f = open(lmendfile,'r')
  1096. except:
  1097. gotmend = False
  1098. for rs in markerlist:
  1099. lmenddict[rs] = '0'
  1100. if gotmend:
  1101. for i,line in enumerate(f):
  1102. ll = line.strip().split()
  1103. if i == 0:
  1104. head = [x.upper() for x in ll] # expect above
  1105. npos = head.index('N')
  1106. snppos = head.index('SNP')
  1107. elif len(ll) >= npos: # full line
  1108. rs = ll[snppos]
  1109. nmend = ll[npos]
  1110. lmenddict[rs] = nmend
  1111. f.close()
  1112. else:
  1113. logf.write('No %s file - assuming not family data\n' % lmendfile)
  1114. # now assemble result list
  1115. rhead = ['snp','chromosome','offset','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
  1116. res = []
  1117. fres = []
  1118. for rs in markerlist: # for each snp in found order
  1119. f_missing = lmissdict.get(rs,'NA')
  1120. maf,a1,a2 = freqdict.get(rs,('NA','NA','NA'))
  1121. hwe_all = hwedict[rs].get('ALL',('NA','NA')) # hope this doesn't change...
  1122. hwe_unaff = hwedict[rs].get('UNAFF',('NA','NA'))
  1123. nmend = lmenddict.get(rs,'NA')
  1124. (chrom,offset)=mapdict.get(rs,('?','0'))
  1125. res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
  1126. ntokeep = max(10,len(res)/keepfrac)
  1127. def msortk(item=None):
  1128. """
  1129. deal with non numeric sorting
  1130. """
  1131. try:
  1132. return float(item)
  1133. except:
  1134. return item
  1135. for i,col in enumerate(Tsorts):
  1136. res.sort(key=lambda x: msortk(x[col])) # msortk coerces to float where possible for numeric sorts
  1137. if Treverse[i]:
  1138. res.reverse()
  1139. repname = Tnames[i]
  1140. Tops[repname] = res[0:ntokeep]
  1141. Tops[repname].insert(0,rhead)
  1142. res.sort(key=lambda x: '%s_%10d' % (x[1].ljust(4,'0'),int(x[2]))) # in chrom offset order
  1143. res.insert(0,rhead)
  1144. f = open(outfile,'w')
  1145. f.write('\n'.join(['\t'.join(x) for x in res]))
  1146. f.close()
  1147. return res,Tops
  1148. def getfSize(fpath,outpath):
  1149. """
  1150. format a nice file size string
  1151. """
  1152. size = ''
  1153. fp = os.path.join(outpath,fpath)
  1154. if os.path.isfile(fp):
  1155. n = float(os.path.getsize(fp))
  1156. if n > 2**20:
  1157. size = ' (%1.1f MB)' % (n/2**20)
  1158. elif n > 2**10:
  1159. size = ' (%1.1f KB)' % (n/2**10)
  1160. elif n > 0:
  1161. size = ' (%d B)' % (int(n))
  1162. return size
  1163. if __name__ == "__main__":
  1164. u = """ called in xml as
  1165. <command interpreter="python">
  1166. rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$out_prefix"
  1167. -s '$html_file' -p '$html_file.files_path' -l '${GALAXY_DATA_INDEX_DIR}/rg/bin/plink'
  1168. -r '${GALAXY_DATA_INDEX_DIR}/rg/bin/R'
  1169. </command>
  1170. Prepare a qc report - eg:
  1171. print "%s %s -i birdlped -o birdlped -l qc.log -s htmlf -m marker.xls -s sub.xls -p ./" % (sys.executable,prog)
  1172. """
  1173. progname = os.path.basename(sys.argv[0])
  1174. if len(sys.argv) < 9:
  1175. print '%s requires 8 parameters - got %d = %s' % (progname,len(sys.argv),sys.argv)
  1176. sys.exit(1)
  1177. parser = OptionParser(usage=u, version="%prog 0.01")
  1178. a = parser.add_option
  1179. a("-i","--infile",dest="infile")
  1180. a("-o","--oprefix",dest="opref")
  1181. a("-l","--plinkexe",dest="plinke", default=plinke)
  1182. a("-r","--rexe",dest="rexe", default=rexe)
  1183. a("-s","--snps",dest="htmlf")
  1184. #a("-m","--markerRaw",dest="markf")
  1185. #a("-r","--rawsubject",dest="subjf")
  1186. a("-p","--patho",dest="newfpath")
  1187. (options,args) = parser.parse_args()
  1188. basename = os.path.split(options.infile)[-1] # just want the file prefix to find the .xls files below
  1189. killme = string.punctuation + string.whitespace
  1190. trantab = string.maketrans(killme,'_'*len(killme))
  1191. opref = options.opref.translate(trantab)
  1192. alogh,alog = tempfile.mkstemp(suffix='.txt')
  1193. plogh,plog = tempfile.mkstemp(suffix='.txt')
  1194. alogf = open(alog,'w')
  1195. plogf = open(plog,'w')
  1196. ahtmlf = options.htmlf
  1197. amarkf = 'MarkerDetails_%s.xls' % opref
  1198. asubjf = 'SubjectDetails_%s.xls' % opref
  1199. newfpath = options.newfpath
  1200. newfpath = os.path.realpath(newfpath)
  1201. try:
  1202. os.makedirs(newfpath)
  1203. except:
  1204. pass
  1205. ofn = basename
  1206. bfn = options.infile
  1207. try:
  1208. mapf = '%s.bim' % bfn
  1209. maplist = file(mapf,'r').readlines()
  1210. maplist = [x.split() for x in maplist]
  1211. except:
  1212. maplist = None
  1213. alogf.write('## error - cannot open %s to read map - no offsets will be available for output files\n' % mapf)
  1214. #rerla@beast galaxy]$ head test-data/tinywga.bim
  1215. #22 rs2283802 0 21784722 4 2
  1216. #22 rs2267000 0 21785366 4 2
  1217. rgbin = os.path.split(rexe)[0] # get our rg bin path
  1218. #plinktasks = [' --freq',' --missing',' --mendel',' --hardy',' --check-sex'] # plink v1 fixes that bug!
  1219. # if we could, do all at once? Nope. Probably never.
  1220. plinktasks = [['--freq',],['--hwe 0.0', '--missing','--hardy'],
  1221. ['--mendel',],['--check-sex',]]
  1222. vclbase = [options.plinke,'--noweb','--out',basename,'--bfile',bfn,'--mind','1.0','--geno','1.0','--maf','0.0']
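# --mind 1.0, --geno 1.0 and --maf 0.0 effectively disable plink's default filters so every
# subject and marker shows up in the QC stats instead of being silently dropped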
  1223. runPlink(logf=plogf,plinktasks=plinktasks,cd=newfpath, vclbase=vclbase)
  1224. plinktasks = [['--bfile',bfn,'--indep-pairwise 40 20 0.5','--out %s' % basename],
  1225. ['--bfile',bfn,'--extract %s.prune.in --make-bed --out ldp_%s' % (basename, basename)],
  1226. ['--bfile ldp_%s --het --out %s' % (basename,basename)]]
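# --indep-pairwise 40 20 0.5 prunes on pairwise LD (window of 40 SNPs, step 20, r^2 > 0.5);
# the surviving .prune.in markers feed the --het inbreeding estimate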
  1227. # subset of ld independent markers for eigenstrat and other requirements
  1228. vclbase = [options.plinke,'--noweb']
  1229. plogout = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase)
  1230. plogf.write('\n'.join(plogout))
  1231. plogf.write('\n')
  1232. repout = os.path.join(newfpath,basename)
  1233. subjects,subjectTops = subjectRep(froot=repout,outfname=asubjf,newfpath=newfpath,
  1234. logf=alogf) # writes the subject_froot.xls file
  1235. markers,markerTops = markerRep(froot=repout,outfname=amarkf,newfpath=newfpath,
  1236. logf=alogf,maplist=maplist) # marker_froot.xls
  1237. nbreaks = 100
  1238. s = '## starting plotpage, newfpath=%s,m=%s,s=%s\n' % (newfpath,markers[:2],subjects[:2])
  1239. alogf.write(s)
  1240. print s
  1241. plotpage,cruft = makePlots(markers=markers,subjects=subjects,newfpath=newfpath,
  1242. basename=basename,nbreaks=nbreaks,height=10,width=8,rgbin=rgbin)
  1243. #plotpage = RmakePlots(markers=markers,subjects=subjects,newfpath=newfpath,basename=basename,nbreaks=nbreaks,rexe=rexe)
  1244. # [titles[n],plotnames[n],outfnames[n] ]
  1245. html = []
  1246. html.append('<table cellpadding="5" border="0">')
  1247. size = getfSize(amarkf,newfpath)
  1248. html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
  1249. (amarkf,'Click here to download the Marker QC Detail report file',size))
  1250. size = getfSize(asubjf,newfpath)
  1251. html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
  1252. (asubjf,'Click here to download the Subject QC Detail report file',size))
  1253. for (title,url,ofname) in plotpage:
  1254. ttitle = 'Ranked %s' % title
  1255. dat = subjectTops.get(ttitle,None)
  1256. if not dat:
  1257. dat = markerTops.get(ttitle,None)
  1258. imghref = '%s.jpg' % os.path.splitext(url)[0] # removes .pdf
  1259. thumbnail = os.path.join(newfpath,imghref)
  1260. if not os.path.exists(thumbnail): # for multipage pdfs, mogrify makes multiple jpgs - fugly hack
  1261. imghref = '%s-0.jpg' % os.path.splitext(url)[0] # try the first jpg
  1262. thumbnail = os.path.join(newfpath,imghref)
  1263. if not os.path.exists(thumbnail):
  1264. html.append('<tr><td colspan="3"><a href="%s">%s</a></td></tr>' % (url,title))
  1265. else:
  1266. html.append('<tr><td><a href="%s"><img src="%s" alt="%s" hspace="10" align="middle">' \
  1267. % (url,imghref,title))
  1268. if dat: # one or the other - write as an extra file and make a link here
  1269. t = '%s.xls' % (ttitle.replace(' ','_'))
  1270. fname = os.path.join(newfpath,t)
  1271. f = file(fname,'w')
  1272. f.write('\n'.join(['\t'.join(x) for x in dat])) # the report
  1273. size = getfSize(t,newfpath)
  1274. html.append('</a></td><td>%s</td><td><a href="%s">Worst data</a>%s</td></tr>' % (title,t,size))
  1275. else:
  1276. html.append('</a></td><td>%s</td><td>&nbsp;</td></tr>' % (title))
  1277. html.append('</table><hr><h3>All output files from the QC run are available below</h3>')
  1278. html.append('<table cellpadding="5" border="0">\n')
  1279. flist = os.listdir(newfpath) # we want to catch 'em all
  1280. flist.sort()
  1281. for f in flist:
  1282. fname = os.path.split(f)[-1]
  1283. size = getfSize(fname,newfpath)
  1284. html.append('<tr><td><a href="%s">%s</a>%s</td></tr>' % (fname,fname,size))
  1285. html.append('</table>')
  1286. alogf.close()
  1287. plogf.close()
  1288. llog = open(alog,'r').readlines()
  1289. plogfile = open(plog,'r').readlines()
  1290. os.unlink(alog)
  1291. os.unlink(plog)
  1292. llog += plogfile # add lines from pruneld log
  1293. lf = file(ahtmlf,'w') # galaxy will show this as the default view
  1294. lf.write(galhtmlprefix % progname)
  1295. s = '\n<div>Output from Rgenetics QC report tool run at %s<br>\n' % (timenow())
  1296. lf.write('<h4>%s</h4>\n' % s)
  1297. lf.write('</div><div><h4>(Click any preview image to download a full sized PDF version)</h4><br><ol>\n')
  1298. lf.write('\n'.join(html))
  1299. lf.write('<h4>QC run log contents</h4>')
  1300. lf.write('<pre>%s</pre>' % (''.join(llog))) # plink logs
  1301. if len(cruft) > 0:
  1302. lf.write('<h2>Blather from pdfnup follows:</h2><pre>%s</pre>' % (''.join(cruft))) # pdfnup
  1303. lf.write('%s\n<hr>\n' % galhtmlpostfix)
  1304. lf.close()