/test-data/rgtestouts/rgManQQ/rgManQQtest1.R

https://bitbucket.org/cistrome/cistrome-harvard/ · R · 195 lines · 157 code · 13 blank · 25 comment · 35 complexity · 47f8561eb86dd7290164372e91c26efd MD5 · raw file

  1. # generalised so 3 core fields passed as parameters ross lazarus March 24 2010 for rgenetics
  2. # Originally created as qqman with the following
  3. # attribution:
  4. #--------------
  5. # Stephen Turner
  6. # http://StephenTurner.us/
  7. # http://GettingGeneticsDone.blogspot.com/
  8. # Last updated: 19 July 2011 by Ross Lazarus
  9. # R code for making manhattan plots and QQ plots from plink output files.
  10. # With GWAS data this can take a lot of memory. Recommended for use on
  11. # 64bit machines only, for now.
  12. #
  13. library(ggplot2)
  14. coloursTouse = c('firebrick','darkblue','goldenrod','darkgreen')
  15. # not too ugly but need a colour expert please...
  16. DrawManhattan = function(pvals=Null,chrom=Null,offset=Null,title=NULL, max.y="max",suggestiveline=0, genomewide=T, size.x.labels=9,
  17. size.y.labels=10, annotate=F, SNPlist=NULL,grey=0) {
  18. if (annotate & is.null(SNPlist)) stop("You requested annotation but provided no SNPlist!")
  19. genomewideline=NULL # was genomewideline=-log10(5e-8)
  20. n = length(pvals)
  21. if (genomewide) { # use bonferroni since might be only a small region?
  22. genomewideline = -log10(0.05/n) }
  23. offset = as.integer(offset)
  24. if (n > 1000000) { offset = offset/10000 }
  25. else if (n > 10000) { offset = offset/1000}
  26. chro = as.integer(chrom) # already dealt with X and friends?
  27. pvals = as.double(pvals)
  28. d=data.frame(CHR=chro,BP=offset,P=pvals)
  29. if ("CHR" %in% names(d) & "BP" %in% names(d) & "P" %in% names(d) ) {
  30. d=d[!is.na(d$P), ]
  31. d=d[!is.na(d$BP), ]
  32. d=d[!is.na(d$CHR), ]
  33. #limit to only chrs 1-22, x=23,y=24,Mt=25?
  34. d=d[d$CHR %in% 1:25, ]
  35. d=d[d$P>0 & d$P<=1, ]
  36. d$logp = as.double(-log10(d$P))
  37. dlen = length(d$P)
  38. d$pos=NA
  39. ticks=NULL
  40. lastbase=0
  41. chrlist = unique(d$CHR)
  42. chrlist = as.integer(chrlist)
  43. chrlist = sort(chrlist) # returns lexical ordering
  44. if (max.y=="max") { maxy = ceiling(max(d$logp)) }
  45. else { maxy = max.y }
  46. nchr = length(chrlist) # may be any number?
  47. maxy = max(maxy,1.1*genomewideline)
  48. if (nchr >= 2) {
  49. for (x in c(1:nchr)) {
  50. i = chrlist[x] # need the chrom number - may not == index
  51. if (x == 1) { # first time
  52. d[d$CHR==i, ]$pos = d[d$CHR==i, ]$BP # initialize to first BP of chr1
  53. dsub = subset(d,CHR==i)
  54. dlen = length(dsub$P)
  55. lastbase = max(dsub$pos) # last one
  56. tks = d[d$CHR==i, ]$pos[floor(length(d[d$CHR==i, ]$pos)/2)+1]
  57. lastchr = i
  58. } else {
  59. d[d$CHR==i, ]$pos = d[d$CHR==i, ]$BP+lastbase # one humongous contig
  60. if (sum(is.na(lastchr),is.na(lastbase),is.na(d[d$CHR==i, ]$pos))) {
  61. cat(paste('manhattan: For',title,'chrlistx=',i,'lastchr=',lastchr,'lastbase=',lastbase,'pos=',d[d$CHR==i,]$pos))
  62. }
  63. tks=c(tks, d[d$CHR==i, ]$pos[floor(length(d[d$CHR==i, ]$pos)/2)+1])
  64. lastchr = i
  65. dsub = subset(d,CHR==i)
  66. lastbase = max(dsub$pos) # last one
  67. }
  68. ticklim=c(min(d$pos),max(d$pos))
  69. xlabs = chrlist
  70. }
  71. } else { # nchr is 1
  72. nticks = 10
  73. last = max(d$BP)
  74. first = min(d$BP)
  75. tks = c(first)
  76. t = (last-first)/nticks # units per tick
  77. for (x in c(1:(nticks))) {
  78. tks = c(tks,round(x*t)+first) }
  79. ticklim = c(first,last)
  80. } # else
  81. if (grey) {mycols=rep(c("gray10","gray60"),max(d$CHR))
  82. } else {
  83. mycols=rep(coloursTouse,max(d$CHR))
  84. }
  85. dlen = length(d$P)
  86. d$pranks = rank(d$P)/dlen
  87. d$centiles = 100*d$pranks # small are interesting
  88. d$sizes = ifelse((d$centile < 1),2,1)
  89. if (annotate) d.annotate=d[as.numeric(substr(d$SNP,3,100)) %in% SNPlist, ]
  90. if (nchr >= 2) {
  91. manplot=qplot(pos,logp,data=d, ylab=expression(-log[10](italic(p))) , colour=factor(CHR),size=factor(sizes))
  92. manplot=manplot+scale_x_continuous(name="Chromosome", breaks=tks, labels=xlabs,limits=ticklim)
  93. manplot=manplot+scale_size_manual(values = c(0.5,1.5)) # requires discreet scale - eg factor
  94. #manplot=manplot+scale_size(values=c(0.5,2)) # requires continuous
  95. }
  96. else {
  97. manplot=qplot(BP,logp,data=d, ylab=expression(-log[10](italic(p))) , colour=factor(CHR))
  98. manplot=manplot+scale_x_continuous(name=paste("Chromosome",chrlist[1]), breaks=tks, labels=tks,limits=ticklim)
  99. }
  100. manplot=manplot+scale_y_continuous(limits=c(0,maxy), breaks=1:maxy, labels=1:maxy)
  101. manplot=manplot+scale_colour_manual(value=mycols)
  102. if (annotate) { manplot=manplot + geom_point(data=d.annotate, colour=I("green3")) }
  103. manplot=manplot + opts(legend.position = "none")
  104. manplot=manplot + opts(title=title)
  105. manplot=manplot+opts(
  106. panel.background=theme_blank(),
  107. axis.text.x=theme_text(size=size.x.labels, colour="grey50"),
  108. axis.text.y=theme_text(size=size.y.labels, colour="grey50"),
  109. axis.ticks=theme_segment(colour=NA)
  110. )
  111. if (suggestiveline) manplot=manplot+geom_hline(yintercept=suggestiveline,colour="blue", alpha=I(1/3))
  112. if (genomewideline) manplot=manplot+geom_hline(yintercept=genomewideline,colour="red")
  113. manplot
  114. } else {
  115. stop("Make sure your data frame contains columns CHR, BP, and P")
  116. }
  117. }
  118. qq = function(pvector, title=NULL, spartan=F) {
  119. # Thanks to Daniel Shriner at NHGRI for providing this code for creating expected and observed values
  120. o = -log10(sort(pvector,decreasing=F))
  121. e = -log10( 1:length(o)/length(o) )
  122. # you could use base graphics
  123. # plot(e,o,pch=19,cex=0.25, xlab=expression(Expected~~-log[10](italic(p))),
  124. # ylab=expression(Observed~~-log[10](italic(p))), xlim=c(0,max(e)), ylim=c(0,max(e)))
  125. # lines(e,e,col="red")
  126. #You'll need ggplot2 installed to do the rest
  127. qq=qplot(e,o, xlim=c(0,max(e)), ylim=c(0,max(o))) + stat_abline(intercept=0,slope=1, col="red")
  128. qq=qq+opts(title=title)
  129. qq=qq+scale_x_continuous(name=expression(Expected~~-log[10](italic(p))))
  130. qq=qq+scale_y_continuous(name=expression(Observed~~-log[10](italic(p))))
  131. if (spartan) plot=plot+opts(panel.background=theme_rect(col="grey50"), panel.grid.minor=theme_blank())
  132. qq
  133. }
  134. rgqqMan = function(infile="/data/tmp/tmpNaxDwH/database/files/000/dataset_1.dat",chromcolumn=2, offsetcolumn=3, pvalscolumns=c(8),
  135. title="rgManQQtest1",grey=0) {
  136. rawd = read.table(infile,head=T,sep='\t')
  137. dn = names(rawd)
  138. cc = dn[chromcolumn]
  139. oc = dn[offsetcolumn]
  140. rawd[,cc] = sub('chr','',rawd[,cc],ignore.case = T) # just in case
  141. rawd[,cc] = sub(':','',rawd[,cc],ignore.case = T) # ugh
  142. rawd[,cc] = sub('X',23,rawd[,cc],ignore.case = T)
  143. rawd[,cc] = sub('Y',24,rawd[,cc],ignore.case = T)
  144. rawd[,cc] = sub('Mt',25,rawd[,cc], ignore.case = T)
  145. nams = c(cc,oc) # for sorting
  146. plen = length(rawd[,1])
  147. print(paste('###',plen,'values read from',infile,'read - now running plots',sep=' '))
  148. rawd = rawd[do.call(order,rawd[nams]),]
  149. # mmmf - suggested by http://onertipaday.blogspot.com/2007/08/sortingordering-dataframe-according.html
  150. # in case not yet ordered
  151. if (plen > 0) {
  152. for (pvalscolumn in pvalscolumns) {
  153. if (pvalscolumn > 0)
  154. {
  155. cname = names(rawd)[pvalscolumn]
  156. mytitle = paste('p=',cname,', ',title,sep='')
  157. myfname = chartr(' ','_',cname)
  158. myqqplot = qq(rawd[,pvalscolumn],title=mytitle)
  159. ggsave(filename=paste(myfname,"qqplot.png",sep='_'),myqqplot,width=8,height=6,dpi=96)
  160. ggsave(filename=paste(myfname,"qqplot.pdf",sep='_'),myqqplot,width=8,height=6,dpi=96)
  161. print(paste('## qqplot on',cname,'done'))
  162. if ((chromcolumn > 0) & (offsetcolumn > 0)) {
  163. print(paste('## manhattan on',cname,'starting',chromcolumn,offsetcolumn,pvalscolumn))
  164. mymanplot= DrawManhattan(chrom=rawd[,chromcolumn],offset=rawd[,offsetcolumn],pvals=rawd[,pvalscolumn],title=mytitle,grey=grey)
  165. ggsave(filename=paste(myfname,"manhattan.png",sep='_'),mymanplot,width=8,height=6,dpi=96)
  166. ggsave(filename=paste(myfname,"manhattan.pdf",sep='_'),mymanplot,width=8,height=6,dpi=96)
  167. print(paste('## manhattan plot on',cname,'done'))
  168. }
  169. else {
  170. print(paste('chrom column =',chromcolumn,'offset column = ',offsetcolumn,
  171. 'so no Manhattan plot - supply both chromosome and offset as numerics for Manhattan plots if required'))
  172. }
  173. }
  174. else {
  175. print(paste('pvalue column =',pvalscolumn,'Cannot parse it so no plots possible'))
  176. }
  177. } # for pvalscolumn
  178. } else { print('## Problem - no values available to plot - was there really a chromosome and offset column?') }
  179. }
  180. rgqqMan()
  181. # execute with defaults as substituted
  182. #R script autogenerated by rgenetics/rgutils.py on 04/08/2011 11:46:16