/tools/expression/utils.R

https://bitbucket.org/cistrome/cistrome-harvard/ · R · 1007 lines · 757 code · 78 blank · 172 comment · 140 complexity · b382b19424ccc8e22e818953152991d9 MD5 · raw file

  1. #################################################################################################
  2. ##
  3. #################################################################################################
  4. getSkipFromGPR <- function(file){
  5. con <- file(file,"r")
  6. twoLines <- readLines(con,n=2)
  7. close(con)
  8. if(substring(twoLines,1,3)[1]=="ATF"){
  9. skip <- substring(twoLines,1,2)
  10. }
  11. else{
  12. skip=c(0,-2)
  13. }
  14. as.numeric(skip[2])+2
  15. }
  16. ###########################################################################################
  17. ## creates a new GAL file from the information from the gpr files
  18. ###########################################################################################
  19. createGALFromGPR <- function(file,galfile="dummy.gal",colnames=c("Block","Column","Row","Name","ID")){
  20. table <- read.table(file,as.is=TRUE,comment.char="",check.names=FALSE,header=TRUE,skip=getSkipFromGPR(file),sep="\t")
  21. names <- colnames(table)
  22. result <- data.frame(matrix(ncol=5,nrow=length(table[,1])))
  23. colnames(result) <- colnames
  24. for(i in 1:length(result[1,])){
  25. result[,i] <- table[,colnames[i]]
  26. }
  27. #result[,1:5] <- table[,1:5]
  28. #colnames(result) <- names[1:5]
  29. write.table(file=galfile,result,sep="\t",row.name=FALSE,quote=FALSE)
  30. }
  31. plotHistogram <- function(R,G,r.col=2,g.col=3,breaks=NULL,log.transform=FALSE,add=FALSE,weights=NULL,return.breaks=FALSE,ylim=NULL,xlim=NULL,title=NULL,relative=FALSE,...){
  32. if(is.null(weights)){
  33. weights <- matrix(ncol=1,nrow=length(R))
  34. weights[,] <- 1
  35. }
  36. if(log.transform==TRUE){
  37. r <- log2(R)
  38. g <- log2(G)
  39. by <- 0.1
  40. toadd <- 0.1
  41. }
  42. else{
  43. r <- R
  44. g <- G
  45. by <- 50
  46. toadd <- 50
  47. }
  48. r <- r[weights==1]
  49. g <- g[weights==1]
  50. r <- r[!is.na(r)]
  51. g <- g[!is.na(g)]
  52. r <- r[!is.infinite(r)]
  53. g <- g[!is.infinite(g)]
  54. if(is.null(breaks)){
  55. breaks <- seq(min(0,(min(r,g))),(max(max(r),max(g))+toadd),by=by)
  56. }
  57. hr <- hist(r,plot=FALSE,breaks=breaks)
  58. hg <- hist(g,plot=FALSE,breaks=breaks)
  59. if(is.null(xlim)){
  60. xlim <- range(breaks)
  61. }
  62. r.counts <- hr$counts
  63. g.counts <- hg$counts
  64. ylabb <- "Counts"
  65. if(relative){
  66. r.counts <- r.counts / length(r)
  67. g.counts <- g.counts / length(g)
  68. ylabb <- "relative Counts"
  69. yy <- -0.001
  70. }
  71. else{
  72. yy <- -10
  73. }
  74. if(is.null(ylim)){
  75. ylim <- c(0,max(r.counts,g.counts))
  76. }
  77. if(!add){
  78. plot(hr$mids,r.counts,col=r.col,xlab="Intensity",ylab=ylabb,type="l",ylim=ylim,xlim=xlim,main=title,...)
  79. }
  80. else{
  81. points(hr$mids,r.counts,col=r.col,type="l",...)
  82. }
  83. points(hg$mids,g.counts,col=g.col,type="l",...)
  84. points(medianWithoutNA(r),yy,col=r.col,pch=3)
  85. points(medianWithoutNA(g),yy,col=g.col,pch=3)
  86. if(return.breaks){
  87. breaks
  88. }
  89. # hr <- density(r,na.rm=TRUE)
  90. # hg <- density(g,na.rm=TRUE)
  91. # xlim <- c(min(hr$x,hg$x),max(hr$x,hg$x))
  92. # ylim <- c(min(hr$y,hg$y),max(hr$y,hg$y))
  93. # plot(hr$x,hr$y,col=2,type="l",ylim=ylim,...)
  94. # points(hg$x,hg$y,col=3,type="l")
  95. # hr <- hist(r,breaks=breaks,plot=FALSE)
  96. # hg <- hist(g,breaks=breaks,plot=FALSE)
  97. }
  98. #############################################################################
  99. ##
  100. #############################################################################
  101. plotHistogramFromSlide <- function(lObject,slide=1,use.weights=TRUE,log.transform=FALSE,breaks=NULL,bg.lty=3,...){
  102. if(slide=="all"){
  103. }
  104. if(is.null(lObject$R)){
  105. R <- MAtoR(lObject$M[,slide],lObject$A[,slide])
  106. G <- MAtoG(lObject$M[,slide],lObject$A[,slide])
  107. Rb <- NULL
  108. Gb <- NULL
  109. draw.bg <- FALSE
  110. names <- colnames(lObject$M)
  111. }
  112. else{
  113. R <- lObject$R[,slide]
  114. G <- lObject$G[,slide]
  115. Rb <- lObject$Rb[,slide]
  116. Gb <- lObject$Gb[,slide]
  117. draw.bg <- !is.null(Rb)
  118. names <- colnames(lObject$R)
  119. }
  120. weights <- matrix(ncol=1,nrow=length(R))
  121. weights[,] <- 1
  122. if(is.null(lObject$weights)){
  123. }
  124. else{
  125. if(use.weights){
  126. weights <- lObject$weights[,slide]
  127. }
  128. }
  129. # r <- R[weights==1]
  130. # g <- G[weights==1]
  131. # rb <- Rb[weights==1]
  132. # gb <- Gb[weights==1]
  133. breaks <- plotHistogram(R=R,G=G,log.transform=log.transform,weights=weights,breaks=breaks,return.breaks=TRUE,title=names[slide],...)
  134. if(draw.bg){
  135. plotHistogram(R=Rb,G=Gb,log.transform=log.transform,breaks=NULL,weights=weights,lty=bg.lty,add=TRUE,...)
  136. }
  137. }
  138. checkExistantFile <- function(file,path=".",no.subdir=TRUE){
  139. file <- gsub(pattern="_",replacement=".",file)
  140. file <- gsub(pattern="/",replacement="-",file)
  141. counter <- 0
  142. suff <- NULL
  143. ending <- NULL
  144. ffile <- file
  145. splitted <- unlist(strsplit(file,split="\\."))
  146. if(length(splitted)>1){
  147. ending <- paste(".",splitted[length(splitted)],sep="")
  148. ffile <- paste(splitted[1:(length(splitted)-1)],collapse=".")
  149. #cat(ffile,"_",ending,"\n")
  150. }
  151. while(TRUE){
  152. newfile <- paste(ffile,suff,ending,sep="")
  153. if(file.exists(newfile)){
  154. #file does exist...
  155. counter <- counter +1
  156. suff <- paste("(",counter,")",sep="")
  157. }
  158. else{
  159. break
  160. }
  161. }
  162. newfile
  163. }
  164. saveMAList <- function(object,filename,remove.flagged=TRUE,na.value="NA",subset=NULL, allvalues=TRUE){
  165. genes <- NULL
  166. weights <- NULL
  167. g.length <- 0
  168. col.names <- NULL
  169. if(class(object)=="MAList"){
  170. if(is.null(dim(object$M))){
  171. object$M <- as.matrix(object$M)
  172. object$A <- as.matrix(object$A)
  173. if(!is.null(object$weights)){
  174. object$weights <- as.matrix(object$weights)
  175. }
  176. }
  177. if(is.null(subset)){
  178. subset <- rep(TRUE,nrow(object$M))
  179. }
  180. M <- as.matrix(object$M[subset,])
  181. CN <- colnames(object$M)
  182. if(is.null(CN)){
  183. CN <- object$targets[1,1]
  184. }
  185. colnames(M) <- CN
  186. # colnames(M) <- colnames(object$M)
  187. A <- as.matrix(object$A[subset,])
  188. colnames(A) <- CN
  189. # colnames(A) <- colnames(object$A)
  190. if(!is.null(object$weights)){
  191. weights <- as.matrix(object$weights[subset,])
  192. }
  193. if(!is.null(object$genes)){
  194. genes <- object$genes[subset,]
  195. if(!is.null(genes)){
  196. g.length <- ncol(genes)
  197. }
  198. }
  199. }
  200. else if(class(object)=="EexprSet" | class(object)=="MadbSet"){
  201. if(object@type=="TwoColor"){
  202. if(is.null(subset)){
  203. subset <- rep(TRUE,nrow(exprs(object)))
  204. }
  205. M <- getM(object)
  206. Colnames <- colnames(M)
  207. M <- as.matrix(M[subset,])
  208. colnames(M) <- Colnames
  209. # M <- as.matrix(getM(object)[subset,])
  210. # cat(class(M),colnames(M),"\n")
  211. # A <- as.matrix(getA(object)[subset,])
  212. A <- getA(object)
  213. A <- as.matrix(A[subset,])
  214. colnames(A) <- Colnames
  215. #cat("1")
  216. weights <- as.matrix(getWeights(object)[subset,])
  217. #cat("2")
  218. genes <- as.matrix(object@genes[subset,])
  219. if(!is.null(genes)){
  220. g.length <- ncol(genes)
  221. if(g.length==1){
  222. colnames(genes) <- "ID"
  223. }
  224. }
  225. #cat("g.length",g.length,"dim M",dim(M),"\n")
  226. #cat("3")
  227. }
  228. }
  229. else if(class(object)=="RGList"){
  230. if(is.null(subset)){
  231. subset <- rep(TRUE,nrow(object$R))
  232. }
  233. if(!.is.log(object$R)){
  234. object$R <- log2(object$R)
  235. }
  236. if(!.is.log(object$G)){
  237. object$G <- log2(object$G)
  238. }
  239. M <- object$R[subset,] - object$G[subset,]
  240. A <- 0.5*(object$R[subset,] + object$G[subset,])
  241. if(!is.null(object$weights)){
  242. weights <- object$weights[subset,]
  243. }
  244. if(!is.null(object$genes)){
  245. genes <- object$genes[subset,]
  246. if(!is.null(genes)){
  247. g.length <- ncol(genes)
  248. }
  249. }
  250. }
  251. else{
  252. stop("Objects from the class ",class(object)," can not be handled by this function\n")
  253. }
  254. #cat("ncol=",g.length+(ncol(M)*2),"nrow=",nrow=nrow(M),"\n")
  255. Table <- matrix(ncol=(g.length+(ncol(M)*2)),nrow=nrow(M))
  256. if(!is.null(genes)){
  257. Table[,1:ncol(genes)] <- as.matrix(genes)
  258. col.names <- colnames(genes)
  259. }
  260. #cat(colnames(M),"\n")
  261. for(i in 1:ncol(M)){
  262. col.names <- c(col.names,paste(colnames(M)[i],"M"),paste(colnames(A)[i],"A"))
  263. Table[,((i*2)-1+g.length)] <- M[,i]
  264. if(!is.null(weights) & remove.flagged){
  265. Table[weights[,i]==0,((i*2)-1+g.length)] <- NA
  266. }
  267. Table[is.na(Table[,((i*2)-1+g.length)]),((i*2)-1+g.length)] <- NA
  268. Table[,((i*2)+g.length)] <- A[,i]
  269. if(!is.null(weights) & remove.flagged){
  270. Table[weights[,i]==0,((i*2)+g.length)] <- NA
  271. }
  272. Table[is.na(Table[,((i*2)+g.length)]),((i*2)+g.length)] <- NA
  273. }
  274. colnames(Table) <- col.names
  275. if(allvalues){
  276. ## calculate the red and green intensities from the M and A values...
  277. R <- MAtoR(M, A)
  278. G <- MAtoG(M, A)
  279. if(is.null(ncol(R))){
  280. R <- matrix(R, ncol=1)
  281. G <- matrix(G, ncol=1)
  282. }
  283. colnames(R) <- paste(colnames(M), "R")
  284. colnames(G) <- paste(colnames(M), "G")
  285. Table <- cbind(Table, R, G)
  286. }
  287. write.table(Table,file=filename,sep="\t",na=na.value,row.names=FALSE,quote=FALSE)
  288. }
  289. ## closes all connections of gpr files...
  290. closeMyConnections <- function(pattern="gpr"){
  291. All <- showConnections(TRUE)
  292. what <- grep(All[,"description"],pattern=pattern)
  293. what <- what-1
  294. for(theCon in what){
  295. close(getConnection(theCon))
  296. }
  297. }
  298. checkGPRs <- function(files=NULL,overwrite=TRUE){
  299. ##
  300. # check for correct axon format...
  301. requiredcolumns <- c("Block","Column","Row","Name","ID","F635 Median","F635 Mean","B635 Median","B635 Mean","F532 Median","F532 Mean","B532 Median","B532 Mean","Flags")
  302. for(i in 1:length(files)){
  303. con <- file(files[i],"r")
  304. twoLines <- readLines(con,n=2)
  305. close(con)
  306. # twoLines <- scan(file=files[i], what="character",nlines=2, quiet=TRUE)
  307. # closeMyConnections()
  308. if(substring(twoLines,1,3)[1]!="ATF"){
  309. # is not in correct Axon file format
  310. cat("The file",files[i],"is not in correct Axon file format!\n")
  311. header <- read.table(files[i],header=TRUE,nrows=1,sep="\t",check.names=FALSE)
  312. if(sum(colnames(header) %in% requiredcolumns)==length(requiredcolumns)){
  313. BigFile <- readLines(con=files[i])
  314. writeLines(c("ATF\t1.0",paste("1\t",ncol(header),sep=""),"Provider=CARMAweb"),con=files[i])
  315. con <- file(files[i],"at")
  316. writeLines(con=con,BigFile)
  317. flush(con)
  318. close(con)
  319. cat("Successfully adjusted file",files[i],".\n")
  320. # closeMyConnections()
  321. }
  322. else{
  323. stop("The file",files[i],"is not in correct Axon file format! Was this file scanned with an Axon GenePix Scanner?\n")
  324. }
  325. }
  326. }
  327. skips <- NULL
  328. for(i in 1:length(files)){
  329. skips <- c(skips,getSkipFromGPR(files[i]))
  330. }
  331. if(length(unique(skips))!=1){
  332. cat("GPR files have different number of header rows!\nAdjusting GPR files\n")
  333. for(i in 1:length(files)){
  334. dummy <- read.table(files[i],skip=skips[i],header=TRUE,comment.char="",sep="\t",check.names=FALSE)
  335. #cat(colnames(dummy),"\n")
  336. write.table(dummy,file=files[i],sep="\t",quote=FALSE,row.names=FALSE)
  337. }
  338. checkGPRs(files) # just re-run the check once again...
  339. }
  340. }
  341. createM <- function(object,red.channels,green.channels){
  342. M <- matrix(ncol=length(red.channels),nrow=nrow(exprs(object)))
  343. weights <- getWeights(object)
  344. col.names <- NULL
  345. for(i in 1:ncol(M)){
  346. r.idx <- myGrep(colnames(exprs(object)),pattern=red.channels[i],exact.match=TRUE)
  347. g.idx <- myGrep(colnames(exprs(object)),pattern=green.channels[i],exact.match=TRUE)
  348. M[,i] <- getM(object,r=r.idx,g=g.idx)
  349. M[weights[,r.idx]==0 | weights[,g.idx]==0,i] <- NA
  350. col.names <- c(col.names,paste(red.channels[i]," vs ",green.channels[i],sep=""))
  351. }
  352. colnames(M) <- col.names
  353. M
  354. }
  355. createA <- function(object,red.channels,green.channels){
  356. A <- matrix(ncol=length(red.channels),nrow=nrow(exprs(object)))
  357. weights <- getWeights(object)
  358. col.names <- NULL
  359. for(i in 1:ncol(A)){
  360. r.idx <- myGrep(colnames(exprs(object)),pattern=red.channels[i],exact.match=TRUE)
  361. g.idx <- myGrep(colnames(exprs(object)),pattern=green.channels[i],exact.match=TRUE)
  362. A[,i] <- getA(object,r=r.idx,g=g.idx)
  363. A[weights[,r.idx]==0 | weights[,g.idx]==0,i] <- NA
  364. col.names <- c(col.names,paste(red.channels[i]," vs ",green.channels[i],sep=""))
  365. }
  366. colnames(A) <- col.names
  367. A
  368. }
  369. ##
  370. # to remove all those genes that have flags in
  371. excludeFromTest <- function(data,classlabels,weights=NULL,nr.present=2){
  372. if(!is.null(weights)){
  373. if(sum(dim(data)==dim(weights))==2){
  374. data <- .removeFlagged(data,weights)
  375. }
  376. }
  377. result <- apply(data,MARGIN=1,.dummyFun,cl=classlabels,np=nr.present)
  378. result
  379. }
  380. .removeFlagged <- function(data,weights,flagged.value=NA){
  381. for(i in 1:ncol(data)){
  382. data[weights[,i]==0,i] <- flagged.value
  383. }
  384. data
  385. }
  386. .dummyFun <- function(x,cl,np){
  387. res <- sum(!is.na(x[cl==0])) < np | sum(!is.na(x[cl==1])) < np
  388. res
  389. }
  390. tukey.biweight.mod <- function(x,c=5,epsilon=1e-4,na.rm=TRUE){
  391. x <- x[!is.na(x)]
  392. ret <- tukey.biweight(x,c=c,epsilon=epsilon)
  393. }
  394. if( !isGeneric("filterOnVariance") )
  395. setGeneric("filterOnVariance", function(object,variance=0,array.names=NULL,remove.flagged=TRUE,v=TRUE,...)
  396. standardGeneric("filterOnVariance"))
  397. setMethod("filterOnVariance","MadbSet",
  398. function(object,variance=0,array.names=NULL,remove.flagged=TRUE,v=TRUE,...){
  399. if(is.null(array.names)){
  400. array.names <- colnames(exprs(object))
  401. }
  402. E <- exprs(object)[,array.names]
  403. if(remove.flagged){
  404. weights <- getWeights(object)[,array.names]
  405. for(i in 1:ncol(weights)){
  406. E[weights[,i]==0,i] <- NA
  407. }
  408. }
  409. sds <- apply(E,MARGIN=1,sd,na.rm=TRUE)
  410. quant <- quantile(sds,variance,na.rm=TRUE)
  411. sel <- sds >= quant
  412. sel[is.na(sel)] <- FALSE
  413. if(v){
  414. cat(sum(sel)," features out of ",length(sel)," have a sd bigger than ",quant,"\n")
  415. }
  416. ret <- object
  417. exprs(ret) <- exprs(object)[sel,]
  418. ret@weights <- getWeights(object)[sel,]
  419. if(nrow(exprs(object))==nrow(object@genes)){
  420. if(ncol(object@genes)==1){
  421. G <- matrix(object@genes[sel,],ncol=1)
  422. rownames(G) <- rownames(exprs(ret))
  423. colnames(G) <- colnames(object@genes)
  424. ret@genes <- G
  425. }
  426. else{
  427. ret@genes <- object@genes[sel,]
  428. }
  429. }
  430. ret
  431. })
  432. ##
  433. # this function is a generalization of Robert Gentlemens GOHyperG funciton from the GOstats package.
  434. # it allows to submit all EntrezGenes present on the used microarray.
  435. # in principal this function performs the same steps as the GOHyperG function, nevertheless the results
  436. # are sligtly different (maybe because of the different annotation packages used:
  437. # mapping to GO terms using the affymetrix probe sets -> lokuslink -> GO terms (GOstats) or mapping the locuslink ids directly
  438. # to GO terms using the GO package like it is done here.)
  439. calculateGOHyperG <- function(x,allLLs=NULL,chip=NULL,what="MF",use.all=FALSE,v=TRUE){
  440. require(GO)
  441. require(GOstats)
  442. if (chip == "fly.db0.db"){chip = "fly.db0"}
  443. if(is.null(allLLs) & is.null(chip) & !use.all){
  444. stop("You have to submit either all EntrezGene IDs present on the array used, or the Affymetrix GeneChip used, or select use.all=TRUE to use all EntrezGenes available in GO!\n\n")
  445. }
  446. if(use.all){
  447. ##
  448. # just that the code below throws no exception...
  449. allLLs <- x
  450. }
  451. ##
  452. # if chip was submitted retrieve all LLs from the specific chip
  453. if(!is.null(chip)){
  454. require(chip,character.only=TRUE) || stop("need data package",chip)
  455. allLLs <- as.character(unlist(as.list(get(paste(chip,"LOCUSID",sep=""),mode="environment"))))
  456. }
  457. ##
  458. # prepare the x (myLLs) and allLLs. remove duplicates, NAs, use only those x that are present in allLLs
  459. # remove possible LL. or LL strings (old way of writing LocusLink IDs)
  460. x <- sub("LL.","",x)
  461. x <- sub("LL","",x)
  462. allLLs <- sub("LL.","",allLLs)
  463. allLLs <- sub("LL","",allLLs)
  464. allLLs <- unique(allLLs)
  465. allLLs <- allLLs[!is.na(allLLs) & allLLs!=""]
  466. x <- unique(x)
  467. x <- x[!is.na(x) & x!=""]
  468. ##
  469. # just interested in those x that are present in allLLs
  470. x <- allLLs[match(x,allLLs)]
  471. x <- x[!is.na(x) & x!=""]
  472. ##
  473. # do some checking...
  474. if(length(x)==0 || is.na(x)){
  475. stop("None of the submitted interesting genes are present in the list of all EntrezGene (LocusLink) IDs submitted!\nPossible causes:\n-wrong list of genes (wrong IDs) submitted\n-EntrezGene IDs of the interesting genes and list of EntrezGene IDs of all genes on the array are not present on the same microarray\n")
  476. }
  477. ##
  478. # ok, now map x to GO, retrieve all GO terms where one of the x EntrezGene IDs is present (using GOALLLOCUSID from the GO package)
  479. GOLL <- as.list(get("GOALLLOCUSID",mode="environment")) ## warning, GOALLLOCUSID is deprecated in newer versions, use GOALLENTREZID
  480. GOLL <- GOLL[!is.na(GOLL)] # just removing all the GO ids that are not mapped to any LL
  481. PresentGO <- sapply(GOLL,function(z){
  482. if(is.na(z) || length(z)==0)
  483. return(FALSE)
  484. any(x %in% z)
  485. }
  486. )
  487. ##
  488. # another checking...
  489. if(max(PresentGO)==0){
  490. stop("The submitted list of EntrezGene IDs of the interesting genes can not be mapped to any of the GO terms.\nMaybe the list contained no EntrezGene (LocusLink) IDs?\n")
  491. }
  492. if(v){
  493. cat("The",length(x),"submitted EntrezGene IDs map to",sum(PresentGO),"GO terms in all tree categories.\n")
  494. }
  495. GOLL <- GOLL[PresentGO]
  496. ##
  497. # next restrict the GO terms to the selected category (specified by "what")
  498. goCat <- unlist(getGOOntology(names(GOLL)))
  499. goodGO <- goCat == what
  500. goodGO[is.na(goodGO)] <- FALSE
  501. if(v){
  502. cat(sum(goodGO),"of them are in the GO category we are interested in.\n")
  503. }
  504. mm <- match(names(goCat),names(GOLL))
  505. mm <- mm[goodGO]
  506. GOLL <- GOLL[mm]
  507. ##
  508. # next we have to remove all the EntrezGene IDs from each GO term, that was not present on the microarray used.
  509. if(!use.all){
  510. myGOLL <- sapply(GOLL,function(z){
  511. if(length(z)==0 || is.na(z))
  512. return(NA)
  513. mylls <- unique(z[match(allLLs,z)])
  514. mylls <- mylls[!is.na(mylls)]
  515. mylls
  516. }
  517. )
  518. ##
  519. # removing all those GO terms that have no entries left (just in the case there are some...)
  520. empties <- sapply(myGOLL,function(z){length(z)==1 && is.na(z)})
  521. myGOLL <- myGOLL[!empties]
  522. GOLL <- GOLL[!empties]
  523. }
  524. else{
  525. ##
  526. # if use.all was selected, the proportion of the submitted EntrezGene IDs per GO term is compared to all
  527. # EntrezGenes mapped to that GO term.
  528. myGOLL <- GOLL
  529. }
  530. ##
  531. # the rest is the same as in the GOHyperG function...
  532. cLLs <- unique(unlist(myGOLL))
  533. nLL <- length(cLLs) # nr of unique LL IDs present in all GO terms
  534. goCounts <- sapply(myGOLL,length) # nr of LL IDs present on the array per GO term
  535. #allCounts <- sapply(GOLL,length) # nr of all LL IDs per GO term
  536. ##
  537. # some LL ids are represented more than once in a GO term...
  538. uniqueAllCounts <- sapply(GOLL,
  539. function(z){
  540. if(length(z)==0 || is.na(z))
  541. return(0)
  542. lls <- unique(z)
  543. lls <- lls[!is.na(lls)]
  544. return(length(lls))
  545. })
  546. whGood <- x[x %in% cLLs] # those LLs from the interesting ones that are present in the GO terms selected
  547. nInt <- length(whGood)
  548. if(nInt==0){
  549. warning("No interesting genes left")
  550. }
  551. MappingGOLL <- lapply(myGOLL,function(z){whGood[whGood %in% z]})
  552. useCts <- sapply(myGOLL,function(z){sum(whGood %in% z)}) # nr of interesting LLs per GO term
  553. ##
  554. # calculate the hypergeometric p values...
  555. pvs <- phyper(useCts-1,nInt,nLL-nInt,goCounts,lower.tail=FALSE)
  556. ord <- order(pvs)
  557. return(
  558. list(pvalues=pvs[ord],goCounts=goCounts[ord],intCounts=useCts[ord],allCounts=uniqueAllCounts[ord],GOLL=myGOLL[ord],intMapping=MappingGOLL[ord],numLL=nLL,numInt=nInt,intLLs=x)
  559. )
  560. }
  561. ################
  562. ## get EntrezGenes for the specified GO term
  563. getEntrezIDsFromGO <- function(GO, package="GO", entrezgenes){
  564. match.arg(package, c("GO", "humanLLMappings", "mouseLLMappings", "ratLLMappings"))
  565. require(package, character.only=TRUE)
  566. if(package=="GO"){
  567. ALLIDs <- mget(GO, env=GOALLENTREZID, ifnotfound=NA)
  568. if(!missing(entrezgenes)){
  569. ## subset to the specified entrezgene ids.
  570. ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
  571. }
  572. return(ALLIDs)
  573. }
  574. if(package=="humanLLMappings"){
  575. ALLIDs <- mget(GO, env=humanLLMappingsGO2LL, ifnotfound=NA)
  576. if(!missing(entrezgenes)){
  577. ## subset to the specified entrezgene ids.
  578. ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
  579. }
  580. return(ALLIDs)
  581. }
  582. if(package=="mouseLLMappings"){
  583. ALLIDs <- mget(GO, env=mouseLLMappingsGO2LL, ifnotfound=NA)
  584. if(!missing(entrezgenes)){
  585. ## subset to the specified entrezgene ids.
  586. ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
  587. }
  588. return(ALLIDs)
  589. }
  590. if(package=="ratLLMappings"){
  591. ALLIDs <- mget(GO, env=ratLLMappingsGO2LL, ifnotfound=NA)
  592. if(!missing(entrezgenes)){
  593. ## subset to the specified entrezgene ids.
  594. ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
  595. }
  596. return(ALLIDs)
  597. }
  598. }
  599. #getLLGOmapping <- function(x,what="MF"){
  600. # gos <- mget(x, env=GOLOCUSID2GO,ifnotfound=NA)
  601. # if(length(gos)==1)
  602. # bd = is.na(gos[[1]])
  603. # else
  604. # bd = is.na(gos)
  605. # gos <- gos[!bd]
  606. # mymapping <- lapply(gos,getOntology,ontology=what)
  607. # emptygos <- sapply(mymapping,function(x) length(x)==0 )
  608. # mymapping <- mymapping[!emptygos]
  609. # mymapping
  610. #}
  611. #getGOFromLL <- function(x,Ontology="MF"){
  612. # require(GO) || stop("no GO library")
  613. # match.arg(Ontology, c("MF", "BP", "CC"))
  614. # goterms <- mget(x, env = GOLOCUSID2GO, ifnotfound=NA)
  615. # if(length(goterms)==1){
  616. # goterms <- goterms[!is.na(goterms[[1]])]
  617. # }
  618. # else{
  619. # goterms <- goterms[!is.na(goterms)]
  620. # }
  621. # goterms <- lapply(goterms, function(x) x[sapply(x,
  622. # function(x) {if (is.na(x$Ontology) )
  623. # return(FALSE)
  624. # else
  625. # x$Ontology == Ontology})])
  626. # goids <- NULL
  627. # if(length(goterms)==0){
  628. # stop("The submitted list of EntrezGene IDs of the interesting genes can not be mapped to any of the GO terms.\nMaybe the list contained no EntrezGene (LocusLink) IDs?\n")
  629. # }
  630. # for(i in 1:length(goterms)){
  631. # goids <- c(goids,unique(unlist(sapply(goterms[[i]], function(x) x$GOID))))
  632. # }
  633. # goids
  634. #}
  635. #
  636. ##
  637. # check if package is available
  638. isAnnotationAvailable <- function(chip,develOK=FALSE){
  639. ok = FALSE
  640. if (chip == "fly.db0.db"){chip = "fly.db0"}
  641. # if(!require("reposTools",character.only=TRUE)){
  642. # warning("reposTools package is not installed.\n")
  643. # if(!require(chip,character.only=TRUE)){
  644. # ok = FALSE
  645. # }
  646. # else{
  647. # ok = TRUE
  648. # }
  649. # }
  650. # else{
  651. if(!require(chip,character.only=TRUE)){
  652. # source("http://www.bioconductor.org/getBioC.R")
  653. ## library(reposTools)
  654. ## # have to install the package
  655. ## tryCatch(install.packages2(pkg=chip,develOK=develOK,getAllDeps=TRUE),error=return(FALSE))
  656. # try(getBioC(chip))
  657. # if(!require(chip,character.only=TRUE)){
  658. # ok=FALSE
  659. # }
  660. # else{
  661. # ok=TRUE
  662. # }
  663. }
  664. else{
  665. ok = TRUE
  666. }
  667. # }
  668. ok
  669. }
  670. saveSAMResults <- function(SAM,table,delta,filename){
  671. dummy <- print(SAM)
  672. for(i in 1:nrow(dummy)){
  673. saveSAMResult(SAM,table,delta=dummy[i,"Delta"],filename=filename)
  674. }
  675. }
  676. saveSAMResult <- function(SAM, table,delta,filename){
  677. sum.SAM <- (summary(SAM,delta))@mat.sig
  678. if(!is.null(sum.SAM) & nrow(sum.SAM)>0){
  679. table.sub <- table[sum.SAM[,"Row"],]
  680. if(class(table)=="aafTable"){
  681. for(i in 2:(ncol(sum.SAM))){
  682. table.sub <- merge(table.sub,aafTable(sum.SAM[,i],colnames=colnames(sum.SAM)[i]))
  683. }
  684. Filename <- checkExistantFile(paste(filename,"-delta-",delta,".txt",sep=""))
  685. saveText(table.sub,filename=Filename)
  686. }
  687. else{
  688. if(nrow(sum.SAM)==1){
  689. table.sub <- matrix(table.sub,nrow=1)
  690. colnames(table.sub) <- colnames(table)
  691. }
  692. table.sub <- cbind(as.matrix(table.sub),as.matrix(sum.SAM[,2:ncol(sum.SAM)]))
  693. Filename <- checkExistantFile(paste(filename,"-delta-",delta,".txt",sep=""))
  694. write.table(table.sub,file=Filename,sep="\t",quote=FALSE,row.names=FALSE)
  695. }
  696. cat(nrow(table.sub),"significant genes (delta=",delta,") were saved to the file:",Filename,"\n")
  697. }
  698. else{
  699. cat("No significant genes with a delta of",delta,"\n")
  700. }
  701. }
  702. ###
  703. # fills missing spots
  704. fillMissingSpots <- function(object){
  705. ret <- object
  706. if(class(ret)=="RGList"){
  707. if(!is.null(ret$printer)){
  708. layout <- object$printer
  709. nr.row <- layout$ngrid.r * layout$ngrid.c * layout$nspot.r * layout$nspot.c
  710. missing <- (nr.row) - nrow(object$R)
  711. if(missing!=0){
  712. cat("Have to fill in",missing,"missing spots\n")
  713. index <- as.numeric(apply(object$genes,MARGIN=1,FUN=.getIndexFromBCR,nspot.r=layout$nspot.r,nspot.c=layout$nspot.c))
  714. R <- matrix(ncol=ncol(object$R),nrow=nr.row)
  715. colnames(R) <- colnames(object$R)
  716. rownames(R) <- 1:nrow(R)
  717. ## check if length of index and nrow of R are the same (<- problem with custom GAL files...)
  718. if(length(index)!=nrow(object$R)){
  719. stop("ERROR: number of data values does not match number of annotations!\nPlease try again without specifying a GAL file!")
  720. }
  721. R[index,] <- object$R
  722. ret$R <- R
  723. R[index,] <- object$G
  724. ret$G <- R
  725. if(!is.null(object$Rb)){
  726. R[index,] <- object$Rb
  727. ret$Rb <- R
  728. R[index,] <- object$Gb
  729. ret$Gb <- R
  730. }
  731. if(!is.null(object$weights)){
  732. R[,] <- 0
  733. R[index,] <- object$weights
  734. ret$weights <- R
  735. }
  736. else{
  737. R[,] <- 0
  738. R[index,] <- 1
  739. ret$weights <- R
  740. }
  741. rm(R)
  742. g <- gc()
  743. Genes = data.frame(matrix(ncol=ncol(object$genes),nrow=nr.row))
  744. colnames(Genes) <- colnames(object$genes)
  745. rownames(Genes) <- 1:nrow(Genes)
  746. # cat("nrow Genes",nrow(Genes),"ncol Genes",ncol(Genes),"length index",length(index),"\n")
  747. Genes[index,1:ncol(Genes)] = object$genes[,1:ncol(Genes)]
  748. # cat("genes filled\n","ncol= ",ncol(Genes)," nrow: ",nrow(Genes),"\n")
  749. ret$genes = NULL
  750. ret$genes = Genes
  751. # ret$genes = Genes
  752. # cat("genes set\n")
  753. rm(Genes)
  754. g <- gc()
  755. }
  756. }
  757. }
  758. ret
  759. }
  760. .getIndexFromBCR <- function(x,nspot.c,nspot.r){
  761. ((as.numeric(x["Block"])-1)*nspot.c*nspot.r) + ((as.numeric(x["Row"])-1)*nspot.c) + as.numeric(x["Column"])
  762. }
  763. if( !isGeneric("drawBoxplot") )
  764. setGeneric("drawBoxplot", function(x,new.plot=TRUE,xlim=NULL,at=0,exclude.flagged=FALSE,...)
  765. standardGeneric("drawBoxplot"))
  766. setMethod("drawBoxplot","MadbSet",
  767. function(x,new.plot=TRUE,xlim=NULL,at=0,exclude.flagged=FALSE,...){
  768. if(exclude.flagged){
  769. W <- getWeights(x)
  770. for(i in 1:ncol(exprs(x))){
  771. exprs(x)[W[,i]==0,i] <- NA
  772. }
  773. }
  774. boxplots(x=exprs(x),new.plot=new.plot,xlim=xlim,at=at,...)
  775. }
  776. )
  777. boxplots <- function(x,new.plot=TRUE,xlim=NULL,at=0,col=NULL,log2.transform=FALSE,ylim=NULL,...){
  778. data <- x
  779. # if(class(x)=="exprSet" | class(x)=="EexprSet"){
  780. # data <- exprs(x)
  781. # }
  782. if(log2.transform){
  783. data <- log2(data)
  784. }
  785. if(is.null(ncol(data))){
  786. data <- as.matrix(data)
  787. }
  788. ## remove those nasty infinite values
  789. for(i in 1:ncol(data)){
  790. data[is.infinite(data[,i]),i] <- NA
  791. }
  792. CN <- colnames(data)
  793. if(is.null(CN)){
  794. CN <- 1:ncol(data)
  795. }
  796. if(is.null(col)){
  797. col=rep(0,ncol(data))
  798. }
  799. if(length(col)!=ncol(data)){
  800. col=rep(col[1],ncol(data))
  801. }
  802. if(new.plot){
  803. if(is.null(xlim)){
  804. xlim=c(0.5,(ncol(data)+0.5))
  805. }
  806. par(omi=0.7*c(min(2,max(strwidth(CN,units="i"))),0,0,0),cex.axis=0.7)
  807. if(is.null(ylim)){
  808. ylim=c(min(data,na.rm=TRUE),max(data,na.rm=TRUE))
  809. }
  810. plot(1,1,pch=NA,ylab="expression",xaxt="n",ylim=ylim,xlim=xlim,xlab=NA)
  811. }
  812. axis(side=1,labels=CN,at=(1+at):(ncol(data)+at),las=3)
  813. for(i in 1:ncol(data)){
  814. boxplot(data[,i],at=(i+at),add=TRUE,col=col[i],range=0,...)
  815. }
  816. }
  817. plotRankedVariance <- function(data,subset=NULL,cut){
  818. if(!is.null(subset)){
  819. data <- data[,subset]
  820. }
  821. if(!.is.log(data)){
  822. data <- log2(data)
  823. }
  824. SD <- apply(data,MARGIN=1,sd,na.rm=TRUE)
  825. CUT <- quantile(SD,cut,na.rm=TRUE)
  826. plot(sort(SD),pch=16,cex=0.33,ylab="sorted variance")
  827. abline(h=CUT,col=2)
  828. text((length(SD)/20),(CUT+0.2),label=format(CUT,digits=4),col=2)
  829. }
  830. plotSortedPValues <- function(x,top=NULL,supported.cols=c("rawp","adjp","minP","maxT","BH","BY","Bonferroni","Hochberg","Holm","SidakSS","SidakSD"),...){
  831. if(is.null(nrow(x))){
  832. stop("Please submit a matrix, data.frame or instance of aafTable!\n")
  833. }
  834. if(!is.null(top)){
  835. top <- min(c(nrow(x),top))
  836. }
  837. else{
  838. top <- nrow(x)
  839. }
  840. Columns <- supported.cols[supported.cols %in% colnames(x)]
  841. if(length(Columns)==0){
  842. stop("The submitted matrix must contain at least one column with p-values that is named according to one of the supported column names!")
  843. }
  844. if(class(x)=="aafTable"){
  845. require(annaffy)
  846. #have to do some transformations...
  847. x.new <- matrix(ncol=length(Columns),nrow=nrow(x))
  848. colnames(x.new) <- Columns
  849. for(i in Columns){
  850. x.new[,i] <- as.numeric(getText(x[[i]]))
  851. }
  852. x <- x.new
  853. }
  854. #ok, now i can start drawing...
  855. x <- apply(x[,Columns],MARGIN=2,sort,na.last=TRUE)
  856. Max <- max(x[1:top,],na.rm=TRUE)
  857. plot(x[1:top,Columns[1]],xlab="number of significant genes",ylab="sorted p-value",col=1,ylim=c(0,Max),main=paste("Top",top,"sorted p-values"),...)
  858. if(length(Columns)>1){
  859. for(i in 2:length(Columns)){
  860. points(x[1:top,Columns[i]],col=i,...)
  861. }
  862. }
  863. legend(top-top/4,Max,Columns,col=1:length(Columns),pch=16)
  864. }
  865. .is.log <- function(data,maxi=100){
  866. data <- data[!is.na(data)]
  867. data <- data[!is.infinite(data)]
  868. # cat("Try to check wheter or not the values are in log scale...")
  869. if(max(data>maxi)){
  870. log.values = FALSE
  871. # cat("signals are not in log2 scale, transforming...\n")
  872. }
  873. else{
  874. log.values = TRUE
  875. # cat("signals are already in log2 scale \n")
  876. }
  877. log.values
  878. }
  879. toNumeric <- function(x,Sub=","){
  880. if(!is.null(dim(x))){
  881. RN <- rownames(x)
  882. if(class(x[1,])=="numeric"){
  883. return(x)
  884. }
  885. if(!is.null(Sub)){
  886. for(s in Sub){
  887. x <- apply(x,MARGIN=2,sub,pattern=s,replacement="")
  888. }
  889. }
  890. x <- tryCatch(apply(x,MARGIN=2,as.numeric),error=function(error){stop("ERROR: Can not perform calculations on non-numeric data! Please submit only numeric values.")},warning=function(warning){stop("ERROR: Can not perform calculations on non-numeric data! Please submit only numeric values.")})
  891. rownames(x) <- RN
  892. x
  893. }
  894. }
  895. ####
  896. ## function similar to PWamat function from the annotate package,
  897. ## generates a matrix with rows unique LL ids, columns KEGG pathways
  898. ## containing 1 if gene can be associated with the pathway, 0 otherwise.
  899. makeLL2KEGGMappingMatrix <- function(x,nrGenes=0,verbose=TRUE){
  900. if(is.null(x)){
  901. stop("No EntrezGene (LocusLink) IDs have been submitted!\n")
  902. }
  903. require(KEGG)
  904. x <- unique(as.character(x))
  905. paths <- mget(x,KEGGEXTID2PATHID,ifnotfound=NA)
  906. not_found <- sapply(paths,function(x){if(length(x)==1 && is.na(x)) TRUE else FALSE})
  907. paths <- paths[!not_found]
  908. if(verbose){
  909. cat(length(paths),"EntrezGene IDs from the inputlist can be associated with at least one KEGG pathway.\n",sum(not_found),"IDs can not be associated with KEGG pathways.\n")
  910. }
  911. unique_paths <- unique(unlist(paths))
  912. Test <- sapply(paths,function(x){
  913. mtch <- match(x,unique_paths)
  914. res <- rep(0,length(unique_paths))
  915. res[mtch] <- 1
  916. res
  917. })
  918. Test <- t(Test)
  919. rownames(Test) <- x
  920. colnames(Test) <- unique_paths
  921. if(nrGenes>0){
  922. to_exclude <- colSums(Test) < nrGenes
  923. if(verbose){
  924. cat(sum(to_exclude),"Pathways are excluded, because less than",nrGenes,"genes can be mapped to them.\n")
  925. }
  926. Test <- Test[,!to_exclude]
  927. }
  928. return(Test)
  929. }