/tools/expression/utils.R
https://bitbucket.org/cistrome/cistrome-harvard/ · R · 1007 lines · 757 code · 78 blank · 172 comment · 140 complexity · b382b19424ccc8e22e818953152991d9 MD5 · raw file
- #################################################################################################
- ##
- #################################################################################################
- getSkipFromGPR <- function(file){
- con <- file(file,"r")
- twoLines <- readLines(con,n=2)
- close(con)
- if(substring(twoLines,1,3)[1]=="ATF"){
- skip <- substring(twoLines,1,2)
- }
- else{
- skip=c(0,-2)
- }
- as.numeric(skip[2])+2
- }
- ###########################################################################################
- ## creates a new GAL file from the information from the gpr files
- ###########################################################################################
- createGALFromGPR <- function(file,galfile="dummy.gal",colnames=c("Block","Column","Row","Name","ID")){
- table <- read.table(file,as.is=TRUE,comment.char="",check.names=FALSE,header=TRUE,skip=getSkipFromGPR(file),sep="\t")
- names <- colnames(table)
- result <- data.frame(matrix(ncol=5,nrow=length(table[,1])))
- colnames(result) <- colnames
- for(i in 1:length(result[1,])){
- result[,i] <- table[,colnames[i]]
- }
- #result[,1:5] <- table[,1:5]
- #colnames(result) <- names[1:5]
- write.table(file=galfile,result,sep="\t",row.name=FALSE,quote=FALSE)
- }
- plotHistogram <- function(R,G,r.col=2,g.col=3,breaks=NULL,log.transform=FALSE,add=FALSE,weights=NULL,return.breaks=FALSE,ylim=NULL,xlim=NULL,title=NULL,relative=FALSE,...){
- if(is.null(weights)){
- weights <- matrix(ncol=1,nrow=length(R))
- weights[,] <- 1
- }
- if(log.transform==TRUE){
- r <- log2(R)
- g <- log2(G)
- by <- 0.1
- toadd <- 0.1
- }
- else{
- r <- R
- g <- G
- by <- 50
- toadd <- 50
- }
- r <- r[weights==1]
- g <- g[weights==1]
- r <- r[!is.na(r)]
- g <- g[!is.na(g)]
- r <- r[!is.infinite(r)]
- g <- g[!is.infinite(g)]
- if(is.null(breaks)){
- breaks <- seq(min(0,(min(r,g))),(max(max(r),max(g))+toadd),by=by)
- }
-
- hr <- hist(r,plot=FALSE,breaks=breaks)
- hg <- hist(g,plot=FALSE,breaks=breaks)
- if(is.null(xlim)){
- xlim <- range(breaks)
- }
- r.counts <- hr$counts
- g.counts <- hg$counts
- ylabb <- "Counts"
- if(relative){
- r.counts <- r.counts / length(r)
- g.counts <- g.counts / length(g)
- ylabb <- "relative Counts"
- yy <- -0.001
- }
- else{
- yy <- -10
- }
- if(is.null(ylim)){
- ylim <- c(0,max(r.counts,g.counts))
- }
-
- if(!add){
- plot(hr$mids,r.counts,col=r.col,xlab="Intensity",ylab=ylabb,type="l",ylim=ylim,xlim=xlim,main=title,...)
- }
- else{
- points(hr$mids,r.counts,col=r.col,type="l",...)
- }
- points(hg$mids,g.counts,col=g.col,type="l",...)
- points(medianWithoutNA(r),yy,col=r.col,pch=3)
- points(medianWithoutNA(g),yy,col=g.col,pch=3)
-
- if(return.breaks){
- breaks
- }
- # hr <- density(r,na.rm=TRUE)
- # hg <- density(g,na.rm=TRUE)
- # xlim <- c(min(hr$x,hg$x),max(hr$x,hg$x))
- # ylim <- c(min(hr$y,hg$y),max(hr$y,hg$y))
- # plot(hr$x,hr$y,col=2,type="l",ylim=ylim,...)
- # points(hg$x,hg$y,col=3,type="l")
- # hr <- hist(r,breaks=breaks,plot=FALSE)
- # hg <- hist(g,breaks=breaks,plot=FALSE)
- }
- #############################################################################
- ##
- #############################################################################
- plotHistogramFromSlide <- function(lObject,slide=1,use.weights=TRUE,log.transform=FALSE,breaks=NULL,bg.lty=3,...){
- if(slide=="all"){
-
-
-
- }
- if(is.null(lObject$R)){
- R <- MAtoR(lObject$M[,slide],lObject$A[,slide])
- G <- MAtoG(lObject$M[,slide],lObject$A[,slide])
- Rb <- NULL
- Gb <- NULL
- draw.bg <- FALSE
- names <- colnames(lObject$M)
- }
- else{
- R <- lObject$R[,slide]
- G <- lObject$G[,slide]
- Rb <- lObject$Rb[,slide]
- Gb <- lObject$Gb[,slide]
- draw.bg <- !is.null(Rb)
- names <- colnames(lObject$R)
- }
- weights <- matrix(ncol=1,nrow=length(R))
- weights[,] <- 1
- if(is.null(lObject$weights)){
- }
- else{
- if(use.weights){
- weights <- lObject$weights[,slide]
- }
- }
- # r <- R[weights==1]
- # g <- G[weights==1]
- # rb <- Rb[weights==1]
- # gb <- Gb[weights==1]
-
- breaks <- plotHistogram(R=R,G=G,log.transform=log.transform,weights=weights,breaks=breaks,return.breaks=TRUE,title=names[slide],...)
- if(draw.bg){
- plotHistogram(R=Rb,G=Gb,log.transform=log.transform,breaks=NULL,weights=weights,lty=bg.lty,add=TRUE,...)
- }
- }
- checkExistantFile <- function(file,path=".",no.subdir=TRUE){
- file <- gsub(pattern="_",replacement=".",file)
- file <- gsub(pattern="/",replacement="-",file)
- counter <- 0
- suff <- NULL
- ending <- NULL
- ffile <- file
- splitted <- unlist(strsplit(file,split="\\."))
- if(length(splitted)>1){
- ending <- paste(".",splitted[length(splitted)],sep="")
- ffile <- paste(splitted[1:(length(splitted)-1)],collapse=".")
- #cat(ffile,"_",ending,"\n")
- }
- while(TRUE){
- newfile <- paste(ffile,suff,ending,sep="")
- if(file.exists(newfile)){
- #file does exist...
- counter <- counter +1
- suff <- paste("(",counter,")",sep="")
- }
- else{
- break
- }
- }
- newfile
- }
- saveMAList <- function(object,filename,remove.flagged=TRUE,na.value="NA",subset=NULL, allvalues=TRUE){
- genes <- NULL
- weights <- NULL
- g.length <- 0
- col.names <- NULL
- if(class(object)=="MAList"){
- if(is.null(dim(object$M))){
- object$M <- as.matrix(object$M)
- object$A <- as.matrix(object$A)
- if(!is.null(object$weights)){
- object$weights <- as.matrix(object$weights)
- }
- }
- if(is.null(subset)){
- subset <- rep(TRUE,nrow(object$M))
- }
- M <- as.matrix(object$M[subset,])
- CN <- colnames(object$M)
- if(is.null(CN)){
- CN <- object$targets[1,1]
- }
- colnames(M) <- CN
- # colnames(M) <- colnames(object$M)
- A <- as.matrix(object$A[subset,])
- colnames(A) <- CN
- # colnames(A) <- colnames(object$A)
- if(!is.null(object$weights)){
- weights <- as.matrix(object$weights[subset,])
- }
- if(!is.null(object$genes)){
- genes <- object$genes[subset,]
- if(!is.null(genes)){
- g.length <- ncol(genes)
- }
- }
- }
- else if(class(object)=="EexprSet" | class(object)=="MadbSet"){
- if(object@type=="TwoColor"){
- if(is.null(subset)){
- subset <- rep(TRUE,nrow(exprs(object)))
- }
- M <- getM(object)
- Colnames <- colnames(M)
- M <- as.matrix(M[subset,])
- colnames(M) <- Colnames
- # M <- as.matrix(getM(object)[subset,])
- # cat(class(M),colnames(M),"\n")
- # A <- as.matrix(getA(object)[subset,])
- A <- getA(object)
- A <- as.matrix(A[subset,])
- colnames(A) <- Colnames
- #cat("1")
- weights <- as.matrix(getWeights(object)[subset,])
- #cat("2")
- genes <- as.matrix(object@genes[subset,])
- if(!is.null(genes)){
- g.length <- ncol(genes)
- if(g.length==1){
- colnames(genes) <- "ID"
- }
- }
- #cat("g.length",g.length,"dim M",dim(M),"\n")
- #cat("3")
- }
- }
- else if(class(object)=="RGList"){
- if(is.null(subset)){
- subset <- rep(TRUE,nrow(object$R))
- }
- if(!.is.log(object$R)){
- object$R <- log2(object$R)
- }
- if(!.is.log(object$G)){
- object$G <- log2(object$G)
- }
- M <- object$R[subset,] - object$G[subset,]
- A <- 0.5*(object$R[subset,] + object$G[subset,])
- if(!is.null(object$weights)){
- weights <- object$weights[subset,]
- }
- if(!is.null(object$genes)){
- genes <- object$genes[subset,]
- if(!is.null(genes)){
- g.length <- ncol(genes)
- }
- }
- }
- else{
- stop("Objects from the class ",class(object)," can not be handled by this function\n")
- }
- #cat("ncol=",g.length+(ncol(M)*2),"nrow=",nrow=nrow(M),"\n")
- Table <- matrix(ncol=(g.length+(ncol(M)*2)),nrow=nrow(M))
- if(!is.null(genes)){
- Table[,1:ncol(genes)] <- as.matrix(genes)
- col.names <- colnames(genes)
- }
- #cat(colnames(M),"\n")
- for(i in 1:ncol(M)){
- col.names <- c(col.names,paste(colnames(M)[i],"M"),paste(colnames(A)[i],"A"))
- Table[,((i*2)-1+g.length)] <- M[,i]
- if(!is.null(weights) & remove.flagged){
- Table[weights[,i]==0,((i*2)-1+g.length)] <- NA
- }
- Table[is.na(Table[,((i*2)-1+g.length)]),((i*2)-1+g.length)] <- NA
-
- Table[,((i*2)+g.length)] <- A[,i]
- if(!is.null(weights) & remove.flagged){
- Table[weights[,i]==0,((i*2)+g.length)] <- NA
- }
- Table[is.na(Table[,((i*2)+g.length)]),((i*2)+g.length)] <- NA
- }
- colnames(Table) <- col.names
-
- if(allvalues){
- ## calculate the red and green intensities from the M and A values...
- R <- MAtoR(M, A)
- G <- MAtoG(M, A)
- if(is.null(ncol(R))){
- R <- matrix(R, ncol=1)
- G <- matrix(G, ncol=1)
- }
- colnames(R) <- paste(colnames(M), "R")
- colnames(G) <- paste(colnames(M), "G")
- Table <- cbind(Table, R, G)
- }
-
- write.table(Table,file=filename,sep="\t",na=na.value,row.names=FALSE,quote=FALSE)
- }
- ## closes all connections of gpr files...
- closeMyConnections <- function(pattern="gpr"){
- All <- showConnections(TRUE)
- what <- grep(All[,"description"],pattern=pattern)
- what <- what-1
- for(theCon in what){
- close(getConnection(theCon))
- }
- }
- checkGPRs <- function(files=NULL,overwrite=TRUE){
- ##
- # check for correct axon format...
- requiredcolumns <- c("Block","Column","Row","Name","ID","F635 Median","F635 Mean","B635 Median","B635 Mean","F532 Median","F532 Mean","B532 Median","B532 Mean","Flags")
- for(i in 1:length(files)){
- con <- file(files[i],"r")
- twoLines <- readLines(con,n=2)
- close(con)
- # twoLines <- scan(file=files[i], what="character",nlines=2, quiet=TRUE)
- # closeMyConnections()
- if(substring(twoLines,1,3)[1]!="ATF"){
- # is not in correct Axon file format
- cat("The file",files[i],"is not in correct Axon file format!\n")
- header <- read.table(files[i],header=TRUE,nrows=1,sep="\t",check.names=FALSE)
- if(sum(colnames(header) %in% requiredcolumns)==length(requiredcolumns)){
- BigFile <- readLines(con=files[i])
- writeLines(c("ATF\t1.0",paste("1\t",ncol(header),sep=""),"Provider=CARMAweb"),con=files[i])
- con <- file(files[i],"at")
- writeLines(con=con,BigFile)
- flush(con)
- close(con)
- cat("Successfully adjusted file",files[i],".\n")
- # closeMyConnections()
- }
- else{
- stop("The file",files[i],"is not in correct Axon file format! Was this file scanned with an Axon GenePix Scanner?\n")
- }
- }
- }
- skips <- NULL
- for(i in 1:length(files)){
- skips <- c(skips,getSkipFromGPR(files[i]))
- }
- if(length(unique(skips))!=1){
- cat("GPR files have different number of header rows!\nAdjusting GPR files\n")
- for(i in 1:length(files)){
- dummy <- read.table(files[i],skip=skips[i],header=TRUE,comment.char="",sep="\t",check.names=FALSE)
- #cat(colnames(dummy),"\n")
- write.table(dummy,file=files[i],sep="\t",quote=FALSE,row.names=FALSE)
- }
- checkGPRs(files) # just re-run the check once again...
- }
- }
- createM <- function(object,red.channels,green.channels){
- M <- matrix(ncol=length(red.channels),nrow=nrow(exprs(object)))
- weights <- getWeights(object)
- col.names <- NULL
- for(i in 1:ncol(M)){
- r.idx <- myGrep(colnames(exprs(object)),pattern=red.channels[i],exact.match=TRUE)
- g.idx <- myGrep(colnames(exprs(object)),pattern=green.channels[i],exact.match=TRUE)
- M[,i] <- getM(object,r=r.idx,g=g.idx)
- M[weights[,r.idx]==0 | weights[,g.idx]==0,i] <- NA
- col.names <- c(col.names,paste(red.channels[i]," vs ",green.channels[i],sep=""))
- }
- colnames(M) <- col.names
- M
- }
- createA <- function(object,red.channels,green.channels){
- A <- matrix(ncol=length(red.channels),nrow=nrow(exprs(object)))
- weights <- getWeights(object)
- col.names <- NULL
- for(i in 1:ncol(A)){
- r.idx <- myGrep(colnames(exprs(object)),pattern=red.channels[i],exact.match=TRUE)
- g.idx <- myGrep(colnames(exprs(object)),pattern=green.channels[i],exact.match=TRUE)
- A[,i] <- getA(object,r=r.idx,g=g.idx)
- A[weights[,r.idx]==0 | weights[,g.idx]==0,i] <- NA
- col.names <- c(col.names,paste(red.channels[i]," vs ",green.channels[i],sep=""))
- }
- colnames(A) <- col.names
- A
- }
- ##
- # to remove all those genes that have flags in
- excludeFromTest <- function(data,classlabels,weights=NULL,nr.present=2){
- if(!is.null(weights)){
- if(sum(dim(data)==dim(weights))==2){
- data <- .removeFlagged(data,weights)
- }
- }
- result <- apply(data,MARGIN=1,.dummyFun,cl=classlabels,np=nr.present)
- result
- }
- .removeFlagged <- function(data,weights,flagged.value=NA){
- for(i in 1:ncol(data)){
- data[weights[,i]==0,i] <- flagged.value
- }
- data
- }
- .dummyFun <- function(x,cl,np){
- res <- sum(!is.na(x[cl==0])) < np | sum(!is.na(x[cl==1])) < np
- res
- }
- tukey.biweight.mod <- function(x,c=5,epsilon=1e-4,na.rm=TRUE){
- x <- x[!is.na(x)]
- ret <- tukey.biweight(x,c=c,epsilon=epsilon)
- }
- if( !isGeneric("filterOnVariance") )
- setGeneric("filterOnVariance", function(object,variance=0,array.names=NULL,remove.flagged=TRUE,v=TRUE,...)
- standardGeneric("filterOnVariance"))
- setMethod("filterOnVariance","MadbSet",
- function(object,variance=0,array.names=NULL,remove.flagged=TRUE,v=TRUE,...){
- if(is.null(array.names)){
- array.names <- colnames(exprs(object))
- }
- E <- exprs(object)[,array.names]
- if(remove.flagged){
- weights <- getWeights(object)[,array.names]
- for(i in 1:ncol(weights)){
- E[weights[,i]==0,i] <- NA
- }
- }
- sds <- apply(E,MARGIN=1,sd,na.rm=TRUE)
- quant <- quantile(sds,variance,na.rm=TRUE)
- sel <- sds >= quant
- sel[is.na(sel)] <- FALSE
- if(v){
- cat(sum(sel)," features out of ",length(sel)," have a sd bigger than ",quant,"\n")
- }
- ret <- object
- exprs(ret) <- exprs(object)[sel,]
- ret@weights <- getWeights(object)[sel,]
- if(nrow(exprs(object))==nrow(object@genes)){
- if(ncol(object@genes)==1){
- G <- matrix(object@genes[sel,],ncol=1)
- rownames(G) <- rownames(exprs(ret))
- colnames(G) <- colnames(object@genes)
- ret@genes <- G
- }
- else{
- ret@genes <- object@genes[sel,]
- }
- }
- ret
- })
- ##
- # this function is a generalization of Robert Gentlemens GOHyperG funciton from the GOstats package.
- # it allows to submit all EntrezGenes present on the used microarray.
- # in principal this function performs the same steps as the GOHyperG function, nevertheless the results
- # are sligtly different (maybe because of the different annotation packages used:
- # mapping to GO terms using the affymetrix probe sets -> lokuslink -> GO terms (GOstats) or mapping the locuslink ids directly
- # to GO terms using the GO package like it is done here.)
- calculateGOHyperG <- function(x,allLLs=NULL,chip=NULL,what="MF",use.all=FALSE,v=TRUE){
- require(GO)
- require(GOstats)
- if (chip == "fly.db0.db"){chip = "fly.db0"}
- if(is.null(allLLs) & is.null(chip) & !use.all){
- stop("You have to submit either all EntrezGene IDs present on the array used, or the Affymetrix GeneChip used, or select use.all=TRUE to use all EntrezGenes available in GO!\n\n")
- }
- if(use.all){
- ##
- # just that the code below throws no exception...
- allLLs <- x
- }
- ##
- # if chip was submitted retrieve all LLs from the specific chip
- if(!is.null(chip)){
- require(chip,character.only=TRUE) || stop("need data package",chip)
- allLLs <- as.character(unlist(as.list(get(paste(chip,"LOCUSID",sep=""),mode="environment"))))
- }
- ##
- # prepare the x (myLLs) and allLLs. remove duplicates, NAs, use only those x that are present in allLLs
- # remove possible LL. or LL strings (old way of writing LocusLink IDs)
- x <- sub("LL.","",x)
- x <- sub("LL","",x)
- allLLs <- sub("LL.","",allLLs)
- allLLs <- sub("LL","",allLLs)
- allLLs <- unique(allLLs)
- allLLs <- allLLs[!is.na(allLLs) & allLLs!=""]
- x <- unique(x)
- x <- x[!is.na(x) & x!=""]
- ##
- # just interested in those x that are present in allLLs
- x <- allLLs[match(x,allLLs)]
- x <- x[!is.na(x) & x!=""]
-
- ##
- # do some checking...
- if(length(x)==0 || is.na(x)){
- stop("None of the submitted interesting genes are present in the list of all EntrezGene (LocusLink) IDs submitted!\nPossible causes:\n-wrong list of genes (wrong IDs) submitted\n-EntrezGene IDs of the interesting genes and list of EntrezGene IDs of all genes on the array are not present on the same microarray\n")
- }
-
-
-
- ##
- # ok, now map x to GO, retrieve all GO terms where one of the x EntrezGene IDs is present (using GOALLLOCUSID from the GO package)
- GOLL <- as.list(get("GOALLLOCUSID",mode="environment")) ## warning, GOALLLOCUSID is deprecated in newer versions, use GOALLENTREZID
- GOLL <- GOLL[!is.na(GOLL)] # just removing all the GO ids that are not mapped to any LL
- PresentGO <- sapply(GOLL,function(z){
- if(is.na(z) || length(z)==0)
- return(FALSE)
- any(x %in% z)
- }
- )
- ##
- # another checking...
- if(max(PresentGO)==0){
- stop("The submitted list of EntrezGene IDs of the interesting genes can not be mapped to any of the GO terms.\nMaybe the list contained no EntrezGene (LocusLink) IDs?\n")
- }
- if(v){
- cat("The",length(x),"submitted EntrezGene IDs map to",sum(PresentGO),"GO terms in all tree categories.\n")
- }
- GOLL <- GOLL[PresentGO]
- ##
- # next restrict the GO terms to the selected category (specified by "what")
- goCat <- unlist(getGOOntology(names(GOLL)))
- goodGO <- goCat == what
- goodGO[is.na(goodGO)] <- FALSE
- if(v){
- cat(sum(goodGO),"of them are in the GO category we are interested in.\n")
- }
- mm <- match(names(goCat),names(GOLL))
- mm <- mm[goodGO]
- GOLL <- GOLL[mm]
-
- ##
- # next we have to remove all the EntrezGene IDs from each GO term, that was not present on the microarray used.
- if(!use.all){
- myGOLL <- sapply(GOLL,function(z){
- if(length(z)==0 || is.na(z))
- return(NA)
- mylls <- unique(z[match(allLLs,z)])
- mylls <- mylls[!is.na(mylls)]
- mylls
- }
- )
- ##
- # removing all those GO terms that have no entries left (just in the case there are some...)
- empties <- sapply(myGOLL,function(z){length(z)==1 && is.na(z)})
- myGOLL <- myGOLL[!empties]
- GOLL <- GOLL[!empties]
- }
- else{
- ##
- # if use.all was selected, the proportion of the submitted EntrezGene IDs per GO term is compared to all
- # EntrezGenes mapped to that GO term.
- myGOLL <- GOLL
- }
-
- ##
- # the rest is the same as in the GOHyperG function...
- cLLs <- unique(unlist(myGOLL))
- nLL <- length(cLLs) # nr of unique LL IDs present in all GO terms
- goCounts <- sapply(myGOLL,length) # nr of LL IDs present on the array per GO term
- #allCounts <- sapply(GOLL,length) # nr of all LL IDs per GO term
- ##
- # some LL ids are represented more than once in a GO term...
- uniqueAllCounts <- sapply(GOLL,
- function(z){
- if(length(z)==0 || is.na(z))
- return(0)
- lls <- unique(z)
- lls <- lls[!is.na(lls)]
- return(length(lls))
- })
- whGood <- x[x %in% cLLs] # those LLs from the interesting ones that are present in the GO terms selected
- nInt <- length(whGood)
- if(nInt==0){
- warning("No interesting genes left")
- }
-
- MappingGOLL <- lapply(myGOLL,function(z){whGood[whGood %in% z]})
- useCts <- sapply(myGOLL,function(z){sum(whGood %in% z)}) # nr of interesting LLs per GO term
- ##
- # calculate the hypergeometric p values...
- pvs <- phyper(useCts-1,nInt,nLL-nInt,goCounts,lower.tail=FALSE)
- ord <- order(pvs)
- return(
- list(pvalues=pvs[ord],goCounts=goCounts[ord],intCounts=useCts[ord],allCounts=uniqueAllCounts[ord],GOLL=myGOLL[ord],intMapping=MappingGOLL[ord],numLL=nLL,numInt=nInt,intLLs=x)
- )
-
- }
- ################
- ## get EntrezGenes for the specified GO term
- getEntrezIDsFromGO <- function(GO, package="GO", entrezgenes){
- match.arg(package, c("GO", "humanLLMappings", "mouseLLMappings", "ratLLMappings"))
- require(package, character.only=TRUE)
- if(package=="GO"){
- ALLIDs <- mget(GO, env=GOALLENTREZID, ifnotfound=NA)
- if(!missing(entrezgenes)){
- ## subset to the specified entrezgene ids.
- ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
- }
- return(ALLIDs)
- }
- if(package=="humanLLMappings"){
- ALLIDs <- mget(GO, env=humanLLMappingsGO2LL, ifnotfound=NA)
- if(!missing(entrezgenes)){
- ## subset to the specified entrezgene ids.
- ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
- }
- return(ALLIDs)
- }
- if(package=="mouseLLMappings"){
- ALLIDs <- mget(GO, env=mouseLLMappingsGO2LL, ifnotfound=NA)
- if(!missing(entrezgenes)){
- ## subset to the specified entrezgene ids.
- ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
- }
- return(ALLIDs)
- }
- if(package=="ratLLMappings"){
- ALLIDs <- mget(GO, env=ratLLMappingsGO2LL, ifnotfound=NA)
- if(!missing(entrezgenes)){
- ## subset to the specified entrezgene ids.
- ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
- }
- return(ALLIDs)
- }
- }
- #getLLGOmapping <- function(x,what="MF"){
- # gos <- mget(x, env=GOLOCUSID2GO,ifnotfound=NA)
- # if(length(gos)==1)
- # bd = is.na(gos[[1]])
- # else
- # bd = is.na(gos)
- # gos <- gos[!bd]
- # mymapping <- lapply(gos,getOntology,ontology=what)
- # emptygos <- sapply(mymapping,function(x) length(x)==0 )
- # mymapping <- mymapping[!emptygos]
- # mymapping
- #}
- #getGOFromLL <- function(x,Ontology="MF"){
- # require(GO) || stop("no GO library")
- # match.arg(Ontology, c("MF", "BP", "CC"))
- # goterms <- mget(x, env = GOLOCUSID2GO, ifnotfound=NA)
- # if(length(goterms)==1){
- # goterms <- goterms[!is.na(goterms[[1]])]
- # }
- # else{
- # goterms <- goterms[!is.na(goterms)]
- # }
- # goterms <- lapply(goterms, function(x) x[sapply(x,
- # function(x) {if (is.na(x$Ontology) )
- # return(FALSE)
- # else
- # x$Ontology == Ontology})])
- # goids <- NULL
- # if(length(goterms)==0){
- # stop("The submitted list of EntrezGene IDs of the interesting genes can not be mapped to any of the GO terms.\nMaybe the list contained no EntrezGene (LocusLink) IDs?\n")
- # }
- # for(i in 1:length(goterms)){
- # goids <- c(goids,unique(unlist(sapply(goterms[[i]], function(x) x$GOID))))
- # }
- # goids
- #}
- #
- ##
- # check if package is available
- isAnnotationAvailable <- function(chip,develOK=FALSE){
- ok = FALSE
- if (chip == "fly.db0.db"){chip = "fly.db0"}
- # if(!require("reposTools",character.only=TRUE)){
- # warning("reposTools package is not installed.\n")
- # if(!require(chip,character.only=TRUE)){
- # ok = FALSE
- # }
- # else{
- # ok = TRUE
- # }
- # }
- # else{
- if(!require(chip,character.only=TRUE)){
- # source("http://www.bioconductor.org/getBioC.R")
- ## library(reposTools)
- ## # have to install the package
- ## tryCatch(install.packages2(pkg=chip,develOK=develOK,getAllDeps=TRUE),error=return(FALSE))
- # try(getBioC(chip))
- # if(!require(chip,character.only=TRUE)){
- # ok=FALSE
- # }
- # else{
- # ok=TRUE
- # }
- }
- else{
- ok = TRUE
- }
- # }
- ok
- }
- saveSAMResults <- function(SAM,table,delta,filename){
- dummy <- print(SAM)
- for(i in 1:nrow(dummy)){
- saveSAMResult(SAM,table,delta=dummy[i,"Delta"],filename=filename)
- }
- }
- saveSAMResult <- function(SAM, table,delta,filename){
- sum.SAM <- (summary(SAM,delta))@mat.sig
- if(!is.null(sum.SAM) & nrow(sum.SAM)>0){
- table.sub <- table[sum.SAM[,"Row"],]
- if(class(table)=="aafTable"){
- for(i in 2:(ncol(sum.SAM))){
- table.sub <- merge(table.sub,aafTable(sum.SAM[,i],colnames=colnames(sum.SAM)[i]))
- }
- Filename <- checkExistantFile(paste(filename,"-delta-",delta,".txt",sep=""))
- saveText(table.sub,filename=Filename)
- }
- else{
- if(nrow(sum.SAM)==1){
- table.sub <- matrix(table.sub,nrow=1)
- colnames(table.sub) <- colnames(table)
- }
- table.sub <- cbind(as.matrix(table.sub),as.matrix(sum.SAM[,2:ncol(sum.SAM)]))
- Filename <- checkExistantFile(paste(filename,"-delta-",delta,".txt",sep=""))
- write.table(table.sub,file=Filename,sep="\t",quote=FALSE,row.names=FALSE)
- }
- cat(nrow(table.sub),"significant genes (delta=",delta,") were saved to the file:",Filename,"\n")
- }
- else{
- cat("No significant genes with a delta of",delta,"\n")
- }
- }
- ###
- # fills missing spots
- fillMissingSpots <- function(object){
- ret <- object
- if(class(ret)=="RGList"){
- if(!is.null(ret$printer)){
- layout <- object$printer
- nr.row <- layout$ngrid.r * layout$ngrid.c * layout$nspot.r * layout$nspot.c
- missing <- (nr.row) - nrow(object$R)
- if(missing!=0){
- cat("Have to fill in",missing,"missing spots\n")
- index <- as.numeric(apply(object$genes,MARGIN=1,FUN=.getIndexFromBCR,nspot.r=layout$nspot.r,nspot.c=layout$nspot.c))
- R <- matrix(ncol=ncol(object$R),nrow=nr.row)
- colnames(R) <- colnames(object$R)
- rownames(R) <- 1:nrow(R)
- ## check if length of index and nrow of R are the same (<- problem with custom GAL files...)
- if(length(index)!=nrow(object$R)){
- stop("ERROR: number of data values does not match number of annotations!\nPlease try again without specifying a GAL file!")
- }
- R[index,] <- object$R
- ret$R <- R
- R[index,] <- object$G
- ret$G <- R
- if(!is.null(object$Rb)){
- R[index,] <- object$Rb
- ret$Rb <- R
- R[index,] <- object$Gb
- ret$Gb <- R
- }
- if(!is.null(object$weights)){
- R[,] <- 0
- R[index,] <- object$weights
- ret$weights <- R
- }
- else{
- R[,] <- 0
- R[index,] <- 1
- ret$weights <- R
- }
- rm(R)
- g <- gc()
- Genes = data.frame(matrix(ncol=ncol(object$genes),nrow=nr.row))
- colnames(Genes) <- colnames(object$genes)
- rownames(Genes) <- 1:nrow(Genes)
- # cat("nrow Genes",nrow(Genes),"ncol Genes",ncol(Genes),"length index",length(index),"\n")
- Genes[index,1:ncol(Genes)] = object$genes[,1:ncol(Genes)]
- # cat("genes filled\n","ncol= ",ncol(Genes)," nrow: ",nrow(Genes),"\n")
- ret$genes = NULL
- ret$genes = Genes
- # ret$genes = Genes
- # cat("genes set\n")
- rm(Genes)
- g <- gc()
- }
- }
- }
- ret
- }
- .getIndexFromBCR <- function(x,nspot.c,nspot.r){
- ((as.numeric(x["Block"])-1)*nspot.c*nspot.r) + ((as.numeric(x["Row"])-1)*nspot.c) + as.numeric(x["Column"])
- }
- if( !isGeneric("drawBoxplot") )
- setGeneric("drawBoxplot", function(x,new.plot=TRUE,xlim=NULL,at=0,exclude.flagged=FALSE,...)
- standardGeneric("drawBoxplot"))
- setMethod("drawBoxplot","MadbSet",
- function(x,new.plot=TRUE,xlim=NULL,at=0,exclude.flagged=FALSE,...){
- if(exclude.flagged){
- W <- getWeights(x)
- for(i in 1:ncol(exprs(x))){
- exprs(x)[W[,i]==0,i] <- NA
- }
- }
- boxplots(x=exprs(x),new.plot=new.plot,xlim=xlim,at=at,...)
- }
- )
- boxplots <- function(x,new.plot=TRUE,xlim=NULL,at=0,col=NULL,log2.transform=FALSE,ylim=NULL,...){
- data <- x
- # if(class(x)=="exprSet" | class(x)=="EexprSet"){
- # data <- exprs(x)
- # }
- if(log2.transform){
- data <- log2(data)
- }
- if(is.null(ncol(data))){
- data <- as.matrix(data)
- }
- ## remove those nasty infinite values
- for(i in 1:ncol(data)){
- data[is.infinite(data[,i]),i] <- NA
- }
- CN <- colnames(data)
- if(is.null(CN)){
- CN <- 1:ncol(data)
- }
- if(is.null(col)){
- col=rep(0,ncol(data))
- }
- if(length(col)!=ncol(data)){
- col=rep(col[1],ncol(data))
- }
- if(new.plot){
- if(is.null(xlim)){
- xlim=c(0.5,(ncol(data)+0.5))
- }
- par(omi=0.7*c(min(2,max(strwidth(CN,units="i"))),0,0,0),cex.axis=0.7)
- if(is.null(ylim)){
- ylim=c(min(data,na.rm=TRUE),max(data,na.rm=TRUE))
- }
- plot(1,1,pch=NA,ylab="expression",xaxt="n",ylim=ylim,xlim=xlim,xlab=NA)
- }
- axis(side=1,labels=CN,at=(1+at):(ncol(data)+at),las=3)
- for(i in 1:ncol(data)){
- boxplot(data[,i],at=(i+at),add=TRUE,col=col[i],range=0,...)
- }
- }
- plotRankedVariance <- function(data,subset=NULL,cut){
- if(!is.null(subset)){
- data <- data[,subset]
- }
- if(!.is.log(data)){
- data <- log2(data)
- }
- SD <- apply(data,MARGIN=1,sd,na.rm=TRUE)
- CUT <- quantile(SD,cut,na.rm=TRUE)
- plot(sort(SD),pch=16,cex=0.33,ylab="sorted variance")
- abline(h=CUT,col=2)
- text((length(SD)/20),(CUT+0.2),label=format(CUT,digits=4),col=2)
- }
- plotSortedPValues <- function(x,top=NULL,supported.cols=c("rawp","adjp","minP","maxT","BH","BY","Bonferroni","Hochberg","Holm","SidakSS","SidakSD"),...){
- if(is.null(nrow(x))){
- stop("Please submit a matrix, data.frame or instance of aafTable!\n")
- }
- if(!is.null(top)){
- top <- min(c(nrow(x),top))
- }
- else{
- top <- nrow(x)
- }
- Columns <- supported.cols[supported.cols %in% colnames(x)]
- if(length(Columns)==0){
- stop("The submitted matrix must contain at least one column with p-values that is named according to one of the supported column names!")
- }
- if(class(x)=="aafTable"){
- require(annaffy)
- #have to do some transformations...
- x.new <- matrix(ncol=length(Columns),nrow=nrow(x))
- colnames(x.new) <- Columns
- for(i in Columns){
- x.new[,i] <- as.numeric(getText(x[[i]]))
- }
- x <- x.new
- }
- #ok, now i can start drawing...
- x <- apply(x[,Columns],MARGIN=2,sort,na.last=TRUE)
- Max <- max(x[1:top,],na.rm=TRUE)
- plot(x[1:top,Columns[1]],xlab="number of significant genes",ylab="sorted p-value",col=1,ylim=c(0,Max),main=paste("Top",top,"sorted p-values"),...)
- if(length(Columns)>1){
- for(i in 2:length(Columns)){
- points(x[1:top,Columns[i]],col=i,...)
- }
- }
- legend(top-top/4,Max,Columns,col=1:length(Columns),pch=16)
- }
- .is.log <- function(data,maxi=100){
- data <- data[!is.na(data)]
- data <- data[!is.infinite(data)]
- # cat("Try to check wheter or not the values are in log scale...")
- if(max(data>maxi)){
- log.values = FALSE
- # cat("signals are not in log2 scale, transforming...\n")
- }
- else{
- log.values = TRUE
- # cat("signals are already in log2 scale \n")
- }
- log.values
- }
- toNumeric <- function(x,Sub=","){
- if(!is.null(dim(x))){
- RN <- rownames(x)
- if(class(x[1,])=="numeric"){
- return(x)
- }
- if(!is.null(Sub)){
- for(s in Sub){
- x <- apply(x,MARGIN=2,sub,pattern=s,replacement="")
- }
- }
- x <- tryCatch(apply(x,MARGIN=2,as.numeric),error=function(error){stop("ERROR: Can not perform calculations on non-numeric data! Please submit only numeric values.")},warning=function(warning){stop("ERROR: Can not perform calculations on non-numeric data! Please submit only numeric values.")})
- rownames(x) <- RN
- x
- }
- }
- ####
- ## function similar to PWamat function from the annotate package,
- ## generates a matrix with rows unique LL ids, columns KEGG pathways
- ## containing 1 if gene can be associated with the pathway, 0 otherwise.
- makeLL2KEGGMappingMatrix <- function(x,nrGenes=0,verbose=TRUE){
- if(is.null(x)){
- stop("No EntrezGene (LocusLink) IDs have been submitted!\n")
- }
- require(KEGG)
- x <- unique(as.character(x))
- paths <- mget(x,KEGGEXTID2PATHID,ifnotfound=NA)
- not_found <- sapply(paths,function(x){if(length(x)==1 && is.na(x)) TRUE else FALSE})
- paths <- paths[!not_found]
- if(verbose){
- cat(length(paths),"EntrezGene IDs from the inputlist can be associated with at least one KEGG pathway.\n",sum(not_found),"IDs can not be associated with KEGG pathways.\n")
- }
- unique_paths <- unique(unlist(paths))
- Test <- sapply(paths,function(x){
- mtch <- match(x,unique_paths)
- res <- rep(0,length(unique_paths))
- res[mtch] <- 1
- res
- })
- Test <- t(Test)
- rownames(Test) <- x
- colnames(Test) <- unique_paths
- if(nrGenes>0){
- to_exclude <- colSums(Test) < nrGenes
- if(verbose){
- cat(sum(to_exclude),"Pathways are excluded, because less than",nrGenes,"genes can be mapped to them.\n")
- }
- Test <- Test[,!to_exclude]
- }
- return(Test)
- }