utils.R - The provided R code is a collection of functions …

/tools/expression/utils.R

https://bitbucket.org/cistrome/cistrome-harvard/ · R · 1007 lines · 757 code · 78 blank · 172 comment · 140 complexity · b382b19424ccc8e22e818953152991d9 MD5 · raw file

#################################################################################################
##
#################################################################################################
getSkipFromGPR <- function(file){
	con <- file(file,"r")
	twoLines <- readLines(con,n=2)
    close(con)
	if(substring(twoLines,1,3)[1]=="ATF"){
		skip <- substring(twoLines,1,2)
	}
	else{
		skip=c(0,-2)
	}
	as.numeric(skip[2])+2
}

###########################################################################################
## creates a new GAL file from the information from the gpr files
###########################################################################################
createGALFromGPR <- function(file,galfile="dummy.gal",colnames=c("Block","Column","Row","Name","ID")){
	table <- read.table(file,as.is=TRUE,comment.char="",check.names=FALSE,header=TRUE,skip=getSkipFromGPR(file),sep="\t")
	names <- colnames(table)
	result <- data.frame(matrix(ncol=5,nrow=length(table[,1])))
	colnames(result) <- colnames
	for(i in 1:length(result[1,])){
		result[,i] <- table[,colnames[i]]
	}
	#result[,1:5] <- table[,1:5]
	#colnames(result) <- names[1:5]
	write.table(file=galfile,result,sep="\t",row.name=FALSE,quote=FALSE)
}

plotHistogram <- function(R,G,r.col=2,g.col=3,breaks=NULL,log.transform=FALSE,add=FALSE,weights=NULL,return.breaks=FALSE,ylim=NULL,xlim=NULL,title=NULL,relative=FALSE,...){
	if(is.null(weights)){
		weights <- matrix(ncol=1,nrow=length(R))
		weights[,] <- 1
	}

	if(log.transform==TRUE){
		r <- log2(R)
		g <- log2(G)
		by <- 0.1
		toadd <- 0.1
	}
	else{
		r <- R
		g <- G
		by <- 50
		toadd <- 50
	}
	r <- r[weights==1]
	g <- g[weights==1]
	r <- r[!is.na(r)]
	g <- g[!is.na(g)]
	r <- r[!is.infinite(r)]
	g <- g[!is.infinite(g)]

	if(is.null(breaks)){
		breaks <- seq(min(0,(min(r,g))),(max(max(r),max(g))+toadd),by=by)
	}
	
	hr <- hist(r,plot=FALSE,breaks=breaks)
	hg <- hist(g,plot=FALSE,breaks=breaks)
	if(is.null(xlim)){
		xlim <- range(breaks)
	}

	r.counts <- hr$counts
	g.counts <- hg$counts
	ylabb <- "Counts"
	if(relative){
		r.counts <- r.counts / length(r)
		g.counts <- g.counts / length(g)
		ylabb <- "relative Counts"
		yy <- -0.001
	}
	else{
		yy <- -10
	}

	if(is.null(ylim)){
		ylim <- c(0,max(r.counts,g.counts))
	}
		
	if(!add){
		plot(hr$mids,r.counts,col=r.col,xlab="Intensity",ylab=ylabb,type="l",ylim=ylim,xlim=xlim,main=title,...)
	}
	else{
		points(hr$mids,r.counts,col=r.col,type="l",...)
	}
	points(hg$mids,g.counts,col=g.col,type="l",...)
	points(medianWithoutNA(r),yy,col=r.col,pch=3)
	points(medianWithoutNA(g),yy,col=g.col,pch=3)
	
	if(return.breaks){
		breaks
	}
#	hr <- density(r,na.rm=TRUE)
#	hg <- density(g,na.rm=TRUE)
#	xlim <- c(min(hr$x,hg$x),max(hr$x,hg$x))
#	ylim <- c(min(hr$y,hg$y),max(hr$y,hg$y))
#	plot(hr$x,hr$y,col=2,type="l",ylim=ylim,...)
#	points(hg$x,hg$y,col=3,type="l")
#	hr <- hist(r,breaks=breaks,plot=FALSE)
#	hg <- hist(g,breaks=breaks,plot=FALSE)
}


#############################################################################
##
#############################################################################
plotHistogramFromSlide <- function(lObject,slide=1,use.weights=TRUE,log.transform=FALSE,breaks=NULL,bg.lty=3,...){
	if(slide=="all"){
	
	
	
	}
	if(is.null(lObject$R)){
		R <- MAtoR(lObject$M[,slide],lObject$A[,slide])
		G <- MAtoG(lObject$M[,slide],lObject$A[,slide])
		Rb <- NULL
		Gb <- NULL
		draw.bg <- FALSE
		names <- colnames(lObject$M)
	}
	else{
		R <- lObject$R[,slide]
		G <- lObject$G[,slide]
		Rb <- lObject$Rb[,slide]
		Gb <- lObject$Gb[,slide]
		draw.bg <- !is.null(Rb)
		names <- colnames(lObject$R)
	}
	weights <- matrix(ncol=1,nrow=length(R))
	weights[,] <- 1
	if(is.null(lObject$weights)){
	}
	else{
		if(use.weights){
			weights <- lObject$weights[,slide]
		}
	}
#	r <- R[weights==1]
#	g <- G[weights==1]
#	rb <- Rb[weights==1]
#	gb <- Gb[weights==1]
	
	breaks <- plotHistogram(R=R,G=G,log.transform=log.transform,weights=weights,breaks=breaks,return.breaks=TRUE,title=names[slide],...)
	if(draw.bg){
		plotHistogram(R=Rb,G=Gb,log.transform=log.transform,breaks=NULL,weights=weights,lty=bg.lty,add=TRUE,...)
	}
}




checkExistantFile <- function(file,path=".",no.subdir=TRUE){
	file <- gsub(pattern="_",replacement=".",file)
	file <- gsub(pattern="/",replacement="-",file)
	counter <- 0
	suff <- NULL
	ending <- NULL
	ffile <- file
	splitted <- unlist(strsplit(file,split="\\."))
	if(length(splitted)>1){
		ending <- paste(".",splitted[length(splitted)],sep="")
		ffile <- paste(splitted[1:(length(splitted)-1)],collapse=".")
		#cat(ffile,"_",ending,"\n")
	}
	while(TRUE){
		newfile <- paste(ffile,suff,ending,sep="")
		if(file.exists(newfile)){
			#file does exist...
			counter <- counter +1
			suff <- paste("(",counter,")",sep="")
		}
		else{
			break
		}
	}
	newfile
}




saveMAList <- function(object,filename,remove.flagged=TRUE,na.value="NA",subset=NULL, allvalues=TRUE){
	genes <- NULL
	weights <- NULL
	g.length <- 0
	col.names <- NULL
	if(class(object)=="MAList"){
		if(is.null(dim(object$M))){
			object$M <- as.matrix(object$M)
			object$A <- as.matrix(object$A)
			if(!is.null(object$weights)){
				object$weights <- as.matrix(object$weights)
			}
		}
		if(is.null(subset)){
			subset <- rep(TRUE,nrow(object$M))
		}
		M <- as.matrix(object$M[subset,])
		CN <- colnames(object$M)
		if(is.null(CN)){
			CN <- object$targets[1,1]
		}
		colnames(M) <- CN
#		colnames(M) <- colnames(object$M)
		A <- as.matrix(object$A[subset,])
		colnames(A) <- CN
#		colnames(A) <- colnames(object$A)
		if(!is.null(object$weights)){
			weights <- as.matrix(object$weights[subset,])
		}
		if(!is.null(object$genes)){
			genes <- object$genes[subset,]
			if(!is.null(genes)){
				g.length <- ncol(genes)
			}
		}
	}
	else if(class(object)=="EexprSet" | class(object)=="MadbSet"){
		if(object@type=="TwoColor"){
			if(is.null(subset)){
				subset <- rep(TRUE,nrow(exprs(object)))
			}
			M <- getM(object)
			Colnames <- colnames(M)
			M <- as.matrix(M[subset,])
			colnames(M) <- Colnames
#			M <- as.matrix(getM(object)[subset,])
#			cat(class(M),colnames(M),"\n")
#			A <- as.matrix(getA(object)[subset,])
			A <- getA(object)
			A <- as.matrix(A[subset,])
			colnames(A) <- Colnames
			#cat("1")
			weights <- as.matrix(getWeights(object)[subset,])
			#cat("2")
			genes <- as.matrix(object@genes[subset,])
			if(!is.null(genes)){
				g.length <- ncol(genes)
				if(g.length==1){
					colnames(genes) <- "ID"
				}
			}
			#cat("g.length",g.length,"dim M",dim(M),"\n")
			#cat("3")
		}
	}
	else if(class(object)=="RGList"){
		if(is.null(subset)){
			subset <- rep(TRUE,nrow(object$R))
		}
		if(!.is.log(object$R)){
			object$R <- log2(object$R)
		}
		if(!.is.log(object$G)){
			object$G <- log2(object$G)
		}
		M <- object$R[subset,] - object$G[subset,]
		A <- 0.5*(object$R[subset,] + object$G[subset,])
		if(!is.null(object$weights)){
			weights <- object$weights[subset,]
		}
		if(!is.null(object$genes)){
			genes <- object$genes[subset,]
			if(!is.null(genes)){
				g.length <- ncol(genes)
			}
		}
	}
	else{
		stop("Objects from the class ",class(object)," can not be handled by this function\n")
	}
	#cat("ncol=",g.length+(ncol(M)*2),"nrow=",nrow=nrow(M),"\n")
	Table <- matrix(ncol=(g.length+(ncol(M)*2)),nrow=nrow(M))
	if(!is.null(genes)){
		Table[,1:ncol(genes)] <- as.matrix(genes)
		col.names <- colnames(genes)
	}

	#cat(colnames(M),"\n")

	for(i in 1:ncol(M)){
		col.names <- c(col.names,paste(colnames(M)[i],"M"),paste(colnames(A)[i],"A"))
		Table[,((i*2)-1+g.length)] <- M[,i]
		if(!is.null(weights) & remove.flagged){
			Table[weights[,i]==0,((i*2)-1+g.length)] <- NA
		}
		Table[is.na(Table[,((i*2)-1+g.length)]),((i*2)-1+g.length)] <- NA
		
		Table[,((i*2)+g.length)] <- A[,i]
		if(!is.null(weights) & remove.flagged){
			Table[weights[,i]==0,((i*2)+g.length)] <- NA
		}
		Table[is.na(Table[,((i*2)+g.length)]),((i*2)+g.length)] <- NA
	}
	colnames(Table) <- col.names
	
	if(allvalues){
		## calculate the red and green intensities from the M and A values...
		R <- MAtoR(M, A)
		G <- MAtoG(M, A)
		if(is.null(ncol(R))){
			R <- matrix(R, ncol=1)
			G <- matrix(G, ncol=1)
		}
		colnames(R) <- paste(colnames(M), "R")
		colnames(G) <- paste(colnames(M), "G")
		Table <- cbind(Table, R, G)
	}
	
	write.table(Table,file=filename,sep="\t",na=na.value,row.names=FALSE,quote=FALSE)
}

## closes all connections of gpr files...
closeMyConnections <- function(pattern="gpr"){
    All <- showConnections(TRUE)
    what <- grep(All[,"description"],pattern=pattern)
    what <- what-1
    for(theCon in what){
        close(getConnection(theCon))
    }
}

checkGPRs <- function(files=NULL,overwrite=TRUE){
	##
	# check for correct axon format...
	requiredcolumns <- c("Block","Column","Row","Name","ID","F635 Median","F635 Mean","B635 Median","B635 Mean","F532 Median","F532 Mean","B532 Median","B532 Mean","Flags")
	for(i in 1:length(files)){
		con <- file(files[i],"r")
		twoLines <- readLines(con,n=2)
		close(con)
#        twoLines <- scan(file=files[i], what="character",nlines=2, quiet=TRUE)
#        closeMyConnections()
		if(substring(twoLines,1,3)[1]!="ATF"){
			# is not in correct Axon file format
			cat("The file",files[i],"is not in correct Axon file format!\n")
			header <- read.table(files[i],header=TRUE,nrows=1,sep="\t",check.names=FALSE)
			if(sum(colnames(header) %in% requiredcolumns)==length(requiredcolumns)){
				BigFile <- readLines(con=files[i])
				writeLines(c("ATF\t1.0",paste("1\t",ncol(header),sep=""),"Provider=CARMAweb"),con=files[i])
				con <- file(files[i],"at")
				writeLines(con=con,BigFile)
                flush(con)
				close(con)
				cat("Successfully adjusted file",files[i],".\n")
#                closeMyConnections()
			}
			else{
				stop("The file",files[i],"is not in correct Axon file format! Was this file scanned with an Axon GenePix Scanner?\n")
			}
		}
	}

	skips <- NULL
	for(i in 1:length(files)){
		skips <- c(skips,getSkipFromGPR(files[i]))
	}
	if(length(unique(skips))!=1){
		cat("GPR files have different number of header rows!\nAdjusting GPR files\n")
		for(i in 1:length(files)){
			dummy <- read.table(files[i],skip=skips[i],header=TRUE,comment.char="",sep="\t",check.names=FALSE)
			#cat(colnames(dummy),"\n")
			write.table(dummy,file=files[i],sep="\t",quote=FALSE,row.names=FALSE)
		}
        checkGPRs(files)    # just re-run the check once again...
	}
}

createM <- function(object,red.channels,green.channels){
	M <- matrix(ncol=length(red.channels),nrow=nrow(exprs(object)))
	weights <- getWeights(object)
	col.names <- NULL
	for(i in 1:ncol(M)){
		r.idx <- myGrep(colnames(exprs(object)),pattern=red.channels[i],exact.match=TRUE)
		g.idx <- myGrep(colnames(exprs(object)),pattern=green.channels[i],exact.match=TRUE)
		M[,i] <- getM(object,r=r.idx,g=g.idx)
		M[weights[,r.idx]==0 | weights[,g.idx]==0,i] <- NA
		col.names <- c(col.names,paste(red.channels[i]," vs ",green.channels[i],sep=""))
	}
	colnames(M) <- col.names
	M
}


createA <- function(object,red.channels,green.channels){
	A <- matrix(ncol=length(red.channels),nrow=nrow(exprs(object)))
	weights <- getWeights(object)
	col.names <- NULL
	for(i in 1:ncol(A)){
		r.idx <- myGrep(colnames(exprs(object)),pattern=red.channels[i],exact.match=TRUE)
		g.idx <- myGrep(colnames(exprs(object)),pattern=green.channels[i],exact.match=TRUE)
		A[,i] <- getA(object,r=r.idx,g=g.idx)
		A[weights[,r.idx]==0 | weights[,g.idx]==0,i] <- NA
		col.names <- c(col.names,paste(red.channels[i]," vs ",green.channels[i],sep=""))
	}
	colnames(A) <- col.names
	A
}


##
# to remove all those genes that have flags in 
excludeFromTest <- function(data,classlabels,weights=NULL,nr.present=2){
	if(!is.null(weights)){
		if(sum(dim(data)==dim(weights))==2){
			data <- .removeFlagged(data,weights)
		}
	}
	result <- apply(data,MARGIN=1,.dummyFun,cl=classlabels,np=nr.present)
	result
}

.removeFlagged <- function(data,weights,flagged.value=NA){
	for(i in 1:ncol(data)){
		data[weights[,i]==0,i] <- flagged.value
	}
	data
}

.dummyFun <- function(x,cl,np){
	res <- sum(!is.na(x[cl==0])) < np | sum(!is.na(x[cl==1])) < np
	res
}

tukey.biweight.mod <- function(x,c=5,epsilon=1e-4,na.rm=TRUE){
	x <- x[!is.na(x)]
	ret <- tukey.biweight(x,c=c,epsilon=epsilon)
}

if( !isGeneric("filterOnVariance") )
	setGeneric("filterOnVariance", function(object,variance=0,array.names=NULL,remove.flagged=TRUE,v=TRUE,...)
	standardGeneric("filterOnVariance"))

setMethod("filterOnVariance","MadbSet",
	function(object,variance=0,array.names=NULL,remove.flagged=TRUE,v=TRUE,...){
		if(is.null(array.names)){
			array.names <- colnames(exprs(object))
		}
		E <- exprs(object)[,array.names]
		if(remove.flagged){
			weights <- getWeights(object)[,array.names]
			for(i in 1:ncol(weights)){
				E[weights[,i]==0,i] <- NA
			}
		}
		sds <- apply(E,MARGIN=1,sd,na.rm=TRUE)
		quant <- quantile(sds,variance,na.rm=TRUE)
		sel <- sds >= quant
		sel[is.na(sel)] <- FALSE
		if(v){
			cat(sum(sel)," features out of ",length(sel)," have a sd bigger than ",quant,"\n")
		}
		ret <- object
		exprs(ret) <- exprs(object)[sel,]
		ret@weights <- getWeights(object)[sel,]
		if(nrow(exprs(object))==nrow(object@genes)){
			if(ncol(object@genes)==1){
				G <- matrix(object@genes[sel,],ncol=1)
				rownames(G) <- rownames(exprs(ret))
				colnames(G) <- colnames(object@genes)
				ret@genes <- G
			}
			else{
				ret@genes <- object@genes[sel,]
			}
		}
		ret
})


##
# this function is a generalization of Robert Gentlemens GOHyperG funciton from the GOstats package.
# it allows to submit all EntrezGenes present on the used microarray.
# in principal this function performs the same steps as the GOHyperG function, nevertheless the results
# are sligtly different (maybe because of the different annotation packages used:
#  mapping to GO terms using the affymetrix probe sets -> lokuslink -> GO terms (GOstats) or mapping the locuslink ids directly
#  to GO terms using the GO package like it is done here.)
calculateGOHyperG <- function(x,allLLs=NULL,chip=NULL,what="MF",use.all=FALSE,v=TRUE){
	require(GO)
	require(GOstats)
        if (chip == "fly.db0.db"){chip = "fly.db0"}
	if(is.null(allLLs) & is.null(chip) & !use.all){
		stop("You have to submit either all EntrezGene IDs present on the array used, or the Affymetrix GeneChip used, or select use.all=TRUE to use all EntrezGenes available in GO!\n\n")
	}
	if(use.all){
		##
		# just that the code below throws no exception...
		allLLs <- x
	}
	##
	# if chip was submitted retrieve all LLs from the specific chip
	if(!is.null(chip)){
		require(chip,character.only=TRUE) || stop("need data package",chip)
		allLLs <- as.character(unlist(as.list(get(paste(chip,"LOCUSID",sep=""),mode="environment"))))
	}
	##
	# prepare the x (myLLs) and allLLs. remove duplicates, NAs, use only those x that are present in allLLs
	# remove possible LL. or LL strings (old way of writing LocusLink IDs)
	x <- sub("LL.","",x)
	x <- sub("LL","",x)
	allLLs <- sub("LL.","",allLLs)
	allLLs <- sub("LL","",allLLs)
	allLLs <- unique(allLLs)
	allLLs <- allLLs[!is.na(allLLs) & allLLs!=""]
	x <- unique(x)
	x <- x[!is.na(x) & x!=""]
	##
	# just interested in those x that are present in allLLs
	x <- allLLs[match(x,allLLs)]
	x <- x[!is.na(x) & x!=""]
	
	##
	# do some checking...
	if(length(x)==0 || is.na(x)){
		stop("None of the submitted interesting genes are present in the list of all EntrezGene (LocusLink) IDs submitted!\nPossible causes:\n-wrong list of genes (wrong IDs) submitted\n-EntrezGene IDs of the interesting genes and list of EntrezGene IDs of all genes on the array are not present on the same microarray\n")
	}
	
	
	
	##
	# ok, now map x to GO, retrieve all GO terms where one of the x EntrezGene IDs is present (using GOALLLOCUSID from the GO package)
	GOLL <- as.list(get("GOALLLOCUSID",mode="environment"))     ## warning, GOALLLOCUSID is deprecated in newer versions, use GOALLENTREZID
	GOLL <- GOLL[!is.na(GOLL)] # just removing all the GO ids that are not mapped to any LL
	PresentGO <- sapply(GOLL,function(z){
		if(is.na(z) || length(z)==0)
			return(FALSE)
		any(x %in% z)
		}
	)
	##
	# another checking...
	if(max(PresentGO)==0){
		stop("The submitted list of EntrezGene IDs of the interesting genes can not be mapped to any of the GO terms.\nMaybe the list contained no EntrezGene (LocusLink) IDs?\n")
	}
	if(v){
		cat("The",length(x),"submitted EntrezGene IDs map to",sum(PresentGO),"GO terms in all tree categories.\n")
	}
	GOLL <- GOLL[PresentGO]
	##
	# next restrict the GO terms to the selected category (specified by "what")
	goCat <- unlist(getGOOntology(names(GOLL)))
	goodGO <- goCat == what
	goodGO[is.na(goodGO)] <- FALSE
	if(v){
		cat(sum(goodGO),"of them are in the GO category we are interested in.\n")
	}
	mm <- match(names(goCat),names(GOLL))
	mm <- mm[goodGO]
	GOLL <- GOLL[mm]
	
	##
	# next we have to remove all the EntrezGene IDs from each GO term, that was not present on the microarray used.
	if(!use.all){
		myGOLL <- sapply(GOLL,function(z){
			if(length(z)==0 || is.na(z))
				return(NA)
			mylls <- unique(z[match(allLLs,z)])
			mylls <- mylls[!is.na(mylls)]
			mylls
		}
		)
		##
		# removing all those GO terms that have no entries left (just in the case there are some...)
		empties <- sapply(myGOLL,function(z){length(z)==1 && is.na(z)})
		myGOLL <- myGOLL[!empties]
		GOLL <- GOLL[!empties]
	}
	else{
		##
		# if use.all was selected, the proportion of the submitted EntrezGene IDs per GO term is compared to all
		# EntrezGenes mapped to that GO term.
		myGOLL <- GOLL
	}
	

	##
	# the rest is the same as in the GOHyperG function...
	cLLs <- unique(unlist(myGOLL))
	nLL <- length(cLLs)                    # nr of unique LL IDs present in all GO terms
	goCounts <- sapply(myGOLL,length)      # nr of LL IDs present on the array per GO term
	#allCounts <- sapply(GOLL,length)       # nr of all LL IDs per GO term
	##
	# some LL ids are represented more than once in a GO term...
	uniqueAllCounts <- sapply(GOLL,
		function(z){
			if(length(z)==0 || is.na(z))
				return(0)
			lls <- unique(z)
			lls <- lls[!is.na(lls)]
			return(length(lls))
		})
	whGood <- x[x %in% cLLs]               # those LLs from the interesting ones that are present in the GO terms selected
	nInt <- length(whGood)
	if(nInt==0){
		warning("No interesting genes left")
	}
	
	MappingGOLL <- lapply(myGOLL,function(z){whGood[whGood %in% z]})
	useCts <- sapply(myGOLL,function(z){sum(whGood %in% z)})   # nr of interesting LLs per GO term
	##
	# calculate the hypergeometric p values...
	pvs <- phyper(useCts-1,nInt,nLL-nInt,goCounts,lower.tail=FALSE)
	ord <- order(pvs)

	return(
		list(pvalues=pvs[ord],goCounts=goCounts[ord],intCounts=useCts[ord],allCounts=uniqueAllCounts[ord],GOLL=myGOLL[ord],intMapping=MappingGOLL[ord],numLL=nLL,numInt=nInt,intLLs=x)
	)
	
}


################
## get EntrezGenes for the specified GO term
getEntrezIDsFromGO <- function(GO, package="GO", entrezgenes){
	match.arg(package, c("GO", "humanLLMappings", "mouseLLMappings", "ratLLMappings"))
	require(package, character.only=TRUE)
	if(package=="GO"){
		ALLIDs <- mget(GO, env=GOALLENTREZID, ifnotfound=NA)
		if(!missing(entrezgenes)){
			## subset to the specified entrezgene ids.
			ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
		}
		return(ALLIDs)
	}
	if(package=="humanLLMappings"){
		ALLIDs <- mget(GO, env=humanLLMappingsGO2LL, ifnotfound=NA)
		if(!missing(entrezgenes)){
			## subset to the specified entrezgene ids.
			ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
		}
		return(ALLIDs)		
	}
	if(package=="mouseLLMappings"){
		ALLIDs <- mget(GO, env=mouseLLMappingsGO2LL, ifnotfound=NA)
		if(!missing(entrezgenes)){
			## subset to the specified entrezgene ids.
			ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
		}
		return(ALLIDs)		
	}
	if(package=="ratLLMappings"){
		ALLIDs <- mget(GO, env=ratLLMappingsGO2LL, ifnotfound=NA)
		if(!missing(entrezgenes)){
			## subset to the specified entrezgene ids.
			ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
		}
		return(ALLIDs)		
	}
}


#getLLGOmapping <- function(x,what="MF"){
#	gos <- mget(x, env=GOLOCUSID2GO,ifnotfound=NA)
#	if(length(gos)==1)
#		bd = is.na(gos[[1]])
#	else
#		bd = is.na(gos)
#	gos <- gos[!bd]
#	mymapping <- lapply(gos,getOntology,ontology=what)
#	emptygos <- sapply(mymapping,function(x) length(x)==0 )
#	mymapping <- mymapping[!emptygos]
#	mymapping
#}

#getGOFromLL <- function(x,Ontology="MF"){
#	require(GO) || stop("no GO library")
#	match.arg(Ontology, c("MF", "BP", "CC"))
#	goterms <- mget(x, env = GOLOCUSID2GO, ifnotfound=NA)
#	if(length(goterms)==1){
#		goterms <- goterms[!is.na(goterms[[1]])]
#	}
#	else{
#		goterms <- goterms[!is.na(goterms)]
#	}
#	goterms <- lapply(goterms, function(x) x[sapply(x,
#                      function(x) {if (is.na(x$Ontology) )
#                                          return(FALSE)
#                                      else
#                                          x$Ontology == Ontology})])
#	goids <- NULL
#	if(length(goterms)==0){
#		stop("The submitted list of EntrezGene IDs of the interesting genes can not be mapped to any of the GO terms.\nMaybe the list contained no EntrezGene (LocusLink) IDs?\n")
#	}
#	for(i in 1:length(goterms)){
#		goids <- c(goids,unique(unlist(sapply(goterms[[i]], function(x) x$GOID))))
#	}
#	goids
#}
#

##
# check if package is available
isAnnotationAvailable <- function(chip,develOK=FALSE){
	ok = FALSE
        if (chip == "fly.db0.db"){chip  = "fly.db0"}
#	if(!require("reposTools",character.only=TRUE)){
#		warning("reposTools package is not installed.\n")
#		if(!require(chip,character.only=TRUE)){
#			ok = FALSE
#		}
#		else{
#			ok = TRUE
#		}
#	}
#	else{
		if(!require(chip,character.only=TRUE)){
#             source("http://www.bioconductor.org/getBioC.R")
##			library(reposTools)
##			# have to install the package
##			tryCatch(install.packages2(pkg=chip,develOK=develOK,getAllDeps=TRUE),error=return(FALSE))
#			try(getBioC(chip))
#			if(!require(chip,character.only=TRUE)){
#			    ok=FALSE
#			}
#			else{
#			    ok=TRUE
#			}
		}
		else{
			ok = TRUE
		}
#	}
	ok
}

saveSAMResults <- function(SAM,table,delta,filename){
	dummy <- print(SAM)
	for(i in 1:nrow(dummy)){
		saveSAMResult(SAM,table,delta=dummy[i,"Delta"],filename=filename)
	}
}

saveSAMResult <- function(SAM, table,delta,filename){
	sum.SAM <- (summary(SAM,delta))@mat.sig
	if(!is.null(sum.SAM) & nrow(sum.SAM)>0){
		table.sub <- table[sum.SAM[,"Row"],]
		if(class(table)=="aafTable"){
			for(i in 2:(ncol(sum.SAM))){
				table.sub <- merge(table.sub,aafTable(sum.SAM[,i],colnames=colnames(sum.SAM)[i]))
			}
			Filename <- checkExistantFile(paste(filename,"-delta-",delta,".txt",sep=""))
			saveText(table.sub,filename=Filename)
		}
		else{
			if(nrow(sum.SAM)==1){
				table.sub <- matrix(table.sub,nrow=1)
				colnames(table.sub) <- colnames(table)
			}
			table.sub <- cbind(as.matrix(table.sub),as.matrix(sum.SAM[,2:ncol(sum.SAM)]))
			Filename <- checkExistantFile(paste(filename,"-delta-",delta,".txt",sep=""))
			write.table(table.sub,file=Filename,sep="\t",quote=FALSE,row.names=FALSE)
		}
		cat(nrow(table.sub),"significant genes (delta=",delta,") were saved to the file:",Filename,"\n")
	}
	else{
		cat("No significant genes with a delta of",delta,"\n")
	}
}

###
# fills missing spots
fillMissingSpots <- function(object){
	ret <- object
	if(class(ret)=="RGList"){
		if(!is.null(ret$printer)){
			layout <- object$printer
			nr.row <- layout$ngrid.r * layout$ngrid.c * layout$nspot.r * layout$nspot.c
			missing <- (nr.row) - nrow(object$R)
			if(missing!=0){
				cat("Have to fill in",missing,"missing spots\n")
				index <- as.numeric(apply(object$genes,MARGIN=1,FUN=.getIndexFromBCR,nspot.r=layout$nspot.r,nspot.c=layout$nspot.c))
				R <- matrix(ncol=ncol(object$R),nrow=nr.row)
				colnames(R) <- colnames(object$R)
				rownames(R) <- 1:nrow(R)

				## check if length of index and nrow of R are the same (<- problem with custom GAL files...)
				if(length(index)!=nrow(object$R)){
					stop("ERROR: number of data values does not match number of annotations!\nPlease try again without specifying a GAL file!")
				}
				R[index,] <- object$R
				ret$R <- R
				R[index,] <- object$G
				ret$G <- R
				if(!is.null(object$Rb)){
					R[index,] <- object$Rb
					ret$Rb <- R
					R[index,] <- object$Gb
					ret$Gb <- R
				}
				if(!is.null(object$weights)){
					R[,] <- 0
					R[index,] <- object$weights
					ret$weights <- R
				}
				else{
					R[,] <- 0
					R[index,] <- 1
					ret$weights <- R
				}
				rm(R)
				g <- gc()
				Genes = data.frame(matrix(ncol=ncol(object$genes),nrow=nr.row))
				colnames(Genes) <- colnames(object$genes)
				rownames(Genes) <- 1:nrow(Genes)
#				cat("nrow Genes",nrow(Genes),"ncol Genes",ncol(Genes),"length index",length(index),"\n")
				Genes[index,1:ncol(Genes)] = object$genes[,1:ncol(Genes)]
#				cat("genes filled\n","ncol= ",ncol(Genes)," nrow: ",nrow(Genes),"\n")
				ret$genes = NULL
				ret$genes = Genes
#				ret$genes = Genes
#				cat("genes set\n")
				rm(Genes)
				g <- gc()
			}
		}
	}
	ret
}

.getIndexFromBCR <- function(x,nspot.c,nspot.r){
	((as.numeric(x["Block"])-1)*nspot.c*nspot.r) + ((as.numeric(x["Row"])-1)*nspot.c) + as.numeric(x["Column"])
}

if( !isGeneric("drawBoxplot") )
	setGeneric("drawBoxplot", function(x,new.plot=TRUE,xlim=NULL,at=0,exclude.flagged=FALSE,...)
	standardGeneric("drawBoxplot"))

setMethod("drawBoxplot","MadbSet",
	function(x,new.plot=TRUE,xlim=NULL,at=0,exclude.flagged=FALSE,...){
		if(exclude.flagged){
			W <- getWeights(x)
			for(i in 1:ncol(exprs(x))){
				exprs(x)[W[,i]==0,i] <- NA
			}
		}
		boxplots(x=exprs(x),new.plot=new.plot,xlim=xlim,at=at,...)
	}
)

boxplots <- function(x,new.plot=TRUE,xlim=NULL,at=0,col=NULL,log2.transform=FALSE,ylim=NULL,...){
	data <- x
#	if(class(x)=="exprSet" | class(x)=="EexprSet"){
#		data <- exprs(x)
#	}
	if(log2.transform){
		data <- log2(data)
	}
	if(is.null(ncol(data))){
		data <- as.matrix(data)
	}
	## remove those nasty infinite values
	for(i in 1:ncol(data)){
		data[is.infinite(data[,i]),i] <- NA
	}

	CN <- colnames(data)
	if(is.null(CN)){
		CN <- 1:ncol(data)
	}
	if(is.null(col)){
		col=rep(0,ncol(data))
	}
	if(length(col)!=ncol(data)){
		col=rep(col[1],ncol(data))
	}
	if(new.plot){
	if(is.null(xlim)){
		xlim=c(0.5,(ncol(data)+0.5))
	}
	par(omi=0.7*c(min(2,max(strwidth(CN,units="i"))),0,0,0),cex.axis=0.7)
	if(is.null(ylim)){
		ylim=c(min(data,na.rm=TRUE),max(data,na.rm=TRUE))
	}
	plot(1,1,pch=NA,ylab="expression",xaxt="n",ylim=ylim,xlim=xlim,xlab=NA)
	}
	axis(side=1,labels=CN,at=(1+at):(ncol(data)+at),las=3)
	for(i in 1:ncol(data)){
		boxplot(data[,i],at=(i+at),add=TRUE,col=col[i],range=0,...)
	}
}


plotRankedVariance <- function(data,subset=NULL,cut){
	if(!is.null(subset)){
		data <- data[,subset]
	}
	if(!.is.log(data)){
		data <- log2(data)
	}
	SD <- apply(data,MARGIN=1,sd,na.rm=TRUE)
	CUT <- quantile(SD,cut,na.rm=TRUE)
	plot(sort(SD),pch=16,cex=0.33,ylab="sorted variance")
	abline(h=CUT,col=2)
	text((length(SD)/20),(CUT+0.2),label=format(CUT,digits=4),col=2)
}

plotSortedPValues <- function(x,top=NULL,supported.cols=c("rawp","adjp","minP","maxT","BH","BY","Bonferroni","Hochberg","Holm","SidakSS","SidakSD"),...){
	if(is.null(nrow(x))){
		stop("Please submit a matrix, data.frame or instance of aafTable!\n")
	}
	if(!is.null(top)){
		top <- min(c(nrow(x),top))
	}
	else{
		top <- nrow(x)
	}
	Columns <- supported.cols[supported.cols %in% colnames(x)]
	if(length(Columns)==0){
		stop("The submitted matrix must contain at least one column with p-values that is named according to one of the supported column names!")
	}
	if(class(x)=="aafTable"){
		require(annaffy)
		#have to do some transformations...
		x.new <- matrix(ncol=length(Columns),nrow=nrow(x))
		colnames(x.new) <- Columns
		for(i in Columns){
			x.new[,i] <- as.numeric(getText(x[[i]]))
		}
		x <- x.new
	}
	#ok, now i can start drawing...
	x <- apply(x[,Columns],MARGIN=2,sort,na.last=TRUE)
	Max <- max(x[1:top,],na.rm=TRUE)

	plot(x[1:top,Columns[1]],xlab="number of significant genes",ylab="sorted p-value",col=1,ylim=c(0,Max),main=paste("Top",top,"sorted p-values"),...)
	if(length(Columns)>1){
		for(i in 2:length(Columns)){
			points(x[1:top,Columns[i]],col=i,...)
		}
	}
	legend(top-top/4,Max,Columns,col=1:length(Columns),pch=16)
}

.is.log <- function(data,maxi=100){
	data <- data[!is.na(data)]
	data <- data[!is.infinite(data)]
#	cat("Try to check wheter or not the values are in log scale...")
	if(max(data>maxi)){
		log.values = FALSE
#		cat("signals are not in log2 scale, transforming...\n")
	}
	else{
		log.values = TRUE
#		cat("signals are already in log2 scale \n")
	}
	log.values
}

toNumeric <- function(x,Sub=","){
	if(!is.null(dim(x))){
		RN <- rownames(x)
		if(class(x[1,])=="numeric"){
			return(x)
		}
		if(!is.null(Sub)){
			for(s in Sub){
				x <- apply(x,MARGIN=2,sub,pattern=s,replacement="")
			}
		}
		x <- tryCatch(apply(x,MARGIN=2,as.numeric),error=function(error){stop("ERROR: Can not perform calculations on non-numeric data! Please submit only numeric values.")},warning=function(warning){stop("ERROR: Can not perform calculations on non-numeric data! Please submit only numeric values.")})
		rownames(x) <- RN
		x
	}
}

####
## function similar to PWamat function from the annotate package,
## generates a matrix with rows unique LL ids, columns KEGG pathways
## containing 1 if gene can be associated with the pathway, 0 otherwise.
makeLL2KEGGMappingMatrix <- function(x,nrGenes=0,verbose=TRUE){
	if(is.null(x)){
		stop("No EntrezGene (LocusLink) IDs have been submitted!\n")
	}
	require(KEGG)
	x <- unique(as.character(x))
	paths <- mget(x,KEGGEXTID2PATHID,ifnotfound=NA)
	not_found <- sapply(paths,function(x){if(length(x)==1 && is.na(x)) TRUE else FALSE})
	paths <- paths[!not_found]
	if(verbose){
		cat(length(paths),"EntrezGene IDs from the inputlist can be associated with at least one KEGG pathway.\n",sum(not_found),"IDs can not be associated with KEGG pathways.\n")
	}
	unique_paths <- unique(unlist(paths))
	Test <- sapply(paths,function(x){
		mtch <- match(x,unique_paths)
		res <- rep(0,length(unique_paths))
		res[mtch] <- 1
		res
	})
	Test <- t(Test)
	rownames(Test) <- x
	colnames(Test) <- unique_paths
	if(nrGenes>0){
		to_exclude <- colSums(Test) < nrGenes
		if(verbose){
			cat(sum(to_exclude),"Pathways are excluded, because less than",nrGenes,"genes can be mapped to them.\n")
		}
		Test <- Test[,!to_exclude]
	}
	return(Test)
}
Summary ✨

The provided R code is a collection of functions for data analysis and visualization. The makeLL2KEGGMappingMatrix function takes an input list of EntrezGene (LocusLink) IDs, queries the KEGG database to retrieve pathway information, and generates a matrix with rows corresponding to unique LL IDs and columns corresponding to KEGG pathways containing at least one gene. The toNumeric function converts non-numeric data to numeric values.
Alerts (3)

Complexity hotspot; lines 223 to 225 (total complexity: 4)
223 224 225