PageRenderTime 69ms CodeModel.GetById 18ms app.highlight 42ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/expression/utils.R

https://bitbucket.org/cistrome/cistrome-harvard/
R | 1007 lines | 757 code | 78 blank | 172 comment | 140 complexity | b382b19424ccc8e22e818953152991d9 MD5 | raw file
   1#################################################################################################
   2##
   3#################################################################################################
   4getSkipFromGPR <- function(file){
   5	con <- file(file,"r")
   6	twoLines <- readLines(con,n=2)
   7    close(con)
   8	if(substring(twoLines,1,3)[1]=="ATF"){
   9		skip <- substring(twoLines,1,2)
  10	}
  11	else{
  12		skip=c(0,-2)
  13	}
  14	as.numeric(skip[2])+2
  15}
  16
  17###########################################################################################
  18## creates a new GAL file from the information from the gpr files
  19###########################################################################################
  20createGALFromGPR <- function(file,galfile="dummy.gal",colnames=c("Block","Column","Row","Name","ID")){
  21	table <- read.table(file,as.is=TRUE,comment.char="",check.names=FALSE,header=TRUE,skip=getSkipFromGPR(file),sep="\t")
  22	names <- colnames(table)
  23	result <- data.frame(matrix(ncol=5,nrow=length(table[,1])))
  24	colnames(result) <- colnames
  25	for(i in 1:length(result[1,])){
  26		result[,i] <- table[,colnames[i]]
  27	}
  28	#result[,1:5] <- table[,1:5]
  29	#colnames(result) <- names[1:5]
  30	write.table(file=galfile,result,sep="\t",row.name=FALSE,quote=FALSE)
  31}
  32
  33plotHistogram <- function(R,G,r.col=2,g.col=3,breaks=NULL,log.transform=FALSE,add=FALSE,weights=NULL,return.breaks=FALSE,ylim=NULL,xlim=NULL,title=NULL,relative=FALSE,...){
  34	if(is.null(weights)){
  35		weights <- matrix(ncol=1,nrow=length(R))
  36		weights[,] <- 1
  37	}
  38
  39	if(log.transform==TRUE){
  40		r <- log2(R)
  41		g <- log2(G)
  42		by <- 0.1
  43		toadd <- 0.1
  44	}
  45	else{
  46		r <- R
  47		g <- G
  48		by <- 50
  49		toadd <- 50
  50	}
  51	r <- r[weights==1]
  52	g <- g[weights==1]
  53	r <- r[!is.na(r)]
  54	g <- g[!is.na(g)]
  55	r <- r[!is.infinite(r)]
  56	g <- g[!is.infinite(g)]
  57
  58	if(is.null(breaks)){
  59		breaks <- seq(min(0,(min(r,g))),(max(max(r),max(g))+toadd),by=by)
  60	}
  61	
  62	hr <- hist(r,plot=FALSE,breaks=breaks)
  63	hg <- hist(g,plot=FALSE,breaks=breaks)
  64	if(is.null(xlim)){
  65		xlim <- range(breaks)
  66	}
  67
  68	r.counts <- hr$counts
  69	g.counts <- hg$counts
  70	ylabb <- "Counts"
  71	if(relative){
  72		r.counts <- r.counts / length(r)
  73		g.counts <- g.counts / length(g)
  74		ylabb <- "relative Counts"
  75		yy <- -0.001
  76	}
  77	else{
  78		yy <- -10
  79	}
  80
  81	if(is.null(ylim)){
  82		ylim <- c(0,max(r.counts,g.counts))
  83	}
  84		
  85	if(!add){
  86		plot(hr$mids,r.counts,col=r.col,xlab="Intensity",ylab=ylabb,type="l",ylim=ylim,xlim=xlim,main=title,...)
  87	}
  88	else{
  89		points(hr$mids,r.counts,col=r.col,type="l",...)
  90	}
  91	points(hg$mids,g.counts,col=g.col,type="l",...)
  92	points(medianWithoutNA(r),yy,col=r.col,pch=3)
  93	points(medianWithoutNA(g),yy,col=g.col,pch=3)
  94	
  95	if(return.breaks){
  96		breaks
  97	}
  98#	hr <- density(r,na.rm=TRUE)
  99#	hg <- density(g,na.rm=TRUE)
 100#	xlim <- c(min(hr$x,hg$x),max(hr$x,hg$x))
 101#	ylim <- c(min(hr$y,hg$y),max(hr$y,hg$y))
 102#	plot(hr$x,hr$y,col=2,type="l",ylim=ylim,...)
 103#	points(hg$x,hg$y,col=3,type="l")
 104#	hr <- hist(r,breaks=breaks,plot=FALSE)
 105#	hg <- hist(g,breaks=breaks,plot=FALSE)
 106}
 107
 108
 109#############################################################################
 110##
 111#############################################################################
 112plotHistogramFromSlide <- function(lObject,slide=1,use.weights=TRUE,log.transform=FALSE,breaks=NULL,bg.lty=3,...){
 113	if(slide=="all"){
 114	
 115	
 116	
 117	}
 118	if(is.null(lObject$R)){
 119		R <- MAtoR(lObject$M[,slide],lObject$A[,slide])
 120		G <- MAtoG(lObject$M[,slide],lObject$A[,slide])
 121		Rb <- NULL
 122		Gb <- NULL
 123		draw.bg <- FALSE
 124		names <- colnames(lObject$M)
 125	}
 126	else{
 127		R <- lObject$R[,slide]
 128		G <- lObject$G[,slide]
 129		Rb <- lObject$Rb[,slide]
 130		Gb <- lObject$Gb[,slide]
 131		draw.bg <- !is.null(Rb)
 132		names <- colnames(lObject$R)
 133	}
 134	weights <- matrix(ncol=1,nrow=length(R))
 135	weights[,] <- 1
 136	if(is.null(lObject$weights)){
 137	}
 138	else{
 139		if(use.weights){
 140			weights <- lObject$weights[,slide]
 141		}
 142	}
 143#	r <- R[weights==1]
 144#	g <- G[weights==1]
 145#	rb <- Rb[weights==1]
 146#	gb <- Gb[weights==1]
 147	
 148	breaks <- plotHistogram(R=R,G=G,log.transform=log.transform,weights=weights,breaks=breaks,return.breaks=TRUE,title=names[slide],...)
 149	if(draw.bg){
 150		plotHistogram(R=Rb,G=Gb,log.transform=log.transform,breaks=NULL,weights=weights,lty=bg.lty,add=TRUE,...)
 151	}
 152}
 153
 154
 155
 156
 157checkExistantFile <- function(file,path=".",no.subdir=TRUE){
 158	file <- gsub(pattern="_",replacement=".",file)
 159	file <- gsub(pattern="/",replacement="-",file)
 160	counter <- 0
 161	suff <- NULL
 162	ending <- NULL
 163	ffile <- file
 164	splitted <- unlist(strsplit(file,split="\\."))
 165	if(length(splitted)>1){
 166		ending <- paste(".",splitted[length(splitted)],sep="")
 167		ffile <- paste(splitted[1:(length(splitted)-1)],collapse=".")
 168		#cat(ffile,"_",ending,"\n")
 169	}
 170	while(TRUE){
 171		newfile <- paste(ffile,suff,ending,sep="")
 172		if(file.exists(newfile)){
 173			#file does exist...
 174			counter <- counter +1
 175			suff <- paste("(",counter,")",sep="")
 176		}
 177		else{
 178			break
 179		}
 180	}
 181	newfile
 182}
 183
 184
 185
 186
 187saveMAList <- function(object,filename,remove.flagged=TRUE,na.value="NA",subset=NULL, allvalues=TRUE){
 188	genes <- NULL
 189	weights <- NULL
 190	g.length <- 0
 191	col.names <- NULL
 192	if(class(object)=="MAList"){
 193		if(is.null(dim(object$M))){
 194			object$M <- as.matrix(object$M)
 195			object$A <- as.matrix(object$A)
 196			if(!is.null(object$weights)){
 197				object$weights <- as.matrix(object$weights)
 198			}
 199		}
 200		if(is.null(subset)){
 201			subset <- rep(TRUE,nrow(object$M))
 202		}
 203		M <- as.matrix(object$M[subset,])
 204		CN <- colnames(object$M)
 205		if(is.null(CN)){
 206			CN <- object$targets[1,1]
 207		}
 208		colnames(M) <- CN
 209#		colnames(M) <- colnames(object$M)
 210		A <- as.matrix(object$A[subset,])
 211		colnames(A) <- CN
 212#		colnames(A) <- colnames(object$A)
 213		if(!is.null(object$weights)){
 214			weights <- as.matrix(object$weights[subset,])
 215		}
 216		if(!is.null(object$genes)){
 217			genes <- object$genes[subset,]
 218			if(!is.null(genes)){
 219				g.length <- ncol(genes)
 220			}
 221		}
 222	}
 223	else if(class(object)=="EexprSet" | class(object)=="MadbSet"){
 224		if(object@type=="TwoColor"){
 225			if(is.null(subset)){
 226				subset <- rep(TRUE,nrow(exprs(object)))
 227			}
 228			M <- getM(object)
 229			Colnames <- colnames(M)
 230			M <- as.matrix(M[subset,])
 231			colnames(M) <- Colnames
 232#			M <- as.matrix(getM(object)[subset,])
 233#			cat(class(M),colnames(M),"\n")
 234#			A <- as.matrix(getA(object)[subset,])
 235			A <- getA(object)
 236			A <- as.matrix(A[subset,])
 237			colnames(A) <- Colnames
 238			#cat("1")
 239			weights <- as.matrix(getWeights(object)[subset,])
 240			#cat("2")
 241			genes <- as.matrix(object@genes[subset,])
 242			if(!is.null(genes)){
 243				g.length <- ncol(genes)
 244				if(g.length==1){
 245					colnames(genes) <- "ID"
 246				}
 247			}
 248			#cat("g.length",g.length,"dim M",dim(M),"\n")
 249			#cat("3")
 250		}
 251	}
 252	else if(class(object)=="RGList"){
 253		if(is.null(subset)){
 254			subset <- rep(TRUE,nrow(object$R))
 255		}
 256		if(!.is.log(object$R)){
 257			object$R <- log2(object$R)
 258		}
 259		if(!.is.log(object$G)){
 260			object$G <- log2(object$G)
 261		}
 262		M <- object$R[subset,] - object$G[subset,]
 263		A <- 0.5*(object$R[subset,] + object$G[subset,])
 264		if(!is.null(object$weights)){
 265			weights <- object$weights[subset,]
 266		}
 267		if(!is.null(object$genes)){
 268			genes <- object$genes[subset,]
 269			if(!is.null(genes)){
 270				g.length <- ncol(genes)
 271			}
 272		}
 273	}
 274	else{
 275		stop("Objects from the class ",class(object)," can not be handled by this function\n")
 276	}
 277	#cat("ncol=",g.length+(ncol(M)*2),"nrow=",nrow=nrow(M),"\n")
 278	Table <- matrix(ncol=(g.length+(ncol(M)*2)),nrow=nrow(M))
 279	if(!is.null(genes)){
 280		Table[,1:ncol(genes)] <- as.matrix(genes)
 281		col.names <- colnames(genes)
 282	}
 283
 284	#cat(colnames(M),"\n")
 285
 286	for(i in 1:ncol(M)){
 287		col.names <- c(col.names,paste(colnames(M)[i],"M"),paste(colnames(A)[i],"A"))
 288		Table[,((i*2)-1+g.length)] <- M[,i]
 289		if(!is.null(weights) & remove.flagged){
 290			Table[weights[,i]==0,((i*2)-1+g.length)] <- NA
 291		}
 292		Table[is.na(Table[,((i*2)-1+g.length)]),((i*2)-1+g.length)] <- NA
 293		
 294		Table[,((i*2)+g.length)] <- A[,i]
 295		if(!is.null(weights) & remove.flagged){
 296			Table[weights[,i]==0,((i*2)+g.length)] <- NA
 297		}
 298		Table[is.na(Table[,((i*2)+g.length)]),((i*2)+g.length)] <- NA
 299	}
 300	colnames(Table) <- col.names
 301	
 302	if(allvalues){
 303		## calculate the red and green intensities from the M and A values...
 304		R <- MAtoR(M, A)
 305		G <- MAtoG(M, A)
 306		if(is.null(ncol(R))){
 307			R <- matrix(R, ncol=1)
 308			G <- matrix(G, ncol=1)
 309		}
 310		colnames(R) <- paste(colnames(M), "R")
 311		colnames(G) <- paste(colnames(M), "G")
 312		Table <- cbind(Table, R, G)
 313	}
 314	
 315	write.table(Table,file=filename,sep="\t",na=na.value,row.names=FALSE,quote=FALSE)
 316}
 317
 318## closes all connections of gpr files...
 319closeMyConnections <- function(pattern="gpr"){
 320    All <- showConnections(TRUE)
 321    what <- grep(All[,"description"],pattern=pattern)
 322    what <- what-1
 323    for(theCon in what){
 324        close(getConnection(theCon))
 325    }
 326}
 327
 328checkGPRs <- function(files=NULL,overwrite=TRUE){
 329	##
 330	# check for correct axon format...
 331	requiredcolumns <- c("Block","Column","Row","Name","ID","F635 Median","F635 Mean","B635 Median","B635 Mean","F532 Median","F532 Mean","B532 Median","B532 Mean","Flags")
 332	for(i in 1:length(files)){
 333		con <- file(files[i],"r")
 334		twoLines <- readLines(con,n=2)
 335		close(con)
 336#        twoLines <- scan(file=files[i], what="character",nlines=2, quiet=TRUE)
 337#        closeMyConnections()
 338		if(substring(twoLines,1,3)[1]!="ATF"){
 339			# is not in correct Axon file format
 340			cat("The file",files[i],"is not in correct Axon file format!\n")
 341			header <- read.table(files[i],header=TRUE,nrows=1,sep="\t",check.names=FALSE)
 342			if(sum(colnames(header) %in% requiredcolumns)==length(requiredcolumns)){
 343				BigFile <- readLines(con=files[i])
 344				writeLines(c("ATF\t1.0",paste("1\t",ncol(header),sep=""),"Provider=CARMAweb"),con=files[i])
 345				con <- file(files[i],"at")
 346				writeLines(con=con,BigFile)
 347                flush(con)
 348				close(con)
 349				cat("Successfully adjusted file",files[i],".\n")
 350#                closeMyConnections()
 351			}
 352			else{
 353				stop("The file",files[i],"is not in correct Axon file format! Was this file scanned with an Axon GenePix Scanner?\n")
 354			}
 355		}
 356	}
 357
 358	skips <- NULL
 359	for(i in 1:length(files)){
 360		skips <- c(skips,getSkipFromGPR(files[i]))
 361	}
 362	if(length(unique(skips))!=1){
 363		cat("GPR files have different number of header rows!\nAdjusting GPR files\n")
 364		for(i in 1:length(files)){
 365			dummy <- read.table(files[i],skip=skips[i],header=TRUE,comment.char="",sep="\t",check.names=FALSE)
 366			#cat(colnames(dummy),"\n")
 367			write.table(dummy,file=files[i],sep="\t",quote=FALSE,row.names=FALSE)
 368		}
 369        checkGPRs(files)    # just re-run the check once again...
 370	}
 371}
 372
 373createM <- function(object,red.channels,green.channels){
 374	M <- matrix(ncol=length(red.channels),nrow=nrow(exprs(object)))
 375	weights <- getWeights(object)
 376	col.names <- NULL
 377	for(i in 1:ncol(M)){
 378		r.idx <- myGrep(colnames(exprs(object)),pattern=red.channels[i],exact.match=TRUE)
 379		g.idx <- myGrep(colnames(exprs(object)),pattern=green.channels[i],exact.match=TRUE)
 380		M[,i] <- getM(object,r=r.idx,g=g.idx)
 381		M[weights[,r.idx]==0 | weights[,g.idx]==0,i] <- NA
 382		col.names <- c(col.names,paste(red.channels[i]," vs ",green.channels[i],sep=""))
 383	}
 384	colnames(M) <- col.names
 385	M
 386}
 387
 388
 389createA <- function(object,red.channels,green.channels){
 390	A <- matrix(ncol=length(red.channels),nrow=nrow(exprs(object)))
 391	weights <- getWeights(object)
 392	col.names <- NULL
 393	for(i in 1:ncol(A)){
 394		r.idx <- myGrep(colnames(exprs(object)),pattern=red.channels[i],exact.match=TRUE)
 395		g.idx <- myGrep(colnames(exprs(object)),pattern=green.channels[i],exact.match=TRUE)
 396		A[,i] <- getA(object,r=r.idx,g=g.idx)
 397		A[weights[,r.idx]==0 | weights[,g.idx]==0,i] <- NA
 398		col.names <- c(col.names,paste(red.channels[i]," vs ",green.channels[i],sep=""))
 399	}
 400	colnames(A) <- col.names
 401	A
 402}
 403
 404
 405##
 406# to remove all those genes that have flags in 
 407excludeFromTest <- function(data,classlabels,weights=NULL,nr.present=2){
 408	if(!is.null(weights)){
 409		if(sum(dim(data)==dim(weights))==2){
 410			data <- .removeFlagged(data,weights)
 411		}
 412	}
 413	result <- apply(data,MARGIN=1,.dummyFun,cl=classlabels,np=nr.present)
 414	result
 415}
 416
 417.removeFlagged <- function(data,weights,flagged.value=NA){
 418	for(i in 1:ncol(data)){
 419		data[weights[,i]==0,i] <- flagged.value
 420	}
 421	data
 422}
 423
 424.dummyFun <- function(x,cl,np){
 425	res <- sum(!is.na(x[cl==0])) < np | sum(!is.na(x[cl==1])) < np
 426	res
 427}
 428
 429tukey.biweight.mod <- function(x,c=5,epsilon=1e-4,na.rm=TRUE){
 430	x <- x[!is.na(x)]
 431	ret <- tukey.biweight(x,c=c,epsilon=epsilon)
 432}
 433
 434if( !isGeneric("filterOnVariance") )
 435	setGeneric("filterOnVariance", function(object,variance=0,array.names=NULL,remove.flagged=TRUE,v=TRUE,...)
 436	standardGeneric("filterOnVariance"))
 437
 438setMethod("filterOnVariance","MadbSet",
 439	function(object,variance=0,array.names=NULL,remove.flagged=TRUE,v=TRUE,...){
 440		if(is.null(array.names)){
 441			array.names <- colnames(exprs(object))
 442		}
 443		E <- exprs(object)[,array.names]
 444		if(remove.flagged){
 445			weights <- getWeights(object)[,array.names]
 446			for(i in 1:ncol(weights)){
 447				E[weights[,i]==0,i] <- NA
 448			}
 449		}
 450		sds <- apply(E,MARGIN=1,sd,na.rm=TRUE)
 451		quant <- quantile(sds,variance,na.rm=TRUE)
 452		sel <- sds >= quant
 453		sel[is.na(sel)] <- FALSE
 454		if(v){
 455			cat(sum(sel)," features out of ",length(sel)," have a sd bigger than ",quant,"\n")
 456		}
 457		ret <- object
 458		exprs(ret) <- exprs(object)[sel,]
 459		ret@weights <- getWeights(object)[sel,]
 460		if(nrow(exprs(object))==nrow(object@genes)){
 461			if(ncol(object@genes)==1){
 462				G <- matrix(object@genes[sel,],ncol=1)
 463				rownames(G) <- rownames(exprs(ret))
 464				colnames(G) <- colnames(object@genes)
 465				ret@genes <- G
 466			}
 467			else{
 468				ret@genes <- object@genes[sel,]
 469			}
 470		}
 471		ret
 472})
 473
 474
 475##
 476# this function is a generalization of Robert Gentlemens GOHyperG funciton from the GOstats package.
 477# it allows to submit all EntrezGenes present on the used microarray.
 478# in principal this function performs the same steps as the GOHyperG function, nevertheless the results
 479# are sligtly different (maybe because of the different annotation packages used:
 480#  mapping to GO terms using the affymetrix probe sets -> lokuslink -> GO terms (GOstats) or mapping the locuslink ids directly
 481#  to GO terms using the GO package like it is done here.)
 482calculateGOHyperG <- function(x,allLLs=NULL,chip=NULL,what="MF",use.all=FALSE,v=TRUE){
 483	require(GO)
 484	require(GOstats)
 485        if (chip == "fly.db0.db"){chip = "fly.db0"}
 486	if(is.null(allLLs) & is.null(chip) & !use.all){
 487		stop("You have to submit either all EntrezGene IDs present on the array used, or the Affymetrix GeneChip used, or select use.all=TRUE to use all EntrezGenes available in GO!\n\n")
 488	}
 489	if(use.all){
 490		##
 491		# just that the code below throws no exception...
 492		allLLs <- x
 493	}
 494	##
 495	# if chip was submitted retrieve all LLs from the specific chip
 496	if(!is.null(chip)){
 497		require(chip,character.only=TRUE) || stop("need data package",chip)
 498		allLLs <- as.character(unlist(as.list(get(paste(chip,"LOCUSID",sep=""),mode="environment"))))
 499	}
 500	##
 501	# prepare the x (myLLs) and allLLs. remove duplicates, NAs, use only those x that are present in allLLs
 502	# remove possible LL. or LL strings (old way of writing LocusLink IDs)
 503	x <- sub("LL.","",x)
 504	x <- sub("LL","",x)
 505	allLLs <- sub("LL.","",allLLs)
 506	allLLs <- sub("LL","",allLLs)
 507	allLLs <- unique(allLLs)
 508	allLLs <- allLLs[!is.na(allLLs) & allLLs!=""]
 509	x <- unique(x)
 510	x <- x[!is.na(x) & x!=""]
 511	##
 512	# just interested in those x that are present in allLLs
 513	x <- allLLs[match(x,allLLs)]
 514	x <- x[!is.na(x) & x!=""]
 515	
 516	##
 517	# do some checking...
 518	if(length(x)==0 || is.na(x)){
 519		stop("None of the submitted interesting genes are present in the list of all EntrezGene (LocusLink) IDs submitted!\nPossible causes:\n-wrong list of genes (wrong IDs) submitted\n-EntrezGene IDs of the interesting genes and list of EntrezGene IDs of all genes on the array are not present on the same microarray\n")
 520	}
 521	
 522	
 523	
 524	##
 525	# ok, now map x to GO, retrieve all GO terms where one of the x EntrezGene IDs is present (using GOALLLOCUSID from the GO package)
 526	GOLL <- as.list(get("GOALLLOCUSID",mode="environment"))     ## warning, GOALLLOCUSID is deprecated in newer versions, use GOALLENTREZID
 527	GOLL <- GOLL[!is.na(GOLL)] # just removing all the GO ids that are not mapped to any LL
 528	PresentGO <- sapply(GOLL,function(z){
 529		if(is.na(z) || length(z)==0)
 530			return(FALSE)
 531		any(x %in% z)
 532		}
 533	)
 534	##
 535	# another checking...
 536	if(max(PresentGO)==0){
 537		stop("The submitted list of EntrezGene IDs of the interesting genes can not be mapped to any of the GO terms.\nMaybe the list contained no EntrezGene (LocusLink) IDs?\n")
 538	}
 539	if(v){
 540		cat("The",length(x),"submitted EntrezGene IDs map to",sum(PresentGO),"GO terms in all tree categories.\n")
 541	}
 542	GOLL <- GOLL[PresentGO]
 543	##
 544	# next restrict the GO terms to the selected category (specified by "what")
 545	goCat <- unlist(getGOOntology(names(GOLL)))
 546	goodGO <- goCat == what
 547	goodGO[is.na(goodGO)] <- FALSE
 548	if(v){
 549		cat(sum(goodGO),"of them are in the GO category we are interested in.\n")
 550	}
 551	mm <- match(names(goCat),names(GOLL))
 552	mm <- mm[goodGO]
 553	GOLL <- GOLL[mm]
 554	
 555	##
 556	# next we have to remove all the EntrezGene IDs from each GO term, that was not present on the microarray used.
 557	if(!use.all){
 558		myGOLL <- sapply(GOLL,function(z){
 559			if(length(z)==0 || is.na(z))
 560				return(NA)
 561			mylls <- unique(z[match(allLLs,z)])
 562			mylls <- mylls[!is.na(mylls)]
 563			mylls
 564		}
 565		)
 566		##
 567		# removing all those GO terms that have no entries left (just in the case there are some...)
 568		empties <- sapply(myGOLL,function(z){length(z)==1 && is.na(z)})
 569		myGOLL <- myGOLL[!empties]
 570		GOLL <- GOLL[!empties]
 571	}
 572	else{
 573		##
 574		# if use.all was selected, the proportion of the submitted EntrezGene IDs per GO term is compared to all
 575		# EntrezGenes mapped to that GO term.
 576		myGOLL <- GOLL
 577	}
 578	
 579
 580	##
 581	# the rest is the same as in the GOHyperG function...
 582	cLLs <- unique(unlist(myGOLL))
 583	nLL <- length(cLLs)                    # nr of unique LL IDs present in all GO terms
 584	goCounts <- sapply(myGOLL,length)      # nr of LL IDs present on the array per GO term
 585	#allCounts <- sapply(GOLL,length)       # nr of all LL IDs per GO term
 586	##
 587	# some LL ids are represented more than once in a GO term...
 588	uniqueAllCounts <- sapply(GOLL,
 589		function(z){
 590			if(length(z)==0 || is.na(z))
 591				return(0)
 592			lls <- unique(z)
 593			lls <- lls[!is.na(lls)]
 594			return(length(lls))
 595		})
 596	whGood <- x[x %in% cLLs]               # those LLs from the interesting ones that are present in the GO terms selected
 597	nInt <- length(whGood)
 598	if(nInt==0){
 599		warning("No interesting genes left")
 600	}
 601	
 602	MappingGOLL <- lapply(myGOLL,function(z){whGood[whGood %in% z]})
 603	useCts <- sapply(myGOLL,function(z){sum(whGood %in% z)})   # nr of interesting LLs per GO term
 604	##
 605	# calculate the hypergeometric p values...
 606	pvs <- phyper(useCts-1,nInt,nLL-nInt,goCounts,lower.tail=FALSE)
 607	ord <- order(pvs)
 608
 609	return(
 610		list(pvalues=pvs[ord],goCounts=goCounts[ord],intCounts=useCts[ord],allCounts=uniqueAllCounts[ord],GOLL=myGOLL[ord],intMapping=MappingGOLL[ord],numLL=nLL,numInt=nInt,intLLs=x)
 611	)
 612	
 613}
 614
 615
 616################
 617## get EntrezGenes for the specified GO term
 618getEntrezIDsFromGO <- function(GO, package="GO", entrezgenes){
 619	match.arg(package, c("GO", "humanLLMappings", "mouseLLMappings", "ratLLMappings"))
 620	require(package, character.only=TRUE)
 621	if(package=="GO"){
 622		ALLIDs <- mget(GO, env=GOALLENTREZID, ifnotfound=NA)
 623		if(!missing(entrezgenes)){
 624			## subset to the specified entrezgene ids.
 625			ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
 626		}
 627		return(ALLIDs)
 628	}
 629	if(package=="humanLLMappings"){
 630		ALLIDs <- mget(GO, env=humanLLMappingsGO2LL, ifnotfound=NA)
 631		if(!missing(entrezgenes)){
 632			## subset to the specified entrezgene ids.
 633			ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
 634		}
 635		return(ALLIDs)		
 636	}
 637	if(package=="mouseLLMappings"){
 638		ALLIDs <- mget(GO, env=mouseLLMappingsGO2LL, ifnotfound=NA)
 639		if(!missing(entrezgenes)){
 640			## subset to the specified entrezgene ids.
 641			ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
 642		}
 643		return(ALLIDs)		
 644	}
 645	if(package=="ratLLMappings"){
 646		ALLIDs <- mget(GO, env=ratLLMappingsGO2LL, ifnotfound=NA)
 647		if(!missing(entrezgenes)){
 648			## subset to the specified entrezgene ids.
 649			ALLIDs <- lapply(ALLIDs, function(x){ x[ x %in% entrezgenes] })
 650		}
 651		return(ALLIDs)		
 652	}
 653}
 654
 655
 656#getLLGOmapping <- function(x,what="MF"){
 657#	gos <- mget(x, env=GOLOCUSID2GO,ifnotfound=NA)
 658#	if(length(gos)==1)
 659#		bd = is.na(gos[[1]])
 660#	else
 661#		bd = is.na(gos)
 662#	gos <- gos[!bd]
 663#	mymapping <- lapply(gos,getOntology,ontology=what)
 664#	emptygos <- sapply(mymapping,function(x) length(x)==0 )
 665#	mymapping <- mymapping[!emptygos]
 666#	mymapping
 667#}
 668
 669#getGOFromLL <- function(x,Ontology="MF"){
 670#	require(GO) || stop("no GO library")
 671#	match.arg(Ontology, c("MF", "BP", "CC"))
 672#	goterms <- mget(x, env = GOLOCUSID2GO, ifnotfound=NA)
 673#	if(length(goterms)==1){
 674#		goterms <- goterms[!is.na(goterms[[1]])]
 675#	}
 676#	else{
 677#		goterms <- goterms[!is.na(goterms)]
 678#	}
 679#	goterms <- lapply(goterms, function(x) x[sapply(x,
 680#                      function(x) {if (is.na(x$Ontology) )
 681#                                          return(FALSE)
 682#                                      else
 683#                                          x$Ontology == Ontology})])
 684#	goids <- NULL
 685#	if(length(goterms)==0){
 686#		stop("The submitted list of EntrezGene IDs of the interesting genes can not be mapped to any of the GO terms.\nMaybe the list contained no EntrezGene (LocusLink) IDs?\n")
 687#	}
 688#	for(i in 1:length(goterms)){
 689#		goids <- c(goids,unique(unlist(sapply(goterms[[i]], function(x) x$GOID))))
 690#	}
 691#	goids
 692#}
 693#
 694
 695##
 696# check if package is available
 697isAnnotationAvailable <- function(chip,develOK=FALSE){
 698	ok = FALSE
 699        if (chip == "fly.db0.db"){chip  = "fly.db0"}
 700#	if(!require("reposTools",character.only=TRUE)){
 701#		warning("reposTools package is not installed.\n")
 702#		if(!require(chip,character.only=TRUE)){
 703#			ok = FALSE
 704#		}
 705#		else{
 706#			ok = TRUE
 707#		}
 708#	}
 709#	else{
 710		if(!require(chip,character.only=TRUE)){
 711#             source("http://www.bioconductor.org/getBioC.R")
 712##			library(reposTools)
 713##			# have to install the package
 714##			tryCatch(install.packages2(pkg=chip,develOK=develOK,getAllDeps=TRUE),error=return(FALSE))
 715#			try(getBioC(chip))
 716#			if(!require(chip,character.only=TRUE)){
 717#			    ok=FALSE
 718#			}
 719#			else{
 720#			    ok=TRUE
 721#			}
 722		}
 723		else{
 724			ok = TRUE
 725		}
 726#	}
 727	ok
 728}
 729
 730saveSAMResults <- function(SAM,table,delta,filename){
 731	dummy <- print(SAM)
 732	for(i in 1:nrow(dummy)){
 733		saveSAMResult(SAM,table,delta=dummy[i,"Delta"],filename=filename)
 734	}
 735}
 736
 737saveSAMResult <- function(SAM, table,delta,filename){
 738	sum.SAM <- (summary(SAM,delta))@mat.sig
 739	if(!is.null(sum.SAM) & nrow(sum.SAM)>0){
 740		table.sub <- table[sum.SAM[,"Row"],]
 741		if(class(table)=="aafTable"){
 742			for(i in 2:(ncol(sum.SAM))){
 743				table.sub <- merge(table.sub,aafTable(sum.SAM[,i],colnames=colnames(sum.SAM)[i]))
 744			}
 745			Filename <- checkExistantFile(paste(filename,"-delta-",delta,".txt",sep=""))
 746			saveText(table.sub,filename=Filename)
 747		}
 748		else{
 749			if(nrow(sum.SAM)==1){
 750				table.sub <- matrix(table.sub,nrow=1)
 751				colnames(table.sub) <- colnames(table)
 752			}
 753			table.sub <- cbind(as.matrix(table.sub),as.matrix(sum.SAM[,2:ncol(sum.SAM)]))
 754			Filename <- checkExistantFile(paste(filename,"-delta-",delta,".txt",sep=""))
 755			write.table(table.sub,file=Filename,sep="\t",quote=FALSE,row.names=FALSE)
 756		}
 757		cat(nrow(table.sub),"significant genes (delta=",delta,") were saved to the file:",Filename,"\n")
 758	}
 759	else{
 760		cat("No significant genes with a delta of",delta,"\n")
 761	}
 762}
 763
 764###
 765# fills missing spots
 766fillMissingSpots <- function(object){
 767	ret <- object
 768	if(class(ret)=="RGList"){
 769		if(!is.null(ret$printer)){
 770			layout <- object$printer
 771			nr.row <- layout$ngrid.r * layout$ngrid.c * layout$nspot.r * layout$nspot.c
 772			missing <- (nr.row) - nrow(object$R)
 773			if(missing!=0){
 774				cat("Have to fill in",missing,"missing spots\n")
 775				index <- as.numeric(apply(object$genes,MARGIN=1,FUN=.getIndexFromBCR,nspot.r=layout$nspot.r,nspot.c=layout$nspot.c))
 776				R <- matrix(ncol=ncol(object$R),nrow=nr.row)
 777				colnames(R) <- colnames(object$R)
 778				rownames(R) <- 1:nrow(R)
 779
 780				## check if length of index and nrow of R are the same (<- problem with custom GAL files...)
 781				if(length(index)!=nrow(object$R)){
 782					stop("ERROR: number of data values does not match number of annotations!\nPlease try again without specifying a GAL file!")
 783				}
 784				R[index,] <- object$R
 785				ret$R <- R
 786				R[index,] <- object$G
 787				ret$G <- R
 788				if(!is.null(object$Rb)){
 789					R[index,] <- object$Rb
 790					ret$Rb <- R
 791					R[index,] <- object$Gb
 792					ret$Gb <- R
 793				}
 794				if(!is.null(object$weights)){
 795					R[,] <- 0
 796					R[index,] <- object$weights
 797					ret$weights <- R
 798				}
 799				else{
 800					R[,] <- 0
 801					R[index,] <- 1
 802					ret$weights <- R
 803				}
 804				rm(R)
 805				g <- gc()
 806				Genes = data.frame(matrix(ncol=ncol(object$genes),nrow=nr.row))
 807				colnames(Genes) <- colnames(object$genes)
 808				rownames(Genes) <- 1:nrow(Genes)
 809#				cat("nrow Genes",nrow(Genes),"ncol Genes",ncol(Genes),"length index",length(index),"\n")
 810				Genes[index,1:ncol(Genes)] = object$genes[,1:ncol(Genes)]
 811#				cat("genes filled\n","ncol= ",ncol(Genes)," nrow: ",nrow(Genes),"\n")
 812				ret$genes = NULL
 813				ret$genes = Genes
 814#				ret$genes = Genes
 815#				cat("genes set\n")
 816				rm(Genes)
 817				g <- gc()
 818			}
 819		}
 820	}
 821	ret
 822}
 823
 824.getIndexFromBCR <- function(x,nspot.c,nspot.r){
 825	((as.numeric(x["Block"])-1)*nspot.c*nspot.r) + ((as.numeric(x["Row"])-1)*nspot.c) + as.numeric(x["Column"])
 826}
 827
 828if( !isGeneric("drawBoxplot") )
 829	setGeneric("drawBoxplot", function(x,new.plot=TRUE,xlim=NULL,at=0,exclude.flagged=FALSE,...)
 830	standardGeneric("drawBoxplot"))
 831
 832setMethod("drawBoxplot","MadbSet",
 833	function(x,new.plot=TRUE,xlim=NULL,at=0,exclude.flagged=FALSE,...){
 834		if(exclude.flagged){
 835			W <- getWeights(x)
 836			for(i in 1:ncol(exprs(x))){
 837				exprs(x)[W[,i]==0,i] <- NA
 838			}
 839		}
 840		boxplots(x=exprs(x),new.plot=new.plot,xlim=xlim,at=at,...)
 841	}
 842)
 843
 844boxplots <- function(x,new.plot=TRUE,xlim=NULL,at=0,col=NULL,log2.transform=FALSE,ylim=NULL,...){
 845	data <- x
 846#	if(class(x)=="exprSet" | class(x)=="EexprSet"){
 847#		data <- exprs(x)
 848#	}
 849	if(log2.transform){
 850		data <- log2(data)
 851	}
 852	if(is.null(ncol(data))){
 853		data <- as.matrix(data)
 854	}
 855	## remove those nasty infinite values
 856	for(i in 1:ncol(data)){
 857		data[is.infinite(data[,i]),i] <- NA
 858	}
 859
 860	CN <- colnames(data)
 861	if(is.null(CN)){
 862		CN <- 1:ncol(data)
 863	}
 864	if(is.null(col)){
 865		col=rep(0,ncol(data))
 866	}
 867	if(length(col)!=ncol(data)){
 868		col=rep(col[1],ncol(data))
 869	}
 870	if(new.plot){
 871	if(is.null(xlim)){
 872		xlim=c(0.5,(ncol(data)+0.5))
 873	}
 874	par(omi=0.7*c(min(2,max(strwidth(CN,units="i"))),0,0,0),cex.axis=0.7)
 875	if(is.null(ylim)){
 876		ylim=c(min(data,na.rm=TRUE),max(data,na.rm=TRUE))
 877	}
 878	plot(1,1,pch=NA,ylab="expression",xaxt="n",ylim=ylim,xlim=xlim,xlab=NA)
 879	}
 880	axis(side=1,labels=CN,at=(1+at):(ncol(data)+at),las=3)
 881	for(i in 1:ncol(data)){
 882		boxplot(data[,i],at=(i+at),add=TRUE,col=col[i],range=0,...)
 883	}
 884}
 885
 886
 887plotRankedVariance <- function(data,subset=NULL,cut){
 888	if(!is.null(subset)){
 889		data <- data[,subset]
 890	}
 891	if(!.is.log(data)){
 892		data <- log2(data)
 893	}
 894	SD <- apply(data,MARGIN=1,sd,na.rm=TRUE)
 895	CUT <- quantile(SD,cut,na.rm=TRUE)
 896	plot(sort(SD),pch=16,cex=0.33,ylab="sorted variance")
 897	abline(h=CUT,col=2)
 898	text((length(SD)/20),(CUT+0.2),label=format(CUT,digits=4),col=2)
 899}
 900
 901plotSortedPValues <- function(x,top=NULL,supported.cols=c("rawp","adjp","minP","maxT","BH","BY","Bonferroni","Hochberg","Holm","SidakSS","SidakSD"),...){
 902	if(is.null(nrow(x))){
 903		stop("Please submit a matrix, data.frame or instance of aafTable!\n")
 904	}
 905	if(!is.null(top)){
 906		top <- min(c(nrow(x),top))
 907	}
 908	else{
 909		top <- nrow(x)
 910	}
 911	Columns <- supported.cols[supported.cols %in% colnames(x)]
 912	if(length(Columns)==0){
 913		stop("The submitted matrix must contain at least one column with p-values that is named according to one of the supported column names!")
 914	}
 915	if(class(x)=="aafTable"){
 916		require(annaffy)
 917		#have to do some transformations...
 918		x.new <- matrix(ncol=length(Columns),nrow=nrow(x))
 919		colnames(x.new) <- Columns
 920		for(i in Columns){
 921			x.new[,i] <- as.numeric(getText(x[[i]]))
 922		}
 923		x <- x.new
 924	}
 925	#ok, now i can start drawing...
 926	x <- apply(x[,Columns],MARGIN=2,sort,na.last=TRUE)
 927	Max <- max(x[1:top,],na.rm=TRUE)
 928
 929	plot(x[1:top,Columns[1]],xlab="number of significant genes",ylab="sorted p-value",col=1,ylim=c(0,Max),main=paste("Top",top,"sorted p-values"),...)
 930	if(length(Columns)>1){
 931		for(i in 2:length(Columns)){
 932			points(x[1:top,Columns[i]],col=i,...)
 933		}
 934	}
 935	legend(top-top/4,Max,Columns,col=1:length(Columns),pch=16)
 936}
 937
 938.is.log <- function(data,maxi=100){
 939	data <- data[!is.na(data)]
 940	data <- data[!is.infinite(data)]
 941#	cat("Try to check wheter or not the values are in log scale...")
 942	if(max(data>maxi)){
 943		log.values = FALSE
 944#		cat("signals are not in log2 scale, transforming...\n")
 945	}
 946	else{
 947		log.values = TRUE
 948#		cat("signals are already in log2 scale \n")
 949	}
 950	log.values
 951}
 952
 953toNumeric <- function(x,Sub=","){
 954	if(!is.null(dim(x))){
 955		RN <- rownames(x)
 956		if(class(x[1,])=="numeric"){
 957			return(x)
 958		}
 959		if(!is.null(Sub)){
 960			for(s in Sub){
 961				x <- apply(x,MARGIN=2,sub,pattern=s,replacement="")
 962			}
 963		}
 964		x <- tryCatch(apply(x,MARGIN=2,as.numeric),error=function(error){stop("ERROR: Can not perform calculations on non-numeric data! Please submit only numeric values.")},warning=function(warning){stop("ERROR: Can not perform calculations on non-numeric data! Please submit only numeric values.")})
 965		rownames(x) <- RN
 966		x
 967	}
 968}
 969
 970####
 971## function similar to PWamat function from the annotate package,
 972## generates a matrix with rows unique LL ids, columns KEGG pathways
 973## containing 1 if gene can be associated with the pathway, 0 otherwise.
 974makeLL2KEGGMappingMatrix <- function(x,nrGenes=0,verbose=TRUE){
 975	if(is.null(x)){
 976		stop("No EntrezGene (LocusLink) IDs have been submitted!\n")
 977	}
 978	require(KEGG)
 979	x <- unique(as.character(x))
 980	paths <- mget(x,KEGGEXTID2PATHID,ifnotfound=NA)
 981	not_found <- sapply(paths,function(x){if(length(x)==1 && is.na(x)) TRUE else FALSE})
 982	paths <- paths[!not_found]
 983	if(verbose){
 984		cat(length(paths),"EntrezGene IDs from the inputlist can be associated with at least one KEGG pathway.\n",sum(not_found),"IDs can not be associated with KEGG pathways.\n")
 985	}
 986	unique_paths <- unique(unlist(paths))
 987	Test <- sapply(paths,function(x){
 988		mtch <- match(x,unique_paths)
 989		res <- rep(0,length(unique_paths))
 990		res[mtch] <- 1
 991		res
 992	})
 993	Test <- t(Test)
 994	rownames(Test) <- x
 995	colnames(Test) <- unique_paths
 996	if(nrGenes>0){
 997		to_exclude <- colSums(Test) < nrGenes
 998		if(verbose){
 999			cat(sum(to_exclude),"Pathways are excluded, because less than",nrGenes,"genes can be mapped to them.\n")
1000		}
1001		Test <- Test[,!to_exclude]
1002	}
1003	return(Test)
1004}
1005
1006
1007