SNPAssortedMating.R | searchcode

/programming/Rcode/Rexamples/R/SNPAssortedMating.R

https://github.com/DannyArends/Other
R | 364 lines | 321 code | 20 blank | 23 comment | 48 complexity | 2d1d7c099e48ee8ef852cb2cfd65f88d MD5 | raw file

#####################################################################
#
# AssortedMating.R
#
# copyright parent_check (c) 2009, Danny Arends
# last modified Apr, 2009
# first written Apr, 2009
# 
# Contains: assortedmating
#
######################################################################

plotDIR <- function(dir = "D:\\GBIC\\Lude\\Resultsgood", which = 2){
	setwd(dir)
	cnt <- 0
	genome <- NULL
	m_markers <- 1
	m_done <- 0
	for(x in dir()){
		if(!is.na(grep("res",x)==1&&1)){
		chr <- read.table(x,row.names=1,header=FALSE,sep="\t")
		markers <- nrow(chr)
		m_done <- m_done+markers
		score <- max(abs(chr))
		cnt <- cnt+1
		m_markers <- c(m_markers,m_done)
		}
	}
	c <- c("black","gray")
	cnt <- 0
	names <- NULL
	if(which==1){
		plot(x=c(1,m_done),y=c(0,2000),main="Population Score",xlab="Markers",ylab="Chi^2",type="n")
	}else{
		if(which==2){
			plot(x=c(1,m_done),y=c(-150,150),main="InbreadingScores",xlab="Markers",ylab="Chi^2", type="n")
		}else{
			plot(x=c(1,m_done),y=c(-150,150),main="AMScores",xlab="Markers",ylab="Chi^2", type="n")
		}
	}
	for(x in dir()){
		if(!is.na(grep("res",x)==1&&1)){
		cnt <- cnt+1
		names <- c(names,x)
		chr <- read.table(x,row.names=1,header=FALSE,sep="\t")
		colo <- (cnt%%length(c))
		if(colo==0){colo<- length(c)}
		points(x=m_markers[cnt]+1:length(chr[,which]),chr[,which],type="h",col=c[colo], main=x)
		}
	}
	names <- gsub("res","",names)
	names <- gsub(".txt","",names)
	legend("topright",names,lty=1,col=c,cex=.6,ncol=2)
}

plotLength<- function(x,LengthMatrix,name=TRUE,which = 2){
	if(which==1){
		text <- "Inbreading (Chi^2)"
	}else{
		text <- "Deviation from random mating(Chi^2)"
	}
	if(name){
		lbls <- paste("Chr ",LengthMatrix[,1],"_",LengthMatrix[,2],sep="")
		size=0.5
	}else{
		lbls <- rownames(LengthMatrix)
		size=0.7
	}
	plot(x[[2]],type="b",axes=0,xlab="",ylab=text)
	box()
	Axis(side=1,at=1:37,labels=lbls,cex.axis=size,las=0.5,las=2)
	Axis(side=2)
}

assortedmating.init <- function(){
	workD="D:\\GBIC\\Lude\\DataDanny"
	setwd(workD)	
	snps <- as.matrix(read.table("length.txt"))
	LengthMatrix <- NULL
	for(x in dir()[grep("Chr",dir())]){
		if(is.na(grep("res",x)==1&&1)){
			cat("Reading",x,"\n")
			data <- read.table(x,row.names=1)
			for(searching in 1:nrow(snps)){
				chr <- as.numeric(snps[searching,2])
				if(chr %in% data[,1]){
					bp <- as.numeric(snps[searching,3])
					cat("SNPs in length.txt on this chromosome:",snps[searching,1:3],"\n")
					smaller <- which(data[,2]<(bp+1000))
					larger <- which(data[,2]>(bp-1000))
					intersect(smaller,larger)
					if(!is.na(intersect(smaller,larger)&&1)){
						for( y in intersect(smaller,larger)){
							cat("->",rownames(data)[y],data[y,1],data[y,2],"\n")
							LengthMatrix <- rbind(LengthMatrix,data[y,])
						}
					}
				}
			}
		}
	}
	LengthMatrix
}


doDir <- function(){
setwd("D:\\GBIC\\Lude\\DataDanny")
order <- as.vector(read.csv("ChipIDOrderInGenotypeFiles.txt",header=FALSE,sep="\n"))
info <- read.csv("ChipIDToSampleInformation.txt",row.names=1,header=TRUE,sep="\t")
coln <- c("Chr","Pos",as.vector(order[,1]))
for(x in dir()){
	if(!is.na(grep("Chr",x)==1&&1)){
	if(is.na(grep("res",x)==1&&1)){
		data <- read.csv(x,row.names=1,header=TRUE,sep="\t")
		colnames(data) <- coln
		assortedmating(data,info,1,nrow(data)-1,paste("res",x,sep=""),plot=FALSE)
	}
	}
}
}

init <- function(){
setwd("D:\\GBIC\\Lude\\RAW")
data <- read.csv("Chr2.txt",row.names=1,sep="\t")
order <- as.vector(read.csv("ChipIDOrderInGenotypeFiles.txt",header=FALSE,sep="\n"))
info <- read.csv("ChipIDToSampleInformation.txt",row.names=1,header=TRUE,sep="\t")
coln <- c("Chr","Pos",as.vector(order[,1]))
colnames(data) <- coln
}

initplottingqq <- function(name="Chromosome 1",resF="resChr1.txt"){
	setwd("D:\\GBIC\\Lude\\Resultsgood")
	r <- read.csv(resF,row.names=1,header=FALSE,sep="\t")
	plottingqq(name,r[,3],3)
}

readallres <- function(){
	setwd("D:\\GBIC\\Lude\\Resultsgood")
	reslist <- NULL
	for(x in dir()){
		if(!is.na(grep("Chr",x)==1&&1)){
			if(!is.na(grep("res",x)==1&&1)){
				cat(x,"\n")
				data <- read.csv(x,row.names=1,header=FALSE,sep="\t")
				reslist <- rbind(reslist,data)
			}
		}
	}
	reslist
}

ScaledownusingHW <- function(result,cutoff){
	result[result[,1]<cutoff,]
}

AboveAM <- function(result,cutoff_hw,cutoff_am){
	adhereHW <- ScaledownusingHW(result,cutoff_hw);
	uber <-  adhereHW[adhereHW[,3]>cutoff_am,]
	write.table(file=paste("HW<"+cutoff_hw+"AM<"+cutoff_am+".txt",sep=""),uber,sep="\t",quote=F,row.names=F,col.names=F)
}
addLoc <- function(name){

}

plottingqq <- function(name,scores,df){
	pos <- scores[scores>=0]
	neg <- scores[scores<0]
	exp_pos <- rchisq(length(pos),df=df)
	exp_neg <- -rchisq(length(neg),df=df)
	colors = rep("blue",length(neg))
	colors = c(colors,rep("green",length(pos)))
	
	plot(-75:150,-75:150,main=paste(name,"DegFree:",df),xlab="Observed Chi^2",ylab="Expected Chi^2",type="n")
	points(sort(c(pos,neg)),sort(c(exp_pos,exp_neg)),pch=19,lwd=0.1,col=colors)
	lines(-max(abs(scores)):max(abs(scores)),-max(abs(scores)):max(abs(scores)),col="black")
	legend("bottomright", legend = c("Homozygote", "Hetrozygote"),col=c("blue","green"),pch=19,title="Chi^2 Scores")
}

assortedmating <- function(data,info,start=1,number=0,name="Test.txt",plot=TRUE,verbose=TRUE,logger=FALSE){
	PopulationScores <- NULL		#Teststatistic results holder
	DirScores <- NULL
	SPopulationScores <- NULL				#Scaled Teststatistic result holder
	InbreadingScores <- NULL				#Inbreading coeficient result holder
	ReturnMatrix <- c("",0,0,0,0)			#Final scoring matrix
	pDir <- NULL				#ParentalMatingSchema LikelyhoodScores
	PopScores <- NULL
	ParScores <- NULL
	ourcat(logger,"AM","INFO: Analyzing",number,"SNPMarkers\n",verbose=verbose)
	
	plist <- NULL
	for(locus in start:(start+number)){
		ourcat(logger,"AM","-----------------------------------------------------------------------\n",verbose=verbose)
		ourcat(logger,"AM","Starting with locus:",locus,"\n",verbose=verbose)
		ourcat(logger,"AM","-----------------------------------------------------------------------\n",verbose=verbose)
		ind_num <- 1
		nparents <- 0
		ObservedAllel <- c(0,0,0,0)
		Observed <- rep(0,10)
		Expected <- rep(0,10)
		ExpectedAllel <- c(0,0,0,0)
		Observedtable <- cbind(rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10))
		Expectedtable <- cbind(rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10))
		names(ObservedAllel) <- c("A","C","G","T")
		names(ExpectedAllel) <- c("A","C","G","T")
		rownames(Observedtable) <- c("AA","AC","AG","AT","CC","CG","CT","GG","GT","TT")
		names(Observed) <- c("AA","AC","AG","AT","CC","CG","CT","GG","GT","TT")
		names(Expected) <- c("AA","AC","AG","AT","CC","CG","CT","GG","GT","TT")
		rownames(Expectedtable) <- c("AA","AC","AG","AT","CC","CG","CT","GG","GT","TT")
		colnames(Observedtable) <- c("AA","AC","AG","AT","CC","CG","CT","GG","GT","TT")
		colnames(Expectedtable) <- c("AA","AC","AG","AT","CC","CG","CT","GG","GT","TT")
		#cat("here",ind_num)
		for(ind in rownames(info)){
			#cat("starting with:",ind,"\n")
			colnum <- which(colnames(data)==ind)
			#cat(ind,"colnum in data:",colnum,"\n")
			#cat(ind,"rownum in info:",ind_num,"\n")
			ind.status <- info[ind_num,1]
			ind.sex <- info[ind_num,2]
			ind.MFC <- info[ind_num,3]
			ind.Fam <- info[ind_num,4]
			#cat("Data:",ind.status,ind.sex,ind.MFC,ind.Fam,sep="\t","\n")
			if(info[ind_num,3]=="M" || info[ind_num,3]=="F"){
				#parentfound
				nparents <- nparents+1
				geno_1 <- as.character(data[locus,colnum])
				geno_1 <- paste(sort(c(substr(geno_1,1,1),substr(geno_1,2,2)))[1],sort(c(substr(geno_1,1,1),substr(geno_1,2,2)))[2],sep="")
				ObservedAllel[substr(geno_1,1,1)] <- ObservedAllel[substr(geno_1,1,1)]+1
				ObservedAllel[substr(geno_1,2,2)] <- ObservedAllel[substr(geno_1,2,2)]+1
				Observed[geno_1] <- Observed[geno_1] +1
				for(ind2_num in which(info[,4]==ind.Fam)){
					if(info[ind2_num,3] != info[ind_num,3] && info[ind2_num,3] != "C"){
						#cat(rownames(info)[ind2_num]," !!!\n")
						colnum_2 <- which(colnames(data)==rownames(info)[ind2_num])
						#cat("Colnum2:",colnum_2,"\n")
						geno_2 <- as.character(data[locus,colnum_2])
						#cat("Geno's:",geno_1,geno_2,"\n")
						geno_2 <- paste(sort(c(substr(geno_2,1,1),substr(geno_2,2,2)))[1],sort(c(substr(geno_2,1,1),substr(geno_2,2,2)))[2],sep="")
						Observedtable[geno_1,geno_2] <- Observedtable[geno_1,geno_2] +1
					}				
				}
			}
			ind_num = ind_num+1
		}
		n.allells=sum(ObservedAllel)
		for(x in 1:length(ObservedAllel)){
			ExpectedAllel[x]=(ObservedAllel[x]) / n.allells
			#here there is a bug: expected != observed / n.allees (this is exected freq)
			ourcat(logger,"AM",names(ObservedAllel)[x],"\t",ObservedAllel[x],"\t",ExpectedAllel[x],"\n",verbose=verbose)
		}
		ourcat(logger,"AM","-----------------------------------------------------------------------\n",verbose=verbose)
		for(x in 1:length(Observed)){
			#cat(x)
			geno_1 <- names(Observed)[x]
			geno_1 <- paste(sort(c(substr(geno_1,1,1),substr(geno_1,2,2)))[1],sort(c(substr(geno_1,1,1),substr(geno_1,2,2)))[2],sep="")
			if(substr(geno_1,1,1) != substr(geno_1,2,2)){
				Expected[x] <- (ExpectedAllel[substr(geno_1,1,1)] * ExpectedAllel[substr(geno_1,2,2)]) * n.allells
			}else{
				Expected[x] <- (ExpectedAllel[substr(geno_1,1,1)] * ExpectedAllel[substr(geno_1,2,2)]) * n.allells /2
			}
			ourcat(logger,"AM","Tests",names(Observed)[x],"\t",Observed[x],"\t",Expected[x],"\n",verbose=verbose)
		}
		cat(" ",colnames(Observedtable),"\n",sep="\t")
		for(x in 1:dim(Observedtable)[1]){
			cat(rownames(Observedtable)[x],rep("\t",x))
			for(y in x:dim(Observedtable)[2]){
				Expectedtable[x,y] <- as.integer((Observed[x]/nparents)*(Observed[y]/nparents)*(nparents/2))
				if(x==y){
					Observedtable[x,y] <- Observedtable[x,y]/2
				}else{
					Expectedtable[x,y] <- Expectedtable[x,y]*2
				}
				cat(Observedtable[x,y],"/",Expectedtable[x,y],"\t",sep="")
			}
			cat("\n")
		}
		#Statistics
		teststat = 0
		dir = 0
		teststat1 = 0
		for(x in 1:length(Expected)){
			geno_1 <- names(Expected)[x]
			a <- substr(geno_1,1,1)
			b <- substr(geno_1,2,2)
			value=0
			if(Expected[x] !=0){
				value = ((Observed[x]-Expected[x])^2/Expected[x])
				teststat <- teststat+value
				if(a==b){
					dir <- 1-(Observed[x]/Expected[x])
				}
			}else{
				teststat <- teststat+0
			}
		}
		ourcat(logger,"AM","Pop TestStat:",teststat,dir,dir*teststat,"\n",verbose=verbose)
		dirHO = 0
		dirHE = 0
		ParentDirection <- NULL
		Pnames <- NULL
		for(x in 1:dim(Expectedtable)[1]){
			for(y in 1:dim(Expectedtable)[2]){
				value <- 0
				if(Expectedtable[x,y] !=0){
					value <- ((Observedtable[x,y]-Expectedtable[x,y])^2/Expectedtable[x,y])	
				}
				if(x==y){
					dirHO = dirHO+value
				}else{
					dirHE = dirHE+value
				}
				ParentDirection <- c(ParentDirection,value)
				Pnames <- c(Pnames,paste(names(Expectedtable)[x],names(Expectedtable)[y]))
				teststat1 <- teststat1+value
			}
		}
		cat(dirHO,dirHE,"\n")
		if((dirHO-dirHE) < 0){
			teststat1 = -teststat1
		}
		ourcat(logger,"AM","Pop TestStat_1:",teststat1,"\n",verbose=verbose)
		pDir <- cbind(pDir,ParentDirection)
		rownames(pDir) <- Pnames
		if(plot){
			op <- par(mfrow = c(2,1))
			DirScores <- c(DirScores,dir)
			PopScores <- c(PopScores,teststat)
			max <- max(as.numeric(PopScores))
			min <- min(as.numeric(PopScores))
			plot(c(1,length(PopScores)),c(min,max),xlab="markers",ylab="Chi^2",type='n')
			lines(PopScores,lwd=2,col=rgb(0.3,0.3,1.0,1.0),type="o")

			ParScores <- c(ParScores,teststat1)		
			max <- max(as.numeric(ParScores))
			min <- min(as.numeric(ParScores))
			plot(c(1,length(ParScores)),c(min,max),xlab="markers",ylab="Chi^2",type='n')
			lines(ParScores,lwd=2,col=rgb(0.3,0.3,1.0,1.0),type="o")
		}
		cat(file=name,paste(rownames(data)[locus],teststat,dir,teststat1,sep="\t"),"\n",append=T)
	}
	list(PopScores,DirScores,ParScores,pDir)
}


init.QQplot <- function(){
	full <- NULL
	for(x in dir()[grep("Chr",dir())]){
		if(!is.na(grep("res",x)==1&&1)){
			cat("Reading",x,"\n")
			data <- read.table(x)
			full <- c(full,data[,2])
		}
	}

}

init.length <- function(){
	setwd("D:\\GBIC\\Lude\\DataDanny")
	order <- as.vector(read.csv("ChipIDOrderInGenotypeFiles.txt",header=FALSE,sep="\n"))
	info <- read.csv("ChipIDToSampleInformation.txt",row.names=1,header=TRUE,sep="\t")
	coln <- c("Chr","Pos",as.vector(order[,1]))
	load("LM.Rdata")
	colnames(LengthMatrix) <- coln
	aaa <- assortedmating(LengthMatrix,info,1,5)
}