PageRenderTime 71ms CodeModel.GetById 45ms RepoModel.GetById 0ms app.codeStats 0ms

/programming/Rcode/Rexamples/R/SNPAssortedMating.R

https://github.com/DannyArends/Other
R | 364 lines | 321 code | 20 blank | 23 comment | 48 complexity | 2d1d7c099e48ee8ef852cb2cfd65f88d MD5 | raw file
  1. #####################################################################
  2. #
  3. # AssortedMating.R
  4. #
  5. # copyright parent_check (c) 2009, Danny Arends
  6. # last modified Apr, 2009
  7. # first written Apr, 2009
  8. #
  9. # Contains: assortedmating
  10. #
  11. ######################################################################
  12. plotDIR <- function(dir = "D:\\GBIC\\Lude\\Resultsgood", which = 2){
  13. setwd(dir)
  14. cnt <- 0
  15. genome <- NULL
  16. m_markers <- 1
  17. m_done <- 0
  18. for(x in dir()){
  19. if(!is.na(grep("res",x)==1&&1)){
  20. chr <- read.table(x,row.names=1,header=FALSE,sep="\t")
  21. markers <- nrow(chr)
  22. m_done <- m_done+markers
  23. score <- max(abs(chr))
  24. cnt <- cnt+1
  25. m_markers <- c(m_markers,m_done)
  26. }
  27. }
  28. c <- c("black","gray")
  29. cnt <- 0
  30. names <- NULL
  31. if(which==1){
  32. plot(x=c(1,m_done),y=c(0,2000),main="Population Score",xlab="Markers",ylab="Chi^2",type="n")
  33. }else{
  34. if(which==2){
  35. plot(x=c(1,m_done),y=c(-150,150),main="InbreadingScores",xlab="Markers",ylab="Chi^2", type="n")
  36. }else{
  37. plot(x=c(1,m_done),y=c(-150,150),main="AMScores",xlab="Markers",ylab="Chi^2", type="n")
  38. }
  39. }
  40. for(x in dir()){
  41. if(!is.na(grep("res",x)==1&&1)){
  42. cnt <- cnt+1
  43. names <- c(names,x)
  44. chr <- read.table(x,row.names=1,header=FALSE,sep="\t")
  45. colo <- (cnt%%length(c))
  46. if(colo==0){colo<- length(c)}
  47. points(x=m_markers[cnt]+1:length(chr[,which]),chr[,which],type="h",col=c[colo], main=x)
  48. }
  49. }
  50. names <- gsub("res","",names)
  51. names <- gsub(".txt","",names)
  52. legend("topright",names,lty=1,col=c,cex=.6,ncol=2)
  53. }
  54. plotLength<- function(x,LengthMatrix,name=TRUE,which = 2){
  55. if(which==1){
  56. text <- "Inbreading (Chi^2)"
  57. }else{
  58. text <- "Deviation from random mating(Chi^2)"
  59. }
  60. if(name){
  61. lbls <- paste("Chr ",LengthMatrix[,1],"_",LengthMatrix[,2],sep="")
  62. size=0.5
  63. }else{
  64. lbls <- rownames(LengthMatrix)
  65. size=0.7
  66. }
  67. plot(x[[2]],type="b",axes=0,xlab="",ylab=text)
  68. box()
  69. Axis(side=1,at=1:37,labels=lbls,cex.axis=size,las=0.5,las=2)
  70. Axis(side=2)
  71. }
  72. assortedmating.init <- function(){
  73. workD="D:\\GBIC\\Lude\\DataDanny"
  74. setwd(workD)
  75. snps <- as.matrix(read.table("length.txt"))
  76. LengthMatrix <- NULL
  77. for(x in dir()[grep("Chr",dir())]){
  78. if(is.na(grep("res",x)==1&&1)){
  79. cat("Reading",x,"\n")
  80. data <- read.table(x,row.names=1)
  81. for(searching in 1:nrow(snps)){
  82. chr <- as.numeric(snps[searching,2])
  83. if(chr %in% data[,1]){
  84. bp <- as.numeric(snps[searching,3])
  85. cat("SNPs in length.txt on this chromosome:",snps[searching,1:3],"\n")
  86. smaller <- which(data[,2]<(bp+1000))
  87. larger <- which(data[,2]>(bp-1000))
  88. intersect(smaller,larger)
  89. if(!is.na(intersect(smaller,larger)&&1)){
  90. for( y in intersect(smaller,larger)){
  91. cat("->",rownames(data)[y],data[y,1],data[y,2],"\n")
  92. LengthMatrix <- rbind(LengthMatrix,data[y,])
  93. }
  94. }
  95. }
  96. }
  97. }
  98. }
  99. LengthMatrix
  100. }
  101. doDir <- function(){
  102. setwd("D:\\GBIC\\Lude\\DataDanny")
  103. order <- as.vector(read.csv("ChipIDOrderInGenotypeFiles.txt",header=FALSE,sep="\n"))
  104. info <- read.csv("ChipIDToSampleInformation.txt",row.names=1,header=TRUE,sep="\t")
  105. coln <- c("Chr","Pos",as.vector(order[,1]))
  106. for(x in dir()){
  107. if(!is.na(grep("Chr",x)==1&&1)){
  108. if(is.na(grep("res",x)==1&&1)){
  109. data <- read.csv(x,row.names=1,header=TRUE,sep="\t")
  110. colnames(data) <- coln
  111. assortedmating(data,info,1,nrow(data)-1,paste("res",x,sep=""),plot=FALSE)
  112. }
  113. }
  114. }
  115. }
  116. init <- function(){
  117. setwd("D:\\GBIC\\Lude\\RAW")
  118. data <- read.csv("Chr2.txt",row.names=1,sep="\t")
  119. order <- as.vector(read.csv("ChipIDOrderInGenotypeFiles.txt",header=FALSE,sep="\n"))
  120. info <- read.csv("ChipIDToSampleInformation.txt",row.names=1,header=TRUE,sep="\t")
  121. coln <- c("Chr","Pos",as.vector(order[,1]))
  122. colnames(data) <- coln
  123. }
  124. initplottingqq <- function(name="Chromosome 1",resF="resChr1.txt"){
  125. setwd("D:\\GBIC\\Lude\\Resultsgood")
  126. r <- read.csv(resF,row.names=1,header=FALSE,sep="\t")
  127. plottingqq(name,r[,3],3)
  128. }
  129. readallres <- function(){
  130. setwd("D:\\GBIC\\Lude\\Resultsgood")
  131. reslist <- NULL
  132. for(x in dir()){
  133. if(!is.na(grep("Chr",x)==1&&1)){
  134. if(!is.na(grep("res",x)==1&&1)){
  135. cat(x,"\n")
  136. data <- read.csv(x,row.names=1,header=FALSE,sep="\t")
  137. reslist <- rbind(reslist,data)
  138. }
  139. }
  140. }
  141. reslist
  142. }
  143. ScaledownusingHW <- function(result,cutoff){
  144. result[result[,1]<cutoff,]
  145. }
  146. AboveAM <- function(result,cutoff_hw,cutoff_am){
  147. adhereHW <- ScaledownusingHW(result,cutoff_hw);
  148. uber <- adhereHW[adhereHW[,3]>cutoff_am,]
  149. write.table(file=paste("HW<"+cutoff_hw+"AM<"+cutoff_am+".txt",sep=""),uber,sep="\t",quote=F,row.names=F,col.names=F)
  150. }
  151. addLoc <- function(name){
  152. }
  153. plottingqq <- function(name,scores,df){
  154. pos <- scores[scores>=0]
  155. neg <- scores[scores<0]
  156. exp_pos <- rchisq(length(pos),df=df)
  157. exp_neg <- -rchisq(length(neg),df=df)
  158. colors = rep("blue",length(neg))
  159. colors = c(colors,rep("green",length(pos)))
  160. plot(-75:150,-75:150,main=paste(name,"DegFree:",df),xlab="Observed Chi^2",ylab="Expected Chi^2",type="n")
  161. points(sort(c(pos,neg)),sort(c(exp_pos,exp_neg)),pch=19,lwd=0.1,col=colors)
  162. lines(-max(abs(scores)):max(abs(scores)),-max(abs(scores)):max(abs(scores)),col="black")
  163. legend("bottomright", legend = c("Homozygote", "Hetrozygote"),col=c("blue","green"),pch=19,title="Chi^2 Scores")
  164. }
  165. assortedmating <- function(data,info,start=1,number=0,name="Test.txt",plot=TRUE,verbose=TRUE,logger=FALSE){
  166. PopulationScores <- NULL #Teststatistic results holder
  167. DirScores <- NULL
  168. SPopulationScores <- NULL #Scaled Teststatistic result holder
  169. InbreadingScores <- NULL #Inbreading coeficient result holder
  170. ReturnMatrix <- c("",0,0,0,0) #Final scoring matrix
  171. pDir <- NULL #ParentalMatingSchema LikelyhoodScores
  172. PopScores <- NULL
  173. ParScores <- NULL
  174. ourcat(logger,"AM","INFO: Analyzing",number,"SNPMarkers\n",verbose=verbose)
  175. plist <- NULL
  176. for(locus in start:(start+number)){
  177. ourcat(logger,"AM","-----------------------------------------------------------------------\n",verbose=verbose)
  178. ourcat(logger,"AM","Starting with locus:",locus,"\n",verbose=verbose)
  179. ourcat(logger,"AM","-----------------------------------------------------------------------\n",verbose=verbose)
  180. ind_num <- 1
  181. nparents <- 0
  182. ObservedAllel <- c(0,0,0,0)
  183. Observed <- rep(0,10)
  184. Expected <- rep(0,10)
  185. ExpectedAllel <- c(0,0,0,0)
  186. Observedtable <- cbind(rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10))
  187. Expectedtable <- cbind(rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10),rep(0,10))
  188. names(ObservedAllel) <- c("A","C","G","T")
  189. names(ExpectedAllel) <- c("A","C","G","T")
  190. rownames(Observedtable) <- c("AA","AC","AG","AT","CC","CG","CT","GG","GT","TT")
  191. names(Observed) <- c("AA","AC","AG","AT","CC","CG","CT","GG","GT","TT")
  192. names(Expected) <- c("AA","AC","AG","AT","CC","CG","CT","GG","GT","TT")
  193. rownames(Expectedtable) <- c("AA","AC","AG","AT","CC","CG","CT","GG","GT","TT")
  194. colnames(Observedtable) <- c("AA","AC","AG","AT","CC","CG","CT","GG","GT","TT")
  195. colnames(Expectedtable) <- c("AA","AC","AG","AT","CC","CG","CT","GG","GT","TT")
  196. #cat("here",ind_num)
  197. for(ind in rownames(info)){
  198. #cat("starting with:",ind,"\n")
  199. colnum <- which(colnames(data)==ind)
  200. #cat(ind,"colnum in data:",colnum,"\n")
  201. #cat(ind,"rownum in info:",ind_num,"\n")
  202. ind.status <- info[ind_num,1]
  203. ind.sex <- info[ind_num,2]
  204. ind.MFC <- info[ind_num,3]
  205. ind.Fam <- info[ind_num,4]
  206. #cat("Data:",ind.status,ind.sex,ind.MFC,ind.Fam,sep="\t","\n")
  207. if(info[ind_num,3]=="M" || info[ind_num,3]=="F"){
  208. #parentfound
  209. nparents <- nparents+1
  210. geno_1 <- as.character(data[locus,colnum])
  211. geno_1 <- paste(sort(c(substr(geno_1,1,1),substr(geno_1,2,2)))[1],sort(c(substr(geno_1,1,1),substr(geno_1,2,2)))[2],sep="")
  212. ObservedAllel[substr(geno_1,1,1)] <- ObservedAllel[substr(geno_1,1,1)]+1
  213. ObservedAllel[substr(geno_1,2,2)] <- ObservedAllel[substr(geno_1,2,2)]+1
  214. Observed[geno_1] <- Observed[geno_1] +1
  215. for(ind2_num in which(info[,4]==ind.Fam)){
  216. if(info[ind2_num,3] != info[ind_num,3] && info[ind2_num,3] != "C"){
  217. #cat(rownames(info)[ind2_num]," !!!\n")
  218. colnum_2 <- which(colnames(data)==rownames(info)[ind2_num])
  219. #cat("Colnum2:",colnum_2,"\n")
  220. geno_2 <- as.character(data[locus,colnum_2])
  221. #cat("Geno's:",geno_1,geno_2,"\n")
  222. geno_2 <- paste(sort(c(substr(geno_2,1,1),substr(geno_2,2,2)))[1],sort(c(substr(geno_2,1,1),substr(geno_2,2,2)))[2],sep="")
  223. Observedtable[geno_1,geno_2] <- Observedtable[geno_1,geno_2] +1
  224. }
  225. }
  226. }
  227. ind_num = ind_num+1
  228. }
  229. n.allells=sum(ObservedAllel)
  230. for(x in 1:length(ObservedAllel)){
  231. ExpectedAllel[x]=(ObservedAllel[x]) / n.allells
  232. #here there is a bug: expected != observed / n.allees (this is exected freq)
  233. ourcat(logger,"AM",names(ObservedAllel)[x],"\t",ObservedAllel[x],"\t",ExpectedAllel[x],"\n",verbose=verbose)
  234. }
  235. ourcat(logger,"AM","-----------------------------------------------------------------------\n",verbose=verbose)
  236. for(x in 1:length(Observed)){
  237. #cat(x)
  238. geno_1 <- names(Observed)[x]
  239. geno_1 <- paste(sort(c(substr(geno_1,1,1),substr(geno_1,2,2)))[1],sort(c(substr(geno_1,1,1),substr(geno_1,2,2)))[2],sep="")
  240. if(substr(geno_1,1,1) != substr(geno_1,2,2)){
  241. Expected[x] <- (ExpectedAllel[substr(geno_1,1,1)] * ExpectedAllel[substr(geno_1,2,2)]) * n.allells
  242. }else{
  243. Expected[x] <- (ExpectedAllel[substr(geno_1,1,1)] * ExpectedAllel[substr(geno_1,2,2)]) * n.allells /2
  244. }
  245. ourcat(logger,"AM","Tests",names(Observed)[x],"\t",Observed[x],"\t",Expected[x],"\n",verbose=verbose)
  246. }
  247. cat(" ",colnames(Observedtable),"\n",sep="\t")
  248. for(x in 1:dim(Observedtable)[1]){
  249. cat(rownames(Observedtable)[x],rep("\t",x))
  250. for(y in x:dim(Observedtable)[2]){
  251. Expectedtable[x,y] <- as.integer((Observed[x]/nparents)*(Observed[y]/nparents)*(nparents/2))
  252. if(x==y){
  253. Observedtable[x,y] <- Observedtable[x,y]/2
  254. }else{
  255. Expectedtable[x,y] <- Expectedtable[x,y]*2
  256. }
  257. cat(Observedtable[x,y],"/",Expectedtable[x,y],"\t",sep="")
  258. }
  259. cat("\n")
  260. }
  261. #Statistics
  262. teststat = 0
  263. dir = 0
  264. teststat1 = 0
  265. for(x in 1:length(Expected)){
  266. geno_1 <- names(Expected)[x]
  267. a <- substr(geno_1,1,1)
  268. b <- substr(geno_1,2,2)
  269. value=0
  270. if(Expected[x] !=0){
  271. value = ((Observed[x]-Expected[x])^2/Expected[x])
  272. teststat <- teststat+value
  273. if(a==b){
  274. dir <- 1-(Observed[x]/Expected[x])
  275. }
  276. }else{
  277. teststat <- teststat+0
  278. }
  279. }
  280. ourcat(logger,"AM","Pop TestStat:",teststat,dir,dir*teststat,"\n",verbose=verbose)
  281. dirHO = 0
  282. dirHE = 0
  283. ParentDirection <- NULL
  284. Pnames <- NULL
  285. for(x in 1:dim(Expectedtable)[1]){
  286. for(y in 1:dim(Expectedtable)[2]){
  287. value <- 0
  288. if(Expectedtable[x,y] !=0){
  289. value <- ((Observedtable[x,y]-Expectedtable[x,y])^2/Expectedtable[x,y])
  290. }
  291. if(x==y){
  292. dirHO = dirHO+value
  293. }else{
  294. dirHE = dirHE+value
  295. }
  296. ParentDirection <- c(ParentDirection,value)
  297. Pnames <- c(Pnames,paste(names(Expectedtable)[x],names(Expectedtable)[y]))
  298. teststat1 <- teststat1+value
  299. }
  300. }
  301. cat(dirHO,dirHE,"\n")
  302. if((dirHO-dirHE) < 0){
  303. teststat1 = -teststat1
  304. }
  305. ourcat(logger,"AM","Pop TestStat_1:",teststat1,"\n",verbose=verbose)
  306. pDir <- cbind(pDir,ParentDirection)
  307. rownames(pDir) <- Pnames
  308. if(plot){
  309. op <- par(mfrow = c(2,1))
  310. DirScores <- c(DirScores,dir)
  311. PopScores <- c(PopScores,teststat)
  312. max <- max(as.numeric(PopScores))
  313. min <- min(as.numeric(PopScores))
  314. plot(c(1,length(PopScores)),c(min,max),xlab="markers",ylab="Chi^2",type='n')
  315. lines(PopScores,lwd=2,col=rgb(0.3,0.3,1.0,1.0),type="o")
  316. ParScores <- c(ParScores,teststat1)
  317. max <- max(as.numeric(ParScores))
  318. min <- min(as.numeric(ParScores))
  319. plot(c(1,length(ParScores)),c(min,max),xlab="markers",ylab="Chi^2",type='n')
  320. lines(ParScores,lwd=2,col=rgb(0.3,0.3,1.0,1.0),type="o")
  321. }
  322. cat(file=name,paste(rownames(data)[locus],teststat,dir,teststat1,sep="\t"),"\n",append=T)
  323. }
  324. list(PopScores,DirScores,ParScores,pDir)
  325. }
  326. init.QQplot <- function(){
  327. full <- NULL
  328. for(x in dir()[grep("Chr",dir())]){
  329. if(!is.na(grep("res",x)==1&&1)){
  330. cat("Reading",x,"\n")
  331. data <- read.table(x)
  332. full <- c(full,data[,2])
  333. }
  334. }
  335. }
  336. init.length <- function(){
  337. setwd("D:\\GBIC\\Lude\\DataDanny")
  338. order <- as.vector(read.csv("ChipIDOrderInGenotypeFiles.txt",header=FALSE,sep="\n"))
  339. info <- read.csv("ChipIDToSampleInformation.txt",row.names=1,header=TRUE,sep="\t")
  340. coln <- c("Chr","Pos",as.vector(order[,1]))
  341. load("LM.Rdata")
  342. colnames(LengthMatrix) <- coln
  343. aaa <- assortedmating(LengthMatrix,info,1,5)
  344. }