PageRenderTime 26ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/R/read_surfaces.R

https://bitbucket.org/lachlancoin/proteomics_variable_selection
R | 525 lines | 369 code | 110 blank | 46 comment | 42 complexity | b1990a67919d8a5215b150f378e7099f MD5 | raw file
  1. ####MERGING FUNCTION
  2. mergeOnFirst<-function(prot, surf, col = 1){
  3. inds = match(prot[,1], surf[,1])
  4. cbind(prot, surf[inds,-1, drop=F])
  5. }
  6. lastnme<-function(nme){
  7. rev(strsplit(nme, "/")[[1]])[1]
  8. }
  9. #############################################
  10. ## read.all.surfaces
  11. ## Loop through surfaces but get the same file
  12. ## the all.surfaces list should have entries as many surfaces are there *2 (high/low range)
  13. read.all.surfaces<-function(path,fname, droprepeats = TRUE){
  14. all.surfaces=list()
  15. if(!file.exists(path)) print(paste("path does not exist: ", path));
  16. surfaces<-list.dirs(path)
  17. print(surfaces)
  18. for(s in 1:length(surfaces)){
  19. surface=surfaces[s] #paste(path,surfaces[s],sep="/")
  20. #setwd(surface) ## ESSENTIAL TO READ SAME FILE IN DIFF. DIR
  21. ## High/Low range dirs
  22. ranges<-list.dirs(surface)##
  23. #print("ranges")
  24. #print(ranges);
  25. for(r in 1:length(ranges)){
  26. pathnme = ranges[r];
  27. files = dir(path=pathnme);
  28. if(length(files)>0){
  29. #grep(files, pathnme, v = TRUE);
  30. fname1 = grep("csv", grep(fname, files, v = TRUE), v=TRUE);
  31. file<-paste(ranges[r],fname1,sep="/")
  32. print(file)
  33. col.name<-paste(lastnme(surfaces[s]),lastnme(ranges[r]),sep="_")
  34. col.name<-gsub(" ", "_", col.name)
  35. col.name<-gsub("High", "H", col.name)
  36. col.name<-gsub("Low", "L", col.name)
  37. col.name<-gsub("_range", "", col.name)
  38. if(file.exists(file)){
  39. print(paste("reading ",file));
  40. surface.data<-read.surface(file,col.name)
  41. print("done reading ");
  42. all.surfaces[[length(all.surfaces)+1]]=surface.data
  43. }else{
  44. print(paste("no file:", file));
  45. } ## end if
  46. }
  47. } ## end ranges
  48. } ## end surfaces
  49. length(all.surfaces) ## should be #surfaces*2
  50. ##all.surfaces
  51. ## merge all data into a single table
  52. proteome=all.surfaces[[1]]
  53. for(i in 2:length(all.surfaces)){
  54. surface<-all.surfaces[[i]]
  55. proteome <-mergeOnFirst(proteome, surface)
  56. }
  57. print("Total number of proteins")
  58. print(dim(proteome))
  59. ## drop the column / make it rownames
  60. rownames = proteome[,1];
  61. rownames(proteome)=make.unique(as.character(rownames),sep="---")
  62. todrop = grep("---", rownames(proteome));
  63. if(droprepeats && length(todrop)>0) {
  64. print(paste("dropping repeates",paste(rownames(proteome)[todrop], collapse=",")));
  65. proteome = proteome[-todrop,];
  66. }
  67. proteome_intensity = proteome[,c(1,seq(2,dim(proteome)[2]-1,2))]
  68. proteome_mz = proteome[,c(seq(3,dim(proteome)[2],2))]
  69. dimnames(proteome_mz)[[2]] = dimnames(proteome_intensity)[[2]][-1]
  70. mean_mz = apply(proteome_mz, 2,mean, na.rm=T)
  71. sd_mz = apply(proteome_mz, 2,sd, na.rm=T)
  72. mz = cbind(mean_mz, sd_mz)
  73. # neworder = c(dim(proteome_intensity)[2]-1,dim(proteome_intensity)[2], 2:(dim(proteome_intensity)[2]-2))
  74. # proteome_intensity = proteome_intensity[,neworder]
  75. print("finished read surfaces");
  76. attr(proteome_intensity, "mz") = mz
  77. print(dim(mz))
  78. proteome_intensity
  79. } ## end of read.all.surfaces
  80. list.dirs <- function(path) {
  81. #setwd(path)
  82. all <- paste(path,list.files(path=path), sep="/")
  83. all[file.info(all)$isdir]
  84. }
  85. ######## end of function ######################
  86. ## Input: Reads the .csv into a single table
  87. ## Output: Table of intensities, rows=samples, columns=intensities of each peak
  88. read.surface<-function(fname,col.name, filter.tophalf=FALSE){
  89. data= read.csv(fname,header=TRUE, sep=",")
  90. max.peak<-max(data$Peak..)
  91. ## Table of intensities, rows=samples, columns=intensities of each peak
  92. surface=subset(data,Peak..==1,select=c(Sample.name,Intensity, MZ))
  93. names(surface)=c("Sample.name",paste(col.name,"1",sep="-"), paste(col.name,"1",sep="_"))
  94. for(i in 2:max.peak){
  95. ## get peak intensities
  96. peak.data=subset(data,Peak..==i,select=c(Sample.name,Intensity, MZ))
  97. colnme1 = paste(col.name,i,sep="-");
  98. ## merge with the table of intensities of that surface
  99. surface = mergeOnFirst(surface, peak.data)
  100. names(surface)[c(dim(surface)[2]-1, dim(surface)[2])] = c(colnme1, colnme1)
  101. }
  102. ## Filter proteins based on p-value
  103. if(filter.tophalf){
  104. fsingle<-gsub("range/","range/res-",fname)
  105. fsingle<-gsub("csv","txt",fsingle)
  106. fsingle.res<-read.table(fsingle,sep="\t",header=T)
  107. keep.index<-fsingle.res$Index[1:ceiling(length(fsingle.res$Index)/2)]
  108. ## Be careful! surface has 1 more column because the first is Sample.Name
  109. ## so the keep-index needs to be incremented by 1
  110. keep.index<-keep.index+1
  111. keep.index<-c(1,keep.index) ## add the Sample.name column
  112. surface<-surface[,keep.index]
  113. }
  114. return(surface)
  115. }
  116. ######## end of function ######################
  117. merget <- function(t1, t2){
  118. colnames1 = dimnames(t1)[[2]]
  119. colnames2 = dimnames(t2)[[2]]
  120. colnames = c(colnames1, colnames2[which(!(colnames2 %in% colnames1))])
  121. rownames1 = dimnames(t1)[[1]]
  122. proti1 = array(dimnames = list(rownames1, colnames), dim = c(length(rownames1), length(colnames)))
  123. proti1[,match(colnames1, colnames)] = t1
  124. t11 = t1[,match(colnames, colnames1)]
  125. t21 = t2[,match(colnames, colnames2)]
  126. }
  127. disease.label<-function(fname){
  128. data= read.csv(fname,header=TRUE, sep=",")
  129. surface=subset(data,Peak..==1,select=c(Sample.name,Sample.group))
  130. }
  131. setToNA<-function(status1, torem = "TB.diagnosis", toremvalue = "HP", setToNA = TRUE){
  132. TBind = which(dimnames(status1)[[2]] == torem)
  133. #this sets HP to NA
  134. if(length(TBind)>0){
  135. if(setToNA) status1[which(status1[,TBind]==toremvalue),2] = NA
  136. status1 = status1[,-TBind]
  137. }
  138. }
  139. #finds repeated indices
  140. findRepeats<-function(sts){
  141. sts1 = cbind(as.character(sts),1:length(sts), rep(0, length(sts)))
  142. ord = order(sts1[,1])
  143. for(i in 2:length(ord)){
  144. if(sts1[ord[i],1] == sts1[ord[i-1],1]) sts1[ord[i],3] = 1;
  145. }
  146. which(sts1[,3]==1)
  147. }
  148. readAll<-function(data.path, site, comps = c("1 vs 3,6","2 vs 4,5"), droprepeats = TRUE){
  149. #comps<-c("1 vs 3","1 vs 6","2 vs 4","2 vs 5");
  150. proteome = list();
  151. #comps = c("1,2 vs 5,6");
  152. #need 1,2 vs 5,6
  153. #need to add 1,2 vs 5,6
  154. ## loop through different comparisons
  155. colnames = c();
  156. matrix = NULL;
  157. rownames = c();
  158. rowindex = c()
  159. for (i in 1:length(comps)){
  160. ## comparison
  161. fname=comps[i]
  162. ## read all surfaces into a list
  163. proti<-read.all.surfaces(data.path,fname, droprepeats = droprepeats)
  164. rownames1 = dimnames(proti)[[1]]
  165. rowindex1 = rep(i, length(rownames1))
  166. mz1 = attr(proti, "mz")
  167. proti = data.frame(proti)
  168. attr(proti,"mz") = mz1
  169. proteome[[i]] = proti
  170. if(i==1){
  171. matrix = proti
  172. mz = mz1
  173. rownames = rownames1
  174. rowindex = rowindex1;
  175. }else{
  176. matrix = merge(matrix,proti, all=T, sort = F)
  177. rownames = c(rownames, rownames1);
  178. rowindex = c(rowindex, rowindex1);
  179. }
  180. }
  181. #dimnames(matrix)[[1]] = rownames
  182. names(proteome) = comps;
  183. toreplace = paste(site,"_", sep="")
  184. print(toreplace)
  185. colnames = colnames(matrix)
  186. colnames(matrix) = gsub(toreplace, "",colnames)
  187. #print(colnames(matrix))
  188. list(matrix = matrix, proteome = proteome, rownames = rownames, rowindex = rowindex)
  189. }
  190. mergeSites<-function(allSites){
  191. proteome = list();
  192. colnames = c();
  193. matrix = NULL;
  194. rownames = c();
  195. rowindex = c();
  196. for(i in 1:length(allSites)){
  197. matrix1 = allSites[[i]]$matrix
  198. rownames1 = allSites[[i]]$rownames
  199. rowindex1 = allSites[[i]]$rowindex;
  200. dimnames(matrix1)[[1]] = rownames1;
  201. if(i==1){
  202. matrix = matrix1;
  203. rownames = rownames1;
  204. rowindex = rowindex1;
  205. }else{
  206. matrix = merge(data.frame(matrix),data.frame(matrix1), all=T, sort = F)
  207. rownames = c(rownames, rownames1);
  208. rowindex = c(rowindex, rowindex1);
  209. }
  210. }
  211. list(matrix = matrix, proteome = proteome, rownames = rownames, rowindex = rowindex)
  212. }
  213. ## this function gets the different disease groups and assigns them to 0 or 1
  214. define_pheno<-function(classes,y){
  215. ynew = rep(NA, length(y))
  216. for(i in 1:length(classes)) {
  217. classi = classes[i]
  218. classi1 = strsplit(classi,",")[[1]]
  219. ynew[which(y %in% classi1)] = i-1
  220. }
  221. ynew
  222. }
  223. markTraining<-function(y, frac = 0.8){
  224. training = rep(NA, length(y))
  225. lev = levels(as.factor(y))
  226. for(i in 1:length(lev)){
  227. inds = which(y ==lev[i])
  228. indst = sample(seq(1,length(inds)),length(inds)*frac)
  229. training[inds[indst]] <- 1;
  230. training[inds[-indst]] <- 0;
  231. }
  232. training
  233. }
  234. split80_20<-function(x,y,train.index){
  235. train<-x[train.index,]
  236. test<-x[-train.index,]
  237. ytr<-y[train.index]
  238. yts<-y[-train.index]
  239. list(train=train,test=test,ytr=ytr,yts=yts)
  240. }
  241. splitData<-function(proteome,fname, frac = 0.8, use.tags = TRUE, vars1 = NULL, logtransform = getOption("logtransform")){
  242. y<-proteome[,1]
  243. tags = proteome[,2]
  244. #quantiles = attr(proteome, "quantiles")
  245. data=proteome[,-(1:2)]
  246. if(!is.null(vars1)){
  247. vars = vars1;
  248. if(length(which(is.na(vars)))>0) vars = vars1[-which(is.na(vars1))]
  249. data = orthogAll(data,vars)
  250. }
  251. classes = strsplit(as.character(fname), "vs")[[1]]
  252. for(i in 1:length(classes)) {
  253. classes[i] = sub(" ","", classes[i])
  254. }
  255. y<-define_pheno(classes,y)
  256. yinclude = !is.na(y);
  257. data = data[yinclude,]
  258. y = y[yinclude]
  259. tags = tags[yinclude]
  260. if(use.tags){
  261. train.index = which(tags==1)
  262. }else{
  263. train.index<-sample(seq(1,length(y)),length(y)*frac)
  264. }
  265. attr(train.index, "classes") <- classes;
  266. dsplit=split80_20(data,y, train.index)
  267. train=list(data=dsplit$train,y=dsplit$ytr)
  268. test=list(data=dsplit$test,y=dsplit$yts)
  269. combined = list(data = data, y = y)
  270. ratios = data.frame(t(apply(train$data,2,ratio, train$y, logtransform)))
  271. names(ratios) = c(paste("mean",gsub(",", "",classes)), "diff")
  272. list(test=test,train=train, combined = combined, classes = classes, train.index =train.index, ratios = ratios)
  273. }
  274. removeMissing<-function(proteome, missThresh = NULL, firstcols = c(1,2)){
  275. cols = 1:dim(proteome)[2]
  276. rows = 1:dim(proteome)[1]
  277. if(!is.null(missThresh)){
  278. missingBySample = apply(apply(proteome,c(1,2), is.na),1,sum)/dim(proteome)[2]
  279. missSample = which(missingBySample>missThresh | is.na(proteome[,1]))
  280. #print(length(missSample))
  281. print(paste("dropping", rownames(proteome)[missSample], collapse=","));
  282. if(length(missSample)>0)rows = rows[-missSample]
  283. ##careful could remove the phenotype column
  284. missingByCol = apply(apply(proteome[rows,],c(1,2), is.na),2,sum)/dim(proteome)[1]
  285. missingByCol[firstcols] = 1.0
  286. missCol = which(missingByCol>missThresh)
  287. print(length(missCol))
  288. print(paste("dropping", colnames(proteome)[missCol], collapse=","));
  289. if(length(missCol)>0) cols = cols[-missCol]
  290. }
  291. toincl = list(rows = rows, cols = cols)
  292. torem = list(rows = missSample, cols = missCol)
  293. list(toincl = toincl, torem = torem)
  294. }
  295. revstr <-function(vec){
  296. for(i in 1:length(vec)){
  297. a = vec[i]
  298. vec[i] = paste(rev(substring(a,1:nchar(a),1:nchar(a))),collapse="")
  299. }
  300. vec
  301. }
  302. getMarkers<-function(markerstr, colnames){
  303. markers = strsplit(markerstr,":")[[1]]
  304. markers = revstr(sub("_","-", revstr(markers)))
  305. vars1 = match(markers, colnames)
  306. list(vars = vars1, markers = markers)
  307. }
  308. writeOldMarkers<-function(fout=paste("biomarkers","txt", sep=".")){
  309. markers = matrix(nrow = 3, ncol = 1)
  310. markers[1,1] = "Q10_pH_7.5_L_66:Q10_pH_7.5_L_31:CM10_pH_4.0_L_68:Q10_pH_9.5_L_56:CM10_pH_4.0_L_4"; #final TB vs LTBI, HIV-/+
  311. markers[2,1] = "Q10_pH_9.5_L_28:CM10_pH_6.0_L_42:CM10_pH_4.0_L_36:IMAC_L_36:Q10_pH_9.5_L_47:CM10_pH_6.0_L_14:Q10_pH_7.5_L_21:IMAC_L_74"; # final TB vs OI, HIV-/+
  312. markers[3,1] = "Q10_pH_7.5_L_28:Q10_pH_7.5_L_67:IMAC_L_38:CM10_pH_4.0_L_62:IMAC_L_36"; #final TB vs OI+LTBI HIV-/+
  313. dimnames(markers) = list(c("tb vs ltbi", "tb vs oi", "tb vs oi+ltbi"), "markers");
  314. write.table(markers, file=fout, sep=",", quote=F, row.names = T)
  315. markers
  316. }
  317. matchvars<-function(selrow, mz1, thresh =0.05){
  318. matchplate = as.character(mz1[,1]) ==as.character(selrow[[1]])
  319. if(length(which(matchplate))==0) print(paste("problem, no matches to plate !!! ", selrow[[1]]));
  320. diff = abs(mz1[,2]-as.numeric(selrow[[2]]))/as.numeric(selrow[[2]])
  321. diff[diff>thresh] = NA
  322. diff[!matchplate] = NA
  323. oa = order(diff)
  324. len = length(which(!is.na(diff)))
  325. if(len ==0) res = NA
  326. else res= oa[1]
  327. res
  328. }
  329. ratio<-function(vec,y, logt=F){
  330. m1 = mean(vec[y==1], na.rm=T)
  331. m2 = mean(vec[y==0], na.rm=T)
  332. diff =abs(m1-m2)
  333. # if(logt) max( m1/m2, m2/m1) else abs(m1 - m2)
  334. v = c(m1,m2,diff)
  335. v
  336. #print(paste(m1,m2, abs(log(m1/m2))))
  337. #abs(log2(m1/m2))
  338. }
  339. getMarkerfiles<-function(biohome, title, suffix1, suffix, logtransform= FALSE, max = 3){
  340. fout = list();
  341. logt = "";
  342. if(logtransform) logt="log";
  343. for(i in 1:max){
  344. fout[[i]] = paste(biohome, paste(title,i,logt, suffix1,suffix,sep="."), sep="/");
  345. }
  346. fout
  347. }
  348. makeSelected<-function(mz1, model, fileout, ratios, eval, append = F, text = NULL){
  349. beta = model$beta
  350. variables = model$variables
  351. betaGlobal = t(data.frame(lapply(model$betaGlobal, paste, collapse=":")))
  352. coeffOrth = t(data.frame(lapply(model$coeffOrth, paste, collapse=":")))
  353. selected = cbind( beta, variables,eval, betaGlobal, coeffOrth);
  354. if(!is.null(ratios)) selected = cbind(selected, ratios[model$variables,])
  355. if(!is.null(mz1)){
  356. mz11 = mz1[model$variables,, drop=F];
  357. selected = cbind(mz11,selected);
  358. }
  359. if(!is.null(text)){
  360. write(text, fileout,sep=",", append=append)
  361. append=T;
  362. }
  363. write.table(selected,fileout, sep=",", quote=F, row.names = F, append=append)
  364. selected
  365. }
  366. quantileM<-function(vec, prob= 0.75){
  367. q = quantile(vec, probs = prob, na.rm=T)
  368. vec/q
  369. }
  370. quantileNorm<-function(proteome, prob = 0.9){
  371. protN = apply(proteome,2,quantileM, prob)
  372. qv = apply(proteome,2,quantile, probs=prob, na.rm=T)
  373. attr(protN, "quantiles") = list(qv=qv, prob = prob)
  374. protN
  375. }
  376. readSelected<-function(mz1, filein, thresh = 0.1){
  377. selected1 <-read.table(filein,header=T, sep=",", strip.white=T)
  378. betaGIndex = which(dimnames(selected1)[[2]]=="betaGlobal");
  379. coeffOrthIndex = which(dimnames(selected1)[[2]]=="coeffOrt");
  380. vars1 = apply(selected1,1,matchvars,mz1, thresh = thresh)
  381. nav = which(is.na(vars1));
  382. max = length(vars1);
  383. if(length(nav)>0) max = nav[1]-1
  384. betaG = selected1$betaG[1:max]
  385. beta = selected1$beta[1:max]
  386. coeffO = selected1$coeffOrth[1:max]
  387. dimnames(selected1)[[2]] = paste(dimnames(selected1)[[2]],"import", sep=".")
  388. #print(dimnames(selected1)[[2]])
  389. sel2 = cbind(selected1[,1:3],mz1[vars1,])
  390. betaGlobal = list();
  391. coeffOrth = list()
  392. for(i in 1:length(betaG)){
  393. betaGlobal[[i]] =as.numeric(strsplit(as.character(betaG[i]),':') [[1]])
  394. coeffOrth[[i]] =as.numeric(strsplit(as.character(coeffO[i]),':') [[1]])
  395. }
  396. model1 = list(variables = vars1[1:max], beta = beta, betaGlobal = betaGlobal[1:max], coeffOrth = coeffOrth[1:max], selected=sel2)
  397. model1
  398. }
  399. readStatus<-function(labelfile, rownames = NULL, levs = c("N","HP", "CC")){
  400. print("read status");
  401. status <- read.table(labelfile, head=T, sep=",")
  402. rep_inds = findRepeats(status[,1])
  403. if(length(rep_inds)>0) status = status[-rep_inds,]
  404. training = markTraining(status[,dim(status)[[2]]])
  405. #if(!is.null(setHPAsNA)) status = setToNA(status, torem = "TB.diagnosis", toremvalue = "HP", setHPAsNA)
  406. status = cbind(status, training)
  407. dimnames(status)[[1]] = status[,1]
  408. status = status[,-1]
  409. status = status[match(rownames, dimnames(status)[[1]]),]
  410. phens = rep(NA, dim(status)[[1]])
  411. for(i in 1:length(levs)){
  412. phens[status[,2]==levs[i]] = i-1
  413. }
  414. cbind(status, phens)[,c(4,3)]
  415. }