/R/read_surfaces.R
R | 525 lines | 369 code | 110 blank | 46 comment | 42 complexity | b1990a67919d8a5215b150f378e7099f MD5 | raw file
- ####MERGING FUNCTION
- mergeOnFirst<-function(prot, surf, col = 1){
- inds = match(prot[,1], surf[,1])
- cbind(prot, surf[inds,-1, drop=F])
- }
- lastnme<-function(nme){
- rev(strsplit(nme, "/")[[1]])[1]
- }
- #############################################
- ## read.all.surfaces
- ## Loop through surfaces but get the same file
- ## the all.surfaces list should have entries as many surfaces are there *2 (high/low range)
- read.all.surfaces<-function(path,fname, droprepeats = TRUE){
- all.surfaces=list()
- if(!file.exists(path)) print(paste("path does not exist: ", path));
- surfaces<-list.dirs(path)
- print(surfaces)
- for(s in 1:length(surfaces)){
- surface=surfaces[s] #paste(path,surfaces[s],sep="/")
- #setwd(surface) ## ESSENTIAL TO READ SAME FILE IN DIFF. DIR
-
- ## High/Low range dirs
- ranges<-list.dirs(surface)##
- #print("ranges")
- #print(ranges);
- for(r in 1:length(ranges)){
- pathnme = ranges[r];
- files = dir(path=pathnme);
- if(length(files)>0){
- #grep(files, pathnme, v = TRUE);
- fname1 = grep("csv", grep(fname, files, v = TRUE), v=TRUE);
- file<-paste(ranges[r],fname1,sep="/")
- print(file)
-
- col.name<-paste(lastnme(surfaces[s]),lastnme(ranges[r]),sep="_")
- col.name<-gsub(" ", "_", col.name)
- col.name<-gsub("High", "H", col.name)
- col.name<-gsub("Low", "L", col.name)
- col.name<-gsub("_range", "", col.name)
- if(file.exists(file)){
- print(paste("reading ",file));
-
- surface.data<-read.surface(file,col.name)
- print("done reading ");
- all.surfaces[[length(all.surfaces)+1]]=surface.data
- }else{
- print(paste("no file:", file));
- } ## end if
- }
- } ## end ranges
- } ## end surfaces
- length(all.surfaces) ## should be #surfaces*2
- ##all.surfaces
- ## merge all data into a single table
- proteome=all.surfaces[[1]]
- for(i in 2:length(all.surfaces)){
- surface<-all.surfaces[[i]]
- proteome <-mergeOnFirst(proteome, surface)
- }
- print("Total number of proteins")
- print(dim(proteome))
-
- ## drop the column / make it rownames
- rownames = proteome[,1];
- rownames(proteome)=make.unique(as.character(rownames),sep="---")
- todrop = grep("---", rownames(proteome));
- if(droprepeats && length(todrop)>0) {
- print(paste("dropping repeates",paste(rownames(proteome)[todrop], collapse=",")));
- proteome = proteome[-todrop,];
- }
-
- proteome_intensity = proteome[,c(1,seq(2,dim(proteome)[2]-1,2))]
- proteome_mz = proteome[,c(seq(3,dim(proteome)[2],2))]
- dimnames(proteome_mz)[[2]] = dimnames(proteome_intensity)[[2]][-1]
- mean_mz = apply(proteome_mz, 2,mean, na.rm=T)
- sd_mz = apply(proteome_mz, 2,sd, na.rm=T)
- mz = cbind(mean_mz, sd_mz)
- # neworder = c(dim(proteome_intensity)[2]-1,dim(proteome_intensity)[2], 2:(dim(proteome_intensity)[2]-2))
- # proteome_intensity = proteome_intensity[,neworder]
- print("finished read surfaces");
- attr(proteome_intensity, "mz") = mz
- print(dim(mz))
- proteome_intensity
- } ## end of read.all.surfaces
- list.dirs <- function(path) {
- #setwd(path)
- all <- paste(path,list.files(path=path), sep="/")
- all[file.info(all)$isdir]
- }
- ######## end of function ######################
- ## Input: Reads the .csv into a single table
- ## Output: Table of intensities, rows=samples, columns=intensities of each peak
- read.surface<-function(fname,col.name, filter.tophalf=FALSE){
- data= read.csv(fname,header=TRUE, sep=",")
- max.peak<-max(data$Peak..)
- ## Table of intensities, rows=samples, columns=intensities of each peak
- surface=subset(data,Peak..==1,select=c(Sample.name,Intensity, MZ))
- names(surface)=c("Sample.name",paste(col.name,"1",sep="-"), paste(col.name,"1",sep="_"))
- for(i in 2:max.peak){
-
- ## get peak intensities
- peak.data=subset(data,Peak..==i,select=c(Sample.name,Intensity, MZ))
- colnme1 = paste(col.name,i,sep="-");
-
-
- ## merge with the table of intensities of that surface
-
- surface = mergeOnFirst(surface, peak.data)
- names(surface)[c(dim(surface)[2]-1, dim(surface)[2])] = c(colnme1, colnme1)
- }
-
- ## Filter proteins based on p-value
- if(filter.tophalf){
- fsingle<-gsub("range/","range/res-",fname)
- fsingle<-gsub("csv","txt",fsingle)
-
- fsingle.res<-read.table(fsingle,sep="\t",header=T)
- keep.index<-fsingle.res$Index[1:ceiling(length(fsingle.res$Index)/2)]
- ## Be careful! surface has 1 more column because the first is Sample.Name
- ## so the keep-index needs to be incremented by 1
- keep.index<-keep.index+1
- keep.index<-c(1,keep.index) ## add the Sample.name column
- surface<-surface[,keep.index]
- }
- return(surface)
- }
- ######## end of function ######################
- merget <- function(t1, t2){
- colnames1 = dimnames(t1)[[2]]
- colnames2 = dimnames(t2)[[2]]
- colnames = c(colnames1, colnames2[which(!(colnames2 %in% colnames1))])
- rownames1 = dimnames(t1)[[1]]
- proti1 = array(dimnames = list(rownames1, colnames), dim = c(length(rownames1), length(colnames)))
- proti1[,match(colnames1, colnames)] = t1
- t11 = t1[,match(colnames, colnames1)]
- t21 = t2[,match(colnames, colnames2)]
- }
- disease.label<-function(fname){
- data= read.csv(fname,header=TRUE, sep=",")
- surface=subset(data,Peak..==1,select=c(Sample.name,Sample.group))
-
- }
- setToNA<-function(status1, torem = "TB.diagnosis", toremvalue = "HP", setToNA = TRUE){
- TBind = which(dimnames(status1)[[2]] == torem)
- #this sets HP to NA
- if(length(TBind)>0){
- if(setToNA) status1[which(status1[,TBind]==toremvalue),2] = NA
- status1 = status1[,-TBind]
- }
- }
- #finds repeated indices
- findRepeats<-function(sts){
- sts1 = cbind(as.character(sts),1:length(sts), rep(0, length(sts)))
- ord = order(sts1[,1])
-
- for(i in 2:length(ord)){
- if(sts1[ord[i],1] == sts1[ord[i-1],1]) sts1[ord[i],3] = 1;
- }
- which(sts1[,3]==1)
- }
- readAll<-function(data.path, site, comps = c("1 vs 3,6","2 vs 4,5"), droprepeats = TRUE){
- #comps<-c("1 vs 3","1 vs 6","2 vs 4","2 vs 5");
- proteome = list();
- #comps = c("1,2 vs 5,6");
- #need 1,2 vs 5,6
- #need to add 1,2 vs 5,6
- ## loop through different comparisons
- colnames = c();
- matrix = NULL;
- rownames = c();
- rowindex = c()
- for (i in 1:length(comps)){
- ## comparison
- fname=comps[i]
- ## read all surfaces into a list
- proti<-read.all.surfaces(data.path,fname, droprepeats = droprepeats)
- rownames1 = dimnames(proti)[[1]]
- rowindex1 = rep(i, length(rownames1))
- mz1 = attr(proti, "mz")
- proti = data.frame(proti)
- attr(proti,"mz") = mz1
- proteome[[i]] = proti
- if(i==1){
- matrix = proti
- mz = mz1
- rownames = rownames1
- rowindex = rowindex1;
- }else{
- matrix = merge(matrix,proti, all=T, sort = F)
- rownames = c(rownames, rownames1);
- rowindex = c(rowindex, rowindex1);
- }
- }
- #dimnames(matrix)[[1]] = rownames
- names(proteome) = comps;
- toreplace = paste(site,"_", sep="")
- print(toreplace)
- colnames = colnames(matrix)
- colnames(matrix) = gsub(toreplace, "",colnames)
- #print(colnames(matrix))
- list(matrix = matrix, proteome = proteome, rownames = rownames, rowindex = rowindex)
- }
- mergeSites<-function(allSites){
- proteome = list();
- colnames = c();
- matrix = NULL;
- rownames = c();
- rowindex = c();
-
-
- for(i in 1:length(allSites)){
- matrix1 = allSites[[i]]$matrix
- rownames1 = allSites[[i]]$rownames
- rowindex1 = allSites[[i]]$rowindex;
- dimnames(matrix1)[[1]] = rownames1;
- if(i==1){
- matrix = matrix1;
- rownames = rownames1;
- rowindex = rowindex1;
- }else{
-
- matrix = merge(data.frame(matrix),data.frame(matrix1), all=T, sort = F)
- rownames = c(rownames, rownames1);
- rowindex = c(rowindex, rowindex1);
- }
- }
-
- list(matrix = matrix, proteome = proteome, rownames = rownames, rowindex = rowindex)
- }
- ## this function gets the different disease groups and assigns them to 0 or 1
- define_pheno<-function(classes,y){
- ynew = rep(NA, length(y))
- for(i in 1:length(classes)) {
- classi = classes[i]
- classi1 = strsplit(classi,",")[[1]]
- ynew[which(y %in% classi1)] = i-1
- }
- ynew
- }
- markTraining<-function(y, frac = 0.8){
- training = rep(NA, length(y))
- lev = levels(as.factor(y))
- for(i in 1:length(lev)){
- inds = which(y ==lev[i])
- indst = sample(seq(1,length(inds)),length(inds)*frac)
- training[inds[indst]] <- 1;
- training[inds[-indst]] <- 0;
- }
- training
- }
- split80_20<-function(x,y,train.index){
- train<-x[train.index,]
- test<-x[-train.index,]
- ytr<-y[train.index]
- yts<-y[-train.index]
- list(train=train,test=test,ytr=ytr,yts=yts)
- }
- splitData<-function(proteome,fname, frac = 0.8, use.tags = TRUE, vars1 = NULL, logtransform = getOption("logtransform")){
- y<-proteome[,1]
- tags = proteome[,2]
-
- #quantiles = attr(proteome, "quantiles")
- data=proteome[,-(1:2)]
- if(!is.null(vars1)){
- vars = vars1;
- if(length(which(is.na(vars)))>0) vars = vars1[-which(is.na(vars1))]
- data = orthogAll(data,vars)
- }
- classes = strsplit(as.character(fname), "vs")[[1]]
- for(i in 1:length(classes)) {
- classes[i] = sub(" ","", classes[i])
- }
- y<-define_pheno(classes,y)
- yinclude = !is.na(y);
- data = data[yinclude,]
- y = y[yinclude]
- tags = tags[yinclude]
- if(use.tags){
- train.index = which(tags==1)
-
- }else{
- train.index<-sample(seq(1,length(y)),length(y)*frac)
-
- }
- attr(train.index, "classes") <- classes;
- dsplit=split80_20(data,y, train.index)
- train=list(data=dsplit$train,y=dsplit$ytr)
- test=list(data=dsplit$test,y=dsplit$yts)
- combined = list(data = data, y = y)
- ratios = data.frame(t(apply(train$data,2,ratio, train$y, logtransform)))
- names(ratios) = c(paste("mean",gsub(",", "",classes)), "diff")
- list(test=test,train=train, combined = combined, classes = classes, train.index =train.index, ratios = ratios)
- }
- removeMissing<-function(proteome, missThresh = NULL, firstcols = c(1,2)){
- cols = 1:dim(proteome)[2]
- rows = 1:dim(proteome)[1]
- if(!is.null(missThresh)){
- missingBySample = apply(apply(proteome,c(1,2), is.na),1,sum)/dim(proteome)[2]
- missSample = which(missingBySample>missThresh | is.na(proteome[,1]))
- #print(length(missSample))
- print(paste("dropping", rownames(proteome)[missSample], collapse=","));
- if(length(missSample)>0)rows = rows[-missSample]
- ##careful could remove the phenotype column
- missingByCol = apply(apply(proteome[rows,],c(1,2), is.na),2,sum)/dim(proteome)[1]
- missingByCol[firstcols] = 1.0
- missCol = which(missingByCol>missThresh)
-
- print(length(missCol))
- print(paste("dropping", colnames(proteome)[missCol], collapse=","));
- if(length(missCol)>0) cols = cols[-missCol]
- }
- toincl = list(rows = rows, cols = cols)
- torem = list(rows = missSample, cols = missCol)
- list(toincl = toincl, torem = torem)
- }
- revstr <-function(vec){
- for(i in 1:length(vec)){
- a = vec[i]
- vec[i] = paste(rev(substring(a,1:nchar(a),1:nchar(a))),collapse="")
- }
- vec
- }
- getMarkers<-function(markerstr, colnames){
- markers = strsplit(markerstr,":")[[1]]
- markers = revstr(sub("_","-", revstr(markers)))
- vars1 = match(markers, colnames)
- list(vars = vars1, markers = markers)
- }
- writeOldMarkers<-function(fout=paste("biomarkers","txt", sep=".")){
- markers = matrix(nrow = 3, ncol = 1)
- markers[1,1] = "Q10_pH_7.5_L_66:Q10_pH_7.5_L_31:CM10_pH_4.0_L_68:Q10_pH_9.5_L_56:CM10_pH_4.0_L_4"; #final TB vs LTBI, HIV-/+
- markers[2,1] = "Q10_pH_9.5_L_28:CM10_pH_6.0_L_42:CM10_pH_4.0_L_36:IMAC_L_36:Q10_pH_9.5_L_47:CM10_pH_6.0_L_14:Q10_pH_7.5_L_21:IMAC_L_74"; # final TB vs OI, HIV-/+
- markers[3,1] = "Q10_pH_7.5_L_28:Q10_pH_7.5_L_67:IMAC_L_38:CM10_pH_4.0_L_62:IMAC_L_36"; #final TB vs OI+LTBI HIV-/+
- dimnames(markers) = list(c("tb vs ltbi", "tb vs oi", "tb vs oi+ltbi"), "markers");
- write.table(markers, file=fout, sep=",", quote=F, row.names = T)
- markers
- }
- matchvars<-function(selrow, mz1, thresh =0.05){
- matchplate = as.character(mz1[,1]) ==as.character(selrow[[1]])
- if(length(which(matchplate))==0) print(paste("problem, no matches to plate !!! ", selrow[[1]]));
- diff = abs(mz1[,2]-as.numeric(selrow[[2]]))/as.numeric(selrow[[2]])
- diff[diff>thresh] = NA
- diff[!matchplate] = NA
- oa = order(diff)
- len = length(which(!is.na(diff)))
- if(len ==0) res = NA
- else res= oa[1]
- res
- }
- ratio<-function(vec,y, logt=F){
- m1 = mean(vec[y==1], na.rm=T)
- m2 = mean(vec[y==0], na.rm=T)
- diff =abs(m1-m2)
- # if(logt) max( m1/m2, m2/m1) else abs(m1 - m2)
- v = c(m1,m2,diff)
- v
- #print(paste(m1,m2, abs(log(m1/m2))))
- #abs(log2(m1/m2))
- }
- getMarkerfiles<-function(biohome, title, suffix1, suffix, logtransform= FALSE, max = 3){
- fout = list();
- logt = "";
- if(logtransform) logt="log";
- for(i in 1:max){
- fout[[i]] = paste(biohome, paste(title,i,logt, suffix1,suffix,sep="."), sep="/");
- }
- fout
- }
- makeSelected<-function(mz1, model, fileout, ratios, eval, append = F, text = NULL){
- beta = model$beta
- variables = model$variables
- betaGlobal = t(data.frame(lapply(model$betaGlobal, paste, collapse=":")))
- coeffOrth = t(data.frame(lapply(model$coeffOrth, paste, collapse=":")))
- selected = cbind( beta, variables,eval, betaGlobal, coeffOrth);
- if(!is.null(ratios)) selected = cbind(selected, ratios[model$variables,])
- if(!is.null(mz1)){
- mz11 = mz1[model$variables,, drop=F];
- selected = cbind(mz11,selected);
- }
- if(!is.null(text)){
- write(text, fileout,sep=",", append=append)
- append=T;
- }
- write.table(selected,fileout, sep=",", quote=F, row.names = F, append=append)
- selected
- }
- quantileM<-function(vec, prob= 0.75){
- q = quantile(vec, probs = prob, na.rm=T)
- vec/q
- }
- quantileNorm<-function(proteome, prob = 0.9){
- protN = apply(proteome,2,quantileM, prob)
- qv = apply(proteome,2,quantile, probs=prob, na.rm=T)
- attr(protN, "quantiles") = list(qv=qv, prob = prob)
- protN
- }
- readSelected<-function(mz1, filein, thresh = 0.1){
- selected1 <-read.table(filein,header=T, sep=",", strip.white=T)
- betaGIndex = which(dimnames(selected1)[[2]]=="betaGlobal");
- coeffOrthIndex = which(dimnames(selected1)[[2]]=="coeffOrt");
- vars1 = apply(selected1,1,matchvars,mz1, thresh = thresh)
- nav = which(is.na(vars1));
- max = length(vars1);
- if(length(nav)>0) max = nav[1]-1
- betaG = selected1$betaG[1:max]
- beta = selected1$beta[1:max]
- coeffO = selected1$coeffOrth[1:max]
- dimnames(selected1)[[2]] = paste(dimnames(selected1)[[2]],"import", sep=".")
- #print(dimnames(selected1)[[2]])
- sel2 = cbind(selected1[,1:3],mz1[vars1,])
- betaGlobal = list();
- coeffOrth = list()
- for(i in 1:length(betaG)){
- betaGlobal[[i]] =as.numeric(strsplit(as.character(betaG[i]),':') [[1]])
- coeffOrth[[i]] =as.numeric(strsplit(as.character(coeffO[i]),':') [[1]])
- }
- model1 = list(variables = vars1[1:max], beta = beta, betaGlobal = betaGlobal[1:max], coeffOrth = coeffOrth[1:max], selected=sel2)
- model1
- }
- readStatus<-function(labelfile, rownames = NULL, levs = c("N","HP", "CC")){
- print("read status");
- status <- read.table(labelfile, head=T, sep=",")
- rep_inds = findRepeats(status[,1])
- if(length(rep_inds)>0) status = status[-rep_inds,]
- training = markTraining(status[,dim(status)[[2]]])
- #if(!is.null(setHPAsNA)) status = setToNA(status, torem = "TB.diagnosis", toremvalue = "HP", setHPAsNA)
-
- status = cbind(status, training)
- dimnames(status)[[1]] = status[,1]
- status = status[,-1]
- status = status[match(rownames, dimnames(status)[[1]]),]
- phens = rep(NA, dim(status)[[1]])
- for(i in 1:length(levs)){
- phens[status[,2]==levs[i]] = i-1
- }
- cbind(status, phens)[,c(4,3)]
- }