read_surfaces.R | searchcode

/R/read_surfaces.R

https://bitbucket.org/lachlancoin/proteomics_variable_selection · R · 525 lines · 369 code · 110 blank · 46 comment · 42 complexity · b1990a67919d8a5215b150f378e7099f MD5 · raw file

####MERGING FUNCTION

mergeOnFirst<-function(prot, surf, col = 1){
	inds = match(prot[,1], surf[,1])
	cbind(prot, surf[inds,-1, drop=F])  
}

lastnme<-function(nme){
rev(strsplit(nme, "/")[[1]])[1]
}

#############################################
##  read.all.surfaces


## Loop through surfaces but get the same file
## the all.surfaces list should have entries as many surfaces are there *2 (high/low range)

read.all.surfaces<-function(path,fname,  droprepeats = TRUE){

all.surfaces=list()
if(!file.exists(path)) print(paste("path does not exist: ", path));
surfaces<-list.dirs(path)
print(surfaces)

for(s in 1:length(surfaces)){

	surface=surfaces[s]  #paste(path,surfaces[s],sep="/")
	#setwd(surface) ## ESSENTIAL TO READ SAME FILE IN DIFF. DIR
	
	## High/Low range dirs
	ranges<-list.dirs(surface)##
	#print("ranges")
	#print(ranges);
	for(r in 1:length(ranges)){
		pathnme = ranges[r];
		files =  dir(path=pathnme);
		if(length(files)>0){
			#grep(files, pathnme, v = TRUE);
			fname1 = grep("csv", grep(fname, files,  v = TRUE), v=TRUE);
			file<-paste(ranges[r],fname1,sep="/")
			print(file)
		
			col.name<-paste(lastnme(surfaces[s]),lastnme(ranges[r]),sep="_")
			col.name<-gsub(" ", "_", col.name)
			col.name<-gsub("High", "H", col.name)
			col.name<-gsub("Low", "L", col.name)
			col.name<-gsub("_range", "", col.name)

			if(file.exists(file)){
				print(paste("reading ",file));
			
				surface.data<-read.surface(file,col.name)
				print("done reading ");
				all.surfaces[[length(all.surfaces)+1]]=surface.data
			}else{
				print(paste("no file:", file));
			} ## end if
		}
	} ## end ranges

} ## end surfaces

length(all.surfaces) ## should be #surfaces*2

##all.surfaces


## merge all data into a single table

proteome=all.surfaces[[1]]
for(i in 2:length(all.surfaces)){
	surface<-all.surfaces[[i]]
	proteome <-mergeOnFirst(proteome, surface)
}

print("Total number of proteins")
print(dim(proteome))


		
	## drop the column / make it rownames
rownames = proteome[,1];
	rownames(proteome)=make.unique(as.character(rownames),sep="---")
	todrop = grep("---", rownames(proteome));
	if(droprepeats && length(todrop)>0) {
		print(paste("dropping repeates",paste(rownames(proteome)[todrop], collapse=",")));
		proteome = proteome[-todrop,];
	}

	
  proteome_intensity = proteome[,c(1,seq(2,dim(proteome)[2]-1,2))]
  proteome_mz = proteome[,c(seq(3,dim(proteome)[2],2))]
  dimnames(proteome_mz)[[2]] = dimnames(proteome_intensity)[[2]][-1]
 mean_mz = apply(proteome_mz, 2,mean, na.rm=T)
 sd_mz = apply(proteome_mz, 2,sd, na.rm=T)
mz = cbind(mean_mz, sd_mz)
#	neworder = c(dim(proteome_intensity)[2]-1,dim(proteome_intensity)[2], 2:(dim(proteome_intensity)[2]-2))

#	proteome_intensity = proteome_intensity[,neworder]


print("finished read surfaces");
attr(proteome_intensity, "mz") = mz
print(dim(mz))
  proteome_intensity
} ## end of read.all.surfaces




list.dirs <- function(path) {
	#setwd(path)
  all <- paste(path,list.files(path=path), sep="/")
  all[file.info(all)$isdir]
}

######## end of function ######################

## Input: Reads the .csv into a single table 
## Output: Table of intensities, rows=samples, columns=intensities of each peak


read.surface<-function(fname,col.name, filter.tophalf=FALSE){
	data= read.csv(fname,header=TRUE, sep=",")
	max.peak<-max(data$Peak..)

	## Table of intensities, rows=samples, columns=intensities of each peak
	surface=subset(data,Peak..==1,select=c(Sample.name,Intensity, MZ))
	names(surface)=c("Sample.name",paste(col.name,"1",sep="-"), paste(col.name,"1",sep="_"))

	for(i in 2:max.peak){
		
		## get peak intensities
		peak.data=subset(data,Peak..==i,select=c(Sample.name,Intensity, MZ))
		colnme1 = paste(col.name,i,sep="-");
		
	
		## merge with the table of intensities of that surface
		
		surface = mergeOnFirst(surface, peak.data)
		names(surface)[c(dim(surface)[2]-1, dim(surface)[2])] = c(colnme1, colnme1)
	}
	
	## Filter proteins based on p-value
	if(filter.tophalf){
		fsingle<-gsub("range/","range/res-",fname)
		fsingle<-gsub("csv","txt",fsingle)
	
		fsingle.res<-read.table(fsingle,sep="\t",header=T)
		keep.index<-fsingle.res$Index[1:ceiling(length(fsingle.res$Index)/2)]

	## Be careful! surface has 1 more column because the first is Sample.Name
	## so the keep-index needs to be incremented by 1
		keep.index<-keep.index+1
		keep.index<-c(1,keep.index) ## add the Sample.name column

		surface<-surface[,keep.index]
	}

	return(surface)
}

######## end of function ######################

merget <- function(t1, t2){
colnames1 = dimnames(t1)[[2]]
colnames2 = dimnames(t2)[[2]]
colnames = c(colnames1,  colnames2[which(!(colnames2 %in% colnames1))])

rownames1 = dimnames(t1)[[1]]
proti1 = array(dimnames = list(rownames1, colnames), dim = c(length(rownames1), length(colnames)))
proti1[,match(colnames1, colnames)] = t1


t11 = t1[,match(colnames, colnames1)]
t21 = t2[,match(colnames, colnames2)]
}

disease.label<-function(fname){

	data= read.csv(fname,header=TRUE, sep=",")

	surface=subset(data,Peak..==1,select=c(Sample.name,Sample.group))
	
}

setToNA<-function(status1, torem = "TB.diagnosis", toremvalue = "HP", setToNA = TRUE){
	TBind = which(dimnames(status1)[[2]] == torem)
	#this sets HP to NA
	if(length(TBind)>0){
			if(setToNA) status1[which(status1[,TBind]==toremvalue),2] = NA
			status1 = status1[,-TBind]
	}
}

#finds repeated indices
findRepeats<-function(sts){
	sts1 = cbind(as.character(sts),1:length(sts), rep(0, length(sts)))
	ord = order(sts1[,1])
	
	for(i in 2:length(ord)){
		if(sts1[ord[i],1] == sts1[ord[i-1],1]) sts1[ord[i],3] = 1;
	}
	which(sts1[,3]==1)
}

readAll<-function(data.path, site, comps = c("1 vs 3,6","2 vs 4,5"), droprepeats = TRUE){
#comps<-c("1 vs 3","1 vs 6","2 vs 4","2 vs 5");

proteome = list();
#comps = c("1,2 vs 5,6");
#need 1,2 vs 5,6
#need to add 1,2 vs 5,6
## loop through different comparisons
colnames = c();
matrix = NULL;
rownames = c();
rowindex = c()

for (i in 1:length(comps)){
	## comparison
	fname=comps[i]
	## read all surfaces into a list
	proti<-read.all.surfaces(data.path,fname,  droprepeats = droprepeats)
		rownames1 = dimnames(proti)[[1]]
	rowindex1 = rep(i, length(rownames1))
	mz1 = attr(proti, "mz")
	proti = data.frame(proti)
	attr(proti,"mz") = mz1
	proteome[[i]] = proti
	if(i==1){
		 matrix = proti
		mz = mz1
		rownames = rownames1
		rowindex = rowindex1;
	}else{
		matrix = merge(matrix,proti, all=T, sort = F)
		rownames = c(rownames, rownames1);
		rowindex = c(rowindex, rowindex1);
	}
}
#dimnames(matrix)[[1]] = rownames
names(proteome) = comps;
toreplace = paste(site,"_", sep="")
print(toreplace)
colnames = colnames(matrix)
colnames(matrix) = gsub(toreplace, "",colnames)
#print(colnames(matrix))
list(matrix = matrix, proteome = proteome, rownames = rownames, rowindex = rowindex)
}

mergeSites<-function(allSites){
	proteome = list();
	colnames = c();
	matrix = NULL;
	rownames = c();
	rowindex = c();
	
	
for(i in 1:length(allSites)){
	matrix1 = allSites[[i]]$matrix
	rownames1 = allSites[[i]]$rownames
	rowindex1 = allSites[[i]]$rowindex;
	dimnames(matrix1)[[1]] = rownames1;

	if(i==1){
		matrix = matrix1;
		rownames = rownames1;
		rowindex = rowindex1;
	}else{
		
		matrix = merge(data.frame(matrix),data.frame(matrix1), all=T, sort = F)
		rownames = c(rownames, rownames1);
		rowindex = c(rowindex, rowindex1);
	}
}
	
	list(matrix = matrix, proteome = proteome, rownames = rownames, rowindex = rowindex)
}



## this function gets the different disease groups and assigns them to 0 or 1
define_pheno<-function(classes,y){

ynew = rep(NA, length(y))
for(i in 1:length(classes)) {
classi = classes[i]
classi1 = strsplit(classi,",")[[1]]
ynew[which(y %in% classi1)] = i-1
}

ynew
}

markTraining<-function(y, frac = 0.8){
   training = rep(NA, length(y))
   lev = levels(as.factor(y))
   for(i in 1:length(lev)){
	inds = which(y ==lev[i])
	indst = sample(seq(1,length(inds)),length(inds)*frac)
	training[inds[indst]] <- 1;
	training[inds[-indst]] <- 0;
   }
	training
}

split80_20<-function(x,y,train.index){
 train<-x[train.index,]
 test<-x[-train.index,]
 ytr<-y[train.index]
 yts<-y[-train.index]
 list(train=train,test=test,ytr=ytr,yts=yts)
}


splitData<-function(proteome,fname,  frac = 0.8, use.tags = TRUE, vars1 = NULL, logtransform = getOption("logtransform")){
	y<-proteome[,1]
	tags = proteome[,2]
	
	#quantiles = attr(proteome, "quantiles") 
	data=proteome[,-(1:2)]
        if(!is.null(vars1)){
		vars = vars1;
		if(length(which(is.na(vars)))>0) vars = vars1[-which(is.na(vars1))]
		data = orthogAll(data,vars)
	}	
	classes = strsplit(as.character(fname), "vs")[[1]]
	for(i in 1:length(classes)) {
		classes[i] = sub(" ","", classes[i])
	}
	y<-define_pheno(classes,y)
	yinclude = !is.na(y);


	data = data[yinclude,]
	y = y[yinclude]
	tags = tags[yinclude]
	if(use.tags){
		train.index = which(tags==1)
		
	}else{
		 train.index<-sample(seq(1,length(y)),length(y)*frac)
		
	}
	 attr(train.index, "classes") <- classes;
	dsplit=split80_20(data,y, train.index)
	train=list(data=dsplit$train,y=dsplit$ytr)
	test=list(data=dsplit$test,y=dsplit$yts)
	combined = list(data = data, y = y)
	ratios = data.frame(t(apply(train$data,2,ratio, train$y, logtransform)))
	names(ratios) = c(paste("mean",gsub(",", "",classes)), "diff")
	list(test=test,train=train, combined = combined, classes = classes, train.index =train.index, ratios = ratios)
}



removeMissing<-function(proteome, missThresh = NULL, firstcols = c(1,2)){
	cols = 1:dim(proteome)[2]
	rows = 1:dim(proteome)[1]
	if(!is.null(missThresh)){
		missingBySample = apply(apply(proteome,c(1,2), is.na),1,sum)/dim(proteome)[2]
		missSample = which(missingBySample>missThresh | is.na(proteome[,1]))

		#print(length(missSample))
		print(paste("dropping", rownames(proteome)[missSample], collapse=","));
		if(length(missSample)>0)rows = rows[-missSample]

		##careful could remove the phenotype column
		missingByCol = apply(apply(proteome[rows,],c(1,2), is.na),2,sum)/dim(proteome)[1]
		missingByCol[firstcols] = 1.0
		missCol =  which(missingByCol>missThresh)
		
		print(length(missCol))
		print(paste("dropping", colnames(proteome)[missCol], collapse=","));
		if(length(missCol)>0) cols = cols[-missCol]
	}
	toincl = list(rows = rows, cols = cols)
	torem = list(rows = missSample, cols = missCol)
	list(toincl = toincl, torem = torem)
}






revstr <-function(vec){
for(i in 1:length(vec)){
a = vec[i]
vec[i] = paste(rev(substring(a,1:nchar(a),1:nchar(a))),collapse="") 
}
vec
}


getMarkers<-function(markerstr, colnames){
	markers = strsplit(markerstr,":")[[1]]
	markers = revstr(sub("_","-", revstr(markers)))
	vars1 =  match(markers, colnames)
	list(vars = vars1, markers = markers)
}


writeOldMarkers<-function(fout=paste("biomarkers","txt", sep=".")){
	markers = matrix(nrow = 3, ncol = 1)
	markers[1,1] = "Q10_pH_7.5_L_66:Q10_pH_7.5_L_31:CM10_pH_4.0_L_68:Q10_pH_9.5_L_56:CM10_pH_4.0_L_4"; #final  TB vs LTBI, HIV-/+
	markers[2,1] = "Q10_pH_9.5_L_28:CM10_pH_6.0_L_42:CM10_pH_4.0_L_36:IMAC_L_36:Q10_pH_9.5_L_47:CM10_pH_6.0_L_14:Q10_pH_7.5_L_21:IMAC_L_74"; # final TB vs OI, HIV-/+
	markers[3,1] = "Q10_pH_7.5_L_28:Q10_pH_7.5_L_67:IMAC_L_38:CM10_pH_4.0_L_62:IMAC_L_36"; #final TB vs OI+LTBI  HIV-/+
	dimnames(markers) = list(c("tb vs ltbi", "tb vs oi", "tb vs oi+ltbi"), "markers");
	write.table(markers, file=fout, sep=",", quote=F, row.names = T)
	markers
}

matchvars<-function(selrow, mz1, thresh =0.05){
	matchplate = as.character(mz1[,1]) ==as.character(selrow[[1]])
	if(length(which(matchplate))==0) print(paste("problem, no matches to plate !!! ", selrow[[1]]));
	diff = abs(mz1[,2]-as.numeric(selrow[[2]]))/as.numeric(selrow[[2]])
	diff[diff>thresh] = NA
	diff[!matchplate] = NA
	oa = order(diff)
	len = length(which(!is.na(diff)))
	if(len ==0) res = NA 	
	else res= oa[1]
	res
}

ratio<-function(vec,y, logt=F){
m1 = mean(vec[y==1], na.rm=T)
m2 =  mean(vec[y==0], na.rm=T)
diff =abs(m1-m2) 
# if(logt) max( m1/m2, m2/m1) else abs(m1 - m2)
v = c(m1,m2,diff)
v
#print(paste(m1,m2, abs(log(m1/m2))))
#abs(log2(m1/m2))
}

getMarkerfiles<-function(biohome, title, suffix1, suffix, logtransform= FALSE, max = 3){
	fout = list();
	logt = "";
	if(logtransform) logt="log";
	for(i in 1:max){
		fout[[i]] =   paste(biohome, paste(title,i,logt, suffix1,suffix,sep="."), sep="/");
	}
	fout
}

makeSelected<-function(mz1, model, fileout, ratios, eval, append = F, text = NULL){
beta = model$beta
variables = model$variables
betaGlobal = t(data.frame(lapply(model$betaGlobal, paste, collapse=":")))
coeffOrth = t(data.frame(lapply(model$coeffOrth, paste, collapse=":")))
selected = cbind( beta, variables,eval, betaGlobal, coeffOrth);
if(!is.null(ratios)) selected = cbind(selected, ratios[model$variables,])
if(!is.null(mz1)){
mz11 =  mz1[model$variables,, drop=F];
selected = cbind(mz11,selected);
}
if(!is.null(text)){
	write(text, fileout,sep=",", append=append)
	append=T;
}
 write.table(selected,fileout, sep=",", quote=F, row.names = F, append=append)
selected
}


quantileM<-function(vec, prob= 0.75){
q = quantile(vec, probs = prob, na.rm=T)
vec/q
}
quantileNorm<-function(proteome, prob = 0.9){
protN = apply(proteome,2,quantileM, prob)
qv = apply(proteome,2,quantile, probs=prob, na.rm=T)
attr(protN, "quantiles") = list(qv=qv, prob = prob)
protN
}

readSelected<-function(mz1, filein, thresh = 0.1){
	selected1 <-read.table(filein,header=T, sep=",", strip.white=T)
	betaGIndex = which(dimnames(selected1)[[2]]=="betaGlobal");
	coeffOrthIndex = which(dimnames(selected1)[[2]]=="coeffOrt");
	vars1  = apply(selected1,1,matchvars,mz1, thresh = thresh)
	nav = which(is.na(vars1));
	max = length(vars1);
	if(length(nav)>0) max = nav[1]-1
	betaG = selected1$betaG[1:max]
	beta = selected1$beta[1:max]
	coeffO =  selected1$coeffOrth[1:max]
        dimnames(selected1)[[2]] = paste(dimnames(selected1)[[2]],"import", sep=".")
	#print(dimnames(selected1)[[2]])
	sel2 = cbind(selected1[,1:3],mz1[vars1,])
	betaGlobal = list();
	coeffOrth  = list()
	for(i in 1:length(betaG)){
		betaGlobal[[i]]  =as.numeric(strsplit(as.character(betaG[i]),':') [[1]])
		coeffOrth[[i]]  =as.numeric(strsplit(as.character(coeffO[i]),':') [[1]])
	}
	model1 = list(variables = vars1[1:max], beta = beta, betaGlobal = betaGlobal[1:max], coeffOrth = coeffOrth[1:max], selected=sel2)
	model1
}



readStatus<-function(labelfile, rownames = NULL, levs = c("N","HP", "CC")){
	print("read status");
	status <- read.table(labelfile, head=T, sep=",")
	rep_inds = findRepeats(status[,1])
	if(length(rep_inds)>0)	status = status[-rep_inds,]
	training = markTraining(status[,dim(status)[[2]]])		
	#if(!is.null(setHPAsNA)) status = setToNA(status, torem = "TB.diagnosis", toremvalue = "HP", setHPAsNA)
	
	status = cbind(status, training)
	dimnames(status)[[1]] = status[,1]
	status = status[,-1]
	status = status[match(rownames, dimnames(status)[[1]]),]
	phens = rep(NA, dim(status)[[1]])
	for(i in 1:length(levs)){
		phens[status[,2]==levs[i]] = i-1
	}
	cbind(status, phens)[,c(4,3)]
}