PageRenderTime 93ms CodeModel.GetById 70ms app.highlight 17ms RepoModel.GetById 1ms app.codeStats 0ms

/in.progress/auteurExtended/R/auteur.utilities.R

http://github.com/eastman/auteur
R | 342 lines | 257 code | 53 blank | 32 comment | 67 complexity | ba748660a951b9f66085a1155c6259e0 MD5 | raw file
#logging utility used by rjmcmc
#author: JM EASTMAN 2010
#
# Writes sampler state to a tab-delimited run log plus per-branch ("edgewise")
# parameter files whose paths are built from 'parmBase'.
#
# Arguments:
#   phy               phylogeny (not referenced inside this function)
#   init              TRUE to (re)create the parameter files and write the log header
#   primary.parameter name of the focal parameter (e.g. "rates", "optima", "jumps")
#   parameters        named list of current state; must contain "gen" and "delta",
#                     plus the primary parameter and any accessory/likelihood entries
#   parmBase          path prefix for the parameter .txt files
#   runLog            file/connection for the run log
#   end               TRUE to append a trailing newline to the parameter files and exit
#   class             sampler flavor: "rj" (default) or "hmm"

parlogger <- function(phy, init=FALSE, primary.parameter, parameters, parmBase, runLog, end=FALSE, class=c("rj","hmm")) {
	if(missing(class)) class="rj"
	
	# determine if rjMCMC or hmmMCMC: parlogs are edgewise parameters
	# ("rj" runs track an additional "shifts" file alongside the primary parameter)
	if(class=="hmm") {
		parlogs=paste(parmBase, paste(c(primary.parameter), "txt", sep="."), sep="")
	} else {
		parlogs=paste(parmBase, paste(c("shifts",primary.parameter), "txt", sep="."), sep="")
	}
	
	# add eol to files if terminating run
	if(end==TRUE) {
		sapply(1:length(parlogs), function(x) write.table("\n", parlogs[x], quote=FALSE, col.names=FALSE, row.names=FALSE, append=TRUE))
		return()
	}
	
	# index bookkeeping: locate the generation counter, likelihood columns,
	# focal parameter, and shift indicator ("delta") within 'parameters';
	# everything else is treated as an accessory parameter.
	# NOTE(review): assumes "gen", "delta", the primary parameter, and all of
	# lnL/lnL.p/lnL.h are present -- an NA from match() would break the
	# negative indexing below; TODO confirm against callers.
	parameters->parms
	parnames=names(parms)
	ii=match(c("gen"),parnames)
	general=match(c("lnL","lnL.p","lnL.h"),parnames)
	target=match(primary.parameter, parnames)

	delta=match("delta",parnames)
	accessory=match(parnames[-c(ii,delta,target,general)],parnames)
	
	if(init) {
		# initialization: truncate the parameter files and write the run-log header row
		if(class=="hmm") {
			sapply(parlogs[1:length(parlogs)], function(x) write.table("", file=x, quote=FALSE, col.names=FALSE, row.names=FALSE))
			write.table(paste("state", primary.parameter, paste(parnames[accessory], collapse="\t"), paste(parnames[general],collapse="\t"), sep="\t"), file=runLog, quote=FALSE, col.names=FALSE, row.names=FALSE)		
		} else {
			# "rj" logs additionally summarize per-branch values (min/max/mean) and model size
			sapply(parlogs[1:length(parlogs)], function(x) write.table("", file=x, quote=FALSE, col.names=FALSE, row.names=FALSE))
			write.table(paste("state", "min", "max", "mean", primary.parameter, paste(parnames[accessory],collapse="\t"), paste(parnames[general],collapse="\t"), sep="\t"), file=runLog, quote=FALSE, col.names=FALSE, row.names=FALSE)		
		}
	} else {
		# per-generation logging
		dd=parms[[delta]]

		if(class=="hmm") {
			# to log file
			msg<-paste(parms[[ii]], parms[[target]], paste(sprintf("%.3f", parms[accessory]),collapse="\t"), paste(sprintf("%.2f", parms[general]),collapse="\t"),sep="\t")
			write(msg, file=runLog, append=TRUE)
			
			# to parameter files
			out=list(dd)		
			sapply(1:length(parlogs), function(x) write.table(paste(out[[x]],collapse=" "), parlogs[x], quote=FALSE, col.names=FALSE, row.names=FALSE, append=TRUE))
			
		} else {
			# "rj": summarize the per-branch primary parameter; K is the number
			# of categories implied by the shift indicators (number of shifts + 1)
			pp=parms[[target]]
			K=sum(dd)+1
			range.p=range(pp)
			mean.p=mean(pp)
			
			# to log file
			msg<-paste(parms[[ii]], sprintf("%.3f", range.p[1]), sprintf("%.3f", range.p[2]), sprintf("%.3f", mean.p), K, paste(sprintf("%.3f", parms[accessory]),collapse="\t"), paste(sprintf("%.2f", parms[general]),collapse="\t"),sep="\t")
			write(msg, file=runLog, append=TRUE)
			
			# to parameter files: shift indicators and branchwise values, one row per generation
			out=list(dd,pp)		
			sapply(1:length(parlogs), function(x) write.table(paste(out[[x]],collapse=" "), parlogs[x], quote=FALSE, col.names=FALSE, row.names=FALSE, append=TRUE))
			
		}
	}
}
 66
 67
 68#rjmcmc utility for initiating a proposal width for Markov sampling
 69#author: JM EASTMAN 2010
 70#modified: 02.26.2011 to use spline() in computation of best prop.width
 71
 72calibrate.proposalwidth <-
 73function(phy, dat, nsteps=100, model, widths=NULL) {
 74	if(!withinrange(nsteps, 100, 1000)) {
 75		if(nsteps>1000) nsteps=1000 else nsteps=100
 76	}
 77	if(is.null(widths)) widths=2^(-2:4)
 78	if(model=="BM") {
 79		acceptance.rates=sapply(widths, function(x) rjmcmc.bm(phy=phy, dat=dat, ngen=nsteps, prop.width=x, summary=FALSE, fileBase="propwidth", internal.only=FALSE)$acceptance.rate)
 80	} else if(model=="jumpBM") {
 81		acceptance.rates=sapply(widths, function(x) mcmc.levy(phy=phy, dat=dat, ngen=nsteps, prop.width=x, summary=FALSE, fileBase="propwidth")$acceptance.rate)
 82	} else if(model=="OU") {
 83		while(1){
 84			acceptance.rates=sapply(widths, function(x) {
 85								f=try(x<-rjmcmc.ou(phy=phy, dat=dat, ngen=nsteps, prop.width=x, summary=FALSE, fileBase="propwidth", internal.only=FALSE)$acceptance.rate,silent=TRUE)
 86								if(inherits(f,"try-error")) return(0) else return(x)
 87								}
 88								)
 89			if(any(acceptance.rates!=0))break()
 90		}
 91	}
 92	s=spline(log(widths,base=2),acceptance.rates)
 93	m.s=mean(s$y)
 94	s.pick=which(abs(s$y-m.s)==min(abs(s$y-m.s)))
 95	return(2^mean(s$x[s.pick]))
 96}
 97
 98
 99
#utility for converting text files to .rda to compress output from rjmcmc
#author: JM EASTMAN 2011
#
# Reads the raw per-branch sample logs for the given model, bundles them
# into a named list saved as <fileBase>.posteriorsamples.rda, and deletes
# the raw text files.

cleanup.files <-
function(parmBase, model, fileBase, phy){
	# Map the model to the text logs the sampler produced.
	if (model == "BM") {
		parms <- c("rates", "shifts")
	} else if (model == "OU") {
		parms <- c("optima", "shifts")
	} else if (model == "jumpBM") {
		parms <- "jumps"
	}

	# Read each raw table; columns are labeled by the child node of each edge.
	posteriorsamples <- lapply(parms, function(p) {
		tab <- read.table(paste(parmBase, paste(p, "txt", sep="."), sep="/"))
		names(tab) <- phy$edge[,2]
		tab
	})
	names(posteriorsamples) <- parms

	# Compress to .rda, then remove the raw text files.
	save(posteriorsamples, file=paste(parmBase, paste(fileBase, "posteriorsamples", "rda", sep="."), sep="/"))
	unlink(paste(parmBase, paste(parms, "txt", sep="."), sep="/"))
}
126
127
#general phylogenetic statistical utility for comparing values ('posterior.rates') from posterior densities for two sets of supplied branches ('nodes')
#author: JM EASTMAN 2010
#
# For each posterior sample, computes a branch-length-weighted average of
# the values over each branch set, then compares the two averaged series
# with randomization.test(); extra arguments are forwarded to that test.

compare.rates <-
function(branches=list(A=c(NULL,NULL),B=c(NULL,NULL)), phy, posterior.values, ...){
	nodes <- branches
	rates <- posterior.values
	if (is.null(names(nodes))) names(nodes) <- c("A", "B")

	# Columns of the posterior table corresponding to each branch set.
	rates.a <- rates[, match(nodes[[1]], names(rates))]
	rates.b <- rates[, match(nodes[[2]], names(rates))]

	# Branch-length weights (normalized within each set).
	len.a <- phy$edge.length[match(nodes[[1]], phy$edge[,2])]
	len.b <- phy$edge.length[match(nodes[[2]], phy$edge[,2])]
	w.a <- len.a / sum(len.a)
	w.b <- len.b / sum(len.b)

	# Weighted average per posterior sample; a single-branch set collapses
	# the selection to a vector, hence sapply rather than row-wise apply.
	if (length(nodes[[1]]) > 1) {
		r.scl.a <- apply(rates.a, 1, function(z) sum(z * w.a))
	} else {
		r.scl.a <- sapply(rates.a, function(z) sum(z * w.a))
	}
	if (length(nodes[[2]]) > 1) {
		r.scl.b <- apply(rates.b, 1, function(z) sum(z * w.b))
	} else {
		r.scl.b <- sapply(rates.b, function(z) sum(z * w.b))
	}

	return(list(scl.A=r.scl.a, scl.B=r.scl.b, r.test=randomization.test(r.scl.a, r.scl.b, ...)))
}
146
#general phylogenetic utility for rapid computation of the maximum-likelihood estimate of the Brownian motion rate parameter (unless there are polytomies, in which case the function wraps geiger:::fitContinuous)
#author: L REVELL 2010, LJ HARMON 2009, and JM EASTMAN 2010
#
# Fast ML estimate of the BM rate from phylogenetically independent
# contrasts; falls back to geiger's likelihood optimizer when pic() fails
# (e.g. on non-binary trees).

fit.continuous <-
function(phy, dat){
	ntips <- Ntip(phy)
	contrasts <- try(pic(dat, phy), silent=TRUE)
	if (inherits(contrasts, "try-error")) {
		# pic() unavailable for this tree: use the slower optimizer.
		return(fitContinuous(phy, dat)$Trait1$beta)
	}
	# ML rate = mean squared contrast, rescaled by (n-1)/n.
	mean(contrasts^2) * ((ntips - 1) / ntips)
}
161
#general phylogenetic utility to find starting point of alpha and sigmasq under an OU process
#author: JM EASTMAN 2011
#
# Fits a single-regime Hansen (OU) model via the ouch package and returns
# starting values list(alpha=..., sigmasq=...), with alpha clamped to the
# interval spanned by 'alphabounds'.

fit.hansen <-
function(phy, dat, alphabounds){
	# Convert the ape tree to ouch format and attach the trait data.
	ot=ape2ouch(phy)

	otd <- as(ot,"data.frame")
	ot <- with(otd,ouchtree(nodes=nodes,ancestors=ancestors,times=times,labels=labels))
	otd$regimes <- as.factor(1)	# single selective regime over the whole tree
	dat.df<-data.frame(trait=dat,row.names=names(dat))
	dat.df$labels <- rownames(dat.df)
	otd <- merge(otd,dat.df,by="labels",all=TRUE)
	rownames(otd) <- otd$nodes
	h=hansen(tree=ot, data=otd[c("trait")], regimes=otd["regimes"],sqrt.alpha=1,sigma=1)
	alpha=h@sqrt.alpha^2 
	sigmasq=h@sigma^2
	# Clamp alpha: if the fitted value falls outside [min, max] of
	# alphabounds, snap it to the nearest bound ('[1]' guards against a
	# tie when alpha is exactly midway between two bounds).
	if(!withinrange(alpha, min(alphabounds), max(alphabounds))) {
		alpha=alphabounds[which(abs(alphabounds-alpha)==min(abs(alphabounds-alpha)))][1]
	}
	# BUGFIX: previously returned h@sqrt.alpha^2 directly, silently
	# discarding the clamped alpha computed above.
	return(list(alpha=alpha, sigmasq=sigmasq))
}
182
183
#logging utility used by rjmcmc
#author: JM EASTMAN 2010
#
# Appends one tab-delimited diagnostic row (generation, likelihoods, prior
# and Hastings terms, category counts) to 'errorLog', writing a header
# line first if the log file does not yet exist.

generate.error.message <-
function(ntip, i, mod.cur, mod.new, lnL, lnPrior, lnHastings, curCats, newCats, errorLog) {
	if (!file.exists(errorLog)) {
		header <- paste("gen", "cur.lnL", "new.lnL", "lnL", "lnPrior", "lnHastings", "cur.catg", "new.catg", sep="\t")
		write(header, file=errorLog)
	}
	entry <- paste(i,
		sprintf("%.3f", mod.cur$lnL),
		sprintf("%.3f", mod.new$lnL),
		sprintf("%.3f", lnL),
		sprintf("%.3f", lnPrior),
		sprintf("%.3f", lnHastings),
		sum(curCats),
		sum(newCats),
		sep="\t")
	write(entry, file=errorLog, append=TRUE)
}
195
#utility for generating a list of indicator variables and values for each branch in a phylogeny, using a random model complexity as a starting point (or by constraining model complexity)
#author: JM EASTMAN 2010
#
# Returns list(delta=..., values=...): 'delta' is a 0/1 indicator per branch
# marking shift locations; 'values' assigns a starting parameter value to
# every branch, with branches downstream of a shift sharing its value.

generate.starting.point <-
function(data, phy, node.des, logspace=TRUE, K=FALSE, prop.width) { 
	nn=length(phy$edge.length)
	ntip<-n<-Ntip(phy)
	# number of shifts: random (truncated Poisson with mean log(ntip),
	# bounded by nn) unless model size K is fixed (K categories = K-1 shifts)
	if(!K) nshifts=rtpois(1,log(ntip),nn) else nshifts=K-1
	# scatter the shifts uniformly at random across the nn branches
	if(nshifts!=0) bb=sample(c(rep(1,nshifts),rep(0,nn-nshifts)),replace=FALSE) else bb=rep(0,nn)
	names(bb)=phy$edge[,2]
	shifts=as.numeric(names(bb)[bb==1])
	if(!logspace) {
		# data-space starting values: the grand mean plus, per shift node,
		# the mean of the tip data descended from it
		init.rate=mean(data)
		vd=sapply(shifts, function(x) {
				  z=c()
				  xx=unlist(node.des[[which(names(node.des)==x)]])
				  xx=xx[xx<=n]	# keep only tip descendants
				  z=c(xx,x[x<=n])	# include the shift node itself if it is a tip
				  yy=c(mean(data[z]))
				return(yy)})
		values=c(vd, init.rate)
	} else {
		# log-space starting values: draw uniformly between two perturbed
		# copies of the ML rate estimate, scaled by the proposal width.
		# NOTE(review): adjustrate() is presumably stochastic -- otherwise
		# both endpoints of min.max coincide and runif degenerates; confirm.
		init.rate=fit.continuous(phy,data)
		min.max=c(adjustrate(init.rate, prop.width), adjustrate(init.rate, prop.width))
		values=runif(sum(bb)+1, min=min.max[min.max==min(min.max)], max=min.max[min.max==max(min.max)])
	}
	# partition shifts into internal-node shifts and tip shifts
	internal.shifts<-tip.shifts<-numeric(0)
	internal.shifts=sort(shifts[shifts>ntip])
	tip.shifts=shifts[shifts<=ntip]
	
	if(length(internal.shifts)==0 & length(tip.shifts)==0) {
		# no shifts: every branch takes the single background value
		vv=rep(values, nn)
	} else {
		# start every branch at the background value (last entry of 'values'),
		# then overwrite each shift's subtree with that shift's own value
		vv=bb
		vv[]=values[length(values)]
		i=0
		if(length(internal.shifts)!=0) {
			for(i in 1:length(internal.shifts)) {
				d=node.des[which(names(node.des)==internal.shifts[i])]
				vv[match(c(internal.shifts[i], unlist(d)), names(vv))]=values[i]
			}
		}
		if(length(tip.shifts)!=0) {
			# note: 'i' deliberately carries over from the internal-shift loop
			# so tip shifts consume the next unused entries of 'values'
			for(j in 1:length(tip.shifts)) {
				vv[match(tip.shifts[j], names(vv))]=values[j+i]
			}
		}
	}
	return(list(delta=unname(bb), values=unname(vv)))
}
246
#general utility for combining supplied list of dataframes into a single 'intercalated' sample. E.g., list(as.data.frame(c(AAA)),as.data.frame(c(BBB))) becomes data.frame(c(ABABAB))
#author: JM EASTMAN 2010
#
# Interleaves the rows of several equally-shaped data.frames (or vectors)
# into one data.frame with row pattern A1 B1 C1 A2 B2 C2 ...; objects with
# more rows than the smallest are truncated (with a warning), and columns
# of every object are aligned to the first object's column order.

intercalate.samples <-
function(list.obj) {
	if(length(list.obj)<2) warning("Nothing to be done... too few objects in list.")

	# Promote bare vectors to single (unnamed) column data.frames.
	vv=sapply(list.obj, is.vector)
	if(all(vv)) list.obj=lapply(list.obj, function(x) {y=data.frame(x); names(y)=NULL; return(y)})
	orig.names=names(list.obj[[1]])
	n=length(list.obj)

	# Prune every object to the row count of the smallest.
	R=sapply(list.obj, nrow)
	if(length(unique(R))!=1) {
		# BUGFIX: was nrow(list.obj[[which(R==min(R))]]), which errors when
		# several objects tie for the minimum (which() returns >1 index to [[).
		minR=min(R)
		list.obj=lapply(list.obj, function(x) {y=as.data.frame(x[1:minR,]); names(y)=orig.names; return(y)})
		warning("Objects pruned to the length of the smallest.")
	}

	C=sapply(list.obj, ncol)
	if(length(unique(C))!=1) stop("Cannot process objects; column lengths do not match.")
	ncols=unique(C)

	# Align columns of objects 2..n to the first object's column order.
	if(ncols>1) {
		for(mm in 2:n) {
			# BUGFIX: was match(names(obj), orig.names), which only restores
			# orig.names order for self-inverse permutations; the correct
			# subscript is match(orig.names, names(obj)).
			list.obj[[mm]]=list.obj[[mm]][, match(orig.names, names(list.obj[[mm]]))]
		}
	}

	# Interleave: object k occupies output rows k, k+n, k+2n, ...
	r=nrow(list.obj[[1]])
	indices=lapply(1:n, function(x) seq(x, r*n, by=n))

	out.array=array(dim=c(r*n, ncols))
	for(k in 1:n){
		out.array[indices[[k]],]=as.matrix(list.obj[[k]])
	}

	out.array=data.frame(out.array)
	if(!is.null(orig.names)) names(out.array)=orig.names else names(out.array)=paste("X",1:ncol(out.array),sep="")
	return(out.array)
}
284
285
286
#intercalates samples from multiple independent Markov chains (generated by rjmcmc.bm())
#author: JM EASTMAN 2010
#
# Pools several run directories into '<lab>.combined.rjmcmc': interleaves
# each run's posteriorsamples components into one .rda and interleaves the
# run logs into one tab-delimited log.

pool.rjmcmcsamples <-
function(base.dirs, lab="combined"){
	outdir <- paste(lab, "combined.rjmcmc", sep=".")
	if (!file.exists(outdir)) dir.create(outdir)

	# estimated parameters: load each run's 'posteriorsamples' object,
	# then interleave matching components across runs
	samples <- lapply(base.dirs, function(d) {
		load(paste(d, dir(d, pattern="posteriorsamples.rda"), sep="/"))
		posteriorsamples
	})
	comp <- unique(unlist(lapply(samples, names)))
	posteriorsamples <- list()
	for (i in comp) {
		posteriorsamples[[which(comp == i)]] <- intercalate.samples(lapply(1:length(samples), function(s) samples[[s]][[i]]))
	}
	names(posteriorsamples) <- comp
	save(posteriorsamples, file=paste(outdir, paste(lab, "posteriorsamples.rda", sep="."), sep="/"))

	# logfile: interleave rows across runs and write the pooled log
	pooled.log <- intercalate.samples(lapply(base.dirs, function(d) {
		read.table(paste(d, dir(d, pattern="rjmcmc.log"), sep="/"), header=TRUE)
	}))
	write.table(pooled.log, paste(outdir, paste(lab, "rjmcmc.log", sep="."), sep="/"), quote=FALSE, row.names=FALSE, sep="\t")
}
310
#general phylogentic utility wrapping geiger:::treedata for checking data consistency
#author: JM EASTMAN 2010
#
# Reconciles tree, trait data, and measurement error: returns the pruned
# tree, the sorted trait vector, and a complete named SE vector (one entry
# per tip, zero where no SE was supplied).

prepare.data <-
function(phy, data, SE) {
	td <- treedata(phy, data, sort = TRUE)	
	tips <- td$phy$tip.label

	if (any(SE != 0)) {
		if (!is.null(names(SE))) {
			# Expand a (possibly partial) named SE vector over all tips;
			# names absent from the tree are dropped with a warning.
			se <- rep(0, length(tips))
			names(se) <- tips
			ss.match <- match(names(SE), names(se))
			if (any(is.na(ss.match))) warning(paste(names(SE[is.na(ss.match)]), "not found in the dataset", sep=" "))
			se[match(names(SE[!is.na(ss.match)]), names(se))] <- SE[!is.na(ss.match)]
			SE <- se
		} else {
			# Unnamed SE is accepted only when it plausibly lines up 1:1
			# with the tip labels; otherwise the caller must name it.
			if (name.check(phy, data) == "OK" & length(data) == length(SE)) {
				names(SE) <- tips
				warning("ordering of values in SE assumed to be in perfect correspondence with phy$tip.label")
			} else {
				stop("SE must be a named vector of measurement error")
			}
		}
	} else {
		# No measurement error supplied: zero SE for every tip.
		SE <- rep(0, length(tips))
		names(SE) <- tips
	}
	
	return(list(ape.tre=td$phy, orig.dat=td$data[,1], SE=SE))
}
340
341
342