/in.progress/auteurExtended/R/auteur.utilities.R

http://github.com/eastman/auteur · R · 342 lines · 257 code · 53 blank · 32 comment · 67 complexity · ba748660a951b9f66085a1155c6259e0 MD5 · raw file

  1. #logging utility used by rjmcmc
  2. #author: JM EASTMAN 2010
  3. parlogger <- function(phy, init=FALSE, primary.parameter, parameters, parmBase, runLog, end=FALSE, class=c("rj","hmm")) {
  4. if(missing(class)) class="rj"
  5. # determine if rjMCMC or hmmMCMC: parlogs are edgewise parameters
  6. if(class=="hmm") {
  7. parlogs=paste(parmBase, paste(c(primary.parameter), "txt", sep="."), sep="")
  8. } else {
  9. parlogs=paste(parmBase, paste(c("shifts",primary.parameter), "txt", sep="."), sep="")
  10. }
  11. # add eol to files if terminating run
  12. if(end==TRUE) {
  13. sapply(1:length(parlogs), function(x) write.table("\n", parlogs[x], quote=FALSE, col.names=FALSE, row.names=FALSE, append=TRUE))
  14. return()
  15. }
  16. parameters->parms
  17. parnames=names(parms)
  18. ii=match(c("gen"),parnames)
  19. general=match(c("lnL","lnL.p","lnL.h"),parnames)
  20. target=match(primary.parameter, parnames)
  21. delta=match("delta",parnames)
  22. accessory=match(parnames[-c(ii,delta,target,general)],parnames)
  23. if(init) {
  24. if(class=="hmm") {
  25. sapply(parlogs[1:length(parlogs)], function(x) write.table("", file=x, quote=FALSE, col.names=FALSE, row.names=FALSE))
  26. write.table(paste("state", primary.parameter, paste(parnames[accessory], collapse="\t"), paste(parnames[general],collapse="\t"), sep="\t"), file=runLog, quote=FALSE, col.names=FALSE, row.names=FALSE)
  27. } else {
  28. sapply(parlogs[1:length(parlogs)], function(x) write.table("", file=x, quote=FALSE, col.names=FALSE, row.names=FALSE))
  29. write.table(paste("state", "min", "max", "mean", primary.parameter, paste(parnames[accessory],collapse="\t"), paste(parnames[general],collapse="\t"), sep="\t"), file=runLog, quote=FALSE, col.names=FALSE, row.names=FALSE)
  30. }
  31. } else {
  32. dd=parms[[delta]]
  33. if(class=="hmm") {
  34. # to log file
  35. msg<-paste(parms[[ii]], parms[[target]], paste(sprintf("%.3f", parms[accessory]),collapse="\t"), paste(sprintf("%.2f", parms[general]),collapse="\t"),sep="\t")
  36. write(msg, file=runLog, append=TRUE)
  37. # to parameter files
  38. out=list(dd)
  39. sapply(1:length(parlogs), function(x) write.table(paste(out[[x]],collapse=" "), parlogs[x], quote=FALSE, col.names=FALSE, row.names=FALSE, append=TRUE))
  40. } else {
  41. pp=parms[[target]]
  42. K=sum(dd)+1
  43. range.p=range(pp)
  44. mean.p=mean(pp)
  45. # to log file
  46. msg<-paste(parms[[ii]], sprintf("%.3f", range.p[1]), sprintf("%.3f", range.p[2]), sprintf("%.3f", mean.p), K, paste(sprintf("%.3f", parms[accessory]),collapse="\t"), paste(sprintf("%.2f", parms[general]),collapse="\t"),sep="\t")
  47. write(msg, file=runLog, append=TRUE)
  48. # to parameter files
  49. out=list(dd,pp)
  50. sapply(1:length(parlogs), function(x) write.table(paste(out[[x]],collapse=" "), parlogs[x], quote=FALSE, col.names=FALSE, row.names=FALSE, append=TRUE))
  51. }
  52. }
  53. }
  54. #rjmcmc utility for initiating a proposal width for Markov sampling
  55. #author: JM EASTMAN 2010
  56. #modified: 02.26.2011 to use spline() in computation of best prop.width
  57. calibrate.proposalwidth <-
  58. function(phy, dat, nsteps=100, model, widths=NULL) {
  59. if(!withinrange(nsteps, 100, 1000)) {
  60. if(nsteps>1000) nsteps=1000 else nsteps=100
  61. }
  62. if(is.null(widths)) widths=2^(-2:4)
  63. if(model=="BM") {
  64. acceptance.rates=sapply(widths, function(x) rjmcmc.bm(phy=phy, dat=dat, ngen=nsteps, prop.width=x, summary=FALSE, fileBase="propwidth", internal.only=FALSE)$acceptance.rate)
  65. } else if(model=="jumpBM") {
  66. acceptance.rates=sapply(widths, function(x) mcmc.levy(phy=phy, dat=dat, ngen=nsteps, prop.width=x, summary=FALSE, fileBase="propwidth")$acceptance.rate)
  67. } else if(model=="OU") {
  68. while(1){
  69. acceptance.rates=sapply(widths, function(x) {
  70. f=try(x<-rjmcmc.ou(phy=phy, dat=dat, ngen=nsteps, prop.width=x, summary=FALSE, fileBase="propwidth", internal.only=FALSE)$acceptance.rate,silent=TRUE)
  71. if(inherits(f,"try-error")) return(0) else return(x)
  72. }
  73. )
  74. if(any(acceptance.rates!=0))break()
  75. }
  76. }
  77. s=spline(log(widths,base=2),acceptance.rates)
  78. m.s=mean(s$y)
  79. s.pick=which(abs(s$y-m.s)==min(abs(s$y-m.s)))
  80. return(2^mean(s$x[s.pick]))
  81. }
  82. #utility for converting text files to .rda to compress output from rjmcmc
  83. #author: JM EASTMAN 2011
  84. cleanup.files <-
  85. function(parmBase, model, fileBase, phy){
  86. if(model=="BM") {
  87. parm="rates"
  88. parms=c(parm,"shifts")
  89. } else if(model=="OU") {
  90. parm="optima"
  91. parms=c(parm,"shifts")
  92. } else if(model=="jumpBM") {
  93. parms="jumps"
  94. }
  95. # read in raw data
  96. posteriorsamples=lapply(parms, function(x) {
  97. y=read.table(paste(parmBase,paste(x,"txt",sep="."),sep="/"))
  98. names(y)=phy$edge[,2]
  99. return(y)
  100. })
  101. names(posteriorsamples)=parms
  102. save(posteriorsamples, file=paste(parmBase,paste(fileBase,"posteriorsamples","rda",sep="."),sep="/"))
  103. unlink(paste(parmBase,paste(parms,"txt",sep="."),sep="/"))
  104. }
  105. #general phylogenetic statistical utility for comparing values ('posterior.rates') from posterior densities for two sets of supplied branches ('nodes')
  106. #author: JM EASTMAN 2010
  107. compare.rates <-
  108. function(branches=list(A=c(NULL,NULL),B=c(NULL,NULL)), phy, posterior.values, ...){
  109. nodes=branches
  110. rates=posterior.values
  111. if(is.null(names(nodes))) names(nodes)=c("A","B")
  112. rates.a=rates[,match(nodes[[1]],names(rates))]
  113. rates.b=rates[,match(nodes[[2]],names(rates))]
  114. edges.a=phy$edge.length[match(nodes[[1]],phy$edge[,2])]
  115. edges.b=phy$edge.length[match(nodes[[2]],phy$edge[,2])]
  116. weights.a=edges.a/sum(edges.a)
  117. weights.b=edges.b/sum(edges.b)
  118. if(length(nodes[[1]])>1) {r.scl.a=apply(rates.a, 1,function(x) sum(x*weights.a))} else {r.scl.a=sapply(rates.a, function(x) sum(x*weights.a))}
  119. if(length(nodes[[2]])>1) r.scl.b=apply(rates.b, 1,function(x) sum(x*weights.b)) else r.scl.b=sapply(rates.b, function(x) sum(x*weights.b))
  120. return(list(scl.A=r.scl.a, scl.B=r.scl.b, r.test=randomization.test(r.scl.a, r.scl.b, ...)))
  121. }
  122. #general phylogenetic utility for rapid computation of the maximum-likelihood estimate of the Brownian motion rate parameter (unless there are polytomies, in which case the function wraps geiger:::fitContinuous)
  123. #author: L REVELL 2010, LJ HARMON 2009, and JM EASTMAN 2010
  124. fit.continuous <-
  125. function(phy, dat){
  126. n=Ntip(phy)
  127. ic=try(pic(dat,phy),silent=TRUE)
  128. if(!inherits(ic, "try-error")) {
  129. r=mean(ic^2)*((n-1)/n)
  130. return(r)
  131. } else {
  132. return(fitContinuous(phy,dat)$Trait1$beta)
  133. }
  134. }
  135. #general phylogenetic utility to find starting point of alpha and sigmasq under an OU process
  136. #author: JM EASTMAN 2011
  137. fit.hansen <-
  138. function(phy, dat, alphabounds){
  139. ot=ape2ouch(phy)
  140. otd <- as(ot,"data.frame")
  141. ot <- with(otd,ouchtree(nodes=nodes,ancestors=ancestors,times=times,labels=labels))
  142. otd$regimes <- as.factor(1)
  143. dat.df<-data.frame(trait=dat,row.names=names(dat))
  144. dat.df$labels <- rownames(dat.df)
  145. otd <- merge(otd,dat.df,by="labels",all=TRUE)
  146. rownames(otd) <- otd$nodes
  147. h=hansen(tree=ot, data=otd[c("trait")], regimes=otd["regimes"],sqrt.alpha=1,sigma=1)
  148. alpha=h@sqrt.alpha^2
  149. sigmasq=h@sigma^2
  150. if(withinrange(alpha, min(alphabounds), max(alphabounds))) alpha=alpha else alpha=alphabounds[which(abs(alphabounds-alpha)==min(abs(alphabounds-alpha)))]
  151. return(list(alpha=h@sqrt.alpha^2, sigmasq=h@sigma^2))
  152. }
  153. #logging utility used by rjmcmc
  154. #author: JM EASTMAN 2010
  155. generate.error.message <-
  156. function(ntip, i, mod.cur, mod.new, lnL, lnPrior, lnHastings, curCats, newCats, errorLog) {
  157. if(!file.exists(file=errorLog)) {
  158. write(paste("gen", "cur.lnL", "new.lnL", "lnL", "lnPrior", "lnHastings", "cur.catg", "new.catg", sep="\t"), file=errorLog)
  159. }
  160. write(paste(i, sprintf("%.3f", mod.cur$lnL), sprintf("%.3f", mod.new$lnL), sprintf("%.3f", lnL), sprintf("%.3f", lnPrior), sprintf("%.3f", lnHastings), sum(curCats), sum(newCats), sep="\t"), file=errorLog, append=TRUE)
  161. }
  162. #utility for generating a list of indicator variables and values for each branch in a phylogeny, using a random model complexity as a starting point (or by constraining model complexity)
  163. #author: JM EASTMAN 2010
  164. generate.starting.point <-
  165. function(data, phy, node.des, logspace=TRUE, K=FALSE, prop.width) {
  166. nn=length(phy$edge.length)
  167. ntip<-n<-Ntip(phy)
  168. if(!K) nshifts=rtpois(1,log(ntip),nn) else nshifts=K-1
  169. if(nshifts!=0) bb=sample(c(rep(1,nshifts),rep(0,nn-nshifts)),replace=FALSE) else bb=rep(0,nn)
  170. names(bb)=phy$edge[,2]
  171. shifts=as.numeric(names(bb)[bb==1])
  172. if(!logspace) {
  173. init.rate=mean(data)
  174. vd=sapply(shifts, function(x) {
  175. z=c()
  176. xx=unlist(node.des[[which(names(node.des)==x)]])
  177. xx=xx[xx<=n]
  178. z=c(xx,x[x<=n])
  179. yy=c(mean(data[z]))
  180. return(yy)})
  181. values=c(vd, init.rate)
  182. } else {
  183. init.rate=fit.continuous(phy,data)
  184. min.max=c(adjustrate(init.rate, prop.width), adjustrate(init.rate, prop.width))
  185. values=runif(sum(bb)+1, min=min.max[min.max==min(min.max)], max=min.max[min.max==max(min.max)])
  186. }
  187. internal.shifts<-tip.shifts<-numeric(0)
  188. internal.shifts=sort(shifts[shifts>ntip])
  189. tip.shifts=shifts[shifts<=ntip]
  190. if(length(internal.shifts)==0 & length(tip.shifts)==0) {
  191. vv=rep(values, nn)
  192. } else {
  193. vv=bb
  194. vv[]=values[length(values)]
  195. i=0
  196. if(length(internal.shifts)!=0) {
  197. for(i in 1:length(internal.shifts)) {
  198. d=node.des[which(names(node.des)==internal.shifts[i])]
  199. vv[match(c(internal.shifts[i], unlist(d)), names(vv))]=values[i]
  200. }
  201. }
  202. if(length(tip.shifts)!=0) {
  203. for(j in 1:length(tip.shifts)) {
  204. vv[match(tip.shifts[j], names(vv))]=values[j+i]
  205. }
  206. }
  207. }
  208. return(list(delta=unname(bb), values=unname(vv)))
  209. }
  210. #general utility for combining supplied list of dataframes into a single 'intercalated' sample. E.g., list(as.data.frame(c(AAA)),as.data.frame(c(BBB))) becomes data.frame(c(ABABAB))
  211. #author: JM EASTMAN 2010
  212. intercalate.samples <-
  213. function(list.obj) {
  214. if(length(list.obj)<2) warning("Nothing to be done... too few objects in list.")
  215. vv=sapply(list.obj, is.vector)
  216. if(all(vv)) list.obj=lapply(list.obj, function(x) {y=data.frame(x); names(y)=NULL; return(y)})
  217. orig.names=names(list.obj[[1]])
  218. n=length(list.obj)
  219. R=sapply(list.obj, nrow)
  220. if(length(unique(R))!=1) {
  221. minR=nrow(list.obj[[which(R==min(R))]])
  222. list.obj=lapply(list.obj, function(x) {y=as.data.frame(x[1:minR,]); names(y)=orig.names; return(y)})
  223. warning("Objects pruned to the length of the smallest.")
  224. }
  225. C=sapply(list.obj, ncol)
  226. if(length(unique(C))!=1) stop("Cannot process objects; column lengths do not match.")
  227. c=unique(C)
  228. if(c>1) {
  229. M=lapply(1:length(list.obj), function(x) mm=match(names(list.obj[[x]]), orig.names))
  230. for(mm in 2:length(list.obj)) {
  231. list.obj[[mm]]=list.obj[[mm]][,M[[mm]]]
  232. }
  233. }
  234. r=nrow(list.obj[[1]])
  235. indices=lapply(1:n, function(x) seq(x, r*n, by=n))
  236. out.array=array(dim=c(r*n, c))
  237. for(nn in 1:n){
  238. out.array[indices[[nn]],]=as.matrix(list.obj[[nn]])
  239. }
  240. out.array=data.frame(out.array)
  241. if(!is.null(orig.names)) names(out.array)=orig.names else names(out.array)=paste("X",1:ncol(out.array),sep="")
  242. return(out.array)
  243. }
  244. #intercalates samples from multiple independent Markov chains (generated by rjmcmc.bm())
  245. #author: JM EASTMAN 2010
  246. pool.rjmcmcsamples <-
  247. function(base.dirs, lab="combined"){
  248. outdir=paste(lab,"combined.rjmcmc",sep=".")
  249. if(!file.exists(outdir)) dir.create(outdir)
  250. # estimated parameters
  251. samples=lapply(base.dirs, function(x) {load(paste(x, dir(x, pattern="posteriorsamples.rda"), sep="/")); return(posteriorsamples)})
  252. nn=unique(unlist(lapply(samples, names)))
  253. posteriorsamples=list()
  254. for(i in nn) {
  255. res.sub=intercalate.samples(lapply(1:length(samples), function(x) samples[[x]][[i]]))
  256. posteriorsamples[[which(nn==i)]]=res.sub
  257. }
  258. names(posteriorsamples)=nn
  259. save(posteriorsamples, file=paste(outdir, paste(lab,"posteriorsamples.rda",sep="."), sep="/"))
  260. # logfile
  261. write.table(intercalate.samples(lapply(base.dirs, function(x) {y=read.table(paste(x, dir(x, pattern="rjmcmc.log"), sep="/"), header=TRUE); return(y)})), paste(outdir, paste(lab,"rjmcmc.log",sep="."), sep="/"), quote=FALSE, row.names=FALSE, sep="\t")
  262. }
  263. #general phylogentic utility wrapping geiger:::treedata for checking data consistency
  264. #author: JM EASTMAN 2010
  265. prepare.data <-
  266. function(phy, data, SE) {
  267. td <- treedata(phy, data, sort = TRUE)
  268. if(any(SE!=0)) {
  269. if(!is.null(names(SE))) {
  270. se=rep(0,length(td$phy$tip.label))
  271. names(se)=td$phy$tip.label
  272. ss.match=match(names(SE), names(se))
  273. if(any(is.na(ss.match))) warning(paste(names(SE[is.na(ss.match)]), "not found in the dataset", sep=" "))
  274. se[match(names(SE[!is.na(ss.match)]),names(se))]=SE[!is.na(ss.match)]
  275. SE=se
  276. } else {
  277. if(name.check(phy,data)=="OK" & length(data)==length(SE)) {
  278. names(SE)=td$phy$tip.label
  279. warning("ordering of values in SE assumed to be in perfect correspondence with phy$tip.label")
  280. } else {
  281. stop("SE must be a named vector of measurement error")
  282. }
  283. }
  284. } else {
  285. SE=rep(0,length(td$phy$tip.label))
  286. names(SE)=td$phy$tip.label
  287. }
  288. return(list(ape.tre=td$phy, orig.dat=td$data[,1], SE=SE))
  289. }