PageRenderTime 63ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/geiger/R/congruify.R

https://bitbucket.org/jonathaneastman/geiger
R | 389 lines | 270 code | 56 blank | 63 comment | 34 complexity | a994b24a62f009cd2acc80ba3ba693bc MD5 | raw file
  1. .build_classification=function(species){
  2. .split_lab=function(label){
  3. lab=gsub(".", "_", label, fixed=TRUE)
  4. lab=gsub(" ", "_", lab, fixed=TRUE)
  5. lab=unlist(strsplit(lab, "_", fixed=TRUE))
  6. lab=lab[lab!=""]
  7. lab
  8. }
  9. data.frame(genus=sapply(species, function(s) .split_lab(s)[1]), species=species, stringsAsFactors=FALSE)
  10. }
  11. .build_calibrations=function(dat, scion, scion_desc=NULL, tol=0){
  12. # dat: branching times from stock; rows are 1:(Ntip(stock)+Nnode(stock))
  13. # time hash
  14. # 1001 352.234677 3a4adb7cc0d4a51b9012dfb5615b3d71
  15. # 1002 243.269677 33757769ee61bde8dd5574ae35b47053
  16. # scion: phylo tree with 'hash' object -- to be scaled from stock
  17. fetch_spanning=function(phy, nd, desc){
  18. # desc: a list from 1:(Ntip(phy)+Nnode(phy)) of tips descended from 'nd'
  19. if(nd<=Ntip(phy)) return(NULL)
  20. dd=.get.desc.of.node(nd,phy)[1:2]
  21. tt=sapply(dd, function(x) return(desc[[x]][1]))
  22. return(phy$tip.label[sort(tt)])
  23. }
  24. if(is.null(scion_desc)) scion_desc=.cache.descendants(scion)$tips
  25. N=Ntip(scion)
  26. stock_times=dat
  27. scion_hash=scion$hash
  28. df=data.frame(MRCA=scion_hash[(N+1):length(scion_hash)], MaxAge=NA, MinAge=NA, taxonA=NA, taxonB=NA, valid=FALSE, stringsAsFactors=FALSE)
  29. for(i in 1:nrow(df)){
  30. if(!is.na(hash.cur<-df$MRCA[i])){
  31. if(hash.cur%in%stock_times$hash){
  32. node.idx=i+N
  33. df[i,c("MaxAge","MinAge")]<-age.idx<-stock_times$time[match(hash.cur, stock_times$hash)]
  34. df[i,c("taxonA","taxonB")]<-taxa.idx<-fetch_spanning(scion, node.idx, scion_desc)
  35. if(age.idx>tol & all(!is.na(taxa.idx))) df[i,"valid"]=TRUE
  36. }
  37. }
  38. }
  39. df=df[df$valid,]
  40. return(df[,-which(names(df)=="valid")])
  41. }
  42. congruify.phylo=function(reference, target, taxonomy=NULL, tol=0, scale=c(NA, "PATHd8")){
  43. ## adding requirement for ncbit
  44. require(ncbit)
  45. stock=reference
  46. scion=target
  47. # stock: a time-calibrated phylogeny with tip-labels that can be treated as an exemplar for clades in 'scion'
  48. # -- e.g., tip.label in 'stock' might be "Pinus" while in 'scion' we might have "Pinus_cembra"
  49. # -- tips in 'stock' can be contained in 'scion' (FIXME: is this true?)
  50. # megaphylogeny: a rooted phylogeny that is to be time-scaled based on 'stock'
  51. # taxonomy: linkage table between tipsets for 'stock' and 'scion'; if empty, one is attempted to be built by 'scion' labels
  52. # -- if NULL, 'stock' tips must correspond to tips in 'scion'... e.g., A, B, C in 'stock'; A_sp1, B_sp2, C_sp3 in 'scion'
  53. # -- rownames of taxonomy must be tips in megaphylogeny
  54. ## functions
  55. method=match.arg(unname(sapply(scale, toString)), c("NA", "PATHd8"))
  56. hashes.mapping <- function (phy, taxa, mapping){
  57. ## GENERAL FUNCTION: find hash tag for every edge in tree (using phy$tip.label or predefined set of 'taxa')
  58. # returns list of hash tags from node 1:(Nnode(phy)+Ntip(phy))
  59. # taxa: set of species used to create hash keys
  60. # mapping: named list -- names are tips in 'phy'; elements are tips represented by each tip in 'phy' (and should also be present in 'taxa')
  61. mapping=mapping[names(mapping)%in%phy$tip.label]
  62. if(is.null(taxa)) stop("Must supply 'tips'.")
  63. if(!all(names(mapping)%in%phy$tip.label)) stop("'mapping' must be named list with names in tip labels of 'phy'.")
  64. mapping=mapping[match(names(mapping), phy$tip.label)]
  65. descendants <- .cache.descendants(phy)$tips
  66. hashes <- sapply(descendants, function(desc) .hash.tip(unlist(mapping[desc]), taxa))
  67. empty=.md5(integer(length(taxa)))
  68. hashes[hashes==empty]=NA
  69. phy$hash=hashes
  70. phy=.uniquify_hashes(phy)
  71. return(phy)
  72. }
  73. times.mapping=function(phy, taxa, mapping){
  74. # mapping: named list -- names are tips in 'phy'; elements are tips represented by each tip in 'phy' (and should also be present in 'taxa')
  75. # taxa: species that are represented by tips in 'stock'
  76. # find hash tags for stock 'phy'
  77. stock=hashes.mapping(phy, taxa, mapping)
  78. tmp=heights.phylo(stock)
  79. tmp$hash=stock$hash
  80. dat=data.frame(time=tmp[,"end"], hash=stock$hash, stringsAsFactors=FALSE)
  81. dat$hash[1:Ntip(stock)]=NA # destroy keys that are associated with tips
  82. return(list(stock=stock,dat=dat))
  83. }
  84. smooth_scion=function(stock, scion, scion_desc, taxa, spp, tol=0.01, method=c("PATHd8", NA)){
  85. method=match.arg(toString(method), c("NA", "PATHd8"))
  86. if(!is.ultrametric(stock)) warning("Supplied 'stock' is non-ultrametric.")
  87. stock_tmp=times.mapping(stock, taxa, spp)
  88. stock=stock_tmp$stock
  89. stock_dat=stock_tmp$dat
  90. calibration=.build_calibrations(stock_dat, scion, scion_desc, tol=tol)
  91. if(!nrow(calibration)) {
  92. warning("No concordant branches reconciled between 'stock' and 'scion'; ensure that 'tax' involves rownames found as tip labels in 'scion'")
  93. return(NA)
  94. }
  95. if(method=="PATHd8") {
  96. phy=PATHd8.phylo(scion, calibration, base=".tmp_PATHd8", rm=FALSE)
  97. phy$hash=c(rep("", Ntip(phy)), phy$node.label)
  98. phy$hash[phy$hash==""]=NA
  99. } else if(method=="NA"){
  100. phy=NULL
  101. }
  102. stock$node.label=stock$hash[(Ntip(stock)+1):max(stock$edge)]
  103. stock$node.label[is.na(stock$node.label)]=""
  104. return(list(phy=phy, calibrations=calibration, reference=stock, target=scion))
  105. }
  106. ## end functions
  107. ## PROCESSING ##
  108. classification=taxonomy
  109. unfurl=FALSE
  110. if(class(stock)=="phylo") {
  111. stock=list(stock)
  112. unfurl=TRUE
  113. }
  114. if(is.null(classification)) {
  115. classification=as.data.frame(unique(as.matrix(.build_classification(scion$tip.label)),MARGIN=2))
  116. }
  117. tips=unique(unlist(lapply(stock, function(x) x$tip.label)))
  118. spp=lapply(tips, function(o) {
  119. x=rownames(classification)[which(classification==o, arr.ind=TRUE)[,1]]
  120. x=x[x%in%scion$tip.label]
  121. })
  122. names(spp)=tips
  123. taxa=unique(unlist(spp))
  124. scion=hashes.phylo(scion, taxa)
  125. scion_desc=.cache.descendants(scion)$tips
  126. if(is.null(scion$edge.length)) scion$edge.length=numeric(nrow(scion$edge)) ## JME 01302013
  127. f=lapply
  128. results=f(1:length(stock), function(i) {
  129. phy=stock[[i]]
  130. smooth_scion(phy, scion, scion_desc, taxa, spp, tol=tol, method=method)
  131. })
  132. if(unfurl) results=results[[1]]
  133. return(results)
  134. }
  135. ## END CONGRUIFICATION FUNCTIONS ##
  136. write.treePL=function(phy, calibrations, nsites, min=1e-4, base="", opts=list(smooth=100, nthreads=8, optad=0, opt=1, cvstart=1000, cviter=3, cvend=0.1, thorough=TRUE)){
  137. # calibrations: dataframe with minimally 'MRCA' 'MaxAge' 'MinAge' 'taxonA' and 'taxonB' from .build_calibrations
  138. # MRCA MaxAge MinAge taxonA taxonB
  139. # c65bacdf65aa29635bec90f3f0447c6e 352.234677 352.234677 Inga_chartacea Encephalartos_umbeluziensis
  140. # d4bc6557dccbd4e8e18b867979f34f8e 243.269677 243.269677 Inga_chartacea Nuphar_sagittifolia
  141. # 5e30c16531694aff7d94da3b35806677 217.632627 217.632627 Inga_chartacea Schisandra_glaucescens
  142. if(file.exists(inp<-paste(base,"infile",sep="."))) unlink(inp)
  143. if(file.exists(int<-paste(base,"intree",sep="."))) unlink(inp)
  144. poss=list(
  145. cv="numeric",
  146. collapse="boolean",
  147. checkconstraints="boolean",
  148. cvstart="numeric",
  149. cvstop="numeric",
  150. cvmultstep="numeric",
  151. verbose="boolean",
  152. lftemp="numeric",
  153. pltemp="numeric",
  154. plcool="numeric",
  155. lfstoptemp="numeric",
  156. plstoptemp="numeric",
  157. lfrtstep="numeric",
  158. plrtstep="numeric",
  159. thorough="boolean",
  160. lfiter="integer",
  161. pliter="integer",
  162. cviter="integer",
  163. ldfsimaniter="integer",
  164. plsimaniter="integer",
  165. cvsimaniter="integer",
  166. calcgrad="numeric",
  167. paramverbose="boolean",
  168. prime="boolean",
  169. opt="boolean",
  170. optad="boolean",
  171. optcvad="boolean",
  172. moredetail="boolean",
  173. moredetailad="boolean",
  174. moredetailcvad="boolean",
  175. randomcv="boolean",
  176. ftol="numeric",
  177. xtol="numeric",
  178. mapspace="boolean",
  179. nthreads="integer"
  180. )
  181. if(length(opts)==0) {
  182. print(poss)
  183. stop("No 'opts' specified")
  184. }
  185. # correct small branch lengths
  186. z=phy$edge.length[which(phy$edge.length>0)]
  187. if(any(z<min)){
  188. scl=min/min(z)
  189. phy$edge.length=phy$edge.length*scl
  190. }
  191. write.tree(phy, file=int)
  192. ## check appropriateness of constraints ##
  193. # check for 'calibrations' and 'phy' congruence
  194. # if(!is.null(phy)){
  195. # check=function(t, phy) all(t%in%phy$tip.label)
  196. # a=check(calibrations$taxonA, phy)
  197. # b=check(calibrations$taxonB, phy)
  198. # if(!all(c(a,b))) stop("Some calibrations not encountered in tree.")
  199. # }
  200. ## build r8s file
  201. # calibrations$fixage=ifelse(calibrations$MinAge==calibrations$MaxAge, TRUE, FALSE)
  202. constraints<-constraintnames<-character(nrow(calibrations))
  203. for(i in 1:nrow(calibrations)){
  204. cal=calibrations[i,]
  205. taxon=cal$MRCA
  206. desc=c(cal$taxonA, cal$taxonB)
  207. txt1=ifelse(!is.na(cal$MinAge), paste("min =", taxon, cal$MinAge, sep=" "), "")
  208. txt2=ifelse(!is.na(cal$MaxAge), paste("max =", taxon, cal$MaxAge, sep=" "), "")
  209. txt=paste(txt1,txt2,sep="\n")
  210. constraints[i]=txt
  211. constraintnames[i]=paste("mrca =", taxon, desc[1], desc[2], sep=" ")
  212. }
  213. infile=list(
  214. tree=paste("treefile = ", int, sep=""),
  215. ns=paste("numsites = ", nsites, sep=""),
  216. names=paste(unlist(constraintnames), collapse="\n"),
  217. mrca=paste(unlist(constraints), collapse="\n"),
  218. out=paste("outfile = ", paste(base, "dated", "tre", sep="."), sep=""),
  219. opt=paste(names(opts), opts, sep="=", collapse="\n")
  220. )
  221. inp=paste(base,"infile",sep=".")
  222. writeLines(paste(infile,collapse="\n\n"), con=inp)
  223. attr(inp, "method")="treePL"
  224. return(inp)
  225. }
  226. write.r8s=function(phy=NULL, calibrations, base="", blformat=c(lengths="persite", nsites=1, ultrametric="no", round="yes"), divtime=c(method="NPRS", algorithm="POWELL"), describe=c(plot="chrono_description")){
  227. # calibrations: dataframe with minimally 'MRCA' 'MaxAge' 'MinAge' 'taxonA' and 'taxonB' from .build_calibrations
  228. # MRCA MaxAge MinAge taxonA taxonB
  229. # c65bacdf65aa29635bec90f3f0447c6e 352.234677 352.234677 Inga_chartacea Encephalartos_umbeluziensis
  230. # d4bc6557dccbd4e8e18b867979f34f8e 243.269677 243.269677 Inga_chartacea Nuphar_sagittifolia
  231. # 5e30c16531694aff7d94da3b35806677 217.632627 217.632627 Inga_chartacea Schisandra_glaucescens
  232. if(file.exists(inp<-paste(base,"infile",sep="."))) unlink(inp)
  233. ## check appropriateness of constraints ##
  234. # check for 'calibrations' and 'phy' congruence
  235. # if(!is.null(phy)){
  236. # check=function(t, phy) all(t%in%phy$tip.label)
  237. # a=check(calibrations$taxonA, phy)
  238. # b=check(calibrations$taxonB, phy)
  239. # if(!all(c(a,b))) stop("Some calibrations not encountered in tree.")
  240. # }
  241. ## build r8s file
  242. # calibrations$fixage=ifelse(calibrations$MinAge==calibrations$MaxAge, TRUE, FALSE)
  243. constraints<-constraintnames<-character(nrow(calibrations))
  244. for(i in 1:nrow(calibrations)){
  245. cal=calibrations[i,]
  246. taxon=cal$MRCA
  247. desc=c(cal$taxonA, cal$taxonB)
  248. txt=paste(paste("\tfixage taxon=", taxon,sep=""), paste("age=", cal$MinAge, ";\n", sep=""), sep=" ")
  249. constraints[i]=txt
  250. constraintnames[i]=paste("\tMRCA", taxon, desc[1], paste(desc[2], ";\n", sep=""), sep=" ")
  251. }
  252. # phy$node.label=NULL
  253. infile=paste(c(
  254. "#nexus\n",
  255. "begin trees;\n",
  256. paste("tree r8s = ", write.tree(phy), "\n", sep=""),
  257. "end;\n",
  258. "begin r8s;\n",
  259. paste("\tblformat ", paste(names(blformat), blformat, collapse=" ", sep="="), ";\n", sep=""),
  260. names=paste(unlist(constraintnames), collapse=""),
  261. mrca=paste(unlist(constraints), collapse=""),
  262. "\tcollapse;\n",
  263. paste("\tdivtime ", paste(names(divtime), divtime, collapse=" ", sep="="), ";\n", sep=""),
  264. paste("\tdescribe ", paste(names(describe), describe, sep="="), ";\n", sep="", collapse=""),
  265. "end;"
  266. ),collapse="")
  267. inp=paste(base,"infile",sep=".")
  268. writeLines(paste(infile,collapse="\n\n"), con=inp)
  269. attr(inp, "method")="r8s"
  270. return(inp)
  271. }
  272. write.pathd8=function(phy, calibrations, base=""){
  273. # calibrations: dataframe with minimally 'MRCA' 'MaxAge' 'MinAge' 'taxonA' and 'taxonB'
  274. if(file.exists(inp<-paste(base,"infile",sep="."))) unlink(inp)
  275. ## check appropriateness of constraints ##
  276. # check for 'calibrations' and 'phy' congruence
  277. check=function(t, phy) all(t%in%phy$tip.label)
  278. a=check(calibrations$taxonA, phy)
  279. b=check(calibrations$taxonB, phy)
  280. if(!all(c(a,b))) stop("Some calibrations not encountered in tree.")
  281. ## build PATHd8 file
  282. calibrations$fixage=ifelse(calibrations$MinAge==calibrations$MaxAge, TRUE, FALSE)
  283. constraints<-constraintnames<-character(nrow(calibrations))
  284. for(i in 1:nrow(calibrations)){
  285. cal=calibrations[i,]
  286. taxon=cal$MRCA
  287. desc=c(cal$taxonA, cal$taxonB)
  288. if(cal$fixage) {
  289. txt=paste("mrca:", paste(desc[1], ", ", desc[2], ", ", sep=""), paste("fixage=", cal$MinAge, ";", sep=""), sep=" ")
  290. } else {
  291. txt1=paste("mrca:", paste(desc[1], ", ", desc[2], ", ", sep=""), paste("minage=", cal$MinAge, ";", sep=""), sep=" ")
  292. txt2=paste("mrca:", paste(desc[1], ", ", desc[2], ", ", sep=""), paste("maxage=", cal$MaxAge, ";", sep=""), sep=" ")
  293. txt=paste(txt1,txt2,sep="\n")
  294. }
  295. constraints[i]=txt
  296. constraintnames[i]=paste("name of mrca: ", paste(desc[1], ", ", desc[2], ", ", sep=""), paste("name=", cal$MRCA, ";", sep=""), sep=" ")
  297. }
  298. phy$node.label=NULL
  299. infile=list(tree=write.tree(phy),
  300. mrca=paste(unlist(constraints), collapse="\n"),
  301. names=paste(unlist(constraintnames), collapse="\n")
  302. )
  303. inp=paste(base,"infile",sep=".")
  304. writeLines(paste(infile,collapse="\n\n"), con=inp)
  305. attr(inp, "method")="pathd8"
  306. return(inp)
  307. }
  308. PATHd8.phylo=function(phy, calibrations=NULL, base="", rm=TRUE){
  309. # calibrations: dataframe with minimally 'MRCA' 'MaxAge' 'MinAge' 'taxonA' and 'taxonB'
  310. # -- if NULL, simple ultrametricization of 'phy' is performed
  311. phy$node.label=NULL
  312. if(!is.null(calibrations)){
  313. infile=write.pathd8(phy, calibrations, base)
  314. } else {
  315. infile=paste(base, "infile", sep=".")
  316. write.tree(phy, infile)
  317. }
  318. smooth.file=paste(base, "smoothed.tre", sep=".")
  319. parsed.outfile=paste(base, "pathd8.out", sep=".")
  320. outfile=paste(base, "pathd8.orig.out", sep=".")
  321. if(file.exists(outfile)) unlink(outfile)
  322. if(!system("which PATHd8", ignore.stdout=TRUE)==0) stop("Install 'PATHd8' before proceeding.")
  323. system(paste("PATHd8 -n", infile, "-pn >", outfile, sep=" "))
  324. system(paste("grep \"d8 tree\" ", outfile, ">", parsed.outfile, sep=" "))
  325. smoothed=read.tree(parsed.outfile)
  326. if(rm & base=="") {
  327. unlink(parsed.outfile)
  328. unlink(smooth.file)
  329. unlink(outfile)
  330. unlink(infile)
  331. }
  332. return(smoothed)
  333. }