PageRenderTime 57ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/MEDUSA/R/MEDUSA.R

https://github.com/josephwb/turboMEDUSA
R | 239 lines | 173 code | 43 blank | 23 comment | 47 complexity | bea41f965289b56bf53a34a03a0b7db2 MD5 | raw file
  1. MEDUSA <- function(phy, richness=NULL, model="mixed", modelLimit=20, stop="threshold",
  2. shiftCut="both", criterion="aicc", stepBack=TRUE, preserveModelFlavour=FALSE, epsilon=NULL, r=NULL,
  3. b=NULL, d=NULL, fixThreshold=NULL, initialR=0.05, initialE=0.5, verbose=TRUE, mc=FALSE, numCores=NULL,
  4. resolveTree=FALSE, ...)
  5. {
  6. checkValidArguments(phy, richness, model, modelLimit, stop, shiftCut, criterion, stepBack,
  7. preserveModelFlavour, epsilon, r, b, d, fixThreshold, initialR, initialE,
  8. verbose, mc, numCores, resolveTree);
  9. conf <- configureModel(model=model, epsilon=epsilon, r=r, b=b, d=d, initialR=initialR, initialE=initialE);
  10. sp <- conf$sp;
  11. model <- conf$model;
  12. fixPar <- conf$fixPar;
  13. richness <- formatRichness(richness, phy); # do this once, instead of once per tree
  14. # Determine correct AICc threshold from tree size (based on simulations)
  15. # Should be used for interpreting model-fit
  16. threshold <- getThreshold(phy, fixThreshold=fixThreshold, richness=richness, stop=stop, resolveTree= resolveTree);
  17. # Limit on number of piecewise models fitted; based on tree size, aicc correction factor,
  18. # and flavour of model fitted (i.e. # parameters estimated; birth-death or pure-birth)
  19. modelLimit <- getMaxModelLimit(richness=richness, modelLimit=modelLimit, model=model, stop=stop);
  20. # temporary work-around hack
  21. if (mc) {
  22. if (is.null(numCores)) {
  23. numCores <- parallel::detectCores();
  24. }
  25. cat("Running a multicore analysis with", numCores, "cores.\n\n");
  26. }
  27. runMEDUSA <- function (phy, richness, multiTree=FALSE, verbose, ...) { # wtf is multiTree?!?
  28. phyData <- prepareData(phy=phy, richness=richness, verbose=verbose, resolveTree);
  29. phy <- phyData$phy;
  30. richness <- phyData$richness;
  31. # Store pertinent information: branch times, richness, descendants
  32. cat("Preparing data for analysis:\n");
  33. # Keep track of all nodes, internal and pendant (for keeping track of breakpoints)
  34. pend.nodes <- seq_len(length(phy$tip.label)); # Calculate pendant splits just once, keep track through various models
  35. int.nodes <- unique(phy$edge[,1])[-1]; # Omit root node
  36. root.node <- length(phy$tip.label) + 1;
  37. all.nodes <- c(pend.nodes, root.node, int.nodes);
  38. # The important bits. Set up z, get descendants and number of tips per node
  39. obj <- makeCacheMedusa(phy=phy, richness=richness, all.nodes=all.nodes, shiftCut=shiftCut, mc=mc, numCores=numCores);
  40. desc <- list(stem=obj$desc.stem, node=obj$desc.node);
  41. z <- obj$z;
  42. num.tips <- obj$num.tips;
  43. cat("done.\n\n");
  44. # Fit the base model. This is done first as it will catch potential invalid fixed parameter values.
  45. # 'fit' holds current results; useful for initializing subsequent models
  46. baseFit <- list();
  47. baseFit <- medusaMLFitBase(z=z, sp=sp, model=model, fixPar=fixPar, criterion=criterion);
  48. if (baseFit$lnLik == -Inf && !is.null(fixPar))
  49. {stop("\n\nConstrained model cannot be fit to data with current fixed parameter value. Stopping analysis.\n\n");}
  50. # Pre-fit pendant edges so these values need not be re(re(re))calculated; amounts to ~25% of all calculations
  51. # Will show particular performance gain for edges with many fossil observations
  52. cat("Optimizing parameters for pendant edges... ");
  53. tips <- NULL;
  54. # Will always be shiftCut="stem"; if mixed model, keep only best fit and throw out other in medusaMLPrefit
  55. tips <- prefitTips(pend.nodes=pend.nodes, z=z, sp=sp, model=model, fixPar=fixPar, criterion=criterion,
  56. mc=mc, numCores=numCores);
  57. cat("done.\n");
  58. # Pre-fit virgin internal nodes; should deliver performance gain for early models, and especially for large trees
  59. # Remain useful until a spilt is accepted within the clade
  60. if (length(int.nodes) > 0) { # hmm, when would this be false?
  61. cat("Pre-calculating parameters for internal nodes... ");
  62. virgin.stem <- list(); virgin.node <- list();
  63. if (mc) {
  64. if (shiftCut == "stem" || shiftCut == "both") {
  65. virgin.stem <- parallel::mclapply(int.nodes, medusaMLPrefitStem, z=z, desc=desc$stem, sp=sp, model=model,
  66. fixPar=fixPar, criterion=criterion, mc.cores=numCores);
  67. }
  68. if (shiftCut == "node" || shiftCut == "both") {
  69. virgin.node <- parallel::mclapply(int.nodes, medusaMLPrefitNode, z=z, desc=desc$node, sp=sp, model=model,
  70. fixPar=fixPar, criterion=criterion, mc.cores=numCores);
  71. }
  72. } else {
  73. if (shiftCut == "stem" || shiftCut == "both") {
  74. virgin.stem <- lapply(int.nodes, medusaMLPrefitStem, z=z, desc=desc$stem, sp=sp, model=model,
  75. fixPar=fixPar, criterion=criterion);
  76. }
  77. if (shiftCut == "node" || shiftCut == "both") {
  78. virgin.node <- lapply(int.nodes, medusaMLPrefitNode, z=z, desc=desc$node, sp=sp, model=model,
  79. fixPar=fixPar, criterion=criterion);
  80. }
  81. }
  82. virgin.nodes <- list(stem=virgin.stem, node=virgin.node);
  83. } else {
  84. virgin.nodes <- NULL;
  85. }
  86. cat("done.\n\n");
  87. prefit <- list(tips=tips, virgin.nodes=virgin.nodes, num.tips=num.tips);
  88. stopOnLimit <- function(fit) { # not likely to be used. get rid of it?
  89. optModel <- fit;
  90. cat("Step 1 (of ", modelLimit, "): lnLik=", optModel$lnLik, "; AICc=", optModel$aicc,
  91. "; model=", optModel$model, "\n", sep="");
  92. for (i in seq_len(modelLimit-1)) {
  93. node.list <- all.nodes[-fit$split.at];
  94. if (mc) { # parallel (i.e. multithreaded) processing. No GUI, and not at all on Windows
  95. res <- parallel::mclapply(node.list, medusaMLUpdate, z=z, desc=desc, fit=fit, prefit=prefit, root.node=root.node,
  96. model=model, fixPar=fixPar, criterion=criterion, shiftCut=shiftCut, preserveModelFlavour=preserveModelFlavour, mc.cores=numCores);
  97. } else {
  98. res <- lapply(node.list, medusaMLUpdate, z=z, desc=desc, fit=fit, prefit=prefit, root.node=root.node,
  99. model=model, fixPar=fixPar, criterion=criterion, shiftCut=shiftCut, preserveModelFlavour=preserveModelFlavour);
  100. }
  101. # Select model with best score according to the specific criterion employed (default aicc)
  102. best <- res[[which.min(unlist(lapply(res, "[[", criterion)))]];
  103. node <- best$split.at;
  104. cut <- best$cut.at;
  105. # re-optimize on best break node/position
  106. fit <- medusaFitOptimal(node=node, z=z, desc=desc, fit=fit, prefit=prefit, root.node=root.node, model=model,
  107. fixPar=fixPar, criterion=criterion, shiftCut=cut, preserveModelFlavour=preserveModelFlavour);
  108. z <- medusaSplit(node=node, z=z, desc=desc, shiftCut=tail(fit$cut.at,1))$z;
  109. step <- rbind(optModel$step, c("add", tail(fit$split.at,1)));
  110. # Consider parameter removal
  111. if (stepBack) {
  112. backFit <- backStep(currentModel=fit, z=z, step=step, model=model, fixPar=fixPar, criterion=criterion);
  113. fit <- backFit$fit;
  114. z <- backFit$z;
  115. step <- backFit$step;
  116. }
  117. fit$z <- z;
  118. fit$step <- step;
  119. optModel <- fit;
  120. cat("Step ", i+1, " (of ", modelLimit, "): lnLik=", round(fit$lnLik, digits=7),
  121. "; AICc=", fit$aicc, "; shift at node ", tail(fit$split.at,1), "; model=",
  122. tail(fit$model,1), "; cut=", tail(fit$cut.at,1), "; # shifts=", length(fit$split.at) - 1, "\n", sep="");
  123. }
  124. return(optModel);
  125. }
  126. stopOnThreshold <- function (fit) {
  127. optModel <- fit;
  128. i <- 1;
  129. done <- FALSE;
  130. cat("Step 1: lnLik=", round(optModel$lnLik, digits=7), "; AICc=", round(optModel$aicc, digits=7),
  131. "; model=", optModel$model[1], "\n", sep="");
  132. while (!done && i) {
  133. node.list <- all.nodes[-fit$split.at];
  134. if (mc) # parallel (i.e. multithreaded) processing. No GUI, and not at all on Windows
  135. {
  136. res <- parallel::mclapply(node.list, medusaMLUpdate, z=z, desc=desc, fit=fit, prefit=prefit, root.node=root.node,
  137. model=model, fixPar=fixPar, criterion=criterion, shiftCut=shiftCut, preserveModelFlavour=preserveModelFlavour,
  138. mc.cores=numCores);
  139. } else {
  140. res <- lapply(node.list, medusaMLUpdate, z=z, desc=desc, fit=fit, prefit=prefit, root.node=root.node,
  141. model=model, fixPar=fixPar, criterion=criterion, shiftCut=shiftCut, preserveModelFlavour=preserveModelFlavour);
  142. }
  143. # Select model with best score according to the specific criterion employed (default aicc)
  144. best <- res[[which.min(unlist(lapply(res, "[[", criterion)))]];
  145. node <- best$split.at;
  146. cut <- best$cut.at;
  147. # re-optimize on best break node/position
  148. fit <- medusaFitOptimal(node=node, z=z, desc=desc, fit=fit, prefit=prefit, root.node=root.node, model=model,
  149. fixPar=fixPar, criterion=criterion, shiftCut=cut, preserveModelFlavour=preserveModelFlavour);
  150. z <- medusaSplit(node=node, z=z, desc=desc, shiftCut=tail(fit$cut.at,1))$z;
  151. step <- rbind(optModel$step, c("add", tail(fit$split.at,1)));
  152. # Consider parameter removal
  153. if (stepBack) {
  154. backFit <- backStep(currentModel=fit, z=z, step=step, model=model, fixPar=fixPar, criterion=criterion);
  155. fit <- backFit$fit;
  156. z <- backFit$z;
  157. step <- backFit$step;
  158. }
  159. fit$z <- z;
  160. fit$step <- step;
  161. if (as.numeric(optModel[criterion]) - as.numeric(fit[criterion]) < threshold) {
  162. if (verbose) cat("\nNo significant increase in ", criterion, " score. Disregarding subsequent piecewise models.\n", sep="");
  163. done <- TRUE;
  164. break;
  165. }
  166. if (!is.null(backFit$remove)) {printRemovedShifts(remove=backFit$remove);}
  167. optModel <- fit;
  168. cat("Step ", i+1, ": lnLik=", round(fit$lnLik, digits=7),
  169. "; AICc=", fit$aicc, "; shift at node ", tail(fit$split.at,1), "; model=",
  170. tail(fit$model,1), "; cut=", tail(fit$cut.at,1), "; # shifts=", length(fit$split.at) - 1, "\n", sep="");
  171. i <- i+1;
  172. }
  173. return(optModel);
  174. }
  175. if (stop == "modelLimit") optModel <- stopOnLimit(fit=baseFit) else optModel <- stopOnThreshold(fit=baseFit);
  176. modelSummary <- optModelSummary(optModel);
  177. results <- list(desc=desc, optModel=optModel, phy=phy, fixPar=fixPar, criterion=criterion,
  178. stop=stop, threshold=threshold, modelSummary=modelSummary);
  179. class(results) <- "medusa";
  180. if (verbose) {cat("\n"); cat("Summary of optimal MEDUSA model:\n"); print(modelSummary)};
  181. cat("\n");
  182. return(results);
  183. }
  184. if (class(phy) == "multiPhylo") {
  185. phy <- manageTipLabels(phy=phy, mc=mc, numCores=numCores);
  186. results <- lapply(phy, runMEDUSA, richness=richness, multiTree=TRUE, verbose=FALSE, ...); # prevent extraneous bits from being printed to screen
  187. results <- list(results=results, richness=richness, medusaVersion=packageVersion("MEDUSA"));
  188. class(results) <- "multiMedusa";
  189. } else {
  190. results <- runMEDUSA(phy=phy, richness=richness, verbose=verbose, ...);
  191. results$richness <- richness;
  192. # probably get rid of the following line
  193. #results$medusaVersion <- packageVersion("MEDUSA");
  194. }
  195. invisible(results);
  196. }