knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

if(!requireNamespace("ggtree")) {
  message("Building the vignette requires ggtree R-package. Trying to install.")
  status.ggtree <- try({
    if (!requireNamespace("BiocManager", quietly = TRUE))
      install.packages("BiocManager")
    BiocManager::install("ggtree", version = "3.8")
  }, silent = TRUE)
  if(class(status.ggtree == "try-error")) {
    stop(
      "The ggtree installation did not succeed. The vignette cannot be built.")
  }
}

useCacheMGPM_A_F_BC2_RR <- TRUE

Note: The writing of this vignette is currently in progress. If needed, please, contact the author for assistance. Thanks for your understanding.

The main R-script

Loading needed packages and parsing command line arguments

library(PCMBase)
library(PCMBaseCpp)
library(PCMFit)
library(data.table)
# other needed packages, e.g. ape, data.table etc...

# A character string used in filenames for a model inference on a given data:
prefixFiles = paste0("MGPM_A_F_BC2_RR")

Setting up a parallel cluster

Using MPI

# creating the cluster for this PCMFit run:
if(!exists("cluster") || is.null(cluster)) {
  if(require(doMPI)) {
    # using MPI cluster as distributed node cluster (possibly running on a 
    # cluster of multiple nodes)
    # Get the number of cores. Assume this is run in a batch job.
    p = strtoi(Sys.getenv('LSB_DJOB_NUMPROC'))
    cluster <- startMPIcluster(count = p-1, verbose = TRUE)
    doMPI::registerDoMPI(cluster)
  } else {
    warning("Could not create an MPI cluster")
  }
}

Using a personal computer without mpi installation

if(!exists("cluster") || is.null(cluster)) {
  if(require(parallel)) {
    # possibly running on personal computer without mpi installation
    cluster <- parallel::makeCluster(
      parallel::detectCores(logical = TRUE),
      outfile = paste0("log_", prefixFiles, ".txt"))
    doParallel::registerDoParallel(cluster)
  } else {
    warning("Could not create a parallel cluster")
  }
}

Generating PCMModels on the worker nodes

# This function is going to be executed on each worker node.
generatePCMModelsFunction <- function() {
  # make results reproducible
  set.seed(4, kind = "Mersenne-Twister", normal.kind = "Inversion")

  PCMGenerateModelTypes()
  # An example DefineParameterLimits.R file can be found in 
  # vignettes/DefineParameterLimits.R of the PCMFit package source-code. 
  # Note that the path here is relative to the working directory of
  # the worker node R-process. 
  source('DefineParameterLimits.R', local=FALSE)
}

Tree and trait data

tree <- PCMTree(PCMFitDemoObjects$dtSimulated$tree[[1]])
X <- PCMFitDemoObjects$dtSimulated$X[[1]][, seq_len(PCMTreeNumTips(tree))]

Running the MGPM Inference

currentResultFile <- paste0("Current_", prefixFiles, ".RData")
if(file.exists(currentResultFile)) {
  load(currentResultFile)
  tableFitsPrev <- listResults$tableFits
} else {
  tableFitsPrev <- NULL
}

if(is.null(PCMFitDemoObjects$fitMGPM_A_F_BC2_RR) || 
   !useCacheMGPM_A_F_BC2_RR) {
  fitMGPM_A_F_BC2_RR <- PCMFitMixed(
    X = X, tree = tree, metaIFun = PCMInfoCpp,
    tableFitsPrev = tableFitsPrev,
    generatePCMModelsFun = generatePCMModelsFunction, 
    maxNumRoundRobins = 2, maxNumPartitionsInRoundRobins = 2,
    printFitVectorsToConsole = TRUE,
    doParallel = TRUE)

  PCMFitDemoObjects$fitMGPM_A_F_BC2_RR <- fitMGPM_A_F_BC2_RR
  usethis::use_data(PCMFitDemoObjects, overwrite = TRUE)
} 

fitMGPM_A_F_BC2_RR <- PCMFitDemoObjects$fitMGPM_A_F_BC2_RR
currentResultFile <- paste0("Current_", prefixFiles, ".RData")
if(file.exists(currentResultFile)) {
  load(currentResultFile)
  tableFitsPrev <- listResults$tableFits
} else {
  tableFitsPrev <- NULL
}

fitMGPM_A_F_BC2_RR <- PCMFitMixed(
    X = X, tree = tree, metaIFun = PCMInfoCpp,
    generatePCMModelsFun = generatePCMModelsFunction, 
    maxNumRoundRobins = 2, maxNumPartitionsInRoundRobins = 2,
    tableFitsPrev = tableFitsPrev,
    prefixFiles = prefixFiles,
    doParallel = TRUE)

Retrieving and analyzing the best scoring model fit

modelTrue <- PCMFitDemoObjects$dtSimulated$model[[1]]

# We specify the tree and trait values for the true model in order to easily 
# calculate parameter count likelihood and AIC for it:
attr(modelTrue, "tree") <- PCMFitDemoObjects$dtSimulated$treeWithRegimes[[1]]
attr(modelTrue, "X") <- X
attr(modelTrue, "SE") <- X * 0.0
bestFit <- RetrieveBestFitScore(fitMGPM_A_F_BC2_RR)

modelTrue <- PCMFitDemoObjects$dtSimulated$model[[1]]
# We specify the tree and trait values for the true model in order to easily 
# calculate parameter count likelihood and AIC for it:
attr(modelTrue, "tree") <- PCMFitDemoObjects$dtSimulated$treeWithRegimes[[1]]
attr(modelTrue, "X") <- X
attr(modelTrue, "SE") <- X * 0.0

listModels <- list(
  RetrieveBestModel(PCMFitDemoObjects$fitBM), 
  RetrieveBestModel(PCMFitDemoObjects$fitOU),
  RetrieveBestModel(PCMFitDemoObjects$fitMGPMTrueTypeMapping), 
  RetrieveBestModel(PCMFitDemoObjects$fitMGPMTrueTypeMappingCheat), 
  modelTrue,
  bestFit$inferredModel)

dtSummary <- data.table(
  model = c(
    "Global BM", 
    "Global OU", 
    "True MGPM, known shifts, unknown parameters", 
    "True MGPM, known shifts, starting from known true parameters (cheating)", 
    "True MGPM, fixed true shifts and true parameters",
    "Inferred MGPM, unknown shifts and parameters"),
  p = sapply(listModels, PCMParamCount),
  logLik = sapply(listModels, logLik), 
  AIC = sapply(listModels, AIC))
knitr::kable(dtSummary)
library(ggtree)
library(ggplot2)

TipLabelTable <- function(model, ...) {
  tree <- PCMTree(attr(model, "tree"))
  data.table(
    node = sapply(sapply(PCMRegimes(tree), 
                         PCMTreeGetTipsInRegime, 
                         tree = tree), sample, size = 1),
    part.model = 
      paste0(" ", PCMRegimes(tree), ".", 
             LETTERS[PCMMapModelTypesToRegimes(model, tree)], " "), 
    ...)
}

plTree <- PCMTreePlot(attr(modelTrue, "tree"), layout="fan") %<+% 
  TipLabelTable(modelTrue, offset = 5) + 
  geom_tiplab2(size = 2) + 
  geom_tiplab2(aes(label = part.model), offset = 16) + 
  geom_nodelab(size = 2, color = "black") + 
  geom_treescale(
    width = max(PCMTreeNodeTimes(attr(modelTrue, "tree"))), x = 0, 
    linesize = .25, fontsize = 2, offset = 79)

plX <- PCMPlotTraitData2D(
  X[, seq_len(PCMTreeNumTips(attr(modelTrue, "tree")))], 
  attr(modelTrue, "tree"), 
  scaleSizeWithTime = FALSE,
  numTimeFacets = 4, nrowTimeFacets = 2, ncolTimeFacets = 2) +
  geom_text(
    aes(x = x, y = y, label = id, color = regime), 
    size=2, 
    position = position_jitter(.4, .4)) +
  theme_bw() +
  theme(legend.position = "bottom")

plTreeInferred <- 
  PCMTreePlot(attr(bestFit$inferredModel, "tree"), layout="fan") %<+% 
  TipLabelTable(bestFit$inferredModel, offset = 5) +
  geom_tiplab2(size = 2) + 
  geom_tiplab2(aes(label = part.model), offset = 16) + 
  geom_nodelab(size = 2, color = "black") + 
  geom_treescale(
    width = max(PCMTreeNodeTimes(attr(bestFit$inferredModel, "tree"))), x = 0, 
    linesize = .25, fontsize = 2, offset = 79)

plXInferred <- PCMPlotTraitData2D(
  X[, seq_len(PCMTreeNumTips(attr(bestFit$inferredModel, "tree")))], 
  attr(bestFit$inferredModel, "tree"), 
  scaleSizeWithTime = FALSE,
  numTimeFacets = 4, nrowTimeFacets = 2, ncolTimeFacets = 2) +
  geom_text(
    aes(x = x, y = y, label = id, color = regime), 
    size=2, 
    position = position_jitter(.4, .4)) +
  theme_bw() +
  theme(legend.position = "bottom")

cowplot::plot_grid(
  plTree, plX, plTreeInferred, plXInferred, labels = LETTERS[1:4], nrow = 2)

References



venelin/PCMFit documentation built on June 7, 2021, 12:14 p.m.