knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

if(!requireNamespace("ggtree")) {
  message("Building the vignette requires ggtree R-package. Trying to install.")
  status.ggtree <- try({
    if (!requireNamespace("BiocManager", quietly = TRUE))
      install.packages("BiocManager")
    BiocManager::install("ggtree", version = "3.8")
  }, silent = TRUE)
  if(class(status.ggtree == "try-error")) {
    stop(
      "The ggtree installation did not succeed. The vignette cannot be built.")
  }
}

useCacheGlobalModels <- TRUE
useCacheMGPMTrueTypeMapping <- TRUE
useCacheMGPMTrueTypeMappingCheat <- TRUE

Note: The writing of this vignette is currently in progress. If needed, please, contact the author for assistance. Thanks for your understanding.

Using the PCMFit R-package comprises several steps:

  1. Preparing the data, to which we wish to fit phylogenetic comparative model (PCM);
  2. Deciding on the type of PCM we wish to infer;
  3. Defining the PCM;
  4. Calling a PCMFit model inference function on the data and model type we have defined;
  5. Evaluating the inferred model.

Before we start, let's load the packages needed for this tutorial:

library(PCMBase)
library(PCMFit)
library(data.table)
library(ggplot2)
library(ggtree)
library(cowplot)

# make results reproducible
set.seed(4, kind = "Mersenne-Twister", normal.kind = "Inversion")

In the next sections, we go through each of the above steps:

Step 1. An example dataset

For this tutorial, we will use a simulated non-ultrametric tree of $N=80$ tips and simulated $k=2$-variate trait values. Both of these are stored in the data-table dtSimulated included with the PCMFit package.

The tree

tree <- PCMTree(PCMFitDemoObjects$dtSimulated$tree[[1]])
tree

The data

X <- PCMFitDemoObjects$dtSimulated$X[[1]][, seq_len(PCMTreeNumTips(tree))]
dim(X)

We use the PCMBase::PCMTreePlot() and PCMBase::PCMPlotTraitData2D() functions to generate plots of the tree and the data (note that PCMBase::PCMTreePlot() requires the ggtree R-package which is currently not on CRAN - see the ggtree home page for instructions how to install this package):

plTree <- PCMTreePlot(tree, layout="fan") +
  geom_tiplab2(size = 2) + 
  geom_nodelab(size = 2, color = "black") + 
  geom_treescale(width = max(PCMTreeNodeTimes(tree)), x = 0, linesize = .25, fontsize = 2, offset = 79)

plX <- PCMPlotTraitData2D(
  X[, seq_len(PCMTreeNumTips(tree))], 
  tree, 
  scaleSizeWithTime = FALSE,
  numTimeFacets = 4) +
  geom_text(
    aes(x = x, y = y, label = id, color = regime), 
    size=2, 
    position = position_jitter(.4, .4)) +
  theme_bw() +
  theme(legend.position = "bottom")

cowplot::plot_grid(plTree, plX, labels = LETTERS[1:2], nrow=2, rel_heights = c(2,1))

Step 2. Gaussian model types

In this tutorial, we use the six model types $BM_{A}$, $BM_{B}$, $OU_{C}$, $OU_{D}$, $OU_{E}$, $OU_{F}$ from the $\mathcal{G}_{LInv}$-family as defined in [@Mitov:2019agh]. These model types are also described in the PCMBase Getting started guide:

# scroll to the right in the following listing to see the aliases for the six 
# default model types:
PCMDefaultModelTypes()

Our purpose for this tutorial will be to select an optimal model for the data. We will compare "global" models in which one model of one of the above model types is fit to the entire tree and data, versus a mixed Gaussian phylogenetic model where different models of the above model types are assigned to different parts of the tree.

Step 3. Creating model objects

Global models

A global $BM_{B}$ model

modelBM <- PCM(
  PCMDefaultModelTypes()["B"], modelTypes = PCMDefaultModelTypes(), k = 2)

modelBM

A global $OU_{F}$ model

modelOU <- PCM(
  PCMDefaultModelTypes()["F"], modelTypes = PCMDefaultModelTypes(), k = 2)

modelOU

MGPM models

A MGPM with the true shift-point configuration and model type mapping

modelTrueTypeMapping <- MixedGaussian(
  k = 2,
  modelTypes = MGPMDefaultModelTypes(),
  mapping = c(4, 3, 2),
  X0 = structure(
    0, class = c("VectorParameter",
                 "_Global"), description = "trait values at the root"), 
  Sigmae_x = structure(
    0, class = c("MatrixParameter", "_Omitted",
                 "_Global"),
    description =
      "Upper triangular Choleski factor of the non-phylogenetic variance-covariance"))

treeWithTrueShifts <- PCMTree(PCMFitDemoObjects$dtSimulated$tree[[1]])
PCMTreeSetPartRegimes(
  treeWithTrueShifts, 
  part.regime = c(`81` = 1, `105` = 2, `125` = 3), 
  setPartition = TRUE)

A MGPM with the true shift-point configuration, model type mapping and parameter values

modelTrue <- PCMFitDemoObjects$dtSimulated$model[[1]]
modelTrue

# We specify the tree and trait values for the true model in order to easily 
# calculate parameter count likelihood and AIC for it:
attr(modelTrue, "tree") <- treeWithTrueShifts
attr(modelTrue, "X") <- X
attr(modelTrue, "SE") <- X * 0.0

A view of the data according to the true shift-point configuration

plTree <- PCMTreePlot(treeWithTrueShifts, layout="fan") %<+% 
  data.table(
    node = c(12, 77, 45), 
    part.model = c(" 1.D ", " 2.C ", " 3.B "),
    offset = 5) + 
  geom_tiplab2(size = 2) + 
  geom_tiplab2(aes(label = part.model), offset = 16) + 
  geom_nodelab(size = 2, color = "black") + 
  geom_treescale(
    width = max(PCMTreeNodeTimes(treeWithTrueShifts)), x = 0, 
    linesize = .25, fontsize = 2, offset = 79)

plX <- PCMPlotTraitData2D(
  X[, seq_len(PCMTreeNumTips(treeWithTrueShifts))], 
  treeWithTrueShifts, 
  scaleSizeWithTime = FALSE,
  numTimeFacets = 4) +
  geom_text(
    aes(x = x, y = y, label = id, color = regime), 
    size=2, 
    position = position_jitter(.4, .4)) +
  theme_bw() +
  theme(legend.position = "bottom")

cowplot::plot_grid(plTree, plX, labels = LETTERS[1:2], nrow = 2, rel_heights = c(2,1))

Step 4. Model inference

Specifying parameter limits

An optional but important preliminary step is to explicitly specify the limits for the model parameters. This is needed, because the default settings might not be appropriate for the data in question. Here is an example how the model parameter limits can be set. Note that we specify these limits for the base "BM" and "OU" model types, but they are inherited by their subtypes A, ..., F, as specified in the comments.

## File: DefineParameterLimits.R
# lower limits for models A and B
PCMParamLowerLimit.BM <- function(o, k, R, ...) {
  o <- NextMethod()
  k <- attr(o, "k", exact = TRUE)
  R <- length(attr(o, "regimes", exact = TRUE))

  if(is.Global(o$Sigma_x)) {
    if(!is.Diagonal(o$Sigma_x)) {
      o$Sigma_x[1, 2] <- -.0
    }
  } else {
    if(!is.Diagonal(o$Sigma_x)) {
      for(r in seq_len(R)) {
        o$Sigma_x[1, 2, r] <- -.0
      }
    }
  }
  o
}

# upper limits for models A and B
PCMParamUpperLimit.BM <- function(o, k, R, ...) {
  o <- NextMethod()
  k <- attr(o, "k", exact = TRUE)
  R <- length(attr(o, "regimes", exact = TRUE))

  if(is.Global(o$Sigma_x)) {
    o$Sigma_x[1, 1] <- o$Sigma_x[2, 2] <- 1.0
    if(!is.Diagonal(o$Sigma_x)) {
      o$Sigma_x[1, 2] <- 1.0
    }
  } else {
    for(r in seq_len(R)) {
      o$Sigma_x[1, 1, r] <- o$Sigma_x[2, 2, r] <- 1.0
      if(!is.Diagonal(o$Sigma_x)) {
        o$Sigma_x[1, 2, r] <- 1.0
      }
    }
  }
  o
}

# lower limits for models C, ..., F.
PCMParamLowerLimit.OU <- function(o, k, R, ...) {
  o <- NextMethod()
  k <- attr(o, "k", exact = TRUE)
  R <- length(attr(o, "regimes", exact = TRUE))

  if(is.Global(o$Theta)) {
    o$Theta[1] <- 0.0
    o$Theta[2] <- -1.2
  } else {
    for(r in seq_len(R)) {
      o$Theta[1, r] <- 0.0
      o$Theta[2, r] <- -1.2
    }
  }
  if(is.Global(o$Sigma_x)) {
    if(!is.Diagonal(o$Sigma_x)) {
      o$Sigma_x[1, 2] <- -.0
    }
  } else {
    if(!is.Diagonal(o$Sigma_x)) {
      for(r in seq_len(R)) {
        o$Sigma_x[1, 2, r] <- -.0
      }
    }
  }
  o
}

# upper limits for models C, ..., F.
PCMParamUpperLimit.OU <- function(o, k, R, ...) {
  o <- NextMethod()
  k <- attr(o, "k", exact = TRUE)
  R <- length(attr(o, "regimes", exact = TRUE))

  if(is.Global(o$Theta)) {
    o$Theta[1] <- 7.8
    o$Theta[2] <- 4.2
  } else {
    for(r in seq_len(R)) {
      o$Theta[1, r] <- 7.8
      o$Theta[2, r] <- 4.2
    }
  }
  if(is.Global(o$Sigma_x)) {
    o$Sigma_x[1, 1] <- o$Sigma_x[2, 2] <- 1.0
    if(!is.Diagonal(o$Sigma_x)) {
      o$Sigma_x[1, 2] <- 1.0
    }
  } else {
    for(r in seq_len(R)) {
      o$Sigma_x[1, 1, r] <- o$Sigma_x[2, 2, r] <- 1.0
      if(!is.Diagonal(o$Sigma_x)) {
        o$Sigma_x[1, 2, r] <- 1.0
      }
    }
  }
  o
}

Fitting the global models

if(is.null(PCMFitDemoObjects$fitBM) || !useCacheGlobalModels) {
  fitBM <- PCMFit(model = modelBM, tree = tree, X = X,
                  metaI = PCMBaseCpp::PCMInfoCpp)
  PCMFitDemoObjects$fitBM <- fitBM
  usethis::use_data(PCMFitDemoObjects, overwrite = TRUE)
} 

if(is.null(PCMFitDemoObjects$fitOU) || !useCacheGlobalModels) {
  fitOU <- PCMFit(model = modelOU, tree = tree, X = X, 
                  metaI = PCMBaseCpp::PCMInfoCpp)
  PCMFitDemoObjects$fitOU <- fitOU
  usethis::use_data(PCMFitDemoObjects, overwrite = TRUE)
} 

fitBM <- PCMFitDemoObjects$fitBM
fitOU <- PCMFitDemoObjects$fitOU
fitBM <- PCMFit(model = modelBM, tree = tree, X = X, 
                metaI = PCMBaseCpp::PCMInfoCpp)
fitOU <- PCMFit(model = modelOU, tree = tree, X = X, 
                metaI = PCMBaseCpp::PCMInfoCpp)

Fitting an MGPM model with known shift-point configuration and model type mapping but unknown model parameters

if(is.null(PCMFitDemoObjects$fitMGPMTrueTypeMapping) || !useCacheMGPMTrueTypeMapping) {
  fitMGPMTrueTypeMapping <- PCMFit(
    model = modelTrueTypeMapping, 
    tree = treeWithTrueShifts,
    X = X,
    metaI = PCMBaseCpp::PCMInfoCpp)

  PCMFitDemoObjects$fitMGPMTrueTypeMapping <- fitMGPMTrueTypeMapping
  usethis::use_data(PCMFitDemoObjects, overwrite = TRUE)
}

fitMGPMTrueTypeMapping <- PCMFitDemoObjects$fitMGPMTrueTypeMapping
# This can take about 5 minutes to finish
fitMGPMTrueTypeMapping <- PCMFit(
  model = modelTrueTypeMapping, 
  tree = treeWithTrueShifts, 
  X = X,
  metaI = PCMBaseCpp::PCMInfoCpp)

Fitting an MGPM model with known shift-point configuration and model type mapping starting from the true parameter values

if(is.null(PCMFitDemoObjects$fitMGPMTrueTypeMappingCheat) || !useCacheMGPMTrueTypeMappingCheat) {
  fitMGPMTrueTypeMappingCheat <- PCMFit(
    model = modelTrueTypeMapping, 
    tree = treeWithTrueShifts, 
    X = X, 
    matParInit = matrix(PCMParamGetShortVector(modelTrue), 1L),
    numRunifInitVecParams = 1000L,
    numGuessInitVecParams = 100L,
    metaI = PCMBaseCpp::PCMInfoCpp)

  PCMFitDemoObjects$fitMGPMTrueTypeMappingCheat <- fitMGPMTrueTypeMappingCheat
  usethis::use_data(PCMFitDemoObjects, overwrite = TRUE)
}

fitMGPMTrueTypeMappingCheat <- PCMFitDemoObjects$fitMGPMTrueTypeMappingCheat
fitMGPMTrueTypeMappingCheat <- PCMFit(
  model = modelTrueTypeMapping, 
  tree = treeWithTrueShifts, 
  X = X,
  matParInit = matrix(PCMParamGetShortVector(modelTrue), 1L),
  numRunifInitVecParams = 1000L,
  numGuessInitVecParams = 100L,
  metaI = PCMBaseCpp::PCMInfoCpp)

Step 5. Evaluating the model fits

listModels <- list(
  RetrieveBestModel(fitBM), 
  RetrieveBestModel(fitOU),
  RetrieveBestModel(fitMGPMTrueTypeMapping), 
  RetrieveBestModel(fitMGPMTrueTypeMappingCheat), 
  modelTrue)

dtSummary <- data.table(
  model = c(
    "Global BM", 
    "Global OU", 
    "True MGPM, unknown parameters", 
    "True MGPM, known true parameters", 
    "True MGPM, true parameters"),
  p = sapply(listModels, PCMParamCount),
  logLik = sapply(listModels, logLik), 
  AIC = sapply(listModels, AIC))
knitr::kable(dtSummary)

References



venelin/PCMFit documentation built on June 7, 2021, 12:14 p.m.