R/simulatePhyloPoissonLogNormal.R

Defines functions prune_tree_one_obs nEffRatio nEffPhylolm nEffSchur nEffNaive trim_tree generateLengthsPhylo get_model_factor scale_variance_process simulateDataPhylo NB_to_PLN get_poisson_log_normal_parameters add_replicates checkSpecies checkParamVector checkParamMatrix simulatePhyloPoissonLogNormal

Documented in add_replicates checkParamMatrix checkParamVector checkSpecies generateLengthsPhylo get_model_factor get_poisson_log_normal_parameters NB_to_PLN nEffNaive nEffRatio scale_variance_process simulateDataPhylo simulatePhyloPoissonLogNormal

#' @title Simulate Tree Structured Counts
#'
#' @description
#' Simulate a tree structured matrix of counts according to a Poisson-lognormal
#' model, with the log parameter of the poisson following a Brownian Motion (BM)
#' on the tree with noise.
#'
#' @details
#' For each gene, the log-lambda parameter evolves like a BM on the tree,
#' with an extra independent variance noise that can depend on the species.
#' Each gene has its own tree variance for the BM.
#' Each gene and each species has its own mean.
#' The counts for each gene and each species are then obtained as a Poisson draw
#' with a different lambda parameter, as generated by the BM.
#' 
#' @param tree A phylogenetic tree with n tips.
#' @param log_means a matrix with the number of genes p rows and the number of
#' species n columns. Column names should match the tree taxa names.
#' @param log_variance_phylo a vector of length p of phylogenetic variances for 
#' the BM in the log space for each gene.
#' @param log_variance_sample a matrix of size p x n of environmental variances
#' for individual variations in the log space, for each gene and species.
#' Column names should match the tree taxa names.
#' 
#' @return A list, with:
#' \describe{
#' \item{log_lambda}{the p x n matrix of log-lambda simulated by the BM on the tree.}
#' \item{counts}{the p x n matrix of counts with corresponding Poisson draws.}
#' }
#' 
#' @keywords internal
#' 
simulatePhyloPoissonLogNormal <- function(tree, log_means, log_variance_phylo, log_variance_sample, model.process = "BM", selection.strength = 0) {
  
  ## Check for packages
  if (!requireNamespace("ape", quietly = TRUE) || !requireNamespace("phylolm", quietly = TRUE)) {
    stop("Packages 'ape' and 'phylolm' are needed for tree simulations.", call. = FALSE)
  }
  
  ## Check means
  log_means <- checkParamMatrix(log_means, "log means", tree)
  log_variance_sample <- checkParamMatrix(log_variance_sample, "log variance sample", tree)
  
  N <- length(tree$tip.label)
  P <- nrow(log_means)
  
  ## Check Dimension
  if (nrow(log_variance_sample) != P || length(log_variance_phylo) != P) {
    stop("`log_means` and `log_variance_sample should have as many rows as the length of `log_variance_phylo`.")
  }
  
  ## Phylogenetic simulation of log(lambda)
  resids <- phylolm::rTrait(n = P,
                            phy = tree,
                            model = model.process,
                            parameters = list(sigma2 = 1, ancestral.state = 0, optimal.value = 0, alpha = selection.strength),
                            plot.tree = FALSE)
  
  log_sd_phylo <- scale_variance_process(log_variance_phylo, tree, model.process, selection.strength)
  
  ## phylo variances
  resids <- resids * log_sd_phylo
  
  ## resid variances
  resids <- t(resids) + matrix(rnorm(N * P, mean = 0, sd = sqrt(log_variance_sample)), ncol = N)
  
  ## Expectations
  log_lambda <- resids + log_means
  
  ## Poisson
  counts <- matrix(rpois(N * P, exp(log_lambda)), ncol = N)
  
  ## Names
  colnames(counts) <- colnames(log_lambda) <- tree$tip.label
  
  return(list(log_lambda = log_lambda,
              counts = counts))
}

#' @title Check Matrix Parameter
#'
#' @description
#' Check that the parameters are compatible with the tree. Throws an error if not.
#' 
#' @param x matrix of parameters being tested.
#' @param name name of the parameter.
#' @param tree A phylogenetic tree with n tips.
#' 
#' @keywords internal
#' 
checkParamMatrix <- function(x, name, tree) {
  N <- length(tree$tip.label)
  
  if (ncol(x) != N) {
    stop(paste0("`", name, "` should have as many columns as the number of taxa in the tree."))
  }
  if (is.null(tree$tip.label)) stop("The tips of the phylogeny are not named.")
  if (is.null(colnames(x))){
    warning(paste0("Columns of `", name, "` are not named. I'm naming them, assuming they are in the same order as the tree."))
    colnames(x) <- tree$tip.label
  } else {
    if (!all(tree$tip.label == colnames(x))){
      correspondances <- match(tree$tip.label, colnames(x))
      if (length(na.omit(unique(correspondances))) != length(tree$tip.label)){
        stop(paste0("`", name, "` names do not match the tip labels."))
      }
      warning(paste0("`", name, "` was not sorted in the correct order, when compared with the tips label. I am re-ordering it."))
      x <- x[, correspondances, drop = FALSE]
    }
  }
  return(x)
}

#' @title Check Vector Parameter
#'
#' @description
#' Check that the parameters are compatible with the tree. Throws an error if not.
#' 
#' @param x vector of parameters being tested.
#' @param name name of the parameter.
#' @param tree A phylogenetic tree with n tips.
#' 
#' @keywords internal
#' 
checkParamVector <- function(x, name, tree) {
  N <- length(tree$tip.label)
  
  if (length(x) != N) {
    stop(paste0("`", name, "` should have the same length as the number of taxa in the tree."))
  }
  if (is.null(tree$tip.label)) stop("The tips of the phylogeny are not named.")
  if (is.null(names(x))){
    warning(paste0("`", name, "` is not named. I'm naming them, assuming they are in the same order as the tree."))
    names(x) <- tree$tip.label
  } else {
    if (!all(tree$tip.label == names(x))){
      correspondances <- match(tree$tip.label, names(x))
      if (length(na.omit(unique(correspondances))) != length(tree$tip.label)){
        stop(paste0("`", name, "` names do not match the tip labels."))
      }
      warning(paste0("`", name, "` was not sorted in the correct order, when compared with the tips label. I am re-ordering it."))
      x <- x[correspondances]
    }
  }
  return(x)
}

#' @title Check Species
#'
#' @description
#' Check that the parameters are compatible with the tree. Throws an error if not.
#' 
#' @inheritParams checkParamVector
#' 
#' @keywords internal
#' 
checkSpecies <- function(x, name, tree, tol, check.id.species) {
  x <- checkParamVector(x, name, tree)
  if (check.id.species) {
    related_matrices <- sapply(x, function(i) i == x)
    if (!all(related_matrices == (ape::cophenetic.phylo(tree) <= tol))) {
      stop("The provided species do not match with the tree branch lengths. Please check the 'id.species' vector.")
    }
  }
  return(x)
}

#' @title Add replicates to a tree
#'
#' @description
#' Utility function to add replicates to a tree, as tips with zero length branches.
#' 
#' @param tree A phylogenetic tree with n tips.
#' @param r the number of replicates too add at each species.
#' 
#' @return A phylogenetic tree with n * r tips, and clusters of tips with zero branch lengths.
#' 
#' @keywords internal
#' 
add_replicates <- function(tree, r) {
  if (!requireNamespace("phytools", quietly = TRUE)) {
    stop("Package 'phytools' is needed for function 'add_replicates'.", call. = FALSE)
  }
  tree_rep <- tree
  # Add replicates
  for (tip_label in tree$tip.label) {
    for (rep in seq_len(r)) {
      tree_rep <- phytools::bind.tip(tree_rep, tip.label = paste0(tip_label, "_", rep),
                                     where = which(tree_rep$tip.label == tip_label))
    }
  }
  # Remove original tips
  tree_rep <- ape::drop.tip(tree_rep, tree$tip.label)
  return(tree_rep)
}

#' @title Compute log means and variances
#'
#' @description 
#' From the parameters of a negative binomial (count_means and count_dispersions),
#' compute the parameters of a phylogenetic Poisson log-normal with the
#' same expectations and variances.
#' 
#' @param count_means a matrix with the number of genes p rows and the number of
#' species n columns. Column names should match the tree taxa names.
#' @param count_dispersions a matrix of size p x n, for each gene and species.
#' Column names should match the tree taxa names.
#' 
#' @return A list, with:
#' \describe{
#' \item{log_means}{the p x n matrix of log-means for Poisson-lognormal simulations.}
#' \item{log_variance_phylo}{the p vector of phylogenetic log-variances for Poisson-lognormal simulations.}
#' \item{log_variance_sample}{the p x n matrix of environmental log-variances for Poisson-lognormal simulations.}
#' }
#' 
#' @keywords internal
#' 
get_poisson_log_normal_parameters <- function(count_means, count_dispersions, prop.var.tree) {
  N <- ncol(count_means)
  P <- nrow(count_means)
  
  ## Check Dimension
  if (nrow(count_dispersions) != P || ncol(count_dispersions) != N) {
    stop("`count_means` and `count_dispersions` should have the same dimensions.")
  }
  
  ## Check Proportion
  if (!is.vector(prop.var.tree)) {
    stop("`prop.var.tree` should be a vector.")
  }
  if (length(prop.var.tree) != P) {
    if (length(prop.var.tree) != 1) stop("`prop.var.tree` should be a vector of length the number of genes, or a scalar (in which case it will be recycled).")
  }
  if (any(prop.var.tree > 1.0 | prop.var.tree < 0.0)) {
    stop("`prop.var.tree` should be between 0 and 1.")
  }
  
  ## Parameters of the PLN
  params_PLN <- NB_to_PLN(count_means, count_dispersions)
  log_means <- params_PLN$log_means_pln
  log_variances_all <- params_PLN$log_variances_pln
  
  ## Phylo variances
  ## Check for packages
  if (!requireNamespace("matrixStats", quietly = TRUE)) {
    log_variance_phylo <- apply(log_variances_all, 1, min) * prop.var.tree
  } else {
    log_variance_phylo <- matrixStats::rowMins(log_variances_all) * prop.var.tree
  }
  
  ## Sample variances
  log_variance_sample <- log_variances_all - log_variance_phylo %*% t(rep(1, N))
  
  return(list(log_means = log_means,
              log_variance_phylo = log_variance_phylo,
              log_variance_sample = log_variance_sample))
}

#' @title Negative Binomial to Poisson Log-Normal
#'
#' @description 
#' From the parameters of a negative binomial (mean and dispersion),
#' compute the parameters of a Poisson log-normal with the
#' same expectation and variance.
#' 
#' @param mean mean of the negative binomial.
#' @param dispersion dispersion of the negative binomial.
#' 
#' @return A list, with:
#' \describe{
#' \item{log_means_pln}{Mean of the Poisson log normal in the log space.}
#' \item{log_variances_pln}{Variance of the Poisson log normal in the log space.}
#' }
#' 
#' @keywords internal
#' 
NB_to_PLN <- function(mean, dispersion) {
  ## Variance in log space
  log_variances_pln <- log(1 + dispersion)
  ## Mean in log space
  log_means_pln <- log(mean) - 0.5 * log_variances_pln
  
  return(list(log_means_pln = log_means_pln,
              log_variances_pln = log_variances_pln))
}

#' @title Simulate the Data using the tree
#'
#' @description 
#' Use the Phylogenetic Poisson Log Normal model to simulate the data.
#' 
#' @inheritParams generateSyntheticData 
#' @inheritParams simulateData
#' 
#' @return Z a matrix with the data
#' 
#' @keywords internal
#' 
simulateDataPhylo <- function(count_means,
                              count_dispersions,
                              tree, prop.var.tree,
                              model.process = "BM", selection.strength = 0) {
  ### Initialize data matrix
  colnames(count_means) <- colnames(count_dispersions) <- tree$tip.label
  
  ### Get Equivalent Phylogenetic Poisson log-normal parameters
  params_poisson_lognormal <- get_poisson_log_normal_parameters(count_means, count_dispersions, prop.var.tree)
  
  ### Simulate Data
  res_sim <- simulatePhyloPoissonLogNormal(tree,
                                           params_poisson_lognormal$log_means,
                                           params_poisson_lognormal$log_variance_phylo,
                                           params_poisson_lognormal$log_variance_sample,
                                           model.process = model.process,
                                           selection.strength = selection.strength)
  return(res_sim$counts)
}

#' @title Scale the variances
#'
#' @description 
#' Scale the variances of the process simulation so that they are equal to log_variance_phylo.
#' 
#' @inheritParams generateSyntheticData 
#' @inheritParams simulateData
#' 
#' @return A matrix N * P of factors to multiply the simulated phylogenetic residuals.
#' 
#' @keywords internal
#' 
scale_variance_process <- function(log_variance_phylo, tree, model.process, selection.strength) {
  fac <- get_model_factor(model.process, selection.strength, tree)
  return(fac %*% t(sqrt(log_variance_phylo)))
}

#' @title Get the scaling factor
#'
#' @description 
#' Get the scaling factors.
#' 
#' @inheritParams generateSyntheticData 
#' 
#' @return A vector N of factors.
#' 
#' @keywords internal
#' 
get_model_factor <- function(model.process, selection.strength, tree) {
  heights <- ape::node.depth.edgelength(tree)[1:length(tree$tip.label)]
  if (model.process == "BM" || selection.strength == 0) {
    return(sqrt(1 / heights))
  } else if (model.process == "OU") {
    return(sqrt(1 / expm1(-2 * selection.strength * heights) * (-2 * selection.strength)))
  } else {
    stop("Process not yet implemented.")
  }
}

#' @title Simulate a length matrix with a phylo model
#'
#' @description 
#' Simulate a length matrix of size n.vars times n.sample, with the length of
#' each gene in each sample. 
#' 
#' @param tree The phylogeneti tree.
#' @param id.species An n.sample vector, indicating the species of each sample.
#' @param lengths.relmeans A vector of mean values to use in the simulation of
#' lengths from the Negative Binomial distribution.
#' @param lengths.dispersions A vector or matrix of dispersions to use in the
#' simulation of data from the Negative Binomial distribution.
#' @param lengths.lambda A vector of heritability parameters to use in the
#' simulation of data from the lambda model.
#' 
#' @return A matrix of the same size as 'length_matrix', with normalization
#' factors to be applied for each sample and each gene.
#' 
#' @keywords internal
#'
generateLengthsPhylo <- function(tree, id.species, lengths.relmeans, lengths.dispersions) {
  tree_sp <- trim_tree(tree, id.species)
  ntaxa <- length(unique(id.species))
  params_simus <- getNegativeBinomialParameters(n.vars = length(lengths.relmeans),
                                                S1 = 1:ntaxa, prob.S1 = lengths.relmeans, sum.S1 = 1.0, truedispersions.S1 = lengths.dispersions, nfact_length.S1 = matrix(NA, 0, 0),
                                                S2 = NULL, prob.S2 = 0, sum.S2 = 0, truedispersions.S2 = 0, nfact_length.S2 = matrix(NA, 0, 0),
                                                seq.depths = rep(1.0, ntaxa))
  lengths_unique <- simulateDataPhylo(params_simus$count_means,
                                      params_simus$count_dispersions,
                                      tree = tree_sp, prop.var.tree = 1.0,
                                      model.process = "BM", selection.strength = 0) 
  length_matrix <- lengths_unique[, id.species]
  return(length_matrix)
}

trim_tree <- function(tree, id.species) {
  tips_to_keep <- tree$tip.label[!duplicated(id.species)]
  tree_sp <- ape::keep.tip(tree, tips_to_keep)
  tree_sp$tip.label <- as.character(unique(id.species))
  return(tree_sp)
}

#' @title Effective Sample Size
#'
#' @description 
#' Sample size so that the variance of the estimator is sigma^2 / nEff.
#' 
#' @param tree A phylogenetic tree. If \code{NULL}, samples are assumed to be iid.
#' @param id.condition A named vector giving the state of each tip (sample).
#' @param model The trait evolution model. One of "BM" or "OU".
#' @param selection.strength If \code{model="OU"}, the selection strength parameter.
#' 
#' @return The effective sample size.
#' 
#' @keywords internal
#' 
nEffNaive <- function(tree, id.condition, model, selection.strength) {
  if (length(unique(id.condition)) != 2) stop("There should be only two conditions")
  if (is.null(tree)) {
    n <- length(id.condition)
    id_cond_uni <- id.condition
    Vinv <- diag(1, n)
  } else {
    # Check vector order
    id.condition <- checkParamVector(id.condition, "id.condition", tree)
    # tree
    tree_uni <- prune_tree_one_obs(tree)
    id_cond_uni <- id.condition[tree_uni$tip.label]
    n <- length(tree_uni$tip.label)
    if (model == "OU") model <- "OUfixedRoot"
    V <- ape::vcv(phylolm::transf.branch.lengths(tree_uni, model = model, list(alpha = selection.strength))$tree)
    Vinv <- try(solve(V))
    if (inherits(Vinv, 'try-error')) Vinv <- ginv(V)
  }
  # Intercept
  X <- rep(1, n)
  # Model matrix
  Z <- stats::model.matrix(~factor(id_cond_uni))[, -1, drop = FALSE]
  # Variance of the condition
  X <- cbind(X, Z)
  nEff <- 1 / solve(t(X) %*% Vinv %*% X)[2, 2]
  return(nEff)
}

nEffSchur <- function(tree, id.condition, model, selection.strength) {
  if (length(unique(id.condition)) != 2) stop("There should be only two conditions")
  if (is.null(tree)) {
    n <- length(id.condition)
    id_cond_uni <- id.condition
    Vinv <- diag(1, n)
  } else {
    # Check vector order
    id.condition <- checkParamVector(id.condition, "id.condition", tree)
    # tree
    tree_uni <- prune_tree_one_obs(tree)
    id_cond_uni <- id.condition[tree_uni$tip.label]
    n <- length(tree_uni$tip.label)
    if (model == "OU") model <- "OUfixedRoot"
    V <- ape::vcv(phylolm::transf.branch.lengths(tree_uni, model = model, list(alpha = selection.strength))$tree)
    Vinv <- try(solve(V))
    if (inherits(Vinv, 'try-error')) Vinv <- ginv(V)
  }
  # Intercept
  X <- rep(1, n)
  # Model matrix
  Z <- stats::model.matrix(~factor(id_cond_uni))[, -1, drop = FALSE]
  # Variance of the condition
  X <- cbind(X, Z)
  M <- t(X) %*% Vinv %*% X
  nEff <- det(M) / M[1,1]
  return(nEff)
}

nEffPhylolm <- function(tree, id.condition, model, selection.strength) {
  if (length(unique(id.condition)) != 2) stop("There should be only two conditions")
  if (is.null(tree)) return(nEffNaive(tree, id.condition, model, selection.strength))
  # Check vector order
  id.condition <- checkParamVector(id.condition, "id.condition", tree)
  # tree
  tree_uni <- prune_tree_one_obs(tree)
  id_cond_uni <- id.condition[tree_uni$tip.label]
  n <- length(tree_uni$tip.label)
  # phylolm
  e1 <- rnorm(n)
  names(e1) <- tree_uni$tip.label
  if (model == "OU") model <- "OUfixedRoot"
  tree_trans <- phylolm::transf.branch.lengths(tree_uni, model = model, list(alpha = selection.strength))$tree
  phyfit <- phylolm::phylolm(e1 ~ 1 + as.factor(id_cond_uni), phy = tree_trans)
  nEff <- 1 / (phyfit$vcov[2, 2] / phyfit$sigma2 / n * (n - 2))
  return(nEff)
}

#' @title Effective Sample Size Ratio
#'
#' @description 
#' Ratio between the tree sample size and the sample size of the equivalent problem
#' with independent measures. A result larger than one indicates a problem that is
#' made "easier" by the tree structure. Note that it strongly depends on the 
#' tip conditions (see examples).
#' 
#' @param tree A phylogenetic tree. If \code{NULL}, samples are assumed to be iid.
#' @param id.condition A named vector giving the state of each tip (sample).
#' @param model The trait evolution model. One of "BM" or "OU".
#' @param selection.strength If \code{model="OU"}, the selection strength parameter.
#' 
#' @return The ratio of sample sizes.
#' 
#' @examples 
#' set.seed(1289)
#' ## Ballanced tree
#' ntips <- 2^5
#' tree <- ape::compute.brlen(ape::stree(ntips, "balanced"))
#' ## Alt cond : nEff greater than 1
#' id_cond <- rep(rep(0:1, each = 2), ntips / 4)
#' names(id_cond) <- tree$tip.label
#' plot(tree); ape::tiplabels(pch = 21, col = id_cond, bg = id_cond)
#' compcodeR:::nEffRatio(tree, id_cond, "BM", 0)
#' ## Bloc cond : nEff smaller than 1
#' id_cond <- rep(0:1, each = ntips / 2)
#' names(id_cond) <- tree$tip.label
#' plot(tree); ape::tiplabels(pch = 21, col = id_cond, bg = id_cond)
#' compcodeR:::nEffRatio(tree, id_cond, "BM", 0)
#' 
#' @keywords internal
#' 
nEffRatio <- function(tree, id.condition, model, selection.strength) {
  if (is.null(tree)) {
    n <- length(id.condition)
  } else {
    n <- length(prune_tree_one_obs(tree)$tip.label)
  }
  return(nEffNaive(tree, id.condition, model, selection.strength) / (n / 4))
}

prune_tree_one_obs <- function(tree) {
  C <- ape::cophenetic.phylo(tree)
  C <- apply(C, c(1, 2), function(x) isTRUE(all.equal(x, 0)))
  duplicated_species <- colnames(C)[duplicated(C)]
  tree_unique <- ape::drop.tip(tree, duplicated_species)
  return(tree_unique)
}
csoneson/compcodeR documentation built on Oct. 25, 2023, 1:28 a.m.