R/utils.R
In ddtlcm: Latent Class Analysis with Dirichlet Diffusion Tree Process Prior

Documented in add_leaf_branch add_multichotomous_tip add_root A_t_inv_one A_t_inv_two a_t_one a_t_one_cum a_t_two a_t_two_cum compute_IC create_leaf_cor_matrix div_time draw_mnorm expit exp_normalize log_expit logit quiet WAIC

#' Compute divergence function
#' @description Compute value, cumulative hazard, and inverse for divergence function \eqn{a(t) = c / (1-t)}
#' @describeIn a_t_one value of the divergence function
#' @param c a positive number for the divergence hyperparameter. A larger value implies
#'  earlier divergence on the tree
#' @param t a number in the interval (0, 1) indicating the divergence time
#' @family divergence functions
#' @return The value and cumulative hazard return a positive number. The inverse function returns a number in the interval (0, 1).
#' @examples
#' a_t_one(1, 0.5)
#' @export
a_t_one <- function(c, t) c/(1-t)

#' @describeIn a_t_one cumulative hazard function
#' @examples
#' a_t_one_cum(1, 0.5)
#' @export
a_t_one_cum <- function(c, t) -c * log(1-t)

#' @param y a positive number to take inverse
#' @describeIn a_t_one inverse function
#' @examples
#' A_t_inv_one(1, 2)
#' @export
A_t_inv_one <- function(c, y) 1.0 - exp(- y/c)

#' Compute divergence function
#' @description Compute value, cumulative hazard, and inverse for divergence function \eqn{a(t) = c / (1-t)^2}
#' @describeIn a_t_two value of the divergence function
#' @param c a positive number for the divergence hyperparameter. A larger value implies
#'  earlier divergence on the tree
#' @param t a number in the interval (0, 1) indicating the divergence time
#' @family divergence functions
#' @return The value and cumulative hazard return a positive number. The inverse function returns a number in the interval (0, 1).
#' @examples
#' a_t_two(1, 0.5)
#' @export
a_t_two <- function(c, t) c/(1-t)^2

#' @describeIn a_t_two cumulative hazard function
#' @examples
#' a_t_two_cum(1, 0.5)
#' @export
a_t_two_cum <- function(c, t) -c + c / (1.0-t)

#' @param y a positive number to take inverse
#' @describeIn a_t_two inverse function
#' @examples
#' A_t_inv_two(1, 2)
#' @export
A_t_inv_two <- function(c, y) y / (c+y)

#' Sample divergence time on an edge uv previously traversed by m(v) data points
#' @param t_u a number in the interval (0, 1) indicating the divergence time at node u
#' @param m_v an integer for the number of data points traversed through node v
#' @param c a positive number for the divergence hyperparameter. A larger value implies
#'  earlier divergence on the tree
#' @param c_order equals 1 if using divergence function \eqn{a(t) = c / (1-t)}, or 2 if
#'  a(t) = c / (1-t)^2. Default is 1
#' @param alpha,theta hyparameter of branching probability a(t) Gamma(m-alpha) / Gamma(m+1+theta)
#'    For DDT, alpha = theta = 0. For general multifurcating tree from a Pitman-Yor process,
#'    specify positive values to alpha and theta. It is, however, recommended using alpha =
#'    theta = 0 in inference because multifurcating trees have not been tested rigorously.
#' @return a number in the interval (0, 1)
div_time <- function(t_u, m_v, c, c_order = 1, alpha = 0, theta = 0){
  u <- runif(1)
  if (c_order == 1){
    x = a_t_one_cum(c, t_u) - exp( lgamma(m_v+1.0+theta) - lgamma(m_v-alpha) ) * log(1-u)
    return (A_t_inv_one(c, x))
  } else if (c_order == 2){
    x = a_t_two_cum(c, t_u) - exp( lgamma(m_v+1.0+theta) - lgamma(m_v-alpha) ) * log(1-u);
    return (A_t_inv_two(c, x))
  } else {
    stop("c_order must take value 1 or 2.")
  }
}


#' Add a leaf branch to an existing tree tree_old
#' @param tree_old the original "phylo" tree (with K leaves) to which the leaf branch will be added
#' @param div_t divergence time of the new branch
#' @param new_leaf_label the label of the newly added leaf
#' @param where node name of to which node in the existing tree the new leaf branch should connect to
#' @param position the numerical location of the left side of the added branch
#' @return a "phylo" tree with K+1 leaves
add_leaf_branch <- function(tree_old, div_t, new_leaf_label, where, position){
  leaf_branch <- list(edge = matrix(c(2,1), nrow = 1),
                      tip.label = new_leaf_label,
                      edge.length = 1.0 - div_t, Nnode=1)
  class(leaf_branch) <- "phylo"
  tree_new <- bind.tree(tree_old, leaf_branch,
                        # where = as.numeric(substr(names(where), 2, nchar(names(where)))),
                        where = where,
                        position = position)
  return(tree_new)
}

#' Add a leaf branch to an existing tree tree_old to make a multichotomus branch
#' @param tree_old the original "phylo" tree (with K leaves) to which the leaf branch will be added
#' @param div_t divergence time of the new branch
#' @param new_leaf_label the label of the newly added leaf
#' @param where node name of to which node in the existing tree the new leaf branch should connect to
#' @return a "phylo" tree with K+1 leaves that could possibly be multichotomus
add_multichotomous_tip <- function(tree_old, div_t, new_leaf_label, where){
  branch <- list(edge = matrix(c(2,1), nrow = 1),
                 tip.label = new_leaf_label,
                 edge.length = 1 - div_t,
                 Nnode = 1)
  class(branch) <- "phylo"
  tree_new <- bind.tree(tree_old, branch,
                        where = where
                        # where = as.numeric(substr(names(where), 2, nchar(names(where))))
                        )
  return(tree_new)
}



#' Add a singular root node to an existing nonsingular tree
#' @param tree_old the original nonsingular "phylo" tree
#' @param root_edge_length a number in (0, 1) representing the distance
#'  between the new and the original root nodes
#' @param root_label a character label of the new root node
#' @param leaf_label a character label of the leaf node
#' @return a singular "phylo" tree
add_root <- function(tree_old, root_edge_length, root_label, leaf_label){
  leaf_branch <- list(edge = matrix(c(2,1), nrow = 1),
                      node.label = root_label,
                      tip.label = leaf_label,
                      edge.length = root_edge_length,
                      Nnode = 1)
  class(leaf_branch) <- "phylo"
  tree_new <- ape::bind.tree(leaf_branch, tree_old, where=1)
  return(tree_new)
}


#' Compute normalized probabilities: exp(x_i) / sum_j exp(x_j)
#' @param x a number or real-valued vector
#' @return a number or real-valued vector
exp_normalize <- function(x){
  return(exp(x - logSumExp(x)))
}


#' Efficiently sample multivariate normal using precision matrix
#'  from \eqn{x ~ N(Q^{-1}a, Q^{-1})}, where \eqn{Q^{-1}} is the precision matrix
#' @param precision_mat precision matrix Q of the multivariate normal distribution
#' @param precision_a_vec a vector a such that the mean of the multivariate normal distribution is
#'  \eqn{Q^{-1}a}
draw_mnorm <- function(precision_mat, precision_a_vec){
  U <- chol(precision_mat)
  b <- rnorm(nrow(precision_mat))
  backsolve(U, backsolve(U, precision_a_vec, transpose = TRUE) + b)
}


#' The expit function
#' @description The expit function: f(x) = exp(x) / (1+exp(x)), computed
#'  in a way to avoid numerical underflow.
#' @param x a value or a numeric vector between 0 and 1 (exclusive)
#' @return a number or real-valued vector
#' @export
#' @examples
#' expit(0.2)
#' expit(c(-1, -0.3, 0.6))
expit <- function(x){
  out <- x
  positives <- which(x >= 0)
  if (length(positives) > 0){
    # print("1,")
    out[positives] <- 1 / (1 + exp(-x[positives]))
    # return(1 / (1 + exp(-x)))
  }
  if (length(positives) < length(x)) {
    # print("2,")
    negatives <- which(x < 0)
    out[negatives] <- exp(x[negatives]) / (1 + exp(x[negatives]))
    # return(exp(x) / (1 + exp(x)))
  }
  return(out)
}


#' The logistic function
#' @description The logit function: f(x) = log(x / (1/x)). Large absolute values of
#'  x will be truncated to +/- 5 after logit transformation according to its sign.
#' @param x a value or a numeric vector between 0 and 1 (exclusive)
#' @return a number or real-valued vector
#' @export
#' @examples
#' logit(0.2)
#' logit(c(0.2, 0.6, 0.95))
logit <- function(x){
  y <- log(x / (1-x))
  large <- which(abs(y) > 5)
  if (length(large) > 0){
    y[large] <- sign(y[large]) * 5
  }
  return(y)
}


#' Numerically accurately compute f(x) = log(x / (1/x)).
#' @param x a value or a numeric vector between 0 and 1 (exclusive)
#' @return a number or real-valued vector
log_expit <- function(x){
  out <- x
  idx <- which(x < -33.3)
  if (length(idx) > 0){
    out[idx] <- x[idx]
  }
  idx <- which(x >= -33 & x < -18)
  if (length(idx) > 0){
    out[idx] <- x[idx] - exp(x[idx])
  }
  idx <- which((x >= -18) & (x < 37))
  if (length(idx) > 0){
    out[idx] <- -log1p(exp(-x[idx]))
  }
  return(out)
}


#' Suppress print from cat()
#' @param x evaluation of a statement that may explicitly or implicitly involve cat()
#' @param be_quiet logical. TRUE to suppress print from cat(); FALSE to continue printing
quiet <- function(x, be_quiet=TRUE) {
  if (be_quiet){
    sink(tempfile())
    on.exit(sink())
    invisible(force(x))
  }else{
    x
  }
}



#' Create a tree-structured covariance matrix from a given tree
#' @description Retrieve the covariance matrix of leaf nodes of a DDT tree
#' @param tree_phylo4d a "phylo4d" object
#' @return a K by K covariance matrix
#' @export
#' @examples
#'# load the MAP tree structure obtained from the real HCHS/SOL data
#'data(data_synthetic)
#'# extract elements into the global environment
#'list2env(setNames(data_synthetic, names(data_synthetic)), envir = globalenv())
#'create_leaf_cor_matrix(tree_with_parameter)
create_leaf_cor_matrix <- function(tree_phylo4d){
  K <- nTips(tree_phylo4d)
  # K <- nNodes(tree_phylo4d)
  root_node <- names(rootNode(tree_phylo4d))

  # create combinations of two leaf nodes including identical combinations
  # leaf_grid <- expand.grid(leaf_nodes, leaf_nodes)
  leaf_grid <- expand.grid(1:K, 1:K)
  leaf_grid <- leaf_grid[leaf_grid$Var1 >= leaf_grid$Var2, ]
  # add "v" to match leaf node labels
  leaf_grid[] <- data.frame(lapply(leaf_grid, function(x) paste0("v", x)))
  ## we will construct a row covariance matrix for the matrix normal
  # get the MRCA of each pair of leaf nodes
  mrca_nodes <- tree_phylo4d@label[
    as.character(apply(leaf_grid, 1, function(x) phylobase::MRCA(tree_phylo4d, x))) ]

  calculate_row_var_by_group <- function(node){
    # internal ancestors of the MRCA
    ancestors_internal <- c(names(shortestPath(tree_phylo4d, root_node, node)), node)
    # parents of the internal ancestors
    pa_ancestors_internal <- names(ancestor(tree_phylo4d, ancestors_internal))
    # get divergence times of all ancestors of the MRCA
    branch_lengths <- nodeHeight(tree_phylo4d, c(root_node, ancestors_internal), "root")
    # branch_lengths <- diff(c(0, branch_lengths))
    branch_lengths <- diff(branch_lengths)
    names(branch_lengths) <- ancestors_internal

    # compute row variance of the matrix normal distribution of leaf nodes
    row_var_by_group <- sum(branch_lengths)
    return(row_var_by_group)
  }
  unique_mrca_nodes <- unique(mrca_nodes)
  mrca_var_by_group <- unlist(lapply(tree_phylo4d@label, calculate_row_var_by_group))
  # mrca_var_by_group <- matrix(unlist(mrca_var_by_group), ncol = G, nrow = 2*K-1, byrow = TRUE)
  names(mrca_var_by_group) <- tree_phylo4d@label
  row_var_by_group <- mrca_var_by_group[mrca_nodes]

  ## now construct a pairwise distance matrix between leaf nodes
  tree_Sigma_hclust <- matrix(NA, nrow = K, ncol = K)
  # create distance matrix, equivalently, row covariance matrix
  tree_Sigma_hclust[lower.tri(tree_Sigma_hclust,diag = TRUE)] <- row_var_by_group
  tree_Sigma_hclust[upper.tri(tree_Sigma_hclust)] <- t(tree_Sigma_hclust)[upper.tri(tree_Sigma_hclust)]
  rownames(tree_Sigma_hclust) <- colnames(tree_Sigma_hclust) <- paste0("v", 1:K)

  return(tree_Sigma_hclust)
}






#' Compute information criteria for the DDT-LCM model
#' @description Compute information criteria for the DDT-LCM model, including the Widely Applicable
#'  Information Criterion (WAIC), and Deviance Information Criterion (DIC). WAIC and DIC are computed
#'  using two different methods described in Gelman, Hwang, and Vehtari (2013), one based on (1) posterior
#'  means and the other based on (2) posterior variances.
#' @param result a "ddt_lcm" object
#' @param burnin an integer specifying the number of burn-in iterations from MCMC chain
#' @param ncores an integer specifying the number of cores to compute marginal posterior log-likelihood
#'  in parallel
#' @importFrom parallel mclapply
#' @return a named list of the following elements
#' \describe{
#' \item{`WAIC_result`}{a list of WAIC-related results computed using the two methods}
#' \item{`DIC1`}{DIC computed using method 1.}
#' \item{`DIC2`}{DIC computed using method 2.}
#' }
#' @export
#' @examples
#' data(result_diet_1000iters)
#' IC_result <- compute_IC(result = result_diet_1000iters, burnin = 800, ncores = 1L)
compute_IC <- function(result, burnin = 5000, ncores = 1L){
  # require(parallel)
  num_samples <- length(result$loglikelihood[-(1:burnin)]) - 1
  dat <- result$data
  N <- nrow(result$data)
  K <- result$setting$K

  ### calculate WAIC
  compute_posteriorllk_matrix <- function(iter){
    # print(iter)
    # leaf_data <- t(matrix(result$response_probs_samples[iter, ,], nrow = K))
    leaf_data <- t(result$response_probs_samples[iter, ,])
    # marginal loglikelihood of LCM, integrated over Z
    class_probability <- log(result$class_probs_samples[, iter])
    logllk_lcm_sample <- rep(0, N)
    for (i in 1:N) {
      logllk_lcm_sample[i] <- logSumExp(class_probability + colSums(dat[i,] * log(leaf_data) + (1 - dat[i,]) * log(1 - leaf_data)))
    }
    return(logllk_lcm_sample)
  }
  llk_matrix <- mclapply(1:num_samples+burnin, function(x) compute_posteriorllk_matrix(x), mc.cores = ncores) #num_samples
  llk_matrix <- matrix(unlist(llk_matrix), nrow = N, ncol = num_samples, byrow = TRUE)
  WAIC_result <- WAIC(llk_matrix)


  ### calculate DIC
  s <- summary(result, burnin)
  leaf_data <- t(matrix(s$response_probs_summary[,'Mean'], nrow = K))
  class_probability <- s$class_probs_summary[,"Mean"]
  logllk_lcm_sample_thetahat <- 0
  for (i in 1:N) {
    logllk_lcm_sample_thetahat <- logllk_lcm_sample_thetahat +
      logSumExp(class_probability + colSums(dat[i,] * log(leaf_data) + (1 - dat[i,]) * log(1 - leaf_data)))
  }
  logllk_allindividuals <- colSums(llk_matrix) #apply(llk_matrix, 2, logSumExp)
  p_DIC_1 <- 2 * (logllk_lcm_sample_thetahat - mean(logllk_allindividuals))
  p_DIC_2 <- 2 * var(logllk_allindividuals)

  DIC1 <- -2 * (logllk_lcm_sample_thetahat + p_DIC_1)
  DIC2 <- -2 * (logllk_lcm_sample_thetahat + p_DIC_2)

  return(list(WAIC_result = WAIC_result, DIC1 = DIC1, DIC2 = DIC2))
}


#' Compute WAIC
#' @description Compute the Widely Applicable Information Criterion (WAIC), also known
#'   as the Widely Available Information Criterion or the Watanable-Akaike, of Watanabe (2010).
#' @param llk_matrix a N x S matrix, where N is the number of individuals and S is the number of posterior samples
#' @importFrom matrixStats rowVars
#' @return a named list
WAIC <- function(llk_matrix) {
  # require(matrixStats)
  #  log pointwise predictive density
  S <- ncol(llk_matrix)
  part <- apply(llk_matrix, 1, logSumExp)
  lppd <- - nrow(llk_matrix) * log(S) + sum(part)
  pWAIC1 <- 2 * (lppd - sum(llk_matrix) / S)
  pWAIC2 <- sum(rowVars(llk_matrix))
  elppd_WAIC1 <- -2*lppd + 2*pWAIC1
  elppd_WAIC2 <- -2*lppd + 2*pWAIC2
  return(list(WAIC1=elppd_WAIC1, WAIC2=elppd_WAIC2, lppd=lppd, pWAIC2=pWAIC2, pWAIC1=pWAIC1))
}

Any scripts or data that you put into this service are public.

ddtlcm documentation built on May 29, 2024, 5:41 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ddtlcm
Latent Class Analysis with Dirichlet Diffusion Tree Process Prior

R/utils.R
In ddtlcm: Latent Class Analysis with Dirichlet Diffusion Tree Process Prior

Defines functions WAIC compute_IC create_leaf_cor_matrix quiet log_expit logit expit draw_mnorm exp_normalize add_root add_multichotomous_tip add_leaf_branch div_time A_t_inv_two a_t_two_cum a_t_two A_t_inv_one a_t_one_cum a_t_one

Documented in add_leaf_branch add_multichotomous_tip add_root A_t_inv_one A_t_inv_two a_t_one a_t_one_cum a_t_two a_t_two_cum compute_IC create_leaf_cor_matrix div_time draw_mnorm expit exp_normalize log_expit logit quiet WAIC

Try the ddtlcm package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ddtlcm Latent Class Analysis with Dirichlet Diffusion Tree Process Prior

R/utils.R In ddtlcm: Latent Class Analysis with Dirichlet Diffusion Tree Process Prior

Defines functions WAIC compute_IC create_leaf_cor_matrix quiet log_expit logit expit draw_mnorm exp_normalize add_root add_multichotomous_tip add_leaf_branch div_time A_t_inv_two a_t_two_cum a_t_two A_t_inv_one a_t_one_cum a_t_one

Documented in add_leaf_branch add_multichotomous_tip add_root A_t_inv_one A_t_inv_two a_t_one a_t_one_cum a_t_two a_t_two_cum compute_IC create_leaf_cor_matrix div_time draw_mnorm expit exp_normalize log_expit logit quiet WAIC

Try the ddtlcm package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ddtlcm
Latent Class Analysis with Dirichlet Diffusion Tree Process Prior

R/utils.R
In ddtlcm: Latent Class Analysis with Dirichlet Diffusion Tree Process Prior