R/loglikehoods.R
In ddtlcm: Latent Class Analysis with Dirichlet Diffusion Tree Process Prior

Documented in H_n J_n logllk_ddt logllk_ddt_lcm logllk_div_time_one logllk_div_time_two logllk_lcm logllk_location logllk_tree_topology

#' Compute log likelihood of parameters
#' @description Compute the marginal log likelihood of the parameters on the leaves of a tree
#' @param tree_phylo4d a "phylo4d" object
#' @param Sigma_by_group a vector of diffusion variances of G groups 
#' @param item_membership_list a list of G elements, where the g-th element contains the column
#'  indices of `data` corresponding to items in major group g
#' @param dist_mat a tree-structured covariance matrix from a given tree. Default is NULL. If given
#'  a matrix, then computation of the covariance matrix will be skipped to save time. This is useful
#'  in the Metropolis-Hasting algorithm when the previous proposal is not accepted.
#' @param tol a small number to prevent underflow when computing eigenvalues
#' @return A list of two elements: a numeric loglikelihood, a covariance matrix of the input tree
#' @family likelihood functions
logllk_location <- function(tree_phylo4d, Sigma_by_group, item_membership_list, dist_mat = NULL, tol = 1e-7){
  # phylobase::phylobase.options(singleton="ok")
  # total number of items
  J <- length(unlist(item_membership_list))
  # number of major item groups
  G <- length(item_membership_list)
  # # cumulative index of item groups
  # cum_J_index <- c(0, cumsum(num_items_per_group))
  # number of samples
  K <- nTips(tree_phylo4d)
  # extract locations
  locations <- tree_phylo4d@data[as.character(1:K), paste0("x", 1:J)]
  rownames(locations) <- tipLabels(tree_phylo4d)
  locations <- locations[order(as.numeric(substring(rownames(locations),2))),]
  root_node <- names(rootNode(tree_phylo4d))
  root_child <- names(phylobase::descendants(tree_phylo4d, root_node, "child"))
  # get the location of the root node
  root_location <- unlist(tree_phylo4d@data[as.character(rootNode(tree_phylo4d)), paste0("x", 1:J)])

  ### start construction of dist_mat_list
  if (is.null(dist_mat)) {
    ## now construct a pairwise distance matrix between leaf nodes
    leaf_nodes <- as.numeric(gsub("v", "", tipLabels(tree_phylo4d)))
    # create combinations of two leaf nodes including identical combinations
    leaf_grid <- expand.grid(1:K, 1:K)
    leaf_grid <- leaf_grid[leaf_grid$Var1 >= leaf_grid$Var2, ]
    # add "v" to match leaf node labels
    leaf_grid[] <- data.frame(lapply(leaf_grid, function(x) paste0("v", x)))

    ## we will construct a row covariance matrix for the matrix normal
    # get the MRCA of each pair of leaf nodes
    mrca_nodes <- tree_phylo4d@label[
      as.character(apply(leaf_grid, 1, function(x) phylobase::MRCA(tree_phylo4d, x))) ]

    calculate_row_var_by_group <- function(node){
      # internal ancestors of the MRCA
      ancestors_internal <- c(names(shortestPath(tree_phylo4d, root_node, node)), node)
      # parents of the internal ancestors
      pa_ancestors_internal <- names(ancestor(tree_phylo4d, ancestors_internal))
      # get divergence times of all ancestors of the MRCA
      branch_lengths <- nodeHeight(tree_phylo4d, c(root_node, ancestors_internal), "root")
      branch_lengths <- diff(c(0, branch_lengths))
      names(branch_lengths) <- ancestors_internal

      # compute row variance of the matrix normal distribution of leaf nodes
      row_var_by_group <- unlist(t(sapply(seq_along(pa_ancestors_internal),
                                          function(x) branch_lengths[x])))
      # row_var_by_group <- colSums(matrix(row_var_by_group, ncol = G))
      row_var_by_group <- sum(row_var_by_group)
      return(row_var_by_group)
    }

    unique_mrca_nodes <- unique(mrca_nodes)
    mrca_var_by_group <- lapply(setdiff(tree_phylo4d@label, "u1"), calculate_row_var_by_group)
    mrca_var_by_group <- matrix(unlist(mrca_var_by_group), ncol=1)
    # mrca_var_by_group <- matrix(unlist(mrca_var_by_group), ncol = G, nrow = 2*K-1, byrow = TRUE)
    rownames(mrca_var_by_group) <- setdiff(tree_phylo4d@label, "u1")
    row_var_by_group <- mrca_var_by_group[mrca_nodes,,drop=F]

    # note that given all other parameters, the eta at leaf nodes are independent between groups
    # so we calculate the log likelihood of leaf nodes by group and then add them up
    create_dist_mat_by_group <- function(g){
      ## now construct a pairwise distance matrix between leaf nodes
      dist_mat_by_group <- matrix(NA, nrow = K, ncol = K)
      # create distance matrix, equivalently, row covariance matrix
      dist_mat_by_group[lower.tri(dist_mat_by_group,diag = TRUE)] <- row_var_by_group[,g]
      dist_mat_by_group[upper.tri(dist_mat_by_group)] <- t(dist_mat_by_group)[upper.tri(dist_mat_by_group)]

      # perform SVD on the dist_mat
      s <- svd(dist_mat_by_group)
      d_inv <- s$d
      # check whether dist_mat is invertible: all eigenvalues are positive, or full rank
      mat_rank <- sum(abs(s$d) > tol)
      is_invertible <- mat_rank == K

      # if dist_mat is not invertible, use density formula for a degenerate gaussian
      if (!is_invertible){
        # we find out the zero eigenvalues
        d_inv[which(abs(s$d) > tol)] <- 1 / s$d[which(abs(s$d) > tol)]
        d_inv[which(abs(s$d) <= tol)] <- 0
        # compute D^{-1/2} U^t
        Dhalf_Ut <- diag(sqrt(d_inv)) %*% t(s$u)
        # compute log determinant
        log_det <- sum(log(s$d[abs(s$d) > tol]))
      } else if (is_invertible){ # if dist_mat is invertible, just calculate its inverse
        # compute D^{-1/2} U^t
        Dhalf_Ut <- diag(sqrt(1/d_inv)) %*% t(s$u)
        # compute log determinant of Sigma
        log_det <- sum(log(s$d))
      }
      return(list(dist_mat_g = dist_mat_by_group, Dhalf_Ut = Dhalf_Ut, log_det = log_det, mat_rank = mat_rank))
    }
    dist_mat <- create_dist_mat_by_group(1)
  }
  ### end construction of dist_mat

  # compute loglikelihood
  logllk <- -sum(length(item_membership_list) * 0.5 * (K * log(Sigma_by_group * 2 * pi) + dist_mat$log_det))
  D_half_X <- dist_mat$Dhalf_Ut %*% as.matrix(locations)
  for (g in 1:G){
    D_half_X_g <- D_half_X[, item_membership_list[[g]], drop=F]
    logllk <- logllk - 0.5 / Sigma_by_group[g] * sum(D_half_X_g * D_half_X_g)
  }

  if (is.na(logllk[1])){
    return(list(logllk = unname(logllk), dist_mat = dist_mat))
  } else {
    return(list(logllk = unname(logllk[1]), dist_mat = dist_mat))
  }
}



#' Harmonic series
#' @param k a positive integer
H_n <- function(k){
  if(k == 0) {
    return(0)
  }else{
    return(sum(1/1:k))
  }
}

#' Compute factor in the exponent of the divergence time distribution
#' @param l number of data points to the left
#' @param r number of data points to the right
J_n <- function(l, r){
  sum(1/l:(r+l-1)) - H_n(r - 1)
  # H_n(r + l - 1) - H_n(l - 1) - H_n(r - 1)
}


#' Compute loglikelihood of divergence times for a(t) = c/(1-t)
#' @param c a positive number for the divergence hyperparameter. A larger value implies
#'  earlier divergence on the tree
#' @param l number of data points to the left
#' @param r number of data points to the right
#' @param t a number in the interval (0, 1) indicating the divergence time
#' @family likelihood functions
logllk_div_time_one <- function(c, l, r, t){
  return( log(c) + (c*J_n(l, r)-1) * log(1-t) )
}

#' Compute loglikelihood of divergence times for a(t) = c/(1-t)^2
#' @param c a positive number for the divergence hyperparameter. A larger value implies
#'  earlier divergence on the tree
#' @param l number of data points to the left
#' @param r number of data points to the right
#' @param t a number in the interval (0, 1) indicating the divergence time
#' @family likelihood functions
logllk_div_time_two <- function(c, l, r, t){
  return( log(c) - 2*log(1-t) + c*t/(1-t)*J_n(l, r) )
}


#' Compute loglikelihood of the tree topology
#' @param l number of data points to the left
#' @param r number of data points to the right
#' @family likelihood functions
logllk_tree_topology <- function(l, r){
  return(lfactorial(l-1) + lfactorial(r-1) - lfactorial(r+l-1))
}


#' Calculate loglikelihood of a DDT, including the tree structure and node parameters
#' @param c a positive number for the divergence hyperparameter. A larger value implies
#'  earlier divergence on the tree
#' @param c_order equals 1 if using divergence function a(t) = c / (1-t), or 2 if 
#'  \eqn{a(t) = c / (1-t)^2}. Default is 1
#' @param Sigma_by_group a vector of diffusion variances of G groups from the previous iteration
#' @param tree_phylo4d a "phylo4d" object
#' @param item_membership_list a list of G elements, where the g-th element contains the column
#'  indices of `data` corresponding to items in major group g
#' @param tree_structure_old a list of at least named elements: loglikelihoods of the input tree topology
#'  and divergence times. These can be directly obtained from the return of this function.
#'  Default is NULL. If given a list, then computation of the loglikelihoods will be skipped to save time. 
#'  This is useful in the Metropolis-Hasting algorithm when the previous proposal is not accepted.
#' @param dist_mat_old a tree-structured covariance matrix from a given tree. Default is NULL. 
#' @return a numeric of loglikelihood
#' @family likelihood functions
logllk_ddt <- function(c, c_order, Sigma_by_group, tree_phylo4d, item_membership_list,
                       tree_structure_old = NULL, dist_mat_old = NULL){
  # phylobase::phylobase.options(singleton="ok")
  if(is.null(tree_structure_old)){
    # obtain the tree topology
    tree <- extractTree(tree_phylo4d)
    # number of samples
    K <- nTips(tree_phylo4d)
    # get divergence times
    div_times <- c(nodeHeight(tree, nodeLabels(tree), from="root"), rep(1, K))
    names(div_times)[(K+1):(2*K)] <- 1:K
    # get the number of descends of each node
    num_descends <- c(sapply(descendants(tree, nodeLabels(tree), "tip"), length), rep(1, K))
    names(num_descends)[(K+1):(2*K)] <- 1:K
    # construct a dataframe containing the node information
    tree_structure_df <- data.frame(node = names(div_times), div_times = div_times, m_v = num_descends)

    # construct a data frame containing the edge information, divergence times (of descendends), and number of data
    #  points passing to the left and right
    tree_structure <- data.frame(tree@edge)
    # keep internal nodes only
    tree_structure <- tree_structure[tree_structure$ancestor != 0,]
    tree_structure <- tree_structure[tree_structure$descendant > K,]
    # add divergence time info
    tree_structure <- merge(x = tree_structure, y = tree_structure_df[, c("node", "div_times", "m_v")],
                            by.x = "descendant", by.y = "node", all.x = TRUE)
    # get the number of data points passing through each internal node from the left side
    tree_structure2 <- merge(x = tree_structure, y = tree_structure[,c("ancestor", "m_v")],
                             by.x = "descendant", by.y = "ancestor", all.x=T, all.y=F)
    # remove the duplicated
    tree_structure2 <- tree_structure2[!duplicated(tree_structure2[,1:3]),]
    colnames(tree_structure2)[4:5] <- c("m_v","l")
    # replace NA's with 1. These nodes are the parents of leaves
    tree_structure2$l[tree_structure2$m_v==2] = 1
    # get the number of data points passing to the right
    tree_structure2$r <- tree_structure2$m_v - tree_structure2$l

    # compute the log likelihood of tree topology
    tree_structure2$logllk_tree_topology <- apply(tree_structure2, 1, function(x) logllk_tree_topology(l = x[5], r=x[6]))
    # compute the log likelihood of divergence times
    if(c_order){
      tree_structure2$logllk_div_time <- apply(tree_structure2, 1,
                                               function(x) logllk_div_time_one(c = c, l = x[5], r = x[6], t = x[3]))
    } else{
      tree_structure2$logllk_div_time <- apply(tree_structure2, 1,
                                               function(x) logllk_div_time_two(c = c, l = x[5], r = x[6], t = x[3]))
    }
  }else{
    tree_structure2 = tree_structure_old
  }
  # compute the log likelihood of locations
  logllk_location_value <- logllk_location(tree_phylo4d =  tree_phylo4d, Sigma_by_group = Sigma_by_group,
                                           item_membership_list, dist_mat_old)
  # cat("topology: ", sum(tree_structure2$logllk_tree_topology, tree_structure2$logllk_div_time))
  # cat("locations: ", logllk_location_value$logllk)
  logllk <- sum(tree_structure2$logllk_tree_topology, tree_structure2$logllk_div_time) + logllk_location_value$logllk
  return(list(logllk = logllk, tree_structure = tree_structure2, dist_mat = logllk_location_value$dist_mat))
}



#' Calculate loglikelihood of the latent class model, conditional on tree structure
#'@param response_matrix a N by J binary matrix, where the i,j-th element is the response
#'    of item j for individual i
#'@param leaf_data a K by J matrix of \eqn{logit(theta_{kj})}
#'@param prior_class_probability a length K vector, where the k-th element is the
#'    probability of assigning an individual to class k. It does not have to sum up to 1
#'@param prior_dirichlet a vector of length K. The Dirichlet prior of class probabilities
#'@param ClassItem a K by J matrix, where the k,j-th element counts the number of individuals
#'    that belong to class k have a positive response to item j
#'@param Class_count a length K vector, where the k-th element counts the number of individuals
#'    belonging to class k
#'@return a numeric of loglikelihood
#'@family likelihood functions
logllk_lcm <- function(response_matrix, leaf_data, prior_class_probability,
                       prior_dirichlet, ClassItem, Class_count){
  # loglikelihood of the responses
  logllk <- sum(ClassItem * leaf_data) - colSums(t(log1p(exp(leaf_data)))) %*% Class_count
  # loglikelihood of the responses
  logllk <- logllk + sum((Class_count + prior_dirichlet - 1) * log(prior_class_probability))
  return(logllk[1])
}



#' Calculate loglikelihood of the DDT-LCM
#' @param leaf_data a K by J matrix of \eqn{logit(theta_kj)}
#' @param c a positive number for the divergence hyperparameter. A larger value implies
#'  earlier divergence on the tree
#' @param Sigma_by_group a vector of diffusion variances of G groups 
#' @param tree_phylo4d a "phylo4d" object
#' @param item_membership_list a list of G elements, where the g-th element contains the column
#'  indices of `data` corresponding to items in major group g
#' @param response_matrix a N by J binary matrix, where the i,j-th element is the response
#'    of item j for individual i
#' @param tree_structure_old a list of at least named elements: loglikelihoods of the input tree topology
#'  and divergence times. These can be directly obtained from the return of this function.
#'  Default is NULL. If given a list, then computation of the loglikelihoods will be skipped to save time. 
#'  This is useful in the Metropolis-Hasting algorithm when the previous proposal is not accepted.
#' @param dist_mat_old a tree-structured covariance matrix from a given tree. Default is NULL. 
#' @param prior_class_probability a length K vector, where the k-th element is the
#'    probability of assigning an individual to class k. It does not have to sum up to 1
#' @param prior_dirichlet a vector of length K. The Dirichlet prior of class probabilities
#' @param ClassItem a K by J matrix, where the k,j-th element counts the number of individuals
#'    that belong to class k have a positive response to item j
#' @param Class_count a length K vector, where the k-th element counts the number of individuals
#'    belonging to class k
#' @return a numeric of loglikelihood
#' @family likelihood functions
logllk_ddt_lcm <- function(c, Sigma_by_group, tree_phylo4d, item_membership_list,
                           tree_structure_old = NULL, dist_mat_old = NULL,
                           response_matrix, leaf_data, prior_class_probability,
                           prior_dirichlet, ClassItem, Class_count){
  logllk_ddt(c, Sigma_by_group, tree_phylo4d, item_membership_list,
             tree_structure_old, dist_mat_old)[1] +
    logllk_lcm(response_matrix, leaf_data, prior_class_probability,
               prior_dirichlet, ClassItem, Class_count)$logllk
}

Any scripts or data that you put into this service are public.

ddtlcm documentation built on May 29, 2024, 5:41 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ddtlcm
Latent Class Analysis with Dirichlet Diffusion Tree Process Prior

R/loglikehoods.R
In ddtlcm: Latent Class Analysis with Dirichlet Diffusion Tree Process Prior

Defines functions logllk_ddt_lcm logllk_lcm logllk_ddt logllk_tree_topology logllk_div_time_two logllk_div_time_one J_n H_n logllk_location

Documented in H_n J_n logllk_ddt logllk_ddt_lcm logllk_div_time_one logllk_div_time_two logllk_lcm logllk_location logllk_tree_topology

Try the ddtlcm package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ddtlcm Latent Class Analysis with Dirichlet Diffusion Tree Process Prior

R/loglikehoods.R In ddtlcm: Latent Class Analysis with Dirichlet Diffusion Tree Process Prior

Defines functions logllk_ddt_lcm logllk_lcm logllk_ddt logllk_tree_topology logllk_div_time_two logllk_div_time_one J_n H_n logllk_location

Documented in H_n J_n logllk_ddt logllk_ddt_lcm logllk_div_time_one logllk_div_time_two logllk_lcm logllk_location logllk_tree_topology

Try the ddtlcm package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ddtlcm
Latent Class Analysis with Dirichlet Diffusion Tree Process Prior

R/loglikehoods.R
In ddtlcm: Latent Class Analysis with Dirichlet Diffusion Tree Process Prior