R/initialization.R

Defines functions initialize initialize_hclust initialize_randomLCM initialize_poLCA

Documented in initialize initialize_hclust initialize_poLCA initialize_randomLCM

#' Estimate an initial response profile from latent class model using poLCA()
#'@param K number of latent classes
#'@param data a N by J observed binary matrix, where the i,j-th element is the response
#'    of item j for individual i
#'@param ... optional arguments for the poLCA function
#'@family initialization functions
#'@return a K by J probability matrix, the k,j-th entry being the response probability to item
#'    j of an individual in class k
initialize_poLCA <- function(K, data, ...){
  variable_names <- colnames(data)
  fm <- as.formula(paste("cbind(", paste(variable_names, collapse= ","), ") ~ 1"))
  # data[,] <- as.integer(data[,])
  model <- poLCA(fm, data.frame(data+1), nclass=K, ...)
  # model <- poLCA(fm, data.frame(data+1), nclass=K, maxiter=100,
  #                tol=1e-5, na.rm=FALSE, nrep=10, verbose=FALSE, calc.se=TRUE)
  response_prob_simple_lcm <- matrix(0, nrow = K, ncol = ncol(data))
  for (j in seq_len(ncol(data))) {
    response_prob_simple_lcm[,j] <- model$probs[[variable_names[j]]][,2]
  }
  colnames(response_prob_simple_lcm) <- variable_names
  return(list(response_probs = response_prob_simple_lcm,
              class_probability = model$P,
              class_assignments = model$predclass,
              model_poLCA = model))
}


#' Provide a random initial response profile based on latent class mode
#'@param K number of latent classes
#'@param data a N by J observed binary matrix, where the i,j-th element is the response
#'    of item j for individual i
#'@return a K by J probability matrix, the k,j-th entry being the response probability to item
#'    j of an individual in class k
initialize_randomLCM <- function(K, data){
  J <- ncol(data)
  N <- nrow(data)
  # randomly sample initial response probabilities
  response_probs <- matrix(runif(K*J), nrow=K, ncol=J)

  class_probability <- rep(1/K, K)
  class_assignments <- sample(1:K, N, prob = class_probability, replace = TRUE)
  return(list(response_probs = response_probs,
              class_probability = class_probability,
              class_assignments = class_assignments))
}




#' Estimate an initial binary tree on latent classes using hclust()
#'@param leaf_data a K by J matrix of \eqn{logit(theta_{kj})}
#'@param c hyparameter of divergence function a(t)
#'@param c_order equals 1 (default) or 2 to choose divergence function
#'@param method_dist string specifying the distance measure to be used in dist().
#'    This must be one of "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski".
#'    Any unambiguous substring can be given.
#'@param method_hclust string specifying the distance measure to be used in hclust().
#'    This should be (an unambiguous abbreviation of) one of "ward.D", "ward.D2", "single",
#'    "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or
#'    "centroid" (= UPGMC).
#'@param method_add_root string specifying the method to add the initial branch to the tree
#'    output from hclust(). This should be one of "min_cor" (the absolute value of the minimum
#'    between-class correlation) or "sample_ddt" (randomly sample a small divergence time from
#'    the DDT process with a large c = 100)
#'@param alpha,theta hyparameter of branching probability a(t) Gamma(m-alpha) / Gamma(m+1+theta)
#'    For DDT, alpha = theta = 0
#'@param ... optional arguments for the poLCA function
#'@importFrom ape as.phylo bind.tree
#'@family initialization functions
#'@return phylo4d object of tree topology
initialize_hclust <- function(leaf_data, c, c_order=1, method_dist = "euclidean", method_hclust = "ward.D",
                              method_add_root = "min_cor",
                              alpha=0, theta=0, ...){
  K <- nrow(leaf_data)
  # create distance matrix between classes
  leaf_dist <- dist(leaf_data, method = method_dist)
  # hierarchical clustering
  hclust_phylo <- as.phylo(hclust(leaf_dist, method = method_hclust))
  total_length <- nodeHeight(phylo4d(hclust_phylo), "1", "root")
  # since the resulting tree is not rooted, we need to sample a divergence time
  # for the root node and add to the tree
  if (method_add_root == "min_cor"){
    # use the absolute value of the minimum between-class correlation
    root_edge_length <- abs(min(cor(t(leaf_data))))
  } else if (method_add_root == "sample_ddt"){
    # randomly sample a very small number
    root_edge_length <- div_time(0, 1, c, c_order, alpha, theta)
    # print(root_edge_length)
  }
  # since the total depth rom hclust is not 1, we need to normalize. The tree
  # from hclust will have total depth 1 - root_edge_length
  # cat("edge len: ", hclust_phylo$edge.length)
  hclust_phylo$edge.length <- hclust_phylo$edge.length / total_length * (1 - root_edge_length)
  # cat("\n\nedge len: ", hclust_phylo$edge.length)
  # add root node
  hclust_phylo <- add_root(hclust_phylo, root_edge_length = root_edge_length,
           root_label = "u1", leaf_label = "u2")
  hclust_phylo <- suppressWarnings({ phylo4d(hclust_phylo) })
  # add node labels
  tipLabels(hclust_phylo) <- paste0("v", 1:K)
  nodeLabels(hclust_phylo) <- paste0("u", 1:K)
  # nodeHeight(hclust_phylo, "v1", "root")
  # plot(hclust_phylo, show.node=T)
  return(hclust_phylo)
}




#' Initialize the MH-within-Gibbs algorithm for DDT-LCM
#'@param K number of classes (integer)
#'@param data an NxJ matrix of multivariate binary responses, where
#'   N is the number of individuals, and J is the number of granular items
#'@param item_membership_list a list of G elements, where the g-th element contains the column
#'  indices of `data` corresponding to items in major group g
#'@param c hyparameter of divergence function a(t)
#'@param c_order equals 1 (default) or 2 to choose divergence function
#'  a(t) = c/(1-t) or c/(1-t)^2.
#'@param method_lcm a character. If `random` (default), the initial LCM parameters will be random values.
#'  If `poLCA`, the initial LCM parameters will be EM algorithm estimates from the \code{poLCA} function.
#'@param method_dist string specifying the distance measure to be used in dist().
#'    This must be one of "euclidean" (defaults), "maximum", "manhattan", "canberra", "binary" or "minkowski".
#'    Any unambiguous substring can be given.
#'@param method_hclust string specifying the distance measure to be used in hclust().
#'    This should be (an unambiguous abbreviation of) one of "ward.D" (defaults), "ward.D2", "single",
#'    "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or
#'    "centroid" (= UPGMC).
#'@param method_add_root string specifying the method to add the initial branch to the tree
#'    output from hclust(). This should be one of "min_cor" (the absolute value of the minimum
#'    between-class correlation; default) or "sample_ddt" (randomly sample a small divergence time from
#'    the DDT process with c = 100)
#'@param fixed_initials a named list of fixed initial values, including
#' the initial values for tree ("phylo4d"), response_prob, class_probability, class_assignments,
#' Sigma_by_group, and c. Default is NULL. See
#'@param fixed_priors a named list of fixed prior hyperparameters, including the
#' the Gamma prior for `c`, inverse-Gamma prior for `sigma_g^2`, and Dirichlet prior
#' for `pi`. Moreover, we allow for a type III generalized logistic distribution such
#' that f(`eta`; a_pg) = `theta`. This becomes a standard logistic distribution when a_pg = 1. See
#' Dalla Valle, L., Leisen, F., Rossini, L., & Zhu, W. (2021). A Pólya–Gamma sampler for a
#' generalized logistic regression. Journal of Statistical Computation and Simulation, 91(14), 2899-2916.
#' An example input list is
#' `list("shape_c" = 1, "rate_c" = 1, "shape_sigma" = rep(2, G), "rate_sigma" = rep(2, G), "a_pg" = 1.0)`, where
#' `G` is the number of major item groups. Default is NULL.
#'@param alpha,theta hyparameter of branching probability a(t) Gamma(m-alpha) / Gamma(m+1+theta)
#'    For DDT, alpha = theta = 0
#'@param ... optional arguments for the poLCA function
#'@return phylo4d object of tree topology
#'@family initialization functions
#'@seealso [ddtlcm_fit()]
#'@export
#'@examples
#' # load the MAP tree structure obtained from the real HCHS/SOL data
#' data(data_synthetic)
#' # extract elements into the global environment
#' list2env(setNames(data_synthetic, names(data_synthetic)), envir = globalenv())
#' K <- 3
#' G <- length(item_membership_list)
#' fixed_initials <- list("c" = 5)
#' fixed_priors <- list("rate_sigma" = rep(3, G), "shape_c" = 2, "rate_c" = 2)
#' initials <- initialize(K, data = response_matrix, item_membership_list,
#'   c=1, c_order=1, fixed_initials = fixed_initials, fixed_priors = fixed_priors)
initialize <- function(K, data, item_membership_list, c=1, c_order=1,
                       method_lcm = "random", #c("poLCA", "random"),
                       method_dist = "euclidean", method_hclust = "ward.D",
                       method_add_root = "min_cor",
                       fixed_initials = list(), fixed_priors = list(),
                       alpha=0, theta=0, ...){
  # phylobase::phylobase.options(singleton="ok")
  G <- length(item_membership_list)
  J <- length(unlist(item_membership_list))

  ### step 1: initialize LCM
  if (length(fixed_initials) != 6){
    if (method_lcm == "poLCA"){
      # run classical LCM on the responses
      simple_lcm <- initialize_poLCA(K=K, data, ...)
      # simple_lcm <- initialize_poLCA(K=K, data, maxiter=100,
      #                 tol=1e-5, na.rm=FALSE, nrep=10, verbose=FALSE, calc.se=TRUE)
    } else if (method_lcm == "random"){
      simple_lcm <- initialize_randomLCM(K=K, data)
    } else {
      stop("Initialization method for LCM should be chosen from 'poLCA' or 'random'.")
    }
    response_prob <- simple_lcm$response_probs
    class_probability <- simple_lcm$class_probability
    class_assignments <- simple_lcm$class_assignments
    leaf_data <- logit(response_prob)
  } else {
    initials <- fixed_initials
  }
  if (!is.null(fixed_initials$response_prob)){
    response_prob <- fixed_initials$response_prob
    leaf_data <- logit(response_prob)
  }
  if (!is.null(fixed_initials$class_probability)){
    class_probability <- fixed_initials$class_probability / sum(fixed_initials$class_probability)
  }
  if (!is.null(fixed_initials$class_assignments)){
    class_assignments <- fixed_initials$class_assignments
  }


  ### step 2: estimate c using MLE
  if (!is.null(fixed_initials$tree_phylo4d)){
    tree_phylo4d <- fixed_initials$tree_phylo4d
    init_info <- logllk_ddt(1, c_order, Sigma_by_group = rep(1, G), tree_phylo4d=tree_phylo4d,
                            item_membership_list,
                            tree_structure_old = NULL, dist_mat_old = NULL)
    c_part <- log( 1 - init_info$tree_structure$div_times) *
      exp(init_info$tree_structure$logllk_tree_topology)
    c_hclust <- -(K-1) / sum(c_part)
  } else{
    # get the information of the initial tree by adding a small initial segment
    initial_tree_hclust <- initialize_hclust(leaf_data, 100, c_order, method_dist,
                                             method_hclust, method_add_root, alpha=0, theta=0)
    # add leaf and root data to the tree
    root_location <- rep(0, J)
    # add root to the leaf data
    tree_data <- rbind(leaf_data, root_location)
    # need to add rows corresponding to internal nodes
    tree_data <- rbind(tree_data, matrix(NA, nrow = K-1, ncol = ncol(tree_data)))
    rownames(tree_data) <- c(paste0("v", 1:K), paste0("u", 1:K))
    colnames(tree_data) <- c(paste0("x", seq_len(ncol(tree_data))))
    initial_tree_hclust <- addData(initial_tree_hclust, all.data = tree_data )
    init_info <- logllk_ddt(1, c_order, Sigma_by_group = rep(1, G), tree_phylo4d=initial_tree_hclust,
                            item_membership_list,
                            tree_structure_old = NULL, dist_mat_old = NULL)
    c_part <- log( 1 - init_info$tree_structure$div_times) *
      exp(init_info$tree_structure$logllk_tree_topology)
    c_hclust <- -(K-1) / sum(c_part)

    ### step 3: run hierarchical clustering on logit response profiles
    # options(warn = -1)
    initial_tree_hclust <- initialize_hclust(leaf_data, c_hclust, c_order, method_dist,
                                             method_hclust, method_add_root, alpha=0, theta=0)

    # step 4: add leaf and root data to the tree
    # let the root location be the population logit mean
    root_location <- rep(0, J)
    # add root to the leaf data
    tree_data <- rbind(leaf_data, root_location)
    # need to add rows corresponding to internal nodes
    tree_data <- rbind(tree_data, matrix(NA, nrow = K-1, ncol = ncol(tree_data)))
    rownames(tree_data) <- c(paste0("v", 1:K), paste0("u", 1:K))
    colnames(tree_data) <- c(paste0("x", seq_len(ncol(tree_data))))
    tree_phylo4d <- addData(initial_tree_hclust, all.data = tree_data )
  }

  if (!is.null(fixed_initials$c)){
    c_hclust <- fixed_initials$c
  }


  ### step 5: estimate Sigma_by_group
  if (!is.null(fixed_initials$Sigma_by_group)){
    Sigma_by_group <- fixed_initials$Sigma_by_group
  } else{
    Sigma_by_group <- rep(0, G)
    for (g in 1:G) {
      Sigma_by_group[g] <- (sd(leaf_data[, item_membership_list[[g]] ]))^2
    }
  }

  ### return initial values
  initials = list(
    tree_phylo4d = tree_phylo4d,
    response_prob = response_prob,
    class_probability = class_probability,
    class_assignments = class_assignments,
    Sigma_by_group = Sigma_by_group,
    c = c_hclust
  )

  priors <- list(
    shape_sigma = rep(2, G),
    rate_sigma = rep(2, G),
    prior_dirichlet = rep(5, K),
    shape_c = 1,
    rate_c = 1,
    a_pg = 1.0
  )
  if (!is.null(fixed_priors$shape_sigma)){
    priors$shape_sigma <- fixed_priors$shape_sigma
  }
  if (!is.null(fixed_priors$rate_sigma)){
    priors$rate_sigma <- fixed_priors$rate_sigma
  }
  if (!is.null(fixed_priors$prior_dirichlet)){
    priors$prior_dirichlet <- fixed_priors$prior_dirichlet
  }
  if (!is.null(fixed_priors$shape_c)){
    priors$shape_c <- fixed_priors$shape_c
  }
  if (!is.null(fixed_priors$rate_c)){
    priors$rate_c <- fixed_priors$rate_c
  }
  if (!is.null(fixed_priors$a_pg)){
    priors$a_pg <- fixed_priors$a_pg
  }

  if ("simple_lcm" %in% ls()) {
    return(list(initials = initials, priors = priors, model_lcm = simple_lcm))
  } else{
    return(list(initials = initials, priors = priors))
  }
}

Try the ddtlcm package in your browser

Any scripts or data that you put into this service are public.

ddtlcm documentation built on May 29, 2024, 5:41 a.m.