R/BestC.R

Defines functions BestC

Documented in BestC

#' Find Optimal Number of Clusters for Longitudinal Data
#'
#' This function determines the best number of clusters (C) for longitudinal data
#' clustering based on internal validation indices using the latrend package.
#'
#' @param Y A matrix or data frame of longitudinal outcomes (subjects x timepoints).
#' @param range_clusters A numeric vector of cluster numbers to evaluate (e.g., 2:6).
#' @param method Clustering method to use. Currently supports "kml" (default).
#'
#' @return A list with:
#' \item{best_c}{Optimal number of clusters}
#' \item{criteria}{Data frame of criteria values for each cluster number}
#' \item{criteria_best}{Criteria values for the best cluster number}
#'
#' @examples
#' \donttest{
#' set.seed(123)
#' n <- 30
#' T <- 3
#' y <- matrix(rnorm(n * T), nrow = n)
#' best_c_info <- BestC(Y = y, range_clusters = 2:4)
#' print(best_c_info$best_c)
#' }
#'
#' @importFrom latrend lcMethodKML latrend trajectoryAssignments
#' @importFrom stats reshape logLik AIC BIC
#'
#' @export
BestC <- function(Y, range_clusters = 2:6, method = "kml") {
  
  Y <- as.matrix(Y)
  n_subjects <- nrow(Y)
  n_timepoints <- ncol(Y)
  
  long_data <- data.frame(
    id = rep(1:n_subjects, each = n_timepoints),
    time = rep(1:n_timepoints, times = n_subjects),
    value = as.vector(t(Y))
  )
  
  crit_list <- list()
  
  calc_calinski <- function(data, clusters) {
    k <- length(unique(clusters))
    n <- nrow(data)
    if (k < 2 || n <= k) return(NA)
    overall_mean <- colMeans(data)
    between_ss <- 0
    for (i in 1:k) {
      cluster_data <- data[clusters == i, , drop = FALSE]
      if (nrow(cluster_data) > 0) {
        cluster_mean <- colMeans(cluster_data)
        between_ss <- between_ss + nrow(cluster_data) * sum((cluster_mean - overall_mean)^2)
      }
    }
    within_ss <- 0
    for (i in 1:k) {
      cluster_data <- data[clusters == i, , drop = FALSE]
      if (nrow(cluster_data) > 0) {
        cluster_mean <- colMeans(cluster_data)
        within_ss <- within_ss + sum(rowSums((cluster_data - cluster_mean)^2))
      }
    }
    if (within_ss == 0) return(NA)
    return((between_ss / (k - 1)) / (within_ss / (n - k)))
  }
  
  for (k in range_clusters) {
    if (method == "kml") {
      method_obj <- lcMethodKML(
        response = "value",
        time = "time",
        id = "id",
        nClusters = k,
        save.data = FALSE
      )
    } else {
      stop("Currently only 'kml' method is supported via latrend")
    }
    
    model <- tryCatch({
      latrend(method_obj, data = long_data, verbose = FALSE)
    }, error = function(e) {
      warning(paste("Clustering failed for k =", k, ":", e$message))
      return(NULL)
    })
    
    if (!is.null(model)) {
      cluster_assign <- trajectoryAssignments(model)
      traj_data <- as.data.frame(model)
      traj_wide <- stats::reshape(traj_data, 
                                  idvar = "id", 
                                  timevar = "time", 
                                  direction = "wide")
      traj_matrix <- as.matrix(traj_wide[, grep("value", names(traj_wide))])
      colnames(traj_matrix) <- NULL
      calinski_val <- calc_calinski(traj_matrix, cluster_assign)
      criteria <- c(
        CalinskiHarabasz = calinski_val,
        LogLik = as.numeric(stats::logLik(model)),
        AIC = stats::AIC(model),
        BIC = stats::BIC(model)
      )
      crit_list[[paste0("c", k)]] <- criteria
    } else {
      crit_list[[paste0("c", k)]] <- rep(NA, 4)
    }
  }
  
  max_len <- max(sapply(crit_list, length))
  crit_list_padded <- lapply(crit_list, function(x) {
    if (length(x) < max_len) c(x, rep(NA, max_len - length(x))) else x
  })
  
  criteria_df <- as.data.frame(do.call(cbind, crit_list_padded))
  rownames(criteria_df) <- c("CalinskiHarabasz", "LogLik", "AIC", "BIC")
  
  calinski_row <- which(rownames(criteria_df) == "CalinskiHarabasz")
  if (length(calinski_row) > 0 && any(!is.na(criteria_df[calinski_row, ]))) {
    best_c <- range_clusters[which.max(criteria_df[calinski_row, ])]
  } else {
    best_c <- range_clusters[1]
    warning("Could not determine optimal clusters. Using first value.")
  }
  
  criteria_best <- criteria_df[, paste0("c", best_c), drop = FALSE]
  
  list(
    best_c = as.integer(best_c),
    criteria = criteria_df,
    criteria_best = criteria_best
  )
}

Try the CKNNRLD package in your browser

Any scripts or data that you put into this service are public.

CKNNRLD documentation built on May 29, 2026, 1:06 a.m.