Nothing
#' Find Optimal Number of Clusters for Longitudinal Data
#'
#' This function determines the best number of clusters (C) for longitudinal data
#' clustering based on internal validation indices using the latrend package.
#'
#' @param Y A matrix or data frame of longitudinal outcomes (subjects x timepoints).
#' @param range_clusters A numeric vector of cluster numbers to evaluate (e.g., 2:6).
#' @param method Clustering method to use. Currently supports "kml" (default).
#'
#' @return A list with:
#' \item{best_c}{Optimal number of clusters}
#' \item{criteria}{Data frame of criteria values for each cluster number}
#' \item{criteria_best}{Criteria values for the best cluster number}
#'
#' @examples
#' \donttest{
#' set.seed(123)
#' n <- 30
#' T <- 3
#' y <- matrix(rnorm(n * T), nrow = n)
#' best_c_info <- BestC(Y = y, range_clusters = 2:4)
#' print(best_c_info$best_c)
#' }
#'
#' @importFrom latrend lcMethodKML latrend trajectoryAssignments
#' @importFrom stats reshape logLik AIC BIC
#'
#' @export
BestC <- function(Y, range_clusters = 2:6, method = "kml") {
Y <- as.matrix(Y)
n_subjects <- nrow(Y)
n_timepoints <- ncol(Y)
long_data <- data.frame(
id = rep(1:n_subjects, each = n_timepoints),
time = rep(1:n_timepoints, times = n_subjects),
value = as.vector(t(Y))
)
crit_list <- list()
calc_calinski <- function(data, clusters) {
k <- length(unique(clusters))
n <- nrow(data)
if (k < 2 || n <= k) return(NA)
overall_mean <- colMeans(data)
between_ss <- 0
for (i in 1:k) {
cluster_data <- data[clusters == i, , drop = FALSE]
if (nrow(cluster_data) > 0) {
cluster_mean <- colMeans(cluster_data)
between_ss <- between_ss + nrow(cluster_data) * sum((cluster_mean - overall_mean)^2)
}
}
within_ss <- 0
for (i in 1:k) {
cluster_data <- data[clusters == i, , drop = FALSE]
if (nrow(cluster_data) > 0) {
cluster_mean <- colMeans(cluster_data)
within_ss <- within_ss + sum(rowSums((cluster_data - cluster_mean)^2))
}
}
if (within_ss == 0) return(NA)
return((between_ss / (k - 1)) / (within_ss / (n - k)))
}
for (k in range_clusters) {
if (method == "kml") {
method_obj <- lcMethodKML(
response = "value",
time = "time",
id = "id",
nClusters = k,
save.data = FALSE
)
} else {
stop("Currently only 'kml' method is supported via latrend")
}
model <- tryCatch({
latrend(method_obj, data = long_data, verbose = FALSE)
}, error = function(e) {
warning(paste("Clustering failed for k =", k, ":", e$message))
return(NULL)
})
if (!is.null(model)) {
cluster_assign <- trajectoryAssignments(model)
traj_data <- as.data.frame(model)
traj_wide <- stats::reshape(traj_data,
idvar = "id",
timevar = "time",
direction = "wide")
traj_matrix <- as.matrix(traj_wide[, grep("value", names(traj_wide))])
colnames(traj_matrix) <- NULL
calinski_val <- calc_calinski(traj_matrix, cluster_assign)
criteria <- c(
CalinskiHarabasz = calinski_val,
LogLik = as.numeric(stats::logLik(model)),
AIC = stats::AIC(model),
BIC = stats::BIC(model)
)
crit_list[[paste0("c", k)]] <- criteria
} else {
crit_list[[paste0("c", k)]] <- rep(NA, 4)
}
}
max_len <- max(sapply(crit_list, length))
crit_list_padded <- lapply(crit_list, function(x) {
if (length(x) < max_len) c(x, rep(NA, max_len - length(x))) else x
})
criteria_df <- as.data.frame(do.call(cbind, crit_list_padded))
rownames(criteria_df) <- c("CalinskiHarabasz", "LogLik", "AIC", "BIC")
calinski_row <- which(rownames(criteria_df) == "CalinskiHarabasz")
if (length(calinski_row) > 0 && any(!is.na(criteria_df[calinski_row, ]))) {
best_c <- range_clusters[which.max(criteria_df[calinski_row, ])]
} else {
best_c <- range_clusters[1]
warning("Could not determine optimal clusters. Using first value.")
}
criteria_best <- criteria_df[, paste0("c", best_c), drop = FALSE]
list(
best_c = as.integer(best_c),
criteria = criteria_df,
criteria_best = criteria_best
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.