Nothing
#' Tune CKNNRLD Model with Automatic Cluster Selection
#'
#' Automatically selects the best number of clusters (C) using internal validation
#' criteria, then tunes the CKNNRLD model within each cluster via cross-validation.
#'
#' @param y Matrix of longitudinal outcomes (subjects x timepoints).
#' @param x Matrix of predictor variables (subjects x features).
#' @param nfolds Number of folds for cross-validation (default = 10).
#' @param folds Optional list of pre-specified fold indices.
#' @param seed Random seed for reproducibility.
#' @param A Maximum number of neighbors to evaluate (searches from 2 to A, default = 10).
#' @param C_range Range of cluster numbers to evaluate (default = 2:6).
#' @param cluster_method Clustering method to use. Currently supports "kml" (default).
#'
#' @return A list containing:
#' \item{best_c}{Optimal number of clusters}
#' \item{cluster_results}{Tuning results for each cluster (from KNNRLD.tune)}
#' \item{cluster_sizes}{Size of each cluster}
#' \item{cluster_assignments}{Cluster labels for each subject}
#' \item{criteria}{Data frame of quality criteria used for selecting best C}
#'
#' @examples
#' \donttest{
#' set.seed(123)
#' n <- 30
#' T <- 3
#' d <- 2
#' x <- matrix(runif(n * d), nrow = n)
#' y <- matrix(rnorm(n * T), nrow = n)
#' tune_result <- CKNNRLD.tune(
#' y = y,
#' x = x,
#' nfolds = 3,
#' A = 4,
#' C_range = 2:3
#' )
#' print(tune_result$best_c)
#' }
#'
#' @importFrom latrend lcMethodKML latrend trajectoryAssignments
#' @importFrom stats reshape logLik AIC BIC
#'
#' @export
CKNNRLD.tune <- function(y, x, nfolds = 10, folds = NULL, seed = NULL,
A = 10, C_range = 2:6, cluster_method = "kml") {
y <- as.matrix(y)
x <- as.matrix(x)
n_subjects <- nrow(y)
n_timepoints <- ncol(y)
n_predictors <- ncol(x)
long_data <- data.frame(
id = 1:n_subjects,
time = rep(1:n_timepoints, each = n_subjects),
value = as.vector(t(y))
)
calc_calinski <- function(data, clusters) {
k <- length(unique(clusters))
n <- nrow(data)
if (k < 2 || n <= k) return(NA)
overall_mean <- colMeans(data)
between_ss <- 0
for (i in 1:k) {
cluster_data <- data[clusters == i, , drop = FALSE]
if (nrow(cluster_data) > 0) {
cluster_mean <- colMeans(cluster_data)
between_ss <- between_ss + nrow(cluster_data) * sum((cluster_mean - overall_mean)^2)
}
}
within_ss <- 0
for (i in 1:k) {
cluster_data <- data[clusters == i, , drop = FALSE]
if (nrow(cluster_data) > 0) {
cluster_mean <- colMeans(cluster_data)
within_ss <- within_ss + sum(rowSums((cluster_data - cluster_mean)^2))
}
}
if (within_ss == 0) return(NA)
return((between_ss / (k - 1)) / (within_ss / (n - k)))
}
crit_list <- list()
for (k in C_range) {
if (cluster_method == "kml") {
method_obj <- lcMethodKML(
response = "value",
time = "time",
id = "id",
nClusters = k,
save.data = FALSE
)
} else {
stop(paste("Method", cluster_method, "is not implemented. Use 'kml'."))
}
model <- tryCatch({
latrend(method_obj, data = long_data, verbose = FALSE)
}, error = function(e) {
warning(paste("Clustering failed for C =", k, ":", e$message))
return(NULL)
})
if (!is.null(model)) {
cluster_assign <- trajectoryAssignments(model)
traj_data <- as.data.frame(model)
traj_wide <- stats::reshape(traj_data,
idvar = "id",
timevar = "time",
direction = "wide")
traj_matrix <- as.matrix(traj_wide[, grep("value", names(traj_wide))])
colnames(traj_matrix) <- NULL
calinski_val <- calc_calinski(traj_matrix, cluster_assign)
criteria <- c(
CalinskiHarabasz = calinski_val,
LogLik = as.numeric(stats::logLik(model)),
AIC = stats::AIC(model),
BIC = stats::BIC(model)
)
crit_list[[paste0("c", k)]] <- criteria
} else {
crit_list[[paste0("c", k)]] <- rep(NA, 4)
}
}
max_len <- max(sapply(crit_list, length))
crit_list_padded <- lapply(crit_list, function(x) {
if (length(x) < max_len) c(x, rep(NA, max_len - length(x))) else x
})
criteria_df <- as.data.frame(do.call(cbind, crit_list_padded))
rownames(criteria_df) <- c("CalinskiHarabasz", "LogLik", "AIC", "BIC")
calinski_row <- which(rownames(criteria_df) == "CalinskiHarabasz")
if (length(calinski_row) > 0 && any(!is.na(criteria_df[calinski_row, ]))) {
best_c <- C_range[which.max(criteria_df[calinski_row, ])]
} else {
best_c <- C_range[1]
warning("Could not determine optimal clusters. Using first C value.")
}
final_method <- lcMethodKML(
response = "value",
time = "time",
id = "id",
nClusters = best_c,
save.data = FALSE
)
final_model <- latrend(final_method, data = long_data, verbose = FALSE)
cluster_assignments <- trajectoryAssignments(final_model)
tdata <- cbind(cluster = cluster_assignments, y, x)
cluster_sizes <- table(cluster_assignments)
y_start_idx <- 2
y_end_idx <- y_start_idx + n_timepoints - 1
x_start_idx <- y_end_idx + 1
x_end_idx <- x_start_idx + n_predictors - 1
xnew_clustered <- cbind(cluster = cluster_assignments, x)
cluster_results <- vector("list", best_c)
for (i in 1:best_c) {
cluster_rows <- which(xnew_clustered[, 1] == i)
if (length(cluster_rows) == 0) {
warning(paste("No data for Cluster =", i))
cluster_results[[i]] <- NULL
next
}
x_i <- tdata[tdata[, 1] == i, x_start_idx:x_end_idx, drop = FALSE]
y_i <- tdata[tdata[, 1] == i, y_start_idx:y_end_idx, drop = FALSE]
if (nrow(y_i) < 3) {
warning(paste("Cluster", i, "has insufficient data (n =", nrow(y_i), "). Skipping tuning."))
cluster_results[[i]] <- list(
crit = NA,
best_k = NA,
performance = NA,
warning = "Insufficient data"
)
next
}
cluster_results[[i]] <- KNNRLD.tune(
y = y_i,
x = x_i,
nfolds = nfolds,
folds = folds,
seed = seed,
A = A
)
}
names(cluster_results) <- paste0("Cluster_", 1:best_c)
list(
best_c = best_c,
cluster_results = cluster_results,
cluster_sizes = cluster_sizes,
cluster_assignments = as.integer(cluster_assignments),
criteria = criteria_df
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.