R/Tuning_CKNNRLD.R
In CKNNRLD: Clustering-Based K-Nearest Neighbor Regression for Longitudinal Data

Documented in CKNNRLD.tune

#' Tune CKNNRLD Model with Automatic Cluster Selection
#'
#' Automatically selects the best number of clusters (C) using internal validation
#' criteria, then tunes the CKNNRLD model within each cluster via cross-validation.
#'
#' @param y Matrix of longitudinal outcomes (subjects x timepoints).
#' @param x Matrix of predictor variables (subjects x features).
#' @param nfolds Number of folds for cross-validation (default = 10).
#' @param folds Optional list of pre-specified fold indices.
#' @param seed Random seed for reproducibility.
#' @param A Maximum number of neighbors to evaluate (searches from 2 to A, default = 10).
#' @param C_range Range of cluster numbers to evaluate (default = 2:6).
#' @param cluster_method Clustering method to use. Currently supports "kml" (default).
#'
#' @return A list containing:
#' \item{best_c}{Optimal number of clusters}
#' \item{cluster_results}{Tuning results for each cluster (from KNNRLD.tune)}
#' \item{cluster_sizes}{Size of each cluster}
#' \item{cluster_assignments}{Cluster labels for each subject}
#' \item{criteria}{Data frame of quality criteria used for selecting best C}
#'
#' @examples
#' \donttest{
#' set.seed(123)
#' n <- 30
#' T <- 3
#' d <- 2
#' x <- matrix(runif(n * d), nrow = n)
#' y <- matrix(rnorm(n * T), nrow = n)
#' tune_result <- CKNNRLD.tune(
#'   y = y,
#'   x = x,
#'   nfolds = 3,
#'   A = 4,
#'   C_range = 2:3
#' )
#' print(tune_result$best_c)
#' }
#'
#' @importFrom latrend lcMethodKML latrend trajectoryAssignments
#' @importFrom stats reshape logLik AIC BIC
#'
#' @export
CKNNRLD.tune <- function(y, x, nfolds = 10, folds = NULL, seed = NULL, 
                          A = 10, C_range = 2:6, cluster_method = "kml") {
  
  y <- as.matrix(y)
  x <- as.matrix(x)
  
  n_subjects <- nrow(y)
  n_timepoints <- ncol(y)
  n_predictors <- ncol(x)
  
  long_data <- data.frame(
    id = 1:n_subjects,
    time = rep(1:n_timepoints, each = n_subjects),
    value = as.vector(t(y))
  )
  
  calc_calinski <- function(data, clusters) {
    k <- length(unique(clusters))
    n <- nrow(data)
    if (k < 2 || n <= k) return(NA)
    overall_mean <- colMeans(data)
    between_ss <- 0
    for (i in 1:k) {
      cluster_data <- data[clusters == i, , drop = FALSE]
      if (nrow(cluster_data) > 0) {
        cluster_mean <- colMeans(cluster_data)
        between_ss <- between_ss + nrow(cluster_data) * sum((cluster_mean - overall_mean)^2)
      }
    }
    within_ss <- 0
    for (i in 1:k) {
      cluster_data <- data[clusters == i, , drop = FALSE]
      if (nrow(cluster_data) > 0) {
        cluster_mean <- colMeans(cluster_data)
        within_ss <- within_ss + sum(rowSums((cluster_data - cluster_mean)^2))
      }
    }
    if (within_ss == 0) return(NA)
    return((between_ss / (k - 1)) / (within_ss / (n - k)))
  }
  
  crit_list <- list()
  
  for (k in C_range) {
    if (cluster_method == "kml") {
      method_obj <- lcMethodKML(
        response = "value",
        time = "time",
        id = "id",
        nClusters = k,
        save.data = FALSE
      )
    } else {
      stop(paste("Method", cluster_method, "is not implemented. Use 'kml'."))
    }
    
    model <- tryCatch({
      latrend(method_obj, data = long_data, verbose = FALSE)
    }, error = function(e) {
      warning(paste("Clustering failed for C =", k, ":", e$message))
      return(NULL)
    })
    
    if (!is.null(model)) {
      cluster_assign <- trajectoryAssignments(model)
      traj_data <- as.data.frame(model)
      traj_wide <- stats::reshape(traj_data, 
                                  idvar = "id", 
                                  timevar = "time", 
                                  direction = "wide")
      traj_matrix <- as.matrix(traj_wide[, grep("value", names(traj_wide))])
      colnames(traj_matrix) <- NULL
      calinski_val <- calc_calinski(traj_matrix, cluster_assign)
      criteria <- c(
        CalinskiHarabasz = calinski_val,
        LogLik = as.numeric(stats::logLik(model)),
        AIC = stats::AIC(model),
        BIC = stats::BIC(model)
      )
      crit_list[[paste0("c", k)]] <- criteria
    } else {
      crit_list[[paste0("c", k)]] <- rep(NA, 4)
    }
  }
  
  max_len <- max(sapply(crit_list, length))
  crit_list_padded <- lapply(crit_list, function(x) {
    if (length(x) < max_len) c(x, rep(NA, max_len - length(x))) else x
  })
  
  criteria_df <- as.data.frame(do.call(cbind, crit_list_padded))
  rownames(criteria_df) <- c("CalinskiHarabasz", "LogLik", "AIC", "BIC")
  
  calinski_row <- which(rownames(criteria_df) == "CalinskiHarabasz")
  if (length(calinski_row) > 0 && any(!is.na(criteria_df[calinski_row, ]))) {
    best_c <- C_range[which.max(criteria_df[calinski_row, ])]
  } else {
    best_c <- C_range[1]
    warning("Could not determine optimal clusters. Using first C value.")
  }
  
  final_method <- lcMethodKML(
    response = "value",
    time = "time",
    id = "id",
    nClusters = best_c,
    save.data = FALSE
  )
  
  final_model <- latrend(final_method, data = long_data, verbose = FALSE)
  cluster_assignments <- trajectoryAssignments(final_model)
  
  tdata <- cbind(cluster = cluster_assignments, y, x)
  cluster_sizes <- table(cluster_assignments)
  
  y_start_idx <- 2
  y_end_idx <- y_start_idx + n_timepoints - 1
  x_start_idx <- y_end_idx + 1
  x_end_idx <- x_start_idx + n_predictors - 1
  
  xnew_clustered <- cbind(cluster = cluster_assignments, x)
  
  cluster_results <- vector("list", best_c)
  
  for (i in 1:best_c) {
    cluster_rows <- which(xnew_clustered[, 1] == i)
    
    if (length(cluster_rows) == 0) {
      warning(paste("No data for Cluster =", i))
      cluster_results[[i]] <- NULL
      next
    }
    
    x_i <- tdata[tdata[, 1] == i, x_start_idx:x_end_idx, drop = FALSE]
    y_i <- tdata[tdata[, 1] == i, y_start_idx:y_end_idx, drop = FALSE]
    
    if (nrow(y_i) < 3) {
      warning(paste("Cluster", i, "has insufficient data (n =", nrow(y_i), "). Skipping tuning."))
      cluster_results[[i]] <- list(
        crit = NA,
        best_k = NA,
        performance = NA,
        warning = "Insufficient data"
      )
      next
    }
    
    cluster_results[[i]] <- KNNRLD.tune(
      y = y_i,
      x = x_i,
      nfolds = nfolds,
      folds = folds,
      seed = seed,
      A = A
    )
  }
  
  names(cluster_results) <- paste0("Cluster_", 1:best_c)
  
  list(
    best_c = best_c,
    cluster_results = cluster_results,
    cluster_sizes = cluster_sizes,
    cluster_assignments = as.integer(cluster_assignments),
    criteria = criteria_df
  )
}