R/make_partitions.R
In lgspline: Lagrangian Multiplier Smoothing Splines for Smooth Function Estimation

Documented in find_neighbors get_centers make_partitions

#' Get Centers for Partitioning
#'
#' @param data Matrix of predictor data (already processed: binary/excluded
#'   columns zeroed out by the caller when \code{data_already_processed = TRUE})
#' @param K Number of partitions minus 1 (\eqn{K})
#' @param cluster_args List with custom centers and kmeans args
#' @param cluster_on_indicators Include binary predictors in clustering
#' @param data_already_processed Logical; when TRUE the caller has already
#'   zeroed out binary / excluded columns, so \code{get_centers} skips its
#'   own binary-zeroing step.  Defaults to FALSE for backward compatibility.
#'
#' @details
#' Returns partition centers via:
#'
#' 1. Custom supplied centers if provided as a valid \eqn{(K+1) \times q} matrix
#'
#' 2. kmeans clustering on all non-spline variables if \code{cluster_on_indicators=TRUE}
#'    or if \code{data_already_processed=TRUE}
#'
#' 3. kmeans clustering excluding binary variables if \code{cluster_on_indicators=FALSE}
#'    and \code{data_already_processed=FALSE}
#'
#' @return Matrix of cluster centers
#'
#' @keywords internal
#' @export
get_centers <- function(data, K, cluster_args, cluster_on_indicators,
                        data_already_processed = FALSE) {

  ## If custom centers isn't null, return them
  #  These will be supplied to the first position
  #  Defaults to NA
  if(any(!is.na(cluster_args[[1]]))){
    return(cluster_args[[1]])

    ## Partition clusters including 0/1 predictors,
    #  or when the caller has pre-processed the data
    #  suppressWarnings prevents irrelevant "did not converge in 10 iterations"
  } else if(cluster_on_indicators || data_already_processed){
    km <- suppressWarnings(do.call(stats::kmeans,
                  c(list(x = data, centers = K + 1),
                    as.list(cluster_args[-1]))))

  } else {
    bin <- which(apply(data, 2, is_binary))
    data[,bin] <- 0
    km <- suppressWarnings(do.call(stats::kmeans,
                  c(list(x = data, centers = K + 1),
                    as.list(cluster_args[-1]))))
  }
  return(km$centers)
}

#' Find Neighboring Cluster Partitions Using Midpoint Distance Criterion
#'
#' @description
#' Identifies neighboring partitions by evaluating whether the midpoint between
#' cluster centers is closer to those centers than to any other center.
#'
#' @param centers Matrix; rows are cluster center coordinates
#' @param parallel Logical; use parallel processing
#' @param cl Cluster object for parallel execution
#' @param neighbor_tolerance Numeric; scaling factor for distance comparisons
#'
#' @return List where element i contains indices of centers neighboring center i
#'
#' @keywords internal
#' @export
find_neighbors <- function(centers, parallel, cl, neighbor_tolerance) {
  num_centers <- nrow(centers)
  neighbors <- vector("list", num_centers)

  if (!is.null(cl) & parallel) {
    ## Create index pairs for all comparisons
    pairs <- expand.grid(i = 1:(num_centers-1), j = 2:num_centers)
    pairs <- pairs[pairs$j > pairs$i, ]

    ## Split into chunks
    chunk_size <- max(1, ceiling(nrow(pairs) / length(cl)))
    chunks <- split(1:nrow(pairs), ceiling(seq_along(1:nrow(pairs))/chunk_size))

    ## Export data to cluster
    parallel::clusterExport(cl, "centers", envir = environment())

    ## Process chunks in parallel
    results <- parallel::parLapply(cl, chunks, function(chunk_indices) {
      chunk_results <- vector("list", length(chunk_indices))

      for (idx in seq_along(chunk_indices)) {
        pair_idx <- chunk_indices[idx]
        i <- pairs$i[pair_idx]
        j <- pairs$j[pair_idx]

        ## Check if midpoint is neighbor
        mid <- (centers[i,] + centers[j,])/2
        dist_ij <- sum((mid - centers[i,])^2) / neighbor_tolerance

        distances_to_others <- apply(centers[-c(i,j),,drop=FALSE], 1,
                                     function(k)sum((mid - k)^2))

        if(all(dist_ij < distances_to_others)) {
          chunk_results[[idx]] <- c(i, j)
        }
      }

      return(do.call(rbind, chunk_results[!sapply(chunk_results, is.null)]))
    })

    ## Combine results
    neighbor_pairs <- do.call(rbind, results)

    ## Convert to neighbor list
    if (!is.null(neighbor_pairs)) {
      for (k in 1:nrow(neighbor_pairs)) {
        i <- neighbor_pairs[k, 1]
        j <- neighbor_pairs[k, 2]
        neighbors[[i]] <- c(neighbors[[i]], j)
        neighbors[[j]] <- c(neighbors[[j]], i)
      }
    }

  } else {
    ## Sequential neighbor computation
    for (i in 1:(num_centers-1)) {
      for (j in (i+1):num_centers) {
        ## Get midpoint between centers i and j
        mid <- (centers[i,] + centers[j,])/2

        ## Compute scaled distance from midpoint to center i
        dist_ij <- sum((mid - centers[i,])^2) / neighbor_tolerance

        ## Get distances from midpoint to all other centers
        distances_to_others <- apply(centers[-c(i,j),,drop=FALSE], 1,
                                     function(k)sum((mid - k)^2))

        ## If midpoint closer to i,j than others, they are neighbors
        if(all(dist_ij < distances_to_others)) {
          neighbors[[i]] <- c(neighbors[[i]], j)
          neighbors[[j]] <- c(neighbors[[j]], i)
        }
      }
    }
  }

  return(neighbors)
}


#' Create Data Partitions Using Clustering
#'
#' @description
#' Partitions data support into clusters using Voronoi-like diagrams.
#' Standardization for clustering is handled internally; centers, knots, and
#' the returned \code{assign_partition} function are all on the **raw
#' (natural) scale** of the original predictors.
#'
#' @param data Numeric matrix of predictor variables (\strong{raw scale})
#' @param cluster_args Parameters for clustering
#' @param cluster_on_indicators Logical to include binary predictors
#' @param K Number of partitions minus 1 (\eqn{K})
#' @param parallel Logical to enable parallel processing
#' @param cl Cluster object for parallel computation
#' @param do_not_cluster_on_these Columns to exclude from clustering
#' @param neighbor_tolerance Scaling factor for neighbor detection
#' @param standardize Logical; whether to standardize data internally before
#'   clustering.  Should equal \code{standardize_predictors_for_knots} from
#'   \code{lgspline.fit}.  Default \code{TRUE}.
#' @param standardize_mode Character; \code{"auto"} (default) selects
#'   \code{"minmax"} for a single effective clustering dimension and
#'   \code{"normal"} for multiple dimensions.  Can be forced to either value.
#' @param dummy_adder Small constant added to numerator during standardization
#'   to avoid exact-zero values (matches \code{lgspline.fit}'s
#'   \code{dummy_adder}).  Default \code{0}.
#' @param dummy_dividor Small constant added to denominator during
#'   standardization to avoid division by zero (matches
#'   \code{lgspline.fit}'s \code{dummy_dividor}).  Default \code{0}.
#'
#' @return
#' A list containing:
#' \describe{
#'   \item{centers}{Cluster center coordinates on the \strong{raw} scale.}
#'   \item{knots}{Knot points between centers on the \strong{raw} scale.}
#'   \item{assign_partition}{Function that accepts \strong{raw}-scale new data
#'     and returns integer-like partition assignments (0.5, 1.5, \ldots).}
#'   \item{neighbors}{List of neighboring partition indices.}
#'   \item{standardize_transf}{The forward standardization function used
#'     internally (for diagnostic use only).}
#'   \item{standardize_inv_transf}{The inverse standardization function
#'     (for diagnostic use only).}
#'   \item{centers_std}{Cluster centers on the standardized scale (for
#'     diagnostic use only).}
#' }
#'
#' @keywords internal
#' @export
make_partitions <- function(data,
                            cluster_args,
                            cluster_on_indicators,
                            K,
                            parallel,
                            cl,
                            do_not_cluster_on_these,
                            neighbor_tolerance,
                            standardize      = TRUE,
                            standardize_mode = "auto",
                            dummy_adder      = 0,
                            dummy_dividor    = 0) {

  q <- ncol(data)

  ## If K is 0, return as is
  if(K == 0){
    return(list(
      centers                = matrix()[-1,,drop=FALSE],
      knots                  = matrix()[-1,,drop=FALSE],
      assign_partition       = function(x) 1,
      neighbors              = list(),
      standardize_transf     = function(X) X,
      standardize_inv_transf = function(X) X,
      centers_std            = matrix()[-1,,drop=FALSE]
    ))
  }

  ## Internal standardization for clustering
  #  Cluster on a standardized copy of the data so that variables measured on
  #  very different scales do not dominate the kmeans objective.  Centers and
  #  knots are back-transformed to raw scale before returning so that
  #  downstream consumers (constraint matrix, return_list, etc.) see raw-scale
  #  coordinates without any additional transformation.
  ##
  #  standardize_mode "auto": choose "minmax" when only one effective clustering
  #  dimension remains after zeroing excluded / binary columns, otherwise
  #  "normal".  Can be forced to "minmax" or "normal" by the caller.

  if(standardize_mode == "auto"){
    ## Count columns that will actually vary during clustering
    effective_dims <- q - length(do_not_cluster_on_these)
    if(!cluster_on_indicators){
      bin_check <- which(apply(data, 2, is_binary))
      active    <- setdiff(seq_len(q), do_not_cluster_on_these)
      effective_dims <- length(setdiff(active, bin_check))
    }
    standardize_mode <- if(effective_dims <= 1L) "minmax" else "normal"
  }

  if(standardize){
    if(standardize_mode == "minmax"){
      std_mins <- apply(data, 2, min)
      std_maxs <- apply(data, 2, max)
      std_transf <- function(X){
        for(j in seq_len(ncol(X))){
          X[,j] <- (X[,j] - std_mins[j] + dummy_adder) /
            (std_maxs[j] - std_mins[j] + dummy_dividor)
        }
        X
      }
      std_inv_transf <- function(X){
        for(j in seq_len(ncol(X))){
          X[,j] <- X[,j] * (std_maxs[j] - std_mins[j] + dummy_dividor) +
            std_mins[j] - dummy_adder
        }
        X
      }
    } else {
      ## Normal (z-score) standardization
      std_means <- apply(data, 2, mean)
      std_sds   <- apply(data, 2, function(x) tryCatch(sd(x), error = function(e) 1))
      std_transf <- function(X){
        for(j in seq_len(ncol(X))){
          X[,j] <- (X[,j] - std_means[j] + dummy_adder) /
            (std_sds[j] + dummy_dividor)
        }
        X
      }
      std_inv_transf <- function(X){
        for(j in seq_len(ncol(X))){
          X[,j] <- X[,j] * (std_sds[j] + dummy_dividor) +
            std_means[j] - dummy_adder
        }
        X
      }
    }
    data_std <- std_transf(data)
  } else {
    data_std       <- data
    std_transf     <- function(X) X
    std_inv_transf <- function(X) X
  }

  ## Zero out excluded / binary columns in the standardized copy   -
  #  We operate on data_std from here so that find_neighbors and kmeans work
  #  on the same scale.  The originals (data, data_std) are not modified.
  data_cluster <- data_std

  ## [Change 2026-03-14] Detect binary columns from the TRAINING data and
  #  store the indices for reuse in the assign_partition closure. Previously
  #  is_binary was re-evaluated on new data at prediction time, which caused
  #  small prediction sets (nrow <= 2) to misidentify continuous predictors
  #  as binary, zeroing them out and producing wrong partition assignments.
  if(!cluster_on_indicators){
    train_binary_cols <- which(apply(data_cluster, 2, is_binary))
    if(length(train_binary_cols) > 0){
      data_cluster[, train_binary_cols] <- 0
    }
  } else {
    train_binary_cols <- integer(0)
  }

  if(length(do_not_cluster_on_these) > 0){
    data_cluster[,do_not_cluster_on_these] <- 0
  }

  ## Compute CVT centers on the standardized scale       -
  #  Pass data_already_processed = TRUE so get_centers does not re-zero binary
  #  columns (we already did it above on data_cluster).
  #  Custom centers from cluster_args[[1]] are returned as-is; they are
  #  assumed to be on the STANDARDIZED scale when supplied via cluster_args
  #  (same assumption as before).  If the caller supplies raw-scale custom
  # centers they should pass standardize = FALSE.
  initial_points_std <- get_centers(data_cluster, K, cluster_args,
                                    cluster_on_indicators = TRUE,
                                    data_already_processed = TRUE)

  ## Re-order cluster indices by ascending L1-norm of their standardized
  #  center. This makes cluster 1 the "leftmost/smallest" and cluster K+1
  #  the "rightmost/largest", so that knot rownames like "1_2", "2_3" reflect
  #  actual spatial adjacency.  For 1-D this gives strict left-to-right order.
  #  For multi-D it gives a consistent, reproducible numbering that is
  #  insensitive to kmeans initialisation order.
  ## Re-index partitions so numbering follows the geometry more naturally
  #  and is reproducible across kmeans initialisations.
  l1_order <- order(apply(initial_points_std,
                          1,
                          function(r) sum(abs(r))))
  ## overwrite the initial_points_std above
  initial_points_std <- initial_points_std[l1_order, , drop = FALSE]

  ## Find neighboring partitions (on standardized scale)     -
  #  find_neighbors uses Euclidean distances; working on the standardized scale
  #  keeps comparisons numerically stable and scale-invariant.
  neighbors <- find_neighbors(initial_points_std,
                              parallel,
                              cl,
                              neighbor_tolerance)

  ## Compute knots as midpoints between neighboring centers
  find_knots <- function(pts, nbrs) {
    additional_points <- list()
    for (i in seq_along(nbrs)) {
      for (j in nbrs[[i]]) {
        if(j > i) next       ## avoid double-counting
        c1  <- pts[i,]
        c2  <- pts[j,]
        mid <- (c1 + c2) / 2
        additional_points[[length(additional_points) + 1]] <- mid
        names(additional_points)[length(additional_points)] <-
          paste0(i, '_', j)
      }
    }
    if(length(additional_points) > 0){
      return(do.call(rbind, additional_points))
    } else {
      return(matrix(nrow = 0, ncol = ncol(pts)))
    }
  }

  knots_std <- find_knots(initial_points_std, neighbors)

  ## Back-transform centers and knots to raw scale       -
  initial_points_raw <- std_inv_transf(initial_points_std)
  knots_raw <- if(nrow(knots_std) > 0) std_inv_transf(knots_std) else knots_std

  ## Partition-assignment closure
  #  The returned function accepts RAW-scale new data. It standardizes
  #  internally using the same transform fitted above, then uses nearest-
  #  neighbor lookup against the standardized centers.
  #
  #  Binary column mask (train_binary_cols) and excluded
  #  column mask (do_not_cluster_on_these) are captured from the training
  #  data at partition-creation time. Previously is_binary was re-evaluated
  #  on new_data, which misclassified continuous predictors as binary when
  #  the prediction set had <= 2 rows with distinct values.
  assign_partition <- function(new_data) {

    ## Standardize new data with the same transform used for training
    new_data_std <- std_transf(new_data)

    ## Apply the TRAINING-TIME column masks: binary cols and excluded cols
    #  Do NOT re-detect binary columns from new_data
    if(length(train_binary_cols) > 0){
      new_data_std[, train_binary_cols] <- 0
    }
    if(length(do_not_cluster_on_these) > 0){
      new_data_std[, do_not_cluster_on_these] <- 0
    }

    ## Batch the nearest-neighbor lookup so large prediction sets do not
    #  allocate one full distance matrix at once.
    bytes_per_row <- 8 * nrow(initial_points_std)
    batch_size    <- floor((0.5 * 1024^3) / bytes_per_row)
    total_rows    <- nrow(new_data_std)
    assignments   <- numeric(total_rows)

    for(i in seq(1, total_rows, batch_size)) {
      nd_idx <- min(i + batch_size - 1, total_rows)
      batch  <- new_data_std[i:nd_idx, , drop = FALSE]
      nn     <- FNN::get.knnx(initial_points_std, batch, k = 1)
      assignments[i:nd_idx] <- nn$nn.index
    }

    return(assignments - 0.5)
  }

  ## Return raw-scale centers/knots together with the assignment closure
  #  and the internal standardization objects used to build them.
  rownames(initial_points_raw) <- paste0("center_",
                                         seq_len(nrow(initial_points_raw)))
  return(list(
    centers                = initial_points_raw,   # raw scale
    knots                  = knots_raw,            # raw scale
    assign_partition       = assign_partition,     # Accepts raw scale input
    neighbors              = neighbors,
    standardize_transf     = std_transf,           # For diagnostics only
    standardize_inv_transf = std_inv_transf,       # For diagnostics only
    centers_std            = initial_points_std    # Standardized, for reference
  ))
}