R/diss_correlation.R
In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics

Documented in diss_correlation

# =============================================================================
# Correlation dissimilarity — aligned with shared constructor pattern
# =============================================================================
# Public:   diss_correlation()   — method constructor
# Internal: .cor_diss_compute()  — raw computation, not exported


#' @title Correlation dissimilarity method constructor
#'
#' @description
#' Creates a configuration object that fully specifies a correlation (or moving
#' correlation) dissimilarity method. Pass the result to \code{dissimilarity()}
#' to compute the dissimilarity matrix.
#'
#' @param ws Either \code{NULL} (default) or an odd integer greater than 2.
#'   When \code{NULL}, standard Pearson correlation dissimilarity is used.
#'   When an odd integer is provided, a moving (rolling) correlation
#'   dissimilarity is computed using a window of that size.
#' @param center Logical. Should the data be mean-centered before computing
#'   dissimilarities? Centering is applied jointly to \code{Xr} and \code{Xu}
#'   (if provided) based on their combined column means. Default is \code{TRUE}.
#' @param scale Logical. Should the data be scaled (divided by column standard
#'   deviations) before computing dissimilarities? Scaling is applied jointly
#'   to \code{Xr} and \code{Xu} (if provided). Default is \code{FALSE}.
#'
#' @return An object of class \code{c("diss_correlation", "diss_method")} — a
#'   list holding the validated method parameters. Intended to be passed to
#'   \code{dissimilarity()}, not used directly.
#'
#' @details
#' The correlation dissimilarity between two observations \eqn{x_i} and
#' \eqn{x_j} is:
#'
#' \deqn{d(x_i, x_j) = \frac{1}{2}(1 - \rho(x_i, x_j))}
#'
#' where \eqn{\rho} is the Pearson correlation coefficient. This is used when
#' \code{ws = NULL}.
#'
#' When \code{ws} is specified, the moving correlation dissimilarity is:
#'
#' \deqn{d(x_i, x_j; ws) = \frac{1}{2\,ws} \sum_{k=1}^{p - ws}
#'   \bigl(1 - \rho(x_{i,(k:k+ws)},\, x_{j,(k:k+ws)})\bigr)}
#'
#' where \eqn{ws} is the window size and \eqn{p} is the number of variables.
#'
#' @section Parallel execution:
#' The underlying C++ implementation uses OpenMP for parallel computation.
#' Thread count is controlled by the \code{OMP_NUM_THREADS} environment
#' variable. To limit threads (e.g., when calling from within a parallel
#' backend):
#' \preformatted{
#' Sys.setenv(OMP_NUM_THREADS = 1)
#' }
#'
#' @author \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez}
#'
#' @seealso \code{\link{dissimilarity}}, \code{\link{diss_euclidean}},
#'   \code{\link{diss_mahalanobis}}, \code{\link{diss_cosine}}
#'
#' @examples
#' # Standard correlation dissimilarity
#' m <- diss_correlation()
#'
#' # Moving correlation with window size 41
#' m <- diss_correlation(ws = 41)
#'
#' # Without centering
#' m <- diss_correlation(center = FALSE)
#'
#' @export
diss_correlation <- function(
    ws     = NULL,
    center = TRUE,
    scale  = FALSE
) {
  
  # --- validate ws -----------------------------------------------------------
  if (!is.null(ws)) {
    if (!is.numeric(ws) || length(ws) != 1L) {
      stop("'ws' must be a single integer value or NULL.")
    }
    ws <- as.integer(ws)
    if (ws < 3L) {
      stop("'ws' must be an odd integer greater than 2.")
    }
    if (ws %% 2L == 0L) {
      stop("'ws' must be an odd integer (e.g. 3, 5, 11, 41, ...).")
    }
  }
  
  # --- delegate center/scale validation and object construction --------------
  .new_diss_method(
    "correlation",
    center = center,
    scale  = scale,
    extra  = list(ws = ws)
  )
}


# --- print method ------------------------------------------------------------
#' @export
print.diss_correlation <- function(x, ...) {
  ws_label <- if (is.null(x$ws)) "full (no moving window)" else x$ws
  cat(
    "Dissimilarity method: correlation\n",
    "  window size (ws) :", ws_label, "\n",
    "  center           :", x$center, "\n",
    "  scale            :", x$scale, "\n"
  )
  invisible(x)
}


# =============================================================================
# Internal compute function — NOT exported
# =============================================================================
#
# Called by dissimilarity() after data-level validation has been done there.
# Assumes:
#   - Xr and Xu are numeric matrices with no NAs
#   - ncol(Xu) == ncol(Xr) if Xu is provided
#   - method$ws < ncol(Xr) if ws is not NULL (checked in dissimilarity())
#
.cor_diss_compute <- function(Xr, Xu = NULL, method) {
  
  stopifnot(inherits(method, "diss_correlation"))
  
  ws     <- method$ws
  center <- method$center
  scale  <- method$scale
  
  # --- preprocessing ---------------------------------------------------------
  if (center || scale) {
    X <- rbind(Xr, Xu)
    
    if (center) {
      X <- sweep(X, MARGIN = 2, STATS = colMeans(X), FUN = "-")
    }
    if (scale) {
      col_sds <- get_col_sds(X)
      if (any(col_sds == 0)) {
        stop("Cannot scale: one or more columns have zero standard deviation.")
      }
      X <- sweep(X, MARGIN = 2, STATS = col_sds, FUN = "/")
    }
    
    if (!is.null(Xu)) {
      n_xu <- nrow(Xu)
      Xu   <- X[(nrow(X) - n_xu + 1L):nrow(X), , drop = FALSE]
      Xr   <- X[1L:(nrow(X) - n_xu), , drop = FALSE]
    } else {
      Xr <- X
    }
    rm(X)
  }
  
  # --- zero-sd guard (row-level for correlation) -----------------------------
  xr_row_sds <- get_col_sds(t(Xr))
  if (any(xr_row_sds == 0)) {
    stop(
      sprintf(
        "Correlation cannot be computed: Xr contains %d observation(s) with zero variance.",
        sum(xr_row_sds == 0)
      )
    )
  }
  
  if (!is.null(Xu)) {
    xu_row_sds <- get_col_sds(t(Xu))
    if (any(xu_row_sds == 0)) {
      stop(
        sprintf(
          "Correlation cannot be computed: Xu contains %d observation(s) with zero variance.",
          sum(xu_row_sds == 0)
        )
      )
    }
  }
  
  # --- resolve full-correlation window size ----------------------------------
  if (is.null(ws)) {
    ws <- ncol(Xr)
  }
  
  # --- dispatch to C++ backend -----------------------------------------------
  if (!is.null(Xu)) {
    result <- moving_cor_diss_xy(
      Xu, Xr, ws,
      compute_block_rows(dim(Xu)),
      compute_block_rows(dim(Xr))
    )
    rownames(result) <- paste("Xr", seq_len(nrow(Xr)), sep = "_")
    colnames(result) <- paste("Xu", seq_len(nrow(Xu)), sep = "_")
  } else {
    result <- moving_cor_diss_self_f64(Xr, ws, compute_block_rows(dim(Xr)))
    rownames(result) <- colnames(result) <- paste("Xr", seq_len(nrow(Xr)), sep = "_")
  }
  
  result
}


#' Heuristic for cache-aware tile height (block_rows)
#'
#' @description
#' Choose a tile height for tiled rolling-window correlation/distance kernels,
#' balancing per-tile cache footprint against matrix width and available
#' threads. Cross-platform (macOS, Linux, Windows).
#'
#' @details
#' Let \eqn{b} be the tile height and \eqn{T} the number of columns.
#' The working set per tile (in elements) is approximated as:
#'
#' \deqn{M(b) \approx 3 b^2 + 2 b T} if \code{include_squares = TRUE}
#' (materialised \eqn{X_i^2}, \eqn{X_j^2}), and
#' \deqn{M(b) \approx 3 b^2} otherwise.
#'
#' A per-thread budget \eqn{K} (elements) is obtained from \code{target_mb},
#' divided by the effective OpenMP thread count. The quadratic bound yields
#' \deqn{b_T = \frac{-T + \sqrt{T^2 + 3K}}{3}}
#' (or \eqn{b_T = \sqrt{K/3}} when \code{include_squares = FALSE}).
#'
#' The final choice is
#' \code{block_rows = round_to_64( min( max(64, m / min_tiles), b_T ) )},
#' clamped to \code{[64, min(max_block_cap, m)]}.
#'
#' If OpenMP is unavailable (\code{capabilities("openmp") == FALSE}),
#' the budget assumes a single thread.
#'
#' @param dimensions Integer vector of length two \code{c(m, T)}:
#'   number of rows and columns of \eqn{X}.
#' @param include_squares Logical. If \code{TRUE}, assumes the kernel
#'   materialises \code{Xi_sq} and \code{Xj_sq}; if \code{FALSE}, uses the
#'   lighter memory model.
#' @param target_mb Numeric. Target MiB of cache to devote per thread to one
#'   tile. Default is 8.
#' @param min_tiles Integer. Aim for at least this many tiles along the
#'   row dimension (default \code{12L}).
#' @param max_block_cap Integer. Hard upper bound for \code{block_rows}
#'   (default \code{1024L}).
#'
#' @return Integer scalar, the recommended \code{block_rows} (multiple of 64).
#'
#' @section Thread detection:
#' Thread count is determined by:
#' \enumerate{
#'   \item \code{OMP_NUM_THREADS} environment variable (if set)
#'   \item \code{parallel::detectCores(logical = FALSE)} (physical cores)
#'   \item Falls back to 1 if detection fails or OpenMP is unavailable
#' }
#'
#' @keywords internal
#' @noRd
compute_block_rows <- function(
    dimensions,
    include_squares = TRUE,
    target_mb       = 8,
    min_tiles       = 12L,
    max_block_cap   = 1024L
) {
  mm <- as.integer(dimensions[1])
  tt <- as.integer(dimensions[2])
  
  max_block <- min(max_block_cap, mm)
  if (max_block < 64L) return(max_block)
  
  # --- detect effective thread count -----------------------------------------
  threads <- suppressWarnings(as.integer(Sys.getenv("OMP_NUM_THREADS", NA)))
  if (is.na(threads) || threads < 1L) {
    threads <- parallel::detectCores(logical = FALSE)
    if (is.na(threads) || threads < 1L) threads <- 1L
  }
  if (!isTRUE(capabilities("openmp"))) threads <- 1L
  
  # --- per-thread element budget (double -> 8 bytes) -------------------------
  elt_size <- 8
  per_thread_mb <- max(8, as.numeric(target_mb) / threads)
  K <- (per_thread_mb * 1024^2) / elt_size
  
  # --- working-set model bound -----------------------------------------------
  b_T <- if (isTRUE(include_squares)) {
    (-tt + sqrt(tt * tt + 3 * K)) / 3
  } else {
    sqrt(K / 3)
  }
  
  # --- ensure enough tiles along rows ----------------------------------------
  b_m   <- mm / as.numeric(min_tiles)
  b_raw <- min(max(64, b_m), b_T)
  
  block <- round_to_64(b_raw)
  block <- max(64L, min(as.integer(block), max_block))
  block
}


#' Round to nearest multiple of 64
#' @param x Numeric value to round.
#' @param mult Integer multiple (default 64).
#' @return Integer rounded to nearest multiple.
#' @keywords internal
#' @noRd
round_to_64 <- function(x, mult = 64L) {
  as.integer(mult * round(x / mult))
}

Any scripts or data that you put into this service are public.

resemble documentation built on April 21, 2026, 1:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

resemble
Similarity Retrieval and Local Learning for Spectral Chemometrics

R/diss_correlation.R
In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics

Defines functions round_to_64 compute_block_rows .cor_diss_compute print.diss_correlation diss_correlation

Documented in diss_correlation

Try the resemble package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

resemble Similarity Retrieval and Local Learning for Spectral Chemometrics

R/diss_correlation.R In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics

Defines functions round_to_64 compute_block_rows .cor_diss_compute print.diss_correlation diss_correlation

Documented in diss_correlation

Try the resemble package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

resemble
Similarity Retrieval and Local Learning for Spectral Chemometrics

R/diss_correlation.R
In resemble: Similarity Retrieval and Local Learning for Spectral Chemometrics