R/S3_decompositions.R

Defines functions print.HDF5PCA prcomp.HDF5Matrix svd.HDF5Matrix svd.default svd

Documented in prcomp.HDF5Matrix print.HDF5PCA svd svd.HDF5Matrix

# S3_decompositions.R
#
# S3 dispatch for svd() and prcomp() on HDF5Matrix objects.
#
# Both svd() and prcomp() are proper S3 generics in base R / stats,
# so HDF5Matrix methods are registered through the standard mechanism
# without any masking of the base generic.
#
#   svd() -> base::svd    (UseMethod in base)
#   prcomp() -> stats::prcomp (UseMethod in stats)
#
# User-facing interface follows base R conventions:
#   svd(X, nu, nv, ...)  -> list(d, u, v) where d is numeric, u/v are HDF5Matrix
#   prcomp(X, retx, center, scale., ...) -> HDF5PCA object
#
# NOTE: nu and nv (number of left/right vectors to return) are passed
# as nev = max(nu, nv) to the underlying block-wise algorithm.  The
# caller can then close unwanted matrices if needed.


# ---------------------------------------------------------------------------
# Generic override  (base::cor has no UseMethod -> must mask)
# ---------------------------------------------------------------------------


# svd() override - base::svd() has no '...' so extra args fail before S3 dispatch.
# We create our own generic that routes HDF5Matrix to svd.HDF5Matrix and
# everything else to base::svd().
# #' @export
# svd <- function(x, nu = min(dim(x)), nv = min(dim(x)), ...) UseMethod("svd")

#' Singular Value Decomposition (generic)
#'
#' @description
#' S3 generic for \code{svd()}. Dispatches to \code{\link{svd.HDF5Matrix}}
#' for \code{HDF5Matrix} objects, and to \code{base::svd()} for all others.
#'
#' @param x A matrix or \code{HDF5Matrix} object.
#' @param nu Number of left singular vectors to compute.
#' @param nv Number of right singular vectors to compute.
#' @param ... Additional arguments passed to the method.
#' @return Named list with components \code{d}, \code{u}, \code{v}.
#' @name svd
#' @rdname svd
#' @export
svd <- function(x, nu = min(dim(x)), nv = min(dim(x)), ...) UseMethod("svd")


#' @exportS3Method
svd.default <- function(x, nu = min(dim(x)), nv = min(dim(x)), ...) {
    base::svd(x, nu = nu, nv = nv)
}


# -- svd.HDF5Matrix ------------------------------------------------------------

#' Singular Value Decomposition of an HDF5Matrix
#'
#' Block-wise SVD entirely on disk.  The matrix \code{x} is decomposed
#' into \code{x = u \%*\% diag(d) \%*\% t(v)}.
#'
#' Singular values \code{d} are loaded into a plain numeric vector
#' (they are always small: at most \code{min(nrow(x), ncol(x))} values).
#' \code{u} and \code{v} are returned as \code{HDF5Matrix} objects.
#'
#' @param x   An \code{HDF5Matrix} object.
#' @param nu  Number of left  singular vectors to compute (default = \code{min(dim(x))}).
#' @param nv  Number of right singular vectors to compute (default = \code{min(dim(x))}).
#' @param center Logical.  Center columns before decomposition (default \code{TRUE}).
#' @param scale  Logical.  Scale  columns before decomposition (default \code{TRUE}).
#' @param k      Number of local SVDs per incremental level (default 2).
#' @param q      Number of incremental levels (default 1).
#' @param method Computation method: \code{"auto"} (default), \code{"blocks"},
#'   or \code{"full"}.
#' @param rankthreshold Numeric in \code{[0, 0.1]}.  Rank approximation
#'   threshold (default 0).
#' @param overwrite Logical.  Overwrite existing SVD results (default \code{FALSE}).
#' @param threads Integer.  OpenMP threads (\code{-1} = auto-detect).
#' @param ... Ignored (S3 compatibility).
#'
#' @return Named list with:
#'   \describe{
#'     \item{\code{d}}{Numeric vector of non-negative singular values, decreasing.}
#'     \item{\code{u}}{HDF5Matrix of left  singular vectors, \code{nrow(x) x nu}.}
#'     \item{\code{v}}{HDF5Matrix of right singular vectors, \code{ncol(x) x nv}.}
#'   }
#'
#' @examples
#' \donttest{
#' tmp <- tempfile(fileext = ".h5")
#' X   <- hdf5_create_matrix(tmp, "data/M", data = matrix(rnorm(500), 50, 10))
#' res <- svd(X)
#' length(res$d)   # 10  (min(50,10))
#' dim(res$u)      # 50 x 10
#' dim(res$v)      # 10 x 10
#' X$close()
#' res$u$close(); res$v$close()
#' unlink(tmp)
#' }
#'
#' @export
svd.HDF5Matrix <- function(x,
                            nu     = min(dim(x)),
                            nv     = min(dim(x)),
                            center = TRUE,
                            scale  = TRUE,
                            k      = 2L,
                            q      = 1L,
                            method = "auto",
                            rankthreshold = 0.0,
                            overwrite     = FALSE,
                            threads       = -1L,
                            ...) {
    if (!x$is_valid()) stop("HDF5Matrix is closed or invalid")

    nu_int <- as.integer(nu)
    nv_int <- as.integer(nv)

    # NOTE: nev controls per-block truncation in the block-SVD algorithm.
    # Passing max(nu,nv) when the user requests fewer than full rank keeps
    # B_i = U_i * d_i compact, reducing the cost of the final joined SVD.
    # Pass nev to the block algorithm so per-block U matrices are truncated
    # to the requested rank, reducing the cost of the final joined SVD.
    # nev=0 means full rank (no truncation) — only truncate when the user
    # explicitly requests fewer vectors than the full decomposition.
    nev_blk <- if (max(nu_int, nv_int) < min(dim(x))) as.integer(max(nu_int, nv_int)) else 0L
    res <- x$svd(
        k             = k,
        q             = q,
        nev           = nev_blk,
        ##..## nev           = 0L,   # 0 = all; nev as block param is unused in C++
        center        = center,
        scale         = scale,
        method        = method,
        rankthreshold = rankthreshold,
        overwrite     = overwrite,
        threads       = threads
    )

    # -- Truncate d ------------------------------------------------------------
    k_out <- max(nu_int, nv_int)
    d_full <- res$d
    if (k_out > 0L && k_out < length(d_full))
        res$d <- d_full[seq_len(k_out)]

    # -- Truncate U to nu columns ----------------------------------------------
    # u is a HDF5Matrix of shape nrow(x) x r where r = min(dim(x)).
    # When nu < r, select only the first nu columns via the existing [r,c] method.
    if (nu_int > 0L && nu_int < ncol(res$u)) {
        u_mat  <- res$u[, seq_len(nu_int)]    # reads into memory (nu cols only)
        u_path <- paste0(res$u$get_group(), "/", res$u$get_dataset(), "_nu")
        try(close(res$u), silent = TRUE)
        res$u  <- hdf5_create_matrix(x$get_filename(), u_path,
                                     data = u_mat, overwrite = TRUE)
    }

    # -- Truncate V to nv columns ----------------------------------------------
    if (nv_int > 0L && nv_int < ncol(res$v)) {
        v_mat  <- res$v[, seq_len(nv_int)]
        v_path <- paste0(res$v$get_group(), "/", res$v$get_dataset(), "_nv")
        try(close(res$v), silent = TRUE)
        res$v  <- hdf5_create_matrix(x$get_filename(), v_path,
                                     data = v_mat, overwrite = TRUE)
    }

    res
}


# -- prcomp.HDF5Matrix ---------------------------------------------------------

#' Principal Component Analysis of an HDF5Matrix
#'
#' Block-wise PCA entirely on disk, equivalent to \code{\link{prcomp}()}.
#' Implements the same interface as \code{stats::prcomp()} but operates on
#' data stored in an HDF5 file without loading it into RAM.
#'
#' @param x        An \code{HDF5Matrix} object.
#' @param retx     Logical.  If \code{TRUE} (default) return the individual
#'   coordinates (\code{x} slot).  If \code{FALSE} the \code{x} slot is
#'   \code{NULL} in the returned object.
#' @param center   Logical.  Subtract column means before PCA (default \code{TRUE}).
#' @param scale.   Logical.  Divide by column SDs before PCA (default \code{FALSE}).
#' @param tol      Ignored (present for interface compatibility with \code{prcomp()}).
#' @param rank. Ignored. Present for compatibility with \code{stats::prcomp}.
#' @param ncomponents Integer.  Number of PCs to compute (0 = all, default).
#' @param k        Number of local SVDs per incremental level (default 2).
#' @param q        Number of incremental levels (default 1).
#' @param method   Computation method: \code{"auto"} (default), \code{"blocks"},
#'   or \code{"full"}.
#' @param rankthreshold Numeric in \code{[0, 0.1]}.  Rank approximation threshold.
#' @param svdgroup HDF5 group for intermediate SVD storage (default \code{"SVD/"}).
#' @param overwrite Logical.  Recompute even if PCA results exist (default \code{FALSE}).
#' @param threads  Integer.  OpenMP threads (\code{-1} = auto-detect).
#' @param ... Ignored (S3 compatibility).
#'
#' @return An object of class \code{c("HDF5PCA", "list")} with elements:
#'   \describe{
#'     \item{\code{sdev}}{Numeric vector.  Standard deviations of the PCs.}
#'     \item{\code{rotation}}{HDF5Matrix.  Variable loadings (rotation matrix).}
#'     \item{\code{x}}{HDF5Matrix or \code{NULL}.  Individual coordinates.}
#'     \item{\code{center}}{Logical.  Whether columns were centered.}
#'     \item{\code{scale}}{Logical.  Whether columns were scaled.}
#'     \item{\code{cumvar}}{Numeric vector.  Cumulative variance explained (percent).}
#'     \item{\code{lambda}}{Numeric vector.  Eigenvalues.}
#'     \item{\code{var.cos2}}{HDF5Matrix.  Squared cosines for variables.}
#'     \item{\code{ind.cos2}}{HDF5Matrix.  Squared cosines for individuals.}
#'     \item{\code{ind.contrib}}{HDF5Matrix.  Contributions of individuals to PCs.}
#'     \item{\code{file}}{Character.  Path to the HDF5 file with all results.}
#'   }
#'
#' @examples
#' \donttest{
#' tmp <- tempfile(fileext = ".h5")
#' X   <- hdf5_create_matrix(tmp, "data/M", data = matrix(rnorm(1000), 100, 10))
#' pca <- prcomp(X, center = TRUE, scale. = FALSE)
#' cat("Variance explained (PC1-3):", pca$cumvar[1:3], "\n")
#' dim(pca$rotation)   # 10 x nPC
#' dim(pca$x)          # 100 x nPC
#' hdf5_close_all()
#' unlink(tmp)
#' }
#'
#' @exportS3Method stats::prcomp HDF5Matrix
prcomp.HDF5Matrix <- function(x,
                               retx          = TRUE,
                               center        = TRUE,
                              
                               scale.        = FALSE,
                               tol           = NULL,
                               rank.         = NULL,       # base R alias -> ncomponents
                               ncomponents   = 0L,
                               k             = 2L,
                               q             = 1L,
                               method        = "auto",
                               rankthreshold = 0.0,
                               svdgroup      = "SVD/",
                               overwrite     = FALSE,
                               threads       = -1L,
                               ...) {
    if (!x$is_valid()) stop("HDF5Matrix is closed or invalid")

    # rank. (base R prcomp convention) takes precedence over ncomponents
    if (!is.null(rank.))
        ncomponents <- as.integer(rank.)

    result <- x$pca(
        ncomponents   = ncomponents,
        center        = center,
        scale         = scale.,
        k             = k,
        q             = q,
        method        = method,
        rankthreshold = rankthreshold,
        svdgroup      = svdgroup,
        overwrite     = overwrite,
        threads       = threads
    )

    if (!isTRUE(retx)) result$x <- NULL

    result
}


# -- print method for HDF5PCA --------------------------------------------------

#' Print method for HDF5PCA objects
#'
#' @param x   An \code{HDF5PCA} object returned by \code{prcomp()} or \code{$pca()}.
#' @param ... Ignored.
#' @exportS3Method base::print HDF5PCA
#' @importFrom utils head
print.HDF5PCA <- function(x, ...) {
    cat("HDF5PCA - Principal Component Analysis (on-disk)\n")
    cat(sprintf("  File     : %s\n", x$file))
    cat(sprintf("  PCs      : %d\n", length(x$sdev)))
    cat(sprintf("  center   : %s  |  scale: %s\n",
                x$center, x$scale))
    cat(sprintf("  sdev[1:3]: %s\n",
                paste(round(head(x$sdev, 3), 4), collapse = ", ")))
    if (!is.null(x$cumvar))
        cat(sprintf("  cumvar%%  : %s ...\n",
                    paste(round(head(x$cumvar, 3), 2), collapse = ", ")))
    if (!is.null(x$rotation) && inherits(x$rotation, "HDF5Matrix"))
        cat(sprintf("  rotation : %d x %d  [HDF5Matrix]\n",
                    dim(x$rotation)[1], dim(x$rotation)[2]))
    if (!is.null(x$x) && inherits(x$x, "HDF5Matrix"))
        cat(sprintf("  x (ind.) : %d x %d  [HDF5Matrix]\n",
                    dim(x$x)[1], dim(x$x)[2]))
    invisible(x)
}

Try the BigDataStatMeth package in your browser

Any scripts or data that you put into this service are public.

BigDataStatMeth documentation built on May 15, 2026, 1:07 a.m.