# R/S3_operations.R
#
# Defines functions apply_function.HDF5Matrix apply_function reduce.HDF5Matrix reduce split.HDF5Matrix multiply_sparse.HDF5Matrix multiply_sparse
#
# Documented in apply_function multiply_sparse multiply_sparse.HDF5Matrix reduce split.HDF5Matrix

# S3_operations.R
#
# S3 generics and methods for:
#   multiply_sparse()   — sparse-aware matrix multiplication
#   split.HDF5Matrix()  — split dataset into blocks
#   reduce()            — reduce a group of datasets
#   apply_function()    — apply algebraic/statistical function to datasets
#
# Design notes:
#   - multiply_sparse():  new generic (no base R equivalent).
#   - split():            base::split(x, f, drop, ...) has "..."
#                         → no override needed.
#   - reduce():           new generic (no base R equivalent).
#   - apply_function():   new generic; name chosen to avoid collision with
#                         base::apply().


# ── multiply_sparse() — new generic ─────────────────────────────────────────

#' Sparse-aware matrix multiplication (generic)
#'
#' @description
#' Generic function for block-wise sparse matrix multiplication.
#' The method for \code{HDF5Matrix} computes \code{x \%*\% y} using the
#' BigDataStatMeth sparse multiplication algorithm, which skips all-zero
#' blocks and is more efficient when one or both matrices are highly sparse.
#'
#' @param x   An \code{HDF5Matrix}.
#' @param y   An \code{HDF5Matrix}. Must be in the same HDF5 file as \code{x}.
#' @param ... Additional arguments forwarded to the method.
#' @return A new \code{HDF5Matrix} containing the product.
#'
#' @examples
#' \donttest{
#' fn <- tempfile(fileext = ".h5")
#' hdf5_create_matrix(fn, "data/A", data = matrix(rnorm(100), 10, 10))
#' hdf5_create_matrix(fn, "data/B", data = matrix(rnorm(100), 10, 10))
#'
#' A <- hdf5_matrix(fn, "data/A")
#' B <- hdf5_matrix(fn, "data/B")
#' C <- multiply_sparse(A, B)
#' 
#' hdf5_close_all()
#' unlink(fn)
#' 
#' }
#'
#' @seealso \code{\link{multiply_sparse.HDF5Matrix}}
#'
#' @export
multiply_sparse <- function(x, y, ...) {
    # S3 generic: the method is selected from the class of `x`; `y` and any
    # further arguments are handed to that method unchanged.
    UseMethod("multiply_sparse")
}


#' Sparse-aware matrix multiplication for HDF5Matrix
#'
#' @description
#' Computes \code{x \%*\% y} block-wise using BigDataStatMeth's sparse
#' algorithm.
#'
#' @param x           An \code{HDF5Matrix}.
#' @param y           An \code{HDF5Matrix}. Same HDF5 file as \code{x}.
#' @param outgroup  Character or \code{NULL}. Output group in the HDF5 file.
#'   Default \code{"OUTPUT"}.
#' @param outdataset Character or \code{NULL}. Output dataset name.
#'   Default is constructed from the operation and input names.
#' @param block_size  Integer. Block size hint; -1 = auto (default).
#' @param mix_block   Integer. Memory block size for parallel path; -1 = auto.
#' @param paral       Logical or NULL.
#' @param threads     Integer or NULL.
#' @param compression Integer (0-9) or NULL.
#' @param ...         Ignored.
#' @return A new \code{HDF5Matrix}.
#'
#' @exportS3Method
multiply_sparse.HDF5Matrix <- function(x, y,
                                       outgroup = NULL, outdataset = NULL,
                                       block_size = -1L, mix_block = -1L,
                                       paral = NULL, threads = NULL,
                                       compression = NULL,
                                       ...) {
    # Thin S3 adapter: the work is done by the object's own
    # `$multiply_sparse()` method. `y` is passed positionally, every tuning
    # option is passed by name with the value received here, and anything
    # captured by `...` is deliberately not forwarded.
    x$multiply_sparse(
        y,
        outgroup    = outgroup,
        outdataset  = outdataset,
        block_size  = block_size,
        mix_block   = mix_block,
        paral       = paral,
        threads     = threads,
        compression = compression
    )
}


# ── split.HDF5Matrix ─────────────────────────────────────────────────────────

#' Split an HDF5Matrix into a list of blocks
#'
#' @description
#' S3 method of \code{base::split()} for \code{HDF5Matrix} objects.
#' Divides the matrix into blocks along rows (default) or columns.
#'
#' Provide exactly ONE of \code{n_blocks} or \code{block_size}.
#'
#' @param x           An \code{HDF5Matrix}.
#' @param f           Ignored (kept for S3 signature compatibility).
#' @param n_blocks    Integer. Number of (roughly equal) blocks; -1 = unused.
#' @param block_size  Integer. Max rows (or cols) per block; -1 = unused.
#' @param bycols      Logical. If \code{TRUE}, split by columns (default = by rows).
#' @param out_group   Character. HDF5 group for output blocks (default \code{"SPLIT"}).
#' @param out_dataset Character or NULL. Base dataset name.
#' @param overwrite   Logical. Overwrite existing blocks (default \code{FALSE}).
#' @param drop        Ignored (S3 compatibility).
#' @param ...         Ignored.
#' @return Named list of \code{HDF5Matrix} objects:
#'   \code{block_0}, \code{block_1}, …
#'
#' @examples
#' \donttest{
#' 
#' fn <- tempfile(fileext = ".h5")
#' X  <- hdf5_create_matrix(fn, "data/X", data = matrix(rnorm(2000000), 20000, 100))
#' 
#' X      <- hdf5_matrix(fn, "data/X")   # 20000 × 100
#' blocks <- split(X, n_blocks = 4)      # 4 row-blocks of ~5000 rows each
#'
#' hdf5_close_all()
#' unlink(fn)
#' }
#'
#' @exportS3Method
split.HDF5Matrix <- function(x, f = NULL, drop = FALSE,
                             n_blocks = -1L, block_size = -1L,
                             bycols = FALSE,
                             out_group = "SPLIT", out_dataset = NULL,
                             overwrite = FALSE,
                             ...) {
    # `f` and `drop` exist only so the signature lines up with
    # base::split(x, f, drop, ...); neither they nor `...` are forwarded.
    # Everything else goes by name to the object's own `$split()` method,
    # and its result is returned as-is.
    x$split(
        n_blocks    = n_blocks,
        block_size  = block_size,
        bycols      = bycols,
        out_group   = out_group,
        out_dataset = out_dataset,
        overwrite   = overwrite
    )
}


# ── reduce() — new generic ───────────────────────────────────────────────────

#' Reduce a group of HDF5 datasets by accumulation (generic)
#'
#' @description
#' Generic function for reducing (accumulating) all datasets in the same HDF5
#' group as \code{x} into a single dataset using a binary operation.
#'
#' @param x   An \code{HDF5Matrix}.
#' @param ... Additional arguments forwarded to the method.
#' @return A new \code{HDF5Matrix} containing the accumulated result.
#'
#' @examples
#' \donttest{
#' fn <- tempfile(fileext = ".h5")
#' 
#' # Create three matrices in the same group
#' hdf5_create_matrix(fn, "partials/chunk_0", data = matrix(1:100, 10, 10))
#' hdf5_create_matrix(fn, "partials/chunk_1", data = matrix(1:100, 10, 10))
#' hdf5_create_matrix(fn, "partials/chunk_2", data = matrix(1:100, 10, 10))
#' 
#' # Open one as entry point — reduce() operates on its whole group
#' partial <- hdf5_matrix(fn, "partials/chunk_0")
#' total   <- reduce(partial, func = "+")
#' dim(total)
#' 
#' hdf5_close_all()
#' unlink(fn)
#' }
#'
#' @seealso \code{\link{hdf5_reduce}} for the standalone group-level version.
#'
#' @export
reduce <- function(x, ...) {
    # S3 generic: the method is selected from the class of `x`; all further
    # arguments are handed to that method unchanged.
    UseMethod("reduce")
}


#' Reduce the datasets in an HDF5Matrix's group (S3 method)
#'
#' Thin adapter behind \code{reduce()}: the named arguments below are passed
#' unchanged to the object's own \code{x$reduce()} method and its result is
#' returned as-is.
#'
#' @param x            Object exposing a \code{$reduce()} method (an
#'   \code{HDF5Matrix}).
#' @param out_group    Forwarded as \code{out_group} (default
#'   \code{"REDUCED"}). Presumably the HDF5 group that receives the result;
#'   confirm against \code{x$reduce()}.
#' @param out_dataset  Forwarded as \code{out_dataset} (default
#'   \code{"reduced"}). Presumably the name of the result dataset; confirm
#'   against \code{x$reduce()}.
#' @param func         Forwarded as \code{func} (default \code{"+"}); the
#'   binary operation used for the accumulation.
#' @param overwrite    Forwarded as \code{overwrite} (default \code{FALSE}).
#' @param remove_input Forwarded as \code{remove_input} (default
#'   \code{FALSE}).
#' @param drop         Accepted but neither used nor forwarded.
#' @param ...          Ignored.
#' @return Whatever \code{x$reduce()} returns.
#' @noRd
#' @exportS3Method
reduce.HDF5Matrix <- function(x,
                              out_group = "REDUCED", out_dataset = "reduced",
                              func = "+",
                              overwrite = FALSE, remove_input = FALSE,
                              drop = FALSE,
                              ...) {
    # `drop` and `...` stop here; only the five options below reach the
    # object's method.
    x$reduce(
        out_group    = out_group,
        out_dataset  = out_dataset,
        func         = func,
        overwrite    = overwrite,
        remove_input = remove_input
    )
}


# ── apply_function() — new generic ──────────────────────────────────────────

#' Apply a statistical or algebraic function to HDF5 datasets (generic)
#'
#' @description
#' Generic function that applies one of BigDataStatMeth's algebraic or
#' statistical functions to a list of datasets in the same HDF5 group as
#' \code{x}.
#'
#' Valid \code{func} values: \code{"QR"}, \code{"CrossProd"},
#' \code{"tCrossProd"}, \code{"invChol"}, \code{"blockmult"},
#' \code{"CrossProd_double"}, \code{"tCrossProd_double"}, \code{"solve"},
#' \code{"normalize"}, \code{"sdmean"}, \code{"descChol"}.
#'
#' @param x   An \code{HDF5Matrix}.
#' @param ... Additional arguments forwarded to the method.
#' @return Named list with elements \code{filename}, \code{out_group},
#'   \code{func}, \code{datasets}.
#'
#' @examples
#' \donttest{
#' fn <- tempfile(fileext = ".h5")
#' 
#' # Create two datasets in the same group
#' hdf5_create_matrix(fn, "data/A", data = matrix(rnorm(50), 5, 10))
#' hdf5_create_matrix(fn, "data/B", data = matrix(rnorm(50), 5, 10))
#' 
#' # Apply CrossProd to all datasets in the group
#' X   <- hdf5_matrix(fn, "data/A")
#' res <- apply_function(X, func = "CrossProd", out_group = "RESULTS")
#' 
#' hdf5_close_all()
#' unlink(fn)
#' }
#'
#' @seealso \code{hdf5_apply}
#'
#' @export
apply_function <- function(x, ...) {
    # S3 generic: the method is selected from the class of `x`; all further
    # arguments are handed to that method unchanged.
    UseMethod("apply_function")
}


#' Apply a BigDataStatMeth function via an HDF5Matrix (S3 method)
#'
#' Thin adapter behind \code{apply_function()}: the named arguments below are
#' passed unchanged to the object's own \code{x$apply_function()} method and
#' its result is returned as-is.
#'
#' @param x           Object exposing an \code{$apply_function()} method (an
#'   \code{HDF5Matrix}).
#' @param datasets    Forwarded as \code{datasets} (default \code{NULL}).
#' @param func        Forwarded as \code{func} (default \code{"QR"}); name of
#'   the function to apply.
#' @param out_group   Forwarded as \code{out_group} (default
#'   \code{"APPLIED"}). Presumably the HDF5 group that receives the results;
#'   confirm against \code{x$apply_function()}.
#' @param b_group     Forwarded as \code{b_group} (default \code{""}).
#' @param b_datasets  Forwarded as \code{b_datasets} (default \code{NULL}).
#' @param overwrite   Forwarded as \code{overwrite} (default \code{FALSE}).
#' @param transp_a    Forwarded as \code{transp_a} (default \code{FALSE}).
#' @param transp_b    Forwarded as \code{transp_b} (default \code{FALSE}).
#' @param full_matrix Forwarded as \code{full_matrix} (default \code{FALSE}).
#' @param byrows      Forwarded as \code{byrows} (default \code{FALSE}).
#' @param threads     Forwarded as \code{threads} (default \code{NULL}).
#' @param ...         Ignored.
#' @return Whatever \code{x$apply_function()} returns.
#' @noRd
#' @exportS3Method
apply_function.HDF5Matrix <- function(x,
                                      datasets = NULL, func = "QR",
                                      out_group = "APPLIED",
                                      b_group = "", b_datasets = NULL,
                                      overwrite = FALSE,
                                      transp_a = FALSE, transp_b = FALSE,
                                      full_matrix = FALSE, byrows = FALSE,
                                      threads = NULL,
                                      ...) {
    # Anything captured by `...` stops here; only the named options below
    # reach the object's method.
    x$apply_function(
        datasets    = datasets,
        func        = func,
        out_group   = out_group,
        b_group     = b_group,
        b_datasets  = b_datasets,
        overwrite   = overwrite,
        transp_a    = transp_a,
        transp_b    = transp_b,
        full_matrix = full_matrix,
        byrows      = byrows,
        threads     = threads
    )
}

# Try the BigDataStatMeth package in your browser
#
# Any scripts or data that you put into this service are public.
#
# BigDataStatMeth documentation built on May 15, 2026, 1:07 a.m.