R/ruv.R
In CAESAR.Suite: CAESAR: a Cross-Technology and Cross-Resolution Framework for Spatial Omics Annotation

Documented in CAESAR.RUV

get_indexList <- function(alist) {
    nsample <- length(alist)
    nr <- 0
    indexList <- list()
    for (i in 1:nsample) {
        indexList[[i]] <- (nr + 1):(nrow(alist[[i]]) + nr)
        nr <- nr + nrow(alist[[i]])
    }
    return(indexList)
}

mat2list <- function(z_int, nvec) {
    zList_int <- list()
    istart <- 1
    for (i in 1:length(nvec)) {
        zList_int[[i]] <- z_int[istart:sum(nvec[1:i]), ]
        istart <- istart + nvec[i]
    }
    return(zList_int)
}

matlist2mat <- function(XList) {
    # transfer a matrix list to a matrix stacked by rows.
    r_max <- length(XList)
    X0 <- XList[[1]]
    if (is.null(dim(X0)) || any(dim(X0) == 1)) {
        if (r_max > 1) {
            for (r in 2:r_max) {
                X0 <- c(X0, XList[[r]])
            }
        }

        return(matrix(X0, ncol = 1))
    }
    if (r_max > 1) {
        for (r in 2:r_max) {
            X0 <- rbind(X0, XList[[r]])
        }
    }

    return(X0)
}

vec2list <- function(y_int, nvec) {
    if (length(y_int) != sum(nvec)) stop("vec2list: Check the argument: nvec!")

    yList_int <- list()
    istart <- 1
    for (i in 1:length(nvec)) {
        yList_int[[i]] <- y_int[istart:sum(nvec[1:i])]
        istart <- istart + nvec[i]
    }
    return(yList_int)
}

dfList2df <- function(dfList) {
    df <- dfList[[1]]
    r_max <- length(dfList)
    if (r_max > 1) {
        for (r in 2:r_max) {
            df <- rbind(df, dfList[[r]])
        }
    }

    return(df)
}

get_correct_exp <- function(XList, RfList, houseKeep, covariateList = NULL, q_unwanted = 10, subsample_rate = NULL, sample_seed = 1) {
    if (!all(sapply(XList, is.matrix))) {
        XList <- lapply(XList, as.matrix)
    }
    nvec <- sapply(XList, nrow)
    XList_sub <- pbapply::pblapply(XList, function(x) x[, houseKeep])
    M0 <- wpca(matlist2mat(XList_sub), q = q_unwanted, FALSE)$PCs
    HList <- mat2list(M0, nvec = nvec)

    Rf <- matlist2mat(RfList)
    colnames(Rf) <- paste0("Rf", 1:ncol(Rf))
    if (!is.null(covariateList)) {
        # covariates <- matlist2mat(covariateList)
        # covariates <- as.matrix(covariates)
        covarites_df <- dfList2df(covariateList)
        covariates <- model.matrix.lm(object = ~ . + 1, data = covarites_df, na.action = "na.pass")
        rm(covariateList, covarites_df)
        Rf <- cbind(Rf, covariates[, -1])
        rm(covariates)
    }
    rm(RfList)

    # subsampling to speed up the computation!
    if (is.null(subsample_rate)) subsample_rate <- 1
    index_List <- get_indexList(XList)
    set.seed(sample_seed)
    index_subsample <- sort(sample(sum(nvec), floor(sum(nvec) * subsample_rate)))
    ## calculate the number of indices belonging to the index of each slide
    nvec_subsample <- rep(NA, length(nvec))
    for (i in 1:length(nvec_subsample)) {
        ## message("i = ", i)
        nvec_subsample[i] <- sum(index_subsample %in% index_List[[i]])
    }
    index_List_new <- lapply(XList, function(x) 1:nrow(x))
    index_subsample_new <- unlist(index_List_new)[index_subsample]
    index_subsampleList <- vec2list(index_subsample_new, nvec_subsample)
    RfList <- mat2list(Rf, nvec = nvec)

    XList_sub <- list()
    RList_sub <- list()
    HList_sub <- list()
    AdjList_sub <- list()
    for (i in 1:length(XList)) {
        # message("i = ", i)
        index_tmp <- index_subsampleList[[i]]
        XList_sub[[i]] <- XList[[i]][index_tmp, ]
        RList_sub[[i]] <- RfList[[i]][index_tmp, ]
        HList_sub[[i]] <- HList[[i]][index_tmp, ]
    }
    rm(RfList, HList, index_tmp)


    ### XList <-  lapply(XList, scale, scale=FALSE)
    X_sub <- matlist2mat(XList_sub)
    rm(XList_sub)
    Rf <- matlist2mat(RList_sub)
    colnames(Rf) <- NULL
    H <- matlist2mat(HList_sub)
    colnames(H) <- NULL
    nc_M0 <- ncol(H)
    lm1 <- lm(X_sub ~ 0 + cbind(H, Rf))
    coefmat <- coef(lm1)[c(1:nc_M0), ]
    # row.names(coef(lm1))
    rm(lm1, X_sub)
    hX <- matlist2mat(XList) - M0 %*% coefmat
    return(hX)
}


get_sampleID <- function(XList) {
    sampleID <- list()
    r_max <- length(XList)
    for (r in 1:r_max) {
        sampleID[[r]] <- rep(r, nrow(XList[[r]]))
    }
    sampleID <- unlist(sampleID)
    return(sampleID)
}




#' Perform Batch Correction and Integration with CAESAR Using Housekeeping Genes
#'
#' @description
#' This function performs batch correction and integration of multiple Seurat objects using housekeeping genes and distance matrices. It supports human and mouse data, and can optionally use custom housekeeping genes provided by the user.
#'
#' @param seuList A list of Seurat objects to be integrated.
#' @param distList A list of distance matrices corresponding to each Seurat object in `seuList`.
#' @param verbose Logical, indicating whether to display progress messages. Default is \code{FALSE}.
#' @param species A character string specifying the species, either "human" or "mouse". Default is "human".
#' @param custom_housekeep A character vector of custom housekeeping genes. If \code{NULL}, default housekeeping genes for the species are used. Default is \code{NULL}.
#'
#' @return A Seurat object that contains the integrated and batch-corrected data in a new assay called "CAESAR".
#'
#' @examples
#' data(toydata)
#' 
#' seu <- toydata$seu
#' markers <- toydata$markers
#'
#' seu <- ProFAST::pdistance(seu, reduction = "caesar")
#'
#' marker.freq <- markerList2mat(list(markers))
#' anno_res <- CAESAR.annotation(seu, marker.freq, cal.confidence = FALSE, cal.proportions = FALSE)
#'
#' seuList <- list(seu, seu)
#' distList <- list(anno_res$ave.dist, anno_res$ave.dist)
#' seuInt <- CAESAR.RUV(seuList, distList, species = "human", verbose = TRUE)
#'
#' @importFrom Seurat CreateSeuratObject DefaultAssay GetAssayData VariableFeatures SetAssayData
#' @importFrom Matrix t sparseMatrix
#' @importFrom stats model.matrix.lm lm coef
#' @export
CAESAR.RUV <- function(seuList, distList, verbose = FALSE, species = "human", custom_housekeep = NULL) {
    # Check if 'seuList' is a list of Seurat objects
    if (!is.list(seuList) || !all(sapply(seuList, inherits, "Seurat"))) {
        stop("Input 'seuList' must be a list of Seurat objects.")
    }

    # Check if 'distList' is a list of matrices
    if (!is.list(distList) || !all(sapply(distList, is.matrix))) {
        stop("Input 'distList' must be a list of distance matrices.")
    }

    # Check if 'species' is valid
    species <- tolower(species)
    if (!species %in% c("human", "mouse")) {
        stop("'species' must be either 'human' or 'mouse'.")
    }

    # Check if 'custom_housekeep' is a character vector if provided
    if (!is.null(custom_housekeep) && !is.character(custom_housekeep)) {
        stop("'custom_housekeep' must be a character vector or NULL.")
    }

    # Extract the default assay data for each Seurat object
    defAssay_vec <- sapply(seuList, Seurat::DefaultAssay)
    n_r <- length(seuList)

    # Extract and transpose the assay data for each Seurat object
    XList <- lapply(
        seq_along(defAssay_vec), function(r) {
            Matrix::t(Seurat::GetAssayData(seuList[[r]], assay = defAssay_vec[r], slot = "data"))
        }
    )

    # Convert gene names based on species
    if (species == "mouse") {
        for (r in 1:n_r) {
            colnames(XList[[r]]) <- firstup(colnames(XList[[r]]))
        }
        if (!is.null(custom_housekeep)) {
            custom_housekeep <- firstup(custom_housekeep)
        }
    } else if (species == "human") {
        for (r in 1:n_r) {
            colnames(XList[[r]]) <- toupper(colnames(XList[[r]]))
        }
        if (!is.null(custom_housekeep)) {
            custom_housekeep <- toupper(custom_housekeep)
        }
    }

    # Handle covariates (if provided)
    covariates_use <- NULL
    if (!is.null(covariates_use)) {
        covariateList <- lapply(seuList, function(x) x@meta.data[covariates_use])
    } else {
        covariateList <- NULL
    }

    # Check for duplicate barcodes
    barcodes_all <- lapply(XList, row.names)
    if (any(duplicated(unlist(barcodes_all)))) {
        for (r in 1:n_r) {
            row.names(XList[[r]]) <- paste0(row.names(XList[[r]]), r)
        }
    }

    # Determine housekeeping genes based on species
    genelist <- colnames(XList[[1]])
    houseKeep <- switch(species,
        human = intersect(toupper(genelist), CAESAR.Suite::Human_HK_genes$Gene),
        mouse = intersect(firstup(genelist), CAESAR.Suite::Mouse_HK_genes$Gene),
        character()
    )

    # Merge with custom housekeeping genes
    houseKeep <- c(houseKeep, custom_housekeep)
    houseKeep <- intersect(houseKeep, colnames(XList[[1]]))
    houseKeep <- intersect(houseKeep, Seurat::VariableFeatures(seuList[[1]]))

    # Handle large datasets with subsampling
    nvec <- sapply(XList, nrow)
    subsample_rate <- 1
    if (sum(nvec) > 8e4) {
        subsample_rate <- 5e4 / sum(nvec)
        if (verbose) {
            message("Subsampling schema will be used to speed up computation.")
        }
    }

    sample_seed <- 1

    # Perform batch correction using housekeeping genes
    if (length(houseKeep) < 5) {
        if (verbose) {
            message("Using CAESAR results for batch correction due to insufficient housekeeping genes.")
        }
        # hX <- get_correct_mean_exp(...)  # Replace with actual batch correction code
    } else {
        if (verbose) {
            message("Using both housekeeping genes and CAESAR results for batch correction.")
        }
        hX <- get_correct_exp(XList, distList,
            houseKeep = houseKeep, q_unwanted = min(10, length(houseKeep)),
            covariateList = covariateList, subsample_rate = subsample_rate, sample_seed = sample_seed
        )
    }

    # Create a new Seurat object with corrected data
    meta_data <- data.frame(batch = factor(get_sampleID(XList)))
    row.names(meta_data) <- row.names(hX)
    count <- Matrix::sparseMatrix(i = 1, j = 1, x = 0, dims = dim(t(hX)))
    row.names(count) <- colnames(hX)
    colnames(count) <- row.names(hX)

    seuInt <- Seurat::CreateSeuratObject(counts = count, assay = "CAESAR", meta.data = meta_data)

    # Add corrected data to the Seurat object
    if (inherits(seuInt[["CAESAR"]], "Assay5")) {
        seuInt <- Seurat::SetAssayData(object = seuInt, slot = "data", assay = "CAESAR", new.data = t(hX))
    } else {
        seuInt[["CAESAR"]]@data <- t(hX)
    }

    return(seuInt)
}

Any scripts or data that you put into this service are public.

CAESAR.Suite documentation built on April 3, 2025, 10:32 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

CAESAR.Suite
CAESAR: a Cross-Technology and Cross-Resolution Framework for Spatial Omics Annotation

R/ruv.R
In CAESAR.Suite: CAESAR: a Cross-Technology and Cross-Resolution Framework for Spatial Omics Annotation

Defines functions CAESAR.RUV get_sampleID get_correct_exp dfList2df vec2list matlist2mat mat2list get_indexList

Documented in CAESAR.RUV

Try the CAESAR.Suite package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CAESAR.Suite CAESAR: a Cross-Technology and Cross-Resolution Framework for Spatial Omics Annotation

R/ruv.R In CAESAR.Suite: CAESAR: a Cross-Technology and Cross-Resolution Framework for Spatial Omics Annotation

Defines functions CAESAR.RUV get_sampleID get_correct_exp dfList2df vec2list matlist2mat mat2list get_indexList

Documented in CAESAR.RUV

Try the CAESAR.Suite package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CAESAR.Suite
CAESAR: a Cross-Technology and Cross-Resolution Framework for Spatial Omics Annotation

R/ruv.R
In CAESAR.Suite: CAESAR: a Cross-Technology and Cross-Resolution Framework for Spatial Omics Annotation