Nothing
#################################### qc ########################################
#' General QC for liger object
#' @description Calculate number of UMIs, number of detected features and
#' percentage of feature subset (e.g. mito, ribo and hemo) expression per cell.
#' @details
#' This function by default calculates:
#'
#' \itemize{
#' \item{\code{nUMI} - The column sum of the raw data matrix per cell.
#' Represents the total number of UMIs per cell if given raw counts.}
#' \item{\code{nGene} - Number of detected features per cell}
#' \item{\code{mito} - Percentage of mitochondrial gene expression per cell}
#' \item{\code{ribo} - Percentage of ribosomal gene expression per cell}
#' \item{\code{hemo} - Percentage of hemoglobin gene expression per cell}
#' }
#'
#' Users can also specify their own feature subsets with argument
#' \code{features}, or regular expression patterns that match to genes of
#' interests with argument \code{pattern}, to calculate the expression
#' percentage. If a character vector is given to \code{features}, a QC metric
#' variable named \code{"featureSubset_name"} will be computed. If a named list
#' of multiple subsets is given, the names will be used as the variable names.
#' If a single pattern is given to \code{pattern}, a QC metric variable named
#' \code{"featureSubset_pattern"} will be computed. If a named list of multiple
#' patterns is given, the names will be used as the variable names.
#' \bold{Duplicated QC metric names between these two arguments and the default
#' five listed above should be avoided.}
#'
#' This function is automatically operated at the creation time of each
#' \linkS4class{liger} object to capture the raw status. Argument
#' \code{overwrite} is set to FALSE by default to avoid mistakenly updating
#' existing metrics after filtering the object. Users can still opt to update
#' all newly calculated metrics (including the default five) by setting
#' \code{overwrite = TRUE}, or only some of newly calculated ones by providing
#' a character vector of the names of the metrics to update. Intended
#' overwriting only happens to datasets selected with \code{useDatasets}.
#'
#' @param object \linkS4class{liger} object with \code{rawData} available in
#' each \linkS4class{ligerDataset} embedded
#' @param organism Specify the organism of the dataset to identify the
#' mitochondrial, ribosomal and hemoglobin genes. Available options are
#' \code{"mouse"}, \code{"human"}, \code{"zebrafish"}, \code{"rat"} and
#' \code{"drosophila"}. Set \code{NULL} to disable mito, ribo and hemo
#' calculation.
#' @param features Feature names matching the feature subsets that users want to
#' calculate the expression percentage with. A vector for a single subset, or a
#' named list for multiple subset. Default \code{NULL}.
#' @param pattern Regex patterns for matching the feature subsets that users
#' want to calculate the expression percentage with. A vector for a single
#' subset, or a named list for multiple subset. Default \code{NULL}.
#' @param overwrite Whether to overwrite existing QC metric variables. Default
#' \code{FALSE} do not update existing result. Use \code{TRUE} for updating all.
#' Use a character vector to specify which to update. See Details.
#' @param useDatasets A character vector of the names, a numeric or logical
#' vector of the index of the datasets to be included for QC. Default
#' \code{NULL} performs QC on all datasets.
#' @param chunkSize Integer number of cells to include in a chunk when working
#' on HDF5 based dataset. Default \code{1000}
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @param mito,ribo,hemo `r lifecycle::badge("deprecated")` Now will always
#' compute the percentages of mitochondrial, ribosomal and hemoglobin gene
#' counts. These arguments will be ignored.
#' @return Updated \code{object} with the \code{cellMeta(object)} updated as
#' intended by users. See Details for more information.
#' @export
#' @examples
#' pbmc <- runGeneralQC(pbmc, "human", overwrite = TRUE)
runGeneralQC <- function(
object,
organism,
features = NULL,
pattern = NULL,
overwrite = FALSE,
useDatasets = NULL,
chunkSize = 1000,
verbose = getOption("ligerVerbose", TRUE),
mito = NULL,
ribo = NULL,
hemo = NULL
) {
.checkObjVersion(object)
if (!missing(mito) || !missing(ribo) || !missing(hemo)) {
cli::cli_alert_info("Arguments {.code mito}, {.code ribo}, {.code hemo} are deprecated and now we always compute the percentage.")
}
if (!is.null(organism)) {
organism <- match.arg(organism, choices = c("mouse", "human", "zebrafish", "rat", "drosophila"))
}
useDatasets <- .checkUseDatasets(object, useDatasets)
# Process the the two arguments all into one named list of feature names
# before exactly calculate the percentage
featureSubsets <- list()
allFeatures <- unique(
unlist(
lapply(datasets(object)[useDatasets], rownames),
use.names = FALSE
)
)
# Idea from package scCustomize, with more curation and added hemoglobins
# Removed marmoset though because couldn't verify against online database
# Accepted species names
if (!is.null(organism)) {
organism <- .checkArgLen(organism, n = 1, class = "character")
mitoPattern <- switch(
EXPR = organism,
mouse = "^mt-",
human = "^MT-",
zebrafish = "^mt-",
rat = "^[Mm]t-",
drosophila = "^mt:",
)
if (startsWith(mitoPattern, "^")) {
featureSubsets$mito <- grep(mitoPattern, allFeatures, value = TRUE)
} else {
featureSubsets$mito <- intersect(mitoPattern, allFeatures)
}
if (length(featureSubsets$mito) == 0) {
cli::cli_alert_warning("No {organism} mitochondrial gene found in the union of dataset {.val {useDatasets}}")
}
riboPattern <- switch(
EXPR = organism,
mouse = "^Rp[sl]",
human = "^RP[SL]",
zebrafish = "^rp[sl]",
rat = "^Rp[sl]",
drosophila = "^Rp[SL]"
)
featureSubsets$ribo <- grep(riboPattern, allFeatures, value = TRUE)
if (length(featureSubsets$ribo) == 0) {
cli::cli_alert_warning("No {organism} ribosomal gene found in the union of dataset {.val {names(object)[useDatasets]}}")
}
hemoPattern <- switch(
EXPR = organism,
mouse = "^Hb(?!egf|s1l|p1)",
human = "^HB(?!EGF|S1L|P1)",
zebrafish = "^hb(?!egf|s1l|p1)",
rat = "^Hb(?!egf|s1l|p1)",
drosophila = NA
)
if (!is.na(hemoPattern)) {
featureSubsets$hemo <- grep(hemoPattern, allFeatures, value = TRUE, perl = TRUE)
}
if ("hemo" %in% names(featureSubsets) && length(featureSubsets$hemo) == 0) {
cli::cli_alert_warning("No {organism} hemoglobin gene found in the union of dataset {.val {names(object)[useDatasets]}}")
}
}
# Then process the user specified gene sets
if (!is.null(features)) {
if (is.list(features)) {
featureSubsets <- c(featureSubsets, features)
} else if (is.vector(features)) {
featureSubsets[["featureSubset_name"]] <- features
}
}
if (!is.null(pattern)) {
if (is.list(pattern)) {
pattern <- lapply(pattern, function(x) {
grep(x, allFeatures, value = TRUE)
})
featureSubsets <- c(featureSubsets, pattern)
} else if (is.vector(pattern)) {
pattern <- grep(pattern, allFeatures, value = TRUE)
featureSubsets[["featureSubset_pattern"]] <- pattern
}
}
# Start calculation on each dataset
newResultNames <- c("nUMI", "nGene", names(featureSubsets))
# Overwriting logic:
# By default - FALSE, no overwrite
# If TRUE, overwrite all that exist
# If given specific column names, only overwrite given ones but do not touch
# other existing ones
if (isTRUE(overwrite)) {
# Just use newResultNames
} else if (isFALSE(overwrite)) {
newResultNames <- setdiff(newResultNames, colnames(object@cellMeta))
} else if (is.character(overwrite)) {
newResultNames <- newResultNames[newResultNames %in% overwrite | !newResultNames %in% colnames(object@cellMeta)]
}
for (d in useDatasets) {
ld <- dataset(object, d)
if (isTRUE(verbose))
cliID <- cli::cli_process_start("calculating QC for dataset {.val {d}}")
if (isH5Liger(ld))
results <- runGeneralQC.h5(
ld,
featureSubsets = featureSubsets,
chunkSize = chunkSize,
verbose = verbose
)
else
results <- runGeneralQC.Matrix(
ld,
featureSubsets = featureSubsets,
verbose = verbose
)
resultCell <- results$cell[, newResultNames, drop = FALSE]
if (ncol(resultCell) > 0) {
object@cellMeta[object$dataset == d, newResultNames] <- resultCell
cli::cli_alert_info("Updated QC variables: {.val {newResultNames}}")
} else {
cli::cli_alert_info("No QC variable updated.")
}
featureMeta(ld, check = FALSE)$nCell <- results$feature
datasets(object, check = FALSE)[[d]] <- ld
if (isTRUE(verbose)) cli::cli_process_done(id = cliID)
}
return(object)
}
# runGeneralQCOld <- function(
# object,
# mito = TRUE,
# ribo = TRUE,
# hemo = TRUE,
# features = NULL,
# pattern = NULL,
# useDatasets = NULL,
# chunkSize = 1000,
# verbose = getOption("ligerVerbose", TRUE)
# ) {
# .checkObjVersion(object)
# useDatasets <- .checkUseDatasets(object, useDatasets)
# # Process the the two arguments all into one named list of feature names
# # before exactly calculate the percentage
# featureSubsets <- list()
# allFeatures <- unique(unlist(lapply(datasets(object), rownames),
# use.names = FALSE))
#
# # Work on the presets
# if (isTRUE(mito))
# featureSubsets$mito <- grep("^MT-", allFeatures, value = TRUE)
# if (isTRUE(ribo))
# featureSubsets$ribo <- grep("^RP[SL]", allFeatures, value = TRUE)
# if (isTRUE(hemo))
# featureSubsets$hemo <- grep("^HB[^(P)]", allFeatures, value = TRUE)
#
# # Then process the user specified gene sets
# if (!is.null(features)) {
# if (is.list(features)) {
# featureSubsets <- c(featureSubsets, features)
# } else if (is.vector(features)) {
# featureSubsets[["featureSubset_name"]] <- features
# }
# }
# if (!is.null(pattern)) {
# if (is.list(pattern)) {
# pattern <- lapply(pattern, function(x) {
# grep(x, allFeatures, value = TRUE)
# })
# featureSubsets <- c(featureSubsets, pattern)
# } else if (is.vector(pattern)) {
# pattern <- grep(pattern, allFeatures, value = TRUE)
# featureSubsets[["featureSubset_pattern"]] <- pattern
# }
# }
#
# # Start calculation on each dataset
# newResultNames <- c("nUMI", "nGene", names(featureSubsets))
#
# for (d in useDatasets) {
# ld <- dataset(object, d)
# if (isTRUE(verbose))
# cliID <- cli::cli_process_start("calculating QC for dataset {.val {d}}")
# if (isH5Liger(ld))
# results <- runGeneralQC.h5(
# ld,
# featureSubsets = featureSubsets,
# chunkSize = chunkSize,
# verbose = verbose
# )
# else
# results <- runGeneralQC.Matrix(
# ld,
# featureSubsets = featureSubsets,
# verbose = verbose
# )
# object@cellMeta[object$dataset == d, newResultNames] <- results$cell
# featureMeta(ld, check = FALSE)$nCell <- results$feature
# datasets(object, check = FALSE)[[d]] <- ld
# if (isTRUE(verbose)) cli::cli_process_done(id = cliID)
# }
#
# return(object)
# }
#' Calculate general QC on H5 based ligerDataset object
#' @param object ligerDataset object
#' @param featureSubsets Named list passed from \code{runGeneralQC}
#' @param chunkSize Integer
#' @return data.frame
#' @noRd
runGeneralQC.h5 <- function(
object,
featureSubsets = NULL,
chunkSize = 1000,
verbose = getOption("ligerVerbose", TRUE)) {
allFeatures <- rownames(object)
# Initialize results
cell <- data.frame(row.names = colnames(object))
cell$nUMI <- 0
cell$nGene <- 0
for (i in names(featureSubsets)) {
cell[[i]] <- 0
}
nCell <- rep(0, nrow(object))
rowIndices <- lapply(featureSubsets, function(x) allFeatures %in% x)
# Calculate in only one iteration
H5Apply(
object,
init = list(cell = cell, feature = nCell),
useData = "rawData",
chunkSize = chunkSize,
verbose = verbose,
FUN = function(chunk, sparseXIdx, cellIdx, values) {
nUMI <- colSums(chunk)
values$cell$nUMI[cellIdx] <- nUMI
nonzero <- methods::as(chunk, "lMatrix")
values$cell$nGene[cellIdx] <- colSums(nonzero)
for (fs in names(rowIndices)) {
values$cell[[fs]][cellIdx] <-
colSums(chunk[rowIndices[[fs]], , drop = FALSE]) / nUMI *
100
}
values$feature <- values$feature + Matrix::rowSums(nonzero)
return(values)
}
)
}
#' Calculate general QC on in memory matrix based ligerDataset object
#' @param object ligerDataset object
#' @param featureSubsets Named list passed from \code{runGeneralQC}
#' @return data.frame
#' @noRd
runGeneralQC.Matrix <- function(
object,
featureSubsets = NULL,
verbose = getOption("ligerVerbose", TRUE)) {
nUMI <- Matrix::colSums(rawData(object))
# Instead of using `nonzero <- rawData > 0` which generates dense logical
# matrix, keep it sparse with 1 for TRUE
# nonzero <- rawData(object)
# nonzero@x <- rep(1, length(nonzero@x))
nonzero <- methods::as(rawData(object), "lMatrix")
nGene <- Matrix::colSums(nonzero)
nCell <- Matrix::rowSums(nonzero)
results <- data.frame(nUMI = nUMI, nGene = nGene,
row.names = colnames(object))
if (length(featureSubsets) > 0) {
percentages <- lapply(featureSubsets, function(x) {
rowIdx <- rownames(object) %in% x
if (sum(rowIdx) == 0) {
return(rep(0, ncol(object)))
} else {
return(colSums(rawData(object)[rowIdx, , drop = FALSE]) /
colSums(rawData(object)) * 100)
}
})
results <- cbind(results, as.data.frame(percentages))
}
list(cell = results, feature = nCell)
}
#' Calculate proportion mitochondrial contribution
#' @description
#' Calculates proportion of mitochondrial contribution based on raw or
#' normalized data.
#' @param object \code{liger} object.
#' @param use.norm \bold{Deprecated} Whether to use cell normalized data in
#' calculating contribution. Default \code{FALSE}.
#' @param pattern Regex pattern for identifying mitochondrial genes. Default
#' \code{"^mt-"} for mouse.
#' @return Named vector containing proportion of mitochondrial contribution for
#' each cell.
#' @export
#' @note
#' \code{getProportionMito} will be deprecated because
#' \code{\link{runGeneralQC}} generally covers and expands its use case.
#' @examples
#' # Example dataset does not contain MT genes, expected to see a message
#' pbmc$mito <- getProportionMito(pbmc)
getProportionMito <- function(object, use.norm = FALSE, pattern = "^mt-") {
lifecycle::deprecate_warn("1.99.0", "getProportionMito()",
"runGeneralQC()")
result <- numeric()
for (d in names(object)) {
ld <- dataset(object, d)
mitoGeneIdx <- grep(pattern, rownames(ld))
if (isTRUE(use.norm)) {
pctMT <- colSums(normData(ld)[mitoGeneIdx, , drop = FALSE]) /
colSums(normData(ld))
} else {
pctMT <- colSums(rawData(ld)[mitoGeneIdx, , drop = FALSE]) /
colSums(rawData(ld))
}
names(pctMT) <- colnames(ld)
result <- c(result, pctMT)
}
if (all(result == 0)) {
cli::cli_alert_warning("Zero proportion detected in all cells")
}
return(result)
}
#' Remove missing cells or features from liger object
#' @param object \linkS4class{liger} object
#' @param orient Choose to remove non-expressing features (\code{"feature"}),
#' empty barcodes (\code{"cell"}), or both of them (\code{"both"}). Default
#' \code{"both"}.
#' @param minCells Keep features that are expressed in at least this number of
#' cells, calculated on a per-dataset base. A single value for all datasets or
#' a vector for each dataset. Default \code{NULL} only removes none expressing
#' features.
#' @param minFeatures Keep cells that express at least this number of features,
#' calculated on a per-dataset base. A single value for all datasets or a vector
#' for each dataset. Default \code{NULL} only removes none expressing cells.
#' @param useDatasets A character vector of the names, a numeric or logical
#' vector of the index of the datasets to be processed. Default
#' \code{NULL} removes empty entries from all datasets.
#' @param newH5 Logical, whether to create a new H5 file on disk for each
#' H5-based dataset on subset. Default \code{TRUE}
#' @param filenameSuffix When subsetting H5-based datasets to new H5 files, this
#' suffix will be added to all the filenames. Default \code{"removeMissing"}.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @param ... Arguments passed to \code{\link{subsetLigerDataset}}
#' @return Updated (subset) \code{object}.
#' @export
#' @rdname removeMissing
#' @examples
#' # The example dataset does not contain non-expressing genes or empty barcodes
#' pbmc <- removeMissing(pbmc)
removeMissing <- function(
object,
orient = c("both", "feature", "cell"),
minCells = NULL,
minFeatures = NULL,
useDatasets = NULL,
newH5 = TRUE,
filenameSuffix = "removeMissing",
verbose = getOption("ligerVerbose", TRUE),
...
) {
orient <- match.arg(orient)
useDatasets <- .checkUseDatasets(object, useDatasets)
minCells <- minCells %||% rep(0, length(useDatasets))
minCells <- .checkArgLen(minCells, length(useDatasets), class = "numeric")
names(minCells) <- useDatasets
minFeatures <- minFeatures %||% rep(0, length(useDatasets))
minFeatures <- .checkArgLen(minFeatures, length(useDatasets), class = "numeric")
names(minFeatures) <- useDatasets
rmFeature <- ifelse(orient %in% c("both", "feature"), TRUE, FALSE)
rmCell <- ifelse(orient %in% c("both", "cell"), TRUE, FALSE)
datasets.new <- list()
subsetted <- c()
for (d in useDatasets) {
ld <- dataset(object, d)
if (rmFeature) {
featureIdx <- which(featureMeta(ld)$nCell > minCells[d])
} else {
featureIdx <- seq_len(nrow(ld))
}
rmFeatureDataset <- length(featureIdx) != nrow(ld)
if (rmCell) {
cellIdx <- object$dataset == d & object$nGene > minFeatures[d]
cellIdx <- colnames(object)[cellIdx]
cellIdx <- which(colnames(ld) %in% cellIdx)
} else {
cellIdx <- seq_len(ncol(ld))
}
rmCellDataset <- length(cellIdx) != ncol(ld)
subsetted <- c(subsetted, any(c(rmFeatureDataset, rmCellDataset)))
if (any(c(rmFeatureDataset, rmCellDataset))) {
if (isTRUE(verbose)) {
cli::cli_alert_info("Removing missing in dataset: {.val {d}}")
}
datasets.new[[d]] <- subsetLigerDataset(
ld,
featureIdx = featureIdx,
cellIdx = cellIdx,
filenameSuffix = filenameSuffix,
verbose = verbose,
newH5 = newH5,
...
)
} else {
datasets.new[[d]] <- ld
}
}
if (any(subsetted)) {
allCells <- unlist(lapply(datasets.new, colnames), use.names = FALSE)
object <- methods::new(
"liger",
datasets = datasets.new,
cellMeta = cellMeta(object, cellIdx = allCells,
drop = FALSE),
varFeatures = character(),
H.norm = object@H.norm[allCells, , drop = FALSE]
)
}
return(object)
}
#' @rdname removeMissing
#' @export
#' @param slot.use \bold{Deprecated}. Always look at \code{rawData} slot of
#' inner \linkS4class{ligerDataset} objects.
#' @param use.cols \bold{Deprecated}. Previously means "treating each column as
#' a cell" when \code{TRUE}, now means \code{orient="cell"}.
#' @note
#' \code{removeMissingObs} will be deprecated. \code{removeMissing} covers and
#' expands the use case and should be easier to understand.
removeMissingObs <- function(
object,
slot.use = NULL,
use.cols = TRUE,
verbose = getOption("ligerVerbose", TRUE)) {
lifecycle::deprecate_warn("1.99.0", "removeMissingObs()",
"removeMissing()")
if (!missing(slot.use)) {
cli::cli_alert_warning("Argument {.code slot.use} is deprecated and ignored.")
}
orient <- ifelse(isTRUE(use.cols), "cell", "gene")
object <- removeMissing(object, orient, verbose = verbose)
return(object)
}
################################ Normalize #####################################
#' Normalize raw counts data
#' @description Perform library size normalization on raw counts input. As for
#' the preprocessing step of iNMF integration, by default we don't multiply the
#' normalized values with a scale factor, nor do we take the log transformation.
#' Applicable S3 methods can be found in Usage section.
#'
#' \code{normalizePeak} is designed for datasets of "atac" modality, i.e. stored
#' in \linkS4class{ligerATACDataset}. S3 method for various container object is
#' not supported yet due to difference in architecture design.
#' @param object \linkS4class{liger} object
#' @param ... Arguments to be passed to S3 methods. The "liger" method calls
#' the "ligerDataset" method, which then calls "dgCMatrix" method.
#' \code{normalizePeak} directly calls \code{normalize.dgCMatrix}.
#' @return Updated \code{object}.
#' \itemize{
#' \item{dgCMatrix method - Returns processed dgCMatrix object}
#' \item{ligerDataset method - Updates the \code{normData} slot of the object}
#' \item{liger method - Updates the \code{normData} slot of chosen datasets}
#' \item{Seurat method - Adds a named layer in chosen assay (V5), or update the
#' \code{data} slot of the chosen assay (<=V4)}
#' \item{\code{normalizePeak} - Updates the \code{normPeak} slot of chosen
#' datasets.}
#' }
#' @rdname normalize
#' @export
#' @examples
#' pbmc <- normalize(pbmc)
normalize <- function(object, ...) {
UseMethod("normalize", object)
}
#' @rdname normalize
#' @export
#' @method normalize matrix
normalize.matrix <- function(
object,
log = FALSE,
scaleFactor = NULL,
...
) {
scaleFactor <- .checkArgLen(scaleFactor, ncol(object), repN = TRUE, class = "numeric")
if (!is.null(scaleFactor) && any(scaleFactor <= 0)) {
cli::cli_alert_danger("Invalid {.code scaleFactor} given. Setting to {.code NULL}.")
scaleFactor <- NULL
}
normed <- normalize_byCol_dense_rcpp(object)
if (!is.null(scaleFactor)) normed <- normed * scaleFactor
if (isTRUE(log)) normed <- log1p(normed)
return(normed)
}
#' @rdname normalize
#' @export
#' @method normalize dgCMatrix
#' @param log Logical. Whether to do a \code{log(x + 1)} transform on the
#' normalized data. Default \code{TRUE}.
#' @param scaleFactor Numeric. Scale the normalized expression value by this
#' factor before transformation. \code{NULL} for not scaling. Default
#' \code{1e4}.
normalize.dgCMatrix <- function(
object,
log = FALSE,
scaleFactor = NULL,
...
) {
scaleFactor <- .checkArgLen(scaleFactor, ncol(object), repN = TRUE, class = "numeric")
if (!is.null(scaleFactor) && any(scaleFactor <= 0)) {
cli::cli_alert_danger("Invalid {.code scaleFactor} given. Setting to {.code NULL}.")
scaleFactor <- NULL
}
normed <- object
normed@x <- object@x / rep.int(Matrix::colSums(object), diff(object@p))
if (!is.null(scaleFactor)) normed <- normed * scaleFactor
if (isTRUE(log)) normed <- log1p(normed)
return(normed)
}
#' @rdname normalize
#' @export
#' @param chunk Integer. Number of maximum number of cells in each chunk when
#' working on HDF5 file based ligerDataset. Default \code{1000}.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @method normalize ligerDataset
normalize.ligerDataset <- function(
object,
chunk = 1000,
verbose = getOption("ligerVerbose", TRUE),
...
) {
if (!isH5Liger(object)) {
normData(object) <- normalize(rawData(object), ...)
} else {
# Initialize result
results <- list(
geneSumSq = rep(0, nrow(object)),
geneMeans = rep(0, nrow(object))
)
h5file <- getH5File(object)
resultH5Path <- "normData"
# Use safe create here in practice
## This creates the CSC sparse non-zero value array
safeH5Create(object = object, dataPath = resultH5Path,
dims = rawData(object)$dims, dtype = "double",
chunkSize = rawData(object)$chunk_dims)
# Chunk run
results <- H5Apply(
object,
function(chunk, sparseXIdx, cellIdx, values) {
normChunk <- normalize(chunk)
h5file[[resultH5Path]][sparseXIdx] <- normChunk@x
row_sums <- rowSums(normChunk)
values$geneSumSq <- values$geneSumSq +
rowSums(normChunk * normChunk)
values$geneMeans <- values$geneMeans + row_sums
return(values)
},
init = results, useData = "rawData",
chunkSize = chunk, verbose = verbose
)
results$geneMeans <- results$geneMeans / ncol(object)
featureMeta(object, check = FALSE)$geneMeans <- results$geneMeans
featureMeta(object, check = FALSE)$geneSumSq <- results$geneSumSq
normData(object, check = FALSE) <- h5file[[resultH5Path]]
h5fileInfo(object, "normData", check = FALSE) <- resultH5Path
}
return(object)
}
#' @rdname normalize
#' @export
#' @param useDatasets A character vector of the names, a numeric or logical
#' vector of the index of the datasets to be normalized. Should specify ATACseq
#' datasets when using \code{normalizePeak}. Default \code{NULL} normalizes all
#' valid datasets.
#' @param format.type,remove.missing \bold{Deprecated}. The functionality of
#' these is covered through other parts of the whole workflow and is no long
#' needed. Will be ignored if specified.
#' @method normalize liger
normalize.liger <- function(
object,
useDatasets = NULL,
verbose = getOption("ligerVerbose", TRUE),
format.type = NULL,
remove.missing = NULL,
...
) {
.deprecateArgs(defunct = c("format.type", "remove.missing"))
.checkObjVersion(object)
useDatasets <- .checkUseDatasets(object, useDatasets)
object <- recordCommand(object, ..., dependencies = "hdf5r")
for (d in useDatasets) {
if (isTRUE(verbose)) cliID <- cli::cli_process_start("Normalizing datasets {.val {d}}")
# `d` is the name of each dataset
ld <- dataset(object, d)
ld <- normalize(ld, verbose = verbose, ...)
datasets(object, check = FALSE)[[d]] <- ld
}
if (isTRUE(verbose)) cli::cli_process_done(id = cliID)
object
}
#' @rdname normalize
#' @export
#' @param assay Name of assay to use. Default \code{NULL} uses current active
#' assay.
#' @param save For Seurat>=4.9.9, the name of layer to store normalized data.
#' Default \code{"ligerNormData"}. For older Seurat, stored to \code{data} slot.
#' @param layer Where the input raw counts should be from. Default
#' \code{"counts"}. For older Seurat, always retrieve from \code{counts} slot.
#' @method normalize Seurat
normalize.Seurat <- function(
object,
assay = NULL,
layer = "counts",
save = "ligerNormData",
...
) {
raw <- .getSeuratData(object, layer = layer, slot = "counts",
assay = assay)
if (!is.list(raw)) normed <- normalize(raw, ...)
else normed <- lapply(raw, normalize, ...)
object <- .setSeuratData(object, layer = layer, save = save, slot = "data",
value = normed, assay = assay)
return(object)
}
#' @rdname normalize
#' @export
normalizePeak <- function(
object,
useDatasets = NULL,
verbose = getOption("ligerVerbose", TRUE),
...
) {
useDatasets <- .checkUseDatasets(object, useDatasets, modal = "atac")
object <- recordCommand(object, ..., dependencies = "hdf5r")
for (d in useDatasets) {
if (isTRUE(verbose)) cliID <- cli::cli_process_start("Normalizing peak of dataset: {.val {d}}")
# `d` is the name of each dataset
ld <- dataset(object, d)
normPeak(ld, check = FALSE) <- normalize(rawPeak(ld), ...)
datasets(object, check = FALSE)[[d]] <- ld
if (isTRUE(verbose)) cli::cli_process_done(id = cliID)
}
object
}
############################### Select Genes ###################################
#' Select a subset of informative genes
#' @description This function identifies highly variable genes from each dataset
#' and combines these gene sets (either by union or intersection) for use in
#' downstream analysis. Assuming that gene expression approximately follows a
#' Poisson distribution, this function identifies genes with gene expression
#' variance above a given variance threshold (relative to mean gene expression).
#' Alternatively, we allow selecting a desired number of genes for each dataset
#' by ranking the relative variance, and then take the combination.
#' @export
#' @rdname selectGenes
#' @param object A \linkS4class{liger}, \linkS4class{ligerDataset} or
#' \code{Seurat} object, with normalized data available (no scale factor
#' multipled nor log transformed).
#' @param thresh Variance threshold used to identify variable genes. Higher
#' threshold results in fewer selected genes. Liger and Seurat S3 methods accept
#' a single value or a vector with specific threshold for each dataset in
#' \code{useDatasets}.* Default \code{0.1}.
#' @param nGenes Number of genes to find for each dataset. By setting this,
#' we optimize the threshold used for each dataset so that we get \code{nGenes}
#' selected features for each dataset. Accepts single value or a vector for
#' dataset specific setting matching \code{useDataset}.* Default \code{NULL}
#' does not optimize.
#' @param alpha Alpha threshold. Controls upper bound for expected mean gene
#' expression. Lower threshold means higher upper bound. Default \code{0.99}.
#' @param combine How to combine variable genes selected from all datasets.
#' Choose from \code{"union"} or \code{"intersection"}. Default \code{"union"}.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @param ... Arguments passed to other methods.
#' @return Updated object
#' \itemize{
#' \item{liger method - Each involved dataset stored in
#' \linkS4class{ligerDataset} is updated with its \code{\link{featureMeta}}
#' slot and \code{varUnsharedFeatures} slot (if requested with
#' \code{useUnsharedDatasets}), while \code{\link{varFeatures}(object)} will be
#' updated with the final combined gene set.}
#' \item{Seurat method - Final selection will be updated at
#' \code{Seurat::VariableFeatures(object)}. Per-dataset information is
#' stored in the \code{meta.features} slot of the chosen Assay.}
#' }
#' @examples
#' pbmc <- normalize(pbmc)
#' # Select basing on thresholding the relative variance
#' pbmc <- selectGenes(pbmc, thresh = .1)
#' # Select specified number for each dataset
#' pbmc <- selectGenes(pbmc, nGenes = c(60, 60))
selectGenes <- function(
object,
thresh = .1,
nGenes = NULL,
alpha = .99,
...
) {
UseMethod("selectGenes", object)
}
#' @export
#' @rdname selectGenes
#' @method selectGenes liger
#' @param useDatasets A character vector of the names, a numeric or logical
#' vector of the index of the datasets to use for shared variable feature
#' selection. Default \code{NULL} uses all datasets.
#' @param useUnsharedDatasets A character vector of the names, a numeric or
#' logical vector of the index of the datasets to use for finding unshared
#' variable features. Default \code{NULL} does not attempt to find unshared
#' features.
#' @param unsharedThresh The same thing as \code{thresh} that is applied to test
#' unshared features. A single value for all datasets in
#' \code{useUnsharedDatasets} or a vector for dataset-specific setting.* Default
#' \code{0.1}.
#' @param chunk Integer. Number of maximum number of cells in each chunk, when
#' gene selection is applied to any HDF5 based dataset. Default \code{1000}.
#' @param var.thresh,alpha.thresh,num.genes,datasets.use,unshared.datasets,unshared.thresh \bold{Deprecated}.
#' These arguments are renamed and will be removed in the future. Please see
#' function usage for replacement.
#' @param tol,do.plot,cex.use,unshared \bold{Deprecated}. Gene variability
#' metric is now visualized with separated function
#' \code{\link{plotVarFeatures}}. Users can now set none-NULL
#' \code{useUnsharedDatasets} to select unshared genes, instead of having to
#' switch \code{unshared} on.
selectGenes.liger <- function(
object,
thresh = .1,
nGenes = NULL,
alpha = .99,
useDatasets = NULL,
useUnsharedDatasets = NULL,
unsharedThresh = .1,
combine = c("union", "intersection"),
chunk = 1000,
verbose = getOption("ligerVerbose", TRUE),
var.thresh = thresh,
alpha.thresh = alpha,
num.genes = nGenes,
datasets.use = useDatasets,
unshared.datasets = useUnsharedDatasets,
unshared.thresh = unsharedThresh,
tol = NULL,
do.plot = NULL,
cex.use = NULL,
unshared = NULL,
...
) {
combine <- match.arg(combine)
.deprecateArgs(replace = list(var.thresh = "thresh",
alpha.thresh = "alpha",
num.genes = "nGenes",
datasets.use = "useDatasets",
unshared.datasets = "useUnsharedDatasets",
unshared.thresh = "unsharedThresh"),
defunct = c("tol", "do.plot", "cex.use"))
object <- recordCommand(object, ...)
datasetShared <- .checkUseDatasets(object, useDatasets)
if (!is.null(useUnsharedDatasets))
datasetUnshared <- .checkUseDatasets(object, useUnsharedDatasets)
else datasetUnshared <- NULL
useDatasets <- union(datasetShared, datasetUnshared)
thresh <- .checkArgLen(thresh, length(datasetShared), class = "numeric")
nGenes <- .checkArgLen(nGenes, length(datasetShared), class = "numeric")
unsharedThresh <- .checkArgLen(unsharedThresh, length(datasetUnshared), class = "numeric")
sharedFeature <- Reduce(intersect, lapply(datasets(object), rownames))
selectList <- list()
for (d in useDatasets) {
if (isTRUE(verbose))
cli::cli_alert_info("Selecting variable features for dataset {.val {d}}")
ld <- dataset(object, d)
thresh_i <- thresh[datasetShared == d]
nGenes_i <- nGenes[datasetShared == d]
unsharedThresh_i <- unsharedThresh[datasetUnshared == d]
ld <- .selectGenes.ligerDataset(
ld, sharedFeature = sharedFeature, thresh = thresh_i,
nGenes = nGenes_i, unshared = d %in% datasetUnshared,
unsharedThresh = unsharedThresh_i,
nUMI = cellMeta(object, "nUMI", useDatasets = d),
alpha = alpha, chunk = chunk, verbose = verbose
)
selectList[[d]] <- rownames(ld)[featureMeta(ld)$isVariable]
datasets(object, check = FALSE)[[d]] <- ld
}
if (combine == "union") selected <- Reduce(union, selectList)
else selected <- Reduce(intersect, selectList)
if (length(selected) == 0) {
cli::cli_alert_danger("No genes were selected. Lower {.code thresh} values or set {.code combine = 'union'}")
} else {
if (isTRUE(verbose))
cli::cli_alert_success("Finally {length(selected)} shared variable feature{?s} are selected.")
}
varFeatures(object) <- selected
for (d in names(object)) {
ld <- dataset(object, d)
featureMeta(ld, check = FALSE)$selected <- rownames(ld) %in% selected
datasets(object, check = FALSE)[[d]] <- ld
}
return(object)
}
#' @param sharedFeature Character vector, the feature names that are common to
#' all datasets involved for the selection. Mostly set internally by the liger
#' S3 method. Default \code{NULL} tests with all available features.
#' @param unshared Logical, whether to select variable ones from unshared
#' features.
#' @param nUMI A vector of prior QC information, number of total counts per cell
#' derived with the raw count matrix of a dataset. Mostly set internally by the
#' liger S3 method.
#' @note
#' For \code{thresh}, \code{unsharedThresh} and \code{nGenes}, ligerDataset S3
#' method only accept a single value, which most of the time got passed from
#' upstream method chain.
#' @noRd
.selectGenes.ligerDataset <- function(
object,
nUMI,
sharedFeature = NULL,
thresh = .1,
nGenes = NULL,
unshared = FALSE,
unsharedThresh = .1,
alpha = .99,
chunk = 1000,
verbose = getOption("ligerVerbose", TRUE)
) {
if (is.null(normData(object))) cli::cli_abort("Normalized data not available.")
if (is.null(sharedFeature)) sharedFeature <- rownames(object)
sharedFeature <- rownames(object) %in% sharedFeature
unsharedFeature <- !sharedFeature
selected.shared <- logical(sum(sharedFeature))
selected.unshared <- logical(sum(unsharedFeature))
if (isH5Liger(object)) {
object <- calcGeneVars.H5(object, chunkSize = chunk, verbose = verbose)
} else {
featureMeta(object, check = FALSE)$geneMeans <-
Matrix::rowMeans(normData(object))
featureMeta(object, check = FALSE)$geneVars <-
rowVars_sparse_rcpp(normData(object), featureMeta(object)$geneMeans)
}
selected.shared <- .selectGenes.withMetric(
genes = rownames(object)[sharedFeature],
means = featureMeta(object)$geneMeans[sharedFeature],
vars = featureMeta(object)$geneVars[sharedFeature],
nUMI = nUMI, dims = dim(object), thresh = thresh, alpha = alpha,
nGenes = nGenes
)
featureMeta(object, check = FALSE)$isVariable <-
rownames(object) %in% selected.shared
if (isTRUE(verbose)) {
cli::cli_alert_success("... {length(selected.shared)} feature{?s} selected out of {sum(sharedFeature)} shared feature{?s}.")
}
if (isTRUE(unshared) && length(unsharedFeature) > 0) {
selected.unshared <- .selectGenes.withMetric(
genes = rownames(object)[unsharedFeature],
means = featureMeta(object)$geneMeans[unsharedFeature],
vars = featureMeta(object)$geneVars[unsharedFeature],
nUMI = nUMI, dims = dim(object), thresh = unsharedThresh,
alpha = alpha, nGenes = nGenes
)
object@varUnsharedFeatures <- selected.unshared
if (isTRUE(verbose)) {
cli::cli_alert_success("... {length(selected.unshared)} feature{?s} selected out of {sum(unsharedFeature)} unshared feature{?s}.")
}
}
return(object)
}
#' Calculate Gene Variance for ligerDataset object
#' @param object ligerDataset object
#' @param chunkSize Integer for the maximum number of cells in each chunk.
#' Default \code{1000}.
#' @param verbose Logical. Whether to show a progress bar. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @return The input \code{object} with calculated var updated in the H5 file.
#' @noRd
calcGeneVars.H5 <- function(object, chunkSize = 1000,
verbose = getOption("ligerVerbose", TRUE)) {
geneVars <- rep(0, nrow(object))
geneMeans <- featureMeta(object)$geneMeans
geneVars <- H5Apply(
object,
function(chunk, sparseXIdx, cellIdx, values) {
values + sumSquaredDeviations(chunk, geneMeans)
},
init = geneVars,
useData = "normData",
chunkSize = chunkSize,
verbose = verbose
)
geneVars <- geneVars / (ncol(object) - 1)
featureMeta(object, check = FALSE)$geneVars <- geneVars
object
}
#' @export
#' @rdname selectGenes
#' @method selectGenes Seurat
#' @param layer Where the input normalized counts should be from. Default
#' \code{"ligerNormData"}. For older Seurat, always retrieve from \code{data}
#' slot.
#' @param assay Name of assay to use. Default \code{NULL} uses current active
#' assay.
#' @param datasetVar Metadata variable name that stores the dataset source
#' annotation. Default \code{"orig.ident"}.
selectGenes.Seurat <- function(
object,
thresh = .1,
nGenes = NULL,
alpha = .99,
useDatasets = NULL,
layer = "ligerNormData",
assay = NULL,
datasetVar = "orig.ident",
combine = c("union", "intersection"),
verbose = getOption("ligerVerbose", TRUE),
...
) {
combine <- match.arg(combine)
assay <- assay %||% SeuratObject::DefaultAssay(object)
matList <- .getSeuratData(object, layer = layer, slot = "data",
assay = assay)
if (is.list(matList)) {
# object contain split layers
names(matList) <- gsub(paste0(layer, "."), "", names(matList))
datasetVar <- factor(rep(names(matList), sapply(matList, ncol)),
levels = names(matList))
} else {
datasetVar <- object[[datasetVar, drop = TRUE]]
if (!is.factor(datasetVar)) datasetVar <- factor(datasetVar)
datasetVar <- droplevels(datasetVar)
matList <- splitRmMiss(matList, datasetVar)
}
useDatasets <- useDatasets %||% levels(datasetVar)
matList <- matList[unique(useDatasets)]
featureList <- lapply(matList, rownames)
allshared <- Reduce(intersect, featureList)
allFeatures <- SeuratObject::Features(object, assay = assay)
thresh <- .checkArgLen(thresh, nlevels(datasetVar), class = "numeric")
# Get nUMI metric into list
nUMIVar <- paste0("nCount_", assay)
nUMIAll <- object[[nUMIVar]][,1]
nUMIList <- split(nUMIAll, datasetVar)
selectList <- list()
hvg.info <- data.frame(row.names = allFeatures)
for (d in levels(datasetVar)) {
if (isTRUE(verbose))
cli::cli_alert_info("Selecting variable features for dataset: {.val {d}}")
submat <- matList[[d]]
# submat <- mat[, datasetVar == d, drop = FALSE]
# submat <- submat[sort(expressed), , drop = FALSE]
sharedFeature <- rownames(submat) %in% allshared
means <- Matrix::rowMeans(submat)
hvg.info[[paste0("liger.mean.", d)]] <- 0
hvg.info[rownames(submat), paste0("liger.mean.", d)] <- means
vars <- rowVars_sparse_rcpp(submat, means)
hvg.info[[paste0("liger.variance.", d)]] <- 0
hvg.info[rownames(submat), paste0("liger.variance.", d)] <- vars
thresh_i <- thresh[levels(datasetVar) == d]
selected <- .selectGenes.withMetric(
genes = rownames(submat)[sharedFeature],
means = means[sharedFeature],
vars = vars[sharedFeature],
nUMI = nUMIList[[d]],
dims = dim(submat),
thresh = thresh_i, alpha = alpha,
nGenes = nGenes
)
if (isTRUE(verbose)) {
cli::cli_alert_success("... {length(selected)} features selected out of {length(allshared)} shared features")
}
selectList[[d]] <- selected
}
if (combine == "union") selected <- Reduce(union, selectList)
else selected <- Reduce(intersect, selectList)
if (length(selected) == 0) {
cli::cli_alert_danger("No genes were selected. Lower {.code thresh} values or set {.code combine = 'union'}")
} else {
if (isTRUE(verbose))
cli::cli_alert_success("Finally {length(selected)} shared variable features selected.")
}
hvg.info$liger.variable <- allFeatures %in% selected
assayObj <- Seurat::GetAssay(object, assay = assay)
assayObj[[names(hvg.info)]] <- hvg.info
object[[assay]] <- assayObj
SeuratObject::VariableFeatures(object, assay = assay) <- selected
return(object)
}
# returns selected gene names
.selectGenes.withMetric <- function(
genes,
means,
vars,
nUMI,
dims,
thresh = .1,
alpha = .99,
nGenes = NULL
) {
nolan_constant <- mean((1 / nUMI))
alphathresh.corrected <- alpha / dims[1]
geneMeanUpper <- means +
stats::qnorm(1 - alphathresh.corrected / 2) *
sqrt(means * nolan_constant / dims[2])
basegenelower <- log10(means * nolan_constant)
if (!is.null(nGenes)) {
preselect <- vars / nolan_constant > geneMeanUpper &
log10(vars) > basegenelower
relativeVar <- log10(vars) - basegenelower
names(relativeVar) <- genes
relativeVar <- relativeVar[preselect]
relativeVar <- relativeVar[order(relativeVar, decreasing = TRUE)]
selected <- names(relativeVar[seq(nGenes)])
} else {
selected <- genes[vars / nolan_constant > geneMeanUpper &
log10(vars) > basegenelower + thresh]
}
return(selected)
}
#' Plot the variance vs mean of feature expression
#' @description For each dataset where the feature variablitity is calculated,
#' a plot of log10 feature expression variance and log10 mean will be produced.
#' Features that are considered as variable would be highlighted in red.
#' @param object \linkS4class{liger} object. \code{\link{selectGenes}} needs to
#' be run in advance.
#' @param combinePlot Logical. If \code{TRUE}, sub-figures for all datasets will
#' be combined into one plot. if \code{FALSE}, a list of plots will be returned.
#' Default \code{TRUE}.
#' @param dotSize Controls the size of dots in the main plot. Default
#' \code{0.8}.
#' @param ... More theme setting parameters passed to
#' \code{\link{.ggplotLigerTheme}}.
#' @return \code{ggplot} object when \code{combinePlot = TRUE}, a list of
#' \code{ggplot} objects when \code{combinePlot = FALSE}
#' @export
#' @examples
#' pbmc <- normalize(pbmc)
#' pbmc <- selectGenes(pbmc)
#' plotVarFeatures(pbmc)
plotVarFeatures <- function(
object,
combinePlot = TRUE,
dotSize = 1,
...
) {
plotList <- list()
maxVar <- max(sapply(datasets(object),
function(ld) max(log10(featureMeta(ld)$geneVars))))
minVar <- min(sapply(datasets(object),
function(ld) min(log10(featureMeta(ld)$geneVars))))
maxMean <- max(sapply(datasets(object),
function(ld) max(log10(featureMeta(ld)$geneMeans))))
minMean <- min(sapply(datasets(object),
function(ld) min(log10(featureMeta(ld)$geneMeans))))
for (d in names(object)) {
ld <- dataset(object, d)
trx_per_cell <- cellMeta(object, "nUMI", cellIdx = object$dataset == d)
nolan_constant <- mean((1 / trx_per_cell))
data <- .DataFrame.as.data.frame(featureMeta(ld))
nSelect <- sum(data$isVariable)
data$geneMeans <- log10(data$geneMeans)
data$geneVars <- log10(data$geneVars)
data$isVariable <- factor(data$isVariable,
levels = c("TRUE", "FALSE"))
p <- ggplot2::ggplot(
data,
ggplot2::aes(x = .data[["geneMeans"]],
y = .data[["geneVars"]],
color = .data[["isVariable"]])
) +
ggplot2::geom_point(size = dotSize, stroke = 0) +
ggplot2::geom_abline(intercept = log10(nolan_constant), slope = 1,
color = "purple") +
ggplot2::xlim(minMean, maxMean) +
ggplot2::ylim(minVar, maxVar)
p <- .ggplotLigerTheme(p, title = d,
subtitle = paste0(nSelect, " variable features"),
xlab = "Gene Expression Mean (log10)",
ylab = "Gene Expression Variance (log10)",
legendColorTitle = "Variable\nfeature",
colorLabels = c("TRUE", "FALSE"),
colorValues = c("RED", "BLACK"),
...)
plotList[[d]] <- p
}
if (isTRUE(combinePlot)) {
suppressWarnings({
legend <- cowplot::get_legend(plotList[[1]])
})
plotList <- lapply(plotList, function(x) {
x + ggplot2::theme(legend.position = "none")
})
combined <- cowplot::plot_grid(plotlist = plotList,
align = "hv",
axis = "tblr")
combined <- cowplot::plot_grid(combined, legend, rel_widths = c(5,1))
return(combined)
}
else return(plotList)
}
#' Select variable genes from one dataset with Seurat VST method
#' @description
#' Seurat FindVariableFeatures VST method. This allows the selection of a fixed
#' number of variable features, but only applies to one dataset. No
#' normalization is needed in advance.
#' @param object A \linkS4class{liger} object.
#' @param useDataset The names, a numeric or logical index of the dataset to
#' be considered for selection.
#' @param n Number of variable features needed. Default \code{2000}.
#' @param loessSpan Loess span parameter used when fitting the variance-mean
#' relationship. Default \code{0.3}.
#' @param clipMax After standardization values larger than \code{clipMax} will
#' be set to \code{clipMax}. Default \code{"auto"} sets this value to the square
#' root of the number of cells.
#' @param useShared Logical. Whether to only select from genes shared by all
#' dataset. Default \code{TRUE}.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @references Seurat::FindVariableFeatures.default(selection.method = "vst")
#' @export
#' @examples
#' pbmc <- selectGenesVST(pbmc, "ctrl", n = 50)
selectGenesVST <- function(
object,
useDataset,
n = 2000,
loessSpan = 0.3,
clipMax = "auto",
useShared = TRUE,
verbose = getOption("ligerVerbose", TRUE))
{
useDataset <- .checkUseDatasets(object, useDataset)
useDataset <- .checkArgLen(useDataset, 1)
if (isTRUE(verbose)) {
cli::cli_alert_info("Selecting top {n} HVG{?s} with VST method for dataset: {.val {useDataset}}")
}
ld <- dataset(object, useDataset)
data <- rawData(ld)
if (isTRUE(useShared)) {
useGenes <- lapply(datasets(object), rownames)
useGenes <- Reduce(intersect, useGenes)
} else {
useGenes <- rownames(data)
}
if (isTRUE(verbose)) {
cli::cli_alert_info("... Totally {length(useGenes)} {ifelse(useShared, 'shared', 'dataset specific')} genes to be selected from.")
}
if (clipMax == "auto") {
clipMax <- sqrt(ncol(data))
}
hvf.info <- data.frame(mean = Matrix::rowMeans(data))
hvf.info$variance <- rowVars_sparse_rcpp(data, hvf.info$mean)
not.const <- hvf.info$variance > 0
hvf.info$variance.expected <- 0
fit <- stats::loess(formula = log10(variance) ~ log10(mean),
data = hvf.info[not.const, ],
span = loessSpan)
hvf.info$variance.expected[not.const] <- 10^fit$fitted
# `SparseRowVarStd` Rcpp function reimplemented with Armadillo,
# Seurat original uses Eigen
hvf.info$variance.standardized <- SparseRowVarStd(
x = data,
mu = hvf.info$mean,
sd = sqrt(hvf.info$variance.expected),
vmax = clipMax
)
colnames(hvf.info) <- c("mean", "var", "vst.variance.expected",
"vst.variance.standardized")
rank <- order(hvf.info$vst.variance.standardized, decreasing = TRUE)
selected <- rownames(ld)[rank][seq(n)]
selected <- selected[selected %in% useGenes]
varFeatures(object) <- selected
fm <- featureMeta(ld)
for (col in colnames(hvf.info)) fm[[col]] <- hvf.info[[col]]
fm$selected <- FALSE
fm$selected[rank[seq(n)]] <- TRUE
featureMeta(ld, check = FALSE) <- fm
datasets(object, check = FALSE)[[useDataset]] <- ld
return(object)
}
############################# Scale Not Center #################################
#' Scale genes by root-mean-square across cells
#' @description This function scales normalized gene expression data after
#' variable genes have been selected. We do not mean-center the data before
#' scaling in order to address the non-negativity constraint of NMF.
#' Computation applied to each normalized dataset matrix can form the following
#' equation:
#'
#' \deqn{S_{i,j}=\frac{N_{i,j}}{\sqrt{\sum_{p}^{n}\frac{N_{i,p}^2}{n-1}}}}
#'
#' Where \eqn{N} denotes the normalized matrix for an individual dataset,
#' \eqn{S} is the output scaled matrix for this dataset, and \eqn{n} is the
#' number of cells in this dataset. \eqn{i, j} denotes the specific gene and
#' cell index, and \eqn{p} is the cell iterator.
#'
#' Please see detailed section below for explanation on methylation dataset.
#' @note
#' Since the scaling on genes is applied on a per dataset base, other scaling
#' methods that apply to a whole concatenated matrix of multiple datasets might
#' not be considered as equivalent alternatives, even if options like
#' \code{center} are set to \code{FALSE}. Hence we implemented an efficient
#' solution that works under such circumstance, provided with the Seurat S3
#' method.
#' @section Methylation dataset:
#' Because gene body mCH proportions are negatively correlated with gene
#' expression level in neurons, we need to reverse the direction of the
#' methylation data before performing the integration. We do this by simply
#' subtracting all values from the maximum methylation value. The resulting
#' values are positively correlated with gene expression. This will only be
#' applied to variable genes detected in prior. Please make sure that argument
#' \code{modal} is set accordingly when running \code{\link{createLiger}}. In
#' this way, this function can automatically detect it and take proper action.
#' If it is not set, users can still manually have the equivalent processing
#' done by doing \code{scaleNotCenter(lig, useDataset = c("other", "datasets"))},
#' and then \code{\link{reverseMethData}(lig, useDataset = c("meth", "datasets"))}.
#' @param object \linkS4class{liger} object, \linkS4class{ligerDataset} object,
#' \link[Matrix]{dgCMatrix-class} object, or a Seurat object.
#' @param ... Arguments passed to other methods. The order goes by: "liger"
#' method calls "ligerDataset" method", which then calls "dgCMatrix" method.
#' "Seurat" method directly calls "dgCMatrix" method.
#' @return Updated \code{object}
#' \itemize{
#' \item{dgCMatrix method - Returns scaled dgCMatrix object}
#' \item{ligerDataset method - Updates the \code{scaleData} and
#' \code{scaledUnsharedData} (if unshared variable feature available) slot
#' of the object}
#' \item{liger method - Updates the \code{scaleData} and
#' \code{scaledUnsharedData} (if unshared variable feature available) slot
#' of chosen datasets}
#' \item{Seurat method - Adds a named layer in chosen assay (V5), or update the
#' \code{scale.data} slot of the chosen assay (<=V4)}
#' }
#' @export
#' @rdname scaleNotCenter
#' @examples
#' pbmc <- normalize(pbmc)
#' pbmc <- selectGenes(pbmc)
#' pbmc <- scaleNotCenter(pbmc)
scaleNotCenter <- function(object, ...) {
UseMethod("scaleNotCenter", object)
}
#' @export
#' @rdname scaleNotCenter
#' @method scaleNotCenter dgCMatrix
scaleNotCenter.dgCMatrix <- function(
object,
...)
{
if (nrow(object) == 0) return(object)
scaled <- scaleNotCenter_byRow_rcpp(object)
scaled@x[is.na(scaled@x)] <- 0 # Is this really happening?
dimnames(scaled) <- dimnames(object)
return(scaled)
}
#' @export
#' @rdname scaleNotCenter
#' @method scaleNotCenter ligerDataset
#' @param features Character, numeric or logical index that choose the variable
#' feature to be scaled. "liger" method by default uses
#' \code{\link{varFeatures}(object)}. "ligerDataset" method by default uses all
#' features. "Seurat" method by default uses
#' \code{Seurat::VariableFeatures(object)}.
#' @param chunk Integer. Number of maximum number of cells in each chunk, when
#' scaling is applied to any HDF5 based dataset. Default \code{1000}.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
scaleNotCenter.ligerDataset <- function(
object,
features = NULL,
chunk = 1000,
verbose = getOption("ligerVerbose", TRUE),
...
) {
features <- .idxCheck(object, features, "feature")
unsharedIdx <- .idxCheck(object, object@varUnsharedFeatures, "feature")
if (!isH5Liger(object)) {
scaleData(object) <- scaleNotCenter(
normData(object)[features, , drop = FALSE]
)
if (length(unsharedIdx) > 0)
scaleUnsharedData(object) <- scaleNotCenter(
normData(object)[unsharedIdx, , drop = FALSE]
)
} else {
object <- .scaleH5SpMatrix(object, features,
resultH5Path = "scaleDataSparse",
chunk = chunk, verbose = verbose)
if (length(unsharedIdx) > 0)
object <- .scaleH5SpMatrix(object, unsharedIdx,
resultH5Path = "scaleUnsharedDataSparse",
chunk = chunk, verbose = verbose)
}
return(object)
}
#' @export
#' @rdname scaleNotCenter
#' @method scaleNotCenter ligerMethDataset
scaleNotCenter.ligerMethDataset <- function(
object,
features = NULL,
verbose = getOption("ligerVerbose", TRUE),
...
) {
raw <- rawData(object)
scaled <- max(raw) - as.matrix(raw[features, , drop = FALSE])
# Need more experiment to see if it's worth doing so
scaled <- methods::as(scaled, "CsparseMatrix")
scaleData(object, check = FALSE) <- scaled
return(object)
}
#' @export
#' @rdname scaleNotCenter
#' @method scaleNotCenter liger
#' @param useDatasets A character vector of the names, a numeric or logical
#' vector of the index of the datasets to be scaled but not centered. Default
#' \code{NULL} applies to all datasets.
#' @param remove.missing \bold{Deprecated}. The functionality of this is covered
#' through other parts of the whole workflow and is no long needed. Will be
#' ignored if specified.
scaleNotCenter.liger <- function(
object,
useDatasets = NULL,
features = varFeatures(object),
verbose = getOption("ligerVerbose", TRUE),
remove.missing = NULL,
...
) {
.deprecateArgs(defunct = "remove.missing")
.checkObjVersion(object)
if (is.null(features) || length(features) == 0) {
cli::cli_abort("No variable feature specified. Run {.fn selectGenes} first.")
}
useDatasets <- .checkUseDatasets(object, useDatasets)
object <- recordCommand(object, ...,
dependencies = c("RcppArmadillo", "Rcpp"))
for (d in useDatasets) {
if (isTRUE(verbose)) cliID <- cli::cli_process_start("Scaling dataset {.val {d}}")
ld <- dataset(object, d)
ld <- scaleNotCenter(ld, features = features, verbose = verbose, ...)
datasets(object, check = FALSE)[[d]] <- ld
if (isTRUE(verbose)) cli::cli_process_done(id = cliID)
}
return(object)
}
#' @rdname scaleNotCenter
#' @export
#' @param assay Name of assay to use. Default \code{NULL} uses current active
#' assay.
#' @param save For Seurat>=4.9.9, the name of layer to store normalized data.
#' Default \code{"ligerScaleData"}. For older Seurat, stored to
#' \code{scale.data} slot.
#' @param layer For Seurat>=4.9.9, the name of layer to retrieve normalized
#' data. Default \code{"ligerNormData"}. For older Seurat, always retrieve from
#' \code{data} slot.
#' @param datasetVar Metadata variable name that stores the dataset source
#' annotation. Default \code{"orig.ident"}.
#' @method scaleNotCenter Seurat
scaleNotCenter.Seurat <- function(
object,
assay = NULL,
layer = "ligerNormData",
save = "ligerScaleData",
datasetVar = "orig.ident",
features = NULL,
...
) {
normed <- .getSeuratData(object, layer = layer, slot = "data",
assay = assay)
features <- features %||% SeuratObject::VariableFeatures(object)
if (!length(features)) {
cli::cli_abort("No variable feature specified. Run {.fn selectGenes} first")
}
if (is.list(normed)) {
scaled <- lapply(normed, function(x) {
scaleNotCenter(x[features, , drop = FALSE])
})
} else {
# Condition for all batches in one matrix
normed <- normed[features, , drop = FALSE]
# the last [,1] converts data.frame to the vector/factor
datasetVar <- object[[datasetVar, drop = TRUE]]
if (!is.factor(datasetVar)) datasetVar <- factor(datasetVar)
datasetVar <- droplevels(datasetVar)
nlevel <- nlevels(datasetVar)
datasetVar <- as.integer(datasetVar) - 1
# efficient solution where we don't need to split and merge
scaled <- scaleNotCenter_byRow_perDataset_rcpp(normed, datasetVar, nlevel)
scaled@x[is.na(scaled@x)] <- 0
dimnames(scaled) <- list(features, colnames(normed))
}
object <- .setSeuratData(object, layer = layer, save = save,
slot = "scale.data",
value = scaled, assay = assay,
denseIfNeeded = TRUE)
return(object)
}
.scaleH5SpMatrix <- function(ld, featureIdx, resultH5Path, chunk, verbose) {
features <- rownames(ld)[featureIdx]
geneSumSq <- featureMeta(ld)$geneSumSq[featureIdx]
nCells <- ncol(ld)
geneRootMeanSumSq = sqrt(geneSumSq / (nCells - 1))
h5file <- getH5File(ld)
# Count the subset nnz first before creating data space
nnz <- 0
nnz <- H5Apply(
ld, useData = "normData", chunkSize = chunk, verbose = FALSE,
FUN = function(chunk, sparseXIdx, cellIdx, values) {
chunk <- chunk[featureIdx, , drop = FALSE]
values <- values + length(chunk@x)
},
init = nnz
)
# Create datasets
dataPath <- paste0(resultH5Path, "/data")
rowindPath <- paste0(resultH5Path, "/indices")
colptrPath <- paste0(resultH5Path, "/indptr")
safeH5Create(ld, dataPath = dataPath, dims = nnz,
dtype = "double", chunkSize = 2048)
safeH5Create(ld, dataPath = rowindPath, dims = nnz,
dtype = "int", chunkSize = 2048)
safeH5Create(ld, dataPath = colptrPath, dims = nCells + 1,
dtype = "int", chunkSize = 1024)
# Process chunks of sparse normData, and write to sparse scaleData
h5file[[colptrPath]][1] <- 0
H5Apply(
ld,
useData = "normData",
init = c(1, 0), # [1] record of nnz idx start [2] record of last colptr
chunkSize = chunk,
verbose = verbose,
FUN = function(chunk, sparseXIdx, cellIdx, values) {
# Subset variable features
chunk <- chunk[featureIdx, , drop = FALSE]
# Calculate scale not center
chunk <- rowDivide_rcpp(chunk, geneRootMeanSumSq)
chunk@x[is.na(chunk@x)] = 0
# Write
nnzRange <- seq(from = values[1], length.out = length(chunk@i))
h5file[[rowindPath]][nnzRange] <- chunk@i
h5file[[dataPath]][nnzRange] <- chunk@x
values[1] <- values[1] + length(nnzRange)
increColptr <- chunk@p + values[2]
h5file[[colptrPath]][cellIdx + 1] =
increColptr[seq(2, length(increColptr))]
values[2] <- increColptr[length(increColptr)]
return(values)
}
)
safeH5Create(ld, dataPath = paste0(resultH5Path, "/featureIdx"),
dims = length(features), dtype = "int")
h5file[[paste0(resultH5Path, "/featureIdx")]][1:length(featureIdx)] <-
featureIdx
h5fileInfo(ld, "scaleData", check = FALSE) <- resultH5Path
return(ld)
}
#' Create "scaled data" for DNA methylation datasets
#' @description
#' Because gene body mCH proportions are negatively correlated with gene
#' expression level in neurons, we need to reverse the direction of the
#' methylation data. We do this by simply subtracting all values from the
#' maximum methylation value. The resulting values are positively correlated
#' with gene expression. This will only be applied to variable genes detected in
#' prior.
#' @param object A \linkS4class{liger} object, with variable genes identified.
#' @param useDatasets Required. A character vector of the names, a numeric or
#' logical vector of the index of the datasets that should be identified as
#' methylation data where the reversed data will be created.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @return The input \linkS4class{liger} object, where the \code{scaleData} slot
#' of the specified datasets will be updated with value as described in
#' Description.
#' @export
#' @examples
#' # Assuming the second dataset in example data "pbmc" is methylation data
#' pbmc <- normalize(pbmc, useDatasets = 1)
#' pbmc <- selectGenes(pbmc, datasets.use = 1)
#' pbmc <- scaleNotCenter(pbmc, useDatasets = 1)
#' pbmc <- reverseMethData(pbmc, useDatasets = 2)
reverseMethData <- function(object, useDatasets,
verbose = getOption("ligerVerbose", TRUE)) {
useDatasets <- .checkUseDatasets(object, useDatasets)
if (is.null(varFeatures(object)) || length(varFeatures(object)) == 0) {
cli::cli_abort("No variable feature available. Run {.fn selectGenes} first.")
}
for (d in useDatasets) {
ld <- dataset(object, d)
raw <- rawData(ld)
if (isTRUE(verbose)) cli::cli_alert_info("Substracting methylation data: {.val {d}}")
scaleData(ld, check = FALSE) <- methods::as(
max(raw) - raw[varFeatures(object), , drop = FALSE],
"CsparseMatrix"
)
datasets(object, check = FALSE)[[d]] <- ld
}
return(object)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.