Nothing
#' Create liger object
#' @description This function allows creating \linkS4class{liger} object from
#' multiple datasets of various forms (See \code{rawData}).
#'
#' \bold{DO} make a copy of the H5AD files because rliger functions write to
#' the files and they will not be able to be read back to Python. This will be
#' fixed in the future.
#' @param rawData Named list of datasets. Required. Elements allowed include a
#' matrix, a \code{Seurat} object, a \code{SingleCellExperiment} object, an
#' \code{AnnData} object, a \linkS4class{ligerDataset} object or a filename to
#' an HDF5 file. See detail for HDF5 reading.
#' @param modal Character vector for modality setting. Use one string for all
#' datasets, or the same number of strings as the number of datasets. Currently
#' options of \code{"default"}, \code{"rna"}, \code{"atac"}, \code{"spatial"}
#' and \code{"meth"} are supported.
#' @param organism Character vector for setting organism for identifying mito,
#' ribo and hemo genes for expression percentage calculation. Use one string for
#' all datasets, or the same number of strings as the number of datasets.
#' Currently options of \code{"mouse"}, \code{"human"}, \code{"zebrafish"},
#' \code{"rat"}, and \code{"drosophila"} are supported.
#' @param cellMeta data.frame of metadata at single-cell level. Default
#' \code{NULL}.
#' @param removeMissing Logical. Whether to remove cells that do not have any
#' counts from each dataset. Default \code{TRUE}.
#' @param addPrefix Logical. Whether to add "datasetName_" as a prefix of
#' cell identifiers (e.g. barcodes) to avoid duplicates in multiple libraries (
#' common with 10X data). Default \code{"auto"} detects if matrix columns
#' already has the exact prefix or not. Logical value forces the action.
#' @param formatType Select preset of H5 file structure. Current available
#' options are \code{"10x"} and \code{"anndata"}. Can be either a single
#' specification for all datasets or a character vector that match with each
#' dataset.
#' @param anndataX The HDF5 path to the raw count data in an H5AD file. See
#' \code{\link{createH5LigerDataset}} Details. Default \code{"X"}.
#' @param dataName,indicesName,indptrName The path in a H5 file for the raw
#' sparse matrix data. These three types of data stands for the \code{x},
#' \code{i}, and \code{p} slots of a \code{\link[Matrix]{dgCMatrix-class}}
#' object. Default \code{NULL} uses \code{formatType} preset.
#' @param genesName,barcodesName The path in a H5 file for the gene names and
#' cell barcodes. Default \code{NULL} uses \code{formatType} preset.
#' @param newH5 When using HDF5 based data and subsets created after removing
#' missing cells/features, whether to create new HDF5 files for the subset.
#' Default \code{TRUE}. If \code{FALSE}, data will be subset into memory and
#' can be dangerous for large scale analysis.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @param ... Additional slot values that should be directly placed in object.
#' @param raw.data,remove.missing,format.type,data.name,indices.name,indptr.name,genes.name,barcodes.name
#' `r lifecycle::badge("superseded")` See Usage section for replacement.
#' @param take.gene.union `r lifecycle::badge("defunct")` Will be ignored.
#' @export
#' @seealso \code{\link{createLigerDataset}}, \code{\link{createH5LigerDataset}}
#' @examples
#' # Create from raw count matrices
#' ctrl.raw <- rawData(pbmc, "ctrl")
#' stim.raw <- rawData(pbmc, "stim")
#' pbmc1 <- createLiger(list(ctrl = ctrl.raw, stim = stim.raw))
#'
#' # Create from H5 files
#' h5Path <- system.file("extdata/ctrl.h5", package = "rliger")
#' tempPath <- tempfile(fileext = ".h5")
#' file.copy(from = h5Path, to = tempPath)
#' lig <- createLiger(list(ctrl = tempPath))
#'
#' # Create from other container object
#' if (requireNamespace("SeuratObject", quietly = TRUE)) {
#' ctrl.seu <- SeuratObject::CreateSeuratObject(ctrl.raw)
#' stim.seu <- SeuratObject::CreateSeuratObject(stim.raw)
#' pbmc2 <- createLiger(list(ctrl = ctrl.seu, stim = stim.seu))
#' }
createLiger <- function(
rawData,
modal = NULL,
organism = "human",
cellMeta = NULL,
removeMissing = TRUE,
addPrefix = "auto",
formatType = "10X",
anndataX = "X",
dataName = NULL,
indicesName = NULL,
indptrName = NULL,
genesName = NULL,
barcodesName = NULL,
newH5 = TRUE,
verbose = getOption("ligerVerbose", TRUE),
...,
# Deprecated coding style
raw.data = rawData,
take.gene.union = NULL,
remove.missing = removeMissing,
format.type = formatType,
data.name = dataName,
indices.name = indicesName,
indptr.name = indptrName,
genes.name = genesName,
barcodes.name = barcodesName
) {
.deprecateArgs(list(raw.data = "rawData", remove.missing = "removeMissing",
format.type = "formatType", data.name = "dataName",
indices.name = "indicesName",
indptr.name = "indptrName", genes.name = "genesName",
barcodes.name = "barcodesName"),
defunct = "take.gene.union")
if (!is.list(rawData) ||
is.null(names(rawData)) ||
any(nchar(names(rawData)) == 0)) {
cli::cli_abort("{.var rawData} has to be a named list.")
}
nData <- length(rawData)
if (missing(modal) || is.null(modal)) modal <- "default"
modal <- tolower(modal)
modal <- .checkArgLen(modal, nData, repN = TRUE, class = "character")
organism <- tolower(organism)
organism <- .checkArgLen(organism, nData, repN = TRUE, class = "character")
if (!all(organism %in% c("mouse", "human", "zebrafish", "rat", "drosophila"))) {
cli::cli_abort("Invalid {.var organism} value. Only support: mouse, human, zebrafish, rat, drosophila.")
}
# TODO handle h5 specific argument for hybrid of H5 and in memory stuff.
datasets <- list()
barcodesOrig <- NULL
cellID <- NULL
for (i in seq_along(rawData)) {
dname <- names(rawData)[i]
data <- rawData[[i]]
if (is.character(data)) {
# Assuming character input is a filename
ld <- createH5LigerDataset(
h5file = data,
formatType = formatType,
anndataX = anndataX,
rawData = dataName,
barcodesName = barcodesName,
genesName = genesName,
indicesName = indicesName,
indptrName = indptrName,
modal = modal[i]
)
} else {
ld <- as.ligerDataset(data, modal = modal[i])
}
if (is.null(ld)) next
colnameOrig <- colnames(ld)
prefix <- paste0(dname, "_")
.addPrefix <- FALSE
if (addPrefix == "auto") {
# If all colnames starts with the prefix wanted, don't add it again
.addPrefix <- !all(startsWith(colnameOrig, prefix))
}
barcodesOrig <- c(barcodesOrig, colnameOrig)
if (.addPrefix) {
colnames(ld) <- paste0(prefix, colnameOrig)
}
datasets[[dname]] <- ld
}
datasets <- lapply(datasets, function(ld) {
colnames(ld) <- make.unique(colnames(ld))
return(ld)
})
cellID <- unlist(lapply(datasets, colnames), use.names = FALSE)
if (is.null(cellMeta)) {
cellMeta <- S4Vectors::DataFrame(
dataset = factor(rep(names(datasets), sapply(datasets, ncol)),
levels = names(datasets)),
barcode = barcodesOrig,
row.names = cellID)
} else {
cellMeta <- S4Vectors::DataFrame(cellMeta)
# TODO Whether the given cell metadata dataframe should have original
# barcodes or processed cellID?
cellMeta <- cellMeta[barcodesOrig, , drop = FALSE]
rownames(cellMeta) <- cellID
# Force writing `dataset` variable as named by @datasets
cellMeta$dataset <- factor(rep(names(datasets),
lapply(datasets, ncol)))
}
obj <- methods::new("liger",
datasets = datasets,
cellMeta = cellMeta,
...)
for (i in seq_along(datasets)) {
obj <- runGeneralQC(
object = obj,
organism = organism[i],
useDatasets = i,
overwrite = TRUE,
verbose = verbose
)
}
if (isTRUE(removeMissing)) {
obj <- removeMissing(obj, "cell", filenameSuffix = "qc",
verbose = verbose, newH5 = newH5)
}
return(obj)
}
#' Create in-memory ligerDataset object
#' @param rawData,normData,scaleData A \code{\link[Matrix]{dgCMatrix-class}}
#' object for the raw or normalized expression count or a dense matrix of scaled
#' variable gene expression, respectively. Default \code{NULL} for all three but
#' at lease one has to be specified.
#' @param modal Name of modality for this dataset. Currently options of
#' \code{"default"}, \code{"rna"}, \code{"atac"}, \code{"spatial"} and
#' \code{"meth"} are supported. Default \code{"default"}.
#' @param featureMeta Data frame of feature metadata. Default \code{NULL}.
#' @param ... Additional slot data. See \linkS4class{ligerDataset} for detail.
#' Given values will be directly placed at corresponding slots.
#' @seealso \linkS4class{ligerDataset}, \linkS4class{ligerATACDataset},
#' \linkS4class{ligerSpatialDataset}, \linkS4class{ligerMethDataset}
#' @export
#' @examples
#' ctrl.raw <- rawData(pbmc, "ctrl")
#' ctrl.ld <- createLigerDataset(ctrl.raw)
createLigerDataset <- function(
rawData = NULL,
modal = c("default", "rna", "atac", "spatial", "meth"),
normData = NULL,
scaleData = NULL,
featureMeta = NULL,
...
) {
modal <- match.arg(modal)
args <- as.list(environment())
additional <- list(...)
# Necessary initialization of slots
if (is.null(rawData) && is.null(normData)) {
cli::cli_abort("At least one of {.field rawData} or {.field normData} has to be provided.")
}
# Look for proper colnames and rownames
cn <- NULL
rn <- NULL
for (i in c("rawData", "normData", "scaleData")) {
cn <- colnames(args[[i]])
if (!is.null(cn)) break
}
if (!is.null(rawData)) {
rn <- rownames(rawData)
if (!inherits(rawData, "dgCMatrix"))
rawData <- methods::as(rawData, "CsparseMatrix")
}
if (!is.null(normData)) {
if (is.null(rn)) rn <- rownames(normData)
if (!inherits(normData, "dgCMatrix"))
normData <- methods::as(normData, "CsparseMatrix")
}
if (!is.null(scaleData)) {
if (is.null(rn)) rn <- rownames(scaleData)
if (!inherits(scaleData, "matrix"))
scaleData <- methods::as(scaleData, "matrix")
}
if (is.null(h5fileInfo)) h5fileInfo <- list()
if (is.null(featureMeta))
featureMeta <- S4Vectors::DataFrame(row.names = rn)
else if (!inherits(featureMeta, "DFrame"))
featureMeta <- S4Vectors::DataFrame(featureMeta)
# Create ligerDataset
allData <- list(.modalClassDict[[modal]],
rawData = rawData, normData = normData,
scaleData = scaleData, featureMeta = featureMeta,
colnames = cn, rownames = rn)
allData <- c(allData, additional)
x <- do.call("new", allData)
return(x)
}
#' Create on-disk ligerDataset Object
#' @description
#' For convenience, the default \code{formatType = "10x"} directly fits the
#' structure of cellranger output. \code{formatType = "anndata"} works for
#' current AnnData H5AD file specification (see Details). If a customized H5
#' file structure is presented, any of the \code{rawData},
#' \code{indicesName}, \code{indptrName}, \code{genesName}, \code{barcodesName}
#' should be specified accordingly to override the \code{formatType} preset.
#'
#' \bold{DO} make a copy of the H5AD files because rliger functions write to
#' the files and they will not be able to be read back to Python. This will be
#' fixed in the future.
#' @details
#' For H5AD file written from an AnnData object, we allow using
#' \code{formatType = "anndata"} for the function to infer the proper structure.
#' However, while a typical AnnData-based analysis tends to in-place update the
#' \code{adata.X} attribute and there is no standard/forced convention for where
#' the raw count data, as needed from LIGER, is stored. Therefore, we expose
#' argument \code{anndataX} for specifying this information. The default value
#' \code{"X"} looks for \code{adata.X}. If the raw data is stored in a layer,
#' e.g. \code{adata.layers['count']}, then \code{anndataX = "layers/count"}.
#' If it is stored to \code{adata.raw.X}, then \code{anndataX = "raw/X"}. If
#' your AnnData object does not have the raw count retained, you will have to
#' go back to the Python work flow to have it inserted at desired object space
#' and re-write the H5AD file, or just go from upstream source files with which
#' the AnnData was originally created.
#' @param h5file Filename of an H5 file
#' @param formatType Select preset of H5 file structure. Default \code{"10X"}.
#' Alternatively, we also support \code{"anndata"} for H5AD files.
#' @param rawData,indicesName,indptrName The path in a H5 file for the raw
#' sparse matrix data. These three types of data stands for the \code{x},
#' \code{i}, and \code{p} slots of a \code{\link[Matrix]{dgCMatrix-class}}
#' object. Default \code{NULL} uses \code{formatType} preset.
#' @param normData The path in a H5 file for the "x" vector of the normalized
#' sparse matrix. Default \code{NULL}.
#' @param scaleData The path in a H5 file for the Group that contains the sparse
#' matrix constructing information for the scaled data. Default \code{NULL}.
#' @param genesName,barcodesName The path in a H5 file for the gene names and
#' cell barcodes. Default \code{NULL} uses \code{formatType} preset.
#' @param anndataX The HDF5 path to the raw count data in an H5AD file. See
#' Details. Default \code{"X"}.
#' @param modal Name of modality for this dataset. Currently options of
#' \code{"default"}, \code{"rna"}, \code{"atac"}, \code{"spatial"} and
#' \code{"meth"} are supported. Default \code{"default"}.
#' @param featureMeta Data frame for feature metadata. Default \code{NULL}.
#' @param ... Additional slot data. See \linkS4class{ligerDataset} for detail.
#' Given values will be directly placed at corresponding slots.
#' @export
#' @return H5-based \linkS4class{ligerDataset} object
#' @examples
#' h5Path <- system.file("extdata/ctrl.h5", package = "rliger")
#' tempPath <- tempfile(fileext = ".h5")
#' file.copy(from = h5Path, to = tempPath)
#' ld <- createH5LigerDataset(tempPath)
createH5LigerDataset <- function(
h5file,
formatType = "10x",
rawData = NULL,
normData = NULL,
scaleData = NULL,
barcodesName = NULL,
genesName = NULL,
indicesName = NULL,
indptrName = NULL,
anndataX = "X",
modal = c("default", "rna", "atac", "spatial", "meth"),
featureMeta = NULL,
...
) {
# Currently commented this check because it fails for unknown reason on
# Windows platform when subsetting H5 to H5, even if the following part
# works without passing this check
# if (!hdf5r::is.h5file(h5file)) {
# stop("`h5file`: Invalid HDF5 file or file path: ", h5file)
# }
modal <- match.arg(modal)
additional <- list(...)
h5file <- hdf5r::H5File$new(h5file, mode = "r+")
tryCatch(
{
if (!is.null(formatType)) {
formatType <- tolower(formatType)
if (formatType == "10x") {
barcodesName <- barcodesName %||% "matrix/barcodes"
barcodes <- h5file[[barcodesName]][]
rawData <- rawData %||% "matrix/data"
indicesName <- indicesName %||% "matrix/indices"
indptrName <- indptrName %||% "matrix/indptr"
genesName <- genesName %||% "matrix/features/name"
genes <- h5file[[genesName]][]
} else if (formatType == "anndata") {
# AnnData specs changed around 0.7.0
if (inherits(h5file[["obs"]], "H5Group")) {
barcodesName <- paste0("obs/", hdf5r::h5attr(h5file[["obs"]], "_index"))
barcodes <- h5file[[barcodesName]][]
genesName <- paste0("var/", hdf5r::h5attr(h5file[["var"]], "_index"))
genes <- h5file[[genesName]][]
} else if (inherits(h5file[["obs"]], "H5D")) {
barcodesName <- "obs"
barcodes <- h5file[["obs"]][]$cell
genesName <- genesName %||% "raw.var"
genes <- h5file[[genesName]][]
}
rawData <- rawData %||% paste0(anndataX, "/data")
indicesName <- indicesName %||% paste0(anndataX, "/indices")
indptrName <- indptrName %||% paste0(anndataX, "/indptr")
if (!inherits(h5file[[rawData]]$key_info$type, "H5T_INTEGER")) {
cli::cli_alert_warning(
"Warning: H5AD matrix data detected is not of integer dtype while {.pkg rliger} requires raw counts input."
)
cli::cli_alert_warning(
"Use caution when using this dataset if the data really does not have integer values."
)
cli::cli_alert_info(
"A different matrix can be selected using argument {.var anndataX}."
)
cli::cli_alert_info(
"See {.code ?createH5LigerDataset} {.emph Details} for more information."
)
}
} else {
cli::cli_abort("Specified {.var formatType} ({.val {formatType}}) is not supported for now.")
}
} else {
barcodes <- h5file[[barcodesName]][]
genes <- h5file[[genesName]][]
}
# The order of list elements matters. Put "paths" together so easier for
# checking link existence.
h5.meta <- list(
H5File = h5file,
filename = h5file$filename,
formatType = formatType,
indicesName = indicesName,
indptrName = indptrName,
barcodesName = barcodesName,
genesName = genesName,
rawData = rawData,
normData = normData,
scaleData = scaleData
)
if (!is.null(rawData)) rawData <- h5file[[rawData]]
if (!is.null(normData)) normData <- h5file[[normData]]
if (!is.null(scaleData)) scaleData <- h5file[[scaleData]]
if (is.null(featureMeta))
featureMeta <- S4Vectors::DataFrame(row.names = genes)
else if (!inherits(featureMeta, "DFrame"))
featureMeta <- S4Vectors::DataFrame(featureMeta)
allData <- list(Class = .modalClassDict[[modal]],
rawData = rawData,
normData = normData,
scaleData = scaleData,
#H = H, V = V, A = A, B = B, U = U,
h5fileInfo = h5.meta, featureMeta = featureMeta,
colnames = barcodes, rownames = genes)
allData <- c(allData, additional)
x <- do.call("new", allData)
return(x)
},
error = function(e) {
message(format(e))
cli::cli_alert_warning("Closing all linking to H5 file: {.file {h5file$filename}}")
h5file$close_all()
return(invisible(NULL))
}
)
}
#' Read liger object from RDS file
#' @description
#' This file reads a liger object stored in RDS files under all kinds of types.
#' 1. A \linkS4class{liger} object with in-memory data created from package
#' version since 1.99.
#' 2. A liger object with on-disk H5 data associated, where the link to H5 files
#' will be automatically restored.
#' 3. A liger object created with older package version, and can be updated to
#' the latest data structure by default.
#' @param filename Path to an RDS file of a \code{liger} object of old versions.
#' @param dimredName The name of variable in \code{cellMeta} slot to store the
#' dimensionality reduction matrix, which originally located in
#' \code{tsne.coords} slot. Default \code{"tsne.coords"}.
#' @param clusterName The name of variable in \code{cellMeta} slot to store the
#' clustering assignment, which originally located in \code{clusters} slot.
#' Default \code{"clusters"}.
#' @param h5FilePath Named character vector for all H5 file paths. Not required
#' for object run with in-memory analysis. For object containing H5-based
#' analysis (e.g. online iNMF), this must be supplied if the H5 file location is
#' different from that at creation time.
#' @param update Logical, whether to update an old (<=1.99.0) \code{liger} object
#' to the currect version of structure. Default \code{TRUE}.
#' @return New version of \linkS4class{liger} object
#' @export
#' @examples
#' # Save and read regular current-version liger object
#' tempPath <- tempfile(fileext = ".rds")
#' saveRDS(pbmc, tempPath)
#'
#' pbmc <- readLiger(tempPath, dimredName = NULL)
#'
#' # Save and read H5-based liger object
#' h5Path <- system.file("extdata/ctrl.h5", package = "rliger")
#' h5tempPath <- tempfile(fileext = ".h5")
#' file.copy(from = h5Path, to = h5tempPath)
#' lig <- createLiger(list(ctrl = h5tempPath))
#' tempPath <- tempfile(fileext = ".rds")
#' saveRDS(lig, tempPath)
#'
#' lig <- readLiger(tempPath, h5FilePath = list(ctrl = h5tempPath))
#'
#' \dontrun{
#' # Read a old liger object <= 1.0.1
#' # Assume the dimensionality reduction method applied was UMAP
#' # Assume the clustering was derived with Louvain method
#' lig <- readLiger(
#' filename = "path/to/oldLiger.rds",
#' dimredName = "UMAP",
#' clusterName = "louvain"
#' )
#' }
readLiger <- function(
filename,
dimredName,
clusterName = "clusters",
h5FilePath = NULL,
update = TRUE) {
object <- readRDS(filename)
if (isTRUE(update)) {
object <- updateLigerObject(object, dimredName, clusterName, h5FilePath)
}
return(object)
}
# nocov start
#' Import prepared dataset publically available
#' @description
#' These are functions to download example datasets that are subset from public
#' data.
#' \itemize{
#' \item{\bold{PBMC} - Downsampled from GSE96583, Kang et al, Nature
#' Biotechnology, 2018. Contains two scRNAseq datasets.}
#' \item{\bold{BMMC} - Downsampled from GSE139369, Granja et al, Nature
#' Biotechnology, 2019. Contains two scRNAseq datasets and one scATAC data.}
#' \item{\bold{CGE} - Downsampled from GSE97179, Luo et al, Science, 2017.
#' Contains one scRNAseq dataset and one DNA methylation data.}
#' }
#'
#' @rdname importVignetteData
#' @param overwrite Logical, if a file exists at corresponding download
#' location, whether to re-download or directly use this file. Default
#' \code{FALSE}.
#' @param dir Path to download datasets. Default current working directory
#' \code{getwd()}.
#' @param method \code{method} argument directly passed to
#' \code{\link[utils]{download.file}}. Using \code{"libcurl"} while other
#' options might not work depending on platform.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @param ... Additional arguments passed to \code{\link{download.file}}
#' @export
#' @return Constructed \linkS4class{liger} object with QC performed and missing
#' data removed.
#' @examplesIf interactive()
#' \donttest{
#' pbmc <- importPBMC()
#' bmmc <- importBMMC()
#' cge <- importCGE()
#' }
importPBMC <- function(
dir = getwd(),
overwrite = FALSE,
method = "libcurl",
verbose = getOption("ligerVerbose", TRUE),
...
) {
fsep <- ifelse(Sys.info()["sysname"] == "Windows", "\\", "/")
info <- data.frame(
name = c("ctrl", "stim"),
url = c(
"https://figshare.com/ndownloader/files/40054042",
"https://figshare.com/ndownloader/files/40054048"
),
filename = file.path(dir,
c("liger_PBMCs_ctrl.rds", "liger_PBMCs_stim.rds"),
fsep = fsep),
modal = "default"
)
doDownload <- rep(TRUE, nrow(info))
for (i in seq(nrow(info))) {
f <- info$filename[i]
if (file.exists(f) && isFALSE(overwrite)) {
cli::cli_alert_warning(
"Skipping file already exists at: {.file {f}}. "
)
cli::cli_alert_info("Set {.code overwrite = TRUE} to forcing download.")
doDownload[i] <- FALSE
next
}
if (isTRUE(verbose))
cli::cli_alert_info("Downloading from {.url {info$url[i]}} to {.file {f}}")
}
if (sum(doDownload) > 0) {
utils::download.file(info$url[doDownload],
destfile = info$filename[doDownload],
mode = "wb", quiet = !verbose, method = method,
...)
}
rawList <- lapply(info$filename, readRDS)
names(rawList) <- info$name
object <- createLiger(rawList, modal = info$modal, verbose = verbose)
return(object)
}
#' @rdname importVignetteData
#' @export
importBMMC <- function(
dir = getwd(),
overwrite = FALSE,
method = "libcurl",
verbose = getOption("ligerVerbose", TRUE),
...
) {
fsep <- ifelse(Sys.info()["sysname"] == "Windows", "\\", "/")
info <- data.frame(
name = c("rna_D1T1", "rna_D1T2", "atac_D5T1", NA),
url = c(
"https://figshare.com/ndownloader/files/40054858",
"https://figshare.com/ndownloader/files/40054861",
"https://figshare.com/ndownloader/files/40054891",
"https://figshare.com/ndownloader/files/40054864"
),
filename = file.path(dir,
c("liger_BMMC_rna_D1T1.rds",
"liger_BMMC_rna_D1T2.rds",
"liger_BMMC_atac_D5T1.rds",
"liger_BMMC_atac_D5T1_peak.rds"),
fsep = fsep),
modal = c("default", "default", "atac", NA)
)
doDownload <- rep(TRUE, nrow(info))
for (i in seq(nrow(info))) {
f <- info$filename[i]
if (file.exists(f) && isFALSE(overwrite)) {
cli::cli_alert_warning(
"Skipping file already exists at: {.file {f}}. "
)
cli::cli_alert_info("Set {.code overwrite = TRUE} to forcing download.")
doDownload[i] <- FALSE
next
}
if (isTRUE(verbose))
cli::cli_alert_info("Downloading from {.url {info$url[i]}} to {.file {f}}")
}
if (sum(doDownload) > 0) {
utils::download.file(info$url[doDownload],
destfile = info$filename[doDownload],
mode = "wb", quiet = !verbose, method = method,
...)
}
rawList <- lapply(info$filename, readRDS)
names(rawList) <- info$name
# BMMC raw data has prefix added to barcodes, so don't add it
object <- createLiger(rawList[1:3], modal = info$modal[1:3], verbose = verbose,
addPrefix = FALSE)
rawPeak(object, info$name[3]) <- rawList[[4]]
return(object)
}
#' @rdname importVignetteData
#' @export
importCGE <- function(
dir = getwd(),
overwrite = FALSE,
method = "libcurl",
verbose = getOption("ligerVerbose", TRUE),
...
) {
fsep <- ifelse(Sys.info()["sysname"] == "Windows", "\\", "/")
info <- data.frame(
name = c("rna", "met"),
url = c(
"https://figshare.com/ndownloader/files/40222699",
"https://figshare.com/ndownloader/files/40222702"
),
filename = file.path(dir,
c("liger_CGE_rna.rds", "liger_CGE_met.rds"),
fsep = fsep),
modal = c("default", "meth")
)
doDownload <- rep(TRUE, nrow(info))
for (i in seq(nrow(info))) {
f <- info$filename[i]
if (file.exists(f) && isFALSE(overwrite)) {
cli::cli_alert_warning(
"Skipping file already exists at: {.file {f}}. "
)
cli::cli_alert_info("Set {.code overwrite = TRUE} to forcing download.")
doDownload[i] <- FALSE
next
}
if (isTRUE(verbose))
cli::cli_alert_info("Downloading from {.url {info$url[i]}} to {.file {f}}")
}
if (sum(doDownload) > 0) {
utils::download.file(info$url[doDownload],
destfile = info$filename[doDownload],
mode = "wb", quiet = !verbose, method = method,
...)
}
rawList <- lapply(info$filename, readRDS)
names(rawList) <- info$name
object <- createLiger(rawList, modal = info$modal, verbose = verbose)
return(object)
}
#' Load in data from 10X
#' @description
#' Enables easy loading of sparse data matrices provided by 10X genomics.
#'
#' \code{read10X} works generally for 10X cellranger pipelines including:
#' CellRanger < 3.0 & >= 3.0 and CellRanger-ARC.
#'
#' \code{read10XRNA} invokes \code{read10X} and takes the "Gene Expression" out,
#' so that the result can directly be used to construct a \linkS4class{liger}
#' object. See Examples for demonstration.
#'
#' \code{read10XATAC} works for both cellRanger-ARC and cellRanger-ATAC
#' pipelines but needs user arguments for correct recognition. Similarly, the
#' returned value can directly be used for constructing a \linkS4class{liger}
#' object.
#' @param path (A.) A Directory containing the matrix.mtx, genes.tsv (or
#' features.tsv), and barcodes.tsv files provided by 10X. A vector, a named
#' vector, a list or a named list can be given in order to load several data
#' directories. (B.) The 10X root directory where subdirectories of per-sample
#' output folders can be found. Sample names will by default take the name of
#' the vector, list or subfolders.
#' @param sampleNames A vector of names to override the detected or set sample
#' names for what is given to \code{path}. Default \code{NULL}. If no name
#' detected at all and multiple samples are given, will name them by numbers.
#' @param addPrefix Logical, whether to add sample names as a prefix to the
#' barcodes. Default \code{FALSE}.
#' @param useFiltered Logical, if \code{path} is given as case B, whether to use
#' the filtered feature barcode matrix instead of raw (unfiltered). Default
#' \code{TRUE}.
#' @param reference In case of specifying a CellRanger<3 root folder to
#' \code{path}, import the matrix from the output using which reference. Only
#' needed when multiple references present. Default \code{NULL}.
#' @param geneCol Specify which column of genes.tsv or features.tsv to use for
#' gene names. Default \code{2}.
#' @param cellCol Specify which column of barcodes.tsv to use for cell names.
#' Default \code{1}.
#' @param returnList Logical, whether to still return a structured list instead
#' of a single matrix object, in the case where only one sample and only one
#' feature type can be found. Otherwise will always return a list. Default
#' \code{FALSE}.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @param sample.dirs,sample.names,use.filtered These arguments are renamed and
#' will be deprecated in the future. Please see usage for corresponding
#' arguments.
#' @param data.type,merge,num.cells,min.umis These arguments are defuncted
#' because the functionality can/should be fulfilled with other functions.
#' @return
#' \itemize{
#' \item{When only one sample is given or detected, and only one feature type
#' is detected or using CellRanger < 3.0, and \code{returnList = FALSE}, a
#' sparse matrix object (dgCMatrix class) will be returned.}
#' \item{When using \code{read10XRNA} or \code{read10XATAC}, which are modality
#' specific, returns a list named by samples, and each element is the
#' corresponding sparse matrix object (dgCMatrix class).}
#' \item{\code{read10X} generally returns a list named by samples. Each sample
#' element will be another list named by feature types even if only one feature
#' type is detected (or using CellRanger < 3.0) for data structure consistency.
#' The feature type "Gene Expression" always comes as the first type if
#' available.}
#' }
#' @export
#' @rdname read10X
#' @examples
#' \dontrun{
#' # For output from CellRanger < 3.0
#' dir <- 'path/to/data/directory'
#' list.files(dir) # Should show barcodes.tsv, genes.tsv, and matrix.mtx
#' mat <- read10X(dir)
#' class(mat) # Should show dgCMatrix
#'
#' # For root directory from CellRanger < 3.0
#' dir <- 'path/to/root'
#' list.dirs(dir) # Should show sample names
#' matList <- read10X(dir)
#' names(matList) # Should show the sample names
#' class(matList[[1]][["Gene Expression"]]) # Should show dgCMatrix
#'
#' # For output from CellRanger >= 3.0 with multiple data types
#' dir <- 'path/to/data/directory'
#' list.files(dir) # Should show barcodes.tsv.gz, features.tsv.gz, and matrix.mtx.gz
#' matList <- read10X(dir, sampleNames = "tissue1")
#' names(matList) # Shoud show "tissue1"
#' names(matList$tissue1) # Should show feature types, e.g. "Gene Expression" and etc.
#'
#' # For root directory from CellRanger >= 3.0 with multiple data types
#' dir <- 'path/to/root'
#' list.dirs(dir) # Should show sample names, e.g. "rep1", "rep2", "rep3"
#' matList <- read10X(dir)
#' names(matList) # Should show the sample names: "rep1", "rep2", "rep3"
#' names(matList$rep1) # Should show the avalable feature types for rep1
#' }
read10X <- function(
path,
sampleNames = NULL,
addPrefix = FALSE,
useFiltered = NULL,
reference = NULL,
geneCol = 2,
cellCol = 1,
returnList = FALSE,
verbose = getOption("ligerVerbose", TRUE),
# Renamed
sample.dirs = path,
sample.names = sampleNames,
use.filtered = useFiltered,
# Defunct
data.type = NULL,
merge = NULL,
num.cells = NULL,
min.umis = NULL
) {
.deprecateArgs(replace = list(sample.dirs = "path",
sample.names = "sampleNames",
use.filtered = "useFiltered"),
defunct = c("data.type", "merge", "num.cells", "min.umis"))
# Adopted from Seurat V4.4.0 repo, with minor modification
# Whether a folder with samples inside, or a list/vector of mtxDirs
if (length(path) == 1) {
dirSampleNames <- list.dirs(path, full.names = FALSE, recursive = FALSE)
outsPaths <- file.path(path, dirSampleNames, "outs")
if (any(dir.exists(outsPaths))) {
# Case B, make the final `path` to case A
dirSampleNames <- dirSampleNames[dir.exists(outsPaths)]
outsPaths <- outsPaths[dir.exists(outsPaths)]
useFiltered <- useFiltered %||% TRUE
prefix <- ifelse(useFiltered, "filtered", "raw")
subdir <- paste0(prefix, "_feature_bc_matrix")
if (dir.exists(file.path(outsPaths[1], subdir))) {
isV3 <- TRUE
} else {
isV3 <- FALSE
subdir <- paste0(prefix, "_gene_bc_matrix")
}
# Now paths are sample/outs/*_*_bc_matrix/
path <- file.path(outsPaths, subdir)
if (!isV3) {
# Account the inner reference folder for V2
refsExist <- list.dirs(path[1], full.names = FALSE,
recursive = FALSE)
if (is.null(reference)) {
if (length(refsExist) == 1) {
reference <- refsExist
cli::cli_alert_info("Using referece {.val {reference}}")
} else {
cli::cli_abort(
"Multiple references found, please select one from: {.val {refsExist}}"
)
}
} else if (length(reference) == 1) {
if (!reference %in% refsExist) {
cli::cli_abort(
"Specified reference not found, please select one from: {.val {refsExist}}"
)
}
} else {
cli::cli_abort("Multiple reference specified but only one allowed.")
}
path <- file.path(path, reference)
}
names(path) <- dirSampleNames
cli::cli_alert_info(
c("Found the following sample folders with possible sub-folder structure: ",
"{.val {dirSampleNames}}")
)
} # else mtxDirs
} # else mtxDirs
allData <- list()
sampleNames <- .checkArgLen(sampleNames, length(path), repN = FALSE, class = "character")
if (is.null(sampleNames) && !is.null(names(path))) {
sampleNames <- names(path)
} else {
if (any(duplicated(sampleNames))) {
cli::cli_abort("Cannot set duplicated sample names.")
}
}
for (i in seq_along(path)) {
if (isTRUE(verbose)) {
name <- sampleNames[i]
if (is.null(name)) name <- paste0("sample ", i)
cliID <- cli::cli_process_start("Reading from {.val {name}}")
}
if (is.list(path)) run <- path[[i]]
else run <- path[i]
if (!dir.exists(run)) {
cli::cli_abort("Directory provided does not exist: {.file {normalizePath(run, mustWork = FALSE)}}")
}
barcode.loc <- file.path(run, 'barcodes.tsv')
gene.loc <- file.path(run, 'genes.tsv')
features.loc <- file.path(run, 'features.tsv.gz')
matrix.loc <- file.path(run, 'matrix.mtx')
# Flag to indicate if this data is from CellRanger >= 3.0
isOldVer <- file.exists(gene.loc)
if (!isOldVer) {
addgz <- function(s) paste0(s, ".gz")
barcode.loc <- addgz(barcode.loc)
matrix.loc <- addgz(matrix.loc)
}
if (!file.exists(barcode.loc)) {
cli::cli_abort("Barcode file is missing. Expecting {.file {barcode.loc}}")
}
if (!isOldVer && !file.exists(features.loc) ) {
cli::cli_abort("Gene name or features file is missing. Expecting {.file {features.loc}}")
}
if (!file.exists(matrix.loc)) {
cli::cli_abort("Expression matrix file is missing. Expecting {.file {matrix.loc}}")
}
data <- read10XFiles(matrixPath = matrix.loc, barcodesPath = barcode.loc,
featuresPath = ifelse(isOldVer, gene.loc, features.loc),
sampleName = if (isTRUE(addPrefix)) sampleNames[i] else NULL,
geneCol = geneCol, cellCol = cellCol, returnList = TRUE)
if (isOldVer) names(data) <- "Gene Expression"
allData[[i]] <- data
if (isTRUE(verbose)) cli::cli_process_done(id = cliID)
}
if (!is.null(sampleNames)) names(allData) <- sampleNames
if (isTRUE(returnList)) return(allData)
# By default, if only one sample and only one feature type,
# directly return the matrix, otherwise still return the structured list
if (length(allData) == 1 && length(allData[[1]]) == 1) {
return(allData[[1]][[1]])
} else {
return(allData)
}
}
#' @rdname read10X
#' @export
#' @param ... Arguments passed to \code{read10X}
#' @examples
#' \dontrun{
#' # For creating LIGER object from root directory of CellRanger >= 3.0
#' dir <- 'path/to/root'
#' list.dirs(dir) # Should show sample names, e.g. "rep1", "rep2", "rep3"
#' matList <- read10XRNA(dir)
#' names(matList) # Should show the sample names: "rep1", "rep2", "rep3"
#' sapply(matList, class) # Should show matrix class all are "dgCMatrix"
#' lig <- createLigerObject(matList)
#' }
read10XRNA <- function(
path,
sampleNames = NULL,
addPrefix = FALSE,
useFiltered = NULL,
reference = NULL,
returnList = FALSE,
...
) {
dataList <- read10X(path, sampleNames = sampleNames, addPrefix = addPrefix,
useFiltered = useFiltered, reference = reference,
returnList = TRUE, ...)
dataList <- lapply(dataList, `[[`, i = "Gene Expression")
if (length(dataList) == 1 && isFALSE(returnList)) return(dataList[[1]])
else return(dataList)
}
#' @rdname read10X
#' @export
#' @param pipeline Which cellRanger pipeline type to find the ATAC data. Choose
#' \code{"atac"} to read the peak matrix from cellranger-atac pipeline output
#' folder(s), or \code{"arc"} to split the ATAC feature subset out from the
#' multiomic cellranger-arc pipeline output folder(s). Default \code{"atac"}.
#' @param arcFeatureType When \code{pipeline = "arc"}, which feature type is
#' for the ATAC data of interests. Default \code{"Peaks"}. Other possible
#' feature types can be \code{"Chromatin Accessibility"}. Error message will
#' show available options if argument specification cannot be found.
read10XATAC <- function(
path,
sampleNames = NULL,
addPrefix = FALSE,
useFiltered = NULL,
pipeline = c("atac", "arc"),
arcFeatureType = "Peaks",
returnList = FALSE,
geneCol = 2,
cellCol = 1,
verbose = getOption("ligerVerbose", TRUE)
) {
pipeline <- match.arg(pipeline)
if (length(path) == 1) {
dirSampleNames <- list.dirs(path, full.names = FALSE, recursive = FALSE)
outsPaths <- file.path(path, dirSampleNames, "outs")
if (any(dir.exists(outsPaths))) {
# Case B, make the final `path` to case A
dirSampleNames <- dirSampleNames[dir.exists(outsPaths)]
outsPaths <- outsPaths[dir.exists(outsPaths)]
useFiltered <- useFiltered %||% TRUE
prefix <- ifelse(useFiltered, "filtered", "raw")
subdir <- paste0(prefix,
ifelse(pipeline == "atac",
"_peak_bc_matrix", "_feature_bc_matrix"))
# Now paths are sample/outs/*_peak_bc_matrix/
path <- file.path(outsPaths, subdir)
if (!dir.exists(path)) {
cli::cli_abort(
c("Cannot find folder {.file {path}}, not standard {.code cellranger-{pipeline}} output. ",
"i" = "Please try with the other {.code pipeline}.")
)
}
names(path) <- dirSampleNames
cli::cli_alert_info(
"Found the following sample folders with possible sub-folder structure: {.val {dirSampleNames}}"
)
} # else mtxDirs
} # else mtxDirs
allData <- list()
sampleNames <- .checkArgLen(sampleNames, length(path), repN = FALSE, class = "character")
if (is.null(sampleNames) && !is.null(names(path))) {
sampleNames <- names(path)
} else {
if (any(duplicated(sampleNames))) {
cli::cli_abort("Cannot set duplicated sample names.")
}
}
for (i in seq_along(path)) {
if (isTRUE(verbose)) {
name <- sampleNames[i]
if (is.null(name)) name <- paste0("sample ", i)
cliID <- cli::cli_process_start("Reading from {.val {name}}")
}
if (is.list(path)) run <- path[[i]]
else run <- path[i]
if (!dir.exists(run)) {
cli::cli_abort("Directory provided does not exist: {.file {normalizePath(run, mustWork = FALSE)}}")
}
barcode.loc <- switch(pipeline,
arc = "barcodes.tsv.gz",
atac = "barcodes.tsv")
feature.loc <- switch(pipeline,
arc = "features.tsv.gz",
atac = "peaks.bed"
)
matrix.loc <- switch(pipeline,
arc = "matrix.mtx.gz",
atac = "matrix.mtx"
)
if (!file.exists(barcode.loc)) {
cli::cli_abort("Barcode file is missing. Expecting {.file {barcode.loc}}")
}
if (!file.exists(feature.loc) ) {
cli::cli_abort("Peak or feature file is missing. Expecting {.file {feature.loc}}")
}
if (!file.exists(matrix.loc)) {
cli::cli_abort("Expression matrix file is missing. Expecting {.file {matrix.loc}}")
}
data <- read10XFiles(matrixPath = matrix.loc,
barcodesPath = barcode.loc,
featuresPath = feature.loc,
sampleName = if (isTRUE(addPrefix)) sampleNames[i] else NULL,
geneCol = geneCol, cellCol = cellCol,
isATAC = pipeline == "atac",
returnList = TRUE)
if (pipeline == "arc" && !arcFeatureType %in% names(data)) {
cli::cli_abort(
c("No ATAC data retrieved from cellranger-arc pipeline. ",
"Please see if the following available feature types match ",
"with need and select one for `arcFeatureType`: {.val {names(data)}}")
)
}
data <- switch(pipeline,
arc = data[[arcFeatureType]],
atac = data[[1]]
)
allData[[i]] <- data
cli::cli_process_done(id = cliID)
}
if (!is.null(sampleNames)) names(allData) <- sampleNames
if (isTRUE(returnList)) return(allData)
# By default, if only one sample and only one feature type,
# directly return the matrix, otherwise still return the structured list
if (length(allData) == 1 && length(allData[[1]]) == 1) {
return(allData[[1]][[1]])
} else {
return(allData)
}
}
#' Read 10X cellranger files (matrix, barcodes and features) into R session
#' @description
#' This function works for loading a single sample with specifying the paths to
#' the matrix.mtx, barcodes.tsv, and features.tsv files. This function is
#' internally used by \code{\link{read10X}} functions for loading individual
#' samples from cellranger output directory, while it can also be convenient
#' when out-of-standard files are presented (e.g. data downloaded from GEO).
#' @param matrixPath Character string, path to the matrix MTX file. Can be
#' gzipped.
#' @param barcodesPath Character string, path to the barcodes TSV file. Can be
#' gzipped.
#' @param featuresPath Character string, path to the features TSV file. Can be
#' gzipped.
#' @param sampleName Character string attached as a prefix to the cell barcodes
#' loaded from the barcodes file. Default \code{NULL} does not add any prefix.
#' Useful when users plan to merge multiple samples into one matrix and need
#' to avoid duplicated cell barcodes from different batches.
#' @param geneCol An integer indicating which column in the features file to
#' extract as the feature identifiers. Default \code{2}.
#' @param cellCol An integer indicating which column in the barcodes file to
#' extract as the cell identifiers. Default \code{1}.
#' @param isATAC Logical, whether the data is for ATAC-seq. Default
#' \code{FALSE}. If \code{TRUE}, feature identifiers will be generated by
#' combining the first three columns of the features file in the format of
#' "chr:start-end".
#' @param returnList Logical, used internally by wrapper functions. Whether to
#' force putting the loaded matrix in a list even if there's only one matrix.
#' Default \code{FALSE}.
#' @export
#' @return For a single-modal sample, a dgCMatrix object, or a list of one
#' dgCMatrix when \code{returnList = TRUE}. A list of multiple dgCMatrix objects
#' when multiple feature types are detected.
#' @examples
#' \dontrun{
#' matrix <- read10XFiles(
#' matrixPath = "path/to/matrix.mtx.gz",
#' barcodesPath = "path/to/barcodes.tsv.gz",
#' featuresPath = "path/to/features.tsv.gz"
#' )
#' }
read10XFiles <- function(
matrixPath,
barcodesPath,
featuresPath,
sampleName = NULL,
geneCol = 2,
cellCol = 1,
isATAC = FALSE,
returnList = FALSE
) {
# Matrix can be easily read
data <- methods::as(Matrix::readMM(matrixPath), "CsparseMatrix")
# Processing barcodes
cell.barcodes <- utils::read.table(barcodesPath, header = FALSE,
sep = '\t', row.names = NULL)
if (ncol(cell.barcodes) > 1) {
cell.names <- cell.barcodes[, cellCol]
} else {
cell.names <- readLines(barcodesPath)
}
# If sampleNames given, prefix with it; otherwise looks at list/vector
# names; finally number prefix if more than one sample.
# `sampleNames` is preprocessed and checked at top of the function
if (is.null(sampleName)) {
colnames(data) <- cell.names
} else {
colnames(data) <- paste0(sampleName, "_", cell.names)
}
# Processing features
feature.names <- utils::read.delim(
file = featuresPath,
header = FALSE,
stringsAsFactors = FALSE
)
if (isTRUE(isATAC)) {
features <- paste0(feature.names[, 1], ":", feature.names[, 2],
"-", feature.names[, 3])
} else {
if (ncol(feature.names) < geneCol) {
cli::cli_abort(
c("{.var geneCol} was set to {.val {geneCol}} but {.file feature.tsv.gz} (or {.file genes.tsv}) only has {ncol(fetures.names)} columns.",
"i" = "Try setting {.var geneCol} to a value <= {ncol(feature.names)}.")
)
}
if (any(is.na(feature.names[, geneCol]))) {
cli::cli_alert_warning(
"Some feature names are NA. Replacing NA names with ID from the opposite column requested"
)
na.features <- which(is.na(feature.names[, geneCol]))
replacement.column <- ifelse(geneCol == 2, 1, 2)
feature.names[na.features, geneCol] <-
feature.names[na.features, replacement.column]
}
features <- feature.names[, geneCol]
}
rownames(data) <- make.unique(feature.names[, geneCol])
# In cell ranger 3.0, a third column specifying the type of data was added
# and we will return each type of data as a separate matrix
if (ncol(feature.names) > 2) {
data_types <- factor(feature.names$V3)
lvls <- levels(data_types)
if (length(lvls) > 1) {
cli::cli_alert_warning(
"10X data contains more than one type and is being returned as a list containing matrices of each type."
)
}
expr_name <- "Gene Expression"
# Return Gene Expression first
if (expr_name %in% lvls) {
lvls <- c(expr_name, lvls[-which(lvls == expr_name)])
}
data <- lapply(lvls, function(l) {
data[data_types == l, , drop = FALSE]
})
names(data) <- lvls
} else{
if (isTRUE(returnList)) data <- list(data)
}
return(data)
}
# nocov end
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.