#' Load data from a 10X Genomics experiment
#'
#' Creates a \linkS4class{SingleCellExperiment} from the CellRanger output directories for 10X Genomics data.
#'
#' @param samples A character vector containing one or more directory names, each corresponding to a 10X sample.
#' Each directory should contain a matrix file, a gene/feature annotation file, and a barcode annotation file.
#'
#' Alternatively, each string may contain a path to a HDF5 file in the sparse matrix format generated by 10X.
#' These can be mixed with directory names when \code{type="auto"}.
#'
#' Alternatively, each string may contain a prefix of names for the three-file system described above,
#' where the rest of the name of each file follows the standard 10X output.
#' @param sample.names A character vector of length equal to \code{samples}, containing the sample names to store in the column metadata of the output object.
#' If \code{NULL}, the file paths in \code{samples} are used directly.
#' @param col.names A logical scalar indicating whether the columns of the output object should be named with the cell barcodes.
#' @param row.names String specifying whether to use Ensembl IDs ("ID") or gene symbols ("Symbol") as row names.
#' For symbols, the Ensembl ID will be appended to disambiguate rows where the same symbol corresponds to multiple Ensembl IDs.
#' @param type String specifying the type of 10X format to read data from.
#' @param version String specifying the version of the 10X format to read data from.
#' @param delayed Logical scalar indicating whether sparse matrices should be wrapped in \linkS4class{DelayedArray}s before combining.
#' Only relevant for multiple \code{samples}.
#' @param genome String specifying the genome if \code{type="HDF5"} and \code{version='2'}.
#' @param compressed Logical scalar indicating whether the text files are compressed for \code{type="mtx"} or \code{"prefix"}.
#' @param intersect.genes Logical scalar indicating whether to take the intersection of common genes across all samples.
#' If \code{FALSE}, differences in gene information across samples will cause an error to be raised.
#' @param mtx.two.pass Logical scalar indicating whether to use a two-pass approach for loading data from a Matrix Market file.
#' This reduces peak memory usage at the cost of some additional runtime.
#' Only relevant when \code{type="mtx"} or \code{type="prefix"}.
#' @param mtx.class String specifying the class of the output matrix when \code{type="mtx"} or \code{type="prefix"}.
#' @param mtx.threads Integer scalar specifying the number of threads to use for reading Matrix Market files.
#' Only relevant when \code{type="mtx"} or \code{type="prefix"}.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying how loading should be parallelized for multiple \code{samples}.
#'
#' @return A \linkS4class{SingleCellExperiment} object containing count data for each gene (row) and cell (column) across all \code{samples}.
#' \itemize{
#' \item Row metadata will contain the fields \code{"ID"} and \code{"Symbol"}.
#' The former is the gene identifier (usually Ensembl), while the latter is the gene name.
#' If \code{version="3"}, it will also contain the \code{"Type"} field specifying the type of feature (e.g., gene or antibody).
#' \item Column metadata will contain the fields \code{"Sample"} and \code{"Barcode"}.
#' The former contains the name of the sample (or if not supplied, the path in \code{samples}) from which each column was obtained.
#' The latter contains to the cell barcode sequence and GEM group for each cell library.
#' \item Rows are named with the gene identifier.
#' Columns are named with the cell barcode in certain settings, see Details.
#' \item The assays will contain a single \code{"counts"} matrix, containing UMI counts for each gene in each cell.
#' Note that the matrix representation will depend on the format of the \code{samples}, see Details.
#' \item The metadata contains a \code{"Samples"} field, containing the input \code{samples} character vector.
#' }
#'
#' @details
#' This function has a long and storied past.
#' It was originally developed as the \code{read10xResults} function in \pkg{scater}, inspired by the \code{Read10X} function from the \pkg{Seurat} package.
#' It was then migrated to this package in an effort to consolidate some 10X-related functionality across various packages.
#'
#' If \code{type="auto"}, the format of the input file is automatically detected for each \code{samples} based on whether it ends with \code{".h5"}.
#' If so, \code{type} is set to \code{"HDF5"}; otherwise it is set to \code{"sparse"}.
#' \itemize{
#' \item If \code{type="mtx"} (or its older alias \code{"sparse"}), count data are assumed to be stored in a directory.
#' This should contain a (possibly Gzipped) MatrixMarket text file (\code{"matrix.mtx"})
#' with additional tab-delimited files for barcodes (\code{"barcodes.tsv"})
#' and gene annotation (\code{"features.tsv"} for version 3 or \code{"genes.tsv"} for version 2).
#' \item If \code{type="prefix"}, count data are assumed to follow same three-file structure for each sample as described for \code{type="mtx"}.
#' However, each sample is defined by a prefix in the file names rather than by being stored a separate directory.
#' For example, if the \code{samples} entry is \code{"xyx_"}, the files are expected to be \code{"xyz_matrix.mtx"}, \code{"xyz_barcodes.tsv"}, etc.
#' \item If \code{type="hdf5"} (or its older alias \code{"HDF5"}), count data are assumed to follow the 10X sparse HDF5 format for large data sets.
#' It is loaded as a \linkS4class{TENxMatrix} object, which is a stub object that refers back to the file in \code{samples}.
#' Users may need to set \code{genome} if it cannot be automatically determined when \code{version="2"}.
#' }
#'
#' When \code{type="mtx"} or \code{"prefix"} and \code{compressed=NULL},
#' the function will automatically search for both the unzipped and Gzipped versions of the files.
#' This assumes that the compressed files have an additional \code{".gz"} suffix.
#' We can restrict to only compressed or uncompressed files by setting \code{compressed=TRUE} or \code{FALSE}, respectively.
#'
#' CellRanger 3.0 introduced a major change in the format of the output files for both \code{type}s.
#' If \code{version="auto"}, the version of the format is automatically detected from the supplied paths.
#' For \code{type="mtx"}, this is based on whether there is a \code{"features.tsv.gz"} file in the directory.
#' For \code{type="HDF5"}, this is based on whether there is a top-level \code{"matrix"} group with a \code{"matrix/features"} subgroup in the file.
#'
#' Matrices are combined by column if multiple \code{samples} were specified.
#' This will throw an error if the gene information is not consistent across \code{samples}.
#' For \code{type="mtx"} or \code{"prefix"}, users can set \code{delayed=TRUE} to save memory during the combining process.
#' This also avoids integer overflow for very large datasets.
#'
#' If \code{col.names=TRUE} and \code{length(sample)==1}, each column is named by the cell barcode.
#' For multiple samples, the index of each sample in \code{samples} is concatenated to the cell barcode to form the column name.
#' This avoids problems with multiple instances of the same cell barcodes in different samples.
#'
#' Note that user-level manipulation of sparse matrices requires loading of the \pkg{Matrix} package.
#' Otherwise, calculation of \code{rowSums}, \code{colSums}, etc. will result in errors.
#'
#' @author
#' Davis McCarthy, with modifications from Aaron Lun
#'
#' @seealso
#' \code{\link{splitAltExps}}, to split alternative feature sets (e.g., antibody tags) into their own Experiments.
#'
#' \code{\link{write10xCounts}}, to create 10X-formatted file(s) from a count matrix.
#'
#' @examples
#' # Mocking up some 10X genomics output.
#' example(write10xCounts, echo=FALSE)
#'
#' # Reading it in.
#' sce10x <- read10xCounts(tmpdir)
#'
#' # Column names are dropped with multiple 'samples'.
#' sce10x2 <- read10xCounts(c(tmpdir, tmpdir))
#'
#' @references
#' Zheng GX, Terry JM, Belgrader P, and others (2017).
#' Massively parallel digital transcriptional profiling of single cells.
#' \emph{Nat Commun} 8:14049.
#'
#' 10X Genomics (2017).
#' Gene-Barcode Matrices.
#' \url{https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/2.2/output/matrices}
#'
#' 10X Genomics (2018).
#' Feature-Barcode Matrices.
#' \url{https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/matrices}
#'
#' 10X Genomics (2018).
#' HDF5 Gene-Barcode Matrix Format.
#' \url{https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/2.2/advanced/h5_matrices}
#'
#' 10X Genomics (2018).
#' HDF5 Feature Barcode Matrix Format.
#' \url{https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/h5_matrices}
#'
#' @export
#' @importFrom S4Vectors DataFrame ROWNAMES<- ROWNAMES extractROWS
#' @importFrom SingleCellExperiment SingleCellExperiment
#' @importFrom BiocParallel SerialParam bplapply
#' @importFrom GenomicRanges GRanges
#' @importFrom DelayedArray DelayedArray
read10xCounts <- function(samples,
sample.names=names(samples),
col.names=FALSE,
row.names = c("id", "symbol"),
type=c("auto", "mtx", "hdf5", "prefix", "sparse", "HDF5"),
delayed=FALSE,
version=c("auto", "2", "3"),
genome=NULL,
compressed=NULL,
intersect.genes=FALSE,
mtx.two.pass=FALSE,
mtx.class=c("CsparseMatrix", "SVT_SparseMatrix"),
mtx.threads=1,
BPPARAM=SerialParam())
{
type <- match.arg(type)
version <- match.arg(version)
row.names <- match.arg(row.names)
if (is.null(sample.names)) {
sample.names <- samples
}
load.out <- bplapply(samples,
FUN=.tenx_loader,
type=type,
version=version,
genome=genome,
compressed=compressed,
mtx.two.pass=mtx.two.pass,
mtx.class=match.arg(mtx.class),
mtx.threads=mtx.threads,
BPPARAM=BPPARAM
)
nsets <- length(samples)
full_data <- vector("list", nsets)
gene_info_list <- vector("list", nsets)
cell_info_list <- vector("list", nsets)
for (i in seq_len(nsets)) {
current <- load.out[[i]]
full_data[[i]] <- current$mat
rr <- current$gene.info
rns <- if (row.names == "id") rr$ID else rr$Symbol
if (row.names == "symbol" && anyDuplicated(rns)) {
dup.name <- rns %in% rns[duplicated(rns)]
rns[dup.name] <- paste0(rns[dup.name], "_", rr$ID[dup.name])
}
ROWNAMES(rr) <- rns
gene_info_list[[i]] <- rr
cell.names <- current$cell.names
cell_info_list[[i]] <- DataFrame(
Sample = rep(sample.names[i], length(cell.names)),
Barcode = cell.names, row.names=NULL)
}
# Checking gene correctness and handling differences, or failing.
gene_info <- gene_info_list[[1]]
if (!intersect.genes) {
if (nsets > 1 && length(unique(gene_info_list)) != 1L) {
stop("gene information differs between runs")
}
} else {
all.genes <- lapply(gene_info_list, ROWNAMES)
if (length(unique(all.genes)) > 1) {
common.genes <- Reduce(intersect, all.genes)
for (i in seq_along(full_data)) {
m <- match(common.genes, all.genes[[i]])
full_data[[i]] <- full_data[[i]][m,,drop=FALSE]
}
gene_info <- extractROWS(gene_info, common.genes)
}
}
# Forming the full data matrix.
if (length(full_data) > 1) {
if (delayed) {
full_data <- lapply(full_data, DelayedArray)
}
full_data <- do.call(cbind, full_data)
} else {
full_data <- full_data[[1]]
}
# Adding the cell data.
cell_info <- do.call(rbind, cell_info_list)
if (col.names) {
if (nsets == 1L) {
cnames <- cell_info$Barcode
} else {
sid <- rep(seq_along(cell_info_list), vapply(cell_info_list, nrow, 1L))
cnames <- paste0(sid, "_", cell_info$Barcode)
}
colnames(full_data) <- cnames
}
SingleCellExperiment(list(counts = full_data), rowData = gene_info, colData = cell_info, metadata=list(Samples=samples))
}
.tenx_loader <- function(run, type, version, genome, compressed, mtx.two.pass, mtx.class, mtx.threads) {
cur.type <- .type_chooser(run, type)
if (cur.type=="mtx") {
.read_from_sparse(run, version=version, is.prefix=FALSE, compressed=compressed, mtx.two.pass=mtx.two.pass, mtx.class=mtx.class, mtx.threads=mtx.threads)
} else if (cur.type=="prefix") {
.read_from_sparse(run, version=version, is.prefix=TRUE, compressed=compressed, mtx.two.pass=mtx.two.pass, mtx.class=mtx.class, mtx.threads=mtx.threads)
} else {
.read_from_hdf5(run, genome=genome, version=version)
}
}
#' @importFrom methods as new
#' @importClassesFrom Matrix dgCMatrix
#' @importFrom utils read.delim head
#' @importFrom IRanges IRanges
#' @importFrom S4Vectors mcols<-
#' @importClassesFrom SparseArray SVT_SparseMatrix
.read_from_sparse <- function(path, version, is.prefix, compressed, mtx.two.pass, mtx.class, mtx.threads) {
FUN <- if (is.prefix) paste0 else file.path
if (version=="auto") {
target <- FUN(path, "features.tsv")
target <- .check_for_compressed(target, compressed, error=FALSE)
version <- if (file.exists(target)) "3" else "2"
}
bname <- "barcodes.tsv"
mname <- "matrix.mtx"
if (version=="3") {
gname <- "features.tsv"
} else {
gname <- "genes.tsv"
}
barcode.loc <- FUN(path, bname)
gene.loc <- FUN(path, gname)
matrix.loc <- FUN(path, mname)
barcode.loc <- .check_for_compressed(barcode.loc, compressed)
gene.loc <- .check_for_compressed(gene.loc, compressed)
matrix.loc <- .check_for_compressed(matrix.loc, compressed)
gene.info <- read.delim(gene.loc, header=FALSE, stringsAsFactors=FALSE, quote="", comment.char="")
possible.names <- c("ID", "Symbol", "Type", "Chromosome", "Start", "End")
colnames(gene.info) <- head(possible.names, ncol(gene.info))
if (ncol(gene.info) > 3) {
# Default ARC-seq reference seems to give empty names for mitochondrial genes.
# Hey, I don't make the rules.
keep <- gene.info$Chromosome == "" & grepl("^MT-", gene.info$Symbol)
if (any(keep)) {
gene.info[keep,"Chromosome"] <- "chrM"
}
# Newer cellranger versions like to add coordinates, so
# let's throw it into the GRanges for fun.
gr <- GRanges(gene.info$Chromosome, IRanges(gene.info$Start, gene.info$End))
mcols(gr) <- DataFrame(gene.info[,1:3])
gene.info <- gr
}
raw_mat <- read_mm(matrix.loc, two_pass=mtx.two.pass, class_name=mtx.class, threads=mtx.threads)
if (mtx.class == "CsparseMatrix") {
# Don't use sparseMatrix as this seems to do an unnecessary roundtrip through the triplet form.
mat <- new("dgCMatrix", Dim=raw_mat$dim, i=raw_mat$contents$i, x=raw_mat$contents$x, p=raw_mat$contents$p)
} else {
mat <- new("SVT_SparseMatrix", SVT=raw_mat$contents$list, dim=raw_mat$dim, type=raw_mat$contents$type)
}
list(
mat=mat,
cell.names=readLines(barcode.loc),
gene.info=gene.info
)
}
.check_for_compressed <- function(path, compressed, error=TRUE) {
original <- path
if (isTRUE(compressed)) {
path <- paste0(path, ".gz")
} else if (is.null(compressed) && !file.exists(path)) {
path <- paste0(path, ".gz")
if (error && !file.exists(path)) {
# Explicit error here to avoid users getting confused.
stop(sprintf("cannot find '%s' or its gzip-compressed form", original))
}
}
path
}
#' @importFrom rhdf5 h5ls h5read
#' @importFrom HDF5Array TENxMatrix
#' @importFrom utils head
.read_from_hdf5 <- function(path, genome=NULL, version) {
path <- path.expand(path) # eliminate tilde's.
available <- h5ls(path, recursive=FALSE)
available <- available[available$otype=="H5I_GROUP",]
if (version=="auto") {
version <- if ("matrix" %in% available$name) "3" else "2"
}
if (version=="2") {
group <- genome
if (is.null(group)) {
if (nrow(available) > 1L) {
to.see <- head(available$name, 3)
if (length(to.see)==3L) {
to.see[3] <- "..."
}
stop("more than one available group (", paste(to.see, collapse=", "), ")")
} else if (nrow(available) == 0L) {
stop("no available groups")
}
group <- available$name
}
gene.info <- data.frame(
ID=as.character(h5read(path, paste0(group, "/genes"))),
Symbol=as.character(h5read(path, paste0(group, "/gene_names"))),
stringsAsFactors=FALSE
)
} else {
group <- "matrix"
gene.info <- data.frame(
ID=as.character(h5read(path, paste0(group, "/features/id"))),
Symbol=as.character(h5read(path, paste0(group, "/features/name"))),
Type=as.character(h5read(path, paste0(group, "/features/feature_type"))),
stringsAsFactors=FALSE
)
}
mat <- TENxMatrix(path, group)
dimnames(mat) <- NULL # for consistency.
list(
mat=mat,
cell.names=as.character(h5read(path, paste0(group, "/barcodes"))),
gene.info=gene.info
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.