#' @include zzz.R
#' @include Connect.R
#' @include TestObject.R
#' @include Transpose.R
#' @include PadMatrix.R
#' @importFrom utils setTxtProgressBar
#' @importFrom hdf5r H5File h5attr H5S
#' @importFrom tools file_path_sans_ext
#'
NULL
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Generics
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#' Convert an on-disk single-cell dataset to another format
#'
#' HDF5-based single-cell datasets can be converted from one format to another
#' using minimal memory. Details about conversion formats implemented are
#' provided below
#'
#' @inheritSection H5ADToH5Seurat AnnData/H5AD to h5Seurat
#' @inheritSection H5SeuratToH5AD h5Seurat to AnnData/H5AD
#'
#' @param source Source dataset
#' @param dest Name of destination dataset
#' @param assay Converting from \code{\link{h5Seurat}}: name of assay to write
#' out; converting to \code{\link{h5Seurat}}: name to store assay data as
#' @param overwrite Overwrite existing \code{dest}
#' @param verbose Show progress updates
#' @param ... Arguments passed to other methods
#'
#' @return If \code{source} is a \code{character}, invisibly returns
#' \code{dest}; otherwise, returns an \code{\link[hdf5r]{H5File}}, or
#' filetype-specific subclass of \code{H5File} (eg. \code{\link{h5Seurat}}),
#' connection to \code{dest}
#'
#' @name Convert
#' @rdname Convert
#'
#' @export
#'
Convert <- function(source, dest, assay, overwrite = FALSE, verbose = TRUE, ...) {
  # 'dest' may name either a destination filename or a bare file type, but in
  # both cases it must be a character value; validate before dispatching
  if (!missing(x = dest)) {
    if (!is.character(x = dest)) {
      stop("'dest' must be a filename or type", call. = FALSE)
    }
  }
  UseMethod(generic = 'Convert', object = source)
}
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Methods
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#' @importFrom hdf5r H5File
#'
#' @rdname Convert
#' @method Convert character
#' @export
#'
Convert.character <- function(
  source,
  dest,
  assay,
  overwrite = FALSE,
  verbose = TRUE,
  ...
) {
  hfile <- Connect(filename = source, force = TRUE)
  # Register cleanup immediately after opening the connection so the HDF5
  # handle is released even if assay resolution or conversion errors out;
  # add = TRUE preserves any exit handlers registered by callers
  on.exit(expr = hfile$close_all(), add = TRUE)
  if (missing(x = assay)) {
    # Fall back to 'RNA' (with a warning) when the source file does not
    # record a default assay
    assay <- tryCatch(
      expr = DefaultAssay(object = hfile),
      error = function(...) {
        warning(
          "'assay' not set, setting to 'RNA'",
          call. = FALSE,
          immediate. = TRUE
        )
        "RNA"
      }
    )
  }
  # Dispatch to the file-type-specific method on the open connection
  dfile <- Convert(
    source = hfile,
    dest = dest,
    assay = assay,
    overwrite = overwrite,
    verbose = verbose,
    ...
  )
  dfile$close_all()
  return(invisible(x = dfile$filename))
}
#' @rdname Convert
#' @method Convert H5File
#' @export
#'
Convert.H5File <- function(
  source,
  dest = 'h5seurat',
  assay = 'RNA',
  overwrite = FALSE,
  verbose = TRUE,
  ...
) {
  # Determine source and destination file types; when 'dest' was given as a
  # bare type (eg. "h5seurat"), derive the destination filename from the
  # source filename
  stype <- FileType(file = source$filename)
  dtype <- FileType(file = dest)
  if (tolower(x = dest) == dtype) {
    dest <- paste(file_path_sans_ext(x = source$filename), dtype, sep = '.')
  }
  # Only AnnData/H5AD sources are supported, and only conversion to h5Seurat
  if (stype != 'h5ad') {
    stop("Unknown file type: ", stype, call. = FALSE)
  }
  if (dtype != 'h5seurat') {
    stop("Unable to convert H5AD files to ", dtype, " files", call. = FALSE)
  }
  dfile <- H5ADToH5Seurat(
    source = source,
    dest = dest,
    assay = assay,
    overwrite = overwrite,
    verbose = verbose
  )
  return(dfile)
}
#' @rdname Convert
#' @method Convert h5Seurat
#' @export
#'
Convert.h5Seurat <- function(
  source,
  dest = 'h5ad',
  assay = DefaultAssay(object = source),
  overwrite = FALSE,
  verbose = TRUE,
  ...
) {
  # When 'dest' was given as a bare type (eg. "h5ad"), derive the destination
  # filename from the source filename
  type <- FileType(file = dest)
  if (tolower(x = dest) == type) {
    dest <- paste(file_path_sans_ext(x = source$filename), type, sep = '.')
  }
  # AnnData/H5AD is currently the only supported destination format
  if (type != 'h5ad') {
    stop("Unable to convert h5Seurat files to ", type, " files", call. = FALSE)
  }
  dfile <- H5SeuratToH5AD(
    source = source,
    dest = dest,
    assay = assay,
    overwrite = overwrite,
    verbose = verbose
  )
  return(dfile)
}
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Implementations
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#' Convert AnnData/H5AD files to h5Seurat files
#'
#' @inheritParams Convert
#'
#' @return Returns a handle to \code{dest} as an \code{\link{h5Seurat}} object
#'
#' @importFrom Seurat Project<- DefaultAssay<-
#'
#' @section AnnData/H5AD to h5Seurat:
#' The AnnData/H5AD to h5Seurat conversion will try to automatically fill in
#' datasets based on data presence. It works in the following manner:
#' \subsection{Expression data}{
#' The expression matrices \code{counts}, \code{data}, and \code{scale.data}
#' are filled by \code{/X} and \code{/raw/X} in the following manner:
#' \itemize{
#' \item \code{counts} will be filled with \code{/raw/X} if present;
#' otherwise, it will be filled with \code{/X}
#' \item \code{data} will be filled with \code{/raw/X} if \code{/raw/X} is
#' present and \code{/X} is dense; otherwise, it will be filled with \code{/X}
#' \item \code{scale.data} will be filled with \code{/X} if it dense;
#' otherwise, it will be empty
#' }
#' Feature names are taken from the feature-level metadata
#' }
#' \subsection{Feature-level metadata}{
#' Feature-level metadata is added to the \code{meta.features} datasets in each
#' assay. Feature names are taken from the dataset specified by the
#' \dQuote{_index} attribute, the \dQuote{_index} dataset, or the \dQuote{index}
#' dataset, in that order. Metadata is populated with \code{/raw/var} if
#' present, otherwise with \code{/var}; if both \code{/raw/var} and \code{/var}
#' are present, then \code{meta.features} will be populated with \code{/raw/var}
#' first, then \code{/var} will be added to it. For columns present in both
#' \code{/raw/var} and \code{/var}, the values in \code{/var} will be used
#' instead. \strong{Note}: it is possible for \code{/var} to have fewer features
#' than \code{/raw/var}; if this is the case, then only the features present in
#' \code{/var} will be overwritten, with the metadata for features \emph{not}
#' present in \code{/var} remaining as they were in \code{/raw/var} or empty
#' }
#' \subsection{Cell-level metadata}{
#' Cell-level metadata is added to \code{meta.data}; the row names of the
#' metadata (as determined by the value of the \dQuote{_index} attribute, the
#' \dQuote{_index} dataset, or the \dQuote{index} dataset, in that order) are
#' added to the \dQuote{cell.names} dataset instead. If the
#' \dQuote{__categories} dataset is present, each dataset within
#' \dQuote{__categories} will be stored as a factor group. Cell-level metadata
#' will be added as an HDF5 group unless factors are \strong{not} present and
#' \code{\link[SeuratDisk]{SeuratDisk.dtypes.dataframe_as_group}} is \code{FALSE}
#' }
#' \subsection{Dimensional reduction information}{
#' Cell embeddings are taken from \code{/obsm}; dimensional reductions are
#' named based on their names from \code{obsm} by removing the preceding
#' \dQuote{X_}. For example, if a dimensional reduction is named \dQuote{X_pca}
#' in \code{/obsm}, the resulting dimensional reduction information will be
#' named \dQuote{pca}. The key will be set to one of the following:
#' \itemize{
#' \item \dQuote{PC_} if \dQuote{pca} is present in the dimensional reduction
#' name (\code{grepl("pca", reduction.name, ignore.case = TRUE)})
#' \item \dQuote{tSNE_} if \dQuote{tsne} is present in the dimensional
#' reduction name (\code{grepl("tsne", reduction.name, ignore.case = TRUE)})
#' \item \code{reduction.name_} for all other reductions
#' }
#' Remember that the preceding \dQuote{X_} will be removed from the reduction
#' name before converting to a key. Feature loadings are taken from
#' \code{/varm} and placed in the associated dimensional reduction. The
#' dimensional reduction is determined from the loadings name in \code{/varm}:
#' \itemize{
#' \item \dQuote{PCs} will be added to a dimensional reduction named
#' \dQuote{pca}
#' \item All other loadings in \code{/varm} will be added to a dimensional
#' reduction named \code{tolower(loading)} (eg. a loading named \dQuote{ICA}
#' will be added to a dimensional reduction named \dQuote{ica})
#' }
#' If a dimensional reduction cannot be found according to the rules above, the
#' loading will not be taken from the AnnData/H5AD file. Miscellaneous
#' information will be taken from \code{/uns/reduction} where \code{reduction}
#' is the name of the reduction in \code{/obsm} without the preceding
#' \dQuote{X_}; if no dimensional reduction information present, then
#' miscellaneous information will not be taken from the AnnData/H5AD file.
#' Standard deviations are taken from a dataset \code{/uns/reduction/variance};
#' the variances will be converted to standard deviations and added to the
#' \code{stdev} dataset of a dimensional reduction
#' }
#' \subsection{Nearest-neighbor graph}{
#' If a nearest neighbor graph is present in \code{/uns/neighbors/distances},
#' it will be added as a graph dataset in the h5Seurat file and associated with
#' \code{assay}; if a value is present in \code{/uns/neighbors/params/method},
#' the name of the graph will be \code{assay_method}, otherwise, it will be
#' \code{assay_anndata}
#' }
#' \subsection{Layers}{
#' TODO: add this
#' }
#' \subsection{Miscellaneous information}{
#' All groups and datasets from \code{/uns} will be copied to \code{misc} in
#' the h5Seurat file except for the following:
#' \itemize{
#' \item Any group or dataset named the same as a dimensional reduction (eg.
#' \code{/uns/pca})
#' \item \code{/uns/neighbors}
#' }
#' }
#'
#' @keywords internal
#'
H5ADToH5Seurat <- function(
  source,
  dest,
  assay = 'RNA',
  overwrite = FALSE,
  verbose = TRUE
) {
  if (file.exists(dest)) {
    if (overwrite) {
      file.remove(dest)
    } else {
      stop("Destination h5Seurat file exists", call. = FALSE)
    }
  }
  dfile <- h5Seurat$new(filename = dest, mode = WriteMode(overwrite = FALSE))
  # Get rownames from an H5AD data frame
  #
  # @param dset Name of data frame
  #
  # @return Returns the name of the dataset that contains the rownames
  #
  GetRownames <- function(dset) {
    if (inherits(x = source[[dset]], what = 'H5Group')) {
      # Check the '_index' attribute, then the '_index' dataset, then the
      # 'index' dataset, in that order
      # rownames <- if (source[[dset]]$attr_exists(attr_name = '_index')) {
      rownames <- if (isTRUE(x = AttrExists(x = source[[dset]], name = '_index'))) {
        h5attr(x = source[[dset]], which = '_index')
      } else if (source[[dset]]$exists(name = '_index')) {
        '_index'
      } else if (source[[dset]]$exists(name = 'index')) {
        'index'
      } else {
        stop("Cannot find rownames in ", dset, call. = FALSE)
      }
    } else {
      # TODO: fix this
      stop("Don't know how to handle datasets", call. = FALSE)
      # rownames(x = source[[dset]])
    }
    return(rownames)
  }
  # Convert AnnData categorical columns ('__categories') in a copied data
  # frame group to h5Seurat factor groups (values + levels); two-level
  # 'False'/'True' categoricals are AnnData-encoded logicals and are stored
  # as plain datasets instead
  ColToFactor <- function(dfgroup) {
    if (dfgroup$exists(name = '__categories')) {
      for (i in names(x = dfgroup[['__categories']])) {
        # Move the integer codes to a temporary name so the factor group can
        # be rebuilt under the original column name
        tname <- basename(path = tempfile(tmpdir = ''))
        dfgroup$obj_copy_to(dst_loc = dfgroup, dst_name = tname, src_name = i)
        dfgroup$link_delete(name = i)
        # Because AnnData stores logicals as factors, but have too many levels
        # for factors
        bool.check <- dfgroup[['__categories']][[i]]$dims == 2
        if (isTRUE(x = bool.check)) {
          bool.check <- all(sort(x = dfgroup[['__categories']][[i]][]) == c('False', 'True'))
        }
        if (isTRUE(x = bool.check)) {
          dfgroup$create_dataset(
            name = i,
            robj = dfgroup[[tname]][] + 1L,
            dtype = dfgroup[[tname]]$get_type()
          )
        } else {
          # AnnData codes are 0-based; h5Seurat factor values are 1-based
          dfgroup$create_group(name = i)
          dfgroup[[i]]$create_dataset(
            name = 'values',
            robj = dfgroup[[tname]][] + 1L,
            dtype = dfgroup[[tname]]$get_type()
          )
          if (IsDType(x = dfgroup[['__categories']][[i]], dtype = 'H5T_STRING')) {
            dfgroup$obj_copy_to(
              dst_loc = dfgroup,
              dst_name = paste0(i, '/levels'),
              src_name = paste0('__categories/', i)
            )
          } else {
            # Non-string levels are coerced to character for h5Seurat
            dfgroup[[i]]$create_dataset(
              name = 'levels',
              robj = as.character(x = dfgroup[[H5Path('__categories', i)]][]),
              dtype = StringType()
            )
          }
        }
        dfgroup$link_delete(name = tname)
        # col.order <- h5attr(x = dfile[['var']], which = 'column-order')
        # col.order <- c(col.order, var.name)
        # dfile[['var']]$attr_rename(
        #   old_attr_name = 'column-order',
        #   new_attr_name = 'old-column-order'
        # )
        # dfile[['var']]$create_attr(
        #   attr_name = 'column-order',
        #   robj = col.order,
        #   dtype = GuessDType(x = col.order)
        # )
        # dfile[['var']]$attr_delete(attr_name = 'old-column-order')
      }
      dfgroup$link_delete(name = '__categories')
    }
    return(invisible(x = NULL))
  }
  # Map h5Seurat expression slots to their H5AD sources: scale.data only when
  # /X is dense; data/counts prefer /raw/X when present (see roxygen above)
  ds.map <- c(
    scale.data = if (inherits(x = source[['X']], what = 'H5D')) {
      'X'
    } else {
      NULL
    },
    data = if (inherits(x = source[['X']], what = 'H5D') && source$exists(name = 'raw')) {
      'raw/X'
    } else {
      'X'
    },
    counts = if (source$exists(name = 'raw')) {
      'raw/X'
    } else {
      'X'
    }
  )
  # Add assay data
  assay.group <- dfile[['assays']]$create_group(name = assay)
  for (i in seq_along(along.with = ds.map)) {
    if (verbose) {
      message("Adding ", ds.map[[i]], " as ", names(x = ds.map)[i])
    }
    dst <- names(x = ds.map)[i]
    assay.group$obj_copy_from(
      src_loc = source,
      src_name = ds.map[[i]],
      dst_name = dst
    )
    # Sparse matrices carry an AnnData 'shape' attribute; rename it to the
    # h5Seurat 'dims' convention (reversed, as AnnData is cell-major)
    # if (assay.group[[dst]]$attr_exists(attr_name = 'shape')) {
    if (isTRUE(x = AttrExists(x = assay.group[[dst]], name = 'shape'))) {
      dims <- rev(x = h5attr(x = assay.group[[dst]], which = 'shape'))
      assay.group[[dst]]$create_attr(
        attr_name = 'dims',
        robj = dims,
        dtype = GuessDType(x = dims)
      )
      assay.group[[dst]]$attr_delete(attr_name = 'shape')
    }
  }
  # Feature names come from /raw/var when present, otherwise /var
  features.source <- ifelse(
    test = source$exists(name = 'raw') && source$exists(name = 'raw/var'),
    yes = 'raw/var',
    no = 'var'
  )
  if (inherits(x = source[[features.source]], what = 'H5Group')) {
    features.dset <- GetRownames(dset = features.source)
    assay.group$obj_copy_from(
      src_loc = source,
      src_name = paste(features.source, features.dset, sep = '/'),
      dst_name = 'features'
    )
  } else {
    tryCatch(
      expr = assay.group$create_dataset(
        name = 'features',
        robj = rownames(x = source[[features.source]]),
        dtype = GuessDType(x = "")
      ),
      error = function(...) {
        stop("Cannot find feature names in this H5AD file", call. = FALSE)
      }
    )
  }
  # 'scale.data' maps to an NA-named NULL entry when /X is sparse, so the
  # is.na check does the real work here
  scaled <- !is.null(x = ds.map['scale.data']) && !is.na(x = ds.map['scale.data'])
  if (scaled) {
    if (inherits(x = source[['var']], what = 'H5Group')) {
      scaled.dset <- GetRownames(dset = 'var')
      assay.group$obj_copy_from(
        src_loc = source,
        src_name = paste0('var/', scaled.dset),
        dst_name = 'scaled.features'
      )
    } else {
      tryCatch(
        expr = assay.group$create_dataset(
          name = 'scaled.features',
          robj = rownames(x = source[['var']]),
          dtype = GuessDType(x = "")
        ),
        error = function(...) {
          stop("Cannot find scaled features in this H5AD file", call. = FALSE)
        }
      )
    }
  }
  assay.group$create_attr(
    attr_name = 'key',
    robj = paste0(tolower(x = assay), '_'),
    dtype = GuessDType(x = assay)
  )
  # Set default assay
  DefaultAssay(object = dfile) <- assay
  # Add feature-level metadata
  if (!getOption(x = "SeuratDisk.dtypes.dataframe_as_group", default = FALSE)) {
    warning(
      "Adding feature-level metadata as a compound is not yet supported",
      call. = FALSE,
      immediate. = TRUE
    )
  }
  # TODO: Support compound metafeatures
  if (Exists(x = source, name = 'raw/var')) {
    if (inherits(x = source[['raw/var']], what = 'H5Group')) {
      if (verbose) {
        message("Adding meta.features from raw/var")
      }
      assay.group$obj_copy_from(
        src_loc = source,
        src_name = 'raw/var',
        dst_name = 'meta.features'
      )
      if (scaled) {
        # Merge /var metadata on top of /raw/var for the scaled features only
        features.use <- assay.group[['features']][] %in% assay.group[['scaled.features']][]
        features.use <- which(x = features.use)
        meta.scaled <- names(x = source[['var']])
        meta.scaled <- meta.scaled[!meta.scaled %in% c('__categories', scaled.dset)]
        for (mf in meta.scaled) {
          if (!mf %in% names(x = assay.group[['meta.features']])) {
            if (verbose) {
              message("Adding ", mf, " from scaled feature-level metadata")
            }
            assay.group[['meta.features']]$create_dataset(
              name = mf,
              dtype = source[['var']][[mf]]$get_type(),
              space = H5S$new(dims = assay.group[['features']]$dims)
            )
          } else if (verbose) {
            message("Merging ", mf, " from scaled feature-level metadata")
          }
          assay.group[['meta.features']][[mf]][features.use] <- source[['var']][[mf]]$read()
        }
      }
    } else {
      warning(
        "Cannot yet add feature-level metadata from compound datasets",
        call. = FALSE,
        immediate. = TRUE
      )
      assay.group$create_group(name = 'meta.features')
    }
  } else {
    if (inherits(x = source[['var']], what = 'H5Group')) {
      if (verbose) {
        message("Adding meta.features from var")
      }
      assay.group$obj_copy_from(
        src_loc = source,
        src_name = 'var',
        dst_name = 'meta.features'
      )
    } else {
      warning(
        "Cannot yet add feature-level metadata from compound datasets",
        call. = FALSE,
        immediate. = TRUE
      )
      assay.group$create_group(name = 'meta.features')
    }
  }
  ColToFactor(dfgroup = assay.group[['meta.features']])
  # if (assay.group[['meta.features']]$attr_exists(attr_name = 'column-order')) {
  if (isTRUE(x = AttrExists(x = assay.group[['meta.features']], name = 'column-order'))) {
    colnames <- h5attr(
      x = assay.group[['meta.features']],
      which = 'column-order'
    )
    assay.group[['meta.features']]$create_attr(
      attr_name = 'colnames',
      robj = colnames,
      dtype = GuessDType(x = colnames)
    )
  }
  if (inherits(x = source[['var']], what = 'H5Group')) {
    assay.group[['meta.features']]$link_delete(name = GetRownames(dset = 'var'))
  }
  # Add cell-level metadata
  if (source$exists(name = 'obs') && inherits(x = source[['obs']], what = 'H5Group')) {
    if (!source[['obs']]$exists(name = '__categories') && !getOption(x = "SeuratDisk.dtypes.dataframe_as_group", default = TRUE)) {
      warning(
        "Conversion from H5AD to h5Seurat allowing compound datasets is not yet implemented",
        call. = FALSE,
        immediate. = TRUE
      )
    }
    dfile$obj_copy_from(
      src_loc = source,
      src_name = 'obs',
      dst_name = 'meta.data'
    )
    ColToFactor(dfgroup = dfile[['meta.data']])
    # if (dfile[['meta.data']]$attr_exists(attr_name = 'column-order')) {
    if (isTRUE(x = AttrExists(x = dfile[['meta.data']], name = 'column-order'))) {
      colnames <- h5attr(x = dfile[['meta.data']], which = 'column-order')
      dfile[['meta.data']]$create_attr(
        attr_name = 'colnames',
        robj = colnames,
        dtype = GuessDType(x = colnames)
      )
    }
    # Row names move from meta.data to the dedicated cell.names dataset
    rownames <- GetRownames(dset = 'obs')
    dfile$obj_copy_from(
      src_loc = dfile,
      src_name = paste0('meta.data/', rownames),
      dst_name = 'cell.names'
    )
    dfile[['meta.data']]$link_delete(name = rownames)
  } else {
    warning(
      "No cell-level metadata present, creating fake cell names",
      call. = FALSE,
      immediate. = TRUE
    )
    # Infer the cell count from the data matrix: indptr length - 1 for
    # sparse (CSR) storage, second dimension for dense storage
    ncells <- if (inherits(x = assay.group[['data']], what = 'H5Group')) {
      assay.group[['data/indptr']]$dims - 1
    } else {
      assay.group[['data']]$dims[2]
    }
    dfile$create_group(name = 'meta.data')
    dfile$create_dataset(
      name = 'cell.names',
      robj = paste0('Cell', seq.default(from = 1, to = ncells)),
      dtype = GuessDType(x = 'Cell1')
    )
  }
  # Add dimensional reduction information
  if (source$exists(name = 'obsm')) {
    # Add cell embeddings
    if (inherits(x = source[['obsm']], what = 'H5Group')) {
      for (reduc in names(x = source[['obsm']])) {
        # Strip the AnnData 'X_' prefix to get the h5Seurat reduction name
        sreduc <- gsub(pattern = '^X_', replacement = '', x = reduc)
        reduc.group <- dfile[['reductions']]$create_group(name = sreduc)
        # Gate progress output on 'verbose' like every other status message
        if (verbose) {
          message("Adding ", reduc, " as cell embeddings for ", sreduc)
        }
        Transpose(
          x = source[['obsm']][[reduc]],
          dest = reduc.group,
          dname = 'cell.embeddings',
          verbose = FALSE
        )
        reduc.group$create_group(name = 'misc')
        reduc.group$create_attr(
          attr_name = 'active.assay',
          robj = assay,
          dtype = GuessDType(x = assay)
        )
        key <- paste0(
          if (grepl(pattern = 'pca', x = sreduc, ignore.case = TRUE)) {
            'PC'
          } else if (grepl(pattern = 'tsne', x = sreduc, ignore.case = TRUE)) {
            'tSNE'
          } else {
            sreduc
          },
          '_'
        )
        reduc.group$create_attr(
          attr_name = 'key',
          robj = key,
          dtype = GuessDType(x = reduc)
        )
        # tSNE/UMAP reductions are marked global (not tied to one assay)
        global <- BoolToInt(x = grepl(
          pattern = 'tsne|umap',
          x = sreduc,
          ignore.case = TRUE
        ))
        reduc.group$create_attr(
          attr_name = 'global',
          robj = global,
          dtype = GuessDType(x = global)
        )
      }
    } else {
      warning(
        "Reading compound dimensional reductions not yet supported, please update your H5AD file",
        call. = FALSE,
        immediate. = TRUE
      )
    }
    # Add feature loadings
    if (source$exists(name = 'varm')) {
      if (inherits(x = source[['varm']], what = 'H5Group')) {
        for (reduc in names(x = source[['varm']])) {
          # 'PCs' maps to 'pca'; everything else maps to its lowercase name
          sreduc <- switch(EXPR = reduc, 'PCs' = 'pca', tolower(x = reduc))
          if (!isTRUE(x = sreduc %in% names(x = dfile[['reductions']]))) {
            warning(
              "Cannot find a reduction named ",
              sreduc,
              " (",
              reduc,
              " in varm)",
              call. = FALSE,
              immediate. = TRUE
            )
            next
          }
          if (isTRUE(x = verbose)) {
            message("Adding ", reduc, " as feature loadings for ", sreduc)
          }
          Transpose(
            x = source[['varm']][[reduc]],
            dest = dfile[['reductions']][[sreduc]],
            dname = 'feature.loadings',
            verbose = FALSE
          )
          # Associate the loadings with whichever feature set matches their
          # row count (all features vs. scaled features)
          reduc.features <- dfile[['reductions']][[sreduc]][['feature.loadings']]$dims[1]
          assay.features <- if (assay.group[['features']]$dims == reduc.features) {
            'features'
          } else if (assay.group$exists(name = 'scaled.features') && assay.group[['scaled.features']]$dims == reduc.features) {
            'scaled.features'
          } else {
            NULL
          }
          if (is.null(x = assay.features)) {
            warning(
              "Cannot find features for feature loadings, will not be able to load",
              call. = FALSE,
              immediate. = TRUE
            )
          } else {
            dfile[['reductions']][[sreduc]]$obj_copy_from(
              src_loc = assay.group,
              src_name = assay.features,
              dst_name = 'features'
            )
          }
        }
      } else {
        warning(
          "Reading compound dimensional reductions not yet supported",
          call. = FALSE,
          immediate. = TRUE
        )
      }
    }
    # Add miscellaneous information
    if (source$exists(name = 'uns')) {
      for (reduc in names(x = source[['uns']])) {
        if (!isTRUE(x = reduc %in% names(x = dfile[['reductions']]))) {
          next
        }
        if (verbose) {
          message("Adding miscellaneous information for ", reduc)
        }
        dfile[['reductions']][[reduc]]$link_delete(name = 'misc')
        dfile[['reductions']][[reduc]]$obj_copy_from(
          src_loc = source[['uns']],
          src_name = reduc,
          dst_name = 'misc'
        )
        if ('variance' %in% names(x = dfile[['reductions']][[reduc]][['misc']])) {
          if (verbose) {
            message("Adding standard deviations for ", reduc)
          }
          # h5Seurat stores standard deviations, AnnData stores variances
          dfile[['reductions']][[reduc]]$create_dataset(
            name = 'stdev',
            robj = sqrt(x = dfile[['reductions']][[reduc]][['misc']][['variance']][]),
            dtype = GuessDType(x = 1.0)
          )
        }
      }
    }
  }
  # Add project and cell identities
  Project(object = dfile) <- 'AnnData'
  idents <- dfile$create_group(name = 'active.ident')
  idents$create_dataset(
    name = 'values',
    dtype = GuessDType(x = 1L),
    space = H5S$new(dims = dfile[['cell.names']]$dims)
  )
  idents$create_dataset(
    name = 'levels',
    robj = 'AnnData',
    dtype = GuessDType(x = 'AnnData')
  )
  # All cells get the single identity level 'AnnData'
  idents[['values']]$write(
    args = list(seq.default(from = 1, to = idents[['values']]$dims)),
    value = 1L
  )
  # Add nearest-neighbor graph
  if (Exists(x = source, name = 'uns/neighbors/distances')) {
    graph.name <- paste(
      assay,
      ifelse(
        test = source$exists(name = 'uns/neighbors/params/method'),
        yes = source[['uns/neighbors/params/method']][1],
        no = 'anndata'
      ),
      sep = '_'
    )
    if (verbose) {
      message("Saving nearest-neighbor graph as ", graph.name)
    }
    dfile[['graphs']]$obj_copy_from(
      src_loc = source,
      src_name = 'uns/neighbors/distances',
      dst_name = graph.name
    )
    # Check the copied graph dataset (not the 'graphs' group) for the sparse
    # 'shape' attribute and convert it to the h5Seurat 'dims' convention
    # if (dfile[['graphs']][[graph.name]]$attr_exists(attr_name = 'shape')) {
    if (isTRUE(x = AttrExists(x = dfile[['graphs']][[graph.name]], name = 'shape'))) {
      dfile[['graphs']][[graph.name]]$create_attr(
        attr_name = 'dims',
        robj = h5attr(x = dfile[['graphs']][[graph.name]], which = 'shape'),
        dtype = GuessDType(x = h5attr(
          x = dfile[['graphs']][[graph.name]],
          which = 'shape'
        ))
      )
      dfile[['graphs']][[graph.name]]$attr_delete(attr_name = 'shape')
    }
    dfile[['graphs']][[graph.name]]$create_attr(
      attr_name = 'assay.used',
      robj = assay,
      dtype = GuessDType(x = assay)
    )
  }
  # Add miscellaneous information
  if (source$exists(name = 'uns')) {
    # Copy everything from /uns except neighbors and per-reduction groups,
    # which were handled above
    misc <- setdiff(
      x = names(x = source[['uns']]),
      y = c('neighbors', names(x = dfile[['reductions']]))
    )
    for (i in misc) {
      if (verbose) {
        message("Adding ", i, " to miscellaneous data")
      }
      dfile[['misc']]$obj_copy_from(
        src_loc = source[['uns']],
        src_name = i,
        dst_name = i
      )
    }
  }
  # Add layers
  if (Exists(x = source, name = 'layers')) {
    # Each layer becomes its own assay; fill 'data' (and 'counts' when no
    # scale.data was present) with the layer matrix
    slots <- c('data')
    if (!isTRUE(x = scaled)) {
      slots <- c(slots, 'counts')
    }
    for (layer in names(x = source[['layers']])) {
      layer.assay <- dfile[['assays']]$create_group(name = layer)
      layer.assay$obj_copy_from(
        src_loc = dfile[['assays']][[assay]],
        src_name = 'features',
        dst_name = 'features'
      )
      layer.assay$create_attr(
        attr_name = 'key',
        robj = UpdateKey(key = layer),
        dtype = GuessDType(x = layer)
      )
      for (slot in slots) {
        if (verbose) {
          message("Adding layer ", layer, " as ", slot, " in assay ", layer)
        }
        layer.assay$obj_copy_from(
          src_loc = source[['layers']],
          src_name = layer,
          dst_name = slot
        )
        # if (layer.assay[[slot]]$attr_exists(attr_name = 'shape')) {
        if (isTRUE(x = AttrExists(x = layer.assay[[slot]], name = 'shape'))) {
          dims <- rev(x = h5attr(x = layer.assay[[slot]], which = 'shape'))
          layer.assay[[slot]]$create_attr(
            attr_name = 'dims',
            robj = dims,
            dtype = GuessDType(x = dims)
          )
          layer.assay[[slot]]$attr_delete(attr_name = 'shape')
        }
      }
    }
  }
  return(dfile)
}
#' Convert h5Seurat files to H5AD files
#'
#' @inheritParams Convert
#'
#' @return Returns a handle to \code{dest} as an \code{\link[hdf5r]{H5File}}
#' object
#'
#' @section h5Seurat to AnnData/H5AD:
#' The h5Seurat to AnnData/H5AD conversion will try to automatically fill in
#' datasets based on data presence. Data presence is determined by the h5Seurat
#' index (\code{source$index()}). It works in the following manner:
#' \subsection{Assay data}{
#' \itemize{
#' \item \code{X} will be filled with \code{scale.data} if \code{scale.data}
#' is present; otherwise, it will be filled with \code{data}
#' \item \code{var} will be filled with \code{meta.features} \strong{only} for
#' the features present in \code{X}; for example, if \code{X} is filled with
#' \code{scale.data}, then \code{var} will contain only features that have
#' been scaled
#' \item \code{raw.X} will be filled with \code{data} if \code{X} is filled
#' with \code{scale.data}; otherwise, it will be filled with \code{counts}. If
#' \code{counts} is not present, then \code{raw} will not be filled
#' \item \code{raw.var} will be filled with \code{meta.features} with the
#' features present in \code{raw.X}; if \code{raw.X} is not filled, then
#' \code{raw.var} will not be filled
#' }
#' }
#' \subsection{Cell-level metadata}{
#' Cell-level metadata is added to \code{obs}
#' }
#' \subsection{Dimensional reduction information}{
#' Only dimensional reductions associated with \code{assay} or marked as
#' \link[Seurat:IsGlobal]{global} will be transfered to the H5AD file. For
#' every reduction \code{reduc}:
#' \itemize{
#' \item cell embeddings are placed in \code{obsm} and renamed to
#' \code{X_reduc}
#' \item feature loadings, if present, are placed in \code{varm} and renamed
#' to either \dQuote{PCs} if \code{reduc} is \dQuote{pca} otherwise
#' \code{reduc} in all caps
#' }
#' For example, if \code{reduc} is \dQuote{ica}, then cell embeddings will be
#' \dQuote{X_ica} in \code{obsm} and feature loadings, if present, will be
#' \dQuote{ICA} in \code{varm}
#' }
#' \subsection{Nearest-neighbor graphs}{
#' If a nearest-neighbor graph is associated with \code{assay}, it will be
#' added to \code{uns/neighbors/distances}; if more than one graph is present,
#' then \strong{only} the last graph according to the index will be added.
#' }
#' \subsection{Layers}{
#' Data from other assays can be added to \code{layers} if they have the same
#' shape as \code{X} (same number of cells and features). To determine this,
#' the shapes of each alternate assay's \code{scale.data} and \code{data} slots
#' are determined. If they are the same shape as \code{X}, then that slot
#' (\code{scale.data} is given priority over \code{data}) will be added as a
#' layer named the name of the assay (eg. \dQuote{SCT}). In addition, the
#' features names will be added to \code{var} as \code{assay_features}
#' (eg. \dQuote{SCT_features}).
#' }
#'
#' @keywords internal
#'
H5SeuratToH5AD <- function(
source,
dest,
assay = DefaultAssay(object = source),
overwrite = FALSE,
verbose = TRUE
) {
if (file.exists(dest)) {
if (overwrite) {
file.remove(dest)
} else {
stop("Destination H5AD file exists", call. = FALSE)
}
}
rownames <- '_index'
dfile <- H5File$new(filename = dest, mode = WriteMode(overwrite = FALSE))
# Transfer data frames from h5Seurat files to H5AD files
#
# @param src Source dataset
# @param dname Name of destination
# @param index Integer values of rows to take
#
# @return Invisibly returns \code{NULL}
#
TransferDF <- function(src, dname, index) {
if (verbose) {
message("Transfering ", basename(path = src$get_obj_name()), " to ", dname)
}
if (inherits(x = src, what = 'H5D')) {
CompoundToGroup(
src = src,
dest = dfile,
dst.name = dname,
order = 'column-order',
index = index
)
} else {
dfile$create_group(name = dname)
for (i in src$names) {
if (IsFactor(x = src[[i]])) {
dfile[[dname]]$create_dataset(
name = i,
robj = src[[i]][['values']][index] - 1L,
dtype = src[[H5Path(i, 'values')]]$get_type()
)
if (!dfile[[dname]]$exists(name = '__categories')) {
dfile[[dname]]$create_group(name = '__categories')
}
dfile[[dname]][['__categories']]$create_dataset(
name = i,
robj = src[[i]][['levels']][],
dtype = src[[H5Path(i, 'levels')]]$get_type()
)
} else {
dfile[[dname]]$create_dataset(
name = i,
robj = src[[i]][index],
dtype = src[[i]]$get_type()
)
}
}
if (src$attr_exists(attr_name = 'colnames')) {
dfile[[dname]]$create_attr(
attr_name = 'column-order',
robj = h5attr(x = src, which = 'colnames'),
dtype = GuessDType(x = h5attr(x = src, which = 'colnames'))
)
}
encoding.info <- c('type' = 'dataframe', 'version' = '0.1.0')
names(x = encoding.info) <- paste0('encoding-', names(x = encoding.info))
for (i in seq_along(along.with = encoding.info)) {
attr.name <- names(x = encoding.info)[i]
attr.value <- encoding.info[i]
if (dfile[[dname]]$attr_exists(attr_name = attr.name)) {
dfile[[dname]]$attr_delete(attr_name = attr.name)
}
dfile[[dname]]$create_attr(
attr_name = attr.name,
robj = attr.value,
dtype = GuessDType(x = attr.value),
space = Scalar()
)
}
}
return(invisible(x = NULL))
}
# Because AnnData can't figure out that sparse matrices are stored as groups
AddEncoding <- function(dname) {
encoding.info <- c('type' = 'csr_matrix', 'version' = '0.1.0')
names(x = encoding.info) <- paste0('encoding-', names(x = encoding.info))
if (inherits(x = dfile[[dname]], what = 'H5Group')) {
for (i in seq_along(along.with = encoding.info)) {
attr.name <- names(x = encoding.info)[i]
attr.value <- encoding.info[i]
if (dfile[[dname]]$attr_exists(attr_name = attr.name)) {
dfile[[dname]]$attr_delete(attr_name = attr.name)
}
dfile[[dname]]$create_attr(
attr_name = attr.name,
robj = attr.value,
dtype = GuessDType(x = attr.value),
space = Scalar()
)
}
# dfile[[dname]]$create_attr(
# attr_name = 'encoding-type',
# robj = 'csr_matrix',
# dtype = StringType(),
# space = Scalar()
# )
# dfile[[dname]]$create_attr(
# attr_name = 'encoding-version',
# robj = '0.1.0',
# dtype = StringType(),
# space = Scalar()
# )
}
return(invisible(x = NULL))
}
# Add assay data
assay.group <- source[['assays']][[assay]]
if (source$index()[[assay]]$slots[['scale.data']]) {
x.data <- 'scale.data'
raw.data <- 'data'
} else {
x.data <- 'data'
raw.data <- if (source$index()[[assay]]$slots[['counts']]) {
'counts'
} else {
NULL
}
}
if (verbose) {
message("Adding ", x.data, " from ", assay, " as X")
}
assay.group$obj_copy_to(dst_loc = dfile, dst_name = 'X', src_name = x.data)
if (dfile[['X']]$attr_exists(attr_name = 'dims')) {
dims <- h5attr(x = dfile[['X']], which = 'dims')
dfile[['X']]$create_attr(
attr_name = 'shape',
robj = rev(x = dims),
dtype = GuessDType(x = dims)
)
dfile[['X']]$attr_delete(attr_name = 'dims')
}
AddEncoding(dname = 'X')
x.features <- switch(
EXPR = x.data,
'scale.data' = which(x = assay.group[['features']][] %in% assay.group[['scaled.features']][]),
seq.default(from = 1, to = assay.group[['features']]$dims)
)
# Add meta.features
if (assay.group$exists(name = 'meta.features')) {
TransferDF(
src = assay.group[['meta.features']],
dname = 'var',
index = x.features
)
} else {
dfile$create_group(name = 'var')
}
# Add feature names
if (Exists(x = dfile[['var']], name = rownames)) {
dfile[['var']]$link_delete(name = rownames)
}
dfile[['var']]$create_dataset(
name = rownames,
robj = assay.group[['features']][x.features],
dtype = GuessDType(x = assay.group[['features']][1])
)
dfile[['var']]$create_attr(
attr_name = rownames,
robj = rownames,
dtype = GuessDType(x = rownames),
space = Scalar()
)
# Because AnnData requries meta.features and can't build an empty data frame
if (!dfile[['var']]$attr_exists(attr_name = 'column-order')) {
var.cols <- setdiff(
x = names(x = dfile[['var']]),
y = c(rownames, '__categories')
)
if (!length(x = var.cols)) {
var.cols <- 'features'
dfile[['var']]$obj_copy_to(
dst_loc = dfile[['var']],
dst_name = var.cols,
src_name = rownames
)
}
dfile[['var']]$create_attr(
attr_name = 'column-order',
robj = var.cols,
dtype = GuessDType(x = var.cols)
)
}
# Add encoding, to ensure compatibility with python's anndata > 0.8.0:
encoding.info <- c('type' = 'dataframe', 'version' = '0.1.0')
names(x = encoding.info) <- paste0('encoding-', names(x = encoding.info))
for (i in seq_along(along.with = encoding.info)) {
attr.name <- names(x = encoding.info)[i]
attr.value <- encoding.info[i]
if (dfile[['var']]$attr_exists(attr_name = attr.name)) {
dfile[['var']]$attr_delete(attr_name = attr.name)
}
dfile[['var']]$create_attr(
attr_name = attr.name,
robj = attr.value,
dtype = GuessDType(x = attr.value),
space = Scalar()
)
}
# Add raw
if (!is.null(x = raw.data)) {
if (verbose) {
message("Adding ", raw.data, " from ", assay, " as raw")
}
dfile$create_group(name = 'raw')
assay.group$obj_copy_to(
dst_loc = dfile[['raw']],
dst_name = 'X',
src_name = raw.data
)
if (dfile[['raw/X']]$attr_exists(attr_name = 'dims')) {
dims <- h5attr(x = dfile[['raw/X']], which = 'dims')
dfile[['raw/X']]$create_attr(
attr_name = 'shape',
robj = rev(x = dims),
dtype = GuessDType(x = dims)
)
dfile[['raw/X']]$attr_delete(attr_name = 'dims')
}
AddEncoding(dname = 'raw/X')
# Add meta.features
if (assay.group$exists(name = 'meta.features')) {
TransferDF(
src = assay.group[['meta.features']],
dname = 'raw/var',
index = seq.default(from = 1, to = assay.group[['features']]$dims)
)
} else {
dfile[['raw']]$create_group(name = 'var')
}
# Add feature names
if (Exists(x = dfile[['raw/var']], name = rownames)) {
dfile[['raw/var']]$link_delete(name = rownames)
}
dfile[['raw/var']]$create_dataset(
name = rownames,
robj = assay.group[['features']][],
dtype = GuessDType(x = assay.group[['features']][1])
)
dfile[['raw/var']]$create_attr(
attr_name = rownames,
robj = rownames,
dtype = GuessDType(x = rownames),
space = Scalar()
)
}
# Add cell-level metadata
TransferDF(
src = source[['meta.data']],
dname = 'obs',
index = seq.default(from = 1, to = length(x = Cells(x = source)))
)
if (Exists(x = dfile[['obs']], name = rownames)) {
dfile[['obs']]$link_delete(name = rownames)
}
dfile[['obs']]$create_dataset(
name = rownames,
robj = Cells(x = source),
dtype = GuessDType(x = Cells(x = source))
)
dfile[['obs']]$create_attr(
attr_name = rownames,
robj = rownames,
dtype = GuessDType(x = rownames),
space = Scalar()
)
# Add dimensional reduction information
obsm <- dfile$create_group(name = 'obsm')
varm <- dfile$create_group(name = 'varm')
reductions <- source$index()[[assay]]$reductions
for (reduc in names(x = reductions)) {
if (verbose) {
message("Adding dimensional reduction information for ", reduc)
}
Transpose(
x = source[[H5Path('reductions', reduc, 'cell.embeddings')]],
dest = obsm,
dname = paste0('X_', reduc),
verbose = FALSE
)
if (reductions[[reduc]]['feature.loadings']) {
if (verbose) {
message("Adding feature loadings for ", reduc)
}
loadings <- source[['reductions']][[reduc]][['feature.loadings']]
reduc.features <- loadings$dims[1]
x.features <- dfile[['var']][[rownames]]$dims
varm.name <- switch(EXPR = reduc, 'pca' = 'PCs', toupper(x = reduc))
# Because apparently AnnData requires nPCs == nrow(X)
if (reduc.features < x.features) {
pad <- paste0('pad_', varm.name)
PadMatrix(
src = loadings,
dest = dfile[['varm']],
dname = pad,
dims = c(x.features, loadings$dims[2]),
index = list(
match(
x = source[['reductions']][[reduc]][['features']][],
table = dfile[['var']][[rownames]][]
),
seq.default(from = 1, to = loadings$dims[2])
)
)
loadings <- dfile[['varm']][[pad]]
}
Transpose(x = loadings, dest = varm, dname = varm.name, verbose = FALSE)
if (reduc.features < x.features) {
dfile$link_delete(name = loadings$get_obj_name())
}
}
}
# Add global dimensional reduction information
global.reduc <- source$index()[['global']][['reductions']]
for (reduc in global.reduc) {
if (reduc %in% names(x = reductions)) {
next
} else if (verbose) {
message("Adding dimensional reduction information for ", reduc, " (global)")
}
Transpose(
x = source[[H5Path('reductions', reduc, 'cell.embeddings')]],
dest = obsm,
dname = paste0('X_', reduc),
verbose = FALSE
)
}
# Create uns
dfile$create_group(name = 'uns')
# Add graph
graph <- source$index()[[assay]]$graphs
graph <- graph[length(x = graph)]
if (!is.null(x = graph)) {
if (verbose) {
message("Adding ", graph, " as neighbors")
}
dgraph <- dfile[['uns']]$create_group(name = 'neighbors')
source[['graphs']]$obj_copy_to(
dst_loc = dgraph,
dst_name = 'distances',
src_name = graph
)
if (source[['graphs']][[graph]]$attr_exists(attr_name = 'dims')) {
dims <- h5attr(x = source[['graphs']][[graph]], which = 'dims')
dgraph[['distances']]$create_attr(
attr_name = 'shape',
robj = rev(x = dims),
dtype = GuessDType(x = dims)
)
}
AddEncoding(dname = 'uns/neighbors/distances')
# Add parameters
dgraph$create_group(name = 'params')
dgraph[['params']]$create_dataset(
name = 'method',
robj = gsub(pattern = paste0('^', assay, '_'), replacement = '', x = graph),
dtype = GuessDType(x = graph)
)
cmdlog <- paste(
paste0('FindNeighbors.', assay),
unique(x = c(names(x = reductions), source$index()$global$reductions)),
sep = '.',
collapse = '|'
)
cmdlog <- grep(
pattern = cmdlog,
x = names(x = source[['commands']]),
value = TRUE
)
if (length(x = cmdlog) > 1) {
timestamps <- sapply(
X = cmdlog,
FUN = function(cmd) {
ts <- if (source[['commands']][[cmd]]$attr_exists(attr_name = 'time.stamp')) {
h5attr(x = source[['commands']][[cmd]], which = 'time.stamp')
} else {
NULL
}
return(ts)
},
simplify = TRUE,
USE.NAMES = FALSE
)
timestamps <- Filter(f = Negate(f = is.null), x = timestamps)
cmdlog <- cmdlog[order(timestamps, decreasing = TRUE)][1]
}
if (length(x = cmdlog) && !is.na(x = cmdlog)) {
cmdlog <- source[['commands']][[cmdlog]]
if ('k.param' %in% names(x = cmdlog)) {
dgraph[['params']]$obj_copy_from(
src_loc = cmdlog,
src_name = 'k.param',
dst_name = 'n_neighbors'
)
}
}
}
# Add layers
other.assays <- setdiff(
x = names(x = source$index()),
y = c(assay, 'global', 'no.assay')
)
if (length(x = other.assays)) {
x.dims <- Dims(x = dfile[['X']])
layers <- dfile$create_group(name = 'layers')
for (other in other.assays) {
layer.slot <- NULL
for (slot in c('scale.data', 'data')) {
slot.dims <- if (source$index()[[other]]$slots[[slot]]) {
Dims(x = source[['assays']][[other]][[slot]])
} else {
NA_integer_
}
if (isTRUE(all.equal(slot.dims, x.dims))) {
layer.slot <- slot
break
}
}
if (!is.null(x = layer.slot)) {
if (verbose) {
message("Adding ", layer.slot, " from ", other, " as a layer")
}
layers$obj_copy_from(
src_loc = source[['assays']][[other]],
src_name = layer.slot,
dst_name = other
)
if (layers[[other]]$attr_exists(attr_name = 'dims')) {
dims <- h5attr(x = layers[[other]], which = 'dims')
layers[[other]]$create_attr(
attr_name = 'shape',
robj = rev(x = dims),
dtype = GuessDType(x = dims)
)
layers[[other]]$attr_delete(attr_name = 'dims')
}
AddEncoding(dname = paste('layers', other, sep = '/'))
layer.features <- switch(
EXPR = layer.slot,
'scale.data' = 'scaled.features',
'features'
)
var.name <- paste0(other, '_features')
dfile[['var']]$obj_copy_from(
src_loc = source[['assays']][[other]],
src_name = layer.features,
dst_name = var.name
)
col.order <- h5attr(x = dfile[['var']], which = 'column-order')
col.order <- c(col.order, var.name)
dfile[['var']]$attr_rename(
old_attr_name = 'column-order',
new_attr_name = 'old-column-order'
)
dfile[['var']]$create_attr(
attr_name = 'column-order',
robj = col.order,
dtype = GuessDType(x = col.order)
)
dfile[['var']]$attr_delete(attr_name = 'old-column-order')
}
}
if (!length(x = names(x = layers))) {
dfile$link_delete(name = 'layers')
}
}
# Because AnnData can't handle an empty /uns
if (!length(x = names(x = dfile[['uns']]))) {
dfile$link_delete(name = 'uns')
}
dfile$flush()
return(dfile)
}
# Read obs meta data from h5ad file and return a data.frame
#' Read cell-level metadata (\code{obs}) from an H5AD file
#'
#' @param file Path to an H5AD file on disk
#'
#' @return A \code{data.frame} with one row per cell (row names taken from
#' \code{obs/_index}) and one column per \code{obs} field; categorical fields
#' are returned as factors when one label exists per distinct stored code
#'
#' @export
#'
readH5AD_obs <- function(file) {
  suppressWarnings(expr = hfile <- SeuratDisk::Connect(filename = file, force = TRUE))
  # Release the HDF5 handle on every exit path, including errors
  # (the original only closed the file on the success path)
  on.exit(expr = hfile$close_all(), add = TRUE)
  hfile_obs <- hfile[['obs']]
  obs_groups <- setdiff(x = names(x = hfile_obs), y = c('__categories', '_index'))
  # Preallocate the result; renamed from 'matrix' to avoid shadowing base::matrix
  obs.df <- as.data.frame(x = matrix(
    data = NA,
    nrow = hfile_obs[['_index']]$dims[1],
    ncol = length(x = obs_groups)
  ))
  colnames(x = obs.df) <- obs_groups
  rownames(x = obs.df) <- hfile_obs[['_index']][]
  if ('__categories' %in% names(x = hfile_obs)) {
    # Older AnnData layout: integer codes stored at obs/<name>, with the
    # shared label arrays under obs/__categories/<name>
    hfile_cate <- hfile_obs[['__categories']]
    for (i in seq_along(along.with = obs_groups)) {
      obs.i <- obs_groups[i]
      obs_value_i <- hfile_obs[[obs.i]][]
      if (obs.i %in% names(x = hfile_cate)) {
        obs_value_i <- factor(x = obs_value_i, labels = hfile_cate[[obs.i]][])
      }
      obs.df[, i] <- obs_value_i
    }
  } else {
    # Newer AnnData layout: each categorical is a group holding
    # 'categories' (labels) and 'codes' (integer indices)
    for (i in seq_along(along.with = obs_groups)) {
      obs.i <- obs_groups[i]
      # identical() avoids the recycling warning that `==` emits when the
      # group has a number of children other than two
      if (identical(x = sort(x = names(x = hfile_obs[[obs.i]])), y = c('categories', 'codes'))) {
        codes <- hfile_obs[[obs.i]][['codes']][]
        labels <- hfile_obs[[obs.i]][['categories']][]
        # factor(labels =) requires exactly one label per distinct code;
        # otherwise fall back to returning the raw integer codes
        if (length(x = unique(x = codes)) == length(x = labels)) {
          obs_value_i <- factor(x = codes, labels = labels)
        } else {
          obs_value_i <- codes
        }
      } else {
        obs_value_i <- hfile_obs[[obs.i]][]
      }
      obs.df[, i] <- obs_value_i
    }
  }
  return(obs.df)
}
# Read obsm from h5ad file and return a list of embeddings
#' Read dimensional reduction embeddings (\code{obsm}) from an H5AD file
#'
#' @param file Path to an H5AD file on disk
#'
#' @return A named list of embedding matrices (cells x dimensions); list and
#' column names have the AnnData \dQuote{X_} prefix stripped. Returns an empty
#' list (with a message) when the file has no \code{obsm} entries
#'
#' @export
#'
readH5AD_obsm <- function(file) {
  hfile <- SeuratDisk::Connect(filename = file, force = TRUE)
  # Close the HDF5 handle on every exit path; the original leaked the open
  # handle when obsm was empty (early return skipped close_all)
  on.exit(expr = hfile$close_all(), add = TRUE)
  hfile_obsm <- hfile[['obsm']]
  obsm_set <- names(x = hfile_obsm)
  if (!length(x = obsm_set)) {
    message("No obsm found in this object")
    return(list())
  }
  cells.name <- hfile[['obs']][['_index']][]
  obsm.list <- lapply(X = obsm_set, FUN = function(x) {
    # Embeddings are stored transposed (dims x cells) relative to R's layout
    emb <- t(x = hfile_obsm[[x]][, ])
    rownames(x = emb) <- cells.name
    key.name <- gsub(pattern = 'X_', replacement = '', x = x)
    # seq_len is safe when the embedding has zero columns (1:0 would not be)
    colnames(x = emb) <- paste0(key.name, "_", seq_len(length.out = ncol(x = emb)))
    return(emb)
  })
  names(x = obsm.list) <- gsub(pattern = 'X_', replacement = '', x = obsm_set)
  return(obsm.list)
}
# NOTE(review): the two lines below are website-embed boilerplate accidentally
# pasted from a code-hosting page; commented out so the file parses as R.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.