#' @include internal.R
#' @import hdf5r
#' @import Matrix
#' @importFrom R6 R6Class
NULL
#' A class for connections loom files
#'
#'
#' @docType class
#' @name loom
#' @rdname loom
#' @usage lfile <- loomR::connect(filename = 'myfile.loom')
#' @aliases loom-class
#' @format An \code{\link{R6::R6Class}} object
#' @seealso \code{\link{loomR}}, \code{\link{hdf5r::H5File}}
#'
#' @field version Version of loomR object was created under
#' @field shape Shape of \code{/matrix} in genes (columns) by cells (rows)
#' @field chunksize Chunks set for this dataset in columns (cells) by rows (genes)
#' @field matrix The main data matrix, stored as columns (cells) by rows (genes)
#' @field layers Additional data matricies, the same shape as \code{/matrix}
#' @field col.attrs Extra information about cells
#' @field row.attrs Extra information about genes
#'
#' @section Methods:
#' \describe{
#' \item{\code{add.graph(a, b, w, name, MARGIN, overwrite)}, \code{add.graph.matrix(mat, name, MARGIN, overwrite)}}{
#' Add a graph to the loom object; can add either in coorindate format (\code{add.graph}) or matrix format (\code{add.graph.matrix}).
#' Stores graph in coordinate format as \code{[row, col]_graphs/name/a} (row indices),
#' \code{[row, col]_graphs/name/b} (column indices), and \code{[row, col]_graphs/name/w} (values)
#' \describe{
#' \item{\code{a}}{Integer vector of row indices for graph, must be the same lengths as \code{b} and \code{w}}
#' \item{\code{b}}{Integer vector of column indices for graph, must be the same lengths as \code{a} and \code{w}}
#' \item{\code{w}}{Numeric vector of values for graph, must be the same lengths as \code{a} and \code{b}}
#' \item{\code{mat}}{Graph provided as a matrix (sparse or dense) or data.frame}
#' \item{\code{name}}{Name to store graph, will end up being \code{col_graphs/name} or \code{row_graphs/name}, depending on \code{MARGIN}}
#' \item{\code{MARGIN}}{Store the graph in \code{row_graphs} (1) or \code{col_graphs} (2), defaults to 2}
#' \item{\code{overwrite}}{Can overwrite existing graph?}
#' }
#' }
#' \item{\code{add.layer(layer, chunk.size, overwrite)}}{
#' Add a data layer to this loom file, must be the same dimensions as \code{/matrix}
#' \describe{
#' \item{\code{layer}}{A named list of matrices to be added as layers}
#' \item{\code{chunk.size}}{Number of rows from each layer to stream at once, defaults to 1000}
#' \item{\code{overwrite}}{If a layer already exists, overwrite with new data, defaults to FALSE}
#' }
#' }
#' \item{\code{add.attribute(attribute, MARGIN, overwrite)}}{
#' Add extra information to this loom file.
#' \describe{
#' \item{\code{attribute}}{A named list where the first dimmension of each element as long as one dimension of \code{/matrix}}
#' \item{\code{MARGIN}}{Either 1 for genes or 2 for cells}
#' \item{\code{overwrite}}{Can overwrite existing attributes?}
#' }
#' }
#' \item{\code{add.row.attribute(attribute), add.col.attribute(attribute)}}{
#' Add row or column attributes
#' }
#' \item{\code{get.attribute.df(MARGIN, attribute.names, row.names, col.names)}}{
#' Get a group of row or column attributes as a data frame, will only return attributes that have one dimension
#' \describe{
#' \item{\code{MARGIN}}{Either '1' or '2' to get row- or column-attributes, respectively}
#' \item{\code{attribute.names}}{A vector of attribute dataset basenames}
#' \item{\code{row.names}}{Basename of the rownames dataset}
#' \item{\code{col.names}}{Basename of the colnames dataset}
#' }
#' }
#' \item{\code{get.graph(name, MARGIN)}}{
#' Get a graph as a sparse matrix
#' \describe{
#' \item{\code{name}}{Name of the graph, can be either the basename or full name of the grpah}
#' \item{\code{MARGIN}}{
#' Load the graph from \code{row_graphs} (1) or \code{col_graphs} (2), defaults to 2.
#' Ignored if full path to graph is passed to \code{name}
#' }
#' }
#' }
#' \item{\code{batch.scan(chunk.size, MARGIN, index.use, dataset.use, force.reset)}, \code{batch.next(return.data)}}{
#' Scan a dataset in the loom file from \code{index.use[1]} to \code{index.use[2]}, iterating by \code{chunk.size}.
#' \describe{
#' \item{\code{chunk.size}}{Size to chunk \code{MARGIN} by, defaults to \code{self$chunksize}}
#' \item{\code{MARGIN}}{Iterate over genes (1) or cells (2), defaults to 2}
#' \item{\code{index.use}}{Which specific values of \code{dataset.use} to use, defaults to \code{1:self$shape[MARGIN]} (all values)}
#' \item{\code{dataset.use}}{Name of dataset to use, can be the name, not \code{group/name}, unless the name is present in multiple groups}
#' \item{\code{force.reset}}{Force a reset of the internal iterator}
#' \item{\code{return.data}}{Return data for a given chunk, if \code{FALSE}, returns the indices across \code{MARGIN} for said chunk}
#' }
#' }
#' \item{\code{apply(name, FUN, MARGIN, chunk.size, dataset.use, overwrite, display.progress, ...)}}{
#' Apply a function over a dataset within the loom file, stores the results in the loom file. Will not make multidimensional attributes.
#' \describe{
#' \item{\code{name}}{Full name ('group/name') of dataset to store results to}
#' \item{\code{FUN}}{Function to apply}
#' \item{\code{MARGIN}}{Iterate over genes (1) or cells (2), defaults to 2}
#' \item{\code{index.use}}{Which specific values of \code{dataset.use} to use, defaults to \code{1:self$shape[MARGIN]} (all values)}
#' \item{\code{chunk.size}}{Size to chunk \code{MARGIN} by, defaults to \code{self$chunksize}}
#' \item{\code{dataset.use}}{Name of dataset to use}
#' \item{\code{overwrite}}{Overite \code{name} if already exists}
#' \item{\code{display.progress}}{Display progress}
#' \item{\code{...}}{Extra parameters to pass to \code{FUN}}
#' }
#' }
#' \item{\code{map(FUN, MARGIN, chunk.size, index.use, dataset.use, display.progress, expected, ...)}}{
#' Map a function onto a dataset within the loom file, returns the result into R.
#' \describe{
#' \item{\code{FUN}}{}
#' \item{\code{MARGIN}}{Iterate over genes (1) or cells (2), defaults to 2}
#' \item{\code{chunk.size}}{Size to chunk \code{MARGIN} by, defaults to \code{self$chunksize}}
#' \item{\code{index.use}}{Which specific values of \code{dataset.use} to use, defaults to \code{1:self$shape[MARGIN]} (all values)}
#' \item{\code{dataset.use}}{Name of dataset to use}
#' \item{\code{display.progress}}{Display progress}
#' \item{\code{...}}{Extra parameters to pass to \code{FUN}}
#' }
#' }
#' \item{\code{add.cells(matrix.data, attributes.data = NULL, layers.data = NULL, display.progress = TRUE)}}{
#' Add cells to a loom file.
#' \describe{
#' \item{\code{matrix.data}}{A list of m2 cells where each entry is a vector of length n (num genes, \code{self$shape[1]})}
#' \item{\code{attributes.data}}{A list where each entry is named for one of the datasets in \code{self[['col_attrs']]}; each entry is a vector of length m2.}
#' \item{\code{layers.data}}{A list where each entry is named for one of the datasets in \code{self[['layers']]}; each entry is an n-by-m2 matrix where n is the number of genes in this loom file and m2 is the number of cells being added.}
#' \item{\code{display.progress}}{Display progress}
#' }
#' }
#' \item{\code{add.loom(other, other.key, self.key, ...)}}{
#' Add the contents of another loom file to this one.
#' \describe{
#' \item{\code{other}}{An object of class \code{loom} or a filename of another loom file}
#' \item{\code{other.key}}{Row attribute in \code{other} to add by}
#' \item{\code{self.key}}{Row attribute in this loom file to add by}
#' \item{\code{...}}{Ignored for now}
#' }
#' }
#' }
#'
#' @importFrom iterators nextElem
#' @importFrom itertools hasNext ihasNext ichunk
#' @importFrom utils packageVersion txtProgressBar setTxtProgressBar
#'
#' @export
#'
loom <- R6Class(
# The loom class
# Based on the H5File class from hdf5r
# Not clonable (no loom$clone method), doesn't make sense since all data is on disk, not in memory
# Yes to portability, other packages can subclass the loom class
# Class is locked, other fields and methods cannot be added
classname = 'loom',
inherit = hdf5r::H5File,
cloneable = FALSE,
portable = TRUE,
lock_class = TRUE,
# Public fields and methods
# See above for documentation
public = list(
# Fields
version = NULL,
shape = NULL,
chunksize = NULL,
matrix = NULL,
layers = NULL,
col.attrs = NULL,
row.attrs = NULL,
# Methods
initialize = function(
filename = NULL,
mode = c('a', 'r', 'r+', 'w', 'w-'),
skip.validate = FALSE,
...
) {
# If the file exists, run validation steps
do.validate <- file.exists(filename) && !(mode %in% c('w', 'w+'))
private$skipped.validation <- skip.validate
super$initialize(filename = filename, mode = mode, ...)
if (do.validate) {
# Run the validation steps
if (skip.validate) {
warning("Skipping validation step, some fields are not populated")
} else {
validateLoom(object = self)
}
# Store /matrix and the shape of /matrix
if (skip.validate) {
if (getOption(x = 'verbose')) {
warning("Not setting matrix field")
}
} else {
self$matrix <- self[['matrix']]
}
self$shape <- rev(self[['matrix']]$dims)
# Store the chunk size
chunks <- tryCatch(
expr = h5attr(x = self, which = 'chunks'),
error = function(e) {
hchunks <- self[['matrix']]$chunk_dims
pchunks <- paste0('(', paste(hchunks, collapse = ', '), ')')
if (mode != 'r') {
h5attr(x = self, which = 'chunks') <- pchunks
}
return(pchunks)
}
)
chunks <- gsub(pattern = '(', replacement = '', x = chunks, fixed = TRUE)
chunks <- gsub(pattern = ')', replacement = '', x = chunks, fixed = TRUE)
chunks <- unlist(x = strsplit(x = chunks, split = ','))
self$chunksize <- as.integer(x = chunks)
# Store version information
self$version <- as.character(x = tryCatch(
# Try getting a version
# If it doesn't exist, can we write to the file?
# If so, store the version as this version of loomR
# Otherwise, store the version as NA_character_
expr = h5attr(x = self, which = 'version'),
error = function(e) {
if (mode != 'r') {
version <- packageVersion(pkg = 'loomR')
h5attr(x = self, which = 'version') <- as.character(x = version)
} else {
version <- NA_character_
}
return(version)
}
))
# Load layers
private$load_layers()
# Load attributes
private$load_attributes(MARGIN = 1) # Genes (row_attrs)
private$load_attributes(MARGIN = 2) # Cells (col_attrs
} else {
# Assume new HDF5 file
self$version <- as.character(x = packageVersion(pkg = 'loomR'))
}
},
finalizer = function() {
self$close_all(close_self = TRUE)
},
load.fields = function() {
private$load_layers()
private$load_attributes(MARGIN = 1)
private$load_attributes(MARGIN = 2)
},
update.shape = function() {
self$shape <- rev(x = self[['matrix']]$dims)
},
# Addding attributes, layers, and graphs
add.graph = function(a, b, w, name, MARGIN = 2, overwrite = FALSE) {
if (self$mode == 'r') {
stop(private$err_mode)
}
# Coerce datasets to proper types
a <- as.integer(x = a)
b <- as.integer(x = b)
w <- as.double(x = w)
# Check lengths of each vector
graph.lengths <- vapply(
X = list(a, b, w),
FUN = length,
FUN.VALUE = integer(length = 1L)
)
graph.lengths <- unique(x = graph.lengths)
if (length(x = graph.lengths) > 1) {
stop("'a', 'b', and 'w' must all be the same length")
}
# Check the name, automatically assign a group if provided in `name`
if (dirname(path = name) != '.') {
name.group <- dirname(path = name)
MARGIN <- if (grepl(pattern = 'col', x = name.group)) {
2
} else if (grepl(pattern = 'row', x = name.group)) {
1
} else {
cate("Unknown group:", name.group)
cate("Using", MARGIN, "as default")
MARGIN
}
}
# Shorten `name` to the just the basename
name <- basename(path = name)
# Assign group
group <- switch(
EXPR = MARGIN,
'1' = 'row',
'2' = 'col',
stop("'MARGIN' must be 1 or 2")
)
group <- paste0(group, '_graphs')
# Check for existance of previous graph
if (name %in% names(x = self[[group]])) {
if (overwrite) {
self[[group]]$link_delete(name = name)
} else {
stop(paste("A graph with the name", name, "exists already!"))
}
}
# Create our datasets
self[[group]]$create_group(name = name)
self[[group]][[name]][['a']] <- a
self[[group]][[name]][['b']] <- b
self[[group]][[name]][['w']] <- w
invisible(x = self)
},
add.graph.matrix = function(mat, name, MARGIN = 2, overwrite = FALSE) {
if (!inherits(x = mat, what = 'dgCMatrix')) {
mat <- Matrix(
data = mat,
nrow = nrow(x = mat),
ncol = ncol(x = mat),
sparse = TRUE
)
}
a <- mat@i
b <- PointerToIndex(p = mat@p)
w <- mat@x
self$add.graph(
a = a,
b = b,
w = w,
name = name,
MARGIN = MARGIN,
overwrite = overwrite
)
invisible(x = self)
},
add.layer = function(layers, chunk.size = 1000, overwrite = FALSE) {
if (self$mode == 'r') {
stop(private$err_mode)
}
# Value checking
if (!is.list(x = layers) || is.null(x = names(x = layers))) {
stop("'layers' must be a named list")
}
if (is.null(x = self$shape)) {
stop(private$err_init)
}
# Add layers
for (i in 1:length(x = layers)) {
# if (!is.matrix(x = layers[[i]])) {
if (!inherits(x = layers[[i]], what = c('Matrix', 'matrix'))) {
layers[[i]] <- as.matrix(x = layers[[i]])
}
do.transpose <- FALSE
shape.use <- rev(x = self$shape)
if (any(dim(x = layers[[i]]) != shape.use)) {
if (all(rev(x = dim(x = layers[[i]])) == shape.use)) {
do.transpose <- TRUE
} else {
stop(paste(
"All layers must have",
shape.use[1],
"rows for cells and",
shape.use[2],
"columns for genes"
))
}
}
if (do.transpose) {
layers[[i]] <- t(x = layers[[i]])
}
layer.name <- names(x = layers)[i]
if (layer.name %in% list.datasets(object = self[['layers']])) {
if (overwrite) {
self[['layers']]$link_delete(name = layer.name)
} else {
stop(paste(
"A layer with the name",
layer.name,
"already!"
))
}
}
dtype <- getDtype(x = layers[[i]][1, 1])
self[['layers']]$create_dataset(
name = layer.name,
dtype = dtype,
dims = dim(x = layers[[i]])
)
chunk.points <- chunkPoints(
data.size = dim(x = layers[[i]])[1],
chunk.size = chunk.size
)
# if (display.progress) {
# pb <- txtProgressBar(char = '=', style = 3)
# }
for (col in 1:ncol(x = chunk.points)) {
row.start <- chunk.points[1, col]
row.end <- chunk.points[2, col]
self[['layers']][[layer.name]][row.start:row.end, ] <- as.matrix(
x = layers[[i]][row.start:row.end, ]
)
# if (display.progress) {
# setTxtProgressBar(pb = pb, value = col / ncol(x = chunk.points))
# }
}
# self[['layers']]$create_dataset(
# name = names(x = layers)[i],
# robj = layers[[i]],
# chunk_dims = self$chunksize
# )
}
self$flush()
gc(verbose = FALSE)
private$load_layers()
invisible(x = self)
},
add.attribute = function(attribute, MARGIN, overwrite = FALSE) {
if (self$mode == 'r') {
stop(private$err_mode)
}
# Value checking
if (is.data.frame(x = attribute)) {
attribute <- as.list(x = attribute)
}
is.actual.list <- is.list(x = attribute)
if (!is.actual.list || is.null(x = names(x = attribute))) {
stop("Attributes must be provided as a named list")
}
# if (is.data.frame(x = attribute)) {
# attribute <- as.list(x = attribute)
# }
# if (!is.list(x = attribute) || is.null(x = names(x = attribute))) {
# stop("'attribute' must be a named list")
# }
if (!MARGIN %in% c(1, 2)) {
stop("'MARGIN' must be 1 or 2")
}
length.use <- rev(x = self[['matrix']]$dims)[MARGIN]
dim.msg <- paste(
"At least one dimmension for each",
switch(EXPR = MARGIN, '1' = 'gene', '2' = 'cell'),
"attribute must be",
length.use
)
for (i in 1:length(x = attribute)) {
if (is.matrix(x = attribute[[i]]) || is.data.frame(x = attribute[[i]])) {
margin.use <- which(x = dim(x = attribute[[i]]) == length.use)
if (!length(x = margin.use)) {
stop(dim.msg)
}
margin.use <- margin.use[1]
attribute[[i]] <- switch(
EXPR = margin.use,
'1' = t(x = as.matrix(x = attribute[[i]])),
'2' = as.matrix(x = attribute[[i]]),
stop("All attributes must be one- or two-dimmensional")
)
} else {
if (length(x = attribute[[i]]) != length.use) {
stop(dim.msg)
}
attribute[[i]] <- as.vector(x = attribute[[i]])
}
}
if (length(x = which(x = names(x = attribute) != '')) != length(x = attribute)) {
stop("Not all attributes had names provided")
}
grp.name <- c('row_attrs', 'col_attrs')[MARGIN]
grp <- self[[grp.name]]
if (!overwrite) {
names.fail <- which(x = names(x = attribute) %in% names(x = grp))
if (length(x = names.fail) != 0) {
stop(paste0(
"The following names are already used for ",
switch(EXPR = MARGIN, '1' = 'row', '2' = 'column'),
" attributes: '",
paste(names(x = attribute)[names.fail], collapse = ''),
"'"
))
}
}
# Add the attributes as datasets for our MARGIN's group
for (i in 1:length(x = attribute)) {
try(expr = grp$link_delete(name = names(x = attribute)[i]), silent = TRUE)
grp[[names(x = attribute)[i]]] <- attribute[[i]]
}
self$flush()
gc(verbose = FALSE)
# Load the attributes for this margin
private$load_attributes(MARGIN = MARGIN)
invisible(x = self)
},
add.row.attribute = function(attribute, overwrite = FALSE) {
self$add.attribute(attribute = attribute, MARGIN = 1, overwrite = overwrite)
invisible(x = self)
},
add.col.attribute = function(attribute, overwrite = FALSE) {
self$add.attribute(attribute = attribute, MARGIN = 2, overwrite = overwrite)
invisible(x = self)
},
add.meta.data = function(meta.data, overwrite = FALSE) {
self$add.col.attribute(attribute = meta.data, overwrite = overwrite)
invisible(x = self)
},
# Get attribute information
get.attribute.df = function(
MARGIN = 2,
attribute.names = NULL,
row.names = "gene_names",
col.names = "cell_names"
) {
# takes multiple row_attrs or col_attrs and combines them into a data.frame,
# removing rows or columns that are entirely NAs.
#
# attribute.layer either "row" for row_attrs or "col" col_attrs
# attribute.names name of rows or columns to combine into matrix
# row.names either a character vector or name of an element in row_attrs
# col.names either a character vector or name of an element in col_attrs
# if (!attribute.layer %in% c("row", "col")) {
# stop("Invalid attribute.layer. Please select either 'row' or 'col'.")
# }
attribute.layer <- switch(
EXPR = MARGIN,
'1' = 'row',
'2' = 'col',
stop("'MARGIN' must be either '1' for row attributes or '2' for column attributes")
)
attribute.layer <- paste0(attribute.layer, "_attrs")
# by default return all attributes
if (is.null(attribute.names)) {
attribute.names <- self[[attribute.layer]]$names
}
# check that attribute names are present
if (!all(attribute.names %in% self[[attribute.layer]]$names)) {
invalid.names <- attribute.names[which(!attribute.names %in% self[[attribute.layer]]$names)]
stop(paste0("Invalid attribute.names: ", paste0(invalid.names, collapse = ", ")))
}
attr.paths <- paste0(attribute.layer, "/", attribute.names)
# keep only the one-dimensional attributes
dims <- sapply(attr.paths, function(x) length(self[[x]]$dims))
data.lst <- lapply(attr.paths[dims == 1], function(x) data.frame(self[[x]][], stringsAsFactors = FALSE))
combined.df <- Reduce(cbind, data.lst)
colnames(combined.df) <- attribute.names[dims == 1]
if (attribute.layer == "col_attrs") {
rownames(combined.df) <- self[[paste0(attribute.layer, "/", col.names)]][]
} else {
rownames(combined.df) <- self[[paste0(attribute.layer, "/", row.names)]][]
}
# check if any row is all NAs
rows.to.remove <- unname(obj = which(x = apply(
X = combined.df,
MARGIN = 1,
FUN = function(x) {
return(all(is.na(x = x)))
}
)))
if (length(x = rows.to.remove) > 1) {
combined.df <- combined.df[-rows.to.remove, ]
}
return(combined.df)
},
get.graph = function(name, MARGIN = 2) {
# Check the name, automatically assign a group if provided in `name`
if (dirname(path = name) != '.') {
name.group <- dirname(path = name)
MARGIN <- if (grepl(pattern = 'col', x = name.group)) {
2
} else if (grepl(pattern = 'row', x = name.group)) {
1
} else {
cate("Unknown group:", name.group)
cate("Using", MARGIN, "as default")
MARGIN
}
}
# Shorten `name` to the just the basename
name <- basename(path = name)
# Assign group
group <- switch(
EXPR = MARGIN,
'1' = 'row',
'2' = 'col',
stop("'MARGIN' must be 1 or 2")
)
group <- paste0(group, '_graphs')
# Validate graph
if (!name %in% list.groups(object = self[[group]], full.names = FALSE, recursive = FALSE)) {
stop(paste("Cannot find a graph named", name, "in", group))
}
graph.datasets <- list.datasets(object = self[[group]][[name]], full.names = FALSE)
if (length(x = graph.datasets) != 3) {
stop(paste0(
"Malformed graph: wrong number of components for graph (",
length(x = graph.datasets),
")"
))
}
if (!all(graph.datasets %in% c('a', 'b', 'w'))) {
stop("Malformed graph: unknown dataset names")
}
graph <- sparseMatrix(
i = self[[group]][[name]][['a']][] + 1,
j = self[[group]][[name]][['b']][],
x = self[[group]][[name]][['w']][]
)
return(graph)
},
# Chunking functions
batch.scan = function(
chunk.size = NULL,
MARGIN = 2,
index.use = NULL,
dataset.use = 'matrix',
force.reset = FALSE
) {
if (is.null(x = private$it) || !grepl(pattern = dataset.use, x = private$iter.dataset) || force.reset) {
# Check the existence of the dataset
private$iter.dataset <- grep(
pattern = dataset.use,
x = list.datasets(object = self),
value = TRUE
)
if (length(x = private$iter.dataset) != 1) {
private$reset_batch()
stop(paste0("Cannot find dataset '", dataset.use, "' in the loom file"))
}
if (grepl(pattern = 'row_attrs', x = private$iter.dataset)) {
MARGIN <- 1
}
else if (grepl(pattern = 'col_attrs', x = private$iter.dataset)) {
MARGIN <- 2
}
# Check the margin
if (!(MARGIN %in% c(1, 2))) {
private$reset_batch()
stop("MARGIN must be 1 (genes) or 2 (cells)")
} else {
private$iter.margin <- MARGIN
}
if (is.null(x = chunk.size)) {
chunk.size <- rev(x = self$chunksize)[private$iter.margin]
}
private$iter.chunksize <- chunk.size
# Set the indices to use
index.use <- private$iter_range(index.use = index.use)
# Setup our iterator
private$it <- ihasNext(iterable = ichunk(
iterable = index.use[1]:index.use[2],
chunkSize = chunk.size
))
private$iter.index <- index.use
}
# Return the times we iterate, this isn't important, we only need the length of this vector
return(1:ceiling((private$iter.index[2] - private$iter.index[1]) / private$iter.chunksize))
},
batch.next = function(return.data = TRUE) {
# Ensure that we have a proper iterator
if (!'hasNext.ihasNext' %in% suppressWarnings(expr = methods(class = class(x = private$it)))) {
stop("Please setup the iterator with self$batch.scan")
}
# Do the iterating
if (hasNext(obj = private$it)) {
# Get the indices for this chunk
chunk.indices <- unlist(x = nextElem(obj = private$it))
if (return.data) {
# If we're working with a matrix dataset, ensure chunking on proper axis
if (private$iter.dataset == 'matrix' || grepl(pattern = 'layers', x = private$iter.dataset)) {
to.return <- switch(
EXPR = private$iter.margin,
'1' = self[[private$iter.dataset]][, chunk.indices], # Genes
'2' = self[[private$iter.dataset]][chunk.indices, ] # Cells
)
} else {
# Otherwise, iterating over an attribute (1 dimensional)
to.return <- self[[private$iter.dataset]][chunk.indices]
}
} else {
to.return <- chunk.indices
}
# Determine if we reset the iterator
if (!hasNext(obj = private$it)) {
private$reset_batch()
}
return(to.return)
} else {
# Just in case
private$reset_batch()
return(NULL)
}
},
apply = function(
name,
FUN,
MARGIN = 2,
index.use = NULL,
chunk.size = NULL,
dataset.use = 'matrix',
overwrite = FALSE,
display.progress = TRUE,
...
) {
if (self$mode == 'r') {
stop(private$err_mode)
}
if (!inherits(x = FUN, what = 'function')) {
stop("FUN must be a function")
}
# Check that we're storing our results properly
results.basename <- basename(path = name) # Basename of our results
results.dirname <- gsub(pattern = '/', replacement = '', x = dirname(path = name)) # Groupname of our results
dirnames <- c('row_attrs', 'col_attrs', 'layers') # Allowed group names
if (name %in% list.datasets(object = self)) {
if (overwrite) {
self$link_delete(name = name)
} else {
stop(paste("A dataset with the name", name, "already exists!"))
}
}
# Checks datset, index, and MARGIN
# Sets full dataset path in private$iter.dataset
# Sets proper MARGIN in private$iter.margin
batch <- self$batch.scan(
chunk.size = chunk.size,
MARGIN = MARGIN,
dataset.use = dataset.use,
force.reset = TRUE
)
MARGIN <- private$iter.margin
dataset.use <- private$iter.dataset
# Ensure that our group name is allowed
name.check <- which(x = dirnames == results.dirname)
if (!any(name.check)) {
private$reset_batch()
stop(paste(
"The dataset must go into one of:",
paste(dirnames, collapse = ', ')
))
}
# Check that our group matches our iteration
# ie. To store in col_attrs, MARGIN must be 1
if (name.check %in% c(1, 2) && name.check != private$iter.margin) {
private$reset_batch()
stop(paste(
"Iteration must be over",
c('genes', 'cells')[name.check],
paste0("(MARGIN = ", name.check, ")"),
"to store results in",
paste0("'", dirnames[name.check], "'")
))
}
# Check how we store our results
dataset.matrix <- ('matrix' %in% private$iter.dataset || grepl(pattern = 'layers', x = private$iter.dataset))
results.matrix <- name.check == 3
# Ensure index.use is integers within the bounds of [1, self$shape[MARGIN]]
if (is.null(x = index.use)) {
index.use <- 1:self$shape[MARGIN]
} else {
# Filter index.use to values between 1 and self$shape[MARGIN]
index.use <- sort(x = index.use)
index.use <- as.integer(x = index.use)
index.use <- index.use[index.use >= 1 & index.use <= self$shape[MARGIN]]
index.use <- as.vector(x = na.omit(object = index.use))
# If we still have values, figure out NAs, otherwise set index.use to NULL
if (length(x = index.use) == 0) {
warning("No values passed to 'index.use' fall within the data, using all values")
index.use <- 1:self$shape[MARGIN]
}
}
# Trial to get class of new dataset
# Do a trial run to figure out the class of dataset
na.use <- NA
# if (display.progress) {
# catn("Running trial to determine class of dataset")
# }
trial.use <- if (is.null(x = index.use)) {
sample(x = 1:self$shape[MARGIN], size = 3, replace = FALSE)
} else {
sample(x = index.use, size = 3, replace = FALSE)
}
trial.use <- sort(x = trial.use)
trial <- if (grepl(pattern = 'layers', x = dataset.use) || dataset.use == 'matrix') {
switch(
EXPR = MARGIN,
'1' = self[[dataset.use]][, trial.use],
'2' = self[[dataset.use]][trial.use, ]
)
} else {
self[[dataset.use]][trial.use]
}
trial <- FUN(trial, ...)
if (is.list(x = trial)) {
trial <- unlist(x = trial)
}
trial <- as.vector(x = trial)
class(x = na.use) <- class(x = trial)
# Get a connection to the group we're iterating over
group <- self[[results.dirname]]
# Make results dataset
dtype.use <- getDtype(x = na.use)
dims.use <- switch(
EXPR = results.dirname,
'layers' = self[['matrix']]$dims,
'row_attrs' = self[['matrix']]$dims[2],
'col_attrs' = self[['matrix']]$dims[1]
)
group$create_dataset(
name = results.basename,
dtype = dtype.use,
dims = dims.use
)
# Start the iteration
if (display.progress) {
catn("Writing results to", name)
pb <- txtProgressBar(char = '=', style = 3)
}
# first <- TRUE
for (i in 1:length(x = batch)) {
# Get the indices we're iterating over
chunk.indices <- self$batch.next(return.data = FALSE)
indices.use <- chunk.indices[chunk.indices %in% index.use]
indices.use <- indices.use - chunk.indices[1] + 1
if (length(x = indices.use) < 1) {
if (display.progress) {
setTxtProgressBar(pb = pb, value = i / length(x = batch))
}
next
}
# Get the data and apply FUN
chunk.data <- if (dataset.matrix) {
switch(
EXPR = MARGIN,
'1' = {
# Chunk genes
x <- self[[dataset.use]][, chunk.indices]
x[, indices.use]
},
'2' = {
# Chunk cells
x <- self[[dataset.use]][chunk.indices, ]
x[indices.use, ]
}
)
} else {
x <- self[[private$iter.datset]][chunk.indices]
x[indices.use]
}
chunk.data <- FUN(chunk.data, ...)
if (results.matrix) {
# If we're writign to a matrix
# Figure out which way we're writing the data
switch(
EXPR = MARGIN,
'1' = {
chunk.full <- matrix(
nrow = nrow(x = chunk.data),
ncol = length(x = chunk.indices)
)
chunk.full[, indices.use] <- chunk.data
group[[results.basename]][, chunk.indices] <- chunk.full
},
'2' = {
chunk.full <- matrix(
nrow = length(x = chunk.indices),
ncol = ncol(x = chunk.data)
)
chunk.full[indices.use, ] <- chunk.data
group[[results.basename]][chunk.indices, ] <- chunk.full
}
)
} else {
# Just write to the vector
chunk.full <- vector(length = length(x = chunk.indices))
chunk.full[indices.use] <- chunk.data
group[[results.basename]][chunk.indices] <- chunk.full
}
gc(verbose = FALSE)
if (display.progress) {
setTxtProgressBar(pb = pb, value = i / length(x = batch))
}
}
if (display.progress) {
close(con = pb)
}
# Clean up and allow chaining
private$reset_batch()
# Load layers and attributes
private$load_layers()
private$load_attributes(MARGIN = 1) # Genes (row_attrs)
private$load_attributes(MARGIN = 2) # Cells (col_attrs)
invisible(x = self)
},
map = function(
FUN,
MARGIN = 2,
chunk.size = NULL,
index.use = NULL,
dataset.use = 'matrix',
display.progress = TRUE,
...
) {
if (!inherits(x = FUN, what = 'function')) {
stop("FUN must be a function")
}
# Checks datset and MARGIN
# Sets full dataset path in private$iter.dataset
# Sets proper MARGIN in private$iter.margin
batch <- self$batch.scan(
chunk.size = chunk.size,
MARGIN = MARGIN,
dataset.use = dataset.use,
force.reset = TRUE
)
MARGIN <- private$iter.margin
dataset.use <- private$iter.dataset
# Check how we store our results
# And what the shape of our dataset is
dataset.matrix <- any(vapply(
X = c('matrix', 'layers'),
FUN = grepl,
FUN.VALUE = logical(length = 1L),
x = dataset.use
))
# Ensure that index.use is integers within the bounds of [1, self$shape[MARGIN]]
if (is.null(x = index.use)) {
index.use <- 1L:self$shape[MARGIN]
} else {
# Filter index.use to values between 1 and self$shape[MARGIN]
index.use <- sort(x = index.use)
index.use <- as.integer(x = index.use)
index.use <- index.use[index.use >= 1 & index.use <= self$shape[MARGIN]]
index.use <- as.vector(x = na.omit(object = index.use))
# If we still have values, figure out NAs, otherwise set index.use to NULL
if (length(x = index.use) == 0) {
warning("No values passed to 'index.use' fall within the data, using all values")
index.use <- 1:self$shape[MARGIN]
}
}
# Create our results holding object
results <- vector(mode = "list", length = length(x = batch))
if (display.progress) {
pb <- txtProgressBar(char = '=', style = 3)
}
for (i in 1:length(x = batch)) {
# Get the indices we're iterating over
chunk.indices <- self$batch.next(return.data = FALSE)
indices.use <- chunk.indices[chunk.indices %in% index.use]
indices.use <- indices.use - chunk.indices[1] + 1
if (length(x = indices.use) < 1) {
if (display.progress) {
setTxtProgressBar(pb = pb, value = i / length(x = batch))
}
next
}
# Get the data and apply FUN
chunk.data <- if (dataset.matrix) {
switch(
EXPR = MARGIN,
'1' = {
# Chunk genes
x <- self[[dataset.use]][, chunk.indices]
x[, indices.use]
},
'2' = {
# Chunk cells
x <- self[[dataset.use]][chunk.indices, ]
x[indices.use, ]
}
)
} else {
x <- self[[private$iter.datset]][chunk.indices]
x[indices.use]
}
results[[i]] <- FUN(chunk.data, ...)
if (display.progress) {
setTxtProgressBar(pb = pb, value = i / length(x = batch))
}
}
# Bring result list into matrix or vector format
if (inherits(x = results[[1]], what = c('matrix', 'Matrix', 'data.frame'))) {
reduce.func <- switch(EXPR = MARGIN, '1' = cbind, '2' = rbind)
results <- Reduce(f = reduce.func, x = results)
} else {
results <- unlist(x = results, use.names = FALSE)
}
if (display.progress) {
close(con = pb)
}
private$reset_batch()
return(results)
},
# Functions that modify `/matrix'
add.cells = function(
matrix.data,
attributes.data = NULL,
layers.data = NULL,
display.progress = TRUE
) {
if (self$mode == 'r') {
stop(private$err_mode)
}
# Check inputs
n <- self[['matrix']]$dims[2]
if (display.progress) {
cate("Checking inputs...")
}
matrix.data <- check.matrix_data(x = matrix.data, n = n)
layers.data <- check.layers(
x = layers.data,
n = n,
layers.names = names(x = self[['layers']])
)
attributes.data <- check.col_attrs(
x = attributes.data,
attrs.names = names(x = self[['col_attrs']])
)
# Get the number of cells we're adding
num.cells <- c(
nCells.matrix_data(x = matrix.data),
nCells.layers(x = layers.data),
nCells.col_attrs(x = attributes.data)
)
num.cells <- max(num.cells)
# Flesh out the input data to add
if (display.progress) {
cate(paste("Adding", num.cells, "to this loom file"))
}
matrix.data <- addCells.matrix_data(x = matrix.data, n = n, m2 = num.cells)
layers.data <- addCells.layers(x = layers.data, n = n, m2 = num.cells)
attributes.data <- addCells.col_attrs(x = attributes.data, m2 = num.cells)
# Add the input to the loom file
dims.fill <- self[['matrix']]$dims[1]
dims.fill <- (dims.fill + 1L):(dims.fill + num.cells)
# Matrix data
if (display.progress) {
cate("Adding data to /matrix")
}
matrix.data <- t(x = as.matrix(x = data.frame(matrix.data)))
self[['matrix']][dims.fill, ] <- matrix.data
# Layer data
if (display.progress) {
cate("Adding data to /layers")
pb <- new.pb()
counter <- 0
}
layers.names <- names(x = self[['layers']])
for (i in layers.names) {
self[['layers']][[i]][dims.fill, ] <- t(x = layers.data[[i]])
if (display.progress) {
counter <- counter + 1
setTxtProgressBar(pb = pb, value = counter / length(x = layers.names))
}
}
# Column attributes
if (display.progress) {
cate("Adding data to /col_attrs")
pb <- new.pb()
counter <- 0
}
attrs.names <- names(x = self[['col_attrs']])
for (i in attrs.names) {
self[['col_attrs']][[i]][dims.fill] <- attributes.data[[i]]
if (display.progress) {
counter <- counter + 1
setTxtProgressBar(pb = pb, value = counter / length(x = attrs.names))
}
}
# # Load layers and attributes
# private$load_layers()
# private$load_attributes(MARGIN = 1)
# private$load_attributes(MARGIN = 2)
# self$shape <- self[['matrix']]$dims
},
add.loom = function(
other,
other.key = NULL,
self.key = NULL,
...
) {
if (self$mode == 'r') {
stop(private$err_mode)
}
# Connect to the other loom file
if (inherits(x = other, what = 'loom')) {
ofile <- other
} else if (is.character(x = other)) {
ofile <- connect(filename = other)
} else {
stop("'other' must be either a loom object or a path to a loom file")
}
# If we have row keys to use
if (!is.null(x = other.key) && !is.null(x = self.key)) {
other.key <- basename(path = other.key)
self.key <- basename(path = self.key)
tryCatch(
expr = other.key <- other[['row_attrs']][[other.key]][],
error = function(e) {
if (is.character(x = other)) {
ofile$close_all()
}
stop("Failed to find the gene names dataset in the other loom file")
}
)
tryCatch(
expr = self.key <- self[['row_attrs']][[self.key]][],
error = function(e) {
if (is.character(x = other)) {
ofile$close_all()
}
stop("Failed to find the gene names dataset in this loom file")
}
)
# Match rows
rows.use <- match(x = other.key, table = self.key, nomatch = 0)
rows.use <- rows.use[rows.use > 0]
} else {
cate("Adding the loom file as-is, assuming in the correct order")
Sys.sleep(time = 3)
rows.use <- 1:other[['matrix']]$dims[2]
}
if (max(rows.use) > self[['matrix']]$dims[2]) {
stop("More genes in the other loom file than present in this one")
} else if (max(rows.use) < self[['matrix']]$dims[2]) {
cate("...")
}
# Clean up
if (is.character(x = other)) {
ofile$close_all()
}
},
# Datetime functions
timestamp = function(dataset) {
if (self$mode == 'r') {
stop(private$err_mode)
}
# Timestamp format, always in UTC
# %Y -- four-digit year
# %m -- two-digit month
# %d -- two-digit day
# T -- sepearte date from time
# %H -- 24-hour time
# %M -- two-digit minute
# %OS6 -- seconds with 6 digits after the decimal
# Z -- end time
time <- strftime(x = Sys.time(), format = '%Y%m%dT%H%M%OS6Z', tz = 'UTC')
h5attr(x = self[[dataset]], which = 'last_modified') <- time
invisible(x = self)
},
last.modified = function(...) {
time <- unlist(x = strsplit(x = '', split = '.', fixed = TRUE))[1]
if (substr(x = time, start = nchar(x = time), stop = nchar(x = time)) != 'Z') {
time <- paste0(time, 'Z')
}
time <- as.POSIXct(x = strptime(
x = time,
format = '%Y%m%dT%H%M%SZ',
tz = 'UTC'
))
time <- format(x = time, tz = Sys.timezone(), usetz = TRUE)
return(time)
}
),
# Private fields and methods
# @field err_init An error message for if this object hasn't been created with loomR::create or loomR::connect
# @field err_mode An error message for if this object is in read-only mode
# @field it Iterator object for batch.scan and batch.next
# @field iter.dataset Dataset for iterating on
# @field iter.margin Margin to iterate over
# @field iter.index Index values for iteration
# @field skipped.validation Was validation skipped?
# \describe{
# \item{\code{load_attributes(MARGIN)}}{Load attributes of a given MARGIN into \code{self$col.attrs} or \code{self$row.attrs}}
# \item{\code{load_layers()}}{Load layers into \code{self$layers}}
# \item{\code{reset_batch()}}{Reset the batch iterator fields}
# \item{\code{iter_range(index.use)}}{Get the range of indices for a batch iteration}
# }
private = list(
# Fields
err_init = "This loom object has not been created with either loomR::create or loomR::connect, please use these functions to create or connect to a loom file",
err_mode = "Cannot modify a loom file in read-only mode",
it = NULL,
iter.chunksize = NULL,
iter.dataset = NULL,
iter.margin = NULL,
iter.index = NULL,
skipped.validation = FALSE,
# Methods
load_attributes = function(MARGIN) {
attribute <- switch(
EXPR = MARGIN,
'1' = 'row_attrs',
'2' = 'col_attrs',
stop('Invalid attribute dimension')
)
group <- self[[attribute]]
if (length(x = names(x = group)) > 0) {
attributes <- unlist(x = lapply(
X = names(x = group),
FUN = function(x) {
d <- list(group[[x]])
names(x = d) <- x
return(d)
}
))
} else {
attributes <- NULL
}
if (MARGIN == 1) {
self$row.attrs <- attributes
} else if (MARGIN == 2) {
self$col.attrs <- attributes
}
},
load_layers = function() {
if (private$skipped.validation) {
if (getOption(x = 'verbose')) {
warning("Not loading layers field")
}
} else if (length(x = names(x = self[['layers']])) > 0) {
self$layers <- unlist(x = lapply(
X = names(x = self[['layers']]),
FUN = function(n) {
d <- list(self[[paste('layers', n, sep = '/')]])
names(x = d) <- n
return(d)
}
))
}
},
reset_batch = function() {
private$it <- NULL
private$iter.chunksize <- NULL
private$iter.dataset <- NULL
private$iter.margin <- NULL
private$iter.index <- NULL
},
iter_range = function(index.use) {
if (is.null(private$iter.margin)) {
stop("Batch processing hasn't been set up")
}
if (is.null(x = index.use)) {
# If no index was provided, use entire range for this margin
index.use <- c(1, self$shape[private$iter.margin])
} else if (length(x = index.use) == 1) {
# If one index was provided, start at one and go to index
index.use <- c(1, index.use)
} else {
# Use index.use[1] and index.use[2]
index.use <- c(index.use[1], index.use[2])
}
# Ensure the indices provided fit within the range of the dataset
index.use[1] <- max(1, index.use[1])
index.use[2] <- min(index.use[2], self$shape[private$iter.margin])
# Ensure that index.use[1] is greater than index.use[2]
if (index.use[1] > index.use[2]) {
stop(paste0(
"Starting index (",
index.use[1],
") must be lower than the ending index (",
index.use[2],
")"
))
}
return(index.use)
}
)
)
#' Create a loom object
#'
#' @param filename The name of the new loom file
#' @param data The data for \code{/matrix}. If cells are rows and genes are columns, set \code{do.transpose = FALSE}; otherwise, set \code{do.transpose = TRUE}
#' @param gene.attrs A named list of vectors with extra data for genes, each vector must be as long as the number of genes in \code{data}
#' @param cell.attrs A named list of vectors with extra data for cells, each vector must be as long as the number of cells in \code{data}
#' @param chunk.dims A one- or two-length integer vector of chunksizes for \code{/matrix}, defaults to 'auto' to automatically determine chunksize
#' @param do.transpose Transpose the input? Should be \code{TRUE} if \code{data} has genes as rows and cells as columns
#' @param calc.numi Calculate number of UMIs and genes expressed per cell? Will store in 'col_attrs/nUMI' and 'col_attrs/nGene', overwriting anything passed to \code{cel.attrs};
#' To set a custom threshold for gene expression, pass an integer value (eg. \code{calc.numi = 5} for a threshold of five counts per cell)
#' @param chunk.size How many rows of \code{data} should we stream to the loom file at any given time?
#' @param overwrite Overwrite an already existing loom file?
#'
#' @return A connection to a loom file
#'
#' @importFrom utils packageVersion txtProgressBar setTxtProgressBar
#'
#' @seealso \code{\link{loom}}
#'
#' @export
#'
create <- function(
filename,
data,
gene.attrs = NULL,
cell.attrs = NULL,
layers = NULL,
chunk.dims = 'auto',
chunk.size = 1000,
do.transpose = TRUE,
calc.numi = FALSE,
overwrite = FALSE,
display.progress = TRUE
) {
mode <- ifelse(test = overwrite, yes = 'w', no = 'w-')
if (file.exists(filename) && !overwrite) {
stop(paste('File', filename, 'already exists!'))
}
if (!inherits(x = data, what = c('matrix', 'Matrix'))) {
data <- as.matrix(x = data)
}
if (length(x = chunk.dims) > 2 || length(x = chunk.dims) < 1) {
stop("'chunk.dims' must be a one- or two-length integer vector or 'auto'")
} else if (length(x = chunk.dims) == 1) {
if (!grepl(pattern = '^auto$', x = chunk.dims, perl = TRUE)) {
chunk.dims <- rep.int(x = as.integer(x = chunk.dims), times = 2)
}
} else {
chunk.dims <- as.integer(x = chunk.dims)
}
new.loom <- loom$new(filename = filename, mode = mode)
dtype <- getDtype(x = data[1, 1])
matrix.shape <- dim(x = data)
if (do.transpose) {
cate("Transposing input data: loom file will show input columns (cells) as rows and input rows (genes) as columns")
cate("This is to maintain compatibility with other loom tools")
matrix.shape <- rev(x = matrix.shape)
} else {
cate("Not tranposing data: loom file will show data exactly like input")
cate("Please note, other loom tools will show this flipped")
}
new.loom$create_dataset(
name = 'matrix',
dtype = dtype,
dims = matrix.shape
)
chunk.points <- chunkPoints(
data.size = matrix.shape[1],
chunk.size = chunk.size
)
if (display.progress) {
pb <- txtProgressBar(char = '=', style = 3)
}
for (col in 1:ncol(x = chunk.points)) {
row.start <- chunk.points[1, col]
row.end <- chunk.points[2, col]
data.add <- if (do.transpose) {
t(x = as.matrix(x = data[, row.start:row.end]))
} else {
as.matrix(x = data[row.start:row.end, ])
}
new.loom[['matrix']][row.start:row.end, ] <- data.add
if (display.progress) {
setTxtProgressBar(pb = pb, value = col / ncol(x = chunk.points))
}
}
new.loom$matrix <- new.loom[['matrix']]
new.loom$shape <- rev(x = new.loom[['matrix']]$dims)
# Groups
groups <- c('layers', 'row_attrs', 'col_attrs', 'row_graphs', 'col_graphs')
for (group in groups) {
new.loom$create_group(name = group)
}
# Check for the existance of gene or cell names
if (!is.null(x = colnames(x = data))) {
if (do.transpose) {
new.loom$add.col.attribute(attribute = list('cell_names' = colnames(x = data)))
} else {
new.loom$add.row.attribute(attribute = list('gene_names' = colnames(x = data)))
}
}
if (!is.null(x = rownames(x = data))) {
if (do.transpose) {
new.loom$add.row.attribute(attribute = list('gene_names' = rownames(x = data)))
} else {
new.loom$add.col.attribute(attribute = list('cell_names' = rownames(x = data)))
}
}
# Store some constants as HDF5 attributes
h5attr(x = new.loom, which = 'version') <- new.loom$version
h5attr(x = new.loom, which = 'chunks') <- paste0(
'(',
paste(new.loom[['matrix']]$chunk_dims, collapse = ', '),
')'
)
# Add layers
if (!is.null(x = layers)) {
new.loom$add.layer(layer = layers)
}
if (!is.null(x = gene.attrs)) {
new.loom$add.row.attribute(attribute = gene.attrs)
}
if (!is.null(x = cell.attrs)) {
new.loom$add.col.attribute(attribute = cell.attrs)
}
# Calculate nUMI and nGene if asked
if (calc.numi) {
is.expr <- ifelse(test = is.numeric(x = calc.numi), yes = calc.numi, no = 0)
calc.umi(
object = new.loom,
chunk.size = chunk.size,
display.progress = display.progress,
is.expr = is.expr
)
}
# Set last bit of information
chunks <- new.loom[['matrix']]$chunk_dims
chunks <- gsub(pattern = '(', replacement = '', x = chunks, fixed = TRUE)
chunks <- gsub(pattern = ')', replacement = '', x = chunks, fixed = TRUE)
chunks <- unlist(x = strsplit(x = chunks, split = ','))
new.loom$chunksize <- as.integer(x = chunks)
# Return the connection
return(new.loom)
}
#' Connect to a loom file
#'
#' @param filename The loom file to connect to
#' @param mode How do we connect to it? Pass 'r' for read-only or 'r+' for read/write.
#' If \code{mode} is 'r+', loomR will automatically add missing required groups during validation
#' @param skip.validate Skip the validation steps, use only for extremely large loom files
#'
#' @return A loom file connection
#'
#' @seealso \code{\link{loom}}
#'
#' @export
#'
connect <- function(filename, mode = "r", skip.validate = FALSE) {
if (!(mode %in% c('r', 'r+'))) {
stop("'mode' must be one of 'r' or 'r+'")
}
new.loom <- loom$new(filename = filename, mode = mode, skip.validate = skip.validate)
return(new.loom)
}
#' Subset a loom file
#'
#' @param x A loom object
#' @param m Rows (cells) to subset, defaults to all rows
#' @param n Columns (genes) to subset, defaults to all columns
#' @param filename Filename for new loom object, defaults to ...
#' @param chunk.size Chunk size to iterate through \code{x}
#' @param overwrite Overwrite \code{filename} if already exists?
#' @param display.progress Display progress bars?
#' @param ... Ignored for now
#'
#' @return A loom object connected to \code{filename}
#'
#' @aliases subset
#'
#' @importFrom utils setTxtProgressBar
#'
#' @seealso \code{\link{loom}}
#'
#' @export subset.loom
#' @method subset loom
#'
subset.loom <- function(
x,
m = NULL,
n = NULL,
filename = NULL,
chunk.size = 1000,
overwrite = FALSE,
display.progress = TRUE,
...
) {
m.all <- is.null(x = m)
n.all <- is.null(x = n)
if (m.all && n.all) {
stop("Subset is set to all data, not subsetting")
}
# Set some defaults
if (m.all) { # Pull all cells
m <- 1:x[['matrix']]$dims[1]
}
if (n.all) { # Pull all genes
n <- 1:x[['matrix']]$dims[2]
}
if (is.null(x = filename)) { # Set default filename
filename <- paste(
unlist(x = strsplit(x = x$filename, split = '.', fixed = TRUE)),
collapse = '_subset.'
)
}
# Ensure that m and n are within the bounds of the loom file
if (max(m) > x[['matrix']]$dims[1] || max(n) > x[['matrix']]$dims[2]) {
stop(paste(
"'m' and 'n' must be less than",
x[['matrix']]$dims[1],
"and",
x[['matrix']]$dims[2],
"respectively"
))
}
extension <- rev(x = unlist(x = strsplit(x = filename, split = '.', fixed = TRUE)))[1]
# Ensure that we call our new file a loom file
if (extension != 'loom') {
filename <- paste0(filename, '.loom')
}
# Make the loom file
mode <- ifelse(test = overwrite, yes = 'w', no = 'w-')
if (file.exists(filename) && !overwrite) {
stop(paste('File', filename, 'already exists!'))
} else if (display.progress) {
catn("Writing new loom file to", filename)
}
new.loom <- loom$new(filename = filename, mode = mode)
# Add /matrix
matrix.dims <- c(length(x = m), length(x = n))
new.loom$create_dataset(
name = 'matrix',
dtype = getDtype2(x = class(x = x[['matrix']]$get_type())[1]),
dims = matrix.dims
)
new.loom$shape <- rev(x = matrix.dims)
batch <- break.consecutive(
x = m,
max.length = chunk.size,
min.length = chunk.size * 0.5
)
previous <- 1L
layers <- list.datasets(object = x, path = 'layers', full.names = TRUE)
if (display.progress) {
msg <- "Adding data for /matrix"
num.layers <- length(x = layers)
if (num.layers) {
msg <- paste(msg, 'and', num.layers, 'layers')
}
catn(msg)
if (!num.layers) {
cate("No layers found")
}
pb <- new.pb()
}
if (length(x = layers) > 0) {
for (layer in layers) {
new.loom[['layers']]$create_dataset(
name = basename(path = layer),
dtype = getDtype2(x = class(x = x[[layer]]$get_type())[1]),
dims = matrix.dims
)
}
}
for (i in 1:length(x = batch)) {
indices.use <- batch[[i]]
chunk.indices <- min(indices.use):max(indices.use)
indices.return <- previous:(previous + length(x = indices.use) - 1)
indices.use <- chunk.indices[chunk.indices %in% indices.use] - chunk.indices[1] + 1
previous <- max(indices.return) + 1L
# chunk.indices <- x$batch.next(return.data = FALSE)
# indices.use <- chunk.indices[chunk.indices %in% m]
# if (length(x = indices.use) < 1) {
# if (display.progress) {
# setTxtProgressBar(pb = pb, value = i / length(x = batch))
# }
# next
# }
# indices.return <- match(x = indices.use, table = m)
# indices.use <- indices.use - chunk.indices[1] + 1
chunk.data <- x[['matrix']][chunk.indices, ]
chunk.data <- matrix(data = chunk.data, nrow = length(x = chunk.indices))
chunk.data <- chunk.data[indices.use, n]
# chunk.data <- chunk.data[indices.use, n]
new.loom[['matrix']][indices.return, ] <- chunk.data
gc(verbose = FALSE)
if (length(x = layers) > 0) {
for (layer in layers) {
layer.data <- x[[layer]][chunk.indices, ]
layer.data <- layer.data[indices.use, n]
new.loom[[layer]][indices.return, ] <- layer.data
gc(verbose = FALSE)
}
}
if (display.progress) {
setTxtProgressBar(pb = pb, value = i / length(x = batch))
}
}
if (display.progress) {
close(con = pb)
}
# Add groups
for (group in c('row_attrs', 'row_graphs', 'col_attrs', 'col_graphs', 'layers')) {
new.loom$create_group(name = group)
}
# Add row attributes
row.attrs <- list.datasets(object = x, path = 'row_attrs', full.names = TRUE)
if (length(x = row.attrs) > 0) {
if (display.progress) {
catn("Adding", length(x = row.attrs), "row attributes")
pb <- new.pb()
counter <- 0
}
for (row.attr in row.attrs) {
if (n.all) {
new.loom[['row_attrs']]$obj_copy_from(
src_loc = x,
src_name = row.attr,
dst_name = basename(path = row.attr)
)
} else if (length(x = x[[row.attr]]$dims) == 2) {
new.loom[[row.attr]] <- x[[row.attr]][,][, n]
} else {
new.loom[[row.attr]] <- x[[row.attr]][][n]
}
gc(verbose = FALSE)
if (display.progress) {
counter <- counter + 1
setTxtProgressBar(pb = pb, value = counter / length(x = row.attrs))
}
}
if (display.progress) {
close(con = pb)
}
} else {
cate("No row attributes found")
}
# Add col attributes
col.attrs <- list.datasets(object = x, path = 'col_attrs', full.names = TRUE)
if (length(x = col.attrs) > 0) {
if (display.progress) {
catn("Adding", length(x = col.attrs), "column attributes")
pb <- new.pb()
counter <- 0
}
for (col.attr in col.attrs) {
if (m.all) {
new.loom[['col_attrs']]$obj_copy_from(
src_loc = x,
src_name = col.attr,
dst_name = basename(path = col.attr)
)
} else if (length(x = x[[col.attr]]$dims) == 2) {
new.loom[[col.attr]] <- x[[col.attr]][, m]
} else {
new.loom[[col.attr]] <- x[[col.attr]][m]
}
gc(verbose = FALSE)
if (display.progress) {
counter <- counter + 1
setTxtProgressBar(pb = pb, value = counter / length(x = col.attrs))
}
}
if (display.progress) {
close(con = pb)
}
} else {
cate("No column attributes found")
}
# # Add layers
# layers <- list.datasets(object = x, path = 'layers', full.names = TRUE)
# if (length(x = layers) > 0) {
# if (display.progress) {
# catn("Adding", length(x = layers), "layers")
# pb <- new.pb()
# }
# # Initialize datasets
# for (layer in layers) {
# new.loom[['layers']]$create_dataset(
# name = basename(path = layer),
# dtype = getDtype2(x = class(x = x[[layer]]$get_type())[1]),
# dims = matrix.dims
# )
# }
# # batch <- x$batch.scan(chunk.size = chunk.size)
# for (i in 1:length(x = batch)) {
# # chunk.indices <- x$batch.next(return.data = FALSE)
# indices.use <- batch[[i]]
# chunk.indices <- min(indices.use):max(indices.use)
# indices.use <- chunk.indices[chunk.indices %in% m]
# indices.return <- match(x = indices.use, table = m)
# indices.use <- indices.use - chunk.indices[1] + 1
# # Add the data for each layer for this batch
# for (layer in layers) {
# layer.data <- x[[layer]][chunk.indices, ]
# layer.data <- layer.data[indices.use, n]
# new.loom[[layer]][indices.return, ] <- layer.data
# gc(verbose = FALSE)
# }
# if (display.progress) {
# setTxtProgressBar(pb = pb, value = i / length(x = batch))
# }
# }
# if (display.progress) {
# close(con = pb)
# }
# } else {
# cate("No layers found")
# }
new.loom$flush()
new.loom$load.fields()
return(new.loom)
}
#' Combine loom files
#'
#' @param looms A list of \code{loom} objects or paths to loom files
#' @param filename Name for resultant loom file
#' @param chunk.size How many rows from each input loom should we stream to the merged loom file at any given time?
#' @param order.by Optional row attribute to order each input loom by, must be one dimensional
#' @param overwrite Overwrite \code{filename} if already exists?
#' @param display.progress Display progress as we're copying over data
#'
#' @return A loom object connected to \code{filename}
#'
#' @importFrom utils setTxtProgressBar
#'
#' @seealso \code{\link{loom}}
#'
#' @export
#'
combine <- function(
looms,
filename,
chunk.size = 1000,
order.by = NULL,
overwrite = FALSE,
display.progress = TRUE,
...
) {
if (!is.list(x = looms)) {
stop("'combine' takes a list of loom objects or paths to loom files")
}
# Basic checking of input arguments
looms <- looms[vapply(
X = looms,
FUN = inherits,
FUN.VALUE = logical(length = 1L),
what = c('loom', 'character')
)]
if (length(x = looms) < 2) {
stop("Need at least two loom objects or files to merge")
}
# Check the existance of loom files
loom.names <- looms[is.character(x = looms)]
if (length(x = loom.names) > 0) {
if (!all(file.exists(loom.names))) {
stop(paste0(
"Cannot find the following loom files: '",
paste(loom.names[!file.exists(loom.names)], collapse = "', '"),
"'"
))
}
}
# Set mode and provide more useful error
mode <- ifelse(test = overwrite, yes = 'w', no = 'w-')
if (file.exists(filename) && !overwrite) {
stop(paste('File', filename, 'already exists!'))
}
# Check loom contents
# Every loom must have same number of genes (rows, MARGIN = 2)
# and same datasets in the groups
row.attrs <- vector(mode = 'list', length = length(x = looms))
row.types <- list()
col.attrs <- vector(mode = 'list', length = length(x = looms))
col.dims <- list()
col.ndims <- list()
col.types <- list()
layers <- vector(mode = 'list', length = length(x = looms))
layers.types <- list()
nrows <- vector(mode = 'list', length = length(x = looms))
ncols <- vector(mode = 'list', length = length(x = looms))
matrix.type <- list()
if (display.progress) {
catn("Validating", length(x = looms), "input loom files")
pb <- new.pb()
}
for (i in 1:length(x = looms)) {
this <- if (is.character(x = looms[[i]])) {
connect(filename = looms[[i]])
} else {
looms[[i]]
}
row.attrs[[i]] <- sort(x = list.datasets(
object = this,
path = 'row_attrs',
full.names = TRUE
))
for (attr in row.attrs[[i]]) {
if (length(x = attr) > 0) {
row.types[[attr]] <- c(
row.types[[attr]],
class(x = this[[attr]]$get_type())[1]
)
}
}
col.attrs[[i]] <- sort(x = list.datasets(
object = this,
path = 'col_attrs',
full.names = TRUE
))
for (attr in col.attrs[[i]]) {
if (length(x = attr) > 0) {
col.types[[attr]] <- c(
col.types[[attr]],
class(x = this[[attr]]$get_type())[1]
)
col.dims[[attr]] <- sum(col.dims[[attr]], this[[attr]]$dims[1])
col.ndims[[attr]] <- c(
col.ndims[[attr]],
length(x = this[[attr]]$dims)
)
}
}
layers[[i]] <- sort(x = list.datasets(
object = this,
path = 'layers',
full.names = TRUE
))
for (lay in layers) {
if (length(x = lay) > 0) {
layers.types[[lay]] <- c(
layers.types[[lay]],
class(x = this[[lay]]$get_type())[1]
)
}
}
nrows[[i]] <- this[['matrix']]$dims[2]
ncols[[i]] <- this[['matrix']]$dims[1]
matrix.type[[i]] <- class(x = this[['matrix']]$get_type())[1]
if (is.character(x = looms[[i]])) {
this$close_all()
}
if (display.progress) {
setTxtProgressBar(pb = pb, value = i / length(x = looms))
}
}
row.attrs <- unique(x = row.attrs)
col.attrs <- unique(x = col.attrs)
layers <- unique(x = layers)
nrows <- unlist(x = unique(x = nrows))
ncols <- unlist(x = ncols)
ncells <- sum(ncols)
if (length(x = row.attrs) != 1) {
stop("Not all loom objects have the same row attributes")
}
if (length(x = col.attrs) != 1) {
stop("Not all loom objects have the same column attributes")
}
if (length(x = layers) != 1) {
stop("Not all loom objects have the same layers")
}
if (length(x = nrows) != 1) {
stop("Not all loom objects have the number of rows (MARGIN = 2)")
}
# Check for the row attribute to order by
if (!is.null(x = order.by)) { # We have something to order by
if (!grepl(pattern = order.by, x = row.attrs)) { # Check to ensure that the order.by is in each of our row attributes
stop(paste0("Cannot find '", order.by, "' in the row attributes for the loom files provided"))
} else {
# If it is, get the values to order by
order.by <- basename(path = order.by) # Basename of the order.by attribute name
temp <- if (is.character(x = looms[1])) { # Connect to the loom first loom file
connect(filename = looms[1])
} else {
looms[1]
}
order.dat <- temp[['row_attrs']][[order.by]] # Ensure that our order.by attribute is one dimmensional
if (length(x = order.dat$dims) != 1) {
if (is.character(x = looms[1])) {
temp$close_all()
}
stop("'order.by' must reference a one dimensional attribute")
}
# If so, pull the data
order.use <- order.dat[]
# If the first loom was initially closed, close it again
if (is.character(x = looms[1])) {
temp$close_all()
}
}
}
# Check data types:
matrix.type <- unlist(x = unique(x = matrix.type))
row.types <- lapply(X = row.types, FUN = unique)
row.types.counts <- vapply(X = row.types, FUN = length, FUN.VALUE = integer(length = 1L))
col.types <- lapply(X = col.types, FUN = unique)
col.types.counts <- vapply(X = col.types, FUN = length, FUN.VALUE = integer(length = 1L))
layers.types <- lapply(X = layers.types, FUN = unique)
layers.types.counts <- vapply(X = layers.types, FUN = length, FUN.VALUE = integer(length = 1L))
if (any(row.types.counts > 1)) {
stop(paste0(
"The following row attributes have multiple types across the input loom files: '",
paste(names(x = row.types.counts[row.types.counts > 1]), collapse = "', '"),
"'; cannot combine"
))
}
if (any(col.types.counts > 1)) {
stop(paste0(
"The following column attributes have multiple types across the input loom files: '",
paste(names(x = col.types.counts[col.types.counts > 1]), collapse = "', '"),
"'; cannot combine"
))
}
if (any(layers.types.counts > 1)) {
stop(paste0(
"The following layers have multiple types across the input loom files: '",
paste(names(x = layers.types.counts[layers.types.counts > 1]), collapse = "', '"),
"'; cannot combine"
))
}
if (length(x = matrix.type) != 1) {
stop("Cannot combine multiple datatypes for '/matrix'")
}
# Check dimmensions for col_attrs
col.ndims <- lapply(X = col.ndims, FUN = unique)
col.ndims.counts <- vapply(X = col.ndims, FUN = length, FUN.VALUE = integer(length = 1L))
if (any(col.ndims.counts > 1)) {
stop(paste0(
"The following column attributes have varying dimmensions across the input loom files: '",
paste(names(x = col.ndims.counts[col.ndims.counts > 1]), collapse = "', '"),
"'; cannot combine"
))
}
# Create the new HDF5 file and the required groups
new.loom <- loom$new(filename = filename, mode = mode)
groups <- c('layers', "row_attrs", "col_attrs", "row_graphs", "col_graphs")
for (group in groups) {
new.loom$create_group(name = group)
}
# Initialize the '/matrix' dataset as well as datasets in 'col_attrs' and 'layers'
col.attrs <- unlist(col.attrs)
row.attrs <- unlist(row.attrs)
new.loom$create_dataset(
name = 'matrix', # Name is '/matrix'
dtype = getDtype2(x = matrix.type), # Use the single type that we got from above
dims = c(ncells, nrows) # Use the number of cells from the sum of ncols above, nrows should be the same for everyone
)
for (attr in col.attrs) {
if (length(x = attr) > 0) {
dims.use <- switch(
EXPR = col.ndims[[attr]],
'1' = ncells,
'2' = c(col.dims[[attr]], ncells),
stop("loomR supports only one- and two-dimmensional attribute datasets")
)
new.loom$create_dataset(
name = attr,
dtype = getDtype2(x = col.types[[attr]]),
dims = dims.use
)
}
}
for (lay in layers) {
if (length(x = lay) > 1) {
new.loom$create_dataset(
name = lay,
dtype = getDtype2(x = layers.types[[lay]]),
dims = c(ncells, nrows)
)
}
}
# Start adding loom objects
matrix.previous <- 0
col.previous <- vector(mode = 'integer', length = length(x = col.attrs))
names(x = col.previous) <- col.attrs
for (i in 1:length(x = looms)) {
if (display.progress) {
catn("\nAdding loom file", i ,"of", length(x = looms))
}
# Open the loom file
this <- if (is.character(x = looms[[i]])) {
connect(filename = looms[[i]])
} else {
looms[[i]]
}
# Get the chunk points
chunk.points <- chunkPoints(data.size = ncols[[i]], chunk.size = chunk.size)
# print(chunk.points)
# Pull ordering information
order.genes <- if (is.null(x = order.by)) {
1:nrows # If order.by wasn't provided, just use 1:number of genes
} else {
# Otherwise, match the data order from our order.use
order(match(x = this[[row.attrs]][[order.by]][], table = order.use))
}
# Add data to /matrix, /col_attrs, and /layers
if (display.progress) {
catn("Adding data to /matrix and /layers")
pb <- new.pb()
}
for (col in 1:ncol(x = chunk.points)) {
cells.use <- chunk.points[1, col]:chunk.points[2, col]
# Add /matrix for this chunk
matrix.add <- this[['matrix']][cells.use, ]
new.loom[['matrix']][cells.use + matrix.previous, ] <- matrix.add[, order.genes]
# Add layers for this chunk
for (lay in list.datasets(object = this[['layers']])) {
lay.add <- this[['layers']][[lay]][cells.use, ]
new.loom[['layers']][[lay]][cells.use + matrix.previous, ] <- lay.add[, order.genes]
}
if (display.progress) {
setTxtProgressBar(pb = pb, value = col / ncol(x = chunk.points))
}
gc()
}
matrix.previous <- matrix.previous + max(chunk.points)
# Add col_attrs for this chunk
if (display.progress) {
catn("\nAdding data to /col_attrs")
pb <- new.pb()
}
for (attr in list.datasets(object = this[['col_attrs']], full.names = TRUE)) {
start <- col.previous[attr]
end <- start + this[[attr]]$dims[length(x = this[[attr]]$dims)]
if (col.ndims[[attr]] == 1) {
new.loom[[attr]][(start + 1):end] <- this[[attr]][]
} else {
for (col in 1:ncol(x = chunk.points)) {
col.use <- chunk.points[1, col]:chunk.points[2, col]
new.loom[[attr]][col.use, (start + 1):end] <- this[[attr]][col.use, ]
}
}
col.previous[attr] <- end
if (display.progress) {
setTxtProgressBar(
pb = pb,
value = (grep(
pattern = attr,
x = list.datasets(object = this[['col_attrs']], full.names = TRUE)
) / length(x = list.datasets(object = this[['col_attrs']], full.names = TRUE)))[1]
)
}
}
# Copy row attributes from the first loom object into the merged one
if (i == 1) {
if (display.progress) {
catn("\nCopying data for /row_attrs")
pb <- new.pb()
}
for (attr in list.datasets(object = this, path = 'row_attrs')) {
new.loom[['row_attrs']]$obj_copy_from(
src_loc = this[['row_attrs']],
src_name = attr,
dst_name = attr
)
if (display.progress) {
setTxtProgressBar(
pb = pb,
value = grep(
pattern = attr,
x = list.datasets(object = this[['row_attrs']])
) / length(x = list.datasets(object = this[['row_attrs']]))
)
}
}
}
new.loom$flush()
# Close current loom file, if not open previously
if (is.character(x = looms[[i]])) {
this$close_all()
}
}
return(new.loom)
}
# #need to comment
# #need to add progress bar
# #but otherwise, pretty cool
# #for paul to try :
# # f <- connect("~/Downloads/10X43_1.loom")
# # mean_var = map(f,f_list = c(mean,var),chunksize = 5000)
# # nGene <- map(f, f_list = function(x) length(which(x>0)), MARGIN = 2)
# map <- function(self, f_list = list(mean, var), MARGIN=1, chunksize=1000, selection) {
# n_func = length(f_list)
# if (n_func == 1) {
# f_list <- list(f_list)
# }
# if (MARGIN == 1) {
# results <- list()
# for (j in 1:n_func) {
# results[[j]] <- numeric(0)
# }
# rows_per_chunk <- chunksize
# ix <- 1
# while (ix <= self@shape[1]) {
# rows_per_chunk <- min(rows_per_chunk, self@shape[1] - ix + 1)
# chunk <- self["matrix"][ix:(ix + rows_per_chunk - 1), ]
# for (j in 1:n_func) {
# new_results <- apply(chunk, 1, FUN = f_list[[j]])
# results[[j]] <- c(results[[j]], new_results)
# }
# ix <- ix + chunksize
# }
# }
# if (MARGIN == 2) {
# results <- list()
# for (j in 1:n_func) {
# results[[j]] <- numeric(0)
# }
# cols_per_chunk <- chunksize
# ix <- 1
# while (ix <= self@shape[2]) {
# cols_per_chunk <- min(cols_per_chunk, self@shape[2] - ix + 1)
# chunk <- self["matrix"][, ix:(ix + cols_per_chunk - 1)]
# for (j in 1:n_func) {
# new_results <- apply(chunk, 2, FUN = f_list[[j]])
# results[[j]] <- c(results[[j]], new_results)
# }
# ix <- ix + chunksize
# }
# }
# if (n_func == 1) {
# results <- results[[1]]
# }
# return(results)
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.