h5mread | R Documentation |
An alternative to rhdf5::h5read
An efficient and flexible alternative to rhdf5::h5read().
h5mread(filepath, name, starts=NULL, counts=NULL, noreduce=FALSE,
as.vector=NA, as.integer=FALSE, as.sparse=FALSE,
method=0L, use.H5Dread_chunk=FALSE)
get_h5mread_returned_type(filepath, name, as.integer=FALSE)
filepath |
The path (as a single string) to the HDF5 file where the dataset to read from is located, or an H5File object. Note that you must create and use an H5File object if the HDF5
file to access is stored in an Amazon S3 bucket. See ?H5File for how to do this. Also please note that H5File objects must NOT be used in the context of parallel evaluation at the moment. |
name |
The name of the dataset in the HDF5 file. |
starts , counts |
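starts and counts specify the array selection. Each argument can be either NULL or a list with one list element per dimension in the dataset. If starts and counts are both NULL, the entire dataset is read. If starts is a list, each list element in it must be a vector of valid positive indices along the corresponding dimension in the dataset; an empty vector (integer(0)) indicates an empty selection along that dimension, and a NULL indicates a full selection along that dimension. Each list element in counts must be NULL or a vector of non-negative integers of the same length as the corresponding list element in starts; each value indicates how many positions to select starting from the associated start value, and a NULL indicates that a single position is selected for each start value. If counts is NULL, each index in each starts list element selects a single position along the corresponding dimension. Finally note that when counts is not NULL, the selection described by starts and counts must be strictly ascending along each dimension. |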
noreduce |
TODO |
as.vector |
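Should the data be returned in a vector instead of an array?
By default (i.e. when set to NA), the data is returned in an array, except when the dataset is 1D, in which case it is returned in a vector. |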
as.integer |
TODO |
as.sparse |
TODO |
method |
TODO |
use.H5Dread_chunk |
TODO |
DETAILS COMING SOON...
h5mread() returns an ordinary array or vector if as.sparse is FALSE (the default), and a COO_SparseArray object if as.sparse is TRUE.
get_h5mread_returned_type() returns the type of the array or vector that will be returned by h5mread().
Equivalent to (but more efficient than):
typeof(h5mread(filepath, name, rep(list(integer(0)), ndim)))
where ndim
is the number of dimensions (a.k.a. rank in
HDF5 jargon) of the dataset.
H5File objects.
h5read
in the rhdf5 package.
extract_array
in the S4Arrays
package.
COO_SparseArray objects in the SparseArray package.
The TENxBrainData
dataset (in the
TENxBrainData package).
h5mread_from_reshaped
to read data from a virtually
reshaped HDF5 dataset.
## ---------------------------------------------------------------------
## BASIC USAGE
## ---------------------------------------------------------------------
## NOTE(review): writeHDF5Array(), path(), h5ls() and h5mread() are used
## below without any library() call -- presumably the HDF5Array package
## must already be attached; confirm before running this standalone.
## Write a 50 x 12 matrix of doubles as dataset "M0", then read the whole
## dataset back (no 'starts'/'counts' supplied) and check the round trip:
m0 <- matrix((runif(600) - 0.5) * 10, ncol=12)
M0 <- writeHDF5Array(m0, name="M0")
m <- h5mread(path(M0), "M0")
stopifnot(identical(m0, m))
## A NULL list element in 'starts' selects everything along that
## dimension (here: all rows, columns 3, 12, 11, 10, 9, 8 in that order):
m <- h5mread(path(M0), "M0", starts=list(NULL, c(3, 12:8)))
stopifnot(identical(m0[ , c(3, 12:8)], m))
## An integer(0) list element selects nothing along that dimension, i.e.
## it behaves like a NULL subscript in ordinary `[` subsetting:
m <- h5mread(path(M0), "M0", starts=list(integer(0), c(3, 12:8)))
stopifnot(identical(m0[NULL , c(3, 12:8)], m))
## With 'as.integer=TRUE' the result is compared against 'm0' coerced to
## integer storage. Note that 'm0' itself is modified from here on:
m <- h5mread(path(M0), "M0", starts=list(1:5, NULL), as.integer=TRUE)
storage.mode(m0) <- "integer"
stopifnot(identical(m0[1:5, ], m))
## Add a 10 x 5 x 7 integer array as dataset "A0" to the same HDF5 file
## as "M0", then list the file's content:
a0 <- array(1:350, c(10, 5, 7))
A0 <- writeHDF5Array(a0, filepath=path(M0), name="A0")
h5ls(path(A0))
## 'counts' gives the number of positions to read from each start: along
## dim 1, start 2 with count 4 and start 7 with count 2 select 2:5 and
## 7:8. A NULL in 'counts' selects a single position per start (here
## position 6 along dim 3). The NULL in 'starts' selects all of dim 2:
a <- h5mread(path(A0), "A0", starts=list(c(2, 7), NULL, 6),
counts=list(c(4, 2), NULL, NULL))
stopifnot(identical(a0[c(2:5, 7:8), , 6, drop=FALSE], a))
## Load the data in a sparse array representation:
## ('m1' is a 5 x 6 integer matrix, written with 3 x 2 chunks.)
m1 <- matrix(c(5:-2, rep.int(c(0L, 99L), 11)), ncol=6)
M1 <- writeHDF5Array(m1, name="M1", chunkdim=c(3L, 2L))
## Rows 5, 4, 3 (in that order) and all columns, read first as an
## ordinary array and then with 'as.sparse=TRUE':
index <- list(5:3, NULL)
m <- h5mread(path(M1), "M1", starts=index)
coo <- h5mread(path(M1), "M1", starts=index, as.sparse=TRUE)
class(coo) # COO_SparseArray object (see ?COO_SparseArray)
as(coo, "dgCMatrix")
## The dense and sparse reads must describe the same data:
stopifnot(identical(m, as.array(coo)))
## ---------------------------------------------------------------------
## PERFORMANCE
## ---------------------------------------------------------------------
## Timing comparisons between h5mread() and h5read() on the same
## selections. The speedup figures quoted in the comments below are the
## original author's observations; they are not checked by this code.
## NOTE(review): hub[["EH1039"]] / hub[["EH1040"]] presumably download
## (or fetch from cache) the HDF5 files and return their local paths --
## confirm; the first run may need network access.
library(ExperimentHub)
hub <- ExperimentHub()
## With the "sparse" TENxBrainData dataset
## ---------------------------------------
fname0 <- hub[["EH1039"]]
h5ls(fname0) # all datasets are 1D datasets
## No set.seed() has been called at this point, so the sampled indices
## (and hence the timings) differ from run to run:
index <- list(77 * sample(34088679, 5000, replace=TRUE))
## h5mread() is about 4x faster than h5read():
system.time(a <- h5mread(fname0, "mm10/data", index))
system.time(b <- h5read(fname0, "mm10/data", index=index))
## The h5read() result is passed through as.vector() before comparing:
stopifnot(identical(a, as.vector(b)))
index <- list(sample(1306127, 7500, replace=TRUE))
## h5mread() is about 20x faster than h5read():
system.time(a <- h5mread(fname0, "mm10/barcodes", index))
system.time(b <- h5read(fname0, "mm10/barcodes", index=index))
stopifnot(identical(a, as.vector(b)))
## With the "dense" TENxBrainData dataset
## --------------------------------------
fname1 <- hub[["EH1040"]]
h5ls(fname1) # "counts" is a 2D dataset
## Seed the RNG so that the 2D selections sampled below are reproducible:
set.seed(33)
index <- list(sample(27998, 300), sample(1306127, 450))
## h5mread() is about 2x faster than h5read():
system.time(a <- h5mread(fname1, "counts", index))
system.time(b <- h5read(fname1, "counts", index=index))
## For the 2D dataset the two results are compared directly:
stopifnot(identical(a, b))
## Alternatively 'as.sparse=TRUE' can be used to reduce memory usage:
system.time(coo <- h5mread(fname1, "counts", index, as.sparse=TRUE))
stopifnot(identical(a, as.array(coo)))
## The bigger the selection, the greater the speedup between
## h5read() and h5mread():
## NOTE(review): the "Not run" / "End(Not run)" markers below are only
## comments; the statements between them are NOT commented out, so
## sourcing this text as a script will execute them (about 12s + 48s
## according to the timing comment below).
## Not run:
index <- list(sample(27998, 1000), sample(1306127, 1000))
## h5mread() about 4x faster than h5read() (12s vs 48s):
system.time(a <- h5mread(fname1, "counts", index))
system.time(b <- h5read(fname1, "counts", index=index))
stopifnot(identical(a, b))
## With 'as.sparse=TRUE' (about the same speed as with 'as.sparse=FALSE'):
system.time(coo <- h5mread(fname1, "counts", index, as.sparse=TRUE))
stopifnot(identical(a, as.array(coo)))
## End(Not run)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.