h5mread | R Documentation |
rhdf5::h5read
h5mread is the result of experimenting with alternative rhdf5::h5read implementations.
It should still be considered experimental!
h5mread(filepath, name, starts=NULL, counts=NULL, noreduce=FALSE,
as.integer=FALSE, as.sparse=FALSE,
method=0L, use.H5Dread_chunk=FALSE)
get_h5mread_returned_type(filepath, name, as.integer=FALSE)
filepath |
The path (as a single string) to the HDF5 file where the dataset to read from is located, or an H5File object. Note that you must create and use an H5File object if the HDF5
file to access is stored in an Amazon S3 bucket. See ?H5File for details. Also, please note that H5File objects must NOT be used in the context of parallel evaluation at the moment. |
name |
The name of the dataset in the HDF5 file. |
starts, counts |
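starts and counts specify the selection to read. Each is either NULL or a list with one list element per dimension of the dataset. If starts and counts are both NULL (the default), the entire dataset is read. If starts is a list, each list element gives the start positions along the corresponding dimension; a NULL list element selects everything along that dimension, and integer(0) selects nothing along it. Each list element in counts is either NULL or a vector giving, for each start position, how many consecutive positions to read from it (e.g. starts c(2, 7) with counts c(4, 2) selects positions 2:5 and 7:8). If counts (or one of its list elements) is NULL, each start position selects a single position, so starts then plays the same role as the index argument of h5read. Finally note that when [the remainder of this description was lost when the page was extracted; see the package's ?h5mread help page for the full text]. |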
noreduce |
TODO |
as.integer |
TODO |
as.sparse |
TODO |
method |
TODO |
use.H5Dread_chunk |
TODO |
COMING SOON...
An array for h5mread.
The type of the array that will be returned by h5mread for get_h5mread_returned_type.
Equivalent to:
typeof(h5mread(filepath, name, rep(list(integer(0)), ndim)))
where ndim is the number of dimensions (a.k.a. the rank in HDF5 jargon) of the dataset.
get_h5mread_returned_type is provided for convenience.
H5File objects.
h5read in the rhdf5 package.
extract_array in the S4Arrays package.
The TENxBrainData dataset (in the TENxBrainData package).
h5mread_from_reshaped to read data from a virtually reshaped HDF5 dataset.
## ---------------------------------------------------------------------
## BASIC USAGE
## ---------------------------------------------------------------------
## Build a 50 x 12 matrix of doubles and write it out as HDF5 dataset "M0".
## path(M0) is then used as the file to read from.
m0 <- matrix((runif(600) - 0.5) * 10, ncol=12)
M0 <- writeHDF5Array(m0, name="M0")
## With neither 'starts' nor 'counts' supplied, the whole dataset is read
## back and is identical to the in-memory matrix.
m <- h5mread(path(M0), "M0")
stopifnot(identical(m0, m))
## 'starts' has one list element per dimension. A NULL element selects
## everything along that dimension (like a missing subscript in `[`), so
## this reads all rows of columns 3, 12, 11, 10, 9, 8 -- in that order.
m <- h5mread(path(M0), "M0", starts=list(NULL, c(3, 12:8)))
stopifnot(identical(m0[ , c(3, 12:8)], m))
## An integer(0) element selects nothing along that dimension: the result
## has 0 rows. Note the asymmetry with `[`, where it is a NULL subscript
## that yields the empty selection.
m <- h5mread(path(M0), "M0", starts=list(integer(0), c(3, 12:8)))
stopifnot(identical(m0[NULL , c(3, 12:8)], m))
## With 'as.integer=TRUE' the values come back as integers: the result is
## compared against 'm0' only after m0's storage mode is switched to
## "integer" (this modifies 'm0' for the rest of the script).
m <- h5mread(path(M0), "M0", starts=list(1:5, NULL), as.integer=TRUE)
storage.mode(m0) <- "integer"
stopifnot(identical(m0[1:5, ], m))
## Add a 10 x 5 x 7 integer array as a second dataset ("A0") in the same
## file as "M0", then list the file's content.
a0 <- array(1:350, c(10, 5, 7))
A0 <- writeHDF5Array(a0, filepath=path(M0), name="A0")
h5ls(path(A0))
## 'counts' pairs with 'starts': each count is the number of consecutive
## positions to read from the matching start. Along dimension 1, starts
## c(2, 7) with counts c(4, 2) selects 2:5 and 7:8. A NULL 'counts'
## element means one position per start, so dimension 3 is just index 6.
## The result keeps all 3 dimensions (compare with drop=FALSE below).
a <- h5mread(path(A0), "A0", starts=list(c(2, 7), NULL, 6),
counts=list(c(4, 2), NULL, NULL))
stopifnot(identical(a0[c(2:5, 7:8), , 6, drop=FALSE], a))
## Load the data in a sparse array representation:
## 'm1' is a 5 x 6 integer matrix that is mostly made of 0s and 99s; it
## is written with an explicit chunk geometry of 3 x 2.
m1 <- matrix(c(5:-2, rep.int(c(0L, 99L), 11)), ncol=6)
M1 <- writeHDF5Array(m1, name="M1", chunkdim=c(3L, 2L))
## Rows 5, 4, 3 (in that order) and all columns.
index <- list(5:3, NULL)
## Same selection read twice: once as an ordinary (dense) array, once
## with 'as.sparse=TRUE'.
m <- h5mread(path(M1), "M1", starts=index)
sas <- h5mread(path(M1), "M1", starts=index, as.sparse=TRUE)
class(sas) # SparseArraySeed object (see ?SparseArraySeed)
as(sas, "dgCMatrix")
## Converting the sparse result back to dense must give the dense read.
stopifnot(identical(m, sparse2dense(sas)))
## ---------------------------------------------------------------------
## PERFORMANCE
## ---------------------------------------------------------------------
## Each comparison below reads the same selection with h5mread() and with
## h5read(), times both with system.time(), and checks that the two
## results are identical. The selection is passed to h5mread() as its 3rd
## positional argument ('starts') and to h5read() as 'index'.
library(ExperimentHub)
hub <- ExperimentHub()
## With the "sparse" TENxBrainData dataset
## ---------------------------------------
## NOTE(review): hub[["EH1039"]] presumably retrieves the HDF5 file and
## returns its local path -- confirm against the ExperimentHub docs.
fname0 <- hub[["EH1039"]]
h5ls(fname0) # all datasets are 1D datasets
## 5000 positions drawn with replacement, each a multiple of 77. No seed
## has been set at this point, so the selection differs between runs.
index <- list(77 * sample(34088679, 5000, replace=TRUE))
## h5mread() is about 4x faster than h5read():
system.time(a <- h5mread(fname0, "mm10/data", index))
system.time(b <- h5read(fname0, "mm10/data", index=index))
stopifnot(identical(a, b))
## 7500 positions drawn with replacement (again unseeded).
index <- list(sample(1306127, 7500, replace=TRUE))
## h5mread() is about 20x faster than h5read():
system.time(a <- h5mread(fname0, "mm10/barcodes", index))
system.time(b <- h5read(fname0, "mm10/barcodes", index=index))
stopifnot(identical(a, b))
## With the "dense" TENxBrainData dataset
## --------------------------------------
fname1 <- hub[["EH1040"]]
h5ls(fname1) # "counts" is a 2D dataset
## From here on the random selections are reproducible.
set.seed(33)
## 300 positions along the 1st dimension and 450 along the 2nd, both
## drawn without replacement.
index <- list(sample(27998, 300), sample(1306127, 450))
## h5mread() is about 2x faster than h5read():
system.time(a <- h5mread(fname1, "counts", index))
system.time(b <- h5read(fname1, "counts", index=index))
stopifnot(identical(a, b))
## Alternatively 'as.sparse=TRUE' can be used to reduce memory usage:
system.time(sas <- h5mread(fname1, "counts", index, as.sparse=TRUE))
## The sparse read must expand to exactly the dense result obtained above.
stopifnot(identical(a, sparse2dense(sas)))
## The bigger the selection, the greater the speedup between
## h5read() and h5mread():
## NOTE(review): the "Not run" / "End(Not run)" markers below are only
## comments in this rendering, so the code between them WILL execute if
## the script is sourced as-is; per the timings quoted below it takes
## minutes.
## Not run:
## A larger 1000 x 1000 selection.
index <- list(sample(27998, 1000), sample(1306127, 1000))
## h5mread() about 8x faster than h5read() (20s vs 2m30s):
system.time(a <- h5mread(fname1, "counts", index))
system.time(b <- h5read(fname1, "counts", index=index))
stopifnot(identical(a, b))
## With 'as.sparse=TRUE' (about the same speed as with 'as.sparse=FALSE'):
system.time(sas <- h5mread(fname1, "counts", index, as.sparse=TRUE))
stopifnot(identical(a, sparse2dense(sas)))
## End(Not run)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.