Nothing
## ----setup, echo = FALSE, include=FALSE---------------------------------------
library(rhdf5)
library(dplyr)
library(ggplot2)
## ----create data, echo=TRUE, warning=FALSE------------------------------------
m1 <- matrix(rep(1:20000, each = 100), ncol = 20000, byrow = FALSE)
ex_file <- tempfile(fileext = ".h5")
h5write(m1, file = ex_file, name = "counts", level = 6)
## ----extract1, echo = TRUE----------------------------------------------------
system.time(
res1 <- h5read(file = ex_file, name = "counts",
index = list(NULL, 1:10000))
)
## ----extract2, echo = TRUE----------------------------------------------------
index <- list(NULL, seq(from = 1, to = 20000, by = 2))
system.time(
res2 <- h5read(file = ex_file, name = "counts",
index = index)
)
## ----extract3, echo = TRUE----------------------------------------------------
start <- c(1,1)
stride <- c(1,2)
block <- c(100,1)
count <- c(1,10000)
system.time(
res3 <- h5read(file = ex_file, name = "counts", start = start,
stride = stride, block = block, count = count)
)
identical(res2, res3)
## ---- eval = TRUE, echo = FALSE, fig.width=6, fig.height=3, fig.wide = TRUE----
## this code demonstrates the exponential increase in time as the
## number of hyberslab unions increases
select_index <- function(n = 1) {
## open the dataspace for the count table
fid <- H5Fopen(ex_file)
did <- H5Dopen(fid, name = "counts")
sid <- H5Dget_space(did)
## column choice based on number of unions required
columns <- c(head(1:10001, n = -n), head(seq(10001-n+2, 20000, 2), n = n-1))
index <- list(100, columns)
H5Sselect_index(sid, index = index)
## tidy up
H5Sclose(sid)
H5Dclose(did)
H5Fclose(fid)
}
bm <- microbenchmark::microbenchmark(
select_index(1), select_index(2), select_index(5),
select_index(10), select_index(20), select_index(50),
select_index(100), select_index(200), select_index(500),
select_index(1000), select_index(2000), select_index(5000),
select_index(10000),
times = 3
) %>% mutate(n = gsub(".*\\(([0-9]+)\\)", "\\1", expr)
%>% as.integer(),
time = time / 1e9)
ggplot(bm,aes(x = n, y = time)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
theme_bw() +
xlab('number of hyperslab unions') +
ylab('time (seconds)')
## ----singleReads, cached = TRUE-----------------------------------------------
set.seed(1234)
columns <- sample(x = seq_len(20000), size = 10000, replace = FALSE) %>%
sort()
f1 <- function(cols, name) {
h5read(file = ex_file, name = name,
index = list(NULL, cols))
}
system.time(res4 <- vapply(X = columns, FUN = f1,
FUN.VALUE = integer(length = 100),
name = 'counts'))
## ----createChunked, echo = TRUE, eval = TRUE, results='hide'------------------
h5createDataset(file = ex_file, dataset = "counts_chunked",
dims = dim(m1), storage.mode = "integer",
chunk = c(100,100), level = 6)
h5write(obj = m1, file = ex_file, name = "counts_chunked")
## ----read_chunked, eval = TRUE------------------------------------------------
system.time(res5 <- vapply(X = columns, FUN = f1,
FUN.VALUE = integer(length = 100),
name = 'counts_chunked'))
## -----------------------------------------------------------------------------
f2 <- function(block_size = 100) {
cols_grouped <- split(columns, (columns-1) %/% block_size)
res <- lapply(cols_grouped, f1, name = 'counts_chunked') %>%
do.call('cbind', .)
}
system.time(f2())
## ----benchmark, echo = FALSE, cache = TRUE------------------------------------
bm <- microbenchmark::microbenchmark(
f2(10), f2(25), f2(50), f2(100),
f2(250), f2(500), f2(1000),
f2(2000), f2(5000), f2(10000),
times = 3
) %>% mutate(block_size = gsub(".*\\(([0-9]+)\\)", "\\1", expr)
%>% as.integer(),
time = time / 1e9)
## ---- echo = FALSE, fig.width=6, fig.height=3, fig.wide = TRUE----------------
ggplot(bm, aes(x = block_size, y = time)) +
geom_point() +
scale_x_log10() +
theme_bw() +
ylab('time (seconds)')
## ---- echo = FALSE, eval = FALSE----------------------------------------------
# h5createDataset(file = ex_file, dataset = "counts_chunked", dims = dim(m1),
# storage.mode = "integer", chunk = c(100,1), level = 6)
# h5write(obj = m1, file = ex_file, name = "counts_chunked")
# f2 <- function(cols) { h5read(file = ex_file, name = 'counts_chunked', index = list(NULL, cols))}
# system.time(res5 <- vapply(columns, f2, integer(length = 100)))
## ---- echo = FALSE, eval = FALSE----------------------------------------------
# h5createDataset(file = ex_file, dataset = "counts_chunked_100", dims = dim(m1),
# storage.mode = "integer", chunk = c(100,100), level = 6)
# h5write(obj = m1, file = ex_file, name = "counts_chunked_100")
#
# f2 <- function(col, fid, dapl, did, sid, mem_sid) {
# size <- H5Sselect_index(h5space = sid, index = list(NULL, col))
# res <- H5Dread(did, sid, mem_sid)
# res
# }
#
# fid <- H5Fopen(ex_file)
# dapl <- H5Pcreate("H5P_DATASET_ACCESS")
# did <- H5Dopen(h5loc = fid, name = "counts", dapl = dapl)
# sid <- H5Dget_space(h5dataset = did)
# mem_sid <- H5Screate_simple(c(100,1))
# system.time(res5 <- vapply(columns, f2, integer(length = 100), fid, dapl, did, sid, mem_sid))
# H5Sclose(sid)
# H5Sclose(mem_sid)
# H5Pclose(dapl)
# H5Dclose(did)
# H5Fclose(fid)
## ---- echo = FALSE, eval = FALSE----------------------------------------------
# h5createDataset(file = ex_file, dataset = "counts_chunked", dims = dim(m1),
# storage.mode = "integer", chunk = c(100,1), level = 6)
# h5write(obj = m1, file = ex_file, name = "counts_chunked")
#
# f2 <- function(col, fid, dapl, did) {
# sid <- .Call("_H5Dget_space", did@ID, PACKAGE = "rhdf5")
# #sid <- H5Dget_space(did)
# mem_sid <- .Call("_H5Screate_simple", c(100,1), c(100,1), PACKAGE='rhdf5')
# .Call("_H5Sselect_index", sid, list(col-1,0), list(1,100), PACKAGE='rhdf5')
# res <- .Call("_H5Dread", did@ID, sid, mem_sid, NULL, TRUE, 0L, FALSE, FALSE, PACKAGE='rhdf5')
# invisible(.Call("_H5Sclose", sid, PACKAGE='rhdf5'))
# #H5Sclose(sid)
# invisible(.Call("_H5Sclose", mem_sid, PACKAGE='rhdf5'))
# res
# }
#
# fid <- H5Fopen(ex_file)
# dapl <- H5Pcreate("H5P_DATASET_ACCESS")
# did <- H5Dopen(h5loc = fid, name = "counts_chunked", dapl = dapl)
# system.time(res5 <- vapply(columns, f2, double(length = 100), fid, dapl, did))
# H5Pclose(dapl)
# H5Dclose(did)
# H5Fclose(fid)
## ----sessionInfo, echo=FALSE--------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.