inst/doc/practical_tips.R

## ----setup, echo = FALSE, include=FALSE---------------------------------------
library(rhdf5)
library(dplyr)
library(ggplot2)

## ----create data, echo=TRUE, warning=FALSE------------------------------------
m1 <- matrix(rep(1:20000, each = 100), ncol = 20000, byrow = FALSE)
ex_file <- tempfile(fileext = ".h5")
h5write(m1, file = ex_file, name = "counts", level = 6)

## ----extract1, echo = TRUE----------------------------------------------------
system.time(
  res1 <- h5read(file = ex_file, name = "counts", 
                 index = list(NULL, 1:10000))
)

## ----extract2, echo = TRUE----------------------------------------------------
index <- list(NULL, seq(from = 1, to = 20000, by = 2))
system.time(
  res2 <- h5read(file = ex_file, name = "counts", 
                 index = index)
)

## ----extract3, echo = TRUE----------------------------------------------------
start <- c(1,1)
stride <- c(1,2)
block <- c(100,1)
count <- c(1,10000)
system.time(
  res3 <- h5read(file = ex_file, name = "counts", start = start,
                 stride = stride, block = block, count = count)
)
identical(res2, res3)

## ---- eval = TRUE, echo = FALSE, fig.width=6, fig.height=3, fig.wide = TRUE----
## this code demonstrates the exponential increase in time as the 
## number of hyberslab unions increases

select_index <- function(n = 1) {

  ## open the dataspace for the count table
  fid <- H5Fopen(ex_file)
  did  <- H5Dopen(fid, name = "counts")
  sid <- H5Dget_space(did)
  
  ## column choice based on number of unions required
  columns <- c(head(1:10001, n = -n), head(seq(10001-n+2, 20000, 2), n = n-1))
  index <- list(100, columns)
  H5Sselect_index(sid, index = index)
  
  ## tidy up
  H5Sclose(sid)
  H5Dclose(did)
  H5Fclose(fid)
}

bm <- microbenchmark::microbenchmark(
  select_index(1), select_index(2), select_index(5), 
  select_index(10), select_index(20), select_index(50),
  select_index(100), select_index(200), select_index(500),
  select_index(1000), select_index(2000), select_index(5000),
  select_index(10000),
  times = 3
) %>% mutate(n = gsub(".*\\(([0-9]+)\\)", "\\1", expr) 
             %>% as.integer(),
             time = time / 1e9)

ggplot(bm,aes(x = n, y = time)) +
  geom_point() + 
  scale_x_log10() + 
  scale_y_log10() +
  theme_bw() +
  xlab('number of hyperslab unions') +
  ylab('time (seconds)')

## ----singleReads, cached = TRUE-----------------------------------------------
set.seed(1234)
columns <- sample(x = seq_len(20000), size = 10000, replace = FALSE) %>%
  sort()

f1 <- function(cols, name) { 
  h5read(file = ex_file, name = name, 
         index = list(NULL, cols))
  }
system.time(res4 <- vapply(X = columns, FUN = f1, 
                           FUN.VALUE = integer(length = 100), 
                           name = 'counts'))

## ----createChunked, echo = TRUE, eval = TRUE, results='hide'------------------
h5createDataset(file = ex_file, dataset = "counts_chunked", 
                dims = dim(m1), storage.mode = "integer", 
                chunk = c(100,100), level = 6)
h5write(obj = m1, file = ex_file, name = "counts_chunked")

## ----read_chunked, eval = TRUE------------------------------------------------
system.time(res5 <- vapply(X = columns, FUN = f1, 
                           FUN.VALUE = integer(length = 100), 
                           name = 'counts_chunked'))

## -----------------------------------------------------------------------------
f2 <- function(block_size = 100) {
  cols_grouped <- split(columns,  (columns-1) %/% block_size)
  res <-  lapply(cols_grouped, f1, name = 'counts_chunked') %>%
    do.call('cbind', .)
}
system.time(f2())

## ----benchmark, echo = FALSE, cache = TRUE------------------------------------
bm <- microbenchmark::microbenchmark(
  f2(10), f2(25), f2(50), f2(100), 
  f2(250), f2(500), f2(1000), 
  f2(2000), f2(5000), f2(10000),
  times = 3
) %>% mutate(block_size = gsub(".*\\(([0-9]+)\\)", "\\1", expr) 
             %>% as.integer(),
             time = time / 1e9)

## ---- echo = FALSE, fig.width=6, fig.height=3, fig.wide = TRUE----------------
ggplot(bm, aes(x = block_size, y = time)) + 
  geom_point() + 
  scale_x_log10() +
  theme_bw() + 
  ylab('time (seconds)')

## ---- echo = FALSE, eval = FALSE----------------------------------------------
#  h5createDataset(file = ex_file, dataset = "counts_chunked", dims = dim(m1),
#                  storage.mode = "integer", chunk = c(100,1), level = 6)
#  h5write(obj = m1, file = ex_file, name = "counts_chunked")
#  f2 <- function(cols) { h5read(file = ex_file, name = 'counts_chunked', index = list(NULL, cols))}
#  system.time(res5 <- vapply(columns, f2, integer(length = 100)))

## ---- echo = FALSE, eval = FALSE----------------------------------------------
#  h5createDataset(file = ex_file, dataset = "counts_chunked_100", dims = dim(m1),
#                  storage.mode = "integer", chunk = c(100,100), level = 6)
#  h5write(obj = m1, file = ex_file, name = "counts_chunked_100")
#  
#  f2 <- function(col, fid, dapl, did, sid, mem_sid) {
#      size <- H5Sselect_index(h5space = sid, index = list(NULL, col))
#      res <- H5Dread(did, sid, mem_sid)
#      res
#  }
#  
#  fid <- H5Fopen(ex_file)
#  dapl <- H5Pcreate("H5P_DATASET_ACCESS")
#  did <- H5Dopen(h5loc = fid, name = "counts", dapl = dapl)
#  sid <- H5Dget_space(h5dataset = did)
#  mem_sid <- H5Screate_simple(c(100,1))
#  system.time(res5 <- vapply(columns, f2, integer(length = 100), fid, dapl, did, sid, mem_sid))
#  H5Sclose(sid)
#  H5Sclose(mem_sid)
#  H5Pclose(dapl)
#  H5Dclose(did)
#  H5Fclose(fid)

## ---- echo = FALSE, eval = FALSE----------------------------------------------
#  h5createDataset(file = ex_file, dataset = "counts_chunked", dims = dim(m1),
#                  storage.mode = "integer", chunk = c(100,1), level = 6)
#  h5write(obj = m1, file = ex_file, name = "counts_chunked")
#  
#  f2 <- function(col, fid, dapl, did) {
#      sid <- .Call("_H5Dget_space", did@ID, PACKAGE = "rhdf5")
#      #sid <- H5Dget_space(did)
#      mem_sid <- .Call("_H5Screate_simple", c(100,1), c(100,1), PACKAGE='rhdf5')
#      .Call("_H5Sselect_index", sid, list(col-1,0), list(1,100), PACKAGE='rhdf5')
#      res <- .Call("_H5Dread", did@ID, sid, mem_sid, NULL, TRUE, 0L, FALSE, FALSE, PACKAGE='rhdf5')
#      invisible(.Call("_H5Sclose", sid, PACKAGE='rhdf5'))
#      #H5Sclose(sid)
#      invisible(.Call("_H5Sclose", mem_sid, PACKAGE='rhdf5'))
#      res
#  }
#  
#  fid <- H5Fopen(ex_file)
#  dapl <- H5Pcreate("H5P_DATASET_ACCESS")
#  did <- H5Dopen(h5loc = fid, name = "counts_chunked", dapl = dapl)
#  system.time(res5 <- vapply(columns, f2, double(length = 100), fid, dapl, did))
#  H5Pclose(dapl)
#  H5Dclose(did)
#  H5Fclose(fid)

## ----sessionInfo, echo=FALSE--------------------------------------------------
sessionInfo()

Try the rhdf5 package in your browser

Any scripts or data that you put into this service are public.

rhdf5 documentation built on Nov. 8, 2020, 6:56 p.m.