# -----------------------------------------------------------------------------
# hdfio readers
# -----------------------------------------------------------------------------
# Read one column dataset from an open HDF5 file without any post-processing.
#
# h5_fp:   open file handle; indexed with `[[` by the name that
#          `glue(dataset, varname)` produces
# dataset: name of the group/dataset that holds the column
# varname: name of the column's storage object inside `dataset`
# rows:    NULL to read everything, otherwise the row indices to read
#
# Returns whatever the storage object's `$read()` method yields.
read_atomic_column = function(h5_fp, dataset, varname, rows)
{
  # `$read()` takes NULL for "no selection", or the indices wrapped in a list
  selection = if (is.null(rows)) NULL else list(rows)
  h5_fp[[glue(dataset, varname)]]$read(selection)
}
# Read one column that was stored as a factor and rebuild the factor.
#
# The level labels are taken from the `LEVELS` attribute of the column's
# storage object; the stored values are read with read_atomic_column() and
# then tagged with those levels and the "factor" class.
#
# Arguments are the same as for read_atomic_column().
read_factor_column = function(h5_fp, dataset, varname, rows)
{
  lvls = h5attributes(h5_fp[[glue(dataset, varname)]])$LEVELS
  
  codes = read_atomic_column(h5_fp, dataset, varname, rows)
  levels(codes) = lvls
  class(codes) = "factor"
  
  codes
}
# Read a dataset stored one-object-per-column into a data.frame.
#
# Column j of the dataset lives in the storage object named "x<j>" inside
# `dataset`; the column names come from the `VARNAMES` attribute of `dataset`
# and each column's storage type from its own `CLASS` attribute.
#
# h5_fp:   open file handle
# dataset: name of the dataset (group) to read
# rows:    NULL for all rows, otherwise the row indices to read
# cols:    NULL for all columns, otherwise the column indices to read
# strings: only consulted when `cols` is NULL; if not TRUE, columns that
#          h5_is_string() reports as strings are skipped
#
# Returns a data.frame with one column per selected index. Closes the file and
# stops if `cols` exceeds the number of columns or a column carries an
# unrecognised `CLASS` attribute.
read_h5df_column = function(h5_fp, dataset, rows, cols, strings)
{
  varnames = h5attributes(h5_fp[[dataset]])$VARNAMES
  
  if (is.null(cols))
  {
    cols = seq_along(varnames)
    
    if (!isTRUE(strings))
    {
      # Probe each column through its own "x<j>" name so that the resulting
      # indices are real column numbers. Positions in a dataset listing are
      # not a safe substitute: a name-ordered listing ("x1", "x10", "x2", ...)
      # does not line up with the numbering once there are 10+ columns.
      is_string = vapply(
        cols,
        function(j) h5_is_string(h5_fp, dataset, paste0("x", j)),
        logical(1)
      )
      cols = cols[!is_string]
    }
  }
  else if (max(cols) > length(varnames))
    close_and_stop(h5_fp, "some 'cols' indices larger than the number of columns in the dataset")
  
  colnames = varnames[cols]
  
  x = vector(mode="list", length=length(cols))
  names(x) = colnames
  
  # seq_along() so that an empty selection yields an empty data.frame instead
  # of iterating over c(1, 0)
  for (j in seq_along(cols))
  {
    nm = paste0("x", cols[j])
    storage = h5attributes(h5_fp[[glue(dataset, nm)]])$CLASS
    
    if (length(storage) == 0)
    {
      # no CLASS attribute: the stored values are used as they are
      x[[j]] = read_atomic_column(h5_fp, dataset, nm, rows)
    }
    else if (storage == H5_STORAGE_STR)
    {
      # missing strings are stored as the literal text "NA"
      col = read_atomic_column(h5_fp, dataset, nm, rows)
      col[col == "NA"] = NA_character_
      x[[j]] = col
    }
    else if (storage == H5_STORAGE_LGL)
    {
      col = read_atomic_column(h5_fp, dataset, nm, rows)
      class(col) = "logical"
      x[[j]] = col
    }
    else if (storage == H5_STORAGE_DATE)
    {
      # stored values are offsets from the Unix epoch
      col = read_atomic_column(h5_fp, dataset, nm, rows)
      x[[j]] = as.POSIXct(col, origin="1970-01-01 00:00.00 UTC")
    }
    else if (storage == H5_STORAGE_FAC)
      x[[j]] = read_factor_column(h5_fp, dataset, nm, rows)
    else
      close_and_stop(h5_fp, INTERNAL_ERROR)
  }
  
  # turn the named list into a data.frame in place
  data.table::setDF(x)
  x
}
# Read a dataset whose rows are stored together in the single object "data"
# inside `dataset`.
#
# h5_fp:   open file handle
# dataset: name of the dataset (group) to read
# rows:    NULL for all rows, otherwise the row indices to read
#
# Returns the result of indexing the "data" object, either in full or
# restricted to `rows`.
read_h5df_compound = function(h5_fp, dataset, rows)
{
  # Only the "data" object is needed. The dataset listing, string detection
  # and VARNAMES lookup that used to precede this were never used, so they
  # are no longer read from the file.
  data = h5_fp[[glue(dataset, "data")]]
  
  if (is.null(rows))
    data[]
  else
    data[rows]
}
# -----------------------------------------------------------------------------
# pytables readers
# -----------------------------------------------------------------------------
# pandas.HDFStore.put(format="fixed")
#
# Read a pandas/pytables 'fixed' format dataset into a data.frame.
#
# The members of `dataset` whose names contain "values" hold blocks of
# columns, and the matching members whose names contain "items" hold the
# column names of each block. "axis0" supplies the column names of the whole
# table and "axis1" the row labels.
#
# h5_fp:   open file handle
# dataset: name of the dataset (group) to read
# rows:    NULL for all rows, otherwise the row indices to read
#
# Returns a data.frame whose row names are the "axis1" labels of the rows that
# were read. Closes the file and stops if a member named like "block2" is
# present among the value blocks.
read_pytables_fixed = function(h5_fp, dataset, rows)
{
  members = list.datasets(h5_fp[[dataset]])
  items = members[grep("items", members)]
  columns = members[grep("values", members)]
  
  if (any(grepl("block2", columns)))
    close_and_stop(h5_fp, "file has string data written from pandas/pytables in 'fixed' format, which can not be portably read. Please re-write with format='table'")
  
  colnames = h5_fp[[glue(dataset, "axis0")]][]
  rownames = h5_fp[[glue(dataset, "axis1")]][]
  # The row labels must follow the same selection as the data; otherwise
  # their number (or order) no longer matches the rows of the result.
  if (!is.null(rows))
    rownames = rownames[rows]
  
  df = vector(mode="list", length=length(colnames))
  names(df) = colnames
  
  for (ind in seq_along(columns))
  {
    values = h5_fp[[glue(dataset, columns[ind])]]
    # column names of this block, read once instead of once per column
    block_items = h5_fp[[glue(dataset, items[ind])]][]
    
    # the first dimension of a value block indexes the columns it contains
    for (j_block in seq_len(values$dims[1]))
    {
      if (is.null(rows))
        col = values[j_block, ]
      else
        col = values[j_block, rows]
      
      # place the column under its own name in the pre-named list
      df[[block_items[j_block]]] = col
    }
  }
  
  data.table::setDF(df, rownames=rownames)
  df
}
# pandas.HDFStore.put(format="table")
#
# Read a pandas/pytables 'table' format dataset.
#
# The rows live in the "table" object inside `dataset`. It is read in full
# when `rows` is NULL and restricted to `rows` otherwise, after which the
# `index` column is removed from the result.
read_pytables_table = function(h5_fp, dataset, rows)
{
  tbl = h5_fp[[glue(dataset, "table")]]
  df = if (is.null(rows)) tbl[] else tbl[rows]
  
  # drop the stored index column; only the data columns are returned
  df$index = NULL
  df
}
# -----------------------------------------------------------------------------
# interface
# -----------------------------------------------------------------------------
#' read_h5df
#'
#' Read a dataset from an HDF5 file into a dataframe.
#'
#' @details
#' The file is opened read-only and the storage format of the dataset is
#' detected before reading. The formats handled are 'hdfio_column',
#' 'hdfio_compound', 'pytables_table' and 'pytables_fixed'; any other detected
#' format is an error.
#'
#' The arguments \code{cols} and \code{strings=FALSE} are only accepted for
#' 'hdfio_column' format files, and they can not be combined: when \code{cols}
#' is given, \code{strings} must be \code{TRUE}.
#'
#' The file is closed again before the function returns, whether it returns
#' normally or with an error.
#'
#' @param h5in
#' Input file.
#' @param dataset
#' Name of the data within the HDF5 file. If none is supplied, then this will be
#' inferred from the input file name.
#' @param rows
#' Indices of the rows to read, as a non-empty vector of positive integers, or
#' \code{NULL} to read all rows.
#' @param cols
#' Only available for 'hdfio_column' format files. Indices of the columns to
#' read, as a non-empty vector of positive integers, or \code{NULL} to read all
#' columns.
#' @param strings
#' Only available for 'hdfio_column' format files. Should string columns be read?
#' @param verbose
#' Flag passed on to the format detection step.
#'
#' @return
#' A dataframe.
#'
#' @export
read_h5df = function(h5in, dataset=NULL, rows=NULL, cols=NULL, strings=TRUE, verbose=FALSE)
{
  check.is.string(h5in)
  check.file(h5in, h5=TRUE)
  check.is.flag(strings)
  check.is.flag(verbose)
  if (!is.null(dataset))
    check.is.string(dataset)
  
  if (!is.null(rows))
  {
    if (length(rows) == 0 || !all(is.inty(rows)) || any(rows < 1))
      stop("argument 'rows' must be a vector of positive integers")
  }
  if (!is.null(cols))
  {
    if (length(cols) == 0 || !all(is.inty(cols)) || any(cols < 1))
      stop("argument 'cols' must be a vector of positive integers")
  }
  
  h5_fp = h5file(h5in, mode="r")
  # Safety net for errors raised inside a reader that do not go through
  # close_and_stop(): close the handle if it is still open at exit. The
  # validity check keeps this from closing a handle a second time.
  on.exit({
    if (isTRUE(tryCatch(h5_fp$is_valid, error=function(e) FALSE)))
      h5close(h5_fp)
  }, add=TRUE)
  
  dataset = h5_get_dataset(h5_fp, dataset)
  fmt = h5_detect_format(h5_fp, dataset, verbose)
  
  # 'cols' and 'strings=FALSE' are features of the column format only
  if (!is.null(cols) && fmt != "hdfio_column")
    close_and_stop(h5_fp, "argument 'cols' can only be a vector of indices if format is hdfio_column")
  if (!isTRUE(strings) && fmt != "hdfio_column")
    close_and_stop(h5_fp, "argument 'strings' must be TRUE if format is not hdfio_column")
  if (!is.null(cols) && !isTRUE(strings))
    close_and_stop(h5_fp, "must have 'strings=TRUE' when columns are specified via 'cols'")
  
  # dispatch to the reader for the detected format
  if (fmt == "hdfio_column")
    df = read_h5df_column(h5_fp, dataset, rows, cols, strings)
  else if (fmt == "pytables_table")
    df = read_pytables_table(h5_fp, dataset, rows)
  else if (fmt == "pytables_fixed")
    df = read_pytables_fixed(h5_fp, dataset, rows)
  else if (fmt == "hdfio_compound")
    df = read_h5df_compound(h5_fp, dataset, rows)
  else
    close_and_stop(h5_fp, "unknown format")
  
  h5close(h5_fp)
  df
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.