knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  collapse = TRUE,
  comment = "#>"
)
library(httr)
library(hdf5r)
library(Matrix)
library(SeuratDisk)
hfile <- H5File$new(filename = tempfile(fileext = ".h5"), mode = "a")
arg.match <- function(arg, choices) {
  arg <- tryCatch(
    expr = match.arg(arg = arg, choices = choices),
    error = function(e) {
      if (sum(grepl(pattern = arg, x = choices)) == 1) {
        return(grep(pattern = arg, x = choices, value = TRUE))
      }
      stop(e$message, "; got ", arg, call. = FALSE)
    }
  )
  return(arg)
}

hlink <- function(arg) {
  choices <- c(
    paste0(
      unique(x = c(
        "H5T_COMPOUND",
        sapply(
          X = grep(pattern = '^H5T_', x = names(x = h5types), value = TRUE),
          FUN = function(i) {
            return(class(x = h5types[[i]])[1])
          },
          USE.NAMES = FALSE
        )
      )),
      "-class"
    )
  )
  arg <- arg.match(arg = arg, choices = choices)
  return(paste0("https://hhoeflin.github.io/hdf5r/reference/", arg, ".html"))
}

plink <- function(arg) {
  eth <- function(pkg, f = NULL) {
    if (is.null(x = f)) {
      f <- paste0(pkg, "-package")
    }
    return(paste0(
      "https://stat.ethz.ch/R-manual/R-devel/library/",
      pkg,
      "/html/",
      f,
      ".html"
    ))
  }
  rdrr <- function(pkg, f) {
    return(paste0("https://rdrr.io/cran/", pkg, "/man/", f, ".html"))
  }
  if (grepl(pattern = "-package$", x = arg)) {
    pkg <- gsub(pattern = "-package$", replacement = "", x = arg)
  } else if (grepl(pattern = "-class$", x = arg)) {
    cls <- gsub(pattern = "-class$", replacement = "", x = arg)
    pkg <- getClass(Class = cls)@package
  } else {
    pkg <- grep(pattern = "package", x = getAnywhere(x = arg)$where, value = TRUE)[1]
    if (is.na(x = pkg)) {
      stop("not found")
    }
    pkg <- gsub(pattern = "^package:", replacement = "", x = pkg)
  }
  if (GET(url = eth(pkg = pkg))$status_code == 200L) {
    return(eth(pkg = pkg,f = arg))
  }
  return(rdrr(pkg = pkg, f = arg))
}

vlink <- function(arg) {
  choices <- c(
    "character-representation",
    "dense-matrix-representation",
    "sparse-matrix-representation",
    "factor-representation",
    "logical-representation",
    "data-frame-representation",
    "data-frame-datasets",
    "data-frame-groups",
    "generic-s3-object-representation",
    "generic-s4-object-representation",
    "list-and-custom-class-representation"
  )
  arg <- arg.match(arg = arg, choices = choices)
  return(paste0("#", arg))
}

Overall File Structure

Required Attributes

There are three required top-level HDF5 attributes: "project", "active.assay", and "version". Each of these must be a single r vlink("char")>character. The "project" attribute corresponds to the project value of a Seurat object; the "active.assay" attribute is the name of the default assay and must be present in the "assays" group. The "version" corresponds to the version of Seurat that the h5Seurat file is based on.

Top-Level Datasets and Groups

There are two required top-level HDF5 datasets: "cell.names" and "meta.data". The "cell.names" dataset should be a one-dimensional r vlink("char")>character dataset, with a length equal to the number of cells present in the data. Cell names are not stored anywhere else in the h5Seurat file.

The "meta.data" dataset contains cell-level metadata. It should be stored as either an r vlink("data-frame-rep")>HDF5 dataset or group, depending on the contents of the meta data. See the r vlink("data-frame-rep")>data frame representation for more details.

Assay Expression Data

r plink("Assay-class")>Assay objects are stored in the top-level group "assays"; each assay is stored as its own group within the "assays" group. Within each assay group, there must be a dataset named "features" and either a dataset or group named "data"; the "features" dataset must be a one-dimensional r vlink("char")>character dataset with a length equal to the number of total features within the assay. The "data" entry is a matrix, with dimensions of $m_{features} x n_{cells}$; this entry may be either a dataset, if "data" is a r vlink("dense")>dense matrix, or a group, if "data" is a r vlink("sparse")>sparse matrix. Assay groups must also have an attribute named "key"; this is a single r vlink("char")>character value.

Assay groups may also have the following optional groups and datasets:

Subclasses of r plink("Assay-class")>Assay objects must also follow the same rules as r vlink("list")>custom S4 classes.

Dimensional Reductions

r plink("DimReduc-class")>Dimensional reduction information is stored in the top-level group "reductions"; each dimensional reduction is stored as its own group within the "reductions" group. Within each dimensional reduction group, there are three required attributes: "active.assay", "key", and "global"; "active.assay" must be one or more r vlink("char")>character values where each value is a name of an assay, "key" must be a single a r vlink("char")>character value, and "global" must be a single r vlink("log")>logical value. In addition, there must also be a dataset named "cell.embeddings" representing a r vlink("dense")>dense matrix. This matrix must have the same number of rows as cells present in the h5Seurat file.

Dimensional reduction groups may also have the following optional groups and datasets:

Nearest-Neighbor Graphs

r plink("Graph-class")>Nearest-neighbor graphs are stored in the top-level group "graphs"; each graph is stored as its own group within the "graphs" group. Graph names become graph group names. Graphs are stored as r vlink("sparse")>sparse matrices with an additional HDF5 attribute: "assay.used". This HDF5 attribute should be a single r vlink("char")>character value.

Spatial Image Data

Spatial image data is stored in the top-level group "images"; each image is stored as its own group within the "images" group. Actual structure of the image group is dependent on the structure of the spatial image data. However, it follows the same rules as r vlink("list")>custom S4 classes.

Note: spatial images are only supported in objects that were generated by a version of Seurat that has spatial support. Currently, this is restricted to version r SeuratDisk:::spatial.version or higher.

Command Logs

Miscellaneous Information and Tool-Specific Results

Miscellaneous information is stored in the top-level group "misc"; this group follows the same runs as r vlink("list")>lists. The "misc" group is required to be present, but not required to be filled.

Tool-specific results are stored in the top-level group "tools"; this group follows the same runs as r vlink("list")>lists. The "tools" group is required to be present, but not required to be filled.

Common Data Structures

Some data types are found commonly throughout Seurat objects

Character Representation

All r plink("character")>character values (strings in other languages) should be encoding as variable-length UTF-8 strings; this applies to HDF5 datasets (including standalone r hlink("STRING")>string datasets as well as parts of r hlink("COMPOUND")>compound datasets) and HDF5 attributes.

SeuratDisk:::StringType(stype = "utf8")

Dense Matrix Representation

Dense matrices should be stored as a two-dimensional dataset of any type. Datasets should be written in a column-major order. For column-major implementations (eg. R, Fortran), dataset dimensions on-disk should be the same as dimensions in-memory (eg. $m_{diskrow} x n_{diskcol} \sim m_{memrow} x n_{memcol}$). For row-major implmentations (eg. C/C++, Python), dataset dimensions on-disk should appear transposed to dimensions in-memory (eg. $m_{diskrow} x n_{diskcol} \sim n_{memrow} x m_{memcol}$); row-major implmemetnations transpose datasets prior to reading and writing data.

vals <- c(0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0)
mat <- matrix(data = vals, nrow = 3)
WriteH5Group(x = mat, name = "densemat", hgroup = hfile, verbose = FALSE)
mat
hfile[["densemat"]]
hfile[["densemat"]][,]

Sparse Matrix Representation

Sparse matrices are stored as an HDF5 group with three datasets: "indices", "indptr", and "data"; the "indices" and "data" datasets must be the same length. "data" represents each non-zero element of the matrix. "indices" represents the $0$-based row numbers for each value in "data"

mat <- Matrix(data = vals, nrow = 3, sparse = TRUE)
WriteH5Group(x = mat, name = "sparsemat", hgroup = hfile, verbose = FALSE)
mat
hfile[["sparsemat"]]
hfile[["sparsemat/indices"]]
hfile[["sparsemat/indices"]][]
hfile[["sparsemat/data"]]
hfile[["sparsemat/data"]][]

"indptr" represents the points in "data" at which a new column is started. This dataset is $0$-based and should be $n_{columns} + 1$ in length.

hfile[["sparsemat/indptr"]]
hfile[["sparsemat/indptr"]][]

The "indices", "indptr", and "data" datasets correspond to the "i", "p", and "x" slots in a r plink("dgCMatrix-class")>dgCMatrix, respectively.

There may optionally be an HDF5 attribute called "dims"; this attribute should be a two r hlink("INTEGER")>integer values corresponding to the number of rows and number of columns, in that order, in the sparse matrix.

hfile[["sparsemat"]]$attr_open(attr_name = "dims")$read()

Factor Representation

r plink("factor")>Factors should be stored as an HDF5 group with two datasets: "levels" and "values"

fctr <- factor(x = c("g1", "g2", "g1", "g1", "g2"))
WriteH5Group(x = fctr, name = "factor", hgroup = hfile, verbose = FALSE)
fctr
hfile[["factor"]]

The "levels" dataset should be a r vlink("char")>character dataset with one entry per r plink("levels")>level

hfile[["factor/levels"]]
hfile[["factor/levels"]][]

The "values" dataset should be an integer dataset with one entry per value in the original factor. These integers should correspond to the factor level they had in R

hfile[["factor/values"]]
hfile[["factor/values"]][]

The number of unique entries in "values" should not exceed the number of unique entries in "levels"

Rationale Storing factors in this manner seems excessive from an R perspective. HDF5 has the concept of r hlink("ENUM")>enumerated datasets (enums) which are an efficient way to store R factors on-disk. However, some implementations of HDF5 do not support enums and thus lose factor level information. In order to make h5Seurat as cross-language as possible, we’ve opted to store factors as HDF5 groups instead of HDF5 enums.

Logical Representation

r plink("logical")>Logical values (booleans in other languages) are encoded as integers in the following manner: FALSE is encoded as 0, TRUE is encoded as 1, and NA is encoded as 2

vals <- c(TRUE, FALSE, NA)
WriteH5Group(x = vals, name = "logicals", hgroup = hfile, verbose = FALSE)
vals
hfile[["logicals"]]
hfile[["logicals"]][]

An optional HDF5 attribute named "s3class" with the value "logical" is allowed to enforce reading in the dataset as logical values. This HDF5 attribute is a single r vlink("char")>character value.

Rationale

Unlike most languages, r plink("logical")>logicals (or booleans) can take one of three values: TRUE, FALSE, or NA; as such, an extra integer value is needed to handle the additional logical value. Typically, these values are stored as r hlink("ENUM")>enums with mappings between the logical representation and integer value. However, some implementations of HDF5 do not support enums and thus lose the logical representation. Since the mappings are lost, all logicals are stored as integers instead.

Data Frame Representation

There are two ways of storing r plink("data.frame")>data frames in h5Seurat files: as r vlink("datasets")>datasets or as r vlink("groups")>groups. Data frame groups are required when data frames contain r plink("factor")>factors; if no factors are present, data frames can be stored in either type.

Rationale r vlink("factor")>Storing factors on-disk in an HDF5 file poses a unique set of challenges. Namely, r hlink("ENUM")>enumerated datasets (enums), which are an ideal method of storing mapping values, are not supported in some implementations of HDF5. As factor level information is lost under implementations that do not support HDF5 enums, we need a method of storing factors in a cross-language manner. Two options were presented: groups or r hlink("COMPOUND")>compound datasets. While the former seems excessive, the latter presents issues with unequal dataset length. Therefore, to accomodate factor level information in data frames, we utilize HDF5 groups to store data frames when one or more columns are r plink("factor")>factors.

Data Frame Datasets

Data frames stored as datasets should be a one-dimensional r hlink("COMPOUND")>compound dataset. The single dimension should be equal to the number of observations (number of rows) in the data frame. Each data type in the compound dataset must adhere to the same requirements as standard datasets (eg. r vlink("char")>character encodings, r vlink("log")>logical mapping, etc). The compound labels should correspond to the data frame column names.

op <- options(SeuratDisk.dtypes.dataframe_as_group = FALSE)
df <- data.frame(
  x = c("g1", "g1", "g2", "g1", "g2"),
  y = 1:5,
  stringsAsFactors = FALSE
)
df
WriteH5Group(x = df, name = "dfdataset", hgroup = hfile)
hfile[["dfdataset"]]
hfile[["dfdataset"]][]
options(op)

Row names are not stored with the dataset itself, but may be stored elsewhere in the h5Seurat file, typically named dataset_name.row.names; an optional HDF5 attribute called "logicals" containing the names of r plink("logical")>logical columns is allowed. This attribute consists of r vlink("char")>character values.

Data Frame Groups

Data frames stored as groups are used when r plink("factor")>factors are present in the data frame. Within the data frame group, there should be one dataset or group per column. Columns that are factors are r vlink("factor")>stored as groups while all other columns are stored as one-dimensional datasets. Each dataset within the group must adhere to the same requirements as standard datasets (eg. r vlink("char")>character encodings, r vlink("log")>logical mapping, etc). The names of the datasets within the group correspond to the data frame column names

df$x <- factor(x = df$x)
WriteH5Group(x = df, name = "dfgroup", hgroup = hfile)
hfile[["dfgroup"]]

Data frame row names may be stored in a dataset called "row.names" within the group; this dataset should be a one-dimensional r vlink("char")>character dataset. There are two optional attributes allowed: "colnames" and "logicals"; the "colnames" attribute contains the column names in the same order as was present in the in-memory r plink("data.frame")>data frame as r vlink("char")>character values. This is used to control column order when reading the data frame back into memory. Note, the "colnames" attribute does not need to contain the name of every dataset.

The "logicals" attribute contains the names of r plink("logical")>logical columns; this attribute should consist of r vlink("char")>character values.

List and Custom Class Representation

r plink("list")>Lists are stored as HDF5 groups. Each entry in a list must be named; the names serve as the names of datasets and groups within the list group. List values are stored as HDF5 datasets or groups, depending on their R object type. For example, a list within a list would be stored as a group within the first group.

l <- list(a = 1:3, b = list(b1 = "hello", b2 = c("tomato", "potato")))
l
WriteH5Group(x = l, name = "list", hgroup = hfile)
hfile[["list"]]
hfile[["list/b"]]

Custom classes are stored as r vlink("list")>lists with a special HDF5 attribute. S3 classes are stored with the attribute "s3class", which has a value equal to the class of the object. This attribute is a variable-length r vlink("char")>character value.

Custom S4 classes are also stored as r vlink("list")>lists where each entry is a slot in the S4 class. S4 class groups have an attribute named "s4class"; this attribute should be a single-length r vlink("char")>character storing the name of the class and the package that defines the class in the form of package:class (eg. Signac:ChromatinAssay); custom S4 classes defined in the Seurat package can be named with just the class of the object.

hfile$close_all()
file.remove(hfile$filename)


mojaveazure/seurat-disk documentation built on Nov. 5, 2023, 9:40 a.m.