R/hdf5-functions.R

Defines functions h5Overwrite h5Backup h5Move h5Copy is.H5Group is.H5D h5Class h5TryOpen h5GuessDtype h5AbsLinkName

Documented in h5AbsLinkName h5Backup h5Class h5Copy h5GuessDtype h5Move h5Overwrite h5TryOpen is.H5D is.H5Group

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Functions ####################################################################
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

## HDF5 link name ==============================================================

#' Format an absolute path name for HDF5 link
#' 
#' @param name String representing an expected name of HDF5 link.
#' 
#' @details
#' If \code{name} contains any of "", \code{NA} or \code{NULL}, will simply 
#' return \code{"/"}.
#' 
#' @return An update \code{name} starting with '/'.
#' 
#' @examples
#' h5AbsLinkName("ggg")
#' h5AbsLinkName("ggg/ddd")
#' h5AbsLinkName(NA)
#' h5AbsLinkName("")
#' h5AbsLinkName(NULL)
#' 
#' @importFrom easy.utils isValidCharacters
#' 
#' @export
h5AbsLinkName <- function(name) {
  name <- name[1]
  if (!any(isValidCharacters(x = name))) {
    name <- ""
  }
  root <- substr(x = name, start = 1, stop = 1)
  if (root != "/") {
    name <- paste0("/", name)
  }
  name <- gsub(pattern = "\\/+", replacement = "\\/", x = name)
  return(name)
}

## HDF5 datatype ===============================================================

#' Guess an HDF5 Datatype
#'
#' Wrapper around \code{\link[hdf5r:guess_nelem]{hdf5r::guess_dtype}}, allowing 
#' for the customization of string types such as utf-8 rather than defaulting to 
#' variable-length ASCII-encoded strings.
#'
#' @param x The object for which to guess the HDF5 datatype
#' @param stype 'utf8' or 'ascii7'
#' @param ... Arguments passed to \code{hdf5r::guess_dtype}
#' 
#' @return An object of class \code{\link[hdf5r]{H5T}}
#'
#' @seealso \code{\link[hdf5r:guess_nelem]{guess_dtype}}
#' 
#' @references 
#' \url{https://github.com/mojaveazure/seurat-disk/blob/163f1aade5bac38ed1e9e9c9
#' 12283a7e74781610/R/zzz.R}
#' 
#' @examples
#' h5GuessDtype(0)
#' h5GuessDtype("abc")
#' 
#' @importFrom hdf5r guess_dtype h5const H5T_STRING
#' @export
h5GuessDtype <- function(x, stype = c('utf8', 'ascii7'), ...) {
  stype <- match.arg(arg = stype)
  dtype <- guess_dtype(x = x, ...)
  if (!inherits(x = dtype, what = 'H5T_STRING')) {
    return(dtype)
  }
  return(switch(
    EXPR = stype,
    'utf8' = H5T_STRING$new(size = Inf)$set_cset(cset = h5const$H5T_CSET_UTF8),
    'ascii7' = H5T_STRING$new(size = 7L)
  ))
}

## Open an HDF5 file ===========================================================

#' Automatically retry opening HDF5 file
#' 
#' Helper function to open an HDF5 file. When the opening fails, will retry it 
#' until reach a timeout.
#' 
#' @param filename An HDF5 file to open
#' @param mode How to open it: 
#' \itemize{
#' \item \code{a} creates a new file or opens an existing one for read/write. 
#' \item \code{r} opens an existing file for reading
#' \item \code{r+} opens an existing file for read/write. 
#' \item \code{w} creates a file, truncating any existing ones.
#' \item \code{w-} and \code{x} are synonyms, creating a file and failing if it 
#' already exists.
#' }
#' @param timeout Positive integer. The timeout for retrying.
#' @param interval Positive integer. The interval seconds of retrying.
#' @param ... Arguments passed to \code{H5File$new()}
#' 
#' @details
#' \code{timeout} and \code{interval} must be positive. Otherwise no retrying, 
#' which is default setting.
#' 
#' @return 
#' When \code{file} is opened successfully, an \code{\link[hdf5r]{H5File}} will 
#' be returned. Otherwise, will keep retrying. When a timeout is reached, will 
#' raise an error and terminate the current R session.
#' 
#' @seealso \code{\link[hdf5r]{H5File}} for \code{mode}
#' 
#' @examples
#' file <- system.file("extdata", "pbmc_small.h5ad", package = "hdf5r.Extra")
#' h5fh <- h5TryOpen(file, mode = "r")
#' h5fh
#' h5fh$close_all()
#' 
#' @importFrom hdf5r H5File
#' @export
h5TryOpen <- function(
    filename, 
    mode = c("a", "r", "r+", "w", "w-", "x"),
    timeout = getOption(x = "h5TryOpen.timeout", default = 0),
    interval = getOption(x = "h5TryOpen.interval", default = 0),
    ...
) {
  mode <- match.arg(arg = mode)
  do.retry <- timeout > 0 && interval > 0
  if (mode %in% c("r", "r+")) {
    filename <- file_path_as_absolute(x = filename)
  }
  return(tryCatch(
    expr = H5File$new(filename = filename, mode = mode, ...),
    error = function(e) {
      message("Open file '", filename, "' failed: \n")
      if (!do.retry) {
        stop(e)
      }
      message("Keep retrying per minute with 'timeout' set to ", timeout, " s")
      time <- 0
      while(time < timeout) {
        Sys.sleep(time = interval)
        h5fh <- tryCatch(
          expr = H5File$new(filename = filename, mode = mode, ...),
          error = function(e) NULL
        )
        if (inherits(x = h5fh, what = "H5File")) {
          return(h5fh)
        }
        time <- time + interval
      }
      stop("Reach a timeout after ", time, " s. Cannot open '", filename, "'")
    }
  ))
}

## h5Class =====================================================================

#' Get the class of an HDF5 link
#' 
#' Functions to get or check the class of an HDF5 link.
#' 
#' @param file An existing HDF5 file
#' @param name Name of a link in \code{file}
#' 
#' @name h5Class
NULL

#' @returns 
#' \code{h5Class} returns a character specifying the class of the query HDF5 
#' link (typically H5D, H5Group or H5File).
#' 
#' @examples
#' file <- system.file("extdata", "pbmc_small.h5ad", package = "hdf5r.Extra")
#' h5Class(file, "X")
#' h5Class(file, "obs")
#' is.H5D(file, "X")
#' is.H5Group(file, "obs")
#' 
#' @export
#' @rdname h5Class
h5Class <- function(file, name) {
  h5obj <- h5Open(x = file, name = name, mode = "r")
  on.exit(expr = h5obj$close())
  cls <- class(x = h5obj)
  return(cls[1])
}

#' @returns 
#' \code{is.H5D} and \code{is.H5Group} return a logical value.
#' 
#' @importFrom hdf5r H5D
#' @export
#' @rdname h5Class
is.H5D <- function(file, name) {
  return(.h5_is_a(file = file, name = name, what = "H5D"))
}

#' @importFrom hdf5r H5Group
#' @export
#' @rdname h5Class
is.H5Group <- function(file, name) {
  return(.h5_is_a(file = file, name = name, what = "H5Group"))
}

## Move and copy HDF5 links ====================================================

#' Copy an HDF5 link
#' 
#' Copy an HDF5 link from one file to another file.
#'
#' @param from.file The source HDF5 file.
#' @param from.name The source link name.
#' @param to.file The target HDF5 file.
#' @param to.name The destination HDF5 link name.
#' @param overwrite Whether or not to overwrite the existing link.
#' @param verbose Print progress.
#' @param ... Arguments passed to \code{H5File$obj_copy_from()}
#' 
#' @seealso \code{\link[hdf5r]{H5File}}
#' 
#' @note
#' \itemize{
#' \item Copying can still work even if the \code{to.file} is actually identical 
#' to the \code{from.file}.
#' \item Attributes of \code{from.name} will be kept, while those of its parent 
#' H5Groups will not.
#' }
#' 
#' @returns 
#' This is an operation function and no return. Any failure should raise an 
#' error.
#' 
#' @examples
#' file <- system.file("extdata", "pbmc_small.h5ad", package = "hdf5r.Extra")
#' to.file <- tempfile(fileext = ".h5")
#' 
#' # Copy a link to a new file
#' h5Copy(file, "obs", to.file, "obs")
#' obs <- h5Read(file, "obs")
#' obs2 <- h5Read(to.file, "obs")
#' 
#' # The parent link (H5Group) will be created automatically
#' h5Copy(file, "obsm/tsne", to.file, "obsm/tsne")
#' obsm <- h5Read(to.file, "obsm")
#' 
#' # Copy the whole file
#' x <- h5Read(file)
#' h5Copy(file, "/", to.file, "/", overwrite = TRUE)
#' x2 <- h5Read(to.file)
#' 
#' @export
h5Copy <- function(
    from.file,
    from.name,
    to.file,
    to.name,
    overwrite = FALSE,
    verbose = TRUE,
    ...
) {
  from.file <- file_path_as_absolute(x = from.file)
  from.name <- h5AbsLinkName(name = from.name)
  to.file <- normalizePath(path = to.file, mustWork = FALSE)
  to.name <- h5AbsLinkName(name = to.name)
  verboseMsg(
    "h5Copy: ",
    "\n  Source file: ", from.file,
    "\n  Destination file: ", to.file,
    "\n  Source name: ", from.name,
    "\n  Destination name: ", to.name
  )
  if (from.file == to.file) {
    return(.h5copy_same_file(
      h5.file = from.file, 
      from.name = from.name, 
      to.name = to.name, 
      overwrite = overwrite, 
      verbose = verbose,
      ...
    ))
  }
  return(.h5copy_different_file(
    from.file = from.file,
    from.name = from.name,
    to.file = to.file,
    to.name = to.name,
    overwrite = overwrite,
    verbose = verbose,
    ...
  ))
}

#' Move link in an HDF5 file
#' 
#' Move one HDF5 link to another position within the same file.
#' 
#' @param file An HDF5 file.
#' @param from.name Name of the source link.
#' @param to.name Name of the destination link.
#' @param overwrite When \code{to.name} already exists, whether or not to 
#' overwrite it.
#' @param verbose Print progress.
#' @param ... Arguments passed to \code{H5File$link_move_from()}
#' 
#' @seealso \code{\link[hdf5r]{H5File}}
#' 
#' @returns 
#' This is an operation function and no return. Any failure should raise an 
#' error.
#' 
#' @examples
#' file <- system.file("extdata", "pbmc_small.h5ad", package = "hdf5r.Extra")
#' to.file <- tempfile(fileext = ".h5")
#' file.copy(file, to.file)
#' 
#' obs <- h5Read(to.file, "obs")
#' h5Move(to.file, "obs", "obs2")
#' obs2 <- h5Read(to.file, "obs2")
#' 
#' # Move an object to an existing link
#' h5Move(to.file, "obs2", "var")  # Warning
#' h5Move(to.file, "obs2", "var", overwrite = TRUE)
#' 
#' # Move a non-existing object will raise an error
#' try(h5Move(to.file, "obs", "obs3"))
#' 
#' @export
h5Move <- function(
    file,
    from.name,
    to.name,
    overwrite = FALSE,
    verbose = TRUE,
    ...
) {
  from.name <- h5AbsLinkName(name = from.name)
  to.name <- h5AbsLinkName(name = to.name)
  verboseMsg(
    "h5Move: ",
    "\n  File: ", file,
    "\n  Source name: ", from.name,
    "\n  Destination name: ", to.name
  )
  if (from.name == to.name) {
    warning(
      "The source name and the destination name are identical.",
      immediate. = TRUE
    )
    return(invisible(x = NULL))
  }
  h5fh <- h5TryOpen(filename = file, mode = "r+")
  on.exit(expr = h5fh$close())
  if (!h5Exists(x = h5fh, name = from.name)) {
    stop("Cannot move a non-existing object: ", from.name)
  }
  if (h5Exists(x = h5fh, name = to.name)) {
    if (!overwrite) {
      warning(
        "Destination object already exists. ",
        "Set 'overwrite = TRUE' to remove it.",
        immediate. = TRUE
      )
      return(invisible(x = NULL))
    }
    if (verbose) {
      message("Destination object already exists, removing it.")
    }
    h5fh$link_delete(name = to.name)
  }
  h5CreateGroup(
    x = h5fh, 
    name = dirname(path = to.name), 
    show.warnings = FALSE
  )
  h5fh$link_move_from(
    src_loc = h5fh, 
    src_name = from.name, 
    dst_name = to.name, 
    ...
  )
  return(invisible(x = NULL))
}

#' Back up contents from one HDF5 file to another
#' 
#' Function to back up HDF5 file, with optionally excluding specific links.
#' 
#' @param from.file The source HDF5 file.
#' @param to.file The target HDF5 file. Cannot be the same file as 
#' \code{from.file}. If \code{NULL}, will generate an R temp file.
#' @param exclude Names of HDF5 links not to be backed up.
#' @param overwrite When the \code{to.file} already exists, whether or not to 
#' overwrite it.
#' @param verbose Print progress.
#' @param ... Arguments passed to \code{H5File$obj_copy_from()}
#' 
#' @details
#' When any HDF5 link is to be excluded, it will copy the rest of links from
#' \code{from.file} using \code{\link{h5Copy}}. Otherwise, it will simply copy 
#' the \code{from.file} to the \code{to.file} via \code{\link{file.copy}} 
#' 
#' @return Path of the \code{to.file}
#' 
#' @examples
#' file <- system.file("extdata", "pbmc_small.h5ad", package = "hdf5r.Extra")
#' to.file <- tempfile(fileext = ".h5")
#' 
#' h5Backup(file, to.file, exclude = "X")
#' 
#' x <- h5Read(file)
#' x2 <- h5Read(to.file)
#' x$X <- NULL # Remove 'X'
#' identical(x, x2) # Now these two should be identical
#' 
#' @export
h5Backup <- function(
    from.file, 
    to.file = NULL, 
    exclude = NULL, 
    overwrite = FALSE,
    verbose = TRUE,
    ...
) {
  to.file <- to.file %||% paste0(tempfile(), ".h5")
  to.file <- normalizePath(path = to.file, mustWork = FALSE)
  from.file <- file_path_as_absolute(x = from.file)
  verboseMsg(
    "h5Backup: ",
    "\n  Source file: ", from.file,
    "\n  Destination file: ", to.file,
    "\n  Excluded objects: ", paste(exclude, collapse = ", ")
  )
  if (from.file == to.file) {
    stop("\n  The source file and the target file are identical.")
  }
  if (!overwrite && file.exists(to.file)) {
    stop("The destination file exists, please set 'overwrite = TRUE'")
  }
  h5fh <- h5TryOpen(filename = from.file, mode = "r")
  all_links <- h5List(
    x = h5fh, 
    full.names = TRUE, 
    simplify = FALSE, 
    detailed = FALSE, 
    recursive = TRUE
  )
  keep_links <- all_links$name
  if (length(x = exclude) > 0) {
    keep_links <- .exclude_h5_links(all_links = keep_links, exclude = exclude)
  }
  if (identical(x = keep_links, y = all_links$name)) {
    h5fh$close()
    file.copy(from = from.file, to = to.file, overwrite = TRUE)
    return(to.file)
  }
  on.exit(expr = h5fh$close())
  all_links <- all_links[all_links$name %in% keep_links, , drop = FALSE]
  
  to.h5fh <- h5TryOpen(filename = to.file, mode = "w")
  on.exit(expr = to.h5fh$close(), add = TRUE)
  for (i in seq_along(along.with = all_links$name)) {
    verboseMsg("Backup '", all_links[i, "name"], "'")
    # all_links[i, "obj_type"] is actually `factor_ext`
    if (as.character(x = all_links[i, "obj_type"]) %in% "H5I_GROUP") {
      h5CreateGroup(
        x = to.h5fh, 
        name = all_links[i, "name"], 
        show.warnings = FALSE
      )
    } else {
      h5CreateGroup(
        x = to.h5fh, 
        name = dirname(path = all_links[i, "name"]), 
        show.warnings = FALSE
      )
      to.h5fh$obj_copy_from(
        src_loc = h5fh, 
        src_name = all_links[i, "name"], 
        dst_name = all_links[i, "name"], 
        ...
      )
    }
    .h5attr_copy_all(
      from.h5fh = h5fh,
      from.name = all_links[i, "name"],
      to.h5fh = to.h5fh,
      to.name = all_links[i, "name"],
      overwrite = TRUE
    )
  }
  .h5attr_copy_all(
    from.h5fh = h5fh,
    from.name = "/",
    to.h5fh = to.h5fh,
    to.name = "/",
    overwrite = TRUE
  )
  return(to.file)
}

#' Overwrite an existing HDF5 link
#' 
#' @param file An existing HDF5 file
#' @param name Name of HDF5 link to be overwritten. 
#' @param overwrite Whether or not to overwrite \code{name}. 
#' @param verbose Print progress.
#' 
#' @return Path to \code{file} which is ready to be written.
#' 
#' @details
#' \itemize{
#' \item When \code{file} doesn't exist, will create it.
#' \item When the old link \code{name} doesn't exist, will simply return 
#' \code{file}. 
#' \item When \code{name} exists and \code{overwrite} is \code{TRUE}, will copy 
#' the rest of HDF5 links to an updated \code{file} with \code{\link{h5Backup}}. 
#' If \code{name} is "/", will create a new \code{file} and overwrite the old one.
#' \item When \code{name} exists and \code{overwrite} is \code{FALSE}, will 
#' raise an error.
#' }
#' 
#' @examples
#' file <- system.file("extdata", "pbmc_small.h5ad", package = "hdf5r.Extra")
#' tmp.file <- tempfile(fileext = ".h5")
#' file.copy(file, tmp.file)
#' 
#' obs <- h5Read(tmp.file, "obs")
#' 
#' h5Overwrite(tmp.file, "layers", TRUE)
#' h5Exists(tmp.file, "layers")
#' 
#' # You can still read other links.
#' obs2 <- h5Read(tmp.file, "obs")
#' identical(obs, obs2)
#' 
#' @export
h5Overwrite <- function(
    file, 
    name, 
    overwrite, 
    verbose = getOption(x = "h5.overwrite.verbose", default = FALSE)
) {
  name <- h5AbsLinkName(name = name)
  if (!file.exists(file)) {
    h5CreateFile(x = file)
    return(normalizePath(path = file))
  }
  file <- normalizePath(path = file)
  if (name == "/" & overwrite) {
    warning(
      "Overwrite '/' will truncate anything in the orignial file:\n  ", file,
      immediate. = TRUE
    )
    h5fh <- h5TryOpen(filename = file, mode = "w")
    on.exit(expr = h5fh$close())
    return(file)
  }
  if (!h5Exists(x = file, name = name)) {
    return(file)
  }
  if (!overwrite) {
    stop(
      "\nFound object that already exists: ",
      "\n  File: ", file, 
      "\n  Object: ", name,
      "\nSet 'overwrite = TRUE' to remove it."
    )
  }
  tmp.file <- tempfile(tmpdir = dirname(path = file), fileext = ".h5")
  verboseMsg(
    "Overwriting existing H5 object:",
    "\n  File: ", file,
    "\n  Object: ", name
  )
  file.rename(from = file, to = tmp.file)
  tryCatch(
    expr = {
      h5Backup(
        from.file = tmp.file, 
        to.file = file, 
        exclude = name, 
        overwrite = TRUE,
        verbose = FALSE
      )
      unlink(x = tmp.file)
    },
    error = function(e) {
      file.rename(from = tmp.file, to = file)
      stop(e)
    }
  )
  return(file)
}

Try the hdf5r.Extra package in your browser

Any scripts or data that you put into this service are public.

hdf5r.Extra documentation built on Oct. 18, 2024, 9:06 a.m.