omxr: Read and Write Open Matrix Files

Documented in create_omx read_all_omx read_omx write_all_omx write_omx

#' Create an OMX file
#'
#' This function creates an OMX file, establishes the shape attribute (number
#' of rows and columns) and version attribute, and creates the data and lookup
#' groups.
#
#' @param file Full path name of the OMX file to create.
#' @param numrows Number of rows that all matrices in the file will have.
#' @param numcols Number of columns that all matrices in the file will have.
#'    Defaults to \code{numrows}.
#' @param level Compression level. Default \code{1}.
#'
#' @return None
#'
#' @importFrom rhdf5 h5writeAttribute h5createGroup H5Fclose H5Gclose H5Fclose
#'
#' @export
#' 
create_omx <- function(file, numrows, numcols, level = 1){

  # if file already exists, delete it
  if(file.exists(file)){
    file.remove(file)
  }

  # create hdf5 file with appropriate shape and attributes
  Shape <- c(numrows, numcols)
  H5File <- rhdf5::H5Fcreate(file)
  rhdf5::h5writeAttribute("0.2", H5File, "OMX_VERSION")
  rhdf5::h5writeAttribute(Shape, H5File, "SHAPE")
  rhdf5::h5createGroup(file,"data")
  rhdf5::h5createGroup(file,"lookup")
  rhdf5::H5Fclose( H5File )

}


#' Function to write OMX matrix data
#'
#' This function writes OMX matrix data. A full matrix can be written or just
#' portions of an existing matrix. It allows overwriting existing matrix
#' values, but only if the "Replace" argument is set to TRUE. If only portions
#' of the matrix are to be written to, the full matrix must already exist.
#'
#' @param file Full path name of the OMX file to store the matrix in. If this is
#'   a new matrix file, see \link{create_omx}.
#' @param matrix Matrix object to be stored.
#' @param name Name of the matrix in the OMX object.
#' @param row_index An integer vector indicating the rows represented by
#'   \code{matrix}. Defaults to \code{NULL}, meaning that all rows are written
#'   (and \code{nrow(matrix)} is the number of rows in the matrix).
#' @param col_index The precise corrolary to \code{row_index}, for columns.
#' @param na_value The value representing \code{NA} values in the matrix.
#'   Defaults to \code{-1}
#' @param replace If the named object already exists in \code{file}, should it
#'   be replaced? Defaults to \code{FALSE}.
#' @param description (Optional) description of matrix contents.
#' 
#' @return Write function return code; 0 if successful.
#'
#' @importFrom rhdf5 h5ls h5writeAttribute h5createDataset h5writeDataset
#'   H5Dclose h5write H5Fopen H5Gopen H5Dopen 
#' @export
#' 
#' @examples 
#' omxfile <- tempfile(fileext = ".omx")
#' create_omx(omxfile, 10, 10)
#' trips <- matrix(rnorm(n = 10^2, 200, 50),   
#'                 nrow = 10, ncol = 10)
#' write_omx(matrix = trips, omxfile, "trips",  description = "Total Trips")
#'
write_omx <- function(matrix, file,  name,
                      row_index = NULL, col_index = NULL,
                      na_value = -1, replace = FALSE,
                      description = "") {

  #Get names of matrices in the file and check if exists
  H5File <- rhdf5::H5Fopen(file)
  Contents <- rhdf5::h5ls(H5File)
  MatrixNames <- Contents$name[ Contents$group == "/data" ]
  MatrixExists <- name %in% MatrixNames

  if(name %in% MatrixNames & replace == FALSE ){
    stop(paste(
      "Matrix", name, "already exists. To overwrite, set 'replace = TRUE"
    ))
  }

  # Get the matrix dimensions specified in the file
  RootAttr <- get_omx_attr(file)
  Shape <- RootAttr$SHAPE

  #Allow indexed writing only if the matrix already exists
  if( !( is.null(row_index) & is.null(col_index) ) ){
    if( !MatrixExists ){
      stop(
        "Indexed writing to a matrix only allowed if a full matrix of that name already exists."
      )
    }
  }

  #If neither dimension will be written to indexes, write the full matrix and
  # add the NA attribute
  if(is.null(row_index) & is.null(col_index) ){

    #Check conformance of matrix dimensions with OMX file
    if( !all( dim( matrix ) == Shape ) ){
      stop(paste(
        "Matrix dimensions not consistent with", file, ":", Shape[1],
        "Rows,", Shape[2], "Cols"
      ))
    }

    #Transpose matrix and convert NA to designated storage value
    # From hdf5 docs: The dimensions of the array as they will appear in the
    # file. Note, the dimensions will appear in inverted order when viewing the
    # file with a C-programm (e.g. HDFView), because the fastest changing
    # dimension in R is the first one, whereas the fastest changing dimension in
    # C is the last one.
    matrix <- t( matrix ) 
    matrix[ is.na( matrix ) ] <- na_value
    
    matrix_type <- if(is.character(matrix)) {
      "character"
    } else if(is.double(matrix)) {
      "double"
    } else if(is.integer(matrix)){
      "integer"
    }

    #Write matrix to file, set chunking and compression
    ItemName <- paste( "data", name, sep="/" )
    rhdf5::h5createDataset(
      file = H5File, dataset = ItemName, 
      storage.mode = matrix_type,
      dims =  dim(matrix), 
      chunk=c(nrow(matrix), 1),
      level=7
    )
    rhdf5::h5write(matrix, H5File, ItemName)

    #Add the NA storage value and matrix descriptions as attributes to the matrix
    H5Group <- rhdf5::H5Gopen( H5File, "data" )
    H5Data <- rhdf5::H5Dopen( H5Group, name )
    rhdf5::h5writeAttribute(na_value, H5Data, "NA" )
    rhdf5::h5writeAttribute(description, H5Data, "Description" )

    #Close everything up before exiting
    rhdf5::H5Dclose( H5Data )
    rhdf5::H5Gclose( H5Group )
    rhdf5::H5Fclose( H5File )

  } else {
    #Otherwise write only to the indexed positions

    # set dimensions
    if( is.null( row_index ) ) row_index <- 1:Shape[1]
    if( is.null( col_index ) ) col_index <- 1:Shape[2]

    #Check that indexes are within matrix dimension ranges
    if( any( row_index <= 0 ) | ( max( row_index ) > Shape[1] ) ){
      stop("Elements of 'row_index' are outside the index range of the matrix.")
    }
    if( any( col_index <= 0 ) | ( max( col_index ) > Shape[2] ) ){
      stop("Elements of 'col_index' are outside the index range of the matrix.")
    }

    #Check that there are no duplicated indices
    if( any( duplicated( row_index ) ) ){
      stop( "Duplicated index values in 'row_index' not permitted." )
    }
    if( any( duplicated( col_index ) ) ){
      stop( "Duplicated index values in 'col_index' not permitted." )
    }

    #Combine the row and column indexes into a list
    #Indices are reversed since matrix is stored in transposed form
    Indices <- list( row_index, col_index )

    #Transpose matrix and convert NA to designated storage value
    matrix <- t( matrix )
    matrix[ is.na( matrix ) ] <- na_value

    # Write the matrix to the indexed positions
    ItemName <- paste( "data", name, sep="/" )
    rhdf5::h5write(matrix, H5File, ItemName, index=Indices )

  }
}

#' Read an OMX matrix
#'
#' This function reads an entire matrix in an OMX file or portions of a matrix
#' using indexing.
#'
#' @param file Path name of the OMX file where the matrix resides.
#' @param name Name of the matrix in the OMX file
#' @param row_index An integer vector indicating the rows represented by
#'   \code{matrix}. Defaults to \code{NULL}, meaning that all rows are read
#'   (and \code{nrow(matrix)} is the number of rows in the matrix).
#' @param col_index The precise corrolary to \code{row_index}, for columns.
#'
#' @return An R matrix object containing the named data from the OMX file
#'
#' @importFrom rhdf5 h5read
#'
#' @export
#' @examples
#' omxfile <- omxr_example("skims.omx")
#' mat <- read_omx(omxfile, "DIST")
#' str(mat)
#' read_omx(omxfile, "DIST", c(1,4,5), c(1,4,5))
#'
read_omx <- function(file, name, row_index = NULL, col_index = NULL){

  #Get the matrix dimensions specified in the file
  RootAttr <- omxr::get_omx_attr( file )
  Shape <- RootAttr$SHAPE

  #Identify the item to be read
  ItemName <- paste( "data", name, sep="/" )

  #Check that indices are properly formatted
  Indices <- list(row_index, col_index)
  dim <- 1   # initialize dimension
  for(index in Indices){

    if(!is.null(index)) {
      if(any(index <= 0) | (max(index) > Shape[dim])){
        stop(paste("One or more values in dimension", dim, "are invalid."))
      }

      if(any(duplicated(index))){
        stop("Duplicated index values in dimension", dim, ".")
      }
    }

    # change dimension from 1 to 2
    dim <- dim + 1
  }


  #Read the indexed positions of the matrix
  Result <- t(rhdf5::h5read(file, ItemName, index = rev(Indices)))

  #Replace the NA values with NA
  NAValue = as.vector(
    attr(rhdf5::h5read(file, ItemName, read.attribute=T),  "NA")
  )

  if(!is.null(NAValue)) {
    Result[Result == NAValue] <- NA
  }

  Result
}




#' Read all matrix cores from an OMX file
#' 
#' @param file Path to OMX file
#' @param names A character vector of matrices to read. If blank, will read all.
#' @param long If TRUE (default) will return the matrices as a long tidy 
#'   tibble with the cores as columns. If FALSE will return a list of named
#'   matrices.
#'   
#' @importFrom tidyr spread
#' @importFrom dplyr bind_rows
#' @importFrom stats setNames
#' 
#' @return Depending on the value of `long`: if TRUE, a tibble with one core
#'   per column and one row per interchange; if FALSE, a list of named matrices.
#' @export
#' @examples 
#' omxfile <- omxr_example("skims.omx")
#' read_all_omx(omxfile, names = c("origin", "destination", "DIST", "DISTBIKE", "DISTWALK"))
#' read_all_omx(omxfile, names = c("DIST", "DISTBIKE", "DISTWALK"), long = FALSE)
#' 
#'   
#' 
read_all_omx <- function(file, names = NULL, long = TRUE) {
  core_names <- list_omx(file)[["Matrices"]][["name"]]
  
  if(!is.null(names)){
    core_names <- core_names[which(core_names %in% names)]
  }
  
  matrices <- lapply(core_names, function(name) read_omx(file, name)) %>%
    stats::setNames(core_names)
  
  if (long) {
    matrices <- lapply(matrices, gather_matrix) %>%
      dplyr::bind_rows(.id = "matrix") %>%
      tidyr::spread(matrix, value)
  }
  
  matrices
}


#' Write all matrix cores to an OMX file
#' 
#' @param object The object to be written to OMX. May be either a tibble or 
#'   a list of named matrices.
#' @param long Identify whether the object to be written is a long-format 
#'   `tibble` (the default, `TRUE`) or a list of named matrices.
#' @param file The location of the OMX file.
#' 
#' @return List of write function return codes; 0 if successful.
#' 
#' @export
#' 
#' @examples 
#' omxfile <- omxr_example("skims.omx")
#' # long-format (tibble)
#' skims <- read_all_omx(omxfile, names = c("DIST", "DISTBIKE", "DISTWALK"))
#' write_all_omx(skims, file = tempfile(fileext = ".omx"))
#' 
#' # list of matrices
#' skims <- read_all_omx(omxfile, names = c("DIST", "DISTBIKE", "DISTWALK"), long = FALSE)
#' write_all_omx(skims, file = tempfile(fileext = ".omx"), long = FALSE)
#' 
#' @importFrom dplyr select
#' @importFrom tidyr spread
#' 
#' 
write_all_omx <- function(object, file, long = TRUE) {
  
  # get matrix parameters to create a new omx file
  if(long) {   # Tibble input
    numrows <- length(unique(object[["origin"]]))
    numcols <- length(unique(object[["destination"]]))
    core_names <- colnames(object)[!(colnames(object) %in% c("origin", "destination"))]
    
  } else {     # Matrix list input
    numrows <- nrow(object[[1]])
    numcols <- ncol(object[[1]])
    core_names <- names(object)
  }
  
  # create the omx file
  create_omx( file, numrows = numrows, numcols = numcols)
  
  # write matrix information to OMX.
  if (long) { # Tibble input
    lapply(core_names, function(table_name) {
      object %>%
        dplyr::select(origin, destination, !!table_name ) %>%
        tidyr::spread(destination, !!table_name) %>%
        dplyr::select(-origin) %>% as.matrix() %>%
        write_omx(file = file, matrix = ., table_name)
    })
  } else {     # Matrix list input
    lapply(core_names, function(table_name){ 
      write_omx(matrix = object[[table_name]], file = file, name = table_name)
    })
  }
  
}