# experiment/lazyarray.R

#' @title Create or load 'lazyarray' instance
#' @description If path is missing, create a new array. If path exists and 
#' meta file is complete, load existing file, otherwise create new meta file
#' and import from existing data.
#' @author Zhengjia Wang
#' @seealso \code{\link{create_lazyarray}}, \code{\link{load_lazyarray}}
#' @param x R vector, matrix, array or \code{LazyArray}, \code{LazyMatrix}
#' @param path path to a local drive where array data is stored
#' @param storage_format data type, choices are \code{"double"}, 
#' \code{"integer"}, \code{"character"}, and \code{"complex"}; see details
#' @param dim integer vector, dimension of array, see \code{\link{dim}}
#' @param dimnames list of vectors, names of each dimension, see \code{\link{dimnames}}
#' @param compress_level 0 to 100, level of compression. 0 means
#' no compression, 100 means maximum compression. For persistent data,
#' it's recommended to set 100. Default is 50.
#' @param meta_name header file name, default is \code{"lazyarray.meta"}
#' @param read_only whether created array is read-only
#' @param quiet whether to suppress messages, default is false
#' @param ... ignored or passed to other methods
#' 
#' @details The last margin of \code{x} is going to be the partitions of 
#' \code{LazyArray} instances. For example, a \code{d1 x d2 x d3} 
#' array will have \code{d3} partitions. Each partition dimension will be 
#' \code{d1 x d2 x 1}. \code{LazyArray} requires each partition to 
#' be less than one-third of the total memory size, and number of partitions 
#' less than 10 thousand for best performance. 
#' 
#' For matrices, \code{as.lazymatrix.array} automatically transposes internal
#' partitions such that the number of partitions is always less than or equal
#' to the length of each partition. However, \code{as.lazymatrix.LazyArray}
#' and \code{as.lazyarray} do not perform this optimization. It is highly
#' recommended to keep the number of partitions small and the length of each
#' partition large. For performance comparisons, see \code{\link{lazy_matmul}}.
#' 
#' When importing from existing partition files generated by 
#' other packages such as \code{'fst'}, the partition files must be homogeneous,
#' meaning the stored data length, dimension, and storage type must be the same.
#' Because \code{'fstcore'} package stores data in data frame internally, 
#' the column name must be 'V1' for non-complex elements or 
#' 'V1R', 'V1I' for complex numbers (real and imaginary data are stored
#' in different columns).
#' 
#' @examples 
#' 
#' path <- tempfile()
#' 
#' # ---------------- case 1: Create new array ------------------
#' arr <- lazyarray(path, storage_format = 'double', dim = c(2,3,4), 
#'                  meta_name = 'lazyarray.meta')
#' arr[] <- 1:24
#' 
#' # Subset and get the first partition
#' arr[,,1]
#' 
#' # Partition file path (total 4 partitions)
#' arr$get_partition_fpath()
#' 
#' # Removing array doesn't clear the data
#' rm(arr); gc()
#' 
#' # ---------------- Case 2: Load from existing directory ----------------
#' ## Important!!! Run case 1 first
#' # Load from existing path, no need to specify other params
#' arr <- lazyarray(path, meta_name = 'lazyarray.meta', read_only = TRUE)
#' 
#' arr[,,1]
#' 
#' # ---------------- Case 3: Import from existing data ----------------
#' ## Important!!! Run case 1 first
#' 
#' # path exists, but meta is missing, all other params are required
#' # Notice the partition count increased from 4 to 5, and storage type converts
#' # from double to character
#' arr <- lazyarray(path = path, meta_name = 'lazyarray-character.meta', 
#'                  file_names = c(1,2,3,4,'additional'), 
#'                  storage_format = 'character', dim = c(2,3,5), 
#'                  quiet = TRUE, read_only = FALSE)
#' 
#' # partition names
#' arr$get_partition_fpath(1:4, full_path = FALSE)
#' arr$get_partition_fpath(5, full_path = FALSE)
#' 
#' # The first partition still exists and is valid
#' arr[,,1]
#' 
#' # The additional partition is all NA
#' arr[,,5]
#' 
#' # Set data to 5th partition
#' arr[,,5] <- rep(0, 6)
#' 
#' # ---------------- Case 4: Converting from R arrays ----------------
#' 
#' x <- matrix(1:16, 4)
#' x <- as.lazymatrix(x)
#' x[,]
#' 
#' 
#' x <- array(1:27, c(3,3,3))
#' as.lazymatrix(x)[,1]
#' 
#' as.lazyarray(x)[]
#' 
#' # -------- Advanced usage: create fst data and import manually --------
#' 
#' # Clear existing files
#' path <- tempfile()
#' unlink(path, recursive = TRUE)
#' dir.create(path, recursive = TRUE)
#' 
#' # Create array of dimension 2x3x4, but 3rd partition is missing
#' # without using lazyarray package 
#' 
#' # Column names must be V1 or V1R, V1I (complex)
#' fst::write_fst(data.frame(V1 = 1:6), path = file.path(path, 'part-1.fst'))
#' fst::write_fst(data.frame(V1 = 7:12), path = file.path(path, 'part-B.fst'))
#' fst::write_fst(data.frame(V1 = 19:24), path = file.path(path, 'part-d.fst'))
#' 
#' # Import via lazyarray
#' arr <- lazyarray(path, meta_name = 'test-int.meta',
#'                  storage_format = 'integer',
#'                  dim = c(2,3,4), prefix = 'part-', 
#'                  file_names = c('1', 'B', 'C', 'd'), 
#'                  quiet = TRUE)
#' 
#' arr[]
#' 
#' # Complex case
#' fst::write_fst(data.frame(V1R = 1:6, V1I = 1:6), 
#'                path = file.path(path, 'cplx-1.fst'))
#' fst::write_fst(data.frame(V1R = 7:12, V1I = 100:105), 
#'                path = file.path(path, 'cplx-2.fst'))
#' fst::write_fst(data.frame(V1R = 19:24, V1I = rep(0,6)), 
#'                path = file.path(path, 'cplx-4.fst'))
#' arr <- lazyarray(path, meta_name = 'test-cplx.meta',
#'                  storage_format = 'complex',
#'                  dim = c(2,3,4), prefix = 'cplx-', 
#'                  file_names = 1:4, quiet = TRUE)
#' 
#' arr[]
#' 
#' @export
lazyarray <- function(
  path, storage_format, dim, dimnames = NULL, 
  compress_level = 50L, meta_name = 'lazyarray.meta', 
  read_only = FALSE, quiet = FALSE, ...
){
  # `path` must either not exist yet (create) or be a directory (load/import)
  if(file.exists(path) && !dir.exists(path)){
    stop('lazyarray path must be a directory path, but a file was found.')
  }
  
  # ---- Case 1: directory does not exist -> create a new array ----
  if(!dir.exists(path)){
    arr <- create_lazyarray(
      path = path, storage_format = storage_format, dim = dim,
      dimnames = dimnames,  compress_level = compress_level, 
      meta_name = meta_name)
    if(read_only){
      # Re-open the freshly created array in read-only mode
      arr <- load_lazyarray(path = path, read_only = TRUE, 
                            meta_name = meta_name)
    }
    return(arr)
  }
  
  # ---- Case 2: directory and meta file exist -> load the array ----
  if(file.exists(file.path(path, meta_name))){
    return(load_lazyarray(path = path, read_only = read_only, 
                          meta_name = meta_name))
  }
  
  # ---- Case 3: directory exists, meta file missing -> import ----
  # The meta file is built from the arguments, so fail early with a clear
  # message instead of a generic "argument is missing" or subscript error
  # raised further down.
  if(missing(storage_format) || missing(dim) || length(dim) == 0){
    stop('meta file `', meta_name, '` not found in the existing directory; ',
         '`storage_format` and a non-empty `dim` are required to import ',
         'existing partition files.', call. = FALSE)
  }
  
  if(!quiet){
    message('meta file not found, create one with existing files')
  }
  
  # The last margin of `dim` is the number of partitions; partition files 
  # are expected to be named `1.fst`, `2.fst`, ... inside `path`
  nparts <- dim[[length(dim)]]
  path <- normalizePath(path)
  part_files <- file.path(path, sprintf('%s.fst', seq_len(nparts)))
  part_files <- part_files[file.exists(part_files)]
  
  if(length(part_files) > 0){
    # Collect (number of columns, number of rows) of each existing partition
    part_shapes <- lapply2(part_files, function(f){
      fst_meta <- tryCatch({
        m <- fstMeta(f)
        if(inherits(m, 'fst_error')){ stop(m) }
        m
      }, error = function(e){
        stop('Cannot open array file(s). \n  ', f)
      })
      c(fst_meta$nrOfCols, fst_meta$nrOfRows)
    })
    
    # All partitions found on disk must share the same shape
    if(length(unique(part_shapes)) != 1){
      stop('All existing files must be homogeneous')
    }
  }
  
  # Each partition spans all margins but the last one
  part_dimension <- dim
  part_dimension[length(dim)] <- 1
  
  # Write the meta (header) file describing the imported array
  meta <- list(
    lazyarray_version = 0,
    file_format = 'fst',
    storage_format = storage_format,
    dim = dim,
    dimnames = dimnames,
    part_dimension = part_dimension,
    postfix = '.fst',
    compress_level = compress_level
  )
  save_yaml(meta, file.path(path, meta_name))
  
  ClassLazyArray$new(path = path, read_only = read_only, meta_name = meta_name)
  
}


#' Automatically remove array data 
#' @author Zhengjia Wang
#' @description Remove the files containing array data once no
#' 'lazyarray' instance is using the folder. Requires
#' installation of the \code{dipsaus} package (at least version 0.0.8).
#' @param x 'lazyarray' instance
#' @param onexit passed to \code{\link{reg.finalizer}}
#' 
#' @details \code{auto_clear_lazyarray} attempts to remove the entire folder
#' containing array data. However, if some files are not created by the
#' array, only partition data and meta file will be removed, all the 
#' artifacts will remain and warning will be displayed. One exception is
#' if all files left in the array directory are \code{*.meta} files, 
#' all these meta files will be removed along with the folder.
#' 
#' @examples 
#' 
#' path <- tempfile()
#' arr_dbl <- lazyarray(path, storage_format = 'double',
#'                      dim = 2:4, meta_name = 'meta-dbl.meta')
#' arr_dbl[] <- 1:24
#' auto_clear_lazyarray(arr_dbl)
#' 
#' arr_chr <- lazyarray(path, storage_format = 'character',
#'                      dim = 2:4, meta_name = 'meta-chr.meta',
#'                      quiet = TRUE)
#' auto_clear_lazyarray(arr_chr)
#' 
#' # remove either one, the directory still exists
#' rm(arr_dbl); invisible(gc(verbose = FALSE))
#' 
#' arr_chr[1,1,1]
#' 
#' # Remove the other one, and path will be removed
#' rm(arr_chr); invisible(gc(verbose = FALSE))
#' 
#' dir.exists(path)
#' arr_check <- lazyarray(path, storage_format = 'character',
#'                        dim = 2:4, meta_name = 'meta-chr',
#'                        quiet = TRUE)
#' 
#' # data is removed, so there should be no data (NAs)
#' arr_check[]
#' 
#' @export
auto_clear_lazyarray <- function(x, onexit = FALSE){
  # Without 'dipsaus' no finalizer can be registered: silently do nothing
  if(!requireNamespace('dipsaus', quietly = TRUE)){
    rm(x, onexit)
    return(invisible())
  }
  
  # The normalized array directory serves as the finalizer key
  array_dir <- normalizePath(dirname(x$storage_path))
  dipsaus::shared_finalizer(
    x, key = array_dir, 
    function(e){
      e$remove_data(force = TRUE)
    }, 
    onexit = onexit
  )
  
  # Drop every local binding before returning; presumably so that the 
  # finalizer closure (enclosed by this frame) does not keep `x` reachable
  rm(array_dir)
  rm(x, onexit)
  invisible()
}


#' @rdname lazyarray
#' @export
as.lazyarray <- function(x, path, storage_format, ...){
  # S3 generic: the method is selected by the class of `x`
  UseMethod("as.lazyarray")
}

#' @rdname lazyarray
#' @export
as.lazyarray.default <- function(x, path, storage_format, ...){
  # Objects with fewer than two margins are treated as a one-column matrix
  shape <- dim(x)
  if(length(shape) < 2){
    shape <- c(length(x), 1)
  }
  
  # Fill in defaults: storage mode of `x` and a temporary directory
  if(missing(storage_format)){
    storage_format <- storage.mode(x)
  }
  if(missing(path)){
    path <- tempfile()
  }
  
  # Create the array on disk, then copy the data in
  arr <- create_lazyarray(path, storage_format, dim = shape, ...)
  arr[] <- x
  arr
}


#' @rdname lazyarray
#' @export
as.lazyarray.LazyArray <- function(x, path, storage_format, meta_name, ...){
  
  # Directory containing the meta file of `x`; the converted array always
  # lives in this same directory
  array_dir <- normalizePath(dirname(x$storage_path))
  
  # A different `path` cannot be honoured, only warn about it
  if(!missing(path)){
    requested_dir <- normalizePath(path, mustWork = FALSE)
    if(requested_dir != array_dir){
      warning("as.lazyarray.LazyArray: path will be ignored")
    }
  }
  
  # No target format requested: `x` already is a lazy array
  if(missing(storage_format)){
    return(x)
  }
  if(missing(meta_name)){
    meta_name <- sprintf('%s_version.meta', storage_format)
  }
  
  # Nothing to convert when the stored format already matches
  meta <- load_yaml(x$storage_path)
  if( meta$storage_format == storage_format ){
    return(x)
  }
  
  stopifnot(storage_format %in% x$storage_formats_avail)
  
  # Write a second meta file that differs only in its storage format, then
  # open the array through that new meta file
  meta$storage_format <- storage_format
  save_yaml(meta, file.path(array_dir, meta_name))
  
  lazyarray(
    path = array_dir,
    storage_format = storage_format,
    dim = x$dim,
    dimnames = x$dimnames,
    meta_name = meta_name,
    read_only = !x$can_write,
    ...
  )
  
}
# dipterix/lazyarray documentation built on June 30, 2023, 6:30 a.m.