#' @title Create or load 'lazyarray' instance
#' @description If path is missing, create a new array. If path exists and
#' meta file is complete, load existing file, otherwise create new meta file
#' and import from existing data.
#' @author Zhengjia Wang
#' @seealso \code{\link{create_lazyarray}}, \code{\link{load_lazyarray}}
#' @param x R vector, matrix, array or \code{LazyArray}, \code{LazyMatrix}
#' @param path path to a local drive where array data is stored
#' @param storage_format data type, choices are \code{"double"},
#' \code{"integer"}, \code{"character"}, and \code{"complex"}; see details
#' @param dim integer vector, dimension of array, see \code{\link{dim}}
#' @param dimnames list of vectors, names of each dimension, see \code{\link{dimnames}}
#' @param compress_level 0 to 100, level of compression. 0 means
#' no compression, 100 means maximum compression. For persistent data,
#' it's recommended to set 100. Default is 50.
#' @param meta_name header file name, default is \code{"lazyarray.meta"}
#' @param read_only whether created array is read-only
#' @param quiet whether to suppress messages, default is false
#' @param ... ignored or passed to other methods
#'
#' @details The last margin of \code{x} is going to be the partitions of
#' \code{LazyArray} instances. For example, a \code{d1 x d2 x d3}
#' array will have \code{d3} partitions. Each partition dimension will be
#' \code{d1 x d2 x 1}. \code{LazyArray} requires each partition to
#' be less than one-third of the total memory size, and number of partitions
#' less than 10 thousand for best performance.
#'
#' For matrices, \code{as.lazymatrix.array} automatically transposes internal
#' partitions such that the number of partitions is always less than or equal to
#' each partition length. However, \code{as.lazymatrix.LazyArray} or
#' \code{as.lazyarray} don't have such optimization. It's highly recommended
#' to keep the number of partitions small and the length of each partition large.
#' For performance comparisons, see \code{\link{lazy_matmul}}.
#'
#' When importing from existing partition files generated by
#' other packages such as \code{'fst'}, the partition files must be homogeneous,
#' meaning the stored data length, dimension, and storage type must be the same.
#' Because \code{'fstcore'} package stores data in data frame internally,
#' the column name must be 'V1' for non-complex elements or
#' 'V1R', 'V1I' for complex numbers (real and imaginary data are stored
#' in different columns).
#'
#' @examples
#'
#' path <- tempfile()
#'
#' # ---------------- case 1: Create new array ------------------
#' arr <- lazyarray(path, storage_format = 'double', dim = c(2,3,4),
#' meta_name = 'lazyarray.meta')
#' arr[] <- 1:24
#'
#' # Subset and get the first partition
#' arr[,,1]
#'
#' # Partition file path (total 4 partitions)
#' arr$get_partition_fpath()
#'
#' # Removing array doesn't clear the data
#' rm(arr); gc()
#'
#' # ---------------- Case 2: Load from existing directory ----------------
#' ## Important!!! Run case 1 first
#' # Load from existing path, no need to specify other params
#' arr <- lazyarray(path, meta_name = 'lazyarray.meta', read_only = TRUE)
#'
#' arr[,,1]
#'
#' # ---------------- Case 3: Import from existing data ----------------
#' ## Important!!! Run case 1 first
#'
#' # path exists, but meta is missing, all other params are required
#' # Notice the partition count increased from 4 to 5, and storage type converts
#' # from double to character
#' arr <- lazyarray(path = path, meta_name = 'lazyarray-character.meta',
#' file_names = c(1,2,3,4,'additional'),
#' storage_format = 'character', dim = c(2,3,5),
#' quiet = TRUE, read_only = FALSE)
#'
#' # partition names
#' arr$get_partition_fpath(1:4, full_path = FALSE)
#' arr$get_partition_fpath(5, full_path = FALSE)
#'
#' # The first partition still exists and is valid
#' arr[,,1]
#'
#' # The additional partition is all NA
#' arr[,,5]
#'
#' # Set data to 5th partition
#' arr[,,5] <- rep(0, 6)
#'
#' # ---------------- Case 4: Converting from R arrays ----------------
#'
#' x <- matrix(1:16, 4)
#' x <- as.lazymatrix(x)
#' x[,]
#'
#'
#' x <- array(1:27, c(3,3,3))
#' as.lazymatrix(x)[,1]
#'
#' as.lazyarray(x)[]
#'
#' # -------- Advanced usage: create fst data and import manually --------
#'
#' # Clear existing files
#' path <- tempfile()
#' unlink(path, recursive = TRUE)
#' dir.create(path, recursive = TRUE)
#'
#' # Create array of dimension 2x3x4, but 3rd partition is missing
#' # without using lazyarray package
#'
#' # Column names must be V1 or V1R, V1I (complex)
#' fst::write_fst(data.frame(V1 = 1:6), path = file.path(path, 'part-1.fst'))
#' fst::write_fst(data.frame(V1 = 7:12), path = file.path(path, 'part-B.fst'))
#' fst::write_fst(data.frame(V1 = 19:24), path = file.path(path, 'part-d.fst'))
#'
#' # Import via lazyarray
#' arr <- lazyarray(path, meta_name = 'test-int.meta',
#' storage_format = 'integer',
#' dim = c(2,3,4), prefix = 'part-',
#' file_names = c('1', 'B', 'C', 'd'),
#' quiet = TRUE)
#'
#' arr[]
#'
#' # Complex case
#' fst::write_fst(data.frame(V1R = 1:6, V1I = 1:6),
#' path = file.path(path, 'cplx-1.fst'))
#' fst::write_fst(data.frame(V1R = 7:12, V1I = 100:105),
#' path = file.path(path, 'cplx-2.fst'))
#' fst::write_fst(data.frame(V1R = 19:24, V1I = rep(0,6)),
#' path = file.path(path, 'cplx-4.fst'))
#' arr <- lazyarray(path, meta_name = 'test-cplx.meta',
#' storage_format = 'complex',
#' dim = c(2,3,4), prefix = 'cplx-',
#' file_names = 1:4, quiet = TRUE)
#'
#' arr[]
#'
#' @export
lazyarray <- function(
  path, storage_format, dim, dimnames = NULL,
  compress_level = 50L, meta_name = 'lazyarray.meta',
  read_only = FALSE, quiet = FALSE, ...
){
  # Three situations are handled, in this order:
  #   1. `path` does not exist          -> create a new array
  #   2. `path` has the meta file       -> load the existing array
  #   3. `path` exists without the meta -> write a meta file describing the
  #      directory, then open it
  # NOTE(review): `...` is accepted but never referenced in this body, so
  # extra arguments (e.g. `file_names`, `prefix` used in the examples) have
  # no effect here - confirm whether that is intended.

  # A regular file at `path` can never be an array directory
  if(file.exists(path) && !dir.exists(path)){
    stop('lazyarray path must be a directory path, but a file was found.')
  }

  if(!dir.exists(path)){
    # Case 1: nothing on disk yet
    arr <- create_lazyarray(
      path = path, storage_format = storage_format, dim = dim,
      dimnames = dimnames, compress_level = compress_level,
      meta_name = meta_name)
    if(read_only){
      # The array was created writable; re-open it in read-only mode
      arr <- load_lazyarray(path = path, read_only = TRUE,
                            meta_name = meta_name)
    }
    return(arr)
  }

  # Case 2: directory and meta file both exist
  if(file.exists(file.path(path, meta_name))){
    arr <- load_lazyarray(path = path, read_only = read_only,
                          meta_name = meta_name)
    return(arr)
  }

  # Case 3: the meta file has to be generated, which needs the array layout.
  # Fail early with an actionable message instead of R's generic
  # "argument is missing" / "subscript out of bounds" errors.
  if(missing(storage_format)){
    stop('lazyarray: meta file not found in existing path; ',
         '`storage_format` must be specified to import existing data.')
  }
  if(missing(dim) || length(dim) == 0){
    stop('lazyarray: meta file not found in existing path; ',
         '`dim` must be specified to import existing data.')
  }

  if(!quiet){
    message('meta file not found, create one with existing files')
  }

  # The last margin is the partition index: one `<index>.fst` file each
  nparts <- dim[[length(dim)]]
  path <- normalizePath(path)
  fs <- file.path(path, sprintf('%s.fst', seq_len(nparts)))
  fe <- file.exists(fs)

  if(any(fe)){
    # Partition files already on disk must all share the same
    # (number of columns, number of rows) signature
    fs <- fs[fe]
    ds <- lapply2(fs, function(f){
      meta <- tryCatch({
        meta <- fstMeta(normalizePath(f))
        if(inherits(meta, 'fst_error')){ stop(meta) }
        meta
      }, error = function(e){
        stop('Cannot open array file(s). \n ', f)
      })
      c(meta$nrOfCols, meta$nrOfRows)
    })
    ds <- unique(ds)
    if(length(ds) != 1){
      stop('All existing files must be homogeneous')
    }
  }

  # Each partition has the array's dimension with the last margin set to 1
  part_dimension <- dim
  part_dimension[length(dim)] <- 1

  # Write the meta file, then open the directory as an array
  meta <- list(
    lazyarray_version = 0,
    file_format = 'fst',
    storage_format = storage_format,
    dim = dim,
    dimnames = dimnames,
    part_dimension = part_dimension,
    postfix = '.fst',
    compress_level = compress_level
  )
  meta_path <- file.path(path, meta_name)
  save_yaml(meta, meta_path)

  ClassLazyArray$new(path = path, read_only = read_only, meta_name = meta_name)
}
#' Automatically remove array data
#' @author Zhengjia Wang
#' @description Remove the files containing array data once no
#' 'lazyarray' instance is using the folder. Requires
#' installation of \code{dipsaus} package (at least version 0.0.8).
#' @param x 'lazyarray' instance
#' @param onexit passed to \code{\link{reg.finalizer}}
#'
#' @details \code{auto_clear_lazyarray} attempts to remove the entire folder
#' containing array data. However, if some files are not created by the
#' array, only partition data and meta file will be removed, all the
#' artifacts will remain and warning will be displayed. One exception is
#' if all files left in the array directory are \code{*.meta} files,
#' all these meta files will be removed along with the folder.
#'
#' @examples
#'
#' path <- tempfile()
#' arr_dbl <- lazyarray(path, storage_format = 'double',
#' dim = 2:4, meta_name = 'meta-dbl.meta')
#' arr_dbl[] <- 1:24
#' auto_clear_lazyarray(arr_dbl)
#'
#' arr_chr <- lazyarray(path, storage_format = 'character',
#' dim = 2:4, meta_name = 'meta-chr.meta',
#' quiet = TRUE)
#' auto_clear_lazyarray(arr_chr)
#'
#' # remove either one, the directory still exists
#' rm(arr_dbl); invisible(gc(verbose = FALSE))
#'
#' arr_chr[1,1,1]
#'
#' # Remove the other one, and path will be removed
#' rm(arr_chr); invisible(gc(verbose = FALSE))
#'
#' dir.exists(path)
#' arr_check <- lazyarray(path, storage_format = 'character',
#' dim = 2:4, meta_name = 'meta-chr',
#' quiet = TRUE)
#'
#' # data is removed, so there should be no data (NAs)
#' arr_check[]
#'
#' @export
auto_clear_lazyarray <- function(x, onexit = FALSE){
  # The finalizer is only registered when the optional 'dipsaus' package is
  # available; otherwise this call does nothing.
  if(requireNamespace('dipsaus', quietly = TRUE)){
    # The shared finalizer is keyed by the directory that contains the
    # array's meta file
    path <- normalizePath(dirname(x$storage_path))
    dipsaus::shared_finalizer(
      x,
      key = path,
      function(e){
        e$remove_data(force = TRUE)
      },
      onexit = onexit
    )
    rm(path)
  }
  # Drop the local bindings so that this function's environment (which the
  # finalizer closure above encloses) no longer references `x`
  rm(list = c('x', 'onexit'))
  invisible()
}
#' @rdname lazyarray
#' @export
as.lazyarray <- function(x, path, storage_format, ...){
  # S3 generic: the method is selected by the class of `x`
  UseMethod("as.lazyarray", x)
}
#' @rdname lazyarray
#' @export
as.lazyarray.default <- function(x, path, storage_format, ...){
  # Objects with fewer than two dimensions are treated as a
  # single-column matrix
  dm <- if(length(dim(x)) < 2) c(length(x), 1) else dim(x)

  # Without an explicit location, store the data in a temporary path
  if(missing(path)){ path <- tempfile() }

  # Without an explicit format, use the storage mode of `x`
  if(missing(storage_format)){ storage_format <- storage.mode(x) }

  # Create an array of matching shape and fill it with the content of `x`
  re <- create_lazyarray(path, storage_format, dim = dm, ...)
  re[] <- x
  re
}
#' @rdname lazyarray
#' @export
as.lazyarray.LazyArray <- function(x, path, storage_format, meta_name, ...){
  # Directory containing the meta file of `x`
  array_dir <- normalizePath(dirname(x$storage_path))

  # A `path` pointing anywhere else is not honored; only warn about it
  if(!missing(path)){
    if(normalizePath(path, mustWork = FALSE) != array_dir){
      warning("as.lazyarray.LazyArray: path will be ignored")
    }
  }

  # No target format requested: nothing to convert
  if(missing(storage_format)){
    return(x)
  }

  # Default meta file name is derived from the requested format
  if(missing(meta_name)){
    meta_name <- sprintf('%s_version.meta', storage_format)
  }

  # Already stored in the requested format: nothing to convert
  meta <- load_yaml(x$storage_path)
  if( meta$storage_format == storage_format ){
    return(x)
  }

  stopifnot(storage_format %in% x$storage_formats_avail)

  # Save a copy of the header with the new storage format under `meta_name`
  # in the same directory, then open the array through that meta file
  meta$storage_format <- storage_format
  save_yaml(meta, file.path(array_dir, meta_name))

  lazyarray(
    path = array_dir,
    storage_format = storage_format,
    dim = x$dim,
    dimnames = x$dimnames,
    meta_name = meta_name,
    read_only = !x$can_write,
    ...
  )
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.