R/setup-attach.R

Defines functions as_src_env.src_tbl as_src_env.default as_src_env.character as_src_env.src_env as_src_env src_name.src_env str.src_env dim_brak dim_desc format.src_env print.src_env new_src_env as_src_tbl.default as_src_tbl.character as_src_tbl.src_env as_src_tbl.src_tbl as_src_tbl tbl_name.src_tbl src_name.src_tbl part_desc id_desc def_desc tim_desc tbl_sum.src_tbl time_vars.src_tbl index_var.src_tbl id_vars.src_tbl new_src_tbl setup_src_env.default setup_src_env.src_cfg setup_src_env attach_src.default attach_src.character attach_src.src_cfg attach_src

Documented in as_src_env as_src_tbl attach_src attach_src.character attach_src.src_cfg new_src_env new_src_tbl setup_src_env setup_src_env.src_cfg

#' Data attach utilities
#'
#' Making a dataset available to `ricu` consists of 3 steps: downloading
#' ([download_src()]), importing ([import_src()]) and attaching
#' ([attach_src()]). While downloading and importing are one-time procedures,
#' attaching of the dataset is repeated every time the package is loaded.
#' Briefly, downloading loads the raw dataset from the internet (most likely
#' in `.csv` format), importing consists of some preprocessing to make the
#' data available more efficiently and attaching sets up the data for use by
#' the package.
#'
#' @details
#' Attaching a dataset sets up two types of S3 classes: a single `src_env`
#' object, containing as many `src_tbl` objects as tables are associated with
#' the dataset. A `src_env` is an environment with an `id_cfg` attribute, as
#' well as sub-classes as specified by the data source `class_prefix`
#' configuration setting (see [load_src_cfg()]). All `src_env` objects created
#' by calling `attach_src()` represent environments that are direct
#' descendants of the `data` environment and are bound to the respective
#' dataset name within that environment. While `attach_src()` does not
#' immediately instantiate a `src_env` object, it rather creates a promise
#' using [base::delayedAssign()] which evaluates to a `src_env` upon first
#' access. This allows for data sources to be set up where the data is missing
#' in a way that prompts the user to download and import the data when first
#' accessed.
#'
#' Additionally, `attach_src()` creates an active binding using
#' [base::makeActiveBinding()], binding a function to the dataset name within
#' the environment passed as `assign_env`, which retrieves the respective
#' `src_env` from the `data` environment. This shortcut is set up for
#' convenience, such that for example the MIMIC-III demo dataset not only is
#' available as `ricu::data::mimic_demo`, but also as `ricu::mimic_demo` (or if
#' the package namespace is attached, simply as `mimic_demo`). The `ricu`
#' namespace contains objects `mimic`, `mimic_demo`, `eicu`, etc. which are
#' used as such links when loading the package. However, new data sets can be
#' set up an accessed in the same way.
#'
#' If set up correctly, it is not necessary for the user to directly call
#' `attach_src()`. When the package is loaded, the default data sources are
#' attached automatically. This default can be controlled by setting as
#' environment variable `RICU_SRC_LOAD` a comma separated list of data source
#' names before loading the library. Setting this environment variable as
#'
#' ```
#' Sys.setenv(RICU_SRC_LOAD = "mimic_demo,eciu_demo")
#' ```
#'
#' will change the default of loading both MIMIC-III and eICU, alongside the
#' respective demo datasets, and HiRID, to just the two demo datasets. For
#' setting an environment variable upon startup of the R session, refer to
#' [base::.First.sys()].
#'
#' The `src_env` promise for each data source is created using the S3 generic
#' function `setup_src_env()`. This function checks if all required files are
#' available from `data_dir`. If files are missing the user is prompted for
#' download in interactive sessions and an error is thrown otherwise. As soon
#' as all required data is available, a `src_tbl` object is created per table
#' and assigned to the `src_env`.
#'
#' The S3 class `src_tbl` inherits from [`prt`][prt::new_prt()], which
#' represents a partitioned [`fst`][fst::fst()] file. In addition to the `prt`
#' object, meta data in the form of `col_cfg` and `tbl_cfg` is associated with
#' a `src_tbl` object (see [load_src_cfg()]). Furthermore, as with `src_env`,
#' sub-classes are added as specified by the source configuration
#' `class_prefix` entry. This allows certain functionality, for example data
#' loading, to be adapted to data source-specific requirements.
#'
#' @examples
#' \dontrun{
#'
#' Sys.setenv(RICU_SRC_LOAD = "")
#' library(ricu)
#'
#' ls(envir = data)
#' exists("mimic_demo")
#'
#' attach_src("mimic_demo")
#'
#' ls(envir = data)
#' exists("mimic_demo")
#'
#' mimic_demo
#'
#' }
#'
#' @param x Data source to attach
#' @param ... Forwarded to further calls to `attach_src()`
#'
#' @return The constructors `new_src_env()`/`new_src_tbl()` as well as coercion
#' functions `as_src_env()`/`as_src_tbl()` return `src_env` and `src_tbl`
#' objects respectively. The function `attach_src()` is called for side
#' effects and returns `NULL` invisibly, while `setup_src_env()` instantiates
#' and returns a `src_env` object.
#'
#' @export
#'
attach_src <- function(x, ...) UseMethod("attach_src", x)

#' @param data_dir Directory used to look for [fst::fst()] files; `NULL` calls
#' [data_dir()] using the source name as `subdir` argument
#' @param assign_env Environment in which the data source will become available
#'
#' @rdname attach_src
#'
#' @export
#'
attach_src.src_cfg <- function(x, assign_env = NULL,
                               data_dir = src_data_dir(x), ...) {

  warn_dots(...)

  src <- src_name(x)

  tryCatch({

    assert_that(is.string(data_dir), null_or(assign_env, is.environment))

    dat_env <- data_env()
    src_env <- new_src_env(x, env = new.env(parent = dat_env))

    delayedAssign(src, setup_src_env(x, src_env, data_dir),
                  assign.env = dat_env)

    if (is.environment(assign_env)) {
      makeActiveBinding(src, get_from_data_env(src), assign_env)
    }

  }, error = function(err) {

    msg <- conditionMessage(err)

    warn_ricu(
      c("Failed to attach source `{src}` with error:", msg),
      class = "src_attach_error", indent = c(0L, rep_along(2L, msg)),
      exdent = c(0L, rep_along(2L, msg))
    )

    if (is.environment(assign_env)) {
      assign(src, NULL, envir = assign_env)
    }
  })

  invisible(NULL)
}

#' @rdname attach_src
#' @export
attach_src.character <- function(x, assign_env = NULL,
                                 data_dir = src_data_dir(x), ...) {

  read_err <- function(err) {

    msg <- conditionMessage(err)

    warn_ricu(
      c("Failed to read source configuration with error:", msg),
      class = "src_cfg_read_error", indent = c(0L, rep_along(2L, msg)),
      exdent = c(0L, rep_along(2L, msg))
    )

    NULL
  }

  try_parse <- function(src, cfg, dir, env) {

    parse_err <- function(err) {

      msg <- conditionMessage(err)

      warn_ricu(
        c("Failed to parse source configuration for source `{src}` with
           error:", msg),
        class = "src_cfg_parse_error", indent = c(0L, rep_along(2L, msg)),
        exdent = c(0L, rep_along(2L, msg))
      )

      if (is.environment(env)) {
        assign(src, NULL, envir = env)
      }

      NULL
    }

    cfg <- tryCatch(parse_src_cfg(cfg), error = parse_err)

    if (not_null(cfg)) {
      attach_src(cfg, env, dir)
    }

    invisible(NULL)
  }

  cfgs <- tryCatch(read_src_cfg(x, ...), error = read_err)

  if (has_name(cfgs, x)) {

    Map(try_parse, x, cfgs[x], data_dir, MoreArgs = list(env = assign_env))

  } else {

    warn_ricu("Failed to read source configuration for source{?s}
               {setdiff(x, names(cfg))}", class = "src_cfg_read_error")
  }

  invisible(NULL)
}

#' @export
attach_src.default <- function(x, ...) stop_generic(x, .Generic)

#' @param env Environment where data proxy objects are created
#'
#' @rdname attach_src
#'
#' @export
#'
setup_src_env <- function(x, env, ...) {
  UseMethod("setup_src_env", x)
}

#' @rdname attach_src
#' @export
setup_src_env.src_cfg <- function(x, env, data_dir = src_data_dir(x), ...) {

  assert_that(is_src_env(env), is.string(data_dir))

  tbl <- as_tbl_cfg(x)

  fst_files <- lapply(tbl, fst_file_names)
  fst_paths <- Map(file.path, data_dir, fst_files)

  tables  <- chr_ply(tbl, tbl_name)
  missing <- lgl_ply(fst_paths, all_fun, Negate(file.exists))

  if (any(missing)) {

    todo <- tables[missing]

    if (is_interactive()) {

      msg_ricu(
        c("The following {qty(length(todo))} table{?s} {?is/are} missing from
           directory {data_dir}:", bullet(quote_bt(todo))), "miss_tbl_msg",
        exdent = c(0L, rep_along(2L, todo)),
        tbl_ok = setNames(!missing, tables)
      )

      resp <- readline("Download now (Y/n)? ")

      if (!identical(resp, "Y")) {
        stop_ricu("Cannot continue with missing tables for `{src_name(x)}`",
                  "tbl_dl_abort")
      }

    } else {

      stop_ricu(
        c("The following {qty(length(todo))} table{?s} {?is/are}
           missing from directory {data_dir}:", bullet(quote_bt(todo))),
        class = "miss_tbl_err", exdent = c(0L, rep_along(2L, todo))
      )
    }

    if (data_pkg_avail(x)) {

      src_dir <- src_data_dir(x)

      if (!identical(data_dir, src_dir)) {
        warn_ricu("data will be saved to {src_dir} instead of {data_dir}",
                  class = "pkg_src_dir")
      }

      install_data_pkgs(x)

    } else {

      ensure_dirs(
        unique(dirname(unlist(fst_paths, recursive = FALSE)))
      )

      tmp <- ensure_dirs(tempfile())
      on.exit(unlink(tmp, recursive = TRUE))

      download_src(x, tmp, tables = todo)
      import_src(x, tmp)

      done <- Map(file.rename, Map(file.path, tmp, fst_files[missing]),
                  fst_paths[missing])
      done <- lgl_ply(done, all)

      if (!all(done)) {
        stop_ricu(
          c("The following {qty(sum(!done))} table{?s} could be moved to
             directory {data_dir}:", bullet(quote_bt(todo[!done]))),
          class = "tbl_mv_err", exdent = c(0L, rep(2L, sum(!done)))
        )
      }
    }

    done <- lgl_ply(fst_paths, all_fun, file.exists)

    if (!all(done)) {
      stop_ricu(
        c("The following {qty(sum(!done))} table{?s} were not successfully
           downloaded and imported:", bullet(quote_bt(todo[!done]))),
        class = "tbl_dl_err", exdent = c(0L, rep(2L, sum(!done)))
      )
    }
  }

  dat_tbls <- Map(new_src_tbl, fst_paths, as_col_cfg(x), as_tbl_cfg(x),
                  MoreArgs = list(prefix = x[["prefix"]], src_env = env))
  names(dat_tbls) <- tables

  list2env(dat_tbls, envir = env)
}

#' @export
setup_src_env.default <- function(x, ...) stop_generic(x, .Generic)

#' @param files File names of `fst` files that will be used to create a `prt`
#' object (see also [prt::new_prt()])
#' @param col_cfg Coerced to `col_cfg` by calling [as_col_cfg()]
#' @param tbl_cfg Coerced to `tbl_cfg` by calling [as_tbl_cfg()]
#' @param prefix Character vector valued data source name(s) (used as class
#' prefix)
#' @param src_env The data source environment (as `src_env` object)
#'
#' @rdname attach_src
#' @export
new_src_tbl <- function(files, col_cfg, tbl_cfg, prefix, src_env) {

  assert_that(is_src_env(src_env))

  res <- prt::new_prt(files)
  class(res) <- c(paste0(c(prefix, "src"), "_tbl"), class(res))

  attr(res, "col_cfg") <- as_col_cfg(col_cfg)
  attr(res, "tbl_cfg") <- as_tbl_cfg(tbl_cfg)
  attr(res, "src_env") <- src_env

  res
}

is_src_tbl <- is_type("src_tbl")

#' @export
id_vars.src_tbl <- function(x) {
  coalesce(id_vars(as_col_cfg(x)), id_vars(as_id_cfg(x)))
}

#' @export
index_var.src_tbl <- function(x) index_var(as_col_cfg(x))

#' @export
time_vars.src_tbl <- function(x) time_vars(as_col_cfg(x))

#' @importFrom tibble tbl_sum
#' @export
tbl_sum.src_tbl <- function(x) {

  out <- setNames(dim_brak(x), paste0("<", class(x)[2L], ">"))

  if (not_null(prt <- part_desc(x))) {
    out <- c(out, Partitions = prt)
  }

  ids <- id_var_opts(x)

  if (length(ids) > 1L && any(ids %in% colnames(x))) {
    out <- c(out, "ID options" = id_desc(x))
  }

  c(out, "Defaults" = def_desc(x), "Time vars" = tim_desc(x))
}

tim_desc <- function(x) {

  x <- time_vars(x)

  if (has_length(x)) {
    concat(x)
  } else {
    NULL
  }
}

def_desc <- function(x) {

  x <- as_col_cfg(x)

  res <- c(id = id_vars(x), index = index_var(x), value = val_var(x),
           unit = unit_var(x))

  if (has_length(res)) {
    paste0(res, " (", names(res), ")", collapse = ", ")
  } else {
    NULL
  }
}

id_desc <- function(x) {
  id <- sort(as_id_cfg(x))
  paste0(id_var_opts(id), " (", names(id), ")", id_cfg_op(id), collapse = " ")
}

part_desc <- function(x) {

  part_nrow <- prt::part_nrow(x)

  if (length(part_nrow) > 1L) {
    paste0("[", concat(chr_ply(part_nrow, big_mark)), "] rows")
  } else {
    NULL
  }
}

#' @export
src_name.src_tbl <- function(x) src_name(as_col_cfg(x))

#' @export
tbl_name.src_tbl <- function(x) tbl_name(as_col_cfg(x))

#' @rdname attach_src
#' @export
as_src_tbl <- function(x, ...) UseMethod("as_src_tbl", x)

#' @export
as_src_tbl.src_tbl <- function(x, ...) warn_dot_ident(x, ...)

#' @export
as_src_tbl.src_env <- function(x, tbl, ...) {

  warn_dots(...)

  assert_that(is.string(tbl))

  res <- get0(tbl, envir = x, inherits = FALSE, ifnotfound = NULL)

  if (is.null(res)) {

    opts <- ls(envir = x)

    stop_ricu(
      c("Table `{tbl}` not found for `{src_name(x)}`. Available are:",
        bullet(quote_bt(opts))),
      class = "src_tbl_not_found", exdent = c(0L, rep_along(2L, opts))
    )
  }

  res
}

#' @export
as_src_tbl.character <- function(x, env, ...) {
  as_src_tbl(as_src_env(env), x, ...)
}

#' @export
as_src_tbl.default <- function(x, ...) stop_generic(x, .Generic)

#' @rdname attach_src
#' @export
new_src_env <- function(x, env = new.env(parent = data_env())) {

  assert_that(is_src_cfg(x), is.environment(env))

  structure(env, class = paste0(c(x[["prefix"]], "src"), "_env"),
            src_name = src_name(x), id_cfg = as_id_cfg(x))
}

is_src_env <- is_type("src_env")

#' @export
print.src_env <- function(x, ...) {
  cat_line("<", class(x)[1L], "[", length(x), "]>")
  print(setNames(format(x), names(x)), quote = FALSE)
  invisible(x)
}

#' @export
format.src_env <- function(x, ...) {
  chr_ply(eapply(x, dim_brak), identity)
}

dim_desc <- function(x) {
  paste0(vapply(dim(x), big_mark, character(1L)),
         collapse = paste0(" ", symbol$cross, " "))
}

dim_brak <- function(x) paste0("[", dim_desc(x), "]")

#' @importFrom utils ls.str
#' @export
str.src_env <- function(object, ...) ls.str(object)

#' @export
src_name.src_env <- function(x) attr(x, "src_name")

#' @rdname attach_src
#' @export
as_src_env <- function(x) UseMethod("as_src_env", x)

#' @export
as_src_env.src_env <- function(x) x

#' @export
as_src_env.character <- function(x) {

  assert_that(is.string(x))

  env <- data_env()

  res <- get0(x, envir = env, mode = "environment", ifnotfound = NULL)

  if (is.null(res)) {
    stop_ricu("Source `{x}` not found in {format(env)}",
              class = "src_env_not_found")
  }

  res
}

#' @export
as_src_env.default <- function(x) as_src_env(src_name(x))

#' @export
as_src_env.src_tbl <- function(x) attr(x, "src_env")
septic-tank/ricu documentation built on Jan. 30, 2021, 8:40 p.m.