R/step-first.R

Defines functions dt_has_computation.dtplyr_step_first dt_sources.dtplyr_step_first dt_call.dtplyr_step_first step_first dim.dtplyr_step_first lazy_dt

Documented in lazy_dt

#' Create a "lazy" data.table for use with dplyr verbs
#'
#' @description
#' A lazy data.table lazy captures the intent of dplyr verbs, only actually
#' performing computation when requested (with [collect()], [pull()],
#' [as.data.frame()], [data.table::as.data.table()], or [tibble::as_tibble()]).
#' This allows dtplyr to convert dplyr verbs into as few data.table expressions
#' as possible, which leads to a high performance translation.
#'
#' See `vignette("translation")` for the details of the translation.
#'
#' @param x A data table (or something can can be coerced to a data table).
#' @param immutable If `TRUE`, `x` is treated as immutable and will never
#'   be modified by any code generated by dtplyr. Alternatively, you can set
#'   `immutable = FALSE` to allow dtplyr to modify the input object.
#' @param name Optionally, supply a name to be used in generated expressions.
#'   For expert use only.
#' @param key_by Set keys for data frame, using [select()] semantics (e.g.
#'   `key_by = c(key1, key2)`.
#'
#'   This uses [data.table::setkey()] to sort the table and build an index.
#'   This will considerably improve performance for subsets, summaries, and
#'   joins that use the keys.
#'
#'   See `vignette("datatable-keys-fast-subset")` for more details.
#' @export
#' @aliases tbl_dt grouped_dt
#' @examples
#' library(dplyr, warn.conflicts = FALSE)
#'
#' # If you have a data.table, using it with any dplyr generic will
#' # automatically convert it to a lazy_dt object
#' dt <- data.table::data.table(x = 1:10, y = 10:1)
#' dt %>% filter(x == y)
#' dt %>% mutate(z = x + y)
#'
#' # Note that dtplyr will avoid mutating the input data.table, so the
#' # previous translation includes an automatic copy(). You can avoid this
#' # with a manual call to lazy_dt()
#' dt %>%
#'   lazy_dt(immutable = FALSE) %>%
#'   mutate(z = x + y)
#'
#' # If you have a data frame, you can use lazy_dt() to convert it to
#' # a data.table:
#' mtcars2 <- lazy_dt(mtcars)
#' mtcars2
#' mtcars2 %>% select(mpg:cyl)
#' mtcars2 %>% select(x = mpg, y = cyl)
#' mtcars2 %>% filter(cyl == 4) %>% select(mpg)
#' mtcars2 %>% select(mpg, cyl) %>% filter(cyl == 4)
#' mtcars2 %>% mutate(cyl2 = cyl * 2, cyl4 = cyl2 * 2)
#' mtcars2 %>% transmute(cyl2 = cyl * 2, vs2 = vs * 2)
#' mtcars2 %>% filter(cyl == 8) %>% mutate(cyl2 = cyl * 2)
#'
#' # Learn more about translation in vignette("translation")
#' by_cyl <- mtcars2 %>% group_by(cyl)
#' by_cyl %>% summarise(mpg = mean(mpg))
#' by_cyl %>% mutate(mpg = mean(mpg))
#' by_cyl %>%
#'   filter(mpg < mean(mpg)) %>%
#'   summarise(hp = mean(hp))
lazy_dt <- function(x, name = NULL, immutable = TRUE, key_by = NULL) {
  # in case `x` has an `as.data.table()` method but not a `group_vars()` method 
  groups <- tryCatch(group_vars(x), error = function(e) character())

  if (!is.data.table(x)) {
    if (!immutable) {
      abort("`immutable` must be `TRUE` when `x` is not already a data table.")
    }
    x <- as.data.table(x)
    copied <- TRUE
  } else {
    copied <- FALSE
  }

  key_by <- enquo(key_by)
  key_vars <- unname(tidyselect::vars_select(names(x), !!key_by))
  if (length(key_vars)) {
    if (immutable && !copied) {
      x <- data.table::copy(x)
    }
    data.table::setkeyv(x, key_vars)
  }

  step_first(x, name = name, groups = groups, immutable = immutable, env = caller_env())
}

#' @export
dim.dtplyr_step_first <- function(x) {
  dim(x$parent)
}

step_first <- function(parent, name = NULL, groups = character(),
                       immutable = TRUE, env = caller_env()) {
  stopifnot(is.data.table(parent))

  if (is.null(name)) {
    name <- unique_name()
  }

  new_step(parent,
    vars = names(parent),
    groups = groups,
    locals = list(),
    implicit_copy = !immutable,
    needs_copy = FALSE,
    name = sym(name),
    env = env,
    class = "dtplyr_step_first"
  )
}

#' @export
dt_call.dtplyr_step_first <- function(x, needs_copy = FALSE) {
  if (needs_copy) {
    expr(copy(!!x$name))
  } else {
    x$name
  }
}

#' @export
dt_sources.dtplyr_step_first <- function(x) {
  stats::setNames(list(x$parent), as.character(x$name))
}

#' @export
dt_has_computation.dtplyr_step_first <- function(x) {
  FALSE
}

unique_name <- local({
  i <- 0
  function() {
    i <<- i + 1
    paste0("_DT", i)
  }
})

Try the dtplyr package in your browser

Any scripts or data that you put into this service are public.

dtplyr documentation built on March 31, 2023, 9:13 p.m.