tidydt0: Tidy Verbs for `data.table`

Description Usage Arguments Value Examples

let adds new variables or modify existing variables. let_if make the same thing conditionally. take aggregates data or select subset of the data by rows or columns. Both functions return data.table.

Add new variables: let(mtcars, new_var = 42, new_var2 = new_var*hp)
Filter data: take_if(mtcars, am==0)
Select variables: take(mtcars, am, vs, mpg)
Aggregate data: take(mtcars, mean_mpg = mean(mpg), by = am)
Aggregate all non-grouping columns: take(mtcars, fun = mean, by = am)

let_if(
  data,
  i,
  ...,
  by,
  keyby,
  with = TRUE,
  nomatch = getOption("datatable.nomatch"),
  mult = "all",
  roll = FALSE,
  rollends = if (roll == "nearest") c(TRUE, TRUE) else if (roll >= 0) c(FALSE, TRUE)
    else c(TRUE, FALSE),
  which = FALSE,
  .SDcols,
  verbose = getOption("datatable.verbose"),
  allow.cartesian = getOption("datatable.allow.cartesian"),
  drop = NULL,
  on = NULL
)

take_if(
  data,
  i,
  ...,
  by,
  keyby,
  with = TRUE,
  nomatch = getOption("datatable.nomatch"),
  mult = "all",
  roll = FALSE,
  rollends = if (roll == "nearest") c(TRUE, TRUE) else if (roll >= 0) c(FALSE, TRUE)
    else c(TRUE, FALSE),
  which = FALSE,
  .SDcols,
  verbose = getOption("datatable.verbose"),
  allow.cartesian = getOption("datatable.allow.cartesian"),
  drop = NULL,
  on = NULL,
  autoname = TRUE,
  fun = NULL
)

take(
  data,
  ...,
  by,
  keyby,
  with = TRUE,
  nomatch = getOption("datatable.nomatch"),
  mult = "all",
  roll = FALSE,
  rollends = if (roll == "nearest") c(TRUE, TRUE) else if (roll >= 0) c(FALSE, TRUE)
    else c(TRUE, FALSE),
  which = FALSE,
  .SDcols,
  verbose = getOption("datatable.verbose"),
  allow.cartesian = getOption("datatable.allow.cartesian"),
  drop = NULL,
  on = NULL,
  autoname = TRUE,
  fun = NULL
)

let(
  data,
  ...,
  by,
  keyby,
  with = TRUE,
  nomatch = getOption("datatable.nomatch"),
  mult = "all",
  roll = FALSE,
  rollends = if (roll == "nearest") c(TRUE, TRUE) else if (roll >= 0) c(FALSE, TRUE)
    else c(TRUE, FALSE),
  which = FALSE,
  .SDcols,
  verbose = getOption("datatable.verbose"),
  allow.cartesian = getOption("datatable.allow.cartesian"),
  drop = NULL,
  on = NULL
)

## Default S3 method:
let(
  data,
  ...,
  by,
  keyby,
  with = TRUE,
  nomatch = getOption("datatable.nomatch"),
  mult = "all",
  roll = FALSE,
  rollends = if (roll == "nearest") c(TRUE, TRUE) else if (roll >= 0) c(FALSE, TRUE)
    else c(TRUE, FALSE),
  which = FALSE,
  .SDcols,
  verbose = getOption("datatable.verbose"),
  allow.cartesian = getOption("datatable.allow.cartesian"),
  drop = NULL,
  on = NULL
)

sort_by(data, ..., na.last = FALSE)

`data`	data.table/data.frame data.frame will be automatically converted to data.table. `let` modify data.table object in-place.
`i`	integer/logical vector. Supposed to use to subset/conditional modifications of `data`. For details see data.table
`...`	List of variables or name-value pairs of summary/modifications functions. The name will be the name of the variable in the result. In the `let` and `take` functions we can use `a = b` or `a := b` notation. Advantages of `:=` is parametric assignment, e. g. `(a) := 2` create variable with name which are stored in `a`. In `let` `:=` can be used for multiassignment (`c("a", "b") := list(1,2)`)
`by`	unquoted name of grouping variable of list of unquoted names of grouping variables. For details see data.table
`keyby`	Same as `by`, but with an additional `setkey()` run on the by columns of the result, for convenience. It is common practice to use 'keyby=' routinely when you wish the result to be sorted. For details see data.table.
`with`	logical. For details see data.table.
`nomatch`	Same as nomatch in match. For details see data.table.
`mult`	For details see data.table.
`roll`	For details see data.table.
`rollends`	For details see data.table.
`which`	For details see data.table.
`.SDcols`	Specifies the columns of x to be included in the special symbol .SD which stands for Subset of data.table. May be character column names or numeric positions. For details see data.table.
`verbose`	logical. For details see data.table.
`allow.cartesian`	For details see data.table.
`drop`	For details see data.table.
`on`	For details see data.table.
`autoname`	logical. TRUE by default. Should we create names for unnamed expressions in `take`?
`fun`	function which will be applied to all variables in `take`. If there are no variables in `take` then it will be applied to all non-grouping variables in the `data`.
`na.last`	logical. FALSE by default. If TRUE, missing values in the data are put last; if FALSE, they are put first.

data.table. let returns its result invisibly.

# examples form 'dplyr' package
data(mtcars)

# Newly created variables are available immediately
mtcars %>%
    let(
        cyl2 = cyl * 2,
        cyl4 = cyl2 * 2
    ) %>% head()

# You can also use let() to remove variables and
# modify existing variables
mtcars %>%
    let(
        mpg = NULL,
        disp = disp * 0.0163871 # convert to litres
    ) %>% head()


# window functions are useful for grouped computations
mtcars %>%
    let(rank = rank(-mpg, ties.method = "min"),
        by = cyl) %>%
    head()

# You can drop variables by setting them to NULL
mtcars %>% let(cyl = NULL) %>% head()

# keeps all existing variables
mtcars %>%
    let(displ_l = disp / 61.0237) %>%
    head()

# keeps only the variables you create
mtcars %>%
    take(displ_l = disp / 61.0237)


# can refer to both contextual variables and variable names:
var = 100
mtcars %>%
    let(cyl = cyl * var) %>%
    head()

# filter by condition
mtcars %>%
    take_if(am==0)

# filter by compound condition
mtcars %>%
    take_if(am==0 & mpg>mean(mpg))


# A 'take' with summary functions applied without 'by' argument returns an aggregated data
mtcars %>%
    take(mean = mean(disp), n = .N)

# Usually, you'll want to group first
mtcars %>%
    take(mean = mean(disp), n = .N, by = cyl)

# You can group by expressions:
mtcars %>%
    take(fun = mean, by = list(vsam = vs + am))


# parametric evaluation:
var = quote(mean(cyl))
mtcars %>%
    let(mean_cyl = eval(var)) %>%
    head()
take(mtcars, eval(var))

# all together
new_var = "mean_cyl"
mtcars %>%
    let((new_var) := eval(var)) %>%
    head()
take(mtcars, (new_var) := eval(var))

########################################

# examples from data.table
dat = data.table(
    x=rep(c("b","a","c"), each=3),
    y=c(1,3,6),
    v=1:9
)

# basic row subset operations
take_if(dat, 2)                         # 2nd row
take_if(dat, 3:2)                       # 3rd and 2nd row
take_if(dat, order(x))                  # no need for order(dat$x)
take_if(dat, y>2)                       # all rows where dat$y > 2
take_if(dat, y>2 & v>5)                 # compound logical expressions
take_if(dat, !2:4)                      # all rows other than 2:4
take_if(dat, -(2:4))                    # same

# select|compute columns
take(dat, v)                  # v column (as data.table)
take(dat, sum(v))             # return data.table with sum of v (column autonamed 'sum(v)')
take(dat, sv = sum(v))        # same, but column named "sv"
take(dat, v, v*2)             # return two column data.table, v and v*2

# subset rows and select|compute
take_if(dat, 2:3, sum(v))      # sum(v) over rows 2 and 3
take_if(dat, 2:3, sv = sum(v)) # same, but return data.table with column sv

# grouping operations
take(dat, sum(v), by = x)             # ad hoc by, order of groups preserved in result
take(dat, sum(v), keyby = x)          # same, but order the result on by cols


# all together now
take_if(dat, x!="a", sum(v), by=x)                       # get sum(v) by "x" for each x != "a"
take_if(dat, c("b", "c"), sum(v), by = .EACHI, on="x")   # same

# more on special symbols, see also ?"data.table::special-symbols"
take_if(dat, .N)                           # last row
take(dat, .N)                              # total number of rows in DT
take(dat, .N, by=x)                        # number of rows in each group

take(dat, .I[1], by=x)                     # row number in DT corresponding to each group


# add/update/delete by reference
# [] at the end of expression is for autoprinting
let(dat, grp = .GRP, by=x)[]          # add a group counter column
let(dat, z = 42L)[]                   # add new column by reference
let(dat, z = NULL)[]                  # remove column by reference
let_if(dat, x=="a", v = 42L)[]        # subassign to existing v column by reference
let_if(dat, x=="b", v2 = 84L)[]       # subassign to new column by reference (NA padded)

let(dat, m = mean(v), by=x)[]         # add new column by reference by group

# advanced usage
dat = data.table(x=rep(c("b","a","c"), each=3),
                 v=c(1,1,1,2,2,1,1,2,2),
                 y=c(1,3,6),
                 a=1:9,
                 b=9:1)

take(dat, sum(v), by=list(y%%2))              # expressions in by
take(dat, sum(v), by=list(bool = y%%2))       # same, using a named list to change by column name
take(dat, fun = sum, by=x)                    # sum of all (other) columns for each group
take(dat,
     MySum=sum(v),
     MyMin=min(v),
     MyMax=max(v),
     by = list(x, y%%2)               # by 2 expressions
)

take(dat, seq = min(a):max(b), by=x)  # j is not limited to just aggregations
dat %>%
    take(V1 = sum(v), by=x) %>%
    take_if(V1<20)                    # compound query

dat %>%
    take(V1 = sum(v), by=x) %>%
    sort_by(-V1) %>%                  # ordering results
    head()