R/query.R

Defines functions query.data.frame query_if.data.frame query query_if

Documented in query query_if

#' One-to-one interface for data.table '[' method
#'
#' Quote from [data.table][data.table::data.table]:
#' ```
#' query(data, j,  by) # + extra arguments
#'             |   |
#'             |    -------> grouped by what?
#'              -------> what to do?
#' ```
#' or,
#' ```
#' query_if(data, i,  j,  by) # + extra arguments
#'                |   |   |
#'                |   |    -------> grouped by what?
#'                |    -------> what to do?
#'                 ---> on which rows?
#' ```
#' If you don't need '`i`' argument, use '`query`'. In this case you can
#' avoid printing leading comma inside brackets to denote empty '`i`'.
#'
#' @param data data.table/data.frame data.frame will be automatically converted
#'   to data.table.
#' @param i Integer, logical or character vector, single column numeric matrix,
#'   expression of column names, list, data.frame or data.table. integer and
#'   logical vectors work the same way they do in [.data.frame except logical
#'   NAs are treated as FALSE. expression is evaluated within the frame of the
#'   data.table (i.e. it sees column names as if they are variables) and can
#'   evaluate to any of the other types. For details see
#'   [data.table][data.table::data.table]
#' @param j When with=TRUE (default), j is evaluated within the frame of the
#'   data.table; i.e., it sees column names as if they are variables. This
#'   allows to not just select columns in j, but also compute on them e.g., `x[,a]`
#'   and `x[, sum(a)]` returns `x$a` and `sum(x$a)` as a vector respectively.
#'   `x[, .(a, b)]` and `x[, .(sa=sum(a), sb=sum(b))]` returns a two column data.table
#'   each, the first simply selecting columns a, b and the second computing
#'   their sums. For details see [data.table][data.table::data.table].
#' @param by unquoted name of grouping variable of list of unquoted names of
#'   grouping variables. For details see [data.table][data.table::data.table]
#' @param keyby Same as by, but with an additional `setkey()` run on the by
#'   columns of the result, for convenience. It is common practice to use
#'   'keyby=' routinely when you wish the result to be sorted. For details see
#'   [data.table][data.table::data.table]
#' @param with logical. For details see [data.table][data.table::data.table].
#' @param nomatch Same as nomatch in match. For details see
#'   [data.table][data.table::data.table].
#' @param mult For details see [data.table][data.table::data.table].
#' @param roll For details see [data.table][data.table::data.table].
#' @param rollends For details see [data.table][data.table::data.table].
#' @param which For details see [data.table][data.table::data.table].
#' @param .SDcols Specifies the columns of x to be included in the special
#'   symbol .SD which stands for Subset of data.table. May be character column
#'   names or numeric positions. For details see [data.table][data.table::data.table].
#' @param verbose logical. For details see [data.table][data.table::data.table].
#' @param allow.cartesian For details see [data.table][data.table::data.table].
#' @param drop For details see [data.table][data.table::data.table].
#' @param on For details see [data.table][data.table::data.table].
#'
#' @return It depends. For details see [data.table][data.table::data.table].
#' @export
#'
#' @examples
#' \donttest{
#' # examples from data.table
#' dat = data.table(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9)
#' dat
#' # basic row subset operations
#' query_if(dat, 2)                        # 2nd row
#' query_if(dat, 3:2)                        # 3rd and 2nd row
#' query_if(dat, order(x))                   # no need for order(dat$x)
#' query_if(dat, y>2)                        # all rows where dat$y > 2
#' query_if(dat, y>2 & v>5)                  # compound logical expressions
#' query_if(dat, !2:4)                       # all rows other than 2:4
#' query_if(dat, -(2:4))                     # same
#'
#' # select|compute columns data.table way
#' query(dat, v)                        # v column (as vector)
#' query(dat, list(v))                  # v column (as data.table)
#' query(dat, sum(v))                   # sum of column v, returned as vector
#' query(dat, list(sum(v)))             # same, but return data.table (column autonamed V1)
#' query(dat, list(v, v*2))             # return two column data.table, v and v*2
#'
#' # subset rows and select|compute data.table way
#' query_if(dat, 2:3, sum(v))                # sum(v) over rows 2 and 3, return vector
#' query_if(dat, 2:3, list(sum(v)))             # same, but return data.table with column V1
#' query_if(dat, 2:3, list(sv=sum(v)))          # same, but return data.table with column sv
#' query_if(dat, 2:5, cat(v, "\n"))          # just for j's side effect
#'
#' # select columns the data.frame way
#' query(dat, 2, with=FALSE)            # 2nd column, returns a data.table always
#' colNum = 2
#' query(dat, colNum, with=FALSE)       # same, equivalent to DT[, .SD, .SDcols=colNum]
#'
#' # grouping operations - j and by
#' query(dat, sum(v), by=x)             # ad hoc by, order of groups preserved in result
#' query(dat, sum(v), keyby=x)          # same, but order the result on by cols
#' query(dat, sum(v), by=x) %>%
#'     query_if(order(x))               # same but by chaining expressions together
#'
#' # fast ad hoc row subsets (subsets as joins)
#' # same as x == "a" but uses binary search (fast)
#' query_if(dat, "a", on="x")
#' # same, for convenience, no need to quote every column
#' query_if(dat, "a", on=list(x))
#' query_if(dat, .("a"), on="x")                          # same
#' # same, single "==" internally optimised to use binary search (fast)
#' query_if(dat, x=="a")
#' # not yet optimized, currently vector scan subset
#' query_if(dat, x!="b" | y!=3)
#' # join on columns x,y of 'dat'; uses binary search (fast)
#' query_if(dat, .("b", 3), on=c("x", "y"))
#' query_if(dat, .("b", 3), on=list(x, y))                # same, but using on=list()
#' query_if(dat, .("b", 1:2), on=c("x", "y"))             # no match returns NA
#' query_if(dat, .("b", 1:2), on=.(x, y), nomatch=0)      # no match row is not returned
#' # locf, nomatch row gets rolled by previous row
#' query_if(dat, .("b", 1:2), on=c("x", "y"), roll=Inf)
#' query_if(dat, .("b", 1:2), on=.(x, y), roll=-Inf)      # nocb, nomatch row gets rolled by next row
#' # on rows where dat$x=="b", calculate sum(v*y)
#' query_if(dat, "b", sum(v*y), on="x")
#'
#' # all together now
#' query_if(dat, x!="a", sum(v), by=x)                    # get sum(v) by "x" for each i != "a"
#' query_if(dat, !"a", sum(v), by=.EACHI, on="x")         # same, but using subsets-as-joins
#' query_if(dat, c("b","c"), sum(v), by=.EACHI, on="x")   # same
#' query_if(dat, c("b","c"), sum(v), by=.EACHI, on=.(x))  # same, using on=.()
#'
#' # joins as subsets
#' X = data.table(x=c("c","b"), v=8:7, foo=c(4,2))
#' X
#'
#' query_if(dat, X, on="x")                         # right join
#' query_if(X, dat, on="x")                         # left join
#' query_if(dat, X, on="x", nomatch=0)              # inner join
#' query_if(dat, !X, on="x")                        # not join
#' # join using column "y" of 'dat' with column "v" of X
#' query_if(dat, X, on=c(y="v"))
#' query_if(dat,X, on="y==v")                       # same as above (v1.9.8+)
#'
#' query_if(dat, X, on = .(y<=foo))                 # NEW non-equi join (v1.9.8+)
#' query_if(dat, X, on="y<=foo")                    # same as above
#' query_if(dat, X, on=c("y<=foo"))                 # same as above
#' query_if(dat, X, on=.(y>=foo))                   # NEW non-equi join (v1.9.8+)
#' query_if(dat, X, on=.(x, y<=foo))                # NEW non-equi join (v1.9.8+)
#' query_if(dat, X, .(x,y,x.y,v), on=.(x, y>=foo))  # Select x's join columns as well
#'
#' query_if(dat, X, on="x", mult="first")           # first row of each group
#' query_if(dat, X, on="x", mult="last")            # last row of each group
#' query_if(dat, X, sum(v), by=.EACHI, on="x")      # join and eval j for each row in i
#' query_if(dat, X, sum(v)*foo, by=.EACHI, on="x")  # join inherited scope
#' query_if(dat, X, sum(v)*i.v, by=.EACHI, on="x")  # 'i,v' refers to X's v column
#' query_if(dat, X, on=.(x, v>=v), sum(y)*foo, by=.EACHI) # NEW non-equi join with by=.EACHI (v1.9.8+)
#'
#'
#' # more on special symbols, see also ?"special-symbols"
#' query_if(dat, .N)                           # last row
#' query(dat, .N)                              # total number of rows in DT
#' query(dat, .N, by=x)                        # number of rows in each group
#' query(dat, .SD, .SDcols=x:y)                # select columns 'x' and 'y'
#' query(dat, .SD[1])                          # first row of all columns
#' query(dat, .SD[1], by=x)                    # first row of 'y' and 'v' for each group in 'x'
#' query(dat, c(.N, lapply(.SD, sum)), by=x)   # get rows *and* sum columns 'v' and 'y' by group
#' query(dat, .I[1], by=x)                     # row number in DT corresponding to each group
#' query(dat, grp := .GRP, by=x) %>% head()    # add a group counter column
#' query(X, query_if(dat, .BY, y, on="x"), by=x)               # join within each group
#'
#' # add/update/delete by reference (see ?assign)
#' query(dat, z:=42L) %>% head()         # add new column by reference
#' query(dat, z:=NULL) %>% head()        # remove column by reference
#' query_if(dat, "a", v:=42L, on="x") %>% head()  # subassign to existing v column by reference
#' query_if(dat, "b", v2:=84L, on="x") %>% head() # subassign to new column by reference (NA padded)
#'
#' # NB: postfix [] is shortcut to print()
#' query(dat, m:=mean(v), by=x)[]              # add new column by reference by group
#'
#' # advanced usage
#' dat = data.table(x=rep(c("b","a","c"),each=3),
#'                  v=c(1,1,1,2,2,1,1,2,2),
#'                  y=c(1,3,6),
#'                  a=1:9,
#'                  b=9:1)
#' dat
#' query(dat, sum(v), by=.(y%%2))              # expressions in by
#' query(dat, sum(v), by=.(bool = y%%2))       # same, using a named list to change by column name
#' query(dat, .SD[2], by=x)                    # get 2nd row of each group
#' query(dat, tail(.SD,2), by=x)               # last 2 rows of each group
#' query(dat, lapply(.SD, sum), by=x)          # sum of all (other) columns for each group
#' query(dat, .SD[which.min(v)], by=x)         # nested query by group
#'
#' query(dat, list(MySum=sum(v),
#'                 MyMin=min(v),
#'                 MyMax=max(v)),
#'       by=.(x, y%%2)
#' )                    # by 2 expressions
#'
#' query(dat, .(a = .(a), b = .(b)), by=x)      # list columns
#' query(dat, .(seq = min(a):max(b)), by=x)     # j is not limited to just aggregations
#' query(dat, sum(v), by=x) %>%
#'     query_if(V1<20) # compound query
#' query(dat, sum(v), by=x) %>%
#'     setorder(-V1) %>%
#'     head()          # ordering results
#' query(dat, c(.N, lapply(.SD,sum)), by=x)     # get number of observations and sum per group
#'
#' # anonymous lambda in 'j', j accepts any valid
#' # expression. TO REMEMBER: every element of
#' # the list becomes a column in result.
#' query(dat,
#'       {tmp = mean(y);
#'       .(a = a-tmp, b = b-tmp)
#'       },
#'       by=x)
#'
#' # using rleid, get max(y) and min of all cols in .SDcols for each consecutive run of 'v'
#' query(dat,
#'       c(.(y=max(y)), lapply(.SD, min)),
#'       by=rleid(v),
#'       .SDcols=v:b
#' )
#' }
#' \dontrun{
#'     pdf("new.pdf")
#'     query(dat, plot(a,b), by=x)                # can also plot in 'j'
#'     dev.off()
#' }
query_if = function(data,
                    i,
                    j,
                    by,
                    keyby,
                    with = TRUE,
                    nomatch = getOption("datatable.nomatch"),
                    mult = "all",
                    roll = FALSE,
                    rollends = if (roll=="nearest") c(TRUE,TRUE)
                    else if (roll>=0) c(FALSE,TRUE)
                    else c(TRUE,FALSE),
                    which = FALSE,
                    .SDcols,
                    verbose = getOption("datatable.verbose"),                   # default: FALSE
                    allow.cartesian = getOption("datatable.allow.cartesian"),   # default: FALSE
                    drop = NULL,
                    on = NULL){
    UseMethod("query_if")

}

#' @rdname query_if
#' @export
query = function(data,
                 j,
                 by,
                 keyby,
                 with = TRUE,
                 nomatch = getOption("datatable.nomatch"),
                 mult = "all",
                 roll = FALSE,
                 rollends = if (roll=="nearest") c(TRUE,TRUE)
                 else if (roll>=0) c(FALSE,TRUE)
                 else c(TRUE,FALSE),
                 which = FALSE,
                 .SDcols,
                 verbose = getOption("datatable.verbose"),                   # default: FALSE
                 allow.cartesian = getOption("datatable.allow.cartesian"),   # default: FALSE
                 drop = NULL,
                 on = NULL){
    UseMethod("query")
}


#' @export
query_if.data.frame = function(data,
                    i,
                    j,
                    by,
                    keyby,
                    with = TRUE,
                    nomatch = getOption("datatable.nomatch"),
                    mult = "all",
                    roll = FALSE,
                    rollends = if (roll=="nearest") c(TRUE,TRUE)
                    else if (roll>=0) c(FALSE,TRUE)
                    else c(TRUE,FALSE),
                    which = FALSE,
                    .SDcols,
                    verbose = getOption("datatable.verbose"),                   # default: FALSE
                    allow.cartesian = getOption("datatable.allow.cartesian"),   # default: FALSE
                    drop = NULL,
                    on = NULL){
    call_expr = sys.call()
    call_expr[[1]] = as.symbol("[")
    eval_in_parent_frame(data, call_expr, frame = parent.frame())
}


#' @export
query.data.frame = function(data,
                 j,
                 by,
                 keyby,
                 with = TRUE,
                 nomatch = getOption("datatable.nomatch"),
                 mult = "all",
                 roll = FALSE,
                 rollends = if (roll=="nearest") c(TRUE,TRUE)
                 else if (roll>=0) c(FALSE,TRUE)
                 else c(TRUE,FALSE),
                 which = FALSE,
                 .SDcols,
                 verbose = getOption("datatable.verbose"),                   # default: FALSE
                 allow.cartesian = getOption("datatable.allow.cartesian"),   # default: FALSE
                 drop = NULL,
                 on = NULL){
    call_expr = as.list(sys.call())
    call_expr[[1]] = quote(maditr::query_if)
    # insert empty i
    call_expr =  as.call(c(call_expr[1:2], list(substitute()), call_expr[-(1:2)]))
    call_expr[[1]] = as.symbol("[")
    eval_in_parent_frame(data, call_expr, frame = parent.frame())
}
gdemin/maditr documentation built on April 12, 2024, 10 p.m.