R/slice.R

Defines functions slice_sample_dt slice_min_dt slice_max_dt slice_tail_dt slice_head_dt slice_dt

Documented in slice_dt slice_head_dt slice_max_dt slice_min_dt slice_sample_dt slice_tail_dt

#' @title Subset rows using their positions
#' @description
#' `slice_dt()` lets you index rows by their (integer) locations. It allows you
#' to select, remove, and duplicate rows. It is accompanied by a number of
#' helpers for common use cases:
#'
#' * `slice_head_dt()` and `slice_tail_dt()` select the first or last rows.
#' * `slice_sample_dt()` randomly selects rows.
#' * `slice_min_dt()` and `slice_max_dt()` select rows with highest or lowest values
#'   of a variable.
#' @param .data A data.table
#' @param ... Provide either positive values to keep, or negative values to drop.
#'   The values provided must be either all positive or all negative.
#' @param by Slice by which group(s)?
#' @param n When larger than or equal to 1, the number of rows.
#'   When between 0 and 1, the proportion of rows to select.
#'
#' @param order_by Variable or function of variables to order by.
#'
#' @param with_ties Should ties be kept together? The default, `TRUE`,
#'   may return more rows than you request. Use `FALSE` to ignore ties,
#'   and return the first `n` rows.
#'
#' @param replace Should sampling be performed with (`TRUE`) or without
#'   (`FALSE`, the default) replacement.
#'
#' @return A data.table
#' @seealso \code{\link[dplyr]{slice}}
#' @examples
#'
#' a = iris
#' slice_dt(a,1,2)
#' slice_dt(a,2:3)
#' slice_dt(a,141:.N)
#' slice_dt(a,1,.N)
#' slice_head_dt(a,5)
#' slice_head_dt(a,0.1)
#' slice_tail_dt(a,5)
#' slice_tail_dt(a,0.1)
#' slice_max_dt(a,Sepal.Length,10)
#' slice_max_dt(a,Sepal.Length,10,with_ties = FALSE)
#' slice_min_dt(a,Sepal.Length,10)
#' slice_min_dt(a,Sepal.Length,10,with_ties = FALSE)
#' slice_sample_dt(a,10)
#' slice_sample_dt(a,0.1)
#'
#'
#' # use by to slice by group
#'
#' ## following codes get the same results
#' slice_dt(a,1:3,by = "Species")
#' slice_dt(a,1:3,by = Species)
#' slice_dt(a,1:3,by = .(Species))
#'
#' slice_head_dt(a,2,by = Species)
#' slice_tail_dt(a,2,by = Species)
#'
#' slice_max_dt(a,Sepal.Length,3,by = Species)
#' slice_max_dt(a,Sepal.Length,3,by = Species,with_ties = FALSE)
#' slice_min_dt(a,Sepal.Length,3,by = Species)
#' slice_min_dt(a,Sepal.Length,3,by = Species,with_ties = FALSE)
#'
#' # in `slice_sample_dt`, "by" could only take character class
#' slice_sample_dt(a,.1,by = "Species")
#' slice_sample_dt(a,3,by = "Species")
#' slice_sample_dt(a,51,replace = TRUE,by = "Species")
#'

#' @rdname slice
#' @export
slice_dt = function(.data,...,by = NULL){
  dt = as_dt(.data)
  substitute(by) %>% deparse() -> by_char
  if(by_char == "NULL") dt[eval(substitute(c(...)))]
  else
    eval(substitute(
    dt[,.SD[c(...)],by]
  ))
}


# slice_dt = function(.data,...){
#   as_dt(.data)[eval(substitute(c(...)))]
# }

#' @rdname slice
#' @export
slice_head_dt = function(.data,n,by = NULL){
  .data = as_dt(.data)
  substitute(by) %>% deparse() -> by_char
  if(by_char == "NULL"){
    if(abs(n) >= 1L) head(.data,n)
    else head(.data,nrow(.data)*n)
  }else{
    if(abs(n) >= 1L)
      eval(substitute(
        .data[,head(.SD,n),by]
      ))
    else
      eval(substitute(
        .data[,head(.SD,nrow(.SD)*n),by]
      ))
  }
}


# slice_head_dt = function(.data,n){
#   .data = as_dt(.data)
#   if(abs(n) >= 1L) head(.data,n)
#   else head(.data,nrow(.data)*n)
# }

#' @rdname slice
#' @export
slice_tail_dt = function(.data,n,by = NULL){
  .data = as_dt(.data)
  substitute(by) %>% deparse() -> by_char
  if(by_char == "NULL"){
    if(abs(n) >= 1L) tail(.data,n)
    else tail(.data,nrow(.data)*n)
  }else{
    if(abs(n) >= 1L)
      eval(substitute(
        .data[,tail(.SD,n),by]
      ))
    else
      eval(substitute(
        .data[,tail(.SD,nrow(.SD)*n),by]
      ))
  }
}

# slice_tail_dt = function(.data,n){
#   .data = as_dt(.data)
#   if(abs(n) >= 1L) tail(.data,n)
#   else tail(.data,nrow(.data)*n)
# }

#' @rdname slice
#' @export
slice_max_dt = function(.data,order_by,n,by = NULL,with_ties = TRUE){
  dt = as_dt(.data)
  if(with_ties) ties_method = "min" else ties_method = "first"
  wt = substitute(order_by) %>% deparse()
  by = substitute(by) %>% deparse()
  if(n <= 0) stop("Invalid input, n should take a positive value.")

  # if(by == "NULL"){
  #   eval(parse(text = str_glue(
  #     " dt[frank(-{wt},ties.method = ties_method,na.last = 'keep')<=
  #   min(.N,ifelse(n<1,.N*n,n))][order(-{wt})]"
  #   )))
  # } else{
  #   eval(parse(text = str_glue(
  #     " dt[,.SD[frank(-{wt},ties.method = ties_method,na.last = 'keep')<=
  #   min(.N,ifelse(n<1,.N*n,n))],by = {by}]"
  #   )))
  # }

  if(by == "NULL"){
    eval(parse(text = str_glue(
      " dt[frankv({wt},ties.method = ties_method,order=-1,na.last = 'keep')<=
    min(.N,ifelse(n<1,.N*n,n))][order(-{wt})]"
    )))
  } else{
    eval(parse(text = str_glue(
      " dt[,.SD[frankv({wt},ties.method = ties_method,order=-1,na.last = 'keep')<=
    min(.N,ifelse(n<1,.N*n,n))],by = {by}]"
    )))
  }
}

#' @rdname slice
#' @export
slice_min_dt = function(.data,order_by,n,by = NULL,with_ties = TRUE){
  dt = as_dt(.data)
  if(with_ties) ties_method = "min" else ties_method = "first"
  wt = substitute(order_by) %>% deparse()
  by = substitute(by) %>% deparse()
  if(n <= 0) stop("Invalid input, n should take a positive value.")
  if(by == "NULL"){
    eval(parse(text = str_glue(
      " dt[frank({wt},ties.method = ties_method,na.last = 'keep')<=
    min(.N,ifelse(n<1,.N*n,n))][order({wt})]"
    )))
  } else{
    eval(parse(text = str_glue(
      " dt[,.SD[frank({wt},ties.method = ties_method,na.last = 'keep')<=
    min(.N,ifelse(n<1,.N*n,n))],by = {by}]"
    )))
  }
}

#' @rdname slice
#' @export
slice_sample_dt = function(.data, n, replace = FALSE, by = NULL){
  if(n >= 1L) sample_n_dt(.data,size = n,replace = replace,by = by)
  else if(n > 0) sample_frac_dt(.data,size = n,replace = replace,by = by)
  else stop("Invalid input, n should take a positive value.")
}

Try the tidyfst package in your browser

Any scripts or data that you put into this service are public.

tidyfst documentation built on July 26, 2023, 5:20 p.m.