R/relop_natural_join.R

Defines functions ex_data_table_step.relop_natural_join

Documented in ex_data_table_step.relop_natural_join

#' Natural join.
#'
#' \code{data.table} based implementation.
#'
#' @inheritParams ex_data_table_step
#'
#' @examples
#'
#' d1 <- build_frame(
#'     "key", "val", "val1" |
#'       "a"  , 1  ,  10    |
#'       "b"  , 2  ,  11    |
#'       "c"  , 3  ,  12    )
#' d2 <- build_frame(
#'     "key", "val", "val2" |
#'       "a"  , 5  ,  13    |
#'       "b"  , 6  ,  14    |
#'       "d"  , 7  ,  15    )
#'
#' # key matching join
#' optree <- natural_join(local_td(d1), local_td(d2),
#'                        jointype = "FULL", by = 'key')
#' ex_data_table(optree)
#'
#' # full cross-product join
#' # (usually with jointype = "FULL", but "LEFT" is more
#' # compatible with rquery field merge semantics).
#' optree2 <- natural_join(local_td(d1), local_td(d2),
#'                         jointype = "LEFT", by = NULL)
#' ex_data_table(optree2)
#' # notice ALL non-"by" fields take coalese to left table.
#'
#' @export
ex_data_table_step.relop_natural_join <- function(optree,
                                             ...,
                                             tables = list(),
                                             source_usage = NULL,
                                             source_limit = NULL,
                                             env = parent.frame()) {
  force(env)
  wrapr::stop_if_dot_args(substitute(list(...)), "rqdatatable::ex_data_table_step.relop_natural_join")
  if(is.null(source_usage)) {
    source_usage <- columns_used(optree)
  }
  inputs <- lapply(optree$source,
                   function(si) {
                     ex_data_table_step(si,
                                   tables = tables,
                                   source_usage = source_usage,
                                   source_limit = source_limit,
                                   env = env)
                   })
  A <- inputs[[1]]
  B <- inputs[[2]]
  inputs <- NULL
  by <- optree$by
  common_non_key <- sort(setdiff(intersect(column_names(A), column_names(B)),
                                 by))
  new_non_key <- paste0("rquery_join_tmp_", common_non_key)
  if(length(common_non_key)>0) {
    data.table::setnames(B, old = common_non_key, new = new_non_key)
  }
  col_to_zap <- NULL
  ACOL <- NULL # don't look like an unbound ref
  BCOL <- NULL # don't look like an unbound ref
  if(length(by)<1) {
    # data.table deliberately does not accept empty by
    col_to_zap <- "requery_join_const"
    A[[col_to_zap]] <- 'a'
    B[[col_to_zap]] <- 'a'
    by <- col_to_zap
  }
  res <- if(optree$jointype=="INNER") {
    merge(A, B, by = by, all=FALSE, allow.cartesian=TRUE)
  } else if(optree$jointype=="LEFT") {
    merge(A, B, by = by, all.x=TRUE, allow.cartesian=TRUE)
  } else if(optree$jointype=="RIGHT") {
    merge(A, B, by = by, all.y=TRUE, allow.cartesian=TRUE)
  } else if(optree$jointype=="FULL") {
    merge(A, B, by = by, all=TRUE, allow.cartesian=TRUE)
  } else {
    stop(paste("jointype was", optree$jointype, " but should be one of INNER, LEFT, RIGHT, or FULL"))
  }
  # fix up common columns with rquery coallesce rules
  for(i in seq_len(length(common_non_key))) {
    res[[common_non_key[[i]]]] <- ifelse(is.na(res[[common_non_key[[i]]]]),
                                         res[[new_non_key[[i]]]],
                                         res[[common_non_key[[i]]]])
    res[, (new_non_key[[i]]) := NULL]
  }
  if(!is.null(col_to_zap)) {
    res[, (col_to_zap) := NULL]
  }
  res[]
}

Try the rqdatatable package in your browser

Any scripts or data that you put into this service are public.

rqdatatable documentation built on Aug. 21, 2023, 9:10 a.m.