R/merge.R

Defines functions is_pristine extract_on_validator merge_i_dbi_table merge.dbi.table

Documented in merge.dbi.table

#' @name merge
#'
#' @aliases merge.dbi.table
#'
#' @title Merge two dbi.tables
#'
#' @description
#'   Merge two \code{\link{dbi.table}}s. By default, the columns to merge on are
#'   determined by the first of the following cases to apply.
#'
#'   \enumerate{
#'     \item If \code{x} and \code{y} are each unmodified \code{dbi.table}s in
#'           the same \code{dbi.catalog} and if there is a single foreign key
#'           relating \code{x} and \code{y} (either \code{x} referencing
#'           \code{y}, or \code{y} referencing \code{x}), then it is used to set
#'           \code{by.x} and \code{by.y}.
#'
#'     \item If \code{x} and \code{y} have shared key columns, then they are
#'           used to set \code{by} (that is,
#'           \code{by = intersect(key(x), key(y))} when
#'           \code{intersect(key(x), key(y))} has length greater than zero).
#'
#'     \item If \code{x} has a key, then it is used to set \code{by} (that is,
#'           \code{by = key(x)} when \code{key(x)} has length greater than
#'           zero).
#'
#'     \item If \code{x} and \code{y} have columns in common, then they are used
#'           to set
#'           \code{by} (that is, \code{by = intersect(names(x), names(y))} when
#'           \code{intersect(names(x), names(y))} has length greater than zero).
#'   }
#'
#'   Use the \code{by}, \code{by.x}, and \code{by.y} arguments explicitly to
#'   override this default.
#'
#' @param x,y
#'   \code{\link{dbi.table}}s sharing the same DBI connection. If \code{y} is
#'   not a \code{dbi.table} but does inherit from \code{data.frame}, then it is
#'   coerced to a \code{dbi.table} using \code{\link{as.dbi.table}}. If \code{y}
#'   is missing, a merge is performed for each of \code{x}'s foreign keys.
#'
#' @param by
#'   a character vector of shared column names in \code{x} and \code{y} to merge
#'   on.
#'
#' @param by.x,by.y
#'   character vectors of column names in \code{x} and \code{y} to merge on.
#'
#' @param all
#'   a logical value. \code{all = TRUE} is shorthand to save setting both
#'   \code{all.x = TRUE} and \code{all.y = TRUE}.
#'
#' @param all.x
#'   a logical value. When \code{TRUE}, rows from \code{x} that do not have a
#'   matching row in \code{y} are included. These rows will have \code{NA}s in
#'   the columns that are filled with values from \code{y}. The default is
#'   \code{FALSE} so that only rows with data from both \code{x} and \code{y}
#'   are included in the output.
#'
#' @param all.y
#'   a logical value. Analogous to \code{all.x} above.
#'
#' @param sort
#'   a logical value. When TRUE (default), the key of the merged
#'   \code{dbi.table} is set to the \code{by} / \code{by.x} columns.
#'
#' @param suffixes
#'   a length-2 character vector. The suffixes to be used for making
#'   non-\code{by} column names unique. The suffix behavior works in a similar
#'   fashion to the \code{\link[base]{merge.data.frame}} method.
#'
#' @param no.dups
#'   a logical value. When \code{TRUE}, suffixes are also appended to
#'   non-\code{by.y} column names in \code{y} when they have the same column
#'   name as any \code{by.x}.
#'
#' @param recursive
#'   a logical value. Only used when \code{y} is missing. When \code{TRUE},
#'   \code{merge} is called on each \code{dbi.table} prior to merging with
#'   \code{x}. See examples.
#'
#' @param \dots
#'   additional arguments are passed to \code{\link{as.dbi.table}}.
#'
#' @return
#'   a \code{\link{dbi.table}}.
#'
#' @details
#'   \code{merge.dbi.table} uses \code{\link{sql.join}} to join \code{x} and
#'   \code{y} then formats the result set to match the typical \code{merge}
#'   output.
#'
#' @seealso
#'   \code{\link[data.table]{merge.data.table}},
#'   \code{\link[base]{merge.data.frame}}
#'
#' @examples
#'   chinook <- dbi.catalog(chinook.duckdb)
#'
#'   #The Album table has a foreign key constriant that references Artist
#'   merge(chinook$main$Album, chinook$main$Artist)
#'
#'   #When y is omitted, x's foreign key relationship is used to determine y
#'   merge(chinook$main$Album)
#'
#'   #Track has 3 foreign keys: merge with Album, Genre, and MediaType
#'   merge(chinook$main$Track)
#'
#'   #Track references Album but not Artist, Album references Artist
#'   #This dbi.table includes the artist name
#'   merge(chinook$main$Track, recursive = TRUE)
#'
#' @export
merge.dbi.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL,
                            all = FALSE, all.x = all, all.y = all,
                            sort = TRUE, suffixes = c(".x", ".y"),
                            no.dups = TRUE, recursive = FALSE, ...) {
  if (missing(y) || is.null(y)) {
    return(relational_merge(x, recursive))
  }

  if (!is.dbi.table(y)) {
    y <- as.dbi.table(y, ...)
  }

  dots <- list(...)

  if (!is.null(dots$allow.cartesian)) {
    warning("the value of 'allow.cartesian' was ignored")
  }

  if (!is.null(dots$incomparables)) {
    warning("non-NULL value of 'incomparables' was ignored")
  }

  x_names <- names(x)
  y_names <- names(y)

  if (anyDuplicated(x_names) || anyDuplicated(y_names)) {
    stop("the 'merge' method for 'dbi.table' requires that 'x' and 'y' ",
         "each have unique column names")
  }

  if (is.null(by) && is.null(by.x) && is.null(by.y)) {

    if (is_pristine(x) && is_pristine(y)) {
      if (length(rt <- related_tables(x, y))) {
        if (nrow(rt) == 1L) {
          by.x <- rt[[1L, "x_columns"]]
          by.y <- rt[[1L, "y_columns"]]
        }
      }
    }

    if (is.null(by.x) && is.null(by.y)) {
      by <- intersect(x_key <- get_key(x), get_key(y))

      if (!length(by)) {
        by <- x_key
      }

      if (!length(by)) {
        by <- intersect(x_names, y_names)
      }

      if (length(by) < 1L || !is.character(by)) {
        stop("a non-empty vector of column names for 'by' is required")
      }

      if (!all(by %in% x_names)) {
        stop("at least one column listed in 'by' is not present in 'x'")
      }
      if (!all(by %in% y_names)) {
        stop("at least one column listed in 'by' is not present in 'y'")
      }

      by <- unname(by)
    }
  }

  if ((!is.null(by.x) || !is.null(by.y)) && length(by.x) != length(by.y))
    stop("'by.x' and 'by.y' are not the same length")

  if (!missing(by) && !missing(by.x))
    warning("specification of 'by' superseded by 'by.x' and 'by.y'")

  if (!is.null(by.x)) {
    if (length(by.x) == 0L || !is.character(by.x) || !is.character(by.y))
      stop("non-empty character vectors of column names are required for ",
           "'by.x' and 'by.y'")

    if (!all(by.x %in% x_names))
      stop("Elements listed in 'by.x' must be valid column names in 'x'")

    if (!all(by.y %in% y_names))
      stop("Elements listed in 'by.y' must be valid column names in 'y'")
  } else {
    if (!length(by)) {
      stop("a non-empty character vector of column names is required for 'by'")
    }

    if (!all(by %in% intersect(x_names, y_names)))
      stop("Elements listed in 'by' must be valid column names in 'x' and 'y'")

    by <- unname(by)
    by.x <- by.y <- by
  }

  on <- paste(paste0("`x.", by.x, "`"), paste0("`y.", by.y, "`"), sep = " == ")
  on <- handy_andy(lapply(on, str2lang))

  if (!length(by.x)) {
    type <- "cross"
  } else if (all.x && all.y) {
    type <- "outer"
  } else if (all.x && !all.y) {
    type <- "left"
  } else if (!all.x && all.y) {
    type <- "right"
  } else {
    type <- "inner"
  }

  xy <- sql_join(x, y, type, on, c("x.", "y."), NULL)

  xy_x <- names_list(names(xy)[seq_along(x)], x_names)
  xy_y <- names_list(names(xy)[-seq_along(x)], y_names)

  start <- setdiff(x_names, by.x)
  end <- setdiff(y_names, by.y)

  if (type %in% c("inner", "left")) {
    by <- xy_x[by.x]
  } else if (type == "right") {
    by <- xy_y[by.y]
    names(by) <- by.x
  } else if (type == "outer") {
    by <- mapply(function(u, v) {
      if (!is.null(u) && !is.null(v))
        return(call("coalesce", u, v))
      if (!is.null(u))
        return(u)
      NULL
    }, u = xy_x[by.x], v = xy_y[by.y], SIMPLIFY = FALSE)
  } else { #for cross joins
    by <- list()
  }

  xy <- handle_j(xy, c(by, xy_x[start], xy_y[end]), NULL, NULL)

  # naming logical taken from merge.data.table (data.table version 1.14.10)
  by_names <- names(by)
  dupnames <- intersect(start, end)

  if (length(dupnames)) {
    start[match(dupnames, start, 0L)] <- paste0(dupnames, suffixes[1L])
    end[match(dupnames, end, 0L)] <- paste0(dupnames, suffixes[2L])
  }
  dupkeyx <- intersect(by.x, end)
  if (no.dups && length(dupkeyx)) {
    end[match(dupkeyx, end, 0L)] <- paste0(dupkeyx, suffixes[2L])
  }

  names(xy) <- c(by_names, start, end)

  if (isTRUE(sort)) {
    attr(xy, "sorted") <- by_names
  }

  xy
}



merge_i_dbi_table <- function(x, i, not_i, j, by, nomatch, on, enclos) {
  x_key <- get_key(x)

  if (is.null(on)) {
    if (length(x_key)) {
      if (is.null(i_key <- get_key(i))) {
        i_key <- names(i)
      }

      idx <- seq_len(min(length(x_key), length(i_key)))
      on <- paste(x_key[idx], i_key[idx], sep = " == ")
    } else {
      stop("when 'on' is NULL, 'x' must be keyed", call. = FALSE)
    }
  }

  names(x) <- paste0("x.", x_names <- names(x))
  names(i) <- paste0("i.", i_names <- names(i))

  if (is.null(nomatch)) {
    join_type <- "inner"
  } else if (is.na(nomatch)) {
    join_type <- "right"
  } else {
    stop("'nomatch' must be NA or NULL", call. = FALSE)
  }

  if (is.character(on)) {
    if (is.null(on_names <- names(on))) {
      on_names <- on
    }

    on <- as.list(parse(text = on))
    single_name <- vapply(on, is.name, FALSE)
    no_name <- (nchar(on_names) == 0L)

    on_names[single_name & no_name] <- as.character(on[single_name & no_name])

    on[single_name] <- mapply(call,
                              name = "==",
                              #use names_list here?
                              lapply(on_names[single_name], as.name),
                              on[single_name],
                              SIMPLIFY = FALSE,
                              USE.NAMES = FALSE)

  } else if ((is_call_to(on) == ".") || (is_call_to(on) == "list")) {
    on <- as.list(on[-1])
  }

  on <- lapply(on, extract_on_validator, x_names = x_names, i_names = i_names)

  on_x <- as.character(lapply(on, `[[`, 2L))
  on_i <- as.character(lapply(on, `[[`, 3L))

  on <- lapply(on, function(u) {
    u[[2L]] <- as.name(paste0("x.", u[[2L]]))
    u
  })

  on <- lapply(on, function(u) {
    u[[3L]] <- as.name(paste0("i.", u[[3L]]))
    u
  })

  on <- handy_andy(on)

  if (not_i) {
    xi <- sql.join(x, i, type = "left", on = on, prefixes = c("x.", "i."))

    w <- lapply(paste0("i.", on_i), function(u) call("is.na", as.name(u)))
    w <- handy_andy(w)
    xi <- xi[w]

    if (is.null(j)) {
      j <- names_list(names(x), x_names)
    } else {
      j <- sub_lang(j, envir = names_list(names(x), x_names))
    }

    xi <- handle_j(xi, j, by = NULL)
  } else {
    xi <- sql.join(x, i, type = join_type, on = on)

    j_map <- names_list(xi)
    i_map <- names_list(names(i), i_names)
    j_map[names(i_map)] <- i_map
    x_map <- names_list(names(x), x_names)
    j_map[names(x_map)] <- x_map
    on_map <- names_list(paste0("i.", on_i), on_x)
    j_map[names(on_map)] <- on_map

    if (is.null(j)) {
      i_names <- setdiff(i_names, on_i)
      dups <- intersect(i_names, x_names)
      i_names[i_names %in% dups] <- paste0("i.", i_names[i_names %in% dups])
      j <- names_list(c(x_names, i_names))
    }

    j <- sub_lang(j, envir = j_map, specials = NULL)
    xi <- handle_j(xi, j, by = NULL)
  }

  attr(xi, "sorted") <- x_key
  xi
}



DT_SUPPORTED_JOIN_OPERATORS <- c("==", "<=", "<", ">=", ">")



extract_on_validator <- function(expr, x_names, i_names) {
  if (is.name(expr)) {
    cexpr <- as.character(expr)
    if (cexpr %in% x_names && cexpr %in% i_names) {
      return(call("==", expr, expr))
    } else {
      stop("argument specifying columns received non-existing column: '",
           cexpr, "'")
    }
  }

  if ((op <- is_call_to(expr)) %in% DT_SUPPORTED_JOIN_OPERATORS) {
    if (!(lhs <- as.character(expr[[2L]])) %in% x_names) {
      stop("argument specifying columns received non-existing column: '",
           lhs, "'")
    }
    if (!(rhs <- as.character(expr[[3L]])) %in% i_names) {
      stop("argument specifying columns received non-existing column: '",
           rhs, "'")
    }
    return(expr)
  } else {
    stop("invalid join operator [", op, "]; the allowed operators are ",
         "[", paste(DT_SUPPORTED_JOIN_OPERATORS, collapse = ", "), "]")
  }

  NULL
}



is_pristine <- function(x) {
  f <- get_fields(x)
  nn <- names_list(f$internal_name, f$field)
  nrow(get_data_source(x)) == 1L && setequal(c(x), nn)
}

Try the dbi.table package in your browser

Any scripts or data that you put into this service are public.

dbi.table documentation built on June 29, 2025, 1:07 a.m.