R/merge.R
In dbi.table: Database Queries Using 'data.table' Syntax

Documented in merge.dbi.table

#' @name merge
#'
#' @aliases merge.dbi.table
#'
#' @title Merge two dbi.tables
#'
#' @description
#'   Merge two \code{\link{dbi.table}}s. The \code{dbi.table} method is similar
#'   to the \code{\link[data.table]{data.table}} method except that the result
#'   set is only determined up to row order and is not sorted by default.
#'
#'   Default merge columns: if \code{x} has a foreign key constraint that
#'   references \code{y} then the columns comprising this key are used; see
#'   details. When a foreign key cannot be found, then the common columns
#'   between the two \code{dbi.tables}s are used.
#'
#'   Use the \code{by}, \code{by.x}, and \code{by.y} arguments explicitly to
#'   override this default.
#'
#' @param x,y
#'   \code{\link{dbi.table}}s sharing the same DBI connection.
#'
#' @param by
#'   A vector of shared column names in \code{x} and \code{y} to merge on.
#'
#' @param by.x,by.y
#'   character vectors of column names in \code{x} and \code{y} to merge on.
#'
#' @param all
#'   a logical value. \code{all = TRUE} is shorthand to save setting both
#'   \code{all.x = TRUE} and \code{all.y = TRUE}.
#'
#' @param all.x
#'   a logical value. When \code{TRUE}, rows from \code{x} that do not have a
#'   matching row in \code{y} are included. These rows will have \code{NA}s in
#'   the columns that are filled with values from \code{y}. The default is
#'   \code{FALSE} so that only rows with data from both \code{x} and \code{y}
#'   are included in the output.
#'
#' @param all.y
#'   a logical value. Analogous to \code{all.x} above.
#'
#' @param sort
#'   a logical value. Currently ignored.
#'
#' @param suffixes
#'   a length-2 character vector. The suffixes to be used for making
#'   non-\code{by} column names unique. The suffix behavior works in a similar
#'   fashion to the \code{\link[base]{merge.data.frame}} method.
#'
#' @param no.dups
#'  a logical value. When \code{TRUE}, suffixes are also appended to
#'  non-\code{by.y} column names in \code{y} when they have the same column
#'  name as any \code{by.x}.
#'
#' @param recursive
#'   a logical value. Only used when \code{y} is missing. When \code{TRUE},
#'   \code{merge} is called recursively on each of the just-merged
#'   \code{dbi.table}s. See examples.
#'
#' @param \dots
#'   additional arguments are ignored.
#'
#' @return
#'   a \code{\link{dbi.table}}.
#'
#' @details
#'   Foreign key constraints. Foreign keys can only be queried when (1) the
#'   \code{dbi.table}'s schema is loaded, and (2) \code{dbi.table} understands
#'   the underlying database's information schema.
#'
#'   \code{merge.dbi.table} uses \code{\link{sql.join}} to join \code{x} and
#'   \code{y} then formats the result set to match the typical \code{merge}
#'   output.
#'
#' @examples
#'   chinook <- dbi.catalog(chinook.duckdb)
#'
#'   #The Album table has a foreign key constriant that references Artist
#'   merge(chinook$main$Album, chinook$main$Artist)
#'
#'   #When y is omitted, x's foreign key relationship is used to determine y
#'   merge(chinook$main$Album)
#'
#'   #Multiple foreign keys are supported
#'   csql(merge(chinook$main$Track))
#'
#'   #Track references Album but not Artist, Album references Artist
#'   #This dbi.table includes Artist.Name as well
#'   csql(merge(chinook$main$Track, recursive = TRUE))
#'
#' @export
merge.dbi.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL,
                            all = FALSE, all.x = all, all.y = all,
                            sort = FALSE, suffixes = c(".x", ".y"),
                            no.dups = TRUE, recursive = FALSE, ...) {
  if (!is.dbi.table(x)) {
    stop("'x' is not a 'dbi.table'")
  }

  if (missing(y) || is.null(y)) {
    return(relational_merge(x, recursive))
  }

  if (!is.dbi.table(y)) {
    stop("'y' is not a 'dbi.table'")
  }

  dots <- list(...)

  if (!is.null(dots$allow.cartesian)) {
    warning("the value of 'allow.cartesian' was ignored")
  }

  if (!is.null(dots$incomparables)) {
    warning("non-NULL value of 'incomparables' was ignored")
  }

  names_x <- names(x)
  names_y <- names(y)

  if (anyDuplicated(names_x) || anyDuplicated(names_y)) {
    stop("the 'merge' method for 'dbi.table' requires that 'x' and 'y' ",
         "each have unique column names")
  }

  if (is.null(by) && is.null(by.x) && is.null(by.y)) {
    if (is.null(rt <- related_tables(x, y)) || nrow(rt) < 1L) {
      by.x <- by.y <- intersect(names_x, names_y)
    } else {
      rt_x <- rt[, c("catalog_x", "schema_x", "table_x", "field_x")]
      by_x <- match_by_field(x, rt_x)
      rt_y <- rt[, c("catalog_y", "schema_y", "table_y", "field_y")]
      by_y <- match_by_field(y, rt_y)

      if (!anyNA(by_x) && !anyNA(by_y)) {
        by.x <- by_x
        by.y <- by_y
      }
    }
  }

  if ((!is.null(by.x) || !is.null(by.y)) && length(by.x) != length(by.y))
    stop("'by.x' and 'by.y' are not the same length")

  if (!missing(by) && !missing(by.x))
    warning("specification of 'by' superseded by 'by.x' and 'by.y'")

  if (!is.null(by.x)) {
    if (length(by.x) == 0L || !is.character(by.x) || !is.character(by.y))
      stop("non-empty character vectors of column names are required for ",
           "'by.x' and 'by.y'")

    if (!all(by.x %in% names_x))
      stop("Elements listed in 'by.x' must be valid column names in 'x'")

    if (!all(by.y %in% names_y))
      stop("Elements listed in 'by.y' must be valid column names in 'y'")
  } else {
    if (!length(by)) {
      stop("a non-empty character vector of column names is required for 'by'")
    }

    if (!all(by %in% intersect(names_x, names_y)))
      stop("Elements listed in 'by' must be valid column names in 'x' and 'y'")

    by <- unname(by)
    by.x <- by.y <- by
  }

  on <- paste(paste0("`x.", by.x, "`"), paste0("`y.", by.y, "`"), sep = " == ")
  on <- handy_andy(lapply(on, str2lang))

  if (!length(by.x)) {
    type <- "cross"
  } else if (all.x && all.y) {
    type <- "outer"
  } else if (all.x && !all.y) {
    type <- "left"
  } else if (!all.x && all.y) {
    type <- "right"
  } else {
    type <- "inner"
  }

  xy <- sql_join(x, y, type, on, c("x.", "y."), NULL)

  x <- c(xy)[seq_along(x)]
  names(x) <- names_x
  y <- c(xy)[-seq_along(x)]
  names(y) <- names_y

  start <- setdiff(names_x, by.x)
  end <- setdiff(names_y, by.y)

  non_by <- c(x[start], y[end])

  by_x <- list_of_by_columns(by.x, x)
  keep <- !vapply(by_x, is.null, FALSE)
  by_x <- by_x[keep]

  if (type %in% c("inner", "left")) {
    by <- by_x
  } else {
    by_y <- list_of_by_columns(by.y, y)
    names(by_y) <- by.x
    by_y <- by_y[keep]

    if (type == "right") {
      by <- by_y

    } else if (type == "outer") {
      by <- mapply(function(u, v) {
        if (!is.null(u) && !is.null(v))
          return(call("coalesce", u, v))
        if (!is.null(u))
          return(u)
        NULL
      }, u = by_x, v = by_y, SIMPLIFY = FALSE)
    } else { #for cross joins
      by <- list()
    }
  }

  j <- c(by, non_by)
  names_j <- names(j)
  a <- attributes(xy)
  a$names <- NULL
  attributes(j) <- a
  names(j) <- names_j

  # naming logical taken from merge.data.table (data.table version 1.14.10)
  by_names <- names(by)
  dupnames <- intersect(start, end)

  if (length(dupnames)) {
    start[match(dupnames, start, 0L)] <- paste0(dupnames, suffixes[1L])
    end[match(dupnames, end, 0L)] <- paste0(dupnames, suffixes[2L])
  }
  dupkeyx <- intersect(by.x, end)
  if (no.dups && length(dupkeyx)) {
    end[match(dupkeyx, end, 0L)] <- paste0(dupkeyx, suffixes[2L])
  }

  names(j) <- c(by_names, start, end)
  j
}



merge_i_dbi_table <- function(x, i, not_i, j, by, nomatch, on, enclos) {
  names(x) <- paste0("x.", x_names <- names(x))
  names(i) <- paste0("i.", i_names <- names(i))

  if (is.null(nomatch)) {
    join_type <- "inner"
  } else if (is.na(nomatch)) {
    join_type <- "right"
  } else {
    stop("'nomatch' must be NA or NULL", call. = FALSE)
  }

  if (is.character(on)) {
    if (is.null(on_names <- names(on))) {
      on_names <- on
    }

    on <- as.list(parse(text = on))
    single_name <- vapply(on, is.name, FALSE)
    no_name <- (nchar(on_names) == 0L)

    on_names[single_name & no_name] <- as.character(on[single_name & no_name])

    on[single_name] <- mapply(call,
                              name = "==",
                              #use names_list here?
                              lapply(on_names[single_name], as.name),
                              on[single_name],
                              SIMPLIFY = FALSE,
                              USE.NAMES = FALSE)

  } else if ((is_call_to(on) == ".") || (is_call_to(on) == "list")) {
    on <- as.list(on[-1])
  }

  on <- lapply(on, extract_on_validator, x_names = x_names, i_names = i_names)

  on_x <- as.character(lapply(on, `[[`, 2L))
  on_i <- as.character(lapply(on, `[[`, 3L))

  on <- lapply(on, function(u) {u[[2L]] <- as.name(paste0("x.", u[[2L]])); u})
  on <- lapply(on, function(u) {u[[3L]] <- as.name(paste0("i.", u[[3L]])); u})

  on <- handy_andy(on)

  if (not_i) {
    xi <- sql.join(x, i, type = "left", on = on, prefixes = c("x.", "i."))

    w <- lapply(paste0("i.", on_i), function(u) call("is.na", as.name(u)))
    w <- handy_andy(w)
    xi <- xi[w]

    if (is.null(j)) {
      j <- names_list(names(x), x_names)
    } else {
      j <- sub_lang(j, envir = names_list(names(x), x_names))
    }

    xi <- handle_j(xi, j, by = NULL)
  } else {
    xi <- sql.join(x, i, type = join_type, on = on)

    j_map <- names_list(xi)
    i_map <- names_list(names(i), i_names)
    j_map[names(i_map)] <- i_map
    x_map <- names_list(names(x), x_names)
    j_map[names(x_map)] <- x_map
    on_map <- names_list(paste0("i.", on_i), on_x)
    j_map[names(on_map)] <- on_map

    if (is.null(j)) {
      i_names <- setdiff(i_names, on_i)
      dups <- intersect(i_names, x_names)
      i_names[i_names %in% dups] <- paste0("i.", i_names[i_names %in% dups])
      j <- names_list(c(x_names, i_names))
    }

    j <- sub_lang(j, envir = j_map, specials = NULL)
    xi <- handle_j(xi, j, by = NULL)
  }

  xi
}



DT_SUPPORTED_JOIN_OPERATORS <- c("==", "<=", "<", ">=", ">")



extract_on_validator <- function(expr, x_names, i_names) {
  if (is.name(expr)) {
    cexpr <- as.character(expr)
    if (cexpr %in% x_names && cexpr %in% i_names) {
      return(call("==", expr, expr))
    } else {
      stop("argument specifying columns received non-existing column: '",
           cexpr, "'")
    }
  }

  if ((op <- is_call_to(expr)) %in% DT_SUPPORTED_JOIN_OPERATORS) {
    if (!(lhs <- as.character(expr[[2L]])) %in% x_names) {
      stop("argument specifying columns received non-existing column: '",
           lhs, "'")
    }
    if (!(rhs <- as.character(expr[[3L]])) %in% i_names) {
      stop("argument specifying columns received non-existing column: '",
           rhs, "'")
    }
    return(expr)
  } else {
    stop("invalid join operator [", op, "]; the allowed operators are ",
         "[", paste(DT_SUPPORTED_JOIN_OPERATORS, collapse = ", "), "]")
  }

  NULL
}



list_of_by_columns <- function(nm, x) {
  ret <- vector("list", length(nm))
  names(ret) <- nm
  idx <- intersect(nm, names(x))
  ret[idx] <- x[idx]
  ret
}