R/join.R

Defines functions log_join semi_join anti_join right_join left_join full_join inner_join

Documented in anti_join full_join inner_join left_join right_join semi_join

#' @export
inner_join <- function(x, y, by = NULL, ...) {
    log_join(x, y, by,
        .fun = dplyr::inner_join, .funname = "inner_join",
        .name_x = deparse1(substitute(x)), .name_y = deparse1(substitute(y)), ...)
}

#' @export
full_join <- function(x, y, by = NULL, ...) {
    log_join(x, y, by,
        .fun = dplyr::full_join, .funname = "full_join",
        .name_x = deparse1(substitute(x)), .name_y = deparse1(substitute(y)), ...)
}

#' @export
left_join <- function(x, y, by = NULL, ...) {
    log_join(x, y, by,
        .fun = dplyr::left_join, .funname = "left_join",
        .name_x = deparse1(substitute(x)), .name_y = deparse1(substitute(y)), ...)
}

#' @export
right_join <- function(x, y, by = NULL, ...) {
    log_join(x, y, by,
        .fun = dplyr::right_join, .funname = "right_join",
        .name_x = deparse1(substitute(x)), .name_y = deparse1(substitute(y)), ...)
}

#' @export
anti_join <- function(x, y, by = NULL, ...) {
    log_join(x, y, by,
        .fun = dplyr::anti_join, .funname = "anti_join",
        .name_x = deparse1(substitute(x)), .name_y = deparse1(substitute(y)), ...)
}

#' @export
semi_join <- function(x, y, by = NULL, ...) {
    log_join(x, y, by,
        .fun = dplyr::semi_join, .funname = "semi_join",
        .name_x = deparse1(substitute(x)), .name_y = deparse1(substitute(y)), ...)
}

log_join <- function(x, y, by, .fun, .funname, .name_x, .name_y, ...) {
    newdata <- .fun(x, y, by, ...)
    if (!"data.frame" %in% class(x) | !should_display()) {
        return(newdata)
    }

    # columns
    cols <- setdiff(names(newdata), names(x))
    if (length(cols) == 0) {
        display(glue::glue("{.funname}: added no columns"))
    } else {
        display(glue::glue("{.funname}: ",
            "added {plural(length(cols), 'column')} ({format_list(cols)})"))
    }

    # figure out matched in rows
    if ("dplyr_join_by" %in% class(by)) {
        if (all(by$condition == "==")) {
            keys <- by
        } else {
            return(newdata)
        }

    } else {
        keys <- suppressMessages(dplyr::common_by(by = by, x = x, y = y))
    }
    cols_x <- x[, keys$x, drop = FALSE]
    cols_y <- y[, keys$y, drop = FALSE]

    only_in_x <- suppressMessages(dplyr::anti_join(cols_x, cols_y,
                                                  by = stats::setNames(keys$y, keys$x)))
    only_in_y <- suppressMessages(dplyr::anti_join(cols_y, cols_x,
                                                  by = stats::setNames(keys$x, keys$y)))

    stats <- list(
        only_in_x = nrow(only_in_x),
        only_in_y = nrow(only_in_y),
        total = nrow(newdata)
    )

    # figure out matched & duplicates
    duplicates <- ""
    if (.funname == "inner_join") {
        stats$matched <- stats$total

        if (stats$matched > (nrow(x) - stats$only_in_x))
            duplicates <- "    (includes duplicates)"
    } else if (.funname == "full_join") {
        stats$matched <- stats$total - stats$only_in_x - stats$only_in_y

        if (stats$matched > (nrow(x) - stats$only_in_x))
            duplicates <- "    (includes duplicates)"
    } else if (.funname == "left_join") {
        stats$matched <- stats$total - stats$only_in_x

        if (stats$matched > (nrow(x) - stats$only_in_x))
            duplicates <- "    (includes duplicates)"
    } else if (.funname == "right_join") {
        stats$matched <- stats$total - stats$only_in_y

        if (stats$matched > (nrow(y) - stats$only_in_y))
            duplicates <- "    (includes duplicates)"
    } else if (.funname == "anti_join") {
        stats$matched <- nrow(x) - stats$total
        # by definition, no duplicates
    } else if (.funname == "semi_join") {
        stats$matched <- stats$total
        # by definition, no duplicates
    }

    # format to same width
    stats_str <- lapply(stats, function(x) formatC(x, big.mark = ","))
    max_n <- max(sapply(stats_str, nchar))
    stats_str <- lapply(stats_str, function(x) format(x, justify = "right", width = max_n))
    # data set names
    .name_x <- ifelse(.name_x == ".", "x", shorten(.name_x))
    .name_y <- ifelse(.name_y == ".", "y", shorten(.name_y))
    names_length <- max(nchar(.name_x), nchar(.name_y))
    .name_x <- format(.name_x, justify = "left", width = names_length)
    .name_y <- format(.name_y, justify = "left", width = names_length)
    # white space
    ws_pre <- paste0(rep(" ", nchar(.funname)), collapse = "")
    ws_post <- paste0(rep(" ", names_length), collapse = "")

    if (.funname %in% c("right_join", "inner_join", "semi_join")) {
        display(glue::glue("{ws_pre}  > rows only in {.name_x} ({stats_str$only_in_x})"))
    } else {
        display(glue::glue("{ws_pre}  > rows only in {.name_x}  {stats_str$only_in_x}"))
    }
    if (.funname %in% c("left_join", "inner_join", "semi_join", "anti_join")) {
        display(glue::glue("{ws_pre}  > rows only in {.name_y} ({stats_str$only_in_y})"))
    } else {
        display(glue::glue("{ws_pre}  > rows only in {.name_y}  {stats_str$only_in_y}"))
    }
    if (.funname == "anti_join") {
        display(glue::glue("{ws_pre}  > matched rows{ws_post}  ({stats_str$matched})"))
    } else {
        display(glue::glue("{ws_pre}  > matched rows{ws_post}   {stats_str$matched}{duplicates}"))
    }
    display(glue::glue("{ws_pre}  >{ws_post}               ={paste0(rep('=', max_n), collapse = '')}="))
    display(glue::glue("{ws_pre}  > rows total{ws_post}     {stats_str$total}"))

    newdata
}
elbersb/tidylog documentation built on Oct. 14, 2023, 12:54 p.m.