R/join.R

Defines functions summarise_join_stats join_fuzzily

Documented in join_fuzzily summarise_join_stats

#' Calculate how "well" two `data.frame`s joined
#'
#' @description Calculates the join count and percentage (by number of rows missing)
#' of a `data.frame` that is the results of a join.
#' @details This function is mostly useful after calling `dplyr::full_join()`
#' (or a similar function), although it can also be useful for `dplyr::left_join()`
#' #' or `dplyr::right_join()`.
#'
#' The logic for
#' handling `x` and `y` as strings/`quo`s was borrowed from a previous
#' version of `tidyr::gather()`.
#' @param data data.frame. This should already by the result of a join.
#' @param x,y Either characters or bare names of columns in `data` on which
#' to compute metrics.
#' @return A [tibble][tibble::tibble-package]. One line summary of join metrics,
#' including the following columns:
#' `x`, `n_x`, `n_x_unjoined`, `x_in_y_pct` and their `y` analogues,
#' as well as `n_joined`.
#' @export
#' @seealso <https://github.com/tidyverse/tidyr/blob/master/R/gather.R>
#' provides some insight for how to make the function accept either characters
#' or bare names for the column names.
summarise_join_stats <-
  function(data, x, y) {
    stopifnot(is.data.frame(data))

    x_nm <- rlang::quo_name(rlang::enexpr(x))
    y_nm <- rlang::quo_name(rlang::enexpr(y))

    in_x <- in_y <-  n_x <- n_y <- n_joined <- NULL

    idx <- match(c(x_nm, y_nm), names(data))
    res <- dplyr::select(data, idx)
    res <- dplyr::rename(res, in_x = !!(names(res)[1]), in_y = !!(names(res)[2]))
    res <-
      dplyr::summarise(
        res,
        n_x = sum(!is.na(in_x)),
        n_y = sum(!is.na(in_y)),
        n_joined = sum(!is.na(in_x) & !is.na(in_y)),
        n_x_unjoined = sum(is.na(in_x) & !is.na(in_y)),
        n_y_unjoined = sum(is.na(in_y) & !is.na(in_x))
      )
    res <-
      dplyr::mutate(
        res,
        x_in_y_pct = 100 * n_joined / n_x,
        y_in_x_pct = 100 * n_joined / n_y
      )
    res <- dplyr::bind_cols(tibble::tibble(x = x_nm, y = y_nm), res)
    res
  }


#' Join two `data.frame`s using fuzzy string matching.
#'
#' @description Join two `data.frame`s using fuzzy string matching.
#' @details This function is effectively a customized version of the `stringdist_join`
#' functions in the `{fuzzyjoin}` package.
#' Additionally, this function is primarily intended to be uzed before `tetidy::summarise_join_stats()`
#' with `mode = "full"`.
#' @inheritParams dplyr::inner_join
#' @param mode From `stringdist::stringdist_join()` documentation: One of "inner", "left", "right", "full" "semi", or "anti".
#' @param max_dist From `stringdist::stringdist_join()` documentation: Maximum distance to use for joining.
#' @param ... From `stringdist::stringdist_join()` documentation: Arguments passed on to `stringdist`.
#' @return A [tibble][tibble::tibble-package].
#' @export
#' @seealso [dplyr::inner_join()]
join_fuzzily <-
  function(x,
           y,
           mode = "inner",
           max_dist = 0,
           ...,
           by = intersect(names(x), names(y)),
           copy = FALSE,
           suffix = c("_x", "_y")) {

    pkg <- "fuzzyjoin"
    if(!requireNamespace(pkg)) {
      msg <- sprintf("Please install the `{%s}` package.", pkg)
      stop(msg)
    }
    f <- sprintf("fuzzyjoin::stringdist_%s_join", mode)
    suffix_x <- suffix[1]
    suffix_y <- suffix[2]
    res <- do_call_with(f, list(x = x, y = y, by = by, max_dist = max_dist, ...))

    . <- NULL
    res <-
      dplyr::rename_at(res, dplyr::vars(dplyr::ends_with(".x")), dplyr::funs(gsub("\\.x", suffix_x, .)))
    res <-
      dplyr::rename_at(res, dplyr::vars(dplyr::ends_with(".y")), dplyr::funs(gsub("\\.y", suffix_y, .)))

    if(copy) {
      by_y <- rlang::quo_name(paste0(by, suffix_y))
      res <-
        dplyr::left_join(res, dplyr::mutate(y, !!!rlang::sym(by_y) := !!!rlang::sym(by)), by = by)
    }
    res
  }
tonyelhabr/tetidy documentation built on May 29, 2019, 3:18 p.m.