R/flatten.R

Defines functions prepare_dm_for_flatten check_flatten_to_tbl parent_child_table dm_join_to_tbl dm_flatten_to_tbl_impl dm_flatten_to_tbl

Documented in dm_flatten_to_tbl dm_join_to_tbl

#' Flatten a part of a `dm` into a wide table
#'
#' `dm_flatten_to_tbl()` gathers all information of interest in one place in a wide table.
#' It performs a disambiguation of column names and a cascade of joins.
#'
#' @inheritParams dm_join_to_tbl
#' @param .start The table from which all outgoing foreign key relations are considered
#'   when establishing a processing order for the joins.
#'   An interesting choice could be
#'   for example a fact table in a star schema.
#' @param ...
#'   `r lifecycle::badge("experimental")`
#'
#'   Unquoted names of the tables to be included in addition to the `.start` table.
#'   The order of the tables here determines the order of the joins.
#'   If the argument is empty, all tables that can be reached will be included.
#'   `tidyselect` is supported, see [dplyr::select()] for details on the semantics.
#' @param .recursive Logical, defaults to `FALSE`. Should not only parent tables be joined to `.start`, but also their ancestors?
#' @param .join The type of join to be performed, see [dplyr::join()].
#' @family flattening functions
#'
#' @details
#' With `...` left empty, this function will join together all the tables of your [`dm`]
#' object that can be reached from the `.start` table, in the direction of the foreign key relations
#' (pointing from the child tables to the parent tables), using the foreign key relations to
#' determine the argument `by` for the necessary joins.
#' The result is one table with unique column names.
#' Use the `...` argument if you would like to control which tables should be joined to the `.start` table.
#'
#' Mind that calling `dm_flatten_to_tbl()` with `.join = right_join` and no table order determined in the `...` argument
#' will not lead to a well-defined result if two or more foreign tables are to be joined to `.start`.
#' The resulting
#' table would depend on the order the tables that are listed in the `dm`.
#' Therefore, trying this will result in a warning.
#'
#' Since `.join = nest_join` does not make sense in this direction (LHS = child table, RHS = parent table: for valid key constraints
#' each nested column entry would be a tibble of one row), an error will be thrown if this method is chosen.
#'
#' The difference between `.recursive = FALSE` and `.recursive = TRUE` is
#' the following (see the examples):
#'
#' - `.recursive = FALSE` allows only one level of hierarchy
#'   (i.e., direct neighbors to table `.start`), while
#'
#' - `.recursive = TRUE` will go through all levels of hierarchy while joining.
#'
#' Additionally, these functions differ from `dm_wrap_tbl()`, which always
#' returns a `dm` object.
#'
#' @return A single table that results from consecutively joining all affected tables to the `.start` table.
#'
#' @examplesIf dm:::dm_has_financial()
#'
#' dm_financial() %>%
#'   dm_select_tbl(-loans) %>%
#'   dm_flatten_to_tbl(.start = cards)
#'
#' dm_financial() %>%
#'   dm_select_tbl(-loans) %>%
#'   dm_flatten_to_tbl(.start = cards, .recursive = TRUE)
#'
#' @export
dm_flatten_to_tbl <- function(dm, .start, ..., .recursive = FALSE, .join = left_join) {
  check_not_zoomed(dm)
  join_name <- as_label(enexpr(.join))
  if (.recursive && !(join_name %in% c("left_join", "full_join", "inner_join"))) abort_squash_limited()

  start <- dm_tbl_name(dm, {{ .start }})

  vars <- setdiff(src_tbls_impl(dm), start)
  list_of_pts <- eval_select_table(quo(c(...)), vars)

  dm_flatten_to_tbl_impl(dm, start, list_of_pts, join = .join, join_name = join_name, squash = .recursive)
}

dm_flatten_to_tbl_impl <- function(dm, start, list_of_pts, join, join_name, squash, .position = "suffix") {
  if (join_name == "nest_join") abort_no_flatten_with_nest_join()

  force(join)
  stopifnot(is_function(join))

  # in case of `semi_join()` and `anti_join()` no renaming necessary
  gotta_rename <- !(join_name %in% c("semi_join", "anti_join"))

  # early returns for some of the possible joins would be possible for "perfect" key relations,
  # but since it is generally possible to have imperfect FK relations, `semi_join` and `anti_join` might
  # produce results, that are of interest, e.g.
  # dm_flatten_to_tbl(dm_nycflights13(cycle = TRUE) %>% dm_rm_fk(flights, origin, airports), flights, airports, join = anti_join)

  # need to work with directed graph here, since we only want to go in the direction
  # the foreign key is pointing to
  g <- create_graph_from_dm(dm, directed = TRUE)

  # If no tables are given, we use all reachable tables
  auto_detect <- is_empty(list_of_pts)
  if (auto_detect) {
    list_of_pts <- get_names_of_connected(g, start, squash)
  }
  # We use the induced subgraph right away
  g <- igraph::induced_subgraph(g, c(start, list_of_pts))

  # each next table needs to be accessible from the former table (note: directed relations)
  # we achieve this with a depth-first-search (DFS) with param `unreachable = FALSE`
  dfs <- igraph::dfs(g, start, unreachable = FALSE, father = TRUE, dist = TRUE)

  # compute all table names
  order_df <-
    tibble(
      name = names(dfs[["order"]]),
      pred = names(V(g))[unclass(dfs[["father"]])[name]]
    )

  # function to detect any reason for abort()
  check_flatten_to_tbl(
    join_name,
    (nrow(dm_get_filters_impl(dm)) > 0) && !is_empty(list_of_pts),
    anyNA(order_df$name),
    g,
    auto_detect,
    nrow(order_df) > 2,
    any(dfs$dist > 1),
    squash
  )

  # rename dm and replace table `.start` by its filtered, renamed version
  prep_dm <- prepare_dm_for_flatten(dm, order_df$name, gotta_rename, position = .position)

  # Drop the first table in the list of join partners. (We have at least one table, `.start`.)
  # (Working with `reduce2()` here and the `.init`-argument is the first table)
  # in the case of only one table in the `dm` (table "start"), all code below is a no-op
  order_df <- order_df[-1, ]
  # the order given in the ellipsis determines the join-list; if empty ellipsis, this is a no-op.
  # `unname()` to avoid warning (tibble version ‘2.99.99.9012’ retains names in column vectors)
  order_df <- left_join(tibble(name = unname(list_of_pts)), order_df, by = "name")

  # list of join partners
  ordered_table_list <- dm_get_tables(prep_dm)[order_df$name]
  by <- map2(order_df$pred, order_df$name, ~ get_by(prep_dm, .x, .y))

  # perform the joins according to the list, starting with table `initial_LHS`
  reduce2(ordered_table_list, by, ~ join(..1, ..2, by = ..3), .init = tbl_impl(prep_dm, start))
}

#' Join two tables
#'
#' `dm_join_to_tbl()` is deprecated in favor of [dm_flatten_to_tbl()].
#'
#' @param dm A [`dm`] object.
#' @param table_1 One of the tables involved in the join.
#' @param table_2 The second table of the join.
#' @param join The type of join to be performed, see [dplyr::join()].
#'
#' @rdname deprecated
#' @export
dm_join_to_tbl <- function(dm, table_1, table_2, join = left_join) {
  deprecate_soft("0.3.0", "dm::dm_join_to_tbl()", "dm::dm_flatten_to_tbl()")

  check_not_zoomed(dm)
  force(join)
  stopifnot(is_function(join))
  join_name <- deparse(substitute(join))

  t1_name <- dm_tbl_name(dm, {{ table_1 }})
  t2_name <- dm_tbl_name(dm, {{ table_2 }})

  rel <- parent_child_table(dm, {{ table_1 }}, {{ table_2 }})
  start <- rel$child_table
  other <- rel$parent_table

  dm_flatten_to_tbl_impl(dm, start, other, join = join, join_name = join_name, squash = FALSE, .position = "prefix")
}

parent_child_table <- function(dm, table_1, table_2) {
  t1_name <- dm_tbl_name(dm, {{ table_1 }})
  t2_name <- dm_tbl_name(dm, {{ table_2 }})

  rel <-
    dm_get_all_fks(dm) %>%
    filter(
      (child_table == t1_name & parent_table == t2_name) |
        (child_table == t2_name & parent_table == t1_name)
    )

  if (nrow(rel) == 0) {
    abort_tables_not_neighbors(t1_name, t2_name)
  }

  if (nrow(rel) > 1) {
    abort_no_cycles(create_graph_from_dm(dm))
  }

  rel
}

check_flatten_to_tbl <- function(join_name,
                                 part_cond_abort_filters,
                                 any_not_reachable,
                                 g,
                                 auto_detect,
                                 more_than_1_pt,
                                 has_grandparent,
                                 squash) {
  # argument checking, or filter and recompute induced subgraph
  # for subsequent check
  if (any_not_reachable) {
    abort_tables_not_reachable_from_start()
  }

  # Cycles not yet supported
  if (length(V(g)) - 1 != length(E(g))) {
    abort_no_cycles(g)
  }
  if (join_name == "nest_join") abort_no_flatten_with_nest_join()
  if (part_cond_abort_filters && join_name %in% c("full_join", "right_join")) abort_apply_filters_first(join_name)
  # the result for `right_join()` depends on the order of the dim-tables in the `dm`
  # if 2 or more of them are joined to the fact table and ellipsis is empty.


  # If called by `dm_join_to_tbl()` or `dm_flatten_to_tbl()`, the argument `squash = FALSE`.
  # Then only one level of hierarchy is allowed (direct neighbors to table `.start`).
  if (!squash && has_grandparent) {
    abort_only_parents()
  }

  if (join_name == "right_join" && auto_detect && more_than_1_pt) {
    warning(
      paste0(
        "Result for `dm_flatten_to_tbl()` with `right_join()` dependend on order of tables in `dm`, when ",
        "more than 2 tables involved and no explicit order given in `...`."
      )
    )
  }
}

prepare_dm_for_flatten <- function(dm, tables, gotta_rename, position = "suffix") {
  start <- tables[1]
  # filters need to be empty, for the disambiguation to work
  # renaming will be minimized if we reduce the `dm` to the necessary tables here
  red_dm <-
    dm_reset_all_filters(dm) %>%
    dm_select_tbl(!!!tables)
  # Only need to compute `dm[[start]]`, `dm_apply_filters()` not necessary
  # Need to use `dm` and not `clean_dm` here, because of possible filter conditions.
  start_tbl <- dm_get_filtered_table(dm, start)

  if (gotta_rename) {
    table_colnames <- get_table_colnames(red_dm)
    recipe <- compute_disambiguate_cols_recipe(table_colnames, sep = ".", position = position)
    explain_col_rename(recipe)
    # prepare `dm` by disambiguating columns (on a reduced dm)
    clean_dm <-
      col_rename(red_dm, recipe)
    # the column names of start_tbl need to be updated, since taken from `dm` and not `clean_dm`,
    # therefore we need a named variable containing the new and old names
    renames <-
      pluck(recipe$renames[recipe$table == start], 1)
    start_tbl <- start_tbl %>% rename(!!!renames)
  } else {
    # for `anti_join()` and `semi_join()` no renaming necessary
    clean_dm <- red_dm
    renames <- character(0)
  }

  def <- dm_get_def(clean_dm)
  def$data[[which(def$table == start)]] <- start_tbl
  dm_from_def(def)
}

Try the dm package in your browser

Any scripts or data that you put into this service are public.

dm documentation built on Nov. 2, 2023, 6:07 p.m.