# Default knitr chunk options applied to the code chunks in this document.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library(erikmisc)

Completing combinations of key variables

Consider the data below. Let a, b, and c each be unique keys from a variety of data sources, where some data sources contain more than one of the keys, but not necessarily all of them at the same time. The goal is to complete the keys as fully as possible by cross-referencing all other matching pairs.

For example, a=1 and b=11 appear together, and a=1 and c=10 appear together, therefore these three key values should appear together on the first two rows.

Data and best key variables completion.

x is a key variable, but it has no observed values in this subset (every entry is NA). The function handles empty keys, too.

# Example data: key columns a, b, c, x (partially missing) plus two data
# columns d1 and d2. Keys are to be imputed by matching rows on shared keys.
dat_data <- tibble::tribble(
  ~a, ~b, ~c, ~x, ~d1, ~d2,
   1, 11, NA, NA,   1,  20,
   1, NA, 10, NA,   2,  21,
   2, NA, NA, NA,   3,  22,
  NA, 22, 20, NA,   4,  23,
   2, NA, 20, NA,   5,  24,
   3, 33, NA, NA,   6,  25,
   4, NA, 40, NA,   7,  26,
   5, NA, NA, NA,   8,  27,
   6, NA, 60, NA,   9,  28,
   6, NA, 60, NA,  10,  29,
  NA, 77, NA, NA,  11,  30,
  NA, 88, 80, NA,  12,  31,
  NA, 88, NA, NA,  13,  32,
  NA, NA, NA, NA,  14,  33
)

# Hand-worked expected result: the completed key combinations, one row per
# distinct key set, used below as the reference for the computed completion.
dat_keys_pre <- tibble::tribble(
  ~a, ~b, ~c, ~x,
   1, 11, 10, NA,
   2, 22, 20, NA,
   3, 33, NA, NA,
   4, NA, 40, NA,
   5, NA, NA, NA,
   6, NA, 60, NA,
  NA, 77, NA, NA,
  NA, 88, 80, NA,
  NA, NA, NA, NA
)

Results

Test key completion

Print data.

# Show every row of the input data.
print(dat_data, n = Inf)

Computing the best key variables completion.

# Keep only the key columns, then complete the key combinations.
dat_key_cols <- dplyr::select(dat_data, a, b, c, x)
dat_keys <- e_coalesce_column_set(dat_key_cols)

# Show every row of the computed key completion.
print(dat_keys, n = Inf)

Pre-determined best key variables completion.

# Show every row of the pre-determined key completion.
print(dat_keys_pre, n = Inf)

They are equal.

# Put both key tables into the same row order so they can be compared
# position by position.
dat_keys      <- dat_keys     |> dplyr::arrange(a, b, c, x)
dat_keys_pre  <- dat_keys_pre |> dplyr::arrange(a, b, c, x)

# dplyr::all_equal() is deprecated since dplyr 1.1.0; its documented
# replacement is all.equal() after ordering rows manually (done above).
# Returns TRUE when the tables match, otherwise a description of differences.
all.equal(dat_keys_pre, dat_keys)

Best key variables completion with data

# Complete the key columns directly on the full data set. dat_keys is passed
# as NULL, so no pre-computed key table is supplied here.
dat_data_updated <- e_complete_multiple_keys(
  dat_data,
  dat_keys = NULL,
  col_keys = c("a", "b", "c", "x")
)

# Show every row of the updated data.
print(dat_data_updated, n = Inf)


erikerhardt/erikmisc documentation built on April 17, 2025, 10:48 a.m.