# Global knitr chunk options for this vignette: collapse source and output
# into a single block and prefix every output line with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment  = "#>"
)
library(erikmisc)
Consider the data below.
Let `a`, `b`, and `c` each be unique keys from a variety of data sources,
where some data sources contain more than one of the keys,
but not necessarily all of them at the same time.
The goal is to complete the keys as fully as possible by cross-referencing all other matching pairs.
For example, `a=1` and `b=11` appear together, and `a=1` and `c=10` appear together;
therefore, these three key values should appear together on the first two rows.
Data and best key variables completion.
`x` is a key variable but does not appear in this subset.
The function handles empty keys, too.
# with data for key matching imputation
# Columns a, b, c, and x are the key columns; d1 and d2 are the remaining
# (non-key) columns. Column x is entirely NA in this example.
dat_data <-
  tibble::tribble(
    ~a, ~b, ~c, ~x, ~d1, ~d2
  ,  1, 11, NA, NA,   1,  20
  ,  1, NA, 10, NA,   2,  21
  ,  2, NA, NA, NA,   3,  22
  , NA, 22, 20, NA,   4,  23
  ,  2, NA, 20, NA,   5,  24
  ,  3, 33, NA, NA,   6,  25
  ,  4, NA, 40, NA,   7,  26
  ,  5, NA, NA, NA,   8,  27
  ,  6, NA, 60, NA,   9,  28
  ,  6, NA, 60, NA,  10,  29
  , NA, 77, NA, NA,  11,  30
  , NA, 88, 80, NA,  12,  31
  , NA, 88, NA, NA,  13,  32
  , NA, NA, NA, NA,  14,  33
  )

# pre-determined best key variables completion
# One row per distinct completed key combination; used below as the expected
# result when checking the computed completion.
dat_keys_pre <-
  tibble::tribble(
    ~a, ~b, ~c, ~x
  ,  1, 11, 10, NA
  ,  2, 22, 20, NA
  ,  3, 33, NA, NA
  ,  4, NA, 40, NA
  ,  5, NA, NA, NA
  ,  6, NA, 60, NA
  , NA, 77, NA, NA
  , NA, 88, 80, NA
  , NA, NA, NA, NA
  )
Print data.
# Show every row of the example data (n = Inf disables row truncation).
print(dat_data, n = Inf)
Computing the best key variables completion.
# Keep only the key columns and compute their completed combinations.
dat_keys <-
  dat_data |>
  dplyr::select(a, b, c, x) |>
  e_coalesce_column_set()

# Show every row of the computed key completion.
dat_keys |>
  print(n = Inf)
Pre-determined best key variables completion.
# Show every row of the pre-determined key completion.
print(dat_keys_pre, n = Inf)
They are equal.
# Sort both key tables by the key columns so their rows line up.
dat_keys <-
  dat_keys |>
  dplyr::arrange(a, b, c, x)

dat_keys_pre <-
  dat_keys_pre |>
  dplyr::arrange(a, b, c, x)

# Compare the pre-determined completion with the computed one.
# NOTE(review): dplyr::all_equal() is deprecated as of dplyr 1.1.0; consider
# switching to all.equal() after confirming it gives the same result here.
dplyr::all_equal(dat_keys_pre, dat_keys)
# Complete the key columns within the full data set in a single call.
# NOTE(review): with dat_keys = NULL the key table is presumably derived
# internally from col_keys -- confirm against e_complete_multiple_keys() docs.
dat_data_updated <-
  e_complete_multiple_keys(
    dat_data
  , dat_keys = NULL
  , col_keys = c("a", "b", "c", "x")
  )

# Show every row of the updated data.
dat_data_updated |>
  print(n = Inf)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.