#' Merge data tables
#'
#' Merge two data.table objects. If there are any duplicated
#' ID values and column names across the two data tables, the
#' cell values in the first data.table will remain intact and
#' the cell values in the second data.table will be discarded for the
#' resulting merged data table.
#'
#' @param dt1 the first data.table which will remain intact
#' @param dt2 the second data.table which will be joined outside of
#' (around) the first data.table. If there are any duplicated
#' ID values and column names across the two data tables, the
#' cell values in the first data.table will remain intact and
#' the cell values in the second data.table will be discarded for the
#' resulting merged data table.
#' @param id name(s) of the column(s) that will contain the ID values
#' in the two data tables. The name(s) of the ID column(s) must be identical
#' in the two data tables.
#' @param silent If \code{silent = TRUE}, no message will be printed
#' regarding how many ID values and column names were duplicated.
#' If \code{silent = FALSE}, messages will be printed regarding
#' how many column names were duplicated.
#' In cases where only one column was used as the 'id' column (which is the
#' most common case), \code{silent = FALSE} will also print messages
#' regarding how many input ID values were duplicated.
#' By default, \code{silent = FALSE}.
#' @return a data.table object, which merges (joins) the second data.table
#' around the first data.table.
#' @examples
#' ## Example 1: Typical Usage
#' data_1 <- data.table::data.table(
#' id_col = c(4, 2, 1, 3),
#' a = 3:6,
#' b = 5:8,
#' c = c("w", "x", "y", "z"))
#' data_2 <- data.table::data.table(
#' id_col = c(1, 99, 4),
#' e = 6:8,
#' b = c("p", "q", "r"),
#' d = c(TRUE, FALSE, FALSE))
#' # check the two example data tables
#' data_1
#' data_2
#' # check the result of merging the two data tables above and
#' # note how data_1 (the upper left portion) is intact in the resulting
#' # data table
#' merge_data_tables(dt1 = data_1, dt2 = data_2, id = "id_col")
#' # compare the result with above with the result from the `merge` function
#' merge(data_1, data_2, by = "id_col", all = TRUE)
#' ## Example 2: Some values can be converted
#' data_3 <- data.table::data.table(
#' id_col = 99,
#' a = "abc",
#' b = TRUE,
#' c = TRUE)
#' data_1
#' data_3
#' merge_data_tables(data_1, data_3, id = "id_col")
#' # In the example above, note how the value of TRUE gets
#' # converted to 1 in the last row of Column 'b' in the resulting data table
#' ## Example 3: A simpler case
#' data_4 <- data.table::data.table(
#' id_col = c(5, 3),
#' a = c("a", NA))
#' data_5 <- data.table::data.table(
#' id_col = 1,
#' a = 2)
#' # check the two example data tables
#' data_4
#' data_5
#' merge_data_tables(data_4, data_5, id = "id_col")
#' ## Example 4: Merging data tables using multiple ID columns
#' data_6 <- data.table::data.table(
#' id_col_1 = 3:1,
#' id_col_2 = c("a", "b", "c"),
#' id_col_3 = 4:6,
#' a = 7:9,
#' b = 10:12)
#' data_7 <- data.table::data.table(
#' id_col_1 = c(3, 2),
#' id_col_3 = c(3, 5),
#' id_col_2 = c("a", "b"),
#' c = 13:14,
#' a = 15:16)
#' # check the example data sets
#' data_6
#' data_7
#' # merge data sets using the three id columns
#' suppressWarnings(merge_data_tables(
#' dt1 = data_6,
#' dt2 = data_7,
#' id = c("id_col_1", "id_col_2", "id_col_3")))
#' @import data.table
#' @export
merge_data_tables <- function(
dt1 = NULL,
dt2 = NULL,
id = NULL,
silent = TRUE
) {
# stop if an input is missing
if (is.null(dt1)) {
stop("Please enter an input for the first data table argument, dt1.")
}
if (is.null(dt2)) {
stop("Please enter an input for the second data table argument, dt2.")
}
if (is.null(id)) {
stop(paste0("Please enter an input for the ID column name, ",
'e.g., id = "subject_id".'))
}
# copy and coerce inputs into data tables
dt1c <- data.table::setDT(data.table::copy(dt1))
dt2c <- data.table::setDT(data.table::copy(dt2))
# check the id column for cases when there is only one id column
if (length(id) == 1) {
# check if the id column is in both data tables
if (!id %in% names(dt1c)) {
stop(paste0(
'The id column "', id, '" is not found in the first',
" data table, dt1."))
}
if (!id %in% names(dt2c)) {
stop(paste0(
'The id column "', id, '" is not found in the second',
" data table, dt2."))
}
}
# check the id column for cases when there multiple id columns
if (length(id) > 1) {
warning(paste0(
"Merging using multiple columns may not work properly.\n",
"Please refer to the usage example of merging data_6 and data_7",
"\nin the manual. Type `?kim::merge_data_tables`\n",
"and scroll to the bottom to view the example.\n",
"To prevent this warning message from popping up again,\n",
"use the 'suppressWarnings' as in the example."))
if (any(duplicated(id))) {
stop("Each id column name must be unique.")
}
# check whether all the id columns exist in dt1
id_cols_not_in_dt1 <- setdiff(id, names(dt1c))
if (length(id_cols_not_in_dt1) > 1) {
stop(paste0(
"The following id column(s) are not found in the ",
"first data table, dt1:\n",
paste0(id_cols_not_in_dt1, collapse = ", ")))
}
# check whether all the id columns exist in dt2
id_cols_not_in_dt2 <- setdiff(id, names(dt2c))
if (length(id_cols_not_in_dt2) > 1) {
stop(paste0(
"The following id column(s) are not found in the ",
"second data table, dt2:\n",
paste0(id_cols_not_in_dt2, collapse = ", ")))
}
}
# create a temporary id column when multiple columns
# are given as input for id
if (length(id) > 1) {
i <- 1
temp_id_col_suffix_max <- 999999999
suffix_found <- FALSE
while(suffix_found == FALSE) {
if (!paste0("temp_col_name_kim_", i) %in% names(dt1c) &
!paste0("temp_col_name_kim_", i) %in% names(dt2c)) {
temp_id_col_name <- paste0("temp_col_name_kim_", i)
suffix_found <- TRUE
} else {
i <- i + 1
}
}
separator <- gsub("\\.", "", format(Sys.time(), "_%Y%m%d%H%M%OS6_"))
dt1c[, (temp_id_col_name) :=
do.call(paste, c(.SD, sep = separator)), .SDcols = id]
# check the temporary id column just created for dt1
if (any(duplicated(dt1c[[temp_id_col_name]]))) {
stop(paste0(
"Failed to create unique temporary ID values in dt1, ",
"based on the\ninput of ID columns. This is likely due to ",
"the function's limitations."))
}
dt2c[, (temp_id_col_name) :=
do.call(paste, c(.SD, sep = separator)), .SDcols = id]
# check the temporary id column just created for dt2
if (any(duplicated(dt2c[[temp_id_col_name]]))) {
stop(paste0(
"Failed to create unique temporary ID values in dt2, ",
"based on the\ninput of ID columns. This is likely due to ",
"the function's limitations."))
}
# id values in each data table; replicate so that the order is preserved
# jin is not sure if the 'rep' function is necessary
dt1c_id_values <- rep(dt1c[[temp_id_col_name]])
dt2c_id_values <- rep(dt2c[[temp_id_col_name]])
} else if (length(id) == 1) {
# id values in each data table; replicate so that the order is preserved
dt1c_id_values <- rep(dt1c[[id]])
dt2c_id_values <- rep(dt2c[[id]])
}
# id values in the final dt
id_in_final_dt <- union(dt1c_id_values, dt2c_id_values)
# id values unique in dt2c
unique_id_in_dt2c <- setdiff(dt2c_id_values, dt1c_id_values)
# print duplicated id values
duplicated_id_values <- intersect(dt1c_id_values, dt2c_id_values)
if (length(id) == 1) {
if (silent == FALSE) {
message(paste0(
"Number of duplicated ID values: ", length(duplicated_id_values)))
}
}
# column names in each data table
dt1c_col_names <- names(dt1c)
dt2c_col_names <- names(dt2c)
# column names in the final dt
if (length(id) == 1) {
col_names_in_final_dt <- union(dt1c_col_names, dt2c_col_names)
} else if (length(id) > 1) {
col_names_in_final_dt <- setdiff(
union(dt1c_col_names, dt2c_col_names), temp_id_col_name)
}
# names of non-ID columns that are in both data tables
if (length(id) == 1) {
duplicated_col_names <- setdiff(
intersect(dt1c_col_names, dt2c_col_names), id)
} else if (length(id) > 1) {
duplicated_col_names <- setdiff(
intersect(dt1c_col_names, dt2c_col_names), temp_id_col_name)
}
# print duplicated column names
if (silent == FALSE) {
message(paste0(
"Number of duplicated column names: ",
length(duplicated_col_names)))
}
# set keys in each dt
if (length(id) == 1) {
data.table::setkeyv(dt1c, id)
data.table::setkeyv(dt2c, id)
} else if (length(id) > 1) {
data.table::setkeyv(dt1c, temp_id_col_name)
data.table::setkeyv(dt2c, temp_id_col_name)
}
# merge data tables
merged_dt <- merge(dt1c, dt2c, all = TRUE, sort = FALSE)
# merge duplicated columns
if (length(duplicated_col_names) > 0) {
if (length(id) == 1) {
merged_cols <- lapply(duplicated_col_names, function(x) {
dt1c_rows <- merged_dt[[id]] %in% dt1c_id_values
v1 <- merged_dt[[paste0(x, ".x")]][dt1c_rows]
dt2c_rows <- merged_dt[[id]] %in% unique_id_in_dt2c
v2 <- merged_dt[[paste0(x, ".y")]][dt2c_rows]
output <- c(v1, v2)
return(output)
})
} else if (length(id) > 1) {
merged_cols <- lapply(duplicated_col_names, function(x) {
dt1c_rows <- merged_dt[[temp_id_col_name]] %in% dt1c_id_values
v1 <- merged_dt[[paste0(x, ".x")]][dt1c_rows]
dt2c_rows <- merged_dt[[temp_id_col_name]] %in% unique_id_in_dt2c
v2 <- merged_dt[[paste0(x, ".y")]][dt2c_rows]
output <- c(v1, v2)
return(output)
})
}
# give names for merged_cols
names(merged_cols) <- duplicated_col_names
# replace the first set of duplicated columns (those with suffix ".x")
# with the newly created merged columns
cols_to_replace <- paste0(duplicated_col_names, ".x")
for (col in cols_to_replace) {
data.table::set(
merged_dt, j = col,
value = merged_cols[[gsub("\\.x$", "", col)]])
}
# remove the second set of duplicated columns (those with suffix ".y")
cols_to_remove <- paste0(duplicated_col_names, ".y")
merged_dt[, (cols_to_remove) := NULL]
data.table::setnames(
merged_dt, old = cols_to_replace, duplicated_col_names)
}
# restore the original order of rows
if (length(id) == 1) {
output <- kim::order_rows_specifically_in_dt(
dt = merged_dt,
col_to_order_by = id,
specific_order = id_in_final_dt)
} else if (length(id) > 1) {
output <- kim::order_rows_specifically_in_dt(
dt = merged_dt,
col_to_order_by = temp_id_col_name,
specific_order = id_in_final_dt)
# remove the temporary id column
output[, (temp_id_col_name) := NULL][]
}
# output
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.