## This file contains all functions dealing with multiple entries/indices##
## Exported functions are on top, internal functions below ##
#' Cast table with multiple entries
#' This function divides multiple entries in a compact indexed table as produced by [fhir_crack()] into separate columns.
#'
#' This function turns a table from compact format into wide format. Every column containing multiple entries will be turned into multiple columns.
#' The number of columns created from a single column in the original table is determined by the maximum number of
#' multiple entries occurring in this column. Rows with less than the maximally occurring number of entries will
#' be filled with NA values.
#'
#' For [fhir_cast()] to work properly, column names of the input data must reflect the Xpath to the corresponding resource element
#' with `.` as a separator, e.g. `code.coding.system`. These names are produced automatically by [fhir_crack()]
#' when the names are not explicitly set in the `cols` element of the [fhir_table_description()]/[fhir_design()].
#'
#' In the names of the newly created columns the indices will be added in front of the column names, similar to the result of [fhir_crack()] with
#' `format="wide"`. See examples and the corresponding package vignette for a more detailed description.
#'
#' @param indexed_df A compact data.frame/data.table with indexed multiple entries. Column names should reflect the XPath expression of the respective element.
#' @param brackets A character vector of length two, defining the brackets used for the indices.
#' @param sep A character vector of length one defining the separator that was used when pasting together multiple entries in [fhir_crack()].
#' @param verbose An integer vector of length one. If 0, nothing is printed, if 1, only general progress is printed, if > 1,
#' progress for each variable is printed. Defaults to 1.
#' @export
#' @examples
#'
#' #unserialize example
#' bundles <- fhir_unserialize(bundles = example_bundles1)
#'
#' #crack fhir resources
#' table_desc <- fhir_table_description(
#' resource = "Patient",
#' brackets = c('[', ']'),
#' sep = " ",
#' keep_attr=TRUE
#' )
#' df <- fhir_crack(bundles = bundles, design = table_desc)
#'
#' #original df
#' df
#'
#' #cast
#' fhir_cast(df, brackets=c('[', ']'), sep = ' ', verbose = 0)
#'
#' @seealso [fhir_crack()], [fhir_melt()], [fhir_build_bundle()]
fhir_cast <- function(
indexed_df,
brackets,
sep,
verbose = 1) {
if(is.null(indexed_df)) {stop("indexed_df is NULL.")}
if(nrow(indexed_df) < 1) {stop("indexed_df doesn't contain any data.")}
is_DT <- data.table::is.data.table(x = indexed_df)
if(!is_DT) {data.table::setDT(x = indexed_df)}
col_names <- names(indexed_df)
sep_ <- esc(sep)
brackets <- fix_brackets(brackets)
bra_ <- esc(brackets[1])
ket_ <- esc(brackets[2])
regexpr_ids <- stringr::str_c(bra_, "([0-9]+(\\.[0-9]+)*)", ket_, "(.*$)")
if(!any(grepl(regexpr_ids, indexed_df[1,]))){
stop("Cannot find ids with the specified brackets in the table.")
}
if(0 < verbose) {message("Expanding table...\n")}
map <- sapply(
col_names,
function(name) {
#name <- names(indexed_df)[[6]]
if(1 < verbose) {message(name)}
warning_given <- FALSE
entries <- strsplit(indexed_df[[name]], sep_)
ids <- lapply(entries, function(entry){gsub(regexpr_ids, "\\1", entry)})
name_vec <- strsplit(name, "\\.")[[1]]
name_expanded <- unlist(
lapply(
ids[sapply(ids, function(i){all(!is.na(i))})],
function(id){
#id <- ids[sapply(ids, function(i) all(!is.na(i)))][[1]]
id_ <- strsplit(id, "\\.")
names(id_) <- id
if(length(id_[[1]])!=length(name_vec) && !warning_given){
warning(
"Column name '", stringr::str_c(name_vec, collapse = "."),
"' doesn't fit the id pattern found in this column.",
"The column name should be build the way ",
"fhir_crack() automatically builds it. See ?fhir_cast."
)
warning_given <<- TRUE
}
if(1 < length(name_vec)) {
sapply(
id_,
function(i_) {
i_ <- as.numeric(i_)
stringr::str_c(stringr::str_c(brackets[1],paste(i_, collapse="."), brackets[2]), paste(name_vec, collapse = "."))
},
simplify = FALSE
)
} else {
i <- as.numeric(id)
a <- stringr::str_c(brackets[1], i, brackets[2], name_vec)
names(a) <- id
a
}
}
),
use.names = T
)
sort(name_expanded[unique(names(name_expanded))])
},
simplify = FALSE
)
df_new_names <- unlist(map, use.names = FALSE)
d <- data.table(matrix(data = rep_len(character(), nrow(indexed_df) * length(df_new_names)), nrow = nrow(indexed_df), ncol = length(df_new_names)))
setnames(d, df_new_names)
if(0 < verbose) {message("\nFilling table...\n")}
for(name in names(map)) {
#name <- names(map)[[1]]
if(1 < verbose) {message(name, ":")}
for(id in names(map[[name]])) {
#id <- names(map[[name]])[[1]]
sname <- map[[name]][[id]]
if(1 < verbose) {message(" ", sname)}
id_str <- stringr::str_c(bra_, id, ket_)
row_with_id <- grep(id_str, indexed_df[[name]], perl = T)
entries <- strsplit(indexed_df[[name]][row_with_id], sep_)
values <- gsub(
id_str,
"",
sapply(
entries,
function(entry) {
entry[grep(id_str, entry, perl = T)]
},
simplify = FALSE
)
)
d[row_with_id, (sname) := values]
}
}
if(!is_DT) {setDF(d)}
d[]#to avoid problems with printing
d
}
#' Find common columns
#'
#' This is a convenience function to find all column names in a data frame starting with the same string that can
#' then be used for [fhir_melt()].
#'
#' It is intended for use on data frames with column names that have been automatically produced by [fhir_design()]/[fhir_crack()]
#' and follow the form `level1.level2.level3` such as `name.given` or `code.coding.system`.
#' Note that this function will only work on column names following exactly this scheme.
#'
#' The resulting character vector can be used for melting all columns belonging to the same attribute in an indexed data frame,
#' see `?fhir_melt`.
#'
#' @param data_frame A data.frame/data.table with automatically named columns as produced by [fhir_crack()].
#' @param column_names_prefix A string containing the common prefix of the desired columns.
#' @return A character vector with the names of all columns matching `column_names_prefix`.
#' @examples
#' #unserialize example bundles
#' bundles <- fhir_unserialize(bundles = medication_bundles)
#'
#' #crack Patient Resources
#' pats <- fhir_table_description(resource = "Patient")
#'
#' df <- fhir_crack(bundles = bundles, design = pats)
#'
#' #look at automatically generated names
#' names(df)
#'
#' #extract all column names beginning with the string "name"
#' fhir_common_columns(data_frame = df, column_names_prefix = "name")
#' @export
#' @seealso [fhir_melt()], [fhir_rm_indices()]
#'
fhir_common_columns <- function(data_frame, column_names_prefix) {
if(!is.data.frame(data_frame)) {
stop(
"You need to supply a data.frame or data.table to the argument indexed_data_frame.",
"The object you supplied is of type ", class(data_frame), "."
)
}
pattern_column_names <- stringr::str_c("^", column_names_prefix, "($|\\.+)")
column_names <- names(data_frame)
hits <- grepl(pattern_column_names, column_names)
if(!any(hits)) {stop("The column prefix you gave doesn't appear in any of the column names.")}
column_names[hits]
}
#' Melt multiple entries
#'
#' This function divides multiple entries in an indexed data frame as produced by [fhir_crack()]
#' into separate rows.
#'
#' Every row containing values that consist of multiple entries on the variables specified by the argument `columns`
#' will be turned into multiple rows, one for each entry. Values on other variables will be repeated in all the new rows.
#'
#' The new data.frame will contain only the molten variables (if `all_cloumns = FALSE`) or all variables
#' (if `all_columns = TRUE`) as well as an additional variable `resource_identifier` that maps which rows came
#' from the same origin. The name of this column can be changed in the argument `id_name`.
#'
#' For a more detailed description on how to use this function please see the corresponding package vignette.
#'
#' @param indexed_data_frame A data.frame/data.table with indexed multiple entries.
#' @param columns A character vector specifying the names of all columns that should be molten simultaneously.
#' It is advisable to only melt columns simultaneously that belong to the same (repeating) attribute!
#' @param brackets A character vector of length two, defining the brackets used for the indices.
#' @param sep A character vector of length one defining the separator that was used when pasting together multiple entries in [fhir_crack()].
#' @param id_name A character vector of length one, the name of the column that will hold the identification of the origin of the new rows.
#' @param all_columns Return all columns? Defaults to `FALSE`, meaning only those specified in `columns` are returned.
#'
#' @return A data.frame/data.table where each entry from the variables in `columns` appears in a separate row.
#'
#' @examples
#' #unserialize example
#' bundles <- fhir_unserialize(bundles = example_bundles1)
#'
#' #crack fhir resources
#' table_desc <- fhir_table_description(
#' resource = "Patient",
#' brackets = c("[", "]"),
#' sep = " "
#' )
#'
#' df <- fhir_crack(bundles = bundles, design = table_desc)
#'
#' #find all column names associated with attribute address
#' col_names <- fhir_common_columns(df, "address")
#'
#' #original data frame
#' df
#'
#' #only keep address columns
#' fhir_melt(
#' indexed_data_frame = df,
#' columns = col_names,
#' brackets = c("[", "]"),
#' sep = " "
#' )
#'
#' #keep all columns
#' fhir_melt(indexed_data_frame = df, columns = col_names,
#' brackets = c("[","]"), sep = " ", all_columns = TRUE)
#' @export
#' @seealso [fhir_common_columns()], [fhir_rm_indices()]
fhir_melt <- function(
indexed_data_frame,
columns,
brackets = c("<", ">"),
sep = " ",
id_name = "resource_identifier",
all_columns = FALSE) {
if(!is.data.frame(indexed_data_frame)) {
stop(
"You need to supply a data.frame or data.table to the argument indexed_data_frame.",
"The object you supplied is of type ", class(indexed_data_frame), "."
)
}
if(!all(columns %in% names(indexed_data_frame))) {
stop("Not all column names you gave match with the column names in the data frame.")
}
indexed_dt <- copy(indexed_data_frame) #copy to avoid side effects
is_DT <- data.table::is.data.table(x = indexed_dt)
if(!is_DT) {data.table::setDT(x = indexed_dt)}
brackets <- fix_brackets(brackets = brackets)
indexed_dt[,(id_name):=1:nrow(indexed_dt)]
expanded <- indexed_dt[, melt_row(.SD, columns = columns, brackets = brackets, sep = sep), by = eval((id_name)), .SDcols = columns]
if(all_columns){
rest <- setdiff(names(indexed_dt), columns)
result <- merge.data.table(
x = expanded,
y = indexed_dt[, rest, with=FALSE],
by = id_name,
all = T
)
data.table::setcolorder(result, names(indexed_dt))
}else{
result <- expanded
}
if(nrow(result) == 0) {warning("The brackets you specified don't seem to appear in the indices of the provided data.frame. Returning NULL.")}
if(!is_DT) {setDF(result)}
result
}
#' Melt all columns with multiple entries
#'
#' This function divides all multiple entries in an indexed data frame as produced by [fhir_crack()]
#' into separate rows.
#'
#' The function repeatedly calls [fhir_melt()] on groups of columns that belong to the same
#' FHIR element (e.g. `address.city`, `address.country` and `address.type`) until every cell contains a single value.
#' If there is more than one FHIR element with multiple values (e.g. multiple address elements and multiple name
#' elements), every possible combination of the two elements will appear in the resulting table.
#' Caution! This creates something like a cross product of all values and can multiply the number of rows from the original
#' table considerably.
#'
#' @param indexed_data_frame A data.frame/data.table with indexed multiple entries.
#' @param brackets A character vector of length two, defining the brackets used for the indices.
#' @param sep A character vector of length one defining the separator that was used when pasting together multiple entries in [fhir_crack()].
#' @param column_name_separator A character string that separates element levels column names. Defaults to ".", which is used when
#' column names were generated automatically with [fhir_crack()].
#'
#' @return A completely molten data.table.
#'
#' @examples
#' #unserialize example
#' bundles <- fhir_unserialize(bundles = example_bundles1)
#'
#' #crack fhir resources
#' table_desc <- fhir_table_description(
#' resource = "Patient",
#' brackets = c("[", "]"),
#' sep = " "
#' )
#'
#' df <- fhir_crack(bundles = bundles, design = table_desc)
#'
#' #original data frame
#' df
#'
#' #melt all multiple entries
#' fhir_melt_all(
#' indexed_data_frame = df,
#' brackets = c("[", "]"),
#' sep = " "
#' )
#' @export
fhir_melt_all <- function(indexed_data_frame, brackets, sep, column_name_separator = ".") {
getColumns <- function(prefix) {
grep(paste0("^", prefix, "$|^", prefix, esc(column_name_separator)), column_names, value = TRUE)
}
melted_data <- indexed_data_frame
column_names <- names(indexed_data_frame)
getUniquePrefixes <- function(step) {
# Split each column name by the separator
split_names <- strsplit(column_names, esc(column_name_separator))
# Initialize a vector to store the prefixes
prefixes <- c()
for (name_parts in split_names) {
# Check if the number of parts is sufficient for the given step
if (length(name_parts) >= step) {
# Create the prefix by joining the first `step` parts with the separator
prefix <- paste(name_parts[1:step], collapse = column_name_separator)
prefixes <- c(prefixes, prefix)
}
}
# Get unique prefixes
prefixes <- unique(prefixes)
return(prefixes)
}
step <- 1
repeat {
prefixes <- getUniquePrefixes(step)
if (!rlang::is_empty(prefixes)) {
for (prefix in prefixes) {
columns <- getColumns(prefix)
melted_data <- fhir_melt(melted_data, columns, brackets, sep, all_columns = TRUE)
}
} else {
break
}
step <- step + 1
}
melted_data <- fhir_rm_indices(melted_data, brackets, column_names)
melted_data$resource_identifier <- NULL
return(melted_data)
}
#' Remove indices from data.frame/data.table
#'
#' Removes the indices in front of multiple entries as produced by [fhir_crack()] when brackets are provided in
#' the design.
#' @param indexed_data_frame A data frame with indices for multiple entries as produced by [fhir_crack()]
#' @param brackets A character vector of length two defining the brackets that were used in [fhir_crack()]
#' @param columns A character vector of column names, indicating from which columns indices should be removed.
#' Defaults to all columns.
#'
#' @return A data frame without indices.
#' @export
#'
#' @examples
#'
#' #unserialize example
#' bundles <- fhir_unserialize(bundles = example_bundles1)
#'
#' patients <- fhir_table_description(resource = "Patient")
#'
#' df <- fhir_crack(bundles = bundles,
#' design = patients,
#' brackets = c("[", "]"))
#'
#' df_indices_removed <- fhir_rm_indices(indexed_data_frame = df, brackets=c("[", "]"))
#'
#' @seealso [fhir_melt()]
fhir_rm_indices <- function(indexed_data_frame, brackets = c("<", ">"), columns = names( indexed_data_frame )) {
if(!is.data.frame(indexed_data_frame)) {
stop(
"You need to supply a data.frame or data.table to the argument indexed_data_frame.",
"The object you supplied is of type ", class(indexed_data_frame), "."
)
}
indexed_dt <- copy(indexed_data_frame) #copy to avoid side effects
is_DT <- data.table::is.data.table(x = indexed_dt)
if(!is_DT) {data.table::setDT(x = indexed_dt)}
brackets <- fix_brackets(brackets = brackets)
brackets.escaped <- esc(s = brackets)
pattern.ids <- stringr::str_c(brackets.escaped[1], "([0-9]+\\.*)*", brackets.escaped[2])
if(!any(grepl(pattern.ids, indexed_dt))) {
warning("The brackets you specified don't seem to appear in the data.frame.")
}
result <- data.table::data.table(gsub(pattern = pattern.ids, replacement = "", x = as.matrix(indexed_dt[,columns, with = FALSE])))
indexed_dt[,columns] <- result
if(!is_DT) {data.table::setDF(x = indexed_dt)}
indexed_dt
}
#' Collapse multiple entries
#'
#' This function collapses multiple entries that belong to the same higher level FHIR element (see examples).
#'
#' Currently this function is only needed for very few FHIR elements where multiple values should be kept together
#' in the melting process. To our knowledge, this is only true for address.line elements and name.given elements.
#' Rather than building the cross product with all other elements in the resource as done by [fhir_melt()], these
#' elements should be collapsed into a single entry before melting. See examples to get a better idea of this.
#'
#' @param indexed_data_frame A data.frame/data.table with indexed multiple entries.
#' @param columns A character vector of column names where values should be collapsed
#' @param brackets A character vector of length two, defining the brackets used for the indices.
#' @param sep A character vector of length one defining the separator that was used when pasting together multiple entries in [fhir_crack()].
#' @param collapse A character vector of length one used to separate the collapsed fields. Defaults to blank space.
#' @return The modified data.table/data.frame with collapsed multiple entries
#' @export
#' @examples
#' ### First example: Keep name.given elements together
#' #unserialize example
#' bundles <- fhir_unserialize(bundles = example_bundles7)
#'
#' #Have a look at the structure of example_bundles7
#' ?example_bundles7
#'
#' #Define sep and brackets
#' sep <- "|"
#' brackets <- c("[", "]")
#'
#' #crack fhir resources
#' table_desc <- fhir_table_description(
#' resource = "Patient",
#' brackets = brackets,
#' sep = sep
#' )
#'
#' df <- fhir_crack(bundles = bundles, design = table_desc)
#' df
#'
#' #name.given elements from the same name (i.e. the official vs. the nickname)
#' #should be collapsed
#'
#' df2 <- fhir_collapse(df, columns = "name.given", sep = sep, brackets = brackets)
#' df2
#'
#' #Next the name can be molten
#' fhir_melt(df2, brackets = brackets, sep = sep, columns = fhir_common_columns(df2,"name"))
#'
#'
#' ### Second: Keep address line elements together
#' #unserialize example
#' bundles <- fhir_unserialize(bundles = example_bundles6)
#'
#' #Have a look at the structure of example_bundles6
#' ?example_bundles6
#'
#' #Define sep and brackets
#' sep <- "|"
#' brackets <- c("[", "]")
#'
#' #crack fhir resources
#' table_desc <- fhir_table_description(
#' resource = "Patient",
#' brackets = brackets,
#' sep = sep
#' )
#'
#' df <- fhir_crack(bundles = bundles, design = table_desc)
#' df
#'
#' #Address.line elements from the same address (i.e. the work vs. the home address)
#' #should be collapsed
#'
#' df2 <- fhir_collapse(df, columns = "address.line", sep = sep, brackets = brackets, collapse = ", ")
#' df2
#'
#' #Next the address can be molten
#' fhir_melt(df2, brackets = brackets, sep = sep, columns = fhir_common_columns(df2,"address"))
#'
#'
#' @seealso [fhir_melt()]
fhir_collapse <- function(indexed_data_frame, columns, sep, brackets, collapse = " ") {
for (column_name in columns) {
for (i in 1:nrow(indexed_data_frame)) {
# Check if the cell is not empty
if (length(indexed_data_frame[[column_name]][i])) {
# Check if the cell starts with sep
if (grepl("^\\[", indexed_data_frame[[column_name]][i])) {
#split the string by sep
split_string <- strsplit(indexed_data_frame[[column_name]][i], sep, fixed = TRUE)[[1]]
# Initialize the index vector and previous index
indices <- c()
prev_index <- NULL
# Iterate through the vector and find the first indices for each new pattern
for (vec in seq_along(split_string)) {
index <- as.numeric(stringr::str_extract(split_string[vec], paste0("(?<=\\", brackets[1], ")\\d")))
if (is.null(prev_index) || is.na(prev_index) || index != prev_index) {
indices <- c(indices, vec)
prev_index <- index
}
}
# Remove brackets for all subsequent elements except the first occurrence an collapse values
result <- paste(ifelse(seq_along(split_string) %in% indices, split_string,
gsub(paste0("^\\", brackets[1], ".*\\", brackets[2]), "",
split_string, perl = TRUE)), collapse = collapse)
# Replace the original cell value with the modified result
indexed_data_frame[[column_name]][i] <- result
}
}
}
}
return(indexed_data_frame)
}
########################################################################################
########################################################################################
#' Turn a row with multiple entries into a data frame
#'
#' @param row One row of an indexed data frame
#' @param columns A character vector specifying the names of all columns that should be molten simultaneously.
#' It is advisable to only melt columns simultaneously that belong to the same (repeating) attribute!
#' @param brackets A character vector of length 2, defining the brackets used for the indices.
#' @param sep A character vector of length one, the separator.
#' @param all_columns Return all columns? Defaults to `FALSE`, meaning only those specified in `columns` are returned.
#' @return A data frame with nrow > 1.
#' @noRd
melt_row <- function(row, columns, brackets = c("<", ">"), sep = " ") {
row <- as.data.frame(row)
brackets.escaped <- esc(s = brackets)
pattern.ids <- stringr::str_c(brackets.escaped[1], "([0-9]+\\.*)+", brackets.escaped[2])
ids <- stringr::str_extract_all(string = row, pattern = pattern.ids)
names(ids) <- columns
pattern.items <- stringr::str_c(brackets.escaped[1], "([0-9]+\\.*)+", brackets.escaped[2])
items <- stringr::str_split(string = row, pattern = pattern.items)
items <- lapply(
items,
function(i) {
if(!all(is.na(i)) && i[1] == "") {i[2:length(i)]} else {i}
}
)
names(items) <- columns
d <- row[0,,drop=FALSE]
data.table::setDF(d)
for(i in names(ids)) {
id <- ids[[i]]
if(!all(is.na(id))) {
it <- items[[i]]
new.rows <- gsub(stringr::str_c(brackets.escaped[1], "([0-9]+)\\.*.*"), "\\1", id)
new.ids <- gsub(stringr::str_c("(", brackets.escaped[1], ")([0-9]+)\\.*(.*", brackets.escaped[2], ")" ), "\\1\\3", id)
unique.new.rows <- unique(new.rows)
set <- stringr::str_c(new.ids, it)
f <- sapply(
unique.new.rows,
function(unr) {
fltr <- unr == new.rows
stringr::str_c(set[fltr], collapse = "")
}
)
for(n in unique.new.rows) {
d[as.numeric(n), i]<- gsub(pattern = stringr::str_c(esc(sep), "$"), replacement = "", x = f[names(f) == n], perl = TRUE)
}
}
}
data.table::setDT(d)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.