Nothing
#' Create long dataset.
#'
#' This function is usually not called by the user. Instead it is usually
#' called by [harmonization()] function.
#'
#' The function takes the harmonization sheet, and input dataframe, and creates
#' a dataframe with the harmonized variable.
#'
#' @param vars_interest Variable currently being harmonized
#' @param subdomain Category of variable
#' @param previous_dataset Dataframe created so far
#' @param error_log Error log
#' @param na_string Character string of final recode value to be set to NA.
#' @param verbose (TRUE/FALSE) Should the function print the current progress
#' to the console?
#'
#' @importFrom dplyr rename mutate starts_with bind_rows
#' @importFrom rlang := .data
#'
#' @return Returns a list with the harmonized long dataset, and error log.
#' @export
#'
#' @examples
#'
#' # Example sheet
#' test_sheet <- harmonization_sheet_example[harmonization_sheet_example$study == 'Cohort A',]
#'
#' # Example dataset
#' test_data <- cohort_a
#'
#' # create error log
#' test_error_log <- test_sheet[,c('item','study','visit','possible_range')]
#'
#' test_error_log$completed_status <- NA_character_
#' test_error_log$completed_reason <- NA_character_
#' test_error_log$range_set_to_na <- NA_integer_
#' test_error_log$range_out_of_range_warning <- NA
#'
#' long_dataset <- create_long_dataset(vars_interest = test_sheet,
#' subdomain = 'age',
#' previous_dataset = test_data,
#' error_log = test_error_log,
#' na_string = 'NA',
#' verbose = TRUE)
#'
create_long_dataset <- function(vars_interest, subdomain, previous_dataset, error_log, na_string, verbose = TRUE)
{
visit <- NULL
ID <- NULL
vars_interest <- vars_interest[vars_interest$item == subdomain,]
counter <- 1
## Loop through rows
for(current_row in 1:nrow(vars_interest))
{
cohort <- vars_interest[current_row,][['study']]
if(verbose) {
print(glue::glue('Currently on item: {subdomain}; cohort: {cohort}; visit {vars_interest[current_row,][["visit"]]} / {max(vars_interest[vars_interest$study == vars_interest[current_row,][["study"]],][["visit"]])}.'))
}
## Getting values from harmonization sheet
## Source dataset name
source_dataset <- vars_interest[current_row,][['source_dataset']]
## Source variable name
source_item <- vars_interest[current_row,][['source_item']]
## ID variable for this cohort
id_var <- vars_interest[current_row,][['id_var']]
## Check to see if there are multiple variables (check for semi colon)
## If there are multiple variables, split into list
if(grepl(';',source_item))
{
source_item <- stringr::str_split(source_item, ';')[[1]] %>%
stringr::str_trim()
}
## New variable name
item <- vars_interest[current_row,][['item']]
## New name for source variable name
## Renaming in order to create long dataset
source_item_long <- paste0('source_',item)
## Coding instructions
code_instruct <- vars_interest[current_row,][['code1']]
## Code type
code_type <- vars_interest[current_row,][['code_type']]
# Keep first character (f for function, r for recode)
code_type <- stringr::str_trim(code_type)
code_type <- tolower(code_type)
code_type <- substr(code_type, 1, 1)
## Possible range
possible_range <- vars_interest[current_row,][['possible_range']]
## Data manipulation
## get dataset and variable
## Error handling
if(exists(source_dataset) == FALSE)
{
if(verbose) {
print('Dataset not found')
}
error_log[error_log$item == subdomain,][current_row,c('completed_status')] <- 'Not completed'
error_log[error_log$item == subdomain,][current_row,c('completed_reason')] <- 'Dataset not found'
next
} else if(source_dataset == 'previous_dataset')
{
curr_cohort <- cohort
temp_dataset <- previous_dataset %>%
filter(.data$visit == vars_interest[current_row,][['visit']] & .data$cohort == curr_cohort)
} else
{
temp_dataset <- get(source_dataset)
}
## Keeping variables of interest
# Checking for presence of variable name
for(j in 1:length(source_item))
{
var_exists <- TRUE
if(!(source_item[[j]] %in% names(temp_dataset)))
{
if(verbose) {
print('Variable not found')
}
error_log[error_log$item == subdomain,][current_row,c('completed_status')] <- 'Not completed'
error_log[error_log$item == subdomain,][current_row,c('completed_reason')] <- 'Variable not found'
var_exists <- FALSE
}
if(var_exists == FALSE) {break}
}
if(var_exists == FALSE) {next}
# Checking for id_var in dataset
if(!(id_var %in% names(temp_dataset)))
{
message(paste0('id_var: ', id_var, ' not found in dataset.'))
error_log[error_log$item == subdomain,][current_row,c('completed_status')] <- 'Not completed'
error_log[error_log$item == subdomain,][current_row,c('completed_reason')] <- 'id_var not found'
next
}
temp_dataset <- temp_dataset[,c(id_var,source_item)]
## Rename variable of interest
temp_dataset <- temp_dataset %>%
rename({{source_item_long}} := {{source_item}})
## Assign visit column
temp_dataset$visit <- vars_interest[current_row,][['visit']]
## Assign cohort column
temp_dataset$cohort <- cohort
## Modify values
## Use code type to determine what code to use
## function vs categorical for example
## Check number of arguments
num_arguments <- stringr::str_count(code_instruct, pattern = 'x[:digit:]+')
## If no change is needed assign old value to new column
if (is.na(code_type) | is.na(code_instruct) | code_type == '')
{
temp_dataset[,item] <- temp_dataset[,source_item_long]
}
else if (code_type == 'f' & num_arguments > 1)
{
temp_dataset[,item] <- code_modify_func_multi(temp_dataset,
code_instruct,
source_item_long,
user_args = source_item,
sourcedataset = source_dataset,
subdomain = subdomain,
cohort = cohort,
visit = vars_interest[current_row,][['visit']])
# Recode source items to character, if new item is character
}
else if (code_type == 'f')
{
temp_dataset[,item] <- code_modify_func(data = temp_dataset,
instruction = code_instruct,
old_var = source_item_long)
# Recode source item to character, if new item is character
if(is.character(temp_dataset[[item]]) && is.character(temp_dataset[[source_item_long]]) == FALSE)
{
temp_dataset[[source_item_long]] <- as.character(temp_dataset[[source_item_long]])
}
}
else if (code_type == 'r')
{
# 6/29/2021 Convert to character
#
# Was causing issues previously because some values
# were integers, while others were character
if (is.character(temp_dataset[[source_item_long]]) == FALSE)
{
temp_dataset[[source_item_long]] <- as.character(temp_dataset[[source_item_long]])
}
temp_dataset[,item] <- code_modify_recode(temp_dataset,
code_instruct,
source_item_long,
new_var = item,
na_string = na_string)
# Convert to character
#temp_dataset[[source_item_long]] <- as.character(temp_dataset[[source_item_long]])
}
## Possible range
# Set values outside of range to NA
if (!is.na(possible_range) & possible_range != '')
{
if (is.numeric(temp_dataset[[item]]))
{
range_result <- range_function(temp_dataset, possible_range, item)
temp_dataset[,item] <- range_result[['new_value']]
error_log[error_log$item == subdomain,][current_row,c('range_set_to_na')] <- range_result[['range_set_to_na']]
} else if (is.character(temp_dataset[[item]]))
{
range_result <- range_function_cat(temp_dataset, possible_range, item)
temp_dataset[,item] <- range_result[['new_value']]
error_log[error_log$item == subdomain,][current_row,c('range_set_to_na')] <- range_result[['range_set_to_na']]
}
}
# Rename ID variable
if (id_var != 'ID')
{
num_to_rename <- which(names(temp_dataset) == id_var)
names(temp_dataset)[num_to_rename] <- 'ID'
}
temp_dataset <- temp_dataset %>%
mutate(ID = as.character(ID))
## Add rows to previous dataset
if(counter == 1){
final_dataset <- temp_dataset
} else if(counter > 1){
## Change old variable to character if new value is character
if (
(
(is.character(temp_dataset[[source_item_long]]) == FALSE & is.character(temp_dataset[[item]])) |
(is.character(final_dataset[[source_item_long]]) & is.character(temp_dataset[[source_item_long]]) == FALSE)
) &
(is.na(num_arguments) | num_arguments %in% c(0,1))
)
{
temp_dataset[[source_item_long]] <- as.character(temp_dataset[[source_item_long]])
}
# Convert to character as needed for final dataset
names_to_check <- unique(c(names(final_dataset), names(temp_dataset)))
for(i in 1:length(names_to_check))
{
current_var_name <- names_to_check[[i]]
if(mode(final_dataset[[current_var_name]]) != mode(temp_dataset[[current_var_name]]))
{
# Old dataset char, new dataset numeric
if(mode(final_dataset[[current_var_name]]) == 'character' &&
mode(temp_dataset[[current_var_name]]) == 'numeric')
{
temp_dataset[[current_var_name]] <- as.character(temp_dataset[[current_var_name]])
} else if (mode(final_dataset[[current_var_name]]) == 'numeric' &&
mode(temp_dataset[[current_var_name]]) == 'character')
{
final_dataset[[current_var_name]] <- as.character(final_dataset[[current_var_name]])
}
} else if (
(
('Date' %in% class(final_dataset[[current_var_name]])) &&
!('Date' %in% class(temp_dataset[[current_var_name]]))
) ||
(
!('Date' %in% class(final_dataset[[current_var_name]])) &&
('Date' %in% class(temp_dataset[[current_var_name]]))
)
)
{
final_dataset[[current_var_name]] <- as.character(final_dataset[[current_var_name]])
temp_dataset[[current_var_name]] <- as.character(temp_dataset[[current_var_name]])
}
}
# rbind with final dataset
final_dataset <- bind_rows(final_dataset, temp_dataset)
}
counter <- counter + 1
error_log[error_log$item == subdomain,][current_row,c('completed_status')] <- 'Completed'
}
return_list <- list()
## Returning dataset
if(exists('final_dataset'))
{
return_list$final_dataset <- final_dataset
}
if(exists('error_log'))
{
return_list$error_log <- error_log
}
return(return_list)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.