#' @name rc_missing
#'
#' @title Find missing record data
#' @description Finds data points which are ostensibly missing from Redcap records data.
#'
#' For each event/variable combination, if a value exists for at least one
#' record then it is expected to exist for all records. Any event or variable which is empty within
#' the entire dataframe will inherently be dropped. If an event has no values for a given record,
#' none of the variables will be flagged unless there is data for that record in the following event.
#' Additionally, if the subject is marked as having completed the study (i.e. the value of
#' \code{completion_field} is 'Yes' or 1), then empty events for that subject will not be discarded.
#'
#' The logic of this function does not extend to repeat instruments or checkbox, calculated,
#' or hidden fields. If present in the record data, they will be dropped. Additionally, this function
#' does not yet support branching logic. Any field with logic dependent on another field will be removed.
#' Fields with other logic, ie event based logic, will remain.
#'
#' @param record_data Record data export from REDCap
#' @param completion_field Character. The REDCap variable which indicates whether or not a subject
#' has completed the study. This should be indicated by a 'Yes' or a '1' (i.e. a Yes/No
#' field in REDCap). If not provided, or if the field is not present in \code{record_data},
#' some missing data may be lost.
#' @param repeats Character. A vector of repeat instrument names which should be kept. This should
#' ONLY be used for repeat instruments which are expected to have a consistent number of occurrences
#' per each event/ID combination. All repeat instruments not supplied here will be removed to avoid
#' excessive false positives.
#'
#' @param data_dict Dataframe. REDCap project data dictionary. By default,
#' $data_dict is expected in a REDCap bundle object, as created by \code{rc_bundle}. Otherwise, an
#' equivalent data.frame containing the project data dictionary must be supplied.
#' @param events Character. Vector of REDCap event names. Events not included in this list will
#' not be considered. If supplying manually, chronological ordering of events must be ensured.
#' @param form_perm Dataframe. User access permissions for REDCap forms. By default, $form_perm
#' is expected in the REDCap bundle, as created by \code{rc_bundle}. Otherwise, an equivalent
#' data.frame containing form permission data must be supplied.
#' @param mappings Dataframe. Redcap metadata that maps forms to events.
#' @param id_field Character. The name of the record_id field for your REDCap project.
#'
#' @return A data frame in long format with one row per missing data point. It contains the
#' record ID and REDCap event/repeat columns present in \code{record_data}, a \code{variable}
#' column and, when any missing data remain after event filtering, the form information added
#' by \code{add_form_names}. If nothing is missing, an empty data frame is returned.
#'
#' @author Marcus Lehr
#'
#' @export
rc_missing <- function(record_data,
                       completion_field = NULL, repeats = NULL,
                       data_dict = getOption("redcap_bundle")$data_dict,
                       events = getOption("redcap_bundle")$event_data$unique_event_name,
                       form_perm = getOption("redcap_bundle")$form_perm,
                       mappings = getOption("redcap_bundle")$mappings,
                       id_field = getOption("redcap_bundle")$id_field
) {

  # Checks ------------------------------------------------------------------

  validate_args(required = c('record_data','data_dict','events'),
                record_data = record_data,
                completion_field = completion_field,
                repeats = repeats,
                data_dict = data_dict,
                events = events,
                form_perm = form_perm
  )

  # Prep data -------------------------------------------------------------------

  ## Get record_id field name ---
  id_field = getID(record_data, data_dict, id_field)

  ## Filter ---

  # Check for pooling and populate repeat list if empty
  pooled_vars = attributes(record_data)$pooled_vars
  if (!is.null(pooled_vars) && is.null(repeats)) {
    repeats = levels(pooled_vars$pooled_vars)
    message("Pooled variables being used as repeats") # Could use clearer phrasing here
  }

  # GUI exports blanks as ''. Only touch the column when it exists so that
  # data without repeat instrument columns passes through untouched.
  if ('redcap_repeat_instrument' %in% names(record_data)) {
    blank_instrument = !is.na(record_data$redcap_repeat_instrument) &
      record_data$redcap_repeat_instrument == ''
    record_data$redcap_repeat_instrument[blank_instrument] = NA
  }

  # Remove repeat instrument data unless specified in repeats
  if (any(!is.na(record_data[['redcap_repeat_instrument']]))) { # Col exists and contains data
    if (any(repeats %in% record_data$redcap_repeat_instrument)) { # Values are contained in 'repeats'
      record_data = dplyr::filter(record_data, redcap_repeat_instrument %in% repeats |
                                    is.na(redcap_repeat_instrument))
    }
    else { # Data exists and is not specified in 'repeats'
      message("All repeat instruments will be dropped. If you would like to keep
any repeat instruments, please supply the `repeats` argument. This should
only be done for instruments where a consistent number of repeat instances are expected.")
      record_data = record_data %>% dplyr::filter(is.na(redcap_repeat_instrument))
    }
  }

  # Remove other unwanted fields (calculated, hidden and checkbox fields)
  fields_calc = intersect(data_dict$field_name[data_dict$field_type == 'calc'], names(record_data))
  fields_hidden = intersect(data_dict$field_name[grepl('@HIDDEN',data_dict$field_annotation)], names(record_data))
  if (length(fields_calc) > 0 ||
      length(fields_hidden) > 0 ||
      any(grepl('___', names(record_data))) # Look for checkbox data
  ) {
    message("The logic of this function does not translate to checkbox
or calculated fields. All such data will be dropped.")
    record_data = dplyr::select(record_data,
                                -dplyr::all_of(fields_calc),
                                -dplyr::all_of(fields_hidden),
                                -dplyr::contains('___'))
  }

  ## Completion data ---

  # IDs of subjects flagged as having completed the study. Stays NULL when the
  # completion field is not supplied or not present, in which case the
  # completion rule is simply not applied further down.
  completed_ids = NULL
  if (length(completion_field) == 1 && completion_field %in% names(record_data)) {
    # %in% compares as character, so 'Yes', '1' and numeric 1 all match; NA never matches
    is_completed = record_data[[completion_field]] %in% c('Yes', '1')
    completed_ids = unique(record_data[[id_field]][is_completed])
    # The completion field itself must not be evaluated for missingness
    record_data = dplyr::select(record_data, -dplyr::all_of(completion_field))
  }
  else message("Completion field not found. Some missing data may not be captured.")

  # Remove _complete fields
  form_complete_fields = data_dict$form_name %>% unique() %>% paste0('_complete')
  record_data = record_data[!names(record_data) %in% form_complete_fields]

  # Collect missing data ----------------------------------------------------

  rc_fields = c('redcap_event_name','redcap_repeat_instrument','redcap_repeat_instance')
  rc_factors = intersect(c(id_field, rc_fields), names(record_data))

  # NOTE: formatted (labelled) event names are not coerced back to raw here because
  # rc_format requires event_data, which is not passed to this function.

  # Convert data to long format. Globally empty events and IDs will be lost
  record_data = suppressWarnings(
    reshape2::melt(record_data, id.vars = rc_factors, na.rm = TRUE) %>% dplyr::as_tibble() %>%
      dplyr::filter(value != '') %>% droplevels()
  )

  # Collecting IDs before building the expected data allows capturing of (non-globally) empty events
  IDs = unique(record_data[[id_field]])

  # Create a frame of all event/variable combos for each ID
  event_var_combos = dplyr::select(record_data, dplyr::contains('redcap'), variable) %>% dplyr::distinct() %>%
    dplyr::arrange_at(c('redcap_event_name', 'variable'))

  # Cross every ID with every event/variable combo in one vectorised step.
  # Rows are ordered ID by ID, and the ID column keeps the type it has in record_data.
  n_combos = nrow(event_var_combos)
  expected_data = event_var_combos[rep(seq_len(n_combos), times = length(IDs)), ]
  expected_data[[id_field]] = rep(IDs, each = n_combos)
  expected_data = dplyr::select(expected_data, dplyr::all_of(id_field), dplyr::everything())

  # Capture all expected data points not present in record_data
  missing_data_all = dplyr::anti_join(expected_data, dplyr::select(record_data, -value), by = c(rc_factors, 'variable'))
  missing_data_all$variable = as.character(missing_data_all$variable)

  # Nothing to filter: return the (empty) frame early
  if (nrow(missing_data_all) == 0) {
    message("No missing data were found.")
    return(missing_data_all)
  }

  # Filter data -------------------------------------------------------------

  # Convert expected data to wide format and fill in values from data
  cast_formula = paste(paste(rc_factors,collapse = ' + '), "~ variable")
  data_wide_full = dplyr::full_join(expected_data, record_data, by = c(rc_factors, 'variable')) %>%
    reshape2::dcast(., cast_formula)

  # Ensure proper event ordering for logic
  events = events[events %in% record_data$redcap_event_name]
  if (!length(events)) stop("Events in data do not match supplied events")
  data_wide_full$redcap_event_name = factor(data_wide_full$redcap_event_name, levels = events)
  data_wide_full = data_wide_full %>% dplyr::arrange_at(c(id_field,'redcap_event_name')) %>%
    dplyr::filter(!is.na(redcap_event_name)) # Events outside the events list will be converted to NA

  # Summarise, per ID and event, how many variables hold data and whether any later event holds data
  visit_data_summary = data_wide_full %>%
    dplyr::mutate(row_sums = rowSums(!is.na(dplyr::select(.,-dplyr::all_of(rc_factors))))) %>% # Count variables with data. Incompatible with checkbox fields
    dplyr::group_by_at(rc_factors[1:2]) %>% # Removing repeats to look at entire event. Including would break data_following
    dplyr::summarise(var_count = sum(row_sums)) %>% # sum() required for repeats
    dplyr::mutate(data_following = cumsum(var_count) != sum(var_count)) %>% # If total sum per ID is == cumsum, then all following visits must be empty (if any)
    dplyr::ungroup()

  # Filter variables from empty events unless there is data in a following event
  # or the participant has completed the study
  missing_data = suppressWarnings( # Join may coerce factor/character event names
    dplyr::left_join(missing_data_all, visit_data_summary, by = rc_factors[1:2]) # Logic has no repeats
  )
  subject_completed = missing_data[[id_field]] %in% completed_ids
  # Rows without a summary match belong to events outside `events`; these are
  # never considered, even for completed subjects.
  keep_row = !is.na(missing_data$var_count) &
    (missing_data$var_count > 0 | missing_data$data_following | subject_completed)
  missing_data = missing_data %>%
    dplyr::filter(keep_row) %>%
    dplyr::select(dplyr::all_of(rc_factors), variable)

  # Add form names and remove vars from hidden forms
  if (nrow(missing_data) > 0) {

    # Add form names for ease of locating in Redcap
    missing_data = add_form_names(missing_data, pooled_vars, data_dict, mappings, id_field)

    # Remove data from forms where no users have access
    if (!is.null(form_perm)) {
      forms_hidden = setdiff(unique(subset(form_perm, view == 'No access')$form_name),
                             unique(subset(form_perm, view != 'No access')$form_name))
      if (any(forms_hidden %in% missing_data$form_name)) {
        message("The following forms will be removed as no users have access to them: ",
                paste(intersect(forms_hidden,missing_data$form_name), collapse = ', '))
        missing_data = missing_data[!missing_data$form_name %in% forms_hidden,]
      }
    }
  }

  # Ignore fields dependent on other fields. Event dependencies are handled via event_var_combos.
  # Evaluating branching logic is out of scope here, so any field whose logic mentions
  # another field name is dropped.
  field_dependent_fields = data_dict$field_name[which(stringr::str_detect(data_dict$branching_logic, paste(data_dict$field_name,collapse='|')))]
  if (any(field_dependent_fields %in% missing_data$variable)) {
    message("Fields with branching logic dependent on other fields will be removed as their presence in the data are not consistent.")
    missing_data = missing_data %>% dplyr::filter(!variable %in% field_dependent_fields)
  }

  # Notify user if no data remains
  if (nrow(missing_data) == 0) message("No missing data were found.")

  missing_data
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.