#' Process Data
#'
#' A function called upon creating a task that uses the data provided to the
#' task in order to process the covariates and identify missingness in the
#' outcome. See parameters and details for more information.
#'
#' @details If the data provided to the task contains missing covariate values,
#' then a few things will happen. First, for each covariate with missing values,
#' if the proportion of missing values is greater than
#' \code{getOption("sl3.max_p_missing")}, the covariate will be dropped. (The
#' default option \code{"sl3.max_p_missing"} is 0.5 and it can be modified to
#' say, 0.75, by setting \code{options("sl3.max_p_missing" = 0.75)}). Also,
#' for each covariate with missing values that was not dropped, a so-called
#' "missingness indicator" (that takes the name of the covariate with prefix
#' "delta_") will be added as an additional covariate. The missingness
#' indicator will take a value of 0 if the covariate value was missing and 1
#' if not. Also, imputation will be performed for each covariate with missing
#' values: continuous covariates are imputed with the median, and discrete
#' covariates are imputed with the mode. This coupling of imputation and
#' missingness indicators removes the missing covariate values, while
#' preserving the pattern of missingness, respectively. To avoid this default
#' imputation, users can perform imputation on their analytic dataset before
#' supplying it to \code{\link{make_sl3_Task}}. We generally recommend the
#' missingness indicators be added regardless of the imputation strategy,
#' unless missingness is very rare.
#'
#' This function also coverts any character covariates to factors, and one-hot
#' encodes factor covariates.
#'
#' Lastly, if the \code{outcome} is supplied in creating the
#' \code{\link{sl3_Task}} and if missing outcome values are detected in
#' \code{data}, then a warning will be thrown. If
#' \code{drop_missing_outcome = TRUE} then observations with missing outcomes
#' will be dropped.
#'
#' @param data A \code{data.table} containing the analytic dataset. In
#' creating the \code{\link{sl3_Task}}, the \code{data} passed to the task is
#' supplied for this argument.
#' @param nodes A list of character vectors for \code{covariates},
#' \code{outcome}, \code{id}, \code{weights}, and \code{offset}, which is
#' generated when creating the \code{\link{sl3_Task}} if not already specified
#' as an argument to \code{make_sl3_Task}.
#' @param column_names A named list of column names in the data, which is
#' generated when creating the \code{\link{sl3_Task}} if not already specified
#' as an argument to \code{make_sl3_Task}.
#' @param flag Logical (default \code{TRUE}) indicating whether to notify the
#' user when there are outcomes that are missing, which can be modified when
#' creating the \code{\link{sl3_Task}} by setting \code{flag = FALSE}.
#' @param drop_missing_outcome Logical (default \code{FALSE}) indicating
#' whether to drop observations with missing outcomes, which can be modified
#' when creating the \code{\link{sl3_Task}} by setting
#' \code{drop_missing_outcome = TRUE}.
#'
#'
#' @return A list of processed data, nodes and column names
#'
#' @export
process_data <- function(data, nodes, column_names, flag = TRUE,
drop_missing_outcome = FALSE) {
# force a copy so we can mutate data in place w/o distrupting a user's data
if (inherits(data, "data.table")) {
data <- data.table::copy(data)
} else {
data <- as.data.table(data)
}
all_nodes <- unlist(nodes)
if (length(all_nodes) == 0) {
return(list(data = data, nodes = nodes, column_names = column_names))
}
node_columns <- unlist(column_names[all_nodes])
covariates_columns <- unlist(column_names[nodes$covariates])
outcome_columns <- unlist(column_names[nodes$outcome])
factorized <- FALSE
dropped <- FALSE
imputed <- FALSE
# process characters
is_character <- which(data[, sapply(.SD, is.character), .SDcols = node_columns])
char_cols <- node_columns[is_character]
char_vars <- all_nodes[is_character]
if (length(char_cols) > 0) {
warning(sprintf(
"Character variables found: %s;\nConverting these to factors",
paste0(char_vars, collapse = ", ")
))
# convert data
for (char_col in char_cols) {
set(data, , char_col, as.factor(unlist(data[, char_col, with = FALSE])))
}
factorized <- TRUE
}
# process missing
has_missing <- data[, sapply(.SD, function(x) any(is.na(x))), .SDcols = node_columns]
miss_cols <- node_columns[has_missing]
miss_vars <- all_nodes[has_missing]
missing_Y <- any(nodes$outcome %in% miss_vars)
missing_covar_cols <- intersect(miss_cols, covariates_columns)
missing_covar_vars <- intersect(miss_vars, nodes$covariates)
if (length(miss_cols) > 0) {
if (missing_Y && drop_missing_outcome) {
if (flag) {
warning("Missing outcome data detected: dropping outcomes.")
}
keep_rows <- stats::complete.cases(data[, outcome_columns, with = FALSE])
data <- data[keep_rows, ]
}
if (length(missing_covar_cols) > 0) {
warning(sprintf(
"Imputing missing values and adding missingness indicators for the following covariates with missing values: %s. See documentation of the process_data function for details.",
paste0(missing_covar_cols, collapse = ", ")
))
# make indicators and add to data
missing_indicators <- data[, lapply(.SD, function(x) as.numeric(!is.na(x))),
.SDcols = missing_covar_cols
]
missing_indicator_cols <- sprintf("delta_%s", missing_covar_cols)
missing_indicator_vars <- sprintf("delta_%s", missing_covar_vars)
setnames(missing_indicators, missing_indicator_cols)
set(data, , missing_indicator_cols, missing_indicators)
# add inidicators to column map and covariate list
column_names[missing_indicator_vars] <- missing_indicator_cols
nodes$covariates <- c(nodes$covariates, missing_indicator_vars)
# impute covariates
data_missing_covars <- data[, missing_covar_cols, with = FALSE]
imputed <- impute(data.frame(data_missing_covars))
# update data
set(data, , missing_covar_cols, imputed)
}
}
na_Y <- (!is.null(nodes$outcome) && any(is.na(data[, outcome_columns, with = F])))
if (na_Y && flag) {
warning("Missing outcome data detected. This is okay for prediction, but will likely break training. \n You can drop observations with missing outcomes by setting drop_missing_outcome=TRUE in make_sl3_Task.")
}
list(data = data, nodes = nodes, column_names = column_names)
}
#' Impute missing values with the median/mode
#' based on imputeMissings R package (removed from CRAN)
#'
#' Character vectors and factors are imputed with the mode.
#' Numeric and integer vectors are imputed with median.
#'
#' @param data A data frame with dummies or numeric variables.
#'
#' @keywords internal
impute <- function(data){
compute <- function (data){
Mode <- function(x) {
xtab <- table(x)
xmode <- names(which(xtab == max(xtab)))
return(xmode[1])
}
values <- sapply(data, function(x) {
if (class(x) %in% c("character", "factor"))
Mode(x)
else if (class(x) %in% c("numeric", "integer"))
median(x, na.rm = TRUE)
}, simplify = FALSE)
values
}
object <- compute(data)
if (!identical(colnames(data), names(object))){
stop('Variable names and variable positions need to be identical in compute and impute')
}
data <- data.frame(sapply(1:ncol(data), function(i) {
fact <- is.factor(data[,i])
if (fact) data[,i] <- as.character(data[,i])
data[is.na(data[,i]),i] <- object[[i]]
if (fact) data[,i] <- as.factor(data[,i])
return(data[,i,drop=FALSE])
}, simplify = FALSE))
data
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.