#' Helper function to find mode of a column
#' @param v a column
#' @noRd
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
#' Handles missing values in a dataframe
#'
#' Replace missing values in dataframe columns by the specified methods.
#' Separate methods can be applied for categorical column imputation and
#' numerical column imputation.
#'
#' @param x_train training set dataframe to be transformed
#' @param x_test test set dataframe to be transformed
#' @param column_list named list of columns with two character vectors, must be
#' named numeric' and 'categorical'.
#' @param num_imp method for numerical imputation, options are "mean and" median
#' @param cat_imp method for categorical imputation, only option is "mode"
#'
#' @return named list, with two vectors: "x_train", the training set with
#' missing values filled, and "x_test", the test set with missing values filled
#' @importFrom magrittr %>%
#' @importFrom rlang :=
#' @examples
#' x_tr <- data.frame('x' = c(2.5, 3.3, NA), 'y' = c(1, NA, 1))
#' x_test <- data.frame('x' = c(NA), 'y' = c(NA))
#' fill_missing(x_tr, x_test, list("numeric" = c('x'),
#' "categorical" = c('y')), 'mean', 'mode')
#'
#' @export
fill_missing <- function(x_train, x_test, column_list, num_imp, cat_imp)
{
# Check input types are as specified
if (!is.data.frame(x_train))
stop("Training set must be a dataframe.")
if (!is.data.frame(x_test))
stop("Test set must be a dataframe.")
if (!is.list(column_list))
stop("num_list must be a named list of columns.")
if (!is.character(num_imp))
stop("num_imp method must be a string.")
if (!is.character(cat_imp))
stop("cat_imp method must be a string.")
# Check train set and test set columns are the same
if (!isTRUE(dplyr::all_equal(colnames(x_train), colnames(x_test))))
stop("Columns of train and test set must be identical.")
# Check column categories as well as
# that all columns listed in the named list are in the df
colnames = x_train %>% names
for (type in column_list){
for (column in type){
if(!is.element(column, colnames))
stop("Columns in named list must be in dataframe")
}
}
# Check that numerical imputation method is one of the two options
if (num_imp != "mean" && num_imp != "median")
stop("numerical imputation method can only be mean or median")
# Check categorical imputation method is one of the two options
if (cat_imp != "mode")
stop("categorical imputation method can only be mode")
# Convert factor columns to character columns
x_train %>%
dplyr::mutate_if(is.factor, as.character) -> x_train
x_test %>%
dplyr::mutate_if(is.factor, as.character) -> x_test
# Imputation methods for numerical columns
for (column in column_list$"numeric"){
if (num_imp == "mean"){
train_col_mean <- x_train %>% dplyr::select(column) %>%
dplyr::pull() %>% mean(na.rm = TRUE)
# impute training mean to train column
x_train <- x_train %>%
dplyr::mutate(!!column := ifelse(is.na(!!rlang::sym(column)),
train_col_mean,
!!rlang::sym(column)))
# impute _training mean_ to test column
x_test <- x_test %>%
dplyr::mutate(!!column := ifelse(is.na(!!rlang::sym(column)),
train_col_mean,
!!rlang::sym(column)))
}
if (num_imp == "median"){
train_col_med <- x_train %>% dplyr::select(column) %>%
dplyr::pull() %>% stats::median(na.rm = TRUE)
# impute training median to train column
x_train <- x_train %>%
dplyr::mutate(!!column := ifelse(is.na(!!rlang::sym(column)),
train_col_med,
!!rlang::sym(column)))
# impute _training median_ to test column
x_test <- x_test %>%
dplyr::mutate(!!column := ifelse(is.na(!!rlang::sym(column)),
train_col_med,
!!rlang::sym(column)))
}
}
# Imputation methods for categorical columns
for (column in column_list$"categorical"){
train_col_mode <- x_train %>% dplyr::select(column) %>%
dplyr::pull() %>% getmode()
# impute training mode to train column
x_train <- x_train %>%
dplyr::mutate(!!column := ifelse(is.na(!!rlang::sym(column)),
train_col_mode,
!!rlang::sym(column)))
# impute _training mode_ to test column
x_test <- x_test %>%
dplyr::mutate(!!column := ifelse(is.na(!!rlang::sym(column)),
train_col_mode,
!!rlang::sym(column)))
}
list("x_train" = x_train,"x_test" = x_test)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.