R/impute_simulated.R
In missCompare: Intuitive Missing Data Imputation Framework

Documented in impute_simulated

#' @title Imputation algorithm tester on simulated data
#'
#' @description
#' \code{\link{impute_simulated}} tests the imputation quality of all missing data imputation algorithms on matrices with various missing data patterns, using various metrics
#'
#' @details
#' This function tests the imputation accuracy of the a curated list of missing data imputation algorithms (16 algorithms at the moment) by comparing the
#' original simulated matrix with no missingness and the imputed matrices generated by the algorithm using the matrices with
#' MCAR, MAR, MNAR and (optionally) MAP missingness patterns. The function calculates root-mean-square error (RMSE), mean absolute error (MAE),
#' Kolmogorov-Smirnov test statistics D (KS) between the imputed datapoints and the original datapoints (that were subsequently set to missing)
#' for each missing data imputation algorithm. The function will calculate average computation time per method as well. The function will
#' automatically detect whether there is a MAP matrix in the list and calculate metrics for all matrices provided in the list. Important! All statistics
#' output by this function are calculated for ALL missing values across the dataset, not by variable.
#'
#' @param rownum Number of rows (samples) in the original dataframe (Rows output from the \code{\link{get_data}} function)
#' @param colnum Number of rows (variables) in the original dataframe (Columns output from the \code{\link{get_data}} function)
#' @param cormat Correlation matrix of the original dataframe (Corr_matrix output from the \code{\link{get_data}} function)
#' @param n.iter Number of iterations to perform with default 10.
#' @param MD_pattern Missing data pattern in the original dataset (MD_Pattern output from the \code{\link{get_data}} function)
#' @param NA_fraction Fraction of missingness in the original dataset (Fraction_missingness output from the \code{\link{get_data}} function)
#' @param min_PDM All patterns with number of observations less than this number will be removed from the missing data generation. This argument is necessary to be carefully set, as the function will fail or generate erroneous missing data patterns with very complicated missing data patterns. The default is 10, but for large datasets this number needs to be set higher to avoid errors.
#' @param assumed_pattern Vector of missingness types (must be same length as missingness fraction per variable). If this input is specified, the function will spike in missing datapoints in a MAP pattern as well.
#'
#' @name impute_simulated
#'
#' @return
#' \item{Imputation_metrics_raw}{Raw RMSE, MAE, KS and computation time values per method, per missingness pattern, per iteration)}
#' \item{Imputation_metrics_means}{RMSE, MAE, KS and computation time means per method and missingness pattern}
#' \item{Plot_TIME}{Boxplot of computation time values per missing data imputation algorithm}
#' \item{Plot_RMSE}{Faceted boxplot of RMSE values per missingness pattern and missing data imputation algorithm}
#' \item{Plot_MAE}{Faceted boxplot of MAE values per missingness pattern and missing data imputation algorithm}
#' \item{Plot_KS}{Faceted boxplot of KS values per missingness pattern and missing data imputation algorithm}
#'
#' @examples
#' ## in case there is no assumed missingness pattern per variable
#' # wrap <- impute_simulated(rownum = metadata$Rows,
#' #        colnum = metadata$Columns,
#' #        cormat = metadata$Corr_matrix,
#' #        MD_pattern = metadata$MD_Pattern,
#' #        NA_fraction = metadata$Fraction_missingness,
#' #        min_PDM = 10,
#' #        n.iter = 50)
#'
#' ## in case there is a pre-defined assumed pattern
#' # wrap <- impute_simulated(rownum = metadata$Rows,
#' #        colnum = metadata$Columns,
#' #        cormat = metadata$Corr_matrix,
#' #        MD_pattern = metadata$MD_Pattern,
#' #        NA_fraction = metadata$Fraction_missingness,
#' #        min_PDM = 10,
#' #        assumed_pattern = c('MAR','MAR','MCAR','MCAR',
#' #                          'MNAR','MCAR','MAR','MNAR',
#' #                          'MCAR','MNAR','MCAR'),
#' #        n.iter = 50)
#'
#' @export


# FUNCTION
impute_simulated <- function(rownum, colnum, cormat, n.iter = 10, MD_pattern, NA_fraction,
    min_PDM = 10, assumed_pattern = NA) {

  if (!is.na(assumed_pattern))
    collect_res <- data.frame(matrix(NA, nrow = 16 * n.iter, ncol = 14)) else collect_res <- data.frame(matrix(NA, nrow = 16 * n.iter, ncol = 11))
    if (!is.na(assumed_pattern))
      colnames(collect_res) <- c("Method", "Comp_time", "MCAR_RMSE", "MAR_RMSE", "MNAR_RMSE", "MAP_RMSE", "MCAR_MAE", "MAR_MAE", "MNAR_MAE", "MAP_MAE", "MCAR_KS", "MAR_KS", "MNAR_KS", "MAP_KS") else
        colnames(collect_res) <- c("Method", "Comp_time", "MCAR_RMSE", "MAR_RMSE", "MNAR_RMSE", "MCAR_MAE", "MAR_MAE", "MNAR_MAE", "MCAR_KS", "MAR_KS", "MNAR_KS")

    for (i in 1:n.iter) {

        print(paste("ITERATION ", i, " OF TOTAL ", n.iter, " - IN PROGRESS", sep = ""))

        collect_res[((16 * (i - 1)) + 1):((16 * (i - 1)) + 16), 1] <- c("Random replacement",
            "Median imputation", "Mean imputation", "missMDA Regularized", "missMDA EM",
            "pcaMethods PPCA", "pcaMethods svdImpute", "pcaMethods BPCA", "pcaMethods NIPALS",
            "pcaMethods NLPCA", "mice mixed", "mi Bayesian", "Amelia II", "missForest",
            "Hmisc aregImpute", "VIM kNN")

        sim <- simulate(rownum, colnum, cormat)
        res <- all_patterns(sim$Simulated_matrix, MD_pattern = MD_pattern, NA_fraction = NA_fraction,
            min_PDM = min_PDM, assumed_pattern = assumed_pattern)

        collect_res[((16 * (i - 1)) + 1), 2:ncol(collect_res)] <- as.data.frame(test_random_imp(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 2), 2:ncol(collect_res)] <- as.data.frame(test_median_imp(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 3), 2:ncol(collect_res)] <- as.data.frame(test_mean_imp(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 4), 2:ncol(collect_res)] <- as.data.frame(test_missMDA_reg(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 5), 2:ncol(collect_res)] <- as.data.frame(test_missMDA_EM(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 6), 2:ncol(collect_res)] <- as.data.frame(test_pcaMethods_PPCA(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 7), 2:ncol(collect_res)] <- as.data.frame(test_pcaMethods_svdImpute(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 8), 2:ncol(collect_res)] <- as.data.frame(test_pcaMethods_BPCA(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 9), 2:ncol(collect_res)] <- as.data.frame(test_pcaMethods_Nipals(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 10), 2:ncol(collect_res)] <- as.data.frame(test_pcaMethods_NLPCA(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 11), 2:ncol(collect_res)] <- as.data.frame(test_mice_mixed(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 12), 2:ncol(collect_res)] <- as.data.frame(test_mi(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 13), 2:ncol(collect_res)] <- as.data.frame(test_AmeliaII(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 14), 2:ncol(collect_res)] <- as.data.frame(test_missForest(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 15), 2:ncol(collect_res)] <- as.data.frame(test_aregImpute(sim$Simulated_matrix,
            list = res))
        collect_res[((16 * (i - 1)) + 16), 2:ncol(collect_res)] <- as.data.frame(test_kNN(sim$Simulated_matrix,
            list = res))

    }

      mymean <- function(x) {
        mean(x, na.rm = TRUE)
      }

      all_stats <- collect_res %>% group_by(Method) %>% summarise_all(funs(mymean))

      if (!is.na(assumed_pattern))
        forgraph <- gather(collect_res, Pattern, value, Comp_time:MAP_KS, factor_key = TRUE) else forgraph <- gather(collect_res, Pattern, value, Comp_time:MNAR_KS, factor_key = TRUE)
      forgraph$Method <- factor(forgraph$Method, levels = c("Random replacement", "Median imputation",
                                                            "Mean imputation", "missMDA Regularized", "missMDA EM", "pcaMethods PPCA",
                                                            "pcaMethods svdImpute", "pcaMethods BPCA", "pcaMethods NIPALS", "pcaMethods NLPCA",
                                                            "mice mixed", "mi Bayesian", "Amelia II", "missForest", "Hmisc aregImpute",
                                                            "VIM kNN"))

      forgraph_time <- forgraph[forgraph$Pattern == "Comp_time",]
      forgraph_RMSE <- forgraph[grepl("_RMSE", forgraph$Pattern),]
      forgraph_MAE <- forgraph[grepl("_MAE", forgraph$Pattern),]
      forgraph_KS <- forgraph[grepl("_KS", forgraph$Pattern),]
      forgraph_RMSE$Pattern <- droplevels(forgraph_RMSE$Pattern)
      forgraph_MAE$Pattern <- droplevels(forgraph_MAE$Pattern)
      forgraph_KS$Pattern <- droplevels(forgraph_KS$Pattern)

      p_time <- ggplot(forgraph_time, aes(x = Method, y = value, fill = Method)) + geom_boxplot() +
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) + ggtitle("Computation time of various missing data imputation methods") +
        theme(plot.title = element_text(hjust = 0.5)) + labs(x = "", y = "Computation time (minutes)")

      if (!is.na(assumed_pattern))
        levels(forgraph_RMSE$Pattern) <- c("MCAR", "MAR", "MNAR", "MAP") else levels(forgraph_RMSE$Pattern) <- c("MCAR", "MAR", "MNAR")
      p_RMSE <- ggplot(forgraph_RMSE, aes(x = Method, y = value, fill = Method)) + geom_boxplot() +
        facet_grid(~Pattern, scales = "free") + theme(axis.text.x = element_text(angle = 90,
                                                                                 hjust = 1)) + ggtitle("Root-mean-square error (RMSE) of various missing data imputation methods") +
        theme(plot.title = element_text(hjust = 0.5)) + labs(x = "", y = "RMSE")

      if (!is.na(assumed_pattern))
        levels(forgraph_MAE$Pattern) <- c("MCAR", "MAR", "MNAR", "MAP") else levels(forgraph_MAE$Pattern) <- c("MCAR", "MAR", "MNAR")
      p_MAE <- ggplot(forgraph_MAE, aes(x = Method, y = value, fill = Method)) + geom_boxplot() +
        facet_grid(~Pattern, scales = "free") + theme(axis.text.x = element_text(angle = 90,
                                                                                 hjust = 1)) + ggtitle("Mean absolute error (MAE) of various missing data imputation methods") +
        theme(plot.title = element_text(hjust = 0.5)) + labs(x = "", y = "MAE")

      if (!is.na(assumed_pattern))
        levels(forgraph_KS$Pattern) <- c("MCAR", "MAR", "MNAR", "MAP") else levels(forgraph_KS$Pattern) <- c("MCAR", "MAR", "MNAR")
      p_KS <- ggplot(forgraph_KS, aes(x = Method, y = value, fill = Method)) + geom_boxplot() +
        facet_grid(~Pattern, scales = "free") + theme(axis.text.x = element_text(angle = 90,
                                                                                 hjust = 1)) + ggtitle("Kolmogorov-Smirnov test statistic of various missing data imputation methods") +
        theme(plot.title = element_text(hjust = 0.5)) + labs(x = "", y = "Kolmogorov-Smirnov D")


      # output list
      list(Imputation_metrics_raw = collect_res, Imputation_metrics_means = all_stats, Plot_TIME = p_time, Plot_RMSE = p_RMSE,
           Plot_MAE = p_MAE, Plot_KS = p_KS)

}