R/autotune_Amelia.R
In NADIA: NA Data Imputation Algorithms

Documented in autotune_Amelia

#' Perform imputation using Amelia package and EMB algorithm.
#'
#' @description Function use EMB (Expectation-Maximization with Bootstrapping ) to impute missing data. Function performance is highly depend from data structure and
#' chosen parameters.
#'
#'
#' @param df data.frame. Df to impute with column names and without target column.
#' @param col_type character vector. Vector containing column type names.
#' @param percent_of_missing numeric vector. Vector contatining percent of missing data in columns for example  c(0,1,0,0,11.3,..)
#' @param col_0_1 Decaid if add bonus column informing where imputation been done. 0 - value was in dataset, 1 - value was imputed. Default False. (Works only for returning one dataset).
#' @param parallel If true parallel calculation is used.
#' @param polytime parameter pass to amelia function
#' @param splinetime parameter pass to amelia finction
#' @param intercs parameter pass to amleia function
#' @param empir parameter pass to amelia function as empir in Amelia == empir*nrow(df). If empir dont set empir=nrow(df)*0.015.
#' @param verbose If true function will print on console.
#' @param return_one Decide if one dataset or amelia object will be returned.
#' @param out_file  Output log file location if file already exists log message will be added. If NULL no log will be produced.
#' @param m Number of datasets generated by amelia. If retrun_one=TRUE first dataset will be given.
#' @import Amelia
#' @references   James Honaker, Gary King, Matthew Blackwell (2011). Amelia II: A Program for Missing Data. Journal of Statistical Software, 45(7), 1-47. URL https://www.jstatsoft.org/v45/i07/.
#'
#' @author James Honaker, Gary King, Matthew Blackwell (2011).
#'
#' @examples
#' {
#'   raw_data <- data.frame(
#'     a = as.factor(sample(c("red", "yellow", "blue", NA), 1000, replace = TRUE)),
#'     b = as.integer(1:1000),
#'     c = as.factor(sample(c("YES", "NO", NA), 1000, replace = TRUE)),
#'     d = runif(1000, 1, 10),
#'     e = as.factor(sample(c("YES", "NO"), 1000, replace = TRUE)),
#'     f = as.factor(sample(c("male", "female", "trans", "other", NA), 1000, replace = TRUE)))
#'
#'   # Prepering col_type
#'   col_type <- c("factor", "integer", "factor", "numeric", "factor", "factor")
#'
#'   percent_of_missing <- 1:6
#'   for (i in percent_of_missing) {
#'     percent_of_missing[i] <- 100 * (sum(is.na(raw_data[, i])) / nrow(raw_data))
#'   }
#'
#'
#'   imp_data <- autotune_Amelia(raw_data, col_type, percent_of_missing,parallel = FALSE)
#'
#'   # Check if all missing value was imputed
#'   sum(is.na(imp_data)) == 0
#'   # TRUE
#' }
#' @return Return one data.frame with imputed values or amelia object.
#' @export



autotune_Amelia <- function(df, col_type=NULL, percent_of_missing=NULL, col_0_1 = FALSE, parallel = TRUE, polytime = NULL, splinetime = NULL, intercs = FALSE,
  empir = NULL, verbose = FALSE, return_one = TRUE, m = 3, out_file = NULL) {

  # Column informations
  if(is.null(col_type)){
    col_type <- 1:ncol(df)
    for ( i in col_type){
      col_type[i] <- class(df[,i])
    }
  }

  if(is.null(percent_of_missing)){
    percent_of_missing <- 1:ncol(df)
    for ( i in percent_of_missing){
      percent_of_missing[i] <- sum(is.na(df[,i]))/nrow(df)
    }
  }



  if (!is.null(out_file)) {
    write("Amelia  ", file = out_file, append = TRUE)
  }
  col_0_1 <- FALSE

  if (sum(is.na(df)) == 0) {
    return(df)
  }
  # prepering information about categorical column
  categorical_col <- colnames(df)[ifelse(col_type == "factor", TRUE, FALSE)]
  if (length(categorical_col) == 0) {
    categorical_col <- NULL
  }

  # seting parallel options
  if (parallel) {
    parallel <- "multicore"
  }
  if ("multicore" != parallel) {
    parallel <- "no"
  }
  n_row <- length(df[, 1])

  # Amelia Run
  tryCatch({
    if (is.null(empir)) {
      empir <- 0.015
    }
    empir <- empir * n_row
    final <- Amelia::amelia(df, m = m, noms = categorical_col, parallel = parallel, p2s = as.numeric(verbose), empri = empir, polytime = polytime, splinetime = splinetime, intercs = intercs)
    if (return_one) {
      for (i in final$imputations) {
        if (!is.null(i)) {
          final <- i
          break
        }
      }
    }
    # Avoiding situtation when amelia dont impute and dont throwe errors            c
    if (!inherits(final,"data.frame")  & !inherits(final,"amelia") ) {
      stop("ERROR no error msg")
    }
  }, error = function(e) {
    if (!is.null(out_file)) {
      write(as.character(e), file = out_file, append = TRUE)
    }
    stop(e)
  })
  # adding 0_1 col
  if (col_0_1 & return_one) {

    columns_with_missing <- (as.data.frame(is.na(df)) * 1)[, percent_of_missing > 0]
    colnames(columns_with_missing) <- paste(colnames(columns_with_missing), "where", sep = "_")
    final <- cbind(final, columns_with_missing)

  }

  for (i in colnames(final)[col_type == "integer"]) {
    final[, i] <- as.integer(final[, i])
  }

  return(final)

}