R/PipeOPAmelia.R

#' @title PipeOpAmelia
#'
#' @name PipeOpAmelia
#'
#' @description
#' Implements the EMB (expectation-maximization with bootstrapping) imputation method from the Amelia package as an mlr3 pipeline operator. For more details, see \code{\link{autotune_Amelia}} or \url{https://cran.r-project.org/package=Amelia}.
#'
#' @section Input and Output Channels:
#' Input and output channels are inherited from \code{\link{PipeOpImpute}}.
#'
#'
#' @section Parameters:
#' The parameters include those inherited from \code{\link{PipeOpImpute}}, as well as: \cr
#' \itemize{
#' \item \code{id} :: \code{character(1)}\cr
#' Identifier of the resulting object, default \code{"impute_Amelia_B"}.
#' \item \code{m} :: \code{integer(1)}\cr
#' Number of datasets generated by Amelia, default \code{3}.
#' \item \code{polytime} :: \code{integer(1)}\cr
#' Integer between 0 and 3 indicating what power of polynomial should be included in the imputation model to account for the effects of time. A setting of 0 would indicate constant levels, 1 would indicate linear time effects, 2 would indicate squared effects, and 3 would indicate cubic time effects, default \code{NULL}.
#' \item \code{splinetime} :: \code{integer(1)}\cr
#' Integer value of 0 or greater to control cubic smoothing splines of time. Values between 0 and 3 create a simple polynomial of time (identical to the polytime argument). Values k greater than 3 create a spline with an additional k-3 knotpoints, default \code{NULL}.
#' \item \code{intercs} :: \code{logical(1)}\cr
#' Variable indicating if the time effects of polytime should vary across the cross-section, default \code{FALSE}.
#' \item \code{empir} :: \code{double(1)}\cr
#' Number indicating the level of the empirical (or ridge) prior. This prior shrinks the covariances of the data but keeps the means and variances the same, which helps with problems of high missingness, small n, or large correlations among the variables. It should be kept small, perhaps 0.5 to 1 percent of the rows of the data; a reasonable upper bound is around 10 percent of the rows. If \code{empir} is not set, \code{empir = nrow(df) * 0.015} is used, default \code{NULL}.
#' \item \code{parallel} :: \code{logical(1)}\cr
#' If \code{TRUE}, parallel computation is used, default \code{TRUE}.
#' \item \code{out_file} :: \code{character(1)}\cr
#' Output log file location. If the file already exists, log messages are appended to it. If \code{NULL}, no log is produced, default \code{NULL}.
#' }
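#'
#' A minimal construction sketch (the argument values below are purely illustrative, not recommendations):
#' \preformatted{
#' op <- PipeOpAmelia$new(id = "impute_Amelia_B", m = 5, parallel = FALSE)
#' }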
#'
#' @import mlr3
#' @import mlr3pipelines
#' @import paradox
#'
#' @importFrom methods is
#' @importFrom stats as.formula
#' @importFrom stats median
#' @importFrom stats na.omit
#' @importFrom stats runif
#' @importFrom stats setNames
#' @importFrom utils capture.output
#'
#' @examples
#' \donttest{
#'
#' # Using a debug learner for example purposes
#'
#'   graph <- PipeOpAmelia$new() %>>% LearnerClassifDebug$new()
#'   graph_learner <- GraphLearner$new(graph)
#'
#'   graph_learner$param_set$values$impute_Amelia_B.parallel <- FALSE
#'
#'
#'   resample(tsk("pima"), graph_learner, rsmp("cv", folds = 3))
#' }
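#'
#' # A further sketch (not run): other parameters of the operator can be set
#' # the same way; names assume the default id "impute_Amelia_B" and the
#' # log file name is illustrative.
#' \dontrun{
#'   graph_learner$param_set$values$impute_Amelia_B.m <- 5
#'   graph_learner$param_set$values$impute_Amelia_B.out_file <- "Amelia_log.txt"
#' }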
#' @export
PipeOpAmelia <- R6::R6Class("Amelia_imputation",
  lock_objects = FALSE,
  inherit = PipeOpImpute, # inherit from PipeOp
  public = list(
    initialize = function(id = "impute_Amelia_B", polytime = NULL, splinetime = NULL, intercs = FALSE, empir = NULL, m = 3, parallel = TRUE, out_file = NULL) {

      super$initialize(id,
        whole_task_dependent = TRUE, packages = "NADIA", param_vals = list(polytime = polytime, splinetime = splinetime, intercs = intercs, empir = empir, m = m, parallel = parallel, out_file = out_file),
        param_set = ParamSet$new(list(
          "polytime" = ParamUty$new("polytime", default = NULL, tags = "amelia"),
          "splinetime" = ParamUty$new("splinetime", default = NULL, tags = "amelia"),
          "empir" = ParamUty$new("empir", default = NULL, tags = "amelia"),
          "parallel" = ParamLgl$new("parallel", default = TRUE, tags = "amelia"),
          "intercs" = ParamLgl$new("intercs", default = FALSE, tags = "amelia"),

          "m" = ParamInt$new("m", lower = 1, upper = Inf, default = 3, tags = "amelia"),
          "out_file" = ParamUty$new("out_file", default = NULL, tags = "amelia")





        ))
      )



      self$imputed <- FALSE
      self$column_counter <- NULL
      self$data_imputed <- NULL




    }), private = list(
    .train_imputer = function(feature, type, context) {
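      # The whole task is imputed only once, when the first feature column is
      # processed; self$column_counter then counts down over the remaining
      # feature columns so the shared result in self$data_imputed is reused,
      # and the flags are reset after the last column.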

      imp_function <- function(data_to_impute) {

        data_to_impute <- as.data.frame(data_to_impute)
        # preparing arguments for autotune_Amelia: column classes and
        # per-column percentage of missing values
        col_type <- 1:ncol(data_to_impute)
        for (i in col_type) {
          col_type[i] <- class(data_to_impute[, i])
        }
        percent_of_missing <- 1:ncol(data_to_impute)
        for (i in percent_of_missing) {
          percent_of_missing[i] <- (sum(is.na(data_to_impute[, i])) / length(data_to_impute[, 1])) * 100
        }
        col_miss <- colnames(data_to_impute)[percent_of_missing > 0]
        col_no_miss <- colnames(data_to_impute)[percent_of_missing == 0]


        data_imputed <- NADIA::autotune_Amelia(data_to_impute, col_type, percent_of_missing,
          parallel = self$param_set$values$parallel, polytime = self$param_set$values$polytime,
          splinetime = self$param_set$values$splinetime, intercs = self$param_set$values$intercs,
          empir = self$param_set$values$empir, m = self$param_set$values$m,
          out_file = self$param_set$values$out_file)

        return(data_imputed)
      }
      self$imputed_predict <- TRUE
      self$flag <- "train"
      if (!self$imputed) {

        self$column_counter <- ncol(context) + 1
        self$imputed <- TRUE
        data_to_impute <- cbind(feature, context)

        self$data_imputed <- imp_function(data_to_impute)
        colnames(self$data_imputed) <- self$state$context_cols

      }
      if (self$imputed) {
        self$column_counter <- self$column_counter - 1

      }
      if (self$column_counter == 0) {
        self$imputed <- FALSE
      }
      self$train_s <- TRUE

      self$action <- 3


      return(list("data_imputed" = self$data_imputed, "train_s" = self$train_s, "flag" = self$flag, "imputed_predict" = self$imputed_predict, "imputed" = self$imputed, "column_counter" = self$column_counter))

    },
    .impute = function(feature, type, model, context) {
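      # When this operator has no bookkeeping state yet (e.g. its state comes
      # from a trained model now being used for prediction), the flags and the
      # imputed training data are restored from the stored model below.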

      if (is.null(self$action)) {


        self$train_s <- TRUE
        self$flag <- "train"
        self$imputed_predict <- TRUE
        self$action <- 3
        self$data_imputed <- model$data_imputed
        self$imputed <- FALSE
        self$column_counter <- 0

      }

      imp_function <- function(data_to_impute) {

        data_to_impute <- as.data.frame(data_to_impute)
        # preparing arguments for autotune_Amelia: column classes and
        # per-column percentage of missing values
        col_type <- 1:ncol(data_to_impute)
        for (i in col_type) {
          col_type[i] <- class(data_to_impute[, i])
        }
        percent_of_missing <- 1:ncol(data_to_impute)
        for (i in percent_of_missing) {
          percent_of_missing[i] <- (sum(is.na(data_to_impute[, i])) / length(data_to_impute[, 1])) * 100
        }
        col_miss <- colnames(data_to_impute)[percent_of_missing > 0]
        col_no_miss <- colnames(data_to_impute)[percent_of_missing == 0]


        data_imputed <- NADIA::autotune_Amelia(data_to_impute, col_type, percent_of_missing,
          parallel = self$param_set$values$parallel, polytime = self$param_set$values$polytime,
          splinetime = self$param_set$values$splinetime, intercs = self$param_set$values$intercs,
          empir = self$param_set$values$empir, m = self$param_set$values$m,
          out_file = self$param_set$values$out_file)

        return(data_imputed)
      }
      if (self$imputed) {
        feature <- self$data_imputed[, setdiff(colnames(self$data_imputed), colnames(context))]


      }
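      # A row-count mismatch with the stored training imputation (or train_s
      # already being FALSE) means new data is being predicted, so it has to
      # be imputed again below.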
      if ((nrow(self$data_imputed) != nrow(context) | !self$train_s) & self$flag == "train") {
        self$imputed_predict <- FALSE
        self$flag <- "predict"
      }

      if (!self$imputed_predict) {

        data_to_impute <- cbind(feature, context)

        self$data_imputed <- imp_function(data_to_impute)
        colnames(self$data_imputed)[1] <- setdiff(self$state$context_cols, colnames(context))
        self$imputed_predict <- TRUE
      }


      if (self$imputed_predict & self$flag == "predict") {
        feature <- self$data_imputed[, setdiff(colnames(self$data_imputed), colnames(context))]

      }

      if (self$column_counter == 0 & self$flag == "train") {
        feature <- self$data_imputed[, setdiff(colnames(self$data_imputed), colnames(context))]
        self$flag <- "predict"
        self$imputed_predict <- FALSE
      }
      self$train_s <- FALSE

      return(feature)
    }
  )
)
