R/design_missing_Z.R

Defines functions print.simple_plan format.simple_plan prepare.simple_plan design_missingness_treatment .xform_zap .ind_na .xform_num .xform_cat

Documented in design_missingness_treatment prepare.simple_plan print.simple_plan

.xform_cat <- function(col, arg) {
  known_values <- arg$known_values
  invalid_mark <- arg$invalid_mark
  col <- as.character(col)
  bads <- is.na(col) | (!(col %in% known_values))
  col[bads] <- invalid_mark
  col
}

.xform_num <- function(col, arg) {
  replacement <- arg$replacement
  col <- as.numeric(col)
  bads <- is.na(col) | is.nan(col) | is.infinite(col)
  col[bads] <- replacement
  col
}

.ind_na <- function(col, arg) {
  col <- as.numeric(col)
  bads <- is.na(col) | is.nan(col) | is.infinite(col)
  v <- rep(0, length(col))
  v[bads] <- 1
  v
}

.xform_zap <- function(col, arg) {
 NULL
}


#' Design a simple treatment plan to indicate missingingness and perform simple imputation.
#' 
#' 
#' @param dframe data.frame to drive design.
#' @param ... not used, forces later arguments to bind by name.
#' @param varlist character, names of columns to process.
#' @param invalid_mark character, name to use for NA levels and novel levels.
#' @param drop_constant_columns logical, if TRUE drop columns that do not vary from the treatment plan.
#' @param missingness_imputation function of signature f(values: numeric), simple missing value imputer.
#' @param imputation_map map from column names to functions of signature f(values: numeric), simple missing value imputers.
#' @return simple treatment plan.
#' 
#' @examples 
#' 
#' d <- wrapr::build_frame(
#'   "x1", "x2", "x3" |
#'   1   , 4   , "A"  |
#'   NA  , 5   , "B"  |
#'   3   , 6   , NA   )
#' 
#' plan <- design_missingness_treatment(d)
#' prepare(plan, d)
#' 
#' prepare(plan, data.frame(x1=NA, x2=NA, x3="E"))
#' 
#' @seealso \code{\link{prepare.simple_plan}}
#' 
#' @export
#' 
design_missingness_treatment <- function(dframe, 
                                         ..., 
                                         varlist = colnames(dframe),
                                         invalid_mark = "_invalid_",
                                         drop_constant_columns = FALSE,
                                         missingness_imputation = NULL, 
                                         imputation_map = NULL) {
  wrapr::stop_if_dot_args(substitute(list(...)), 
                          "vtreat:::design_missingness_treatment")
  force(invalid_mark)
  ops <- list()
  for(ci in varlist) {
    vi <- dframe[[ci]]
    if(is.null(vi)) {
      stop(paste("vtreat::design_missingness_treatment: column", ci, "not found"))
    }
    if(drop_constant_columns) {
      if(!.has.range(vi)) {
         next
      }
    }
    if(is.logical(vi) || (is.numeric(vi) && (!is.factor(vi)))) {
      vi <- as.numeric(vi)
      mean_v <- 0.0
      bads <- is.na(vi) | is.nan(vi) | is.infinite(vi)
      if(any(!bads)) {
        missing_v <- base::mean
        if(!is.null(missingness_imputation)) {
          missing_v <- missingness_imputation
        }
        if((!is.null(imputation_map)) && (ci %in% names(imputation_map))) {
          missing_v <- imputation_map[[ci]]
        }
        if(is.function(missing_v)) {
          mean_v <- missing_v(vi[!bads])
        } else {
          mean_v <- missing_v
        }
      }
      ops <- c(ops, 
               list(
                 list(
                   col = ci,
                   nm = vtreat_make_names(ci), 
                   f = .xform_num, 
                   code = "numeric",
                   args = list(replacement = mean_v))))
      if(any(bads)) {
        ops <- c(ops,
                 list(
                   list(
                     col = ci,
                     nm = vtreat_make_names(paste0(ci, "_isBAD")), 
                     f = .ind_na, 
                     code = "is_bad",
                     args = list())))
      }
    } else {
      vi <- as.character(vi)
      ops <- c(ops, 
               list(
                 list(
                   col = ci,
                   nm = vtreat_make_names(ci), 
                   f = .xform_cat, 
                   code = "cat",
                   args = list(known_values = sort(unique(vi)),
                               invalid_mark = invalid_mark))))
      
    }
  }
  class(ops) <- "simple_plan"
  ops
}

#' Prepare a simple treatment.
#' 
#' @param treatmentplan A simple treatment plan.
#' @param dframe data.frame to be treated.
#' @param ... not used, present for S3 signature consistency.
#' 
#' @examples 
#' 
#' d <- wrapr::build_frame(
#'   "x1", "x2", "x3" |
#'   1   , 4   , "A"  |
#'   NA  , 5   , "B"  |
#'   3   , 6   , NA   )
#' 
#' plan <- design_missingness_treatment(d)
#' prepare(plan, d)
#' 
#' prepare(plan, data.frame(x1=NA, x2=NA, x3="E"))
#' 
#' @seealso \code{\link{design_missingness_treatment}}, \code{\link{prepare}}
#' 
#' @export
#' 
prepare.simple_plan <- function(treatmentplan, dframe,
                                ...) {
  wrapr::stop_if_dot_args(substitute(list(...)), 
                          "vtreat:::prepare.simple_plan")
  res <- dframe
  for(pi in treatmentplan) {
    ci <- pi$col
    res[[ci]] <- NULL
  }
  for(pi in treatmentplan) {
    ci <- pi$col
    vi <- dframe[[ci]]
    if(is.null(vi)) {
      stop(paste("vtreat::prepare.simple_plan: column", ci, " must be in data.frame"))
    }
    vi <- pi$f(vi, pi$args)
    res[[pi$nm]] <- vi
  }
  res
}

#' @export
format.simple_plan <- function(x, ...) { 
  steps <- lapply(
    x, 
    function(xi) {
      data.frame(origName = xi$col,
                 varName = xi$nm,
                 code = xi$code,
                 stringsAsFactors = FALSE)
    })
  steps <- .rbindListOfFrames(steps)
  format(steps)
}

#' @export
as.character.simple_plan <- function (x, ...) {
  format(x, ...)
}

#'
#' Print treatmentplan.
#' @param x treatmentplan
#' @param ... additional args (to match general signature).
#' @export
print.simple_plan <- function(x, ...) { 
  print(format(x), ...) 
}
WinVector/vtreat documentation built on Aug. 29, 2023, 4:49 a.m.