R/FastImputation.R
In FastImputation: Learn from Training Data then Quickly Fill in Missing Data

Documented in FastImputation

#' Use the pattern learned from the training data to impute (fill in good guesses for) missing values.
#'
#' Like Amelia, FastImputation assumes that the columns of the data are
#' multivariate normal or can be transformed into approximately
#' multivariate normal.
#' 
#' @param x Dataframe, possibly with some missing (\code{NA}) values.
#' @param patterns An object of class 'FastImputationPatterns' generated by \code{TrainFastImputation}.
#' @param verbose If TRUE then the progress in imputing the data will be shown.
#' @return x, but with missing values filled in (imputed)
#' @export
#' @seealso \code{\link{TrainFastImputation}}
#' @references
#' \url{https://gking.harvard.edu/amelia}
#' @author Stephen R. Haptonstahl \email{srh@@haptonstahl.org}
#' @examples
#' data(FI_train)   # provides FItrain dataset
#' patterns <- TrainFastImputation(
#'   FI_train,
#'   constraints=list(list("bounded_below_2", list(lower=0)),
#'                    list("bounded_above_5", list(upper=0)),
#'                    list("bounded_above_and_below_6", list(lower=0, upper=1))
#'                    ),
#'   idvars="user_id_1",
#'   categorical="categorical_9")
#' 
#' data(FI_test)
#' FI_test          # note there is missing data
#' imputed_data <- FastImputation(FI_test, patterns)
#' imputed_data    # good guesses for missing values are filled in
#'
#' data(FI_true)
#' continuous_cells_imputed <- is.na(FI_test[,2:8])
#' continuous_imputed_values <- imputed_data[,2:8][continuous_cells_imputed]
#' continuous_true_values <- FI_true[,2:8][continuous_cells_imputed]
#' rmse <- sqrt(median((continuous_imputed_values-continuous_true_values)^2))
#' rmse
#' median_relative_error <- median( abs((continuous_imputed_values - continuous_true_values) / 
#'   continuous_true_values) )
#' median_relative_error
#' 
#' imputed_data_column_means <- FI_test[,2:8]
#' for(j in 1:ncol(imputed_data_column_means)) {
#'   imputed_data_column_means[is.na(imputed_data_column_means[,j]),j] <- 
#'     mean(imputed_data_column_means[,j], na.rm=TRUE)
#' }
#' cont_imputed_vals_col_means <- imputed_data_column_means[continuous_cells_imputed]
#' rmse_column_means <- sqrt(median((cont_imputed_vals_col_means-continuous_true_values)^2))
#' rmse_column_means  # much larger error than using FastImputation
#' median_relative_error_col_means <- median( abs((cont_imputed_vals_col_means - 
#'   continuous_true_values) / continuous_true_values) )
#' median_relative_error_col_means  # larger error than using FastImputation
#' 
#' # Let's look at the accuracy of the imputation of the categorical variable
#' library("caret")
#' categorical_rows_imputed <- which(is.na(FI_test$categorical_9))
#' confusionMatrix(data=imputed_data$categorical_9[categorical_rows_imputed], 
#'                 reference=FI_true$categorical_9[categorical_rows_imputed])
#' # Compare to imputing with the modal value
#' stat_mode <- function(x) {
#'   unique_values <- unique(x)
#'   unique_values <- unique_values[!is.na(unique_values)]
#'   unique_values[which.max(tabulate(match(x, unique_values)))]
#' }
#' categorical_rows_imputed_col_mode <- rep(stat_mode(FI_test$categorical_9), 
#'                                          length(categorical_rows_imputed))
#' confusionMatrix(data=categorical_rows_imputed_col_mode, 
#'                 reference=FI_true$categorical_9[categorical_rows_imputed])
#' # less accurate than using FastImputation
#'
FastImputation <-
function(
  x,
  patterns,
  verbose=TRUE
) {
  # TODO: Refactor so bounding transformations only apply to data used for imputation, not
  #       done and undone on returned data.
  if( missing(patterns) ) {
    stop("A 'patterns' object generated by 'TrainFastImputation' must be specified.")
  } else {
    if( !methods::is(patterns, "FastImputationPatterns") ) stop("'patterns' must be of class 'FastImputationPatterns'. This is generated by appropriate use of the 'TrainFastImputation' function.")
  }
  if( !is.data.frame(x) ) stop("'x' must be a dataframe.")
  if( !identical(names(x), patterns$FI_var_names) ) {
    stop("The names of the variables you are imputing don't match those of the training data. Check to make sure the data is in the same format with the same columns in the same order.")
  }
  names_imputing_set <- names(x)
  
  # Order of operations
  # Check x
  #   Remove ignored columns, save as y
  #   Normalize bounded variables, keep in y
  #     Remove and encode categorical variables, save in z
  #     Impute
  #   Recover categorical variables, save in y
  #   Bound normalized variables, keep in y
  # Add ignored columns, keep in y
  # Update col names in y
  # Return y

  # remove cols to ignore
  if(length(patterns$FI_cols_to_ignore) > 0) {
    y <- x[,-patterns$FI_cols_to_ignore]
  } else {
    y <- x
  }
  
  # transform constrained columns
  for(i_col in patterns$FI_cols_bound_to_intervals) {
    y[,i_col] <- NormalizeBoundedVariable(y[,i_col], patterns$FI_constraints[[i_col]])
  }
  
  
  # set aside categorical variables and one-hot encode them
  if(length(patterns$FI_cols_categorical) > 0) {
    # set aside categorical variables
    y_categorical <- y[,patterns$FI_cols_categorical, drop=FALSE]
    z <- y[,-patterns$FI_cols_categorical]
    total_not_categorical <- ncol(z)
    
    # encode categorical variables
    total_one_hot_dummies <- sum(sapply(patterns$FI_categories, length))
    z <- data.frame(z, matrix(NA_real_, nrow=nrow(z), ncol=total_one_hot_dummies))
    current_col_to_fill <- 1
    while(current_col_to_fill < total_one_hot_dummies) {
      for(i in 1:length(patterns$FI_categories)) {
        for(j in 1:length(patterns$FI_categories[[i]])) {
          z[,total_not_categorical+current_col_to_fill] <- ifelse(y_categorical[,i]==patterns$FI_categories[[i]][j], 1, -1)
          current_col_to_fill <- current_col_to_fill + 1
        }
      }
    }
  } else {
    z <- y
  }  
  
  if(verbose) pb <- utils::txtProgressBar(style=3)
  n_rows <- nrow(z)
  n_cols <- ncol(z)
  
  for(i_row in 1:n_rows) {
    constrained_row <- z[i_row,]
    if( sum(is.na(constrained_row)) != 0 ) {  # do nothing if nothing is missing
      # Use formula for mean here: https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Conditional_distributions
      cols_to_impute <- which(is.na(constrained_row))    # indices of "1" in Wikipedia formula for mean of conditional multivariate normal distribution
      if( length(cols_to_impute) == length(constrained_row) ) {
        # nothing to condition on
        replacement_values <- patterns$FI_means
      } else {
        # fill based on conditional normal distribution
        known_cols <- setdiff(1:n_cols, cols_to_impute)  # incides of "2" in Wikipedia formula for mean of conditional multivariate normal distribution
        
        replacement_values <- t(t(patterns$FI_means[cols_to_impute])) + 
          patterns$FI_covariance[cols_to_impute,known_cols, drop=FALSE] %*% 
          solve(a=patterns$FI_covariance[known_cols,known_cols], 
                b=t(constrained_row[known_cols]) - t(t(patterns$FI_means[known_cols])))
      }
      # Store replacement values (note that constraints are not yet applied)
      z[i_row,cols_to_impute] <- replacement_values ### PERHAPS ADD as.vector to RHS
    }
    if(verbose) utils::setTxtProgressBar(pb, i_row/n_rows)
  }
  if(verbose) { 
    close(pb)
    cat("\n")
  }
  
  # Recover categorical variables, save in y
  if(length(patterns$FI_cols_categorical) > 0) {
    for(i_col in 1:ncol(y_categorical)) {
      first_col_of_var_to_parse <- 1
      n_cols_this_var <- length(patterns$FI_categories[[i_col]])
      cat_indices <- apply(z[,(total_not_categorical+1):(total_not_categorical+n_cols_this_var)], 1, which.max)
      y_categorical[,i_col] <- patterns$FI_categories[[i_col]][cat_indices]
    }
    y <- z[,1:total_not_categorical]
    for(i in 1:ncol(y_categorical)) {
      y_col <- patterns$FI_cols_categorical[i]
      y <- as.data.frame(append(y, list(var=y_categorical[,i, drop=FALSE]), y_col - 1))
    }
  } else {
    y <- z
  }
  # Coerce categorical variables to factors with correct levels
  for(i in 1:length(patterns$FI_cols_categorical)) {
    y[[patterns$FI_cols_categorical[i]]] <- factor(y[[patterns$FI_cols_categorical[i]]], 
                                                   levels=patterns$FI_categories[[i]])
  }
  
  # Bound normalized variables, keep in y
  for(i_col in patterns$FI_cols_bound_to_intervals) {
    y[,i_col] <- BoundNormalizedVariable(y[,i_col], patterns$FI_constraints[[i_col]])
  }
  
  # Add ignored columns, keep in y
  if(length(patterns$FI_cols_to_ignore) > 0) {
    for(i_col in patterns$FI_cols_to_ignore) {
      y <- as.data.frame(append(y, list(var=x[,i_col]), i_col - 1))
    }
  }
  # fix names of y
  names(y) <- names_imputing_set
    
  return(y)
}