R/egalitarian_preprocess.R

Defines functions check_data_validity egalitarian print.egalitarian

Documented in check_data_validity egalitarian print.egalitarian

#' Check for problems in datasets
#'
#' @param training training dataset
#' @param validation validation dataset
#'
#' @return data frame
#'
#' @export
#'

check_data_validity <- function(training, validation) {
  if(length(setdiff(colnames(training),
                    colnames(validation))) > 0) {
    warning("Some variables are missing in the validation dataset. \n
            Additional variables will be dropped. \n")
  }
  if(length(setdiff(colnames(validation),
                    colnames(training))) > 0) {
    warning("There are additional variable present in the validation set. \n
            These variables will be dropped. \n")
  }
  for(x in intersect(colnames(training), colnames(validation))) {
    if(class(training[, x]) != class(validation[, x])) {
      warning("Mismatched variable classes in columns ", x, "\n")
    }
  }
  for(x in intersect(colnames(training), colnames(validation))) {
    if(is.factor(training[, x])) {
      if(length(setdiff(levels(training[, x]), levels(validation[, x]))) > 0 |
         length(setdiff(levels(validation[, x]), levels(training[, x]))) > 0) {
        warning("Unequal factor levels in column ", x, "\n")
      }
    }
  }
}


#' Preprocess data and model before analysis.
#'
#' @param training_set Data frame on which model was trained.
#' @param validation_set Data frame with the same columns as training_set.
#' @param target Character, name of the response variable.
#' @param model Model to be explained by local explainers.
#'        If NULL, only global comparisons of distributions can be
#'        performed on the result object.
#'
#' @return list of class egalitarian that consists of
#' \item{data}{data object from `egalitarian` object}
#' \item{model}{model from `egalitarian` object}
#' \item{target_name}{name of the response variable}
#'
#' @importFrom dplyr mutate bind_rows
#'
#' @export
#'

egalitarian <- function(training_set, validation_set, target, model = NULL) {
  check_data_validity(training_set, validation_set)

  cols <- intersect(colnames(training_set), colnames(validation_set))
  combined_datasets <- mutate(suppressWarnings(bind_rows(training_set[, cols],
                                                         validation_set[, cols])),
                              dataset_ = c(rep("training",
                                               nrow(training_set)),
                                           rep("validation",
                                               nrow(validation_set))))
  result <- list(data = combined_datasets,
                 model = model,
                 target_name = target)
  class(result) <- c("egalitarian", class(result))
  result
}

#' Generic print method for egalitarian class.
#'
#' @param x object of class egalitarian.
#' @param ... Currently ignored.
#'
#' @export
#'

print.egalitarian <- function(x, ...) {
  cat("EgalitaRian object \n")
  cat("Number of predictor variables: ", ncol(x$data) - 2, "\n")
  cat("Number of observations in training set: ", sum(x$data$dataset_ == "training"), "\n")
  cat("Number of observations in validation set: ", sum(x$data$dataset_ == "validation"), "\n")
  if(is.null(x$model)) {
    cat("Model object is not present")
  } else {
    cat("Model object is present \n")
    cat("Model class: ", class(x$model))
  }
  invisible(x)
}
mstaniak/egalitaRian documentation built on Aug. 26, 2019, 11:11 p.m.