R/distribution_differences.R

Defines functions compare_distributions print.egalitarian_comparison plot.egalitarian_comparison calculate_statistics get_distributions_distance

Documented in calculate_statistics compare_distributions get_distributions_distance plot.egalitarian_comparison print.egalitarian_comparison

#' Create object which will be used for distribution comparisons
#'
#' @param egalitarian object of class egalitarian
#' @param variable character, name of the variable to compare
#'
#' @return list of class egalitarian_comparison
#'
#' @export
#'

compare_distributions <- function(egalitarian, variable) {
 egalitarian$data <- egalitarian$data[, c("dataset_", variable)]
 if(is.character(egalitarian$data[, variable]) | is.factor(egalitarian$data[, variable])) {
   variable_type <- "factor"
 } else {
   variable_type <- "numeric"
 }
 egalitarian <- c(egalitarian, list(type = variable_type, name = variable))
 class(egalitarian) <- c("egalitarian_comparison", class(egalitarian))
 egalitarian
}


#' Generic print function for egalitarian_comparison class
#'
#' @param x egalitarian_comparison object
#' @param ... Currently ignored
#'
#' @export
#'

print.egalitarian_comparison <- function(x, ...) {
  cat("Egalitarian comparison object", "\n")
  cat("Comparison for variable", x$name, "\n")
  cat("Number of observations in training set: ", sum(x$data$dataset_ == "training"), "\n")
  cat("Number of observations in validation set: ", sum(x$data$dataset_ == "validation"), "\n")
  cat("Type of variable: ", x$type)
  invisible(x)
}


#' Plot differences between distributions of one variable across two datasets
#'
#' @param x egalitarian_comparison object
#' @param type type of the plot: empirical CDF / density / histogram / bar plot,
#' bar plot is reserved for categorical variables
#' @param ... Currently ignored
#'
#' @return ggplot2 object
#'
#' @import ggplot2
#'
#' @export
#'

plot.egalitarian_comparison <- function(x, type = "ecdf", ...) {
  variable_name <- colnames(x$data)[2]
  if(x$type == "numeric") {
    plot <- ggplot(x$data, aes_string(x = variable_name, fill = "dataset_", color = "dataset_"))
    if(type == "histogram") {
      plot <- plot +
        geom_histogram() +
        ylab("")
    } else if(type == "density") {
      plot <- plot +
        stat_density() +
        ylab("Estimated density")
    } else if(type == "ecdf") {
      plot <- plot +
        stat_ecdf() +
        ylab("Empirical CDF")
    }
  } else {
    plot <- ggplot(x$data, aes_string(x = "dataset_", fill = variable_name)) +
      geom_bar(position = "dodge")
  }
  plot +
    theme_bw()
}

#' Calculate distances
#'
#' @param df data frame from `egalitarian` object
#' @param condition function that returns logical vector indicating
#' which columns will be chosen (numeric/factor)
#'
#' @return data frame
#'
#' @export
#'
#' @importFrom dplyr select_if mutate group_by select summarise
#' @importFrom tidyr gather spread
#' @importFrom stats chisq.test ks.test
#'

calculate_statistics <- function(df, condition) {
  dataset_ <- distance <- type <- value <- variable <- validation <- training <- NULL
  find_stats <- select_if(df, condition)
  find_stats <- mutate(find_stats, dataset_ = df$dataset_)
  find_stats <- gather(find_stats, "variable", "value", -dataset_)
  find_stats <- group_by(find_stats, dataset_, variable)
  find_stats <- mutate(find_stats, id = 1:n())
  find_stats <- spread(find_stats, dataset_, value)
  find_stats <- select(find_stats, -id)
  stats <- group_by(find_stats, variable)
  if(is.numeric(stats$training)) {
    stats <- summarise(stats,
                       ks_test__statistic = ks.test(training, validation)$statistic,
                       ks_test__p_value = ks.test(training, validation)$p.value)

  } else {
    stats <- summarise(stats,
                       chi_sq__statistic = chisq.test(training, validation)$statistic,
                       chi_sq__p_value = chisq.test(training, validation)$p.value)
  }
  stats <- gather(stats, "stats", "value", -variable)
  stats <- mutate(stats,
                  distance = stringr::str_split(stats, "__", simplify = T)[, 1],
                  type = stringr::str_split(stats, "__", simplify = T)[, 2])
  stats <- select(stats, variable, distance, type, value)
  spread(stats, type, value)
}


#' Get distances for all variables in datasets
#'
#' @param egalitarian `egalitarian` object
#'
#' @return data frame
#'
#' @importFrom dplyr bind_rows
#'
#' @export
#'

get_distributions_distance <- function(egalitarian) {
  numerical_distances <- suppressWarnings(calculate_statistics(egalitarian$data,
                                                               is.numeric))
  factor_distances <- suppressWarnings(calculate_statistics(egalitarian$data,
                                           function(x)
                                             is.character(x) | is.factor(x)))

  bind_rows(numerical_distances, factor_distances)
}
mstaniak/egalitaRian documentation built on Aug. 26, 2019, 11:11 p.m.