#' Create object which will be used for distribution comparisons
#'
#' @param egalitarian object of class egalitarian
#' @param variable character, name of the variable to compare
#'
#' @return list of class egalitarian_comparison
#'
#' @export
#'
compare_distributions <- function(egalitarian, variable) {
egalitarian$data <- egalitarian$data[, c("dataset_", variable)]
if(is.character(egalitarian$data[, variable]) | is.factor(egalitarian$data[, variable])) {
variable_type <- "factor"
} else {
variable_type <- "numeric"
}
egalitarian <- c(egalitarian, list(type = variable_type, name = variable))
class(egalitarian) <- c("egalitarian_comparison", class(egalitarian))
egalitarian
}
#' Generic print function for egalitarian_comparison class
#'
#' @param x egalitarian_comparison object
#' @param ... Currently ignored
#'
#' @export
#'
print.egalitarian_comparison <- function(x, ...) {
cat("Egalitarian comparison object", "\n")
cat("Comparison for variable", x$name, "\n")
cat("Number of observations in training set: ", sum(x$data$dataset_ == "training"), "\n")
cat("Number of observations in validation set: ", sum(x$data$dataset_ == "validation"), "\n")
cat("Type of variable: ", x$type)
invisible(x)
}
#' Plot differences between distributions of one variable across two datasets
#'
#' @param x egalitarian_comparison object
#' @param type type of the plot: empirical CDF / density / histogram / bar plot,
#' bar plot is reserved for categorical variables
#' @param ... Currently ignored
#'
#' @return ggplot2 object
#'
#' @import ggplot2
#'
#' @export
#'
plot.egalitarian_comparison <- function(x, type = "ecdf", ...) {
variable_name <- colnames(x$data)[2]
if(x$type == "numeric") {
plot <- ggplot(x$data, aes_string(x = variable_name, fill = "dataset_", color = "dataset_"))
if(type == "histogram") {
plot <- plot +
geom_histogram() +
ylab("")
} else if(type == "density") {
plot <- plot +
stat_density() +
ylab("Estimated density")
} else if(type == "ecdf") {
plot <- plot +
stat_ecdf() +
ylab("Empirical CDF")
}
} else {
plot <- ggplot(x$data, aes_string(x = "dataset_", fill = variable_name)) +
geom_bar(position = "dodge")
}
plot +
theme_bw()
}
#' Calculate distances
#'
#' @param df data frame from `egalitarian` object
#' @param condition function that returns logical vector indicating
#' which columns will be chosen (numeric/factor)
#'
#' @return data frame
#'
#' @export
#'
#' @importFrom dplyr select_if mutate group_by select summarise
#' @importFrom tidyr gather spread
#' @importFrom stats chisq.test ks.test
#'
calculate_statistics <- function(df, condition) {
dataset_ <- distance <- type <- value <- variable <- validation <- training <- NULL
find_stats <- select_if(df, condition)
find_stats <- mutate(find_stats, dataset_ = df$dataset_)
find_stats <- gather(find_stats, "variable", "value", -dataset_)
find_stats <- group_by(find_stats, dataset_, variable)
find_stats <- mutate(find_stats, id = 1:n())
find_stats <- spread(find_stats, dataset_, value)
find_stats <- select(find_stats, -id)
stats <- group_by(find_stats, variable)
if(is.numeric(stats$training)) {
stats <- summarise(stats,
ks_test__statistic = ks.test(training, validation)$statistic,
ks_test__p_value = ks.test(training, validation)$p.value)
} else {
stats <- summarise(stats,
chi_sq__statistic = chisq.test(training, validation)$statistic,
chi_sq__p_value = chisq.test(training, validation)$p.value)
}
stats <- gather(stats, "stats", "value", -variable)
stats <- mutate(stats,
distance = stringr::str_split(stats, "__", simplify = T)[, 1],
type = stringr::str_split(stats, "__", simplify = T)[, 2])
stats <- select(stats, variable, distance, type, value)
spread(stats, type, value)
}
#' Get distances for all variables in datasets
#'
#' @param egalitarian `egalitarian` object
#'
#' @return data frame
#'
#' @importFrom dplyr bind_rows
#'
#' @export
#'
get_distributions_distance <- function(egalitarian) {
numerical_distances <- suppressWarnings(calculate_statistics(egalitarian$data,
is.numeric))
factor_distances <- suppressWarnings(calculate_statistics(egalitarian$data,
function(x)
is.character(x) | is.factor(x)))
bind_rows(numerical_distances, factor_distances)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.