####################################################################
#' Compare Variables with their Distributions
#'
#' Compare the distribution of a target variable vs another variable. This
#' function automatically splits into quantiles for numerical variables.
#' Custom and tidyverse friendly.
#'
#' @family Exploratory
#' @family Visualization
#' @param data Dataframe
#' @param ... Variables. Main (target variable) and secondary (values
#' variable) to group by (if needed).
#' @param type Integer. 1 for both plots, 2 for counter plot only, 3 for
#' percentages plot only.
#' @param ref Boolean. Show a reference line if levels = 2? Quite useful
#' when data is unbalanced (not 50/50) because a reference line is drawn.
#' @param note Character. Caption for the plot.
#' @param top Integer. Filter and plot the most n frequent for categorical values.
#' @param breaks Integer. Number of splits for numerical values.
#' @param na.rm Boolean. Ignore \code{NA}s if needed.
#' @param force Character. Force class on the values data. Choose between 'none',
#' 'character', 'numeric', 'date'
#' @param trim Integer. Trim labels until the nth character for categorical values
#' (applies for both, target and values)
#' @param clean Boolean. Use \code{cleanText()} for categorical values (applies
#' for both, target and values)
#' @param abc Boolean. Do you wish to sort by alphabetical order?
#' @param custom_colours Boolean. Use custom colours function?
#' @param plot Boolean. Return a plot? Otherwise, a table with results
#' @param chords Boolean. Use a chords plot?
#' @param save Boolean. Save the output plot in our working directory
#' @param subdir Character. Into which subdirectory do you wish to save the plot to?
#' @return Plot when \code{plot=TRUE} with two plots in one: counter distribution
#' grouped by cuts, and proportions distribution grouped by same cuts. data.frame when
#' \code{plot=FALSE} with counting, percentages, and cumulative percentages results.
#' When \code{type} argument is used, single plots will be returned.
#' @examples
#' Sys.unsetenv("LARES_FONT") # Temporal
#' data(dft) # Titanic dataset
#'
#' # Relation for categorical/categorical values
#' distr(dft, Survived, Sex)
#'
#' # Relation for categorical/numeric values
#' dft %>%
#' distr(Survived, Fare, plot = FALSE) %>%
#' head(10)
#' # Sort values
#' dft %>% distr(Survived, Fare, abc = TRUE)
#' # Less splits/breaks
#' dft %>% distr(Survived, Fare, abc = TRUE, breaks = 5)
#'
#' # Distribution of numerical only
#' dft[dft$Fare < 20, ] %>% distr(Fare)
#'
#' # Distribution of numerical/numerical
#' dft %>% distr(Fare, Age)
#'
#' # Select only one of the two default plots of distr()
#' dft %>% distr(Survived, Age, type = 2)
#' dft %>% distr(Survived, Age, type = 3)
#' @export
distr <- function(data, ...,
type = 1,
ref = TRUE,
note = NA,
top = 10,
breaks = 10,
na.rm = FALSE,
force = "none",
trim = 0,
clean = FALSE,
abc = FALSE,
custom_colours = FALSE,
plot = TRUE,
chords = FALSE,
save = FALSE,
subdir = NA) {
# # To handle scientific notation inputs correctly
# on.exit(options("scipen" = 999))
data <- data.frame(data)
vars <- enquos(...)
var1 <- vars[[1]]
if (length(vars) > 1) {
var2 <- vars[[2]]
} else {
var2 <- NULL
}
# When we only have one variable, use freqs()
if (length(vars) == 1) {
value <- select(data, !!var1)
variable_name <- colnames(value)
value <- value[, 1]
value <- .force_class(value, force)
value <- .fxtrim(value, trim)
value <- .fxclean(value, clean)
df <- data.frame(value = value, dummy = 0)
df <- .fxna_rm(df, na.rm)
is.Date <- function(x) inherits(x, "Date")
is.POSIXct <- function(x) inherits(x, "POSIXct")
is.POSIXlt <- function(x) inherits(x, "POSIXlt")
if (is.numeric(value) || is.Date(value) || is.POSIXct(value) || is.POSIXlt(value)) {
# Continuous and date values
if (is.numeric(value)) {
p <- ggplot(df, aes(x = .data$value))
} else {
p <- ggplot(df, aes(x = date(.data$value)))
}
p <- p +
geom_density(fill = "deepskyblue", alpha = 0.7, adjust = 1 / 3) +
labs(
y = NULL, x = NULL, fill = "Density",
title = "Density Distribution",
subtitle = paste("Variable:", variable_name),
caption = paste("Obs:", formatNum(nrow(df), 0))
) +
theme_lares()
if (top != 10) {
p <- p + xlim(0, top)
}
} else {
# Discrete values
p <- freqs(df, value, plot = TRUE, variable_name = variable_name, abc = abc, top = top)
}
# Return table with results?
if (!plot) {
output <- df %>% freqs(value, top = top)
return(output)
}
return(p)
}
# Check if secondary variable exists and fix if possible
var <- gsub('"', "", as_label(var2))
if (!var %in% colnames(data)) {
msg <- paste("Not a valid input:", var, "was transformed or does not exist.")
maybes <- colnames(data)[grepl(var, colnames(data))]
if (length(maybes) > 0 && maybes[1] %in% colnames(data)) {
message(paste0(
"Maybe you meant one of: ", vector2text(maybes), ". ",
"Automatically using '", maybes[1], "'"
))
var2 <- quos(maybes[1])
warning(msg)
} else {
stop(msg)
}
}
targets <- select(data, !!var1)
targets_name <- colnames(targets)
targets <- targets[, 1]
value <- select(data, !!var2)
variable_name <- colnames(value)
# Transformations
value <- value[, 1] # do.call("c", value)
value <- .force_class(value, force)
value <- .fxtrim(value, trim)
value <- .fxclean(value, clean)
if (length(targets) != length(value)) {
message("The targets and value vectors should be the same length.")
stop(message(paste(
"Currently, targets has", length(targets),
"rows and value has", length(value)
)))
}
# For num-num distributions or too many unique target variables
if (length(unique(targets)) >= 8) {
if (is.numeric(targets) && is.numeric(value)) {
subtitle <- paste0(
"Variables: ", variable_name, " vs. ", targets_name,
". Obs: ", formatNum(length(value), 0)
)
df <- data.frame(x = targets, y = value)
df <- .fxna_rm(df, na.rm = TRUE)
p <- df %>%
ggplot(aes(x = .data$x, y = .data$y)) +
stat_density_2d(aes(fill = after_stat(.data$level)), geom = "polygon") +
labs(
title = "2D Density Distribution",
x = targets_name, y = variable_name,
subtitle = subtitle
) +
scale_x_comma() +
scale_y_comma() +
theme_lares()
return(p)
}
message("You should try a 'target' variable with max 8 different values.")
message("Automatically trying a chords plot...")
chords <- TRUE
}
# Chords plot
if (chords) {
df <- data.frame(value = value, targets = targets)
output <- freqs(df, targets, value)
if (!na.rm) {
output <- output %>% replaceall(NA, "NA")
}
title <- "Frequency Chords Diagram"
subtitle <- paste("Variables:", targets_name, "to", variable_name)
if (!plot) {
return(output)
}
return(plot_chord(
output$targets, output$value, output$n,
mg = 13, title = title, subtitle = subtitle
))
}
# Only n numeric values, really numeric?
if (is.numeric(value) && length(unique(value)) <= 8) {
value <- .force_class(value, class = "char")
}
# Turn numeric variables into quantiles
if (is.numeric(value)) {
breaks <- ifelse(top != 10, top, breaks)
value <- quants(value, breaks, return = "labels")
cuts <- length(unique(value[!is.na(value)]))
if (cuts != breaks) {
message(paste(
"When dividing", variable_name, "into", breaks, "quantiles,",
cuts, "cuts/groups are possible."
))
}
top <- top + 1
}
# Finally, we have our data.frame
df <- data.frame(targets = targets, value = value)
df <- .fxna_rm(df, na.rm)
# Captions for plots
subtitle <- paste0(
"Variables: ", targets_name, " vs. ", variable_name,
". Obs: ", formatNum(nrow(df), 0)
)
freqs <- df %>%
group_by(.data$targets, .data$value) %>%
count() %>%
ungroup() %>%
arrange(desc(.data$n)) %>%
group_by(.data$value) %>%
mutate(
p = round(100 * .data$n / sum(.data$n), 2),
pcum = cumsum(.data$p)
) %>%
ungroup() %>%
filter(!is.na(.data$value)) %>%
mutate(
row = row_number(),
order = suppressWarnings(ifelse(
grepl("\\(|\\)", .data$value),
as.numeric(as.character(substr(gsub(",.*", "", .data$value), 2, 100))),
.data$row
))
)
if (length(unique(value)) > top && !is.numeric(value)) {
message(paste("Filtering the", top, "most frequent values. Use 'top' to overrule."))
which <- freqs(df, .data$value) %>% slice(1:top)
freqs <- freqs %>%
mutate(value = ifelse(.data$value %in% which$value, as.character(.data$value), "OTHERS")) %>%
group_by(.data$value, .data$targets) %>%
select(-.data$row, -.data$order) %>%
summarise(n = sum(.data$n)) %>%
mutate(p = round(100 * n / sum(.data$n), 2)) %>%
ungroup() %>%
arrange(desc(.data$n)) %>%
mutate(
row = row_number(),
order = row_number()
)
}
# Sort values alphabetically or ascending if numeric
if (abc) freqs <- mutate(freqs, order = rank(.data$value))
# Counter plot
if (type %in% c(1, 2)) {
vadj <- ifelse(type == 1, -0.15, 0.5)
hadj <- ifelse(type == 1, 0.5, -0.15)
count <- ggplot(freqs, aes(
x = reorder(as.character(.data$value), .data$order), y = .data$n,
fill = as.character(.data$targets),
label = formatNum(.data$n, 0), ymax = max(.data$n) * 1.1
)) +
geom_col(position = "dodge", colour = "transparent") +
geom_text(
colour = "black",
check_overlap = TRUE,
position = position_dodge(0.9),
size = 3, vjust = vadj, hjust = hadj
) +
labs(x = NULL, y = "Counter [#]", fill = targets_name, caption = note) +
theme(legend.position = "top") +
guides(colour = "none") +
theme(axis.title.y = element_text(size = rel(0.8), angle = 90)) +
scale_y_comma(expand = c(0, 0)) +
theme_lares(pal = 1)
# Give an angle to labels when more than...
if (length(unique(value)) >= 7) {
count <- count + theme(axis.text.x = element_text(angle = 30, hjust = 1))
}
# Custom colours if wanted...
if (custom_colours) {
count <- count + suppressWarnings(gg_fill_customs())
}
}
# Proportions (%) plot
if (type %in% c(1, 3)) {
prop <- freqs %>%
group_by(.data$value) %>%
mutate(size = sum(.data$n) / sum(freqs$n)) %>%
mutate(ptag = ifelse(p < 3, "", as.character(round(.data$p, 1)))) %>%
ggplot(aes(
x = reorder(.data$value, -.data$order),
y = .data$p / 100, label = .data$ptag,
fill = as.character(.data$targets)
)) +
geom_col(position = "fill", colour = "transparent") +
geom_text(aes(size = .data$size, colour = as.character(.data$targets)),
position = position_stack(vjust = 0.5)
) +
scale_size(range = c(2.2, 3)) +
coord_flip() +
labs(x = "Proportions [%]", y = NULL, fill = targets_name, caption = note) +
theme(legend.position = "top") +
guides(colour = "none", size = "none") +
scale_y_percent(expand = c(0, 0)) +
theme(axis.title.y = element_text(size = rel(0.8), angle = 90)) +
theme_lares(pal = 1)
# Show a reference line if levels = 2; quite useful when data is unbalanced (not 50/50)
if (length(unique(targets)) == 2 && ref) {
distr <- df %>%
freqs(.data$targets) %>%
arrange(as.character(.data$targets))
h <- signif(100 - distr$p[1], 3)
prop <- prop +
geom_hline(
yintercept = h / 100, colour = "purple",
linetype = "dotted", alpha = 0.8
) +
geom_label(aes(0, h / 100, label = h, vjust = -0.05),
size = 2.5, fill = "white", alpha = 0.8
)
}
# Custom colours if wanted...
if (custom_colours) {
prop <- prop + suppressMessages(gg_fill_customs())
}
}
# Export file name and folder
if (save) {
file_name <- paste0(
"viz_distr_",
cleanText(targets_name), ".vs.",
cleanText(variable_name),
case_when(type == 2 ~ "_c", type == 3 ~ "_p", TRUE ~ ""), ".png"
)
if (!is.na(subdir)) {
# dir.create(file.path(getwd(), subdir), recursive = TRUE)
file_name <- paste(subdir, file_name, sep = "/")
}
}
# Plot the results and save if needed
if (type == 1) {
count <- count + labs(
title = "Distribution and Proportions",
subtitle = subtitle, caption = ""
) +
theme(plot.margin = margin(10, 15, -15, 15))
prop <- prop + guides(fill = "none") + labs(caption = note) +
theme(plot.margin = margin(-5, 15, -15, 15))
p <- (count / prop) + plot_layout(ncol = 1, nrow = 2)
if (save) p <- p + ggsave(file_name, width = 10, height = 7)
}
if (type == 2) {
count <- count + coord_flip() +
labs(title = "Distribution Plot", subtitle = subtitle, caption = "")
if (save) count <- count + ggsave(file_name, width = 8, height = 6)
p <- count
}
if (type == 3) {
prop <- prop + labs(title = "Proportions Plot", subtitle = subtitle, caption = "")
if (save) prop <- prop + ggsave(file_name, width = 8, height = 6)
p <- prop
}
if (!plot) {
return(select(freqs, -.data$order, -.data$row))
} else {
return(p)
}
}
.force_class <- function(value, class = "none") {
if (class != "none") {
if (grepl("char|fact", class) && is.numeric(value)) {
value <- as.character(value)
}
if (grepl("num|int", class) && !is.numeric(value)) {
value <- as.numeric(value)
}
if (grepl("dat|day|time", class)) {
value <- gsub(" .*", "", as.character(value))
value <- lubridate::date(value)
}
}
return(value)
}
.fxtrim <- function(value, trim, targets = NA) {
if (trim > 0) {
if (!is.numeric(value)) {
value <- substr(value, 1, trim)
}
if (!is.numeric(targets) && !is.na(targets)) {
targets <- substr(targets, 1, trim)
}
message(paste("Chopping everything to", trim, "characters..."))
}
return(value)
}
.fxclean <- function(value, clean = FALSE, targets = NA) {
if (clean) {
if (!is.numeric(value)) {
value <- cleanText(value, spaces = FALSE)
}
if (!is.numeric(targets) && !is.na(targets)) {
targets <- cleanText(targets, spaces = FALSE)
}
}
return(value)
}
.fxna_rm <- function(df, na.rm = FALSE) {
if (na.rm) {
df <- df[complete.cases(df), ]
}
return(df)
}
# options(lifecycle_repeat_warnings = TRUE)
# library(dplyr)
# data("starwars")
# foo <- function(x, ...) {
# temp <- enquos(...)
# group_by(x, !!!temp[[1]]) %>% tally()
# }
# foo(starwars, sex, gender)
# Unquoting language objects with `!!!` is deprecated as of rlang 0.4.0.
# Please use `!!` instead.
#
# # Bad:
# dplyr::select(data, !!!enquo(x))
#
# # Good:
# dplyr::select(data, !!enquo(x)) # Unquote single quosure
# dplyr::select(data, !!!enquos(x)) # Splice list of quosures
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.