Nothing
#' SeeDist -- See The Distribution
#'
#' This function takes a vector of numeric data and returns one or more ggplot2
#' plots that help you visualize the data. Meant to be a useful wrapper for
#' exploring univariate data. Has a plethora of options including type of
#' visualization (histogram, boxplot, density, violin) as well as commonly
#' desired overplots like mean and median points, z and t curves etc.. Common
#' descriptive statistics are provided as a subtitle if desired and sent to the
#' console as well.
#'
#' @param x the data to be visualized. Must be numeric.
#' @param title Optionally replace the default title displayed. title = NULL
#' will remove it entirely. title = "" will provide an empty title but
#' retain the spacing. A sensible default is provided otherwise.
#' @param subtitle Optionally replace the default subtitle displayed. subtitle = NULL
#' will remove it entirely. subtitle = "" will provide an empty subtitle but
#' retain the spacing. A sensible default is provided otherwise.
#' @param whatplots what type of plots? The default is whatplots = c("d", "b",
#' "h", "v") for a density, a boxplot, a histogram, and a violin plot
#' @param numbins the number of bins to use for any plots that bin. If nothing is
#' specified the function will calculate a rational number using Freedman-Diaconis
#' via the \code{nclass.FD} function
#' @param var_explain additional contextual information about the variable as a string
#' such as "Miles Per Gallon" which is appended to the default title information.
#' @param data.fill.color Character string that specifies fill color for the main data
#' area (Default: `deepskyblue`).
#' @param mean.line.color,median.line.color,mode.line.color Character string that
#' specifies line color (Default: `darkgreen`, `yellow`, `orange`).
#' @param mean.line.type,median.line.type,mode.line.type Character string that
#' specifies line color (Default: `longdash`, `dashed`, `dashed`).
#' @param mean.line.size,median.line.size,mode.line.size Numeric that
#' specifies line size (Default: `1.5`, `1.5`, `1`). You can set to `0` to make
#' any of the lines "disappear".
#' @param mean.point.shape,median.point.shape Integer in 0 - 25
#' specifies shape of mean or median point mark on the violin plot
#' (Default: `21`, `23`).
#' @param mean.point.size,median.point.size Integer
#' specifies size of mean or median point mark on the violin plot
#' (Default: `4`). You can set to `0` to make any of the points "disappear".
#' @param zcurve.color,tcurve.color Character string that
#' specifies line color (Default: `red`, `black`).
#' @param zcurve.type,tcurve.type Character string that
#' specifies line color (Default: `twodash`, `dotted`).
#' @param zcurve.size,tcurve.size Numeric that
#' specifies line size (Default: `1`). You can set to `0` to make
#' any of the lines "disappear".
#' @param xlab Custom text for the `x` axis label (Default: `NULL`, which
#' will cause the `x` axis label to be the `x` variable).
#' @param k Number of digits after decimal point (should be an integer)
#' (Default: k = 2) for statistical results.
#' @param add_jitter Logical (Default: `TRUE`) controls whether jittered data
#' ponts are added to violin plot.
#' @param add_rug Logical (Default: `TRUE`) controls whether "rug" data
#' points are added to density plot and histogram.
#' @param xlim_left,xlim_right Logical. For density plots can be used to
#' override the default which is 3 std deviations left and right of
#' the mean of x. Useful for theoretical reasons like horsepower < 0
#' or when `ggplot2` warns you that it has removed rows containing
#' non-finite values (stat_density).
#' @param ggtheme A function, ggplot2 theme name. Default value is ggplot2::theme_bw().
#' Any of the ggplot2 themes, or themes from extension packages are allowed (e.g.,
#' hrbrthemes::theme_ipsum(), etc.).
#'
#' @return from 1 to 4 plots depending on what the user specifies as well as an
#' extensive summary courtesy `DescTools::Desc` printed to the console
#'
#' @export
#' @import ggplot2
#' @importFrom grDevices nclass.FD
#' @importFrom stats dnorm dt median sd
#' @importFrom DescTools Desc
#'
#' @section Warning:
#' If the data has more than 3 modal values only the first three of them are plotted.
#' The rest are ignored and the user is warned on the console.
#'
#' Missing values are removed with a warning to the user
#'
#' @seealso \code{\link[grDevices]{nclass}}
#'
#' @examples
#' SeeDist(rnorm(100, mean = 100, sd = 20), numbins = 15, var_explain = "A Random Sample")
#' SeeDist(mtcars$hp, var_explain = "Horsepower", whatplots = c("d", "b"))
#' SeeDist(iris$Sepal.Length, var_explain = "Sepal Length", whatplots = "d")
#' @author Chuck Powell
#'
SeeDist <- function(x,
title = "Default",
subtitle = "Default",
numbins = 0,
xlab = NULL,
var_explain = NULL,
data.fill.color = "deepskyblue",
mean.line.color = "darkgreen",
median.line.color = "yellow",
mode.line.color = "orange",
mean.line.type = "longdash",
median.line.type = "dashed",
mode.line.type = "dashed",
mean.line.size = 1.5,
median.line.size = 1.5,
mean.point.shape = 21,
median.point.shape = 23,
mean.point.size = 4,
median.point.size = 4,
zcurve.color = "red",
zcurve.type = "twodash",
zcurve.size = 1,
tcurve.color = "black",
tcurve.type = "dotted",
tcurve.size = 1,
mode.line.size = 1,
whatplots = c("d", "b", "h", "v"),
k = 2,
add_jitter = TRUE,
add_rug = TRUE,
xlim_left = NULL,
xlim_right = NULL,
ggtheme = ggplot2::theme_bw()
) {
#### Basic setup ####
# set default theme
ggplot2::theme_set(ggtheme)
if (!is.numeric(x)) {
stop("Sorry the data must be numeric")
}
x_name <- deparse(substitute(x)) # get the variable name
# if not specified, use the variable name for x axis
if (is.null(xlab)) {
xlab <- x_name
}
# figure you what binwidth we'll use
binnumber <- nclass.FD(x) # default
binnumber <- ifelse(numbins == 0,
binnumber,
numbins)
#### Get descriptives ####
desc.output <- DescTools::Desc(x,
plotit = FALSE,
main = xlab,
digits = k)
if (sum(is.na(x)) != 0) {
missing_count <- sum(is.na(x))
warning(paste("Removing",
missing_count,
"missing values"),
call. = FALSE)
x <- x[!is.na(x)]
}
x_mean <- desc.output[[1]]$mean
x_sd <- desc.output[[1]]$sd
x_median <- desc.output[[1]]$quant['median']
x_mode <- CGPfunctions::Mode(x)
x_skew <- desc.output[[1]]$skew
x_kurtosis <- desc.output[[1]]$kurt
if (length(x_mode) >= 4) {
warning(paste("There are",
length(x_mode)),
" modal values displaying just the first 3",
call. = FALSE)
x_mode <- x_mode[c(1, 2, 3)]
}
#### Custom geoms ####
my_jitter_geom <- list()
if (add_jitter) {
my_jitter_geom <- list(
geom_jitter(aes(x = "",
y = x),
width = 0.05,
height = 0,
alpha = .5)
)
}
my_rug_geom <- list()
if (add_rug) {
my_rug_geom <- list(
geom_rug(aes(y = 0),
sides = "b")
)
}
if (is.null(xlim_left)) {
xlim_left <- -3 * x_sd + x_mean
}
if (is.null(xlim_right)) {
xlim_right <- +3 * x_sd + x_mean
}
#### Title, subtitle and caption ####
if (!is.null(title) && title == "Default") {
my_title <- paste0("Distribution of the variable ",
x_name,
" ",
var_explain
)
} else {
my_title <- title
}
make_subtitle <-
function(x,
mean_x,
sd_x,
median_x,
Skew_x,
Kurtosis_x,
k = k) {
ret_subtitle <- bquote("N =" ~ .(length(x)) *
"," ~ bar(X) ~ "=" ~ .(round(mean_x, k)) *
", SD =" ~ .(round(sd_x, k)) *
", Median =" ~ .(round(median_x, k)) *
", Skewness =" ~ .(round(Skew_x, k)) *
", Kurtosis =" ~ .(round(Kurtosis_x, k)
)
)
}
if (!is.null(subtitle) && subtitle == "Default") {
my_subtitle <- make_subtitle(x,
x_mean,
x_sd,
x_median,
x_skew,
x_kurtosis,
k)
} else {
my_subtitle <- subtitle
}
mycaption <- bquote(bar(X) ~ "is" ~ .(mean.line.color) ~
", Median is" ~ .(median.line.color) ~
", Mode is" ~ .(mode.line.color) ~
", z curve is" ~ .(zcurve.color) ~
", t curve is" ~ .(tcurve.color)
)
#### custom function to plot t curve ####
custom_t_function <- function(x, mu, nu, df, ncp) {
dt((x - mu)/nu, df, ncp) / nu
}
#### build the density plot ####
if ("d" %in% tolower(whatplots)) {
p <- ggplot(data.frame(x)) +
aes(x) +
geom_density(fill = data.fill.color, ) +
stat_function(fun = dnorm,
color = zcurve.color,
linetype = zcurve.type,
size = zcurve.size,
args = list(mean = x_mean,
sd = x_sd)
) +
stat_function(fun = custom_t_function,
color = tcurve.color,
linetype = tcurve.type,
size = tcurve.size,
args = list(mu = x_mean,
nu = x_sd,
df = length(x) - 1,
ncp = 0)
) +
geom_vline(xintercept = x_mean,
colour = mean.line.color,
linetype = mean.line.type,
size = mean.line.size) +
geom_vline(xintercept = x_median,
colour = median.line.color,
linetype = median.line.type,
size = median.line.size) +
geom_vline(xintercept = x_mode,
colour = mode.line.color,
linetype = mode.line.type,
size = mode.line.size) +
my_rug_geom +
labs(
title = my_title,
subtitle = my_subtitle,
x = xlab,
caption = mycaption
) +
xlim(xlim_left,
xlim_right) +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank()
)
print(p)
}
#### build the boxplot ####
if ("b" %in% tolower(whatplots)) {
pp <- ggplot(data.frame(x)) +
aes(x) +
labs(
title = my_title,
subtitle = my_subtitle,
y = xlab,
caption = mycaption
) +
stat_boxplot(aes(x = "",
y = x),
geom = "errorbar",
width = 0.2) +
geom_boxplot(aes(x = "",
y = x),
fill = data.fill.color,
outlier.color = data.fill.color) +
coord_flip() +
geom_point(aes(x = "",
y = x_mean),
shape = mean.point.shape,
size = mean.point.size,
fill = mean.line.color) +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank(),
panel.grid.major.y = element_blank()
)
print(pp)
}
#### build the histogram plot ####
if ("h" %in% tolower(whatplots)) {
ppp <- ggplot(data.frame(x)) +
aes(x) +
labs(
title = my_title,
subtitle = my_subtitle,
x = xlab,
caption = mycaption
) +
geom_histogram(bins = binnumber,
color = "black",
fill = data.fill.color) +
my_rug_geom +
geom_vline(xintercept = x_mean,
colour = mean.line.color,
linetype = mean.line.type,
size = mean.line.size) +
geom_vline(xintercept = x_median,
colour = median.line.color,
linetype = median.line.type,
size = median.line.size) +
geom_vline(xintercept = x_mode,
colour = mode.line.color,
linetype = mode.line.type,
size = mode.line.size)
print(ppp)
}
#### build the violin plot ####
if ("v" %in% tolower(whatplots)) {
pppp <- ggplot(data.frame(x)) +
aes(x) +
labs(
title = my_title,
subtitle = my_subtitle,
y = xlab,
caption = mycaption
) +
geom_violin(aes(x = "",
y = x),
fill = data.fill.color) +
my_jitter_geom +
coord_flip() +
geom_point(aes(x = "",
y = x_mean),
shape = mean.point.shape,
size = mean.point.size,
fill = mean.line.color) +
geom_point(aes(x = "",
y = x_median),
shape = median.point.shape,
size = median.point.size,
fill = median.line.color) +
theme(
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank(),
panel.grid.major.y = element_blank()
)
print(pppp)
}
#### return output to console ####
return(desc.output)
} # end function
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.