#' Box plot
#'
#' Creates a box plot
#'
#' This function relies on \href{http://ggplot2.org/}{ggplot2} package to
#' create a box plot with ability to label points. Point labelling has the
#' option to repel text overlapping with
#' \href{https://cran.r-project.org/web/packages/ggrepel/vignettes/ggrepel.html}{ggrepel}.
#' The function supports panel plot layout. By default black-and-white theme is
#' used.
#'
#' @param data Data frame: default dataset to use for plot
#' @param x Character: name of a \code{data} column mapped to x-axis
#' variable, i.e. time
#' @param y Character: name of a \code{data} column mapped to y-axis
#' variable
#' @param group Character: name of a \code{data} column mapped to the color of
#' points
#' @param group_levels Vector/List: a named vector/list that specifies the
#' levels and labels of \code{group}
#' @param label Character: name of a \code{data} column used to label the points
#' when both \code{add_points} and \code{add_label} are \code{TRUE}
#' @param facet_r Character: name of a \code{data} column mapped to the facet
#' row in panel plot layout. Check \code{\link[ggplot2]{facet_grid}} for more
#' details
#' @param facet_c Character: name of a \code{data} column mapped to the facet
#' column in panel plot layout. Check \code{\link[ggplot2]{facet_grid}} for
#' more details
#' @param facet_r_levels Vector/List: a named vector/list that specifies the
#' levels and labels of \code{facet_r}
#' @param facet_c_levels Vector/List: a named vector/list that specifies the
#' levels and labels of \code{facet_c}
#' @param facet_scale Character: Are scales shared across all facets. Refer to
#' the `scale` argument in \code{\link[ggplot2]{facet_grid}}. Default `free`
#' means that scales are not shared
#' @param facet_space Character: Refer to the `space` argument in
#' \code{\link[ggplot2]{facet_grid}}. Default `free` means both height and
#' width will vary
#' @param x_lab Character: x-axis label
#' @param y_lab Character: y-axis label
#' @param group_lab Character: group variable label
#' @param title Character: boxplot title
#' @param x_limit Numeric vector of length 2: limits for x-axis, e.g.
#' \code{c(0, 10)}.
#' @param y_limit Numeric vector of length 2: limits for y-axis, e.g.
#' \code{c(-5, 5)}
#' @param y_log Logical: \code{TRUE} to use log-scale for y-axis and
#' \code{FALSE} (default) otherwise.
#' @param all_colors Vector: a vector of valid color representations for all
#' the point colors. When `group` is mapped to the color of points, specify
#' `all_colors` to change default colors. See \code{\link{is.color}} for valid
#' color definition
#' @param add_points Logical: \code{TRUE} (default) to add original data points,
#' jittered along x-axis
#' @param point_shape integer: valid point shape when \code{add_points} is
#' \code{TRUE}. Refer to \href{http://sape.inf.usi.ch/quick-reference/ggplot2/shape}{ggplot2 Quick Reference: shape}
#' for the details of valid point shape in \href{http://ggplot2.org/}{ggplot2}
#' @param add_legend Logical: \code{TRUE} (default) to show legend and
#' \code{FALSE} otherwise
#' @param legend_pos Character: dictates the location where to place the legend.
#' By default, the legend will be place beneath the actual plot
#' @param reference_hline Numeric vector: locations of horizontal reference
#' line(s) if there is any
#' @param reference_vline Numeric vector: locations of vertical reference
#' line(s) if there is any
#' @param test_func function: the function to be applied to perform statistical
#' test of \code{y} variable against \code{group}, if present, or \code{x}, if
#' \code{group} is not present but \code{x} is. Typically, equality test is of
#' interest and includes Student's t-Test (`test_func = t.test`),
#' Wilcoxon Rank Sum and Signed Rank Tests (`test_func = wilcox.test`),
#' ANOVA test (`test_func = CommonPlots::anova.test`),
#' Kruskal-Wallis Rank Sum Test (`test_func = kruskal.test`).
#' Besides, Jonckheere-Terpstra trend test (DescTools::JonckheereTerpstraTest)
#' is also supported and can be specified as
#' `test_func = CommonPlots::JonckheereTerpstraTest`. In fact, arbitrary test
#' function which takes 'formula' and 'data' as its first two arguments can be
#' passed through \code{test_func}
#' @param test_result (named) String/List: Specify which component from the
#' test, \code{test_func}, to be displayed in the box plot. Note that only one
#' component is allowed and \code{test_result} must be a named component of the
#' test result. When \code{test_func} is specified and \code{test_result} is
#' omitted, the function will guess if the test result includes a component
#' named 'p.value'. If there is, p value will be displayed; otherwise nothing
#' gets displayed. As an example, if `test_result = c('P value:' = 'p.value')`,
#' the string 'P value: {pvalue}' will be displayed in the box plot, where
#' {pvalue} is the numeric p value. In the case of panel plot, the specified
#' test will be carried out in each panel and result gets displayed
#' respectively
#' @param x_tick_angle Numeric: the orientation angle (in [0, 360]) of the
#' x-axis tick marks. By default, the label will be horizontal.
#' @param sample_size Logical: \code{TRUE} (default) to place sample size
#' annotation along x-axis tick marks and \code{FALSE} otherwise
#' @param sample_size_font_size Numeric: font size of sample size annotation
#' @param add_label Logical: whether or not to add labels for the points
#' Default is set to `FALSE`
#' @param is_repel Logical: whether or not to avoid text overlapping using
#' \href{https://cran.r-project.org/web/packages/ggrepel/vignettes/ggrepel.html}{ggrepel}
#' @param jitter_factor Numeric: determines how much the points are jittered
#' over x-axis when `add_points = TRUE`. By default, `jitter_factor = 1`
#' @param bw_theme Logical: If `TRUE` (default), black-and-white theme will be
#' used. Refer to \code{\link[ggplot2]{theme_bw}} for more details
#' @param grids Character: grids option. Must be one of `c('on', 'major',
#' 'off')` with 'on' having both major and minor grids, 'major' having only
#' major grids, and 'off' having no grids
#' @param randseed Numeric: random seed can be set in producing jittered points
#' when \code{add_points} is \code{TRUE}. By default, no random seed is set,
#' i.e. `randseed = NULL`
#'
#' @return An object of class ggplot (if \code{return_data = FALSE}) or a list
#' of two components: an object of class ggplot and a data frame that used to
#' generate the plot
#' @examples
#' library(CommonPlots)
#' data <- mpg
#' data$cyl <- factor(data$cyl)
#' boxplot_ <- gg_boxplot(
#' data, x = 'cyl', y = 'displ', group = 'year',
#' x_lab = 'Number of cylinders', y_lab = 'Engine displacement (L)',
#' group_lab = 'Year', title = ''
#' )
#' # boxplot with Student's t-Test performed on engine displacement against year
#' boxplot_ <- gg_boxplot(
#' data, x = 'cyl', y = 'displ', group = 'year', test_func = t.test,
#' test_result = c('T test p value:' = 'p.value'),
#' x_lab = 'Number of cylinders', y_lab = 'Engine displacement (L)',
#' group_lab = 'Year', title = ''
#' )
#' boxplot_ <- gg_boxplot(
#' diamonds, x = 'cut', y = 'price', group = 'clarity',
#' group_levels = c('SI2', 'VS2', 'VVS2'), facet_r = 'color',
#' facet_r_levels = c('Color = D' = 'D', 'Color = G' = 'G', 'Color = J' = 'J'),
#' x_lab = 'Cut', y_lab = 'Price', group_lab = 'Clarity', title = '',
#' sample_size_font_size = 4
#' )
#' # boxplot with ANOVA test on diamond price against clarify
#' boxplot_ <- gg_boxplot(
#' diamonds, x = 'cut', y = 'price', group = 'clarity',
#' group_levels = c('SI2', 'VS2', 'VVS2'), facet_r = 'color',
#' facet_r_levels = c('Color = D' = 'D', 'Color = G' = 'G', 'Color = J' = 'J'),
#' test_func = anova.test, test_result = c('ANOVA p value:' = 'p.value'),
#' x_lab = 'Cut', y_lab = 'Price', group_lab = 'Clarity', title = '',
#' sample_size_font_size = 4
#' )
#'
#'
#' @import dplyr
#' @export
#' @author Feiyang Niu (Feiyang.Niu@gilead.com)
gg_boxplot <- function(data, x = NULL, y, group = NULL, group_levels = NULL,
label = NULL, facet_r = NULL, facet_c = NULL,
facet_r_levels = NULL, facet_c_levels = NULL,
facet_scale = 'free', facet_space = 'free',
x_lab = x, y_lab = y, group_lab = group, title = '',
x_limit = NULL, y_limit = NULL, y_log = FALSE,
all_colors = NULL, add_points = TRUE, point_shape = 19,
add_legend = TRUE, legend_pos = 'bottom',
reference_hline = NULL, reference_vline = NULL,
test_func = NULL, test_result = c('p value' = 'p.value'),
x_tick_angle = 0, sample_size = TRUE,
sample_size_font_size = 3, add_label = FALSE,
is_repel = FALSE, jitter_factor = 1, bw_theme = TRUE,
grids = 'on', randseed = NULL) {
#-----------------------------
# argument match & error catch
#-----------------------------
if(!is.data.frame(data)) {
tryCatch(
data <- as.data.frame(data),
error = function(err) {stop(
'data must be a data frame or data frame convertable'
)}
)
}
has_x_var <- !is_blank(x)
if(has_x_var) column_in_dataframe(data, x)
column_in_dataframe(data, y)
if(!is_blank(label)) column_in_dataframe(data, label)
if(!is_blank(facet_r)) column_in_dataframe(data, facet_r)
if(!is_blank(facet_c)) column_in_dataframe(data, facet_c)
if(!is_blank(group)) column_in_dataframe(data, group)
if(!is_blank(x_limit)) check_var_class(x_limit, is.numeric, 'numeric')
if(!is_blank(y_limit)) check_var_class(y_limit, is.numeric, 'numeric')
y_log <- isTRUE(y_log)
add_points <- isTRUE(add_points)
add_legend <- isTRUE(add_legend)
sample_size <- isTRUE(sample_size)
add_label <- isTRUE(add_label)
bw_theme <- isTRUE(bw_theme)
arg_in_choices(facet_scale, c('free', 'free_x', 'free_y', 'fixed'))
arg_in_choices(facet_space, c('free', 'free_x', 'free_y', 'fixed'))
if(add_legend) arg_in_choices(legend_pos, c('left', 'right', 'bottom', 'up'))
if(!is_blank(reference_hline))
check_var_class(reference_hline, is.numeric, 'numeric')
if(!is_blank(reference_vline))
check_var_class(reference_vline, is.numeric, 'numeric')
arg_in_choices(grids, c('on', 'x', 'y', 'off'))
#-----------------------------
# define constants and vars
#-----------------------------
alignment_dict <- list(
'left' = 0, 'center' = 0.5, 'right' = 1,
'up' = 1, 'bottom' = 0
)
# create x as constant 1L if not specified
if(!has_x_var) {
x <- 'x_var'
data[[x]] <- factor(1)
}
# convert x to numeric if not
if(!is.numeric(data[[x]])) {
data[[x]] <- factor(data[[x]])
x_tick_labels <- levels(data[[x]])
data[[x]] <- as.numeric(data[[x]])
x_axis_breaks <- sort(unique_na(data[[x]]))
# default expand on discrete x-axis
x_expand <- c(0, 0.3)
} else {
x_axis_breaks <- NULL
x_tick_labels <- NULL
# default expand on continuous x-axis
x_expand <- c(0.05, 0)
}
group_list <- NULL
if(!is_blank(facet_r)) group_list <- c(group_list, facet_r)
if(!is_blank(facet_c)) group_list <- c(group_list, facet_c)
if(!is_blank(group)) group_list <- c(group_list, group)
group_list <- unique(c(group_list, x))
group_cols <- data[, group_list, drop = FALSE]
data <- mutate(data, x_var_factor = do.call(interaction, as.list(group_cols)))
dodge_boxplot_factor <- 0.75
resolution_x <- ggplot2::resolution(
sort(unique_na(data[[x]])), zero = FALSE
)
dodge_width <- resolution_x * dodge_boxplot_factor
outlier_shape <- ifelse(add_points, NA, point_shape)
x_point_var <- 'x_point_'
data[[x_point_var]] <- as.integer(data[[x]])
#---------------------------
# data manipulation
#---------------------------
ngroups <- 1
if(!is_blank(group)) {
group_levels <- unlist(group_levels)
if(is.null(group_levels)) group_levels <- sort(unique(data[[group]]))
data[[group]] <- factor(data[[group]], levels = group_levels,
ordered = TRUE)
data <- data[!is.na(data[[group]]), , drop = F]
if(!is.null(names(group_levels))) {
levels(data[[group]]) <- names(group_levels)
}
ngroups <- nlevels(data[[group]])
}
if(!is_blank(facet_r)) {
facet_r_levels <- unlist(facet_r_levels)
if(is.null(facet_r_levels))
facet_r_levels <- sort(unique(data[[facet_r]]))
data[[facet_r]] <- factor(data[[facet_r]], levels = facet_r_levels,
ordered = TRUE)
data <- data[!is.na(data[[facet_r]]), , drop = F]
if(!is.null(names(facet_r_levels))) {
levels(data[[facet_r]]) <- names(facet_r_levels)
}
}
if(!is_blank(facet_c)) {
facet_c_levels <- unlist(facet_c_levels)
if(is.null(facet_c_levels))
facet_c_levels <- sort(unique(data[[facet_c]]))
data[[facet_c]] <- factor(data[[facet_c]], levels = facet_c_levels)
data <- data[!is.na(data[[facet_c]]), , drop = F]
if(!is.null(names(facet_c_levels))) {
levels(data[[facet_c]]) <- names(facet_c_levels)
}
}
if(ngroups > 1) {
shift <- (as.integer(data[[group]]) - median(1:ngroups)) / ngroups
data[[x_point_var]] <- data[[x_point_var]] + shift * dodge_width
}
if(add_points) {
data[[x_point_var]] <- gg_jitter(
data[[x_point_var]], dodge_width * jitter_factor * 0.2, randseed
)
}
#-----------------------------
# make the plot
#-----------------------------
plot_ <- gg_wrapper(
data = data,
aes_string(x = paste0('`', x, '`'), y = paste0('`', y, '`')),
facet_r = facet_r, facet_c = facet_c,
facet_scale = facet_scale, facet_space = facet_space,
is_x_continuous = TRUE, is_y_continuous = TRUE,
x_lab = x_lab, y_lab = y_lab, title = title,
x_limit = x_limit, y_limit = y_limit,
x_log = FALSE, y_log = y_log,
x_axis_breaks = x_axis_breaks, x_tick_labels = x_tick_labels,
x_tick_angle = x_tick_angle,
add_legend = add_legend, legend_pos = legend_pos,
color_var = group, all_colors = all_colors, color_lab = group_lab,
reference_hline = reference_hline, reference_vline = reference_vline,
bw_theme = bw_theme, grids = grids, x_expand = x_expand
)
plot_ <- plot_ +
geom_boxplot(aes(group = x_var_factor), outlier.shape = outlier_shape,
position = position_dodge(dodge_width), alpha = 0.5)
# add points
if(add_points) {
point_size <- 1
point_alpha <- 0.5
plot_ <- plot_ +
geom_point(aes_string(x = x_point_var), shape = point_shape,
size = point_size, alpha = point_alpha)
# add labels to the points
if(add_label && !is_blank(label)) {
if(is_repel) {
plot_ <- plot_ +
ggrepel::geom_text_repel(
aes_string(x = x_point_var, label = label),
show.legend = FALSE
)
} else {
plot_ <- plot_ +
geom_text(aes_string(x = x_point_var, label = label),
vjust = 'inward', hjust = 'inward',
show.legend = FALSE)
}
}
}
# add statistical test result
cond_test <- (!is.null(x) && length(unique_na(data[[x]])) > 1) ||
(!is.null(group) && length(unique_na(data[[group]])) > 1)
add_test_result <- FALSE
if(cond_test && !is_blank(test_func)) {
if(!is.function(test_func))
print('Warning: `test_func` must be a valid function')
else {
group_test <- NULL
if(!is_blank(facet_r)) group_test <- c(group_test, facet_r)
if(!is_blank(facet_c)) group_test <- c(group_test, facet_c)
test_on_x <- FALSE
if(length(unique_na(data[[x]])) > 1 &&
!is_blank(group) && length(unique_na(data[[group]])) > 1) {
group_test <- c(group_test, x)
test_on_x <- TRUE
}
factor_test <- ifelse(
!is.null(group) && length(unique_na(data[[group]])) > 1, group, x
)
formula_test <- formula(paste(y, '~', factor_test))
results_test <- data %>%
group_by_(.dots = lapply(group_test, as.symbol)) %>%
do(test = tryCatch(
suppressMessages(test_func(formula_test, data = .)),
error = function(e) {NA}
))
if(length(test_result) > 1) {
print('Warning: `test_result` can only be of length 1')
} else {
if(is.null(test_result)) {
test_result <- setNames(
'p.value',
paste(as.character(substitute(test_func)), 'p value:')
)
}
if(is_blank(group_test)) {
res_expr <- lazyeval::interp(
~tryCatch(var[[1]][[test_result]], error = function(e){NA}),
var = as.name('test')
)
} else {
res_expr <- lazyeval::interp(
~tryCatch(var[[test_result]], error = function(e){NA}),
var = as.name('test')
)
}
if(test_on_x)
x_expr <- lazyeval::interp(~as.integer(var), var = as.name(x))
else x_expr <- ~(-Inf)
dots_summarise <- setNames(c(
as.list(group_test), list(res_expr, x_expr, ~(Inf))
), c(group_test, 'result_', 'x_', 'y_'))
results_test <- results_test %>%
summarise_(.dots = dots_summarise)
if(all(is.na(results_test$result_))) {
print('Warning: `test_result` is not found in the test results')
} else {
results_test <- mutate(results_test, result_ = trimws(paste(
names(test_result), round(result_, 3)
)))
add_test_result <- TRUE
}
}
}
}
if(add_test_result) {
if(test_on_x) hjust = 0.5 else hjust = -0.2
plot_ <- plot_ +
geom_text(data = results_test,
aes(x_, y_, label = result_),
inherit.aes = FALSE,
hjust = hjust, vjust = 1.5)
}
# add sample size
if(sample_size) {
nrows <- 1
if(!is_blank(facet_r)) nrows <- nlevels(factor(data[[facet_r]]))
# for sample size annotation
ss_factor <- 0.04 * sample_size_font_size / 3
fnote_size_ss <- sample_size_font_size / (1 + 0.5 * (nrows - 1))
center_aligned <- 0.5
slightly_right_aligned <- 0.7
group_list_ss <- c()
if(!is_blank(facet_r)) group_list_ss <- c(group_list_ss, facet_r)
if(!is_blank(facet_c)) group_list_ss <- c(group_list_ss, facet_c)
yrange <- data %>%
group_by_(.dots = lapply(group_list, as.symbol)) %>%
do(res = grDevices::extendrange(
if(y_log) log(range_na(.[[y]]), base = 10)
else range_na(.[[y]])
)) %>%
mutate(ymin = unlist(res)[1], ymax = unlist(res)[2]) %>%
ungroup() %>%
group_by_(.dots = lapply(group_list_ss, as.symbol)) %>%
summarise(ymin = min(ymin), ymax = max(ymax))
dots_ss <- setNames(list(
lazyeval::interp(~n_nna(var), var = as.name(y)),
lazyeval::interp(~unique_na(var), var = as.name(x))
), c('n', 'x'))
data_ss <- data %>%
group_by_(.dots = lapply(group_list, as.symbol)) %>%
summarise_(.dots = dots_ss)
dots_y_pos <- list(y = lazyeval::interp(
~min_y-ss_factor * (max_y-min_y) * (as.integer(unique_na(var_g))-1),
min_y = as.name('ymin'), max_y = as.name('ymax'),
var_g = if(is_blank(group)) 1 else as.name(group)
))
if(length(group_list_ss) == 0) {
data_ss$ymin <- yrange$ymin
data_ss$ymax <- yrange$ymax
} else {
data_ss <- suppressMessages(left_join(data_ss, yrange))
}
data_ss <- data_ss %>% mutate_(.dots = dots_y_pos)
# align y's in different panels
if(!is_blank(facet_c)) {
group_list_y <- c()
if(!is_blank(facet_r)) group_list_y <- c(group_list_y, facet_r)
if(!is_blank(group)) group_list_y <- c(group_list_y, group)
data_ss <- data_ss %>%
ungroup() %>%
group_by_(.dots = lapply(group_list_y, as.symbol)) %>%
mutate(y = min(y))
if(facet_scale %in% c('fixed', 'free_x')) {
data_ss <- data_ss %>%
ungroup() %>%
group_by_(.dots = ternary(is_blank(group), list(),
as.symbol(group))) %>%
mutate(y = min(y))
}
}
if(y_log) data_ss$y <- 10^data_ss$y
x_1 <- sort(unique_na(data_ss$x))[1]
data_ss_1 <- filter(data_ss, x == x_1)
plot_ <- plot_ +
geom_text(data = data_ss_1, show.legend = FALSE,
aes(label = paste0('N=', data_ss_1$n), x = x, y = y),
size = fnote_size_ss, hjust = slightly_right_aligned)
if(length(unique_na(data_ss$x)) > 1) {
data_ss_rest <- filter(data_ss, x != x_1)
plot_ <- plot_ +
geom_text(data = data_ss_rest, show.legend = FALSE,
aes(label = data_ss_rest$n, x = x, y = y),
size = fnote_size_ss, hjust = center_aligned)
}
}
if(!has_x_var) {
plot_ <- plot_ +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
}
return(plot_)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.