#' Time-profiling Plots
#'
#' Creats time-profiling plots with
#' \href{https://en.wikipedia.org/wiki/Panel_data}{longitudinal data}
#'
#' This function relies on \href{http://ggplot2.org/}{ggplot2} package to
#' create time-profiling plots including: Mean + SD/SE plot, Mean + Median
#' plot, Boxplot, Spaghetti plot and their mixture. Labels can be added to the
#' end of each line using
#' \href{http://directlabels.r-forge.r-project.org/}{directlabels}.
#' Panel plot layout is supported. By default black-and-white theme is used.
#'
#' @param data Data frame: default dataset to use for plot
#' @param x Character: name of a \code{data} column mapped to x-axis
#' variable, i.e. time
#' @param y Character: name of a \code{data} column mapped to y-axis
#' variable
#' @param subject Character: name of a \code{data} column mapped to subject IDs
#' or other types of IDs that define membership of repeated measurements in
#' the data
#' @param group Character: name of a \code{data} column mapped to the fill of
#' bars
#' @param group_levels Vector/List: a named vector/list that specifies the
#' levels and labels of \code{group}
#' @param facet_r Character: name of a \code{data} column mapped to the facet
#' row in panel plot layout. Check \code{\link[ggplot2]{facet_grid}} for more
#' details
#' @param facet_c Character: name of a \code{data} column mapped to the facet
#' column in panel plot layout. Check \code{\link[ggplot2]{facet_grid}} for
#' more details
#' @param facet_r_levels Vector/List: a named vector/list that specifies the
#' levels and labels of \code{facet_r}
#' @param facet_c_levels Vector/List: a named vector/list that specifies the
#' levels and labels of \code{facet_c}
#' @param facet_scale Character: Are scales shared across all facets. Refer to
#' the `scale` argument in \code{\link[ggplot2]{facet_grid}}. Default `free`
#' means that scales are not shared
#' @param facet_space Character: Refer to the `space` argument in
#' \code{\link[ggplot2]{facet_grid}}. Default `free` means both height and
#' width will vary
#' @param x_lab Character: x-axis label
#' @param y_lab Character: y-axis label
#' @param group_lab Character: group variable label
#' @param title Character: barplot title
#' @param x_limit Numeric vector of length 2: limits for x-axis, e.g.
#' \code{c(0, 10)}.
#' @param y_limit Numeric vector of length 2: limits for y-axis, e.g.
#' \code{c(-5, 5)}
#' @param x_tick_label Vector: x-axis tick marks
#' @param all_xticks Logical: \code{TRUE} (default) to display all marks
#' specified as the unique values of \code{x_tick_label}
#' @param x_tick_angle Numeric: the orientation angle (in [0, 360]) of the
#' x-axis tick marks. By default, the label will be horizontal.
#' @param y_tick_angle Numeric: the orientation angle (in [0, 360]) of the
#' y-axis tick marks. By default, the label will be horizontal.
#' @param geoms Character vector: specify geometric components to be included
#' in the plot. Valid geoms are 'boxplot', 'line', 'point', 'sumline'. For
#' example, if the user needs to create a boxplot with raw data points,
#' specify \code{geoms = c('boxplot', 'point')}; if the user wants to create a
#' Mean + SE plot, let \code{geoms = c('sumline', 'point')} together with
#' specification of \code{avg_method} and \code{var_method}
#' @param avg_method Character: dicates whether to use \code{\link[base]{mean}}
#' (\code{avg_method = 'mean'}) or \code{\link[stats]{median}}. Be noted that
#' only these two options are permitted and both methods will remove \code{NA}s
#' in calculation
#' @param var_method Character: dicates whether to use standard error
#' (\code{var_method = 'se'}) or standard deviation (\code{var_method = 'sd'}).
#' Be noted that only these two options are permitted and both methods will
#' remove \code{NA}s in calculation
#' @param y_log Logical: \code{TRUE} to use log-scale for y-axis and
#' \code{FALSE} (default) otherwise.
#' @param sample_size Logical: \code{TRUE} (default) to place sample size
#' annotation along x-axis tick marks and \code{FALSE} otherwise
#' @param sample_size_font_size Numeric: font size of sample size annotation
#' @param reference_hline Numeric vector: locations of horizontal reference
#' line(s) if there is any
#' @param reference_vline Numeric vector: locations of vertical reference
#' line(s) if there is any
#' @param subject_show Logical: \code{TRUE} to show subject IDs (or other types)
#' and \code{FALSE} (default) otherwise
#' @param add_legend Logical: \code{TRUE} (default) to show legend and
#' \code{FALSE} otherwise
#' @param legend_pos Character: dictates the location where to place the legend.
#' Possible locations are 'left', 'right', 'bottom', 'top'. By default, the
#' legend will be place beneath the actual plot, i.e. `legend_pos = 'bottom'`
#' @param all_colors Vector of valid color representation: when \code{group} is
#' present, categories of \code{group} will take different color for specified
#' geometric objects. This argument allows the user to overwrite default color
#' theme. See \code{col} argument in \code{\link[grDevices]{col2rgb}} for types
#' of valid color representation
#' @param all_linetypes Numeric vector: by default, lines of different
#' categories of \code{group}, when present, will be solid lines. This argument
#' allows the user to overwrite the default line types. See
#' \href{http://sape.inf.usi.ch/quick-reference/ggplot2/linetype}{linetypes}
#' for valid ggplot2 linetype specification
#' @param bw_theme Logical: if `TRUE` (default), black-and-white theme will be
#' used. Refer to \code{\link[ggplot2]{theme_bw}} for more details
#' @param jitter_factor Numeric: determines how much the points are jittered
#' over x-axis when `geoms` includes 'point'. By default, `jitter_factor = 1`
#' @param grids Character: control the grids. If `on` (default), grids will be
#' drawn; if `x`, only grids on x-axis will be drawn; if `y`, only grids on
#' y-axis will be drawn; if `off`, no grids will be drawn
#' @param randseed Numeric: random seed can be set in producing jittered points
#' when \code{geoms} includes `point`. By default, no random seed is set,
#' i.e. `randseed = NULL`
#' @param return_data Logical: \code{TRUE} to return both plot and the data
#' used to produce the plot and \code{FALSE} (default) to return only plot
#'
#' @return An object of class ggplot (if \code{return_data = FALSE}) or a list
#' of two components: an object of class ggplot and a data frame that used to
#' generate the plot
#' @examples
#' library(CommonPlots)
#' data(koopLME)
#'
#' # Mean + SE plot
#' data <- koopLME
#' data$siblings <- factor(with(data, ifelse(SIBLINGS >= 3, '>= 3', SIBLINGS)),
#' levels = c(0, 1, 2, '>= 3'))
#' mean_se_plot <- gg_time_profiling(
#' data, x = 'TIMETRND', y = 'LOGWAGE', group = 'siblings',
#' geoms = c('sumline'), avg_method = 'mean',
#' var_method = 'se', x_lab = 'Time trend',
#' y_lab = 'Log of hourly wage', group_lab = 'Number of siblings',
#' title = 'Mean + SE plot'
#' )
#'
#' # Median + IQR plot
#' data <- koopLME
#' data$siblings <- factor(with(data, ifelse(SIBLINGS >= 3, '>= 3', SIBLINGS)),
#' levels = c(0, 1, 2, '>= 3'))
#' sib_labels <- paste('Number of siblings:', levels(data$siblings))
#' median_iqr_plot <- gg_time_profiling(
#' data, x = 'TIMETRND', y = 'LOGWAGE', facet_c = 'siblings',
#' facet_c_levels = setNames(levels(data$siblings), sib_labels),
#' geoms = c('sumline'), avg_method = 'median',
#' x_lab = 'Time trend', y_lab = 'Log of hourly wage',
#' title = 'Median + IQR plot'
#' )
#'
#' # Spaghetti plot
#' data <- koopLME
#' data <- with(data, data[PERSONID %in% unique(PERSONID)[1:100], ])
#' spaghetti_plot <- gg_time_profiling(
#' data, x = 'TIMETRND', y = 'LOGWAGE', subject = 'PERSONID',
#' geoms = c('line'), x_lab = 'Time trend',
#' y_lab = 'Log of hourly wage', title = 'Spaghetti plot',
#' subject_show = TRUE
#' )
#'
#' # Box plot over time
#' data <- koopLME
#' data$siblings <- factor(with(data, ifelse(SIBLINGS >= 3, '>= 3', SIBLINGS)),
#' levels = c(0, 1, 2, '>= 3'))
#' sib_labels <- paste('Number of siblings:', levels(data$siblings))
#' boxplot <- gg_time_profiling(
#' data, x = 'TIMETRND', y = 'LOGWAGE', facet_r = 'siblings',
#' facet_r_levels = setNames(levels(data$siblings), sib_labels),
#' geoms = c('boxplot', 'point'), x_lab = 'Time trend',
#' y_lab = 'Log of hourly wage', title = 'Box plot',
#' sample_size = FALSE, all_xticks = TRUE
#' )
#'
#'
#' @import dplyr
#' @import directlabels
#' @export
#' @author Feiyang Niu (Feiyang.Niu@gilead.com)
gg_time_profiling <- function(data, x, y, subject = NULL,
group = NULL, group_levels = NULL,
facet_r = NULL, facet_c = NULL,
facet_r_levels = NULL, facet_c_levels = NULL,
facet_scale = 'free', facet_space = 'fixed',
x_lab = x, y_lab = y, group_lab = group,
title = '', x_limit = NULL, y_limit = NULL,
x_tick_label = x, all_xticks = FALSE,
x_tick_angle = 0, y_tick_angle = 0,
geoms = c('point', 'line', 'sumline', 'boxplot'),
avg_method = 'mean', var_method = 'se',
y_log = FALSE, sample_size = TRUE,
sample_size_font_size = 3,
reference_hline = NULL, reference_vline = NULL,
subject_show = FALSE, subject_label_size = 0.8,
add_legend = TRUE, legend_pos = 'bottom',
all_colors = NULL, all_linetypes = NULL,
jitter_factor = 1, grids = 'on', bw_theme = TRUE,
randseed = NULL, return_data = FALSE) {
#---------------------------
# error-catch
#---------------------------
if(!is.data.frame(data)) stop('data must be a data frame')
all_columns <- names(data)
if(!all(x %in% all_columns, y %in% all_columns))
stop('x & y must both be columns of data')
if(!is_blank(group)) {
if(!(group %in% all_columns))
stop('group must be a column of data')
}
if(!is_blank(subject)) {
if(!(subject %in% all_columns))
stop('subject must be a column of data')
}
if(!is_blank(facet_r)) {
if(!(facet_r %in% all_columns)) stop('facet_r must be a column of data')
}
if(!is_blank(facet_c)) {
if(!(facet_c %in% all_columns)) stop('facet_c must be a column of data')
}
x_is_numeric <- TRUE
tryCatch(
data[[x]] <- as.numeric(data[[x]]),
warning = function(w) {
stop('x column must be numeric or numeric-convertable')
}
)
#---------------------------
# argument match
#---------------------------
y_log <- isTRUE(y_log)
sample_size <- isTRUE(sample_size)
add_legend <- isTRUE(add_legend)
return_data <- isTRUE(return_data)
all_xticks <- isTRUE(all_xticks)
subject_show <- isTRUE(subject_show)
bw_theme <- isTRUE(bw_theme)
if(!is.null(avg_method))
avg_method <- match.arg(tolower(avg_method),choices = c('mean', 'median'))
if(!is.null(var_method))
var_method <- match.arg(tolower(var_method), choices = c('sd', 'se'))
geoms <- tolower(geoms)
legend_pos <- match.arg(tolower(legend_pos),
c('left', 'right', 'bottom', 'top'))
arg_in_choices(grids, c('on', 'x', 'y', 'off'))
#---------------------------
# define constants (those could be moved to the function arguments)
#---------------------------
point_shape <- 19
point_size <- 1
point_alpha <- 0.5
dodge_boxplot_factor <- 0.75
dodge_line_factor <- 0.4
errorbar_factor <-0.6
x_point_var <- 'x_point'
#---------------------------
# data manipulation
#---------------------------
data <- data[!is.na(data[[x]]) & !is.na(data[[y]]), ]
if(!is.null(x_limit)) {
data <- data[data[[x]] >= x_limit[1] & data[[x]] <= x_limit[2], ]
}
unique_x <- sort(unique_na(data[[x]]))
group_list <- NULL
nrows <- 1
if(!is_blank(facet_r)) {
facet_r_levels <- unlist(facet_r_levels)
if(is.null(facet_r_levels))
facet_r_levels <- sort(unique(data[[facet_r]]))
data[[facet_r]] <- factor(data[[facet_r]], levels = facet_r_levels)
data <- data[!is.na(data[[facet_r]]), , drop = F]
if(!is.null(names(facet_r_levels))) {
levels(data[[facet_r]]) <- names(facet_r_levels)
}
group_list <- c(group_list, facet_r)
nrows <- nlevels(data[[facet_r]])
}
ncols <- 1
if(!is_blank(facet_c)) {
facet_c_levels <- unlist(facet_c_levels)
if(is.null(facet_c_levels))
facet_c_levels <- sort(unique(data[[facet_c]]))
data[[facet_c]] <- factor(data[[facet_c]], levels = facet_c_levels)
data <- data[!is.na(data[[facet_c]]), , drop = F]
if(!is.null(names(facet_c_levels))) {
levels(data[[facet_c]]) <- names(facet_c_levels)
}
group_list <- c(group_list, facet_c)
ncols <- nlevels(data[[facet_c]])
}
ngroups <- 1
if(!is_blank(group)) {
group_levels <- unlist(group_levels)
if(is.null(group_levels)) group_levels <- sort(unique(data[[group]]))
data[[group]] <- factor(data[[group]], levels = group_levels)
data <- data[!is.na(data[[group]]), , drop = F]
if(!is.null(names(group_levels))) {
levels(data[[group]]) <- names(group_levels)
}
group_list <- c(group_list, group)
ngroups <- nlevels(data[[group]])
}
group_list <- c(group_list, x)
# create jitter for geom_point manually if present
resolution_x <- ggplot2::resolution(unique_x, zero = FALSE)
dodge_line <- min(diff(range(unique_x))/50, resolution_x*dodge_line_factor)
dodge_boxplot <- resolution_x * dodge_boxplot_factor
dodge_geom <- ifelse('boxplot' %in% geoms, dodge_boxplot, dodge_line)
dodge_ <- position_dodge(dodge_geom)
if('point' %in% geoms) {
if(ngroups > 1) {
shift <- (as.integer(data[[group]]) - median(1:ngroups)) / ngroups
data[[x_point_var]] <- data[[x]] + shift * dodge_geom
} else data[[x_point_var]] <- data[[x]]
data[[x_point_var]] <- gg_jitter(
data[[x_point_var]], dodge_geom * jitter_factor * 0.25, randseed
)
}
# define expand in ggplot2 axis
if(sample_size) x_expand <- c(0.01, 0) else x_expand <- NULL
#---------------------------
# make the plot
#---------------------------
plot_ <- gg_wrapper(
data, aes_string(x = paste0('`', x, '`'),
y = paste0('`', y, '`')),
facet_r = facet_r, facet_c = facet_c,
y_log = y_log, x_lab = x_lab, y_lab = y_lab, title = title,
x_limit = NULL, y_limit = y_limit,
x_tick_angle = x_tick_angle, y_tick_angle = y_tick_angle,
facet_scale = facet_scale, facet_space = facet_space,
add_legend = add_legend, legend_pos = legend_pos,
color_var = group, all_colors = all_colors, color_lab = group_lab,
linetype_var = if(is_blank(all_linetypes)) NULL else group,
all_linetypes = all_linetypes, linetype_lab = group_lab,
reference_hline = reference_hline,
reference_vline = reference_vline,
x_expand = x_expand, bw_theme = bw_theme, grids = grids
)
if('boxplot' %in% geoms) {
group_cols <- data[, group_list, drop = FALSE]
data_boxplot <- data %>%
mutate(x_factor = do.call(interaction, as.list(group_cols)))
outlier_shape <- ifelse('point' %in% geoms, NA, point_shape)
plot_ <- plot_ +
geom_boxplot(data = data_boxplot, position = dodge_,
aes_string(x = x, group = 'x_factor'),
outlier.shape = outlier_shape, alpha = 0.5)
}
if('line' %in% geoms) {
if(is.null(subject)) stop('please provide subject')
plot_ <- plot_ + geom_line(aes_string(group = subject),
show.legend = FALSE)
if(subject_show) {
plot_ <- plot_ + geom_dl(aes_string(label = subject),
method = list(dl.combine('last.points'),
cex = subject_label_size))
}
}
if('sumline' %in% geoms) {
if(avg_method == 'median') {
fun_y_ <- stats::median
fun_data_ <- median_iqr
} else if(avg_method == 'mean') {
fun_y_ <- mean_na
if(var_method == 'se') fun_data_ <- mean_se
else if(var_method == 'sd') fun_data_ <- mean_sd
}
plot_ <- plot_ +
stat_summary(fun.y = fun_y_, geom = 'line', position = dodge_) +
stat_summary(fun.y = fun_y_, geom = 'point', position = dodge_) +
stat_summary(fun.data = fun_data_, geom = 'errorbar',
width = dodge_geom * errorbar_factor,
position = dodge_, linetype = 'solid')
}
if('point' %in% geoms) {
plot_ <- plot_ + geom_point(aes(x = x_point), shape = point_shape,
size = point_size, alpha = point_alpha)
}
#---------------------------
# add sample size
#---------------------------
if(sample_size) {
# for sample size annotation
ss_factor <- 0.04 * sample_size_font_size / 3
fnote_size_ss <- sample_size_font_size / (1 + 0.5 * (nrows - 1))
center_aligned <- 0.5
slightly_right_aligned <- 0.7
# calculate x-/y-axis range
if(any(c('point', 'line', 'boxplot') %in% geoms)) {
fun_yrange <- range_na
} else if('sumline' %in% geoms) {
fun_yrange <- function(x) {
res <- fun_data_(x)
if(is.na(res$ymin) || is.na(res$ymax)) return(c(res$y, res$y))
return(c(res$ymin, res$ymax))
}
}
group_list_ss <- c()
if(!is_blank(facet_r)) group_list_ss <- c(group_list_ss, facet_r)
if(!is_blank(facet_c)) group_list_ss <- c(group_list_ss, facet_c)
yrange <- data %>%
group_by_(.dots = lapply(group_list, as.symbol)) %>%
do(res = grDevices::extendrange(
if(y_log) log(fun_yrange(.[[y]]), base = 10)
else fun_yrange(.[[y]])
)) %>%
mutate(ymin = unlist(res)[1], ymax = unlist(res)[2]) %>%
ungroup() %>%
group_by_(.dots = lapply(group_list_ss, as.symbol)) %>%
summarise(ymin = min(ymin), ymax = max(ymax))
dots_ss <- setNames(list(
lazyeval::interp(~n_nna(var), var = as.name(y)),
lazyeval::interp(~unique_na(var), var = as.name(x))
), c('n', 'x'))
data_ss <- data %>%
group_by_(.dots = lapply(group_list, as.symbol)) %>%
summarise_(.dots = dots_ss)
dots_y_pos <- list(y = lazyeval::interp(
~min_y-ss_factor*(max_y-min_y)*(as.integer(unique_na(var_g))-1),
min_y = as.name('ymin'), max_y = as.name('ymax'),
var_g = if(is_blank(group)) 1 else as.name(group)
))
if(length(group_list_ss) == 0) {
data_ss$ymin <- yrange$ymin
data_ss$ymax <- yrange$ymax
} else {
data_ss <- suppressMessages(left_join(data_ss, yrange))
}
data_ss <- data_ss %>% mutate_(.dots = dots_y_pos)
# align y's in different panels
if(!is_blank(facet_c)) {
group_list_y <- c()
if(!is_blank(facet_r)) group_list_y <- c(group_list_y, facet_r)
if(!is_blank(group)) group_list_y <- c(group_list_y, group)
data_ss <- data_ss %>%
ungroup() %>%
group_by_(.dots = lapply(group_list_y, as.symbol)) %>%
mutate(y = min(y))
if(facet_scale %in% c('fixed', 'free_x')) {
data_ss <- data_ss %>%
ungroup() %>%
group_by_(.dots = ternary(is_blank(group), list(),
as.symbol(group))) %>%
mutate(y = min(y))
}
}
if(y_log) data_ss$y <- 10^data_ss$y
x_1 <- sort(unique_na(data_ss$x))[1]
data_ss_1 <- filter(data_ss, x == x_1)
plot_ <- plot_ +
geom_text(data = data_ss_1, show.legend = FALSE,
aes(label = paste0('N=', data_ss_1$n), x = x, y = y),
size = fnote_size_ss, hjust = slightly_right_aligned)
if(length(unique_na(data_ss$x)) > 1) {
data_ss_rest <- filter(data_ss, x != x_1)
plot_ <- plot_ +
geom_text(data = data_ss_rest, show.legend = FALSE,
aes(label = data_ss_rest$n, x = x, y = y),
size = fnote_size_ss, hjust = center_aligned)
}
}
if(!is_blank(x_tick_label) && (all_xticks || x_tick_label != x)) {
x_ticks <- sort(unique_na(data[[x]]))
x_tick_labels <- unique_na(data[[x_tick_label]][order(data[[x]])])
plot_ <- plot_ +
scale_x_continuous(breaks = x_ticks, labels = x_tick_labels)
}
if(return_data) {
return(list(plot = plot_, data = data))
} else return(plot_)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.