Nothing
#' Variation Plot
#'
#' The `variationPlot()` function is designed to explore how the distribution of
#' a pollutant (or other variable) changes by another variable (`x`). For
#' example, it can be used to explore how the distribution of `nox` varies by
#' `season` or by `weekday`. This plot can be extensively conditioned using the
#' `type` and `group` arguments, both of which are passed to [cutData()]. An
#' appropriate plot type will be chosen based on the type of `x` - e.g., ordered
#' variables will be joined by a line.
#'
#' When `statistic = "mean"`, the plot shows the 95% confidence intervals in the
#' mean. The 95% confidence intervals are calculated through bootstrap
#' simulations, which will provide more robust estimates of the confidence
#' intervals (particularly when there are relatively few data).
#'
#' Users can supply their own `ylim`, e.g. `ylim = c(0, 200)`.
#'
#' The `difference` option calculates the difference in means between two
#' pollutants, along with bootstrap estimates of the 95\% confidence intervals
#' in the difference. This works in two ways: either two pollutants are supplied
#' in separate columns (e.g. `pollutant = c("no2", "o3")`), or there are two
#' unique values of `group`. The difference is calculated as the second
#' pollutant minus the first and is labelled accordingly. This feature is
#' particularly useful for model evaluation and identifying where models diverge
#' from observations across time scales.
#'
#' Depending on the choice of statistic, a subheading is added. Users can
#' control the text in the subheading through the use of `sub` e.g. `sub = ""`
#' will remove any subheading.
#'
#' @inheritParams timePlot
#' @inheritParams cutData
#'
#' @param x A character value to be passed to [cutData()]; used to define the
#' category by which `pollutant` will be varied and plotted.
#'
#' @param normalise Should variables be normalised? The default is `FALSE`. If
#' `TRUE` then the variable(s) are divided by their mean values. This helps to
#' compare the shape of the diurnal trends for variables on very different
#' scales.
#'
#' @param group This sets the grouping variable to be used. For example, if a
#' data frame had a column `site` setting `group = "site"` will plot all sites
#' together in each panel. Passed to [cutData()].
#'
#' @param difference If two pollutants are chosen then setting `difference =
#' TRUE` will also plot the difference in means between the two variables as
#' `pollutant[2] - pollutant[1]`. Bootstrap 95\% confidence intervals of the
#' difference in means are also calculated. A horizontal dashed line is shown
#' at y = 0. The difference can also be calculated if there is a column that
#' identifies two groups, e.g., having used [splitByDate()]. In this case it
#' is possible to call the function with the option `group = "split.by"` and
#' `difference = TRUE`.
#'
#' @param statistic Can be `"mean"` (default) or `"median"`. If the statistic is
#' `"mean"` then the mean line and the 95% confidence interval in the mean are
#' plotted by default. If the statistic is `"median"` then the median line is
#' plotted together with the 5/95 and 25/75th quantiles are plotted. Users can
#' control the confidence intervals with `conf.int`.
#'
#' @param conf.int The confidence intervals to be plotted. If `statistic =
#' "mean"` then the confidence intervals in the mean are plotted. If
#' `statistic = "median"` then the `conf.int` and `1 - conf.int` *quantiles*
#' are plotted. Any number of `conf.int`s can be provided.
#'
#' @param B Number of bootstrap replicates to use. Can be useful to reduce this
#' value when there are a large number of observations available to increase
#' the speed of the calculations without affecting the 95% confidence interval
#' calculations by much.
#'
#' @param ci Should confidence intervals be shown? The default is `TRUE`.
#' Setting this to `FALSE` can be useful if multiple pollutants are chosen
#' where over-lapping confidence intervals can over complicate plots.
#'
#' @param alpha The alpha transparency used for plotting confidence intervals.
#' `0` is fully transparent and 1 is opaque. The default is `0.4`.
#'
#' @param ... Addition options are passed on to [cutData()] for `type` handling.
#' Some additional arguments are also available:
#' - `xlab`, `ylab` and `main` override the x-axis label, y-axis label, and plot title.
#' - `layout` sets the layout of facets - e.g., `layout(2, 5)` will have 2 columns and 5 rows.
#' - `lwd` and `lty` control various graphical parameters.
#' - `fontsize` overrides the overall font size of the plot.
#' - `ylim` controls axis limits.
#'
#' @export
#'
#' @seealso [timeVariation()], which conveniently assembles many time-related
#' variation plots into a single plot
#'
#' @return an [openair][openair-package] object.
#'
#' @author Jack Davison
#' @author David Carslaw
#'
#' @examples
#' # example using the 'mydata' dataset
#' variationPlot(
#' mydata,
#' pollutant = c("nox", "o3"),
#' x = "hour",
#' type = "season",
#' normalise = TRUE
#' )
variationPlot <- function(
mydata,
pollutant = "nox",
x = "hour",
statistic = "mean",
type = "default",
group = "default",
normalise = FALSE,
difference = FALSE,
conf.int = NULL,
B = 100,
local.tz = NULL,
ci = TRUE,
cols = "hue",
alpha = 0.4,
strip.position = "top",
key.position = "top",
key.columns = NULL,
name.pol = NULL,
auto.text = TRUE,
plot = TRUE,
...
) {
if (is.null(conf.int)) {
if (statistic == "median") {
conf.int <- c(0.75, 0.95)
} else {
conf.int <- c(0.95)
}
}
# validate inputs
validate_varplot_inputs(
mydata = mydata,
group = group,
pollutant = pollutant,
type = type,
statistic = statistic,
conf.int = conf.int
)
# extra.args setup
extra.args <- list(...)
# labels
extra.args$ylab <- quickText(
extra.args$ylab %||%
ifelse(normalise, "normalised level", toString(pollutant)),
auto.text
)
extra.args$xlab <- quickText(extra.args$x %||% x, auto.text)
extra.args$main <- quickText(extra.args$main %||% "", auto.text)
extra.args$sub <- quickText(
extra.args$sub %||% create_varplot_sub_text(statistic, conf.int),
auto.text
)
extra.args$lty <- extra.args$lty %||% 1
drop <- extra.args$drop %||% "none"
extra.args$drop <- NULL
# check & cut data
# always need date for the 'difference' path
vars <- c(pollutant, "date")
if (!x %in% dateTypes) {
vars <- c(vars, x)
}
if (!any(type %in% dateTypes)) {
vars <- c(vars, type[!type %in% dateTypes])
}
if (!group %in% dateTypes) {
vars <- c(vars, group)
}
vars <- vars[vars != "default"]
# how many 'types' are we working with?
type_cols <- "type"
if (length(type) == 2) {
type_cols <- c("type_y", "type_x")
}
# data checks
# need special handling of cutData as `drop` needs to be passed separately
mydata <- mydata |>
checkPrep(vars, c(type, group), remove.calm = FALSE) |>
(\(df) {
do.call(
cutData,
append(
list(
x = df,
type = x,
names = "x",
local.tz = local.tz,
is.axis = TRUE,
drop = drop
),
extra.args
)
)
})() |>
(\(df) {
do.call(
cutData,
append(
list(
x = df,
type = type,
names = type_cols,
local.tz = local.tz,
drop = drop
),
extra.args
)
)
})() |>
(\(df) {
do.call(
cutData,
append(
list(
x = df,
type = group,
names = "group",
local.tz = local.tz,
drop = drop
),
extra.args
)
)
})()
# put in local time if needed
if (!is.null(local.tz)) {
attr(mydata$date, "tzone") <- local.tz
}
# reformat data for plotting
if (group == "default") {
mydata$group <- NULL
mydata <- tidyr::pivot_longer(
mydata,
cols = dplyr::all_of(pollutant),
names_to = "group",
values_to = "value"
) |>
dplyr::mutate(
group = factor(.data$group)
)
} else {
names(mydata)[names(mydata) == pollutant] <- "value"
}
# if x is hour or week, make sure it's numeric for plotting
if (x %in% c("hour", "week")) {
mydata$x <- as.numeric(as.character(mydata$x))
}
# before summary stats - decompose wd into u/v
wd_present <- "wd" %in% levels(mydata$group)
if (wd_present) {
mydata <- mydata |>
dplyr::mutate(
group = as.character(.data$group),
value = dplyr::case_when(
.data$group == "wd" ~ sin(pi * .data$value / 180), # becomes wd_u
.default = .data$value
),
group = dplyr::if_else(.data$group == "wd", "wd_u", .data$group)
) |>
dplyr::bind_rows(
mydata |>
dplyr::filter(.data$group == "wd") |>
dplyr::mutate(
value = cos(pi * .data$value / 180),
group = "wd_v"
)
) |>
dplyr::mutate(group = factor(.data$group))
}
# calculate summary values and confidence intervals
if (difference) {
# identify the two things being differenced
if (nlevels(mydata$group) != 2L) {
cli::cli_abort("Need 2 'groups' to calculate a difference.")
}
the_levels <- levels(mydata$group) # already set by pivot_longer or group cutData
poll1 <- the_levels[1]
poll2 <- the_levels[2]
# pivot wide so both groups are columns - needed for bootMeanDiff
mydata_wide <- mydata |>
tidyr::pivot_wider(names_from = "group", values_from = "value")
if (statistic == "mean") {
mydata <- purrr::map(
conf.int,
\(q) {
dplyr::reframe(
mydata_wide,
bootMeanDiff(
dplyr::pick(dplyr::everything()),
x = poll1,
y = poll2,
conf.int = q,
B = B
),
.by = dplyr::all_of(c("x", type_cols))
) |>
dplyr::rename(mid = "Mean", min = "Lower", max = "Upper") |>
dplyr::mutate(ci = q)
}
) |>
dplyr::bind_rows() |>
dplyr::mutate(
group = factor(
.data$variable,
levels = c(poll1, poll2, paste(poll2, "-", poll1))
)
) |>
dplyr::select(-"variable")
} else {
mydata_wide <-
dplyr::mutate(
mydata_wide,
diff = .data[[poll2]] - .data[[poll1]]
)
names(mydata_wide)[names(mydata_wide) == "diff"] <- paste(
poll2,
"-",
poll1
)
mydata <-
mydata_wide |>
tidyr::pivot_longer(
dplyr::all_of(c(poll1, poll2, paste(poll2, "-", poll1))),
names_to = "group"
) |>
dplyr::mutate(
group = factor(
.data$group,
levels = c(poll1, poll2, paste(poll2, "-", poll1))
)
)
mydata <-
purrr::map(
conf.int,
\(q) {
qs <- sort(c(q, 1 - q))
dplyr::summarise(
mydata,
mid = stats::quantile(.data$value, 0.5, na.rm = TRUE),
min = stats::quantile(.data$value, qs[1], na.rm = TRUE),
max = stats::quantile(.data$value, qs[2], na.rm = TRUE),
.by = dplyr::all_of(c("x", type_cols, "group"))
) |>
dplyr::mutate(
ci = q
)
}
) |>
dplyr::bind_rows()
}
} else {
if (statistic == "mean") {
mydata <-
purrr::map(
conf.int,
\(q) {
dplyr::summarise(
mydata,
bootMeanDF(.data$value, conf.int = q, B = B),
.by = dplyr::all_of(c("x", type_cols, "group"))
) |>
dplyr::rename("mid" = "mean") |>
dplyr::mutate(
ci = q
)
}
) |>
dplyr::bind_rows() |>
dplyr::select(-"n")
} else {
mydata <-
purrr::map(
conf.int,
\(q) {
qs <- sort(c(q, 1 - q))
dplyr::summarise(
mydata,
mid = stats::quantile(.data$value, 0.5, na.rm = TRUE),
min = stats::quantile(.data$value, qs[1], na.rm = TRUE),
max = stats::quantile(.data$value, qs[2], na.rm = TRUE),
.by = dplyr::all_of(c("x", type_cols, "group"))
) |>
dplyr::mutate(
ci = q
)
}
) |>
dplyr::bind_rows()
}
}
# after summary stats - convert wd_u/wd_v back to wd
if (wd_present) {
wd_data <- mydata |>
dplyr::filter(.data$group %in% c("wd_u", "wd_v")) |>
tidyr::pivot_wider(
names_from = "group",
values_from = c("mid", "min", "max")
) |>
dplyr::mutate(
mid = (atan2(.data$mid_wd_u, .data$mid_wd_v) * 180 / pi) %% 360,
min = (atan2(.data$min_wd_u, .data$min_wd_v) * 180 / pi) %% 360,
max = (atan2(.data$max_wd_u, .data$max_wd_v) * 180 / pi) %% 360,
group = factor("wd")
) |>
dplyr::select(-dplyr::ends_with(c("_wd_u", "_wd_v")))
mydata <- mydata |>
dplyr::filter(!.data$group %in% c("wd_u", "wd_v")) |>
dplyr::bind_rows(wd_data) |>
dplyr::mutate(group = factor(.data$group))
}
# normalise the pattern by group (i.e., colour) if requested
if (normalise) {
mydata <- map_type(mydata, type = "group", fun = function(x) {
avg <- mean(x$mid, na.rm = TRUE)
x$mid <- x$mid / avg
x$min <- x$min / avg
x$max <- x$max / avg
x
})
}
# introduce NA gaps for missing x values so lines don't connect across absent
# data
gap_cols <- intersect(c(type_cols, "group", "ci"), names(mydata))
if (is.numeric(mydata$x)) {
full_x <- switch(
x,
"hour" = 0:23,
"week" = 1:53,
seq(min(mydata$x, na.rm = TRUE), max(mydata$x, na.rm = TRUE))
)
mydata <- tidyr::complete(
mydata,
x = full_x,
tidyr::nesting(!!!rlang::syms(gap_cols))
)
} else if (is.factor(mydata$x)) {
mydata <- tidyr::complete(
mydata,
x,
tidyr::nesting(!!!rlang::syms(gap_cols))
)
}
# set pollutant labels for legend
poll_labels <- name.pol %||% levels(mydata$group)
# linetypes
ltys <- extra.args$lty
while (length(ltys) < nlevels(mydata$group)) {
ltys <- c(ltys, ltys)
}
ltys <- ltys[1:nlevels(mydata$group)]
# construct plot
thePlot <-
ggplot2::ggplot(mydata, ggplot2::aes(x = .data$x)) +
theme_openair(key.position = key.position) +
set_extra_fontsize(extra.args) +
get_facet(
if (all(type == "default")) "default" else type_cols,
extra.args,
scales = "fixed",
auto.text = auto.text,
drop = FALSE,
strip.position = strip.position
) +
ggplot2::labs(
x = extra.args$xlab,
y = extra.args$ylab,
color = NULL,
fill = NULL,
linetype = NULL,
title = extra.args$main,
caption = extra.args$sub
) +
ggplot2::coord_cartesian(
ylim = extra.args$ylim
) +
ggplot2::scale_color_manual(
values = openColours(
scheme = cols,
n = nlevels(mydata$group)
),
labels = stats::setNames(
label_openair(poll_labels, auto_text = auto.text),
levels(mydata$group)
),
guide = ggplot2::guide_legend(ncol = key.columns),
aesthetics = c("fill", "colour"),
drop = FALSE
) +
ggplot2::scale_linetype_manual(
values = ltys,
labels = stats::setNames(
label_openair(poll_labels, auto_text = auto.text),
levels(mydata$group)
),
guide = ggplot2::guide_legend(ncol = key.columns),
drop = FALSE
) +
ggplot2::guides(
x = ggplot2::guide_axis(check.overlap = TRUE)
)
# add difference line if needed
if (difference) {
thePlot <-
thePlot +
ggplot2::geom_hline(yintercept = 0, lty = 2)
}
# add geometries
if (is.numeric(mydata$x) && !(x == "hour" && group == "daylight")) {
if (x == "hour") {
xbreaks <- c(seq(0, 20, 4), 23)
} else if (x == "week") {
xbreaks <- seq(0, 54, 6)
}
if (ci) {
thePlot <-
thePlot +
ggplot2::geom_ribbon(
ggplot2::aes(
ymax = .data$max,
ymin = .data$min,
fill = .data$group,
group = interaction(.data$group, .data$ci)
),
show.legend = FALSE,
na.rm = TRUE,
alpha = alpha / dplyr::n_distinct(mydata$ci)
)
}
thePlot <-
thePlot +
ggplot2::geom_line(
ggplot2::aes(
y = .data$mid,
colour = .data$group,
linetype = .data$group
),
show.legend = TRUE,
na.rm = TRUE,
key_glyph = "point"
) +
ggplot2::scale_x_continuous(
breaks = xbreaks,
expand = ggplot2::expansion()
)
} else {
if (ci) {
widths <- rev(seq(0.4, 0.8, length.out = length(unique(mydata$ci))))
splits <- split(mydata, mydata$ci)
splits <- splits[sort(names(splits))]
for (i in seq_along(splits)) {
thePlot <-
thePlot +
ggplot2::geom_crossbar(
data = splits[[i]],
ggplot2::aes(
ymax = .data$max,
ymin = .data$min,
y = .data$mid,
fill = .data$group,
group = interaction(.data$group, .data$ci)
),
show.legend = FALSE,
color = NA,
alpha = alpha / dplyr::n_distinct(mydata$ci),
width = widths[[i]],
na.rm = TRUE
)
}
}
# use points for non-ordered factors, but lines for ordered factors (e.g.
# month) - unless there's an unusual combination of x and group that'd split
# lines
use_point <- !is.ordered(mydata$x) ||
(x == "month" && group == "season") ||
(x == "weekday" && group == "weekend") ||
((mydata |>
tidyr::drop_na() |>
dplyr::pull("x") |>
dplyr::n_distinct()) ==
1L)
if (!use_point) {
thePlot <-
thePlot +
ggplot2::geom_line(
ggplot2::aes(
y = .data$mid,
colour = .data$group,
group = .data$group,
linetype = .data$group
),
show.legend = TRUE,
na.rm = TRUE,
key_glyph = "point"
)
} else {
thePlot <-
thePlot +
ggplot2::geom_point(
ggplot2::aes(
y = .data$mid,
colour = .data$group,
group = .data$group
),
show.legend = TRUE,
key_glyph = "point",
na.rm = TRUE
)
}
thePlot <- thePlot +
ggplot2::scale_x_discrete(
labels = \(x) label_openair(x, auto_text = auto.text),
drop = FALSE
)
}
if (plot) {
plot(thePlot)
}
output <- list(
plot = thePlot,
data = dplyr::tibble(mydata) |>
dplyr::mutate(statistic = statistic),
call = match.call()
)
class(output) <- "openair"
invisible(output)
}
# validate timevar inputs
validate_varplot_inputs <- function(
mydata,
group,
pollutant,
type,
statistic,
conf.int
) {
if (length(type) > 2) {
cli::cli_abort(
"Can only have one {.arg type} for {.fun openair::variationPlot}."
)
}
# validate inputs
if (any(group != "default") && length(pollutant) > 1) {
cli::cli_abort(
"Can only have one {.arg pollutant} and one {.arg group}, or several {.arg pollutant}s and no {.arg group}."
)
}
if (any(type %in% pollutant)) {
cli::cli_abort(
"{.arg type} cannot be in {.arg pollutant}. Problem variable: {type[type %in% pollutant]}."
)
}
if (any(group != "default")) {
if (any(group %in% pollutant)) {
cli::cli_abort(
"{.arg group} cannot be in {.arg pollutant}. Problem variable: {group[group %in% pollutant]}."
)
}
}
# statistic check
rlang::arg_match(statistic, c("mean", "median"))
}
# sub heading stat info
create_varplot_sub_text <- function(statistic, conf.int) {
if (statistic == "mean") {
ci_str <- paste(paste0(100 * conf.int, "%"), collapse = " and ")
return(paste0("mean and ", ci_str, " confidence interval in mean"))
}
if (statistic == "median") {
quantile_pairs <- sapply(conf.int, \(x) paste0(100 * (1 - x), "/", 100 * x))
quantile_str <- paste(quantile_pairs, collapse = " and ")
return(paste0("median and ", quantile_str, "th percentiles"))
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.