Nothing
#' Plot a topic comparison
#'
#' Draws a line chart of each comparison topic's share of the reference
#' literature over time, from the output of [scopus_compare_topics()]. The chart
#' uses integer year breaks, a colour-blind-safe palette and, for a handful of
#' topics, labels the lines directly so the reader need not consult a legend.
#' Shaded bands convey how stable each yearly share is.
#'
#' @param x A `scopus_comparison` object from [scopus_compare_topics()].
#' @param pub_count_in_legend Logical. When `TRUE` (the default), each topic's
#' label carries its total record count, for example `effect size (n = 1,204)`.
#' @param highlight Optional character scalar naming one comparison topic to draw
#' the eye to. The named topic is drawn in an accent colour, and the others in
#' grey, which is useful when one topic is the focus of a figure.
#' @param interval Logical. When `TRUE` (the default), a shaded band around each
#' line shows a Wilson interval on the yearly share. See *Details* for how to
#' read it.
#' @param ... Currently unused, present for S3 consistency.
#' @return A [ggplot2::ggplot] object. Printing it draws the plot.
#' @details
#' This needs the suggested package \pkg{ggplot2} and raises an informative error
#' when it is absent. The chart shows the comparison topics alone, since the
#' reference is the 100% denominator against which they are measured. A year for
#' which the reference has no records carries no defined share and is omitted,
#' which is noted in the caption.
#'
#' The shaded band is a Wilson score interval computed from the comparison count
#' and the reference count for each year. 'Scopus' returns exact counts rather
#' than a sample, so the band is not a confidence interval in the inferential
#' sense. It is best read as an illustrative stability range: it is wide where the
#' reference set for a year is small, and so the share would move easily, and
#' narrow where the reference set is large. It says nothing about query wording,
#' indexing lag or coverage, which are the larger real uncertainties.
#' @seealso [scopus_compare_topics()]
#' @examplesIf rlang::is_installed("ggplot2")
#' cmp <- tibble::tibble(
#' query = "q", query_type = "comparison",
#' abridged_query = rep(c("computer vision", "drug discovery"), each = 4),
#' year = rep(2017:2020, 2), n = c(220, 280, 360, 430, 30, 55, 90, 150),
#' reference_n = rep(1500, 8),
#' comparison_percentage = c(14.7, 18.7, 24, 28.7, 2, 3.7, 6, 10),
#' average_comparison_percentage = rep(c(21.5, 5.4), each = 4)
#' )
#' class(cmp) <- c("scopus_comparison", class(cmp))
#' plot_scopus_comparison(cmp)
#' plot_scopus_comparison(cmp, highlight = "drug discovery")
#' @export
plot_scopus_comparison <- function(x, pub_count_in_legend = TRUE,
highlight = NULL, interval = TRUE, ...) {
if (!inherits(x, "scopus_comparison")) {
rlang::abort(
"`x` must be a `scopus_comparison` object from scopus_compare_topics().",
class = "scopus_error_bad_input"
)
}
rlang::check_installed("ggplot2", reason = "to plot a topic comparison")
df <- x[x$query_type == "comparison", , drop = FALSE]
if (nrow(df) == 0L) {
rlang::abort(
"The comparison contains no comparison topics to plot.",
class = "scopus_error_bad_input"
)
}
ref_lab <- unique(x$abridged_query[x$query_type == "reference"])
n_missing <- sum(is.na(df$comparison_percentage))
df <- df[!is.na(df$comparison_percentage), , drop = FALSE]
if (nrow(df) == 0L) {
rlang::abort(
"The comparison has no finite percentages to plot.",
class = "scopus_error_bad_input"
)
}
df <- df[order(df$abridged_query, df$year), , drop = FALSE]
topics <- unique(df$abridged_query)
if (!is.null(highlight)) {
if (!is.character(highlight) || length(highlight) != 1L ||
!highlight %in% topics) {
rlang::abort(
sprintf("`highlight` must be one of the comparison topics: %s.",
paste(topics, collapse = ", ")),
class = "scopus_error_bad_input"
)
}
}
totals <- tapply(df$n, df$abridged_query, sum, na.rm = TRUE)
df$label <- if (isTRUE(pub_count_in_legend)) {
sprintf("%s (n = %s)", df$abridged_query,
format(totals[df$abridged_query], big.mark = ","))
} else {
df$abridged_query
}
ord <- order(-df$average_comparison_percentage, df$abridged_query)
df$label <- factor(df$label, levels = unique(df$label[ord]))
# Wilson interval on the yearly share, when the counts are available.
show_band <- isTRUE(interval) &&
all(c("n", "reference_n") %in% names(df)) &&
any(!is.na(df$reference_n) & df$reference_n > 0 & !is.na(df$n))
if (show_band) {
wb <- scopus_wilson(df$n, df$reference_n)
df$ci_lower <- wb$lower
df$ci_upper <- wb$upper
}
direct <- length(topics) <= 6L
yrs <- sort(unique(df$year))
brk <- if (length(yrs) > 12L) {
p <- pretty(yrs)
p[p == round(p)]
} else {
yrs
}
ends <- df[df$year == max(yrs), , drop = FALSE]
accent <- "#BB5566"
grey <- "grey75"
# A data-driven upper limit, rounded up to the next 5%, removes dead headroom.
ymax_src <- if (show_band) df$ci_upper else df$comparison_percentage
ymax_pad <- min(100, ceiling(max(ymax_src, na.rm = TRUE) / 5) * 5)
p <- ggplot2::ggplot(
df,
ggplot2::aes(x = .data$year, y = .data$comparison_percentage,
group = .data$label)
)
if (is.null(highlight)) {
if (show_band) {
p <- p +
ggplot2::geom_ribbon(
ggplot2::aes(ymin = .data$ci_lower, ymax = .data$ci_upper,
fill = .data$label),
alpha = 0.18, colour = NA, show.legend = FALSE
) +
ggplot2::scale_fill_viridis_d(option = "viridis", begin = 0.05,
end = 0.85, guide = "none")
}
p <- p +
ggplot2::geom_line(ggplot2::aes(colour = .data$label), linewidth = 1) +
ggplot2::geom_point(ggplot2::aes(colour = .data$label), size = 1.8,
stroke = 0) +
ggplot2::scale_colour_viridis_d(option = "viridis", begin = 0.05,
end = 0.85, name = NULL)
if (direct) {
p <- p + ggplot2::geom_text(
data = ends,
ggplot2::aes(label = .data$label, colour = .data$label),
hjust = 0, nudge_x = diff(range(yrs)) * 0.012 + 0.05, size = 3.1,
show.legend = FALSE
)
}
} else {
df$is_hi <- df$abridged_query == highlight
ends$is_hi <- ends$abridged_query == highlight
if (show_band) {
p <- p + ggplot2::geom_ribbon(
data = df[df$is_hi, , drop = FALSE],
ggplot2::aes(ymin = .data$ci_lower, ymax = .data$ci_upper),
fill = accent, alpha = 0.16, colour = NA
)
}
p <- p +
ggplot2::geom_line(data = df[!df$is_hi, , drop = FALSE], colour = grey,
linewidth = 0.7) +
ggplot2::geom_line(data = df[df$is_hi, , drop = FALSE], colour = accent,
linewidth = 1.3) +
ggplot2::geom_point(data = df[df$is_hi, , drop = FALSE], colour = accent,
size = 2.2, stroke = 0) +
ggplot2::geom_text(
data = ends[ends$is_hi, , drop = FALSE],
ggplot2::aes(label = .data$label), colour = accent, hjust = 0,
nudge_x = diff(range(yrs)) * 0.012 + 0.05, size = 3.1
)
}
# Direct labels sit in the right margin (via clip = "off") rather than in a
# wide in-panel gutter, so the panel itself stays free of empty space.
labelled <- direct || !is.null(highlight)
label_room <- if (labelled) {
min(165, 8 + 4.4 * max(nchar(as.character(ends$label))))
} else {
5.5
}
caption <- sprintf("Source: 'Scopus' Search API. Years %d to %d.",
min(yrs), max(yrs))
if (show_band) {
caption <- paste0(caption, "\nShaded band: illustrative Wilson stability range, wider where the reference set is small.")
}
if (n_missing > 0L) {
caption <- paste0(
caption, sprintf("\n%d year-topic value%s omitted for want of reference records.",
n_missing, if (n_missing == 1L) "" else "s")
)
}
p +
ggplot2::scale_x_continuous(
breaks = brk, minor_breaks = NULL,
expand = ggplot2::expansion(mult = c(0.01, if (labelled) 0.03 else 0.02))
) +
ggplot2::scale_y_continuous(
labels = function(v) paste0(format(v, trim = TRUE), "%"),
limits = c(0, ymax_pad),
expand = ggplot2::expansion(mult = c(0, 0.02))
) +
ggplot2::coord_cartesian(clip = "off") +
ggplot2::labs(
x = NULL, y = "Share of reference records",
title = "Topic share within a reference literature, over time",
subtitle = if (length(ref_lab) == 1L) {
sprintf("Each line: %% of '%s' records that also match the topic", ref_lab)
} else {
NULL
},
caption = caption
) +
ggplot2::theme_minimal(base_size = 12) +
ggplot2::theme(
panel.grid.minor = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_blank(),
plot.title.position = "plot",
plot.caption = ggplot2::element_text(colour = "grey45", size = 8),
plot.margin = ggplot2::margin(5.5, label_room, 5.5, 5.5),
legend.position = if (direct || !is.null(highlight)) "none" else "top"
)
}
# Wilson score interval on a binomial share, returned as percentages in [0, 100].
# `x` is the comparison count and `n` the reference count per year.
scopus_wilson <- function(x, n, z = 1.96) {
phat <- x / n
denom <- 1 + z^2 / n
centre <- (phat + z^2 / (2 * n)) / denom
margin <- (z / denom) * sqrt(phat * (1 - phat) / n + z^2 / (4 * n^2))
list(
lower = pmin(pmax((centre - margin) * 100, 0), 100),
upper = pmin(pmax((centre + margin) * 100, 0), 100)
)
}
#' @rdname plot_scopus_comparison
#' @param object A `scopus_comparison` object (for the `autoplot()` method).
#' @exportS3Method ggplot2::autoplot
autoplot.scopus_comparison <- function(object, ...) {
plot_scopus_comparison(object, ...)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.