#' Construct plotmath labels for time lines
#'
#' Internal, called by `plot_timeline`.
#'
#' @param description The text for the items to be shown beneath the images.
#' Should be a character vector of the same length as `image_path`.
#' @param description_width Width of the description text in characters to wrap
#' the text in the labels.
#' @param ref_number A string of the number for reference. If NA, then the
#' reference will not be included in the plot.
#' @return A character vector of plotmath, to enable superscripts, italics, and
#' text wrapping.
#' @importFrom stringr str_replace_all
make_labs <- function(description, description_width, ref_number) {
descr_wrap <- strwrap(description, width = description_width)
descr_wrap <- paste0(descr_wrap, collapse = "<br>")
if (is.na(ref_number)) {
descr_wrap
} else {
paste0(descr_wrap, "<sup>", ref_number, "</sup>")
}
}
#' Plot time line with images and descriptions of each event
#'
#' The time line is plotted horizontally with ticks for years. The labels with
#' descriptions are above and/or below that line.
#'
#' @param events_df A data frame with at least these columns:
#' \describe{
#' \item{date_published}{A vector of Date objects for dates when the event of
#' interest was published. Note that the actual time when those events occurred
#' is most likely earlier, sometimes much earlier than the publication date,
#' and the events might have become quite influential before their publication.
#' But the publication date is the easiest way to get an actual date.}
#' \item{description}{Short descriptions of the events. The plot won't look
#' good if the descriptions are too long.}
#' }
#' @param ys A numeric vector of the y coordinates of the items. Since I don't
#' know how to implement the ggrepel algorithm to make sure that the labels don't
#' overlap for geom_richtext, I have to manually set the y coordinates to make
#' sure that the labels don't overlap and look good.
#' @param description_width Width of the description text in characters to wrap
#' the text in the labels.
#' @param expand_x A numeric vector of length 2 of the proportion to expand the
#' x axis on the left and the right. This is a way to manually make sure that the
#' labels are not cropped off at the edge of the plot.
#' @param expand_y Same as expand_x, but for the y axis.
#' @param include_refs Logical, indicating whether to include references in the
#' text box.
#' @return A ggplot2 object for the plot.
#' @importFrom dplyr case_when arrange between
#' @importFrom purrr map2_chr
#' @importFrom lubridate floor_date ceiling_date
#' @importFrom ggplot2 ggplot geom_point aes geom_hline geom_segment scale_x_date
#' expansion scale_y_continuous theme_void annotate scale_fill_manual
#' @importFrom rlang %||%
#' @export
plot_timeline <- function(events_df, ys, description_width = 20,
expand_x = c(0.1, 0.1), expand_y = c(0.05, 0.05),
include_refs = TRUE) {
.pkg_check("grid")
.pkg_check("gridtext")
image <- date_published <- description <- lab <- vjusts <- NULL
events_df <- events_df %>%
mutate(sheet = factor(sheet, levels = names(sheet_fill)),
description = paste0(year(date_published), " ", description))
if (include_refs) {
events_df <- events_df %>%
mutate(lab = map2_chr(description, ref_number,
~ make_labs(.x, description_width, .y)))
} else {
events_df <- events_df %>%
mutate(lab = map_chr(description,
~ make_labs(.x, description_width,
ref_number = NA)))
}
events_df <- events_df %>%
arrange(date_published) %>%
mutate(vjusts = case_when(ys >= 0 ~ 1,
TRUE ~ 0),
ys = ys)
yrs_range <- max(year(events_df$date_published)) - min(year(events_df$date_published))
date_brks <- case_when(yrs_range <= 20 ~ "1 year",
between(yrs_range, 20, 50) ~ "2 years",
between(yrs_range, 50, 100) ~ "5 years",
TRUE ~ "10 years")
axis <- seq(floor_date(min(events_df$date_published), "year"),
ceiling_date(max(events_df$date_published), "year"),
by = date_brks)
sheet_fill <- sheet_fill[names(sheet_fill) %in% unique(events_df$sheet)]
p <- ggplot(events_df) +
geom_point(aes(x = date_published), y = 0) +
geom_hline(yintercept = 0) +
geom_segment(aes(x = date_published, y = 0, xend = date_published, yend = ys)) +
geom_richtext(aes(x = date_published, y = ys, fill = sheet,
label = lab, vjust = vjusts)) +
scale_x_date(expand = expansion(expand_x)) +
scale_y_continuous(expand = expansion(expand_y)) +
scale_fill_manual(values = sheet_fill, name = "Type") +
theme_void() +
annotate("point", x = axis, y = 0, shape = 3) +
annotate("text", x = axis, y = 0, label = format(axis, "%Y"),
color = "gray50", vjust = 1.4)
p
}
#' Number of publications per year
#'
#' Plot bar plot of the number of publications per year. I find facetting
#' makes the plot easier to read than filling with different colors.
#'
#' @param pubs A data frame with at least these columns:
#' \describe{
#' \item{journal}{Name of the journal of the paper.}
#' \item{year}{Year when the paper was published.}
#' }
#' There must be one row per publication or per method or species for each title
#' if faceting by those. If facetting, then a column whose name is the value in
#' `facet_by` must be present.
#' @param facet_by Name of a column for facetting.
#' @param fill_by Name of a column of a categorical variable with which to color
#' the histogram.
#' @param binwidth Width of bins for the histogram in days.
#' @param preprints Logical, whether preprints should be included. Defaults to
#' `TRUE` to include preprints.
#' @param n_top Number of categories with the most publications to plot in facets;
#' the other categories are lumped into "other".
#' @param n_top_fill Number of categories with the most publications to be
#' differentiated by color.
#' @param sort_by How to sort the facets. first_appeared means the category that
#' appeared earlier will be nearer to the top. count means the category with more
#' count (number of publications) will be nearer to the top. Ignored if not
#' facetting.
#' @param log_color If filling histograms by integer values, whether to log
#' transform the values before mapping to colors to improve dynamic range.
#' @return A ggplot2 object.
#' @importFrom dplyr filter select
#' @importFrom forcats fct_reorder fct_lump_n
#' @importFrom rlang !! sym
#' @importFrom ggplot2 geom_bar scale_x_continuous labs theme facet_wrap ggproto
#' layer
#' @importFrom scales breaks_pretty
#' @export
pubs_per_year <- function(pubs, facet_by = NULL, fill_by = NULL, binwidth = 365,
preprints = TRUE, n_top = Inf, n_top_fill = Inf,
sort_by = c("first_appeared", "count", "recent_count"),
log_color = FALSE) {
journal <- date_published <- facets <- NULL
sort_by <- match.arg(sort_by)
if (!preprints) {
pubs <- pubs %>%
filter(!journal %in% c("bioRxiv", "arXiv", "Research Square", "medRxiv"))
}
if (!is.null(facet_by)) {
pubs <- pubs %>%
mutate(facets = !!sym(facet_by))
if (sort_by == "first_appeared") {
pubs <- pubs %>%
mutate(facets = fct_reorder(facets, date_published, .fun = "min"),
w = 1)
} else if (sort_by == "count") {
pubs <- pubs %>%
mutate(facets = fct_infreq(facets),
w = 1)
} else {
date_thresh <- lubridate::as_date(max(pubs$date_published) - lubridate::ddays(binwidth) * 2)
pubs <- pubs %>%
mutate(w = as.numeric(date_published > date_thresh),
facets = fct_reorder(facets, w, .fun = "sum", .desc = TRUE))
}
pubs <- pubs %>%
mutate(facets = fct_lump_n(facets, n = n_top, ties.method = "first", w = w))
}
if (!is.null(fill_by)) {
if (fill_by == "species") {
# Use fixed colors for species
pubs <- pubs %>%
mutate(fill = case_when(species %in% names(species_cols) ~ species,
TRUE ~ "Other"),
fill = fct_infreq(fill) %>% fct_relevel("Other", after = Inf))
} else {
# For continuous variables
is_discrete <- is.character(pubs[[fill_by]]) | is.factor(pubs[[fill_by]])
if (is_discrete) {
if (n_top_fill > 11) {
warning("Maximum of 12 colors are supported for colorblind friendly palette, ",
"less common categories are lumped into Other.")
n_top_fill <- 11
}
pubs <- pubs %>%
mutate(fill = fct_infreq(!!sym(fill_by)),
fill = fct_lump_n(fill, n = n_top_fill, ties.method = "first"))
if ("Other" %in% pubs$fill) {
pubs <- pubs %>%
mutate(fill = fct_infreq(fill) %>% fct_relevel("Other", after = Inf))
}
} else {
use_int <- is.integer(pubs[[fill_by]]) & length(unique(pubs[[fill_by]])) < 20
if (use_int) {
pubs <- pubs %>%
mutate(fill = factor(!!sym(fill_by), levels = seq.int(min(pubs[[fill_by]]),
max(pubs[[fill_by]]), 1)))
} else {
if (n_top_fill > 9) {
warning("Maximum of 9 colors are supported for binned palette.")
n_top_fill <- 9
}
pubs <- pubs %>%
ungroup() %>%
mutate(fill = cut(!!sym(fill_by), breaks = n_top_fill))
}
}
}
}
p <- ggplot(pubs, aes(date_published))
if (!is.null(facet_by)) {
p <- p +
geom_histogram(aes(fill = "all"), binwidth = binwidth,
data = select(pubs, -facets),
fill = "gray70", alpha = 0.5, show.legend = FALSE)
}
if (!is.null(fill_by)) {
p <- p +
geom_histogram(aes(fill = fill), binwidth = binwidth)
if (fill_by != "species") {
if (is_discrete) {
pal_use <- ifelse(n_top_fill > 7, "Paired", "Set2")
p <- p +
scale_fill_brewer(palette = pal_use)
} else if (use_int) {
if (log_color) {
fill_levels <- sort(unique(pubs[[fill_by]]))
log_fill <- log(fill_levels)
log_fill_scaled <- floor(log_fill/max(log_fill)*255) + 1
pal_use <- scales::viridis_pal()(256)[log_fill_scaled]
names(pal_use) <- as.character(fill_levels)
} else {
n_viridis <- max(pubs[[fill_by]]) - min(pubs[[fill_by]]) + 1
pal_use <- scales::viridis_pal()(n_viridis)
names(pal_use) <- as.character(seq.int(min(pubs[[fill_by]]),
max(pubs[[fill_by]]), 1))
pal_use <- pal_use[as.character(sort(unique(pubs$fill)))]
}
p <- p + scale_fill_manual(values = pal_use, drop = TRUE)
} else {
p <- p +
scale_fill_viridis_d()
}
} else {
species_cols <- species_cols[names(species_cols) %in% unique(pubs$fill)]
p <- p +
scale_fill_manual(values = species_cols, name = "", drop = TRUE) +
theme(legend.text = element_text(face = "italic"))
}
} else {
p <- p +
geom_histogram(binwidth = binwidth)
}
p <- p +
scale_y_continuous(expand = expansion(mult = c(0, 0.05)),
breaks = breaks_pretty()) +
scale_x_date(breaks = breaks_pretty(10)) +
labs(y = "Number of publications", x = "Date published") +
theme(panel.grid.minor = element_blank())
if (!is.null(facet_by)) {
p <- p +
facet_wrap(~ facets, ncol = 1)
}
p
}
#' Number of publications per category
#'
#' I think it looks better when the bars are horizontal to make the category
#' names easier to read, as the names can be quite long. This will plot a bar
#' chart for the number of publications per category, sorted according to the
#' number of publications.
#'
#' These are the sources of images that are not stated to be under public domain
#' found online with filter "free to share and use".
#' No changes were made to the images unless indicated. The author and license,
#' if found, are listed here as well.
#'
#' \describe{
#' \item{drosophila.jpg}{http://gompel.org/images-2/drosophilidae}
#' \item{zebrafish.jpg}{https://thumbs.dreamstime.com/m/zebrafish-zebra-barb-danio-rerio-freshwater-aquarium-fish-isolated-white-background-50201849.jpg}
#' \item{ciona.jpg}{http://www.habitas.org.uk/marinelife/tunicata/cioints.jpg}
#' \item{xenopus.jpg}{https://en.wikipedia.org/wiki/African_clawed_frog#/media/File:Xenopus_laevis_02.jpg
#' by Brian Gratwicke. License: https://creativecommons.org/licenses/by/2.0/
#' }
#' \item{celegans.jpg}{https://en.wikipedia.org/wiki/Caenorhabditis_elegans#/media/File:Adult_Caenorhabditis_elegans.jpg
#' by Kbradnam at English Wikipedia. License: https://creativecommons.org/licenses/by-sa/2.5/
#' A smaller version of the original is used here.
#' }
#' \item{arabidopsis.jpg}{http://parts.igem.org/wiki/images/b/bd/Plants_Arabidopsis_thaliana_400px.jpg}
#' \item{skull.jpg}{http://pngimg.com/download/42558 License: https://creativecommons.org/licenses/by-nc/4.0/
#' The original was compressed and converted to jpg here.
#' }
#' \item{platynereis.jpg}{https://en.wikipedia.org/wiki/Epitoky#/media/File:PlatynereisDumeriliiFemaleEpitoke.tif
#' By Martin Gühmann. A smaller jpg version of the original is used here.
#' License: https://creativecommons.org/licenses/by-sa/4.0/
#' }
#' \item{yeast.jpg}{https://en.wikipedia.org/wiki/Shmoo#/media/File:Shmoos_s_cerevisiae.jpg
#' By Pilarbini. This is a smaller version of the original.
#' License: https://creativecommons.org/licenses/by-sa/4.0/deed.en
#' }
#' }
#'
#' @param pubs A data frame at least with a column for the category of interest.
#' @param category Column name to plot. Tidyeval is supported. If it's species
#' or language, then img_df does not have to be supplied for isotype plot since
#' the images are supplied internally.
#' @param fill_by Variable to fill the bars with if not plotting isotypes.
#' @param n_top Number of top entries to plot. Especially useful for isotype.
#' @param isotype Logical, whether to make isotype plot, like one icon stands for
#' a certain number of publications.
#' @param img_df A data frame with one column with the same name as `category`
#' and another column called `image_paths` for path to the images. Relative path
#' is fine since it will be internally converted to absolute path. This argument
#' can be left as NULL if category is species or language, since in these two
#' cases, the `img_df` is provided internally.
#' @param img_unit Integer, how many publications for one icon.
#' @return A ggplot2 object.
#' @importFrom rlang enquo as_name
#' @importFrom forcats fct_infreq fct_rev
#' @importFrom grid unit
#' @importFrom ggplot2 coord_flip theme element_blank
#' @importFrom purrr map_chr map
#' @importFrom dplyr row_number desc inner_join
#' @importFrom stringr str_to_sentence
#' @export
pubs_per_cat <- function(pubs, category, fill_by = NULL, n_top = NULL,
isotype = FALSE, img_df = NULL, img_unit = NULL) {
n <- reordered <- image <- NULL
category <- enquo(category)
if (!is.null(n_top)) {
top <- pubs %>%
count(!!category) %>%
filter(row_number(desc(n)) <= n_top) %>%
pull(!!category)
pubs <- pubs %>%
filter(!!category %in% top)
}
if (isotype) {
.pkg_check("magick")
.pkg_check("ggtextures")
image_paths <- NULL
if (quo_name(category) == "species") {
img_df <- species_img %>%
mutate(image_paths = map_chr(image_paths, system.file, package = "museumst"))
} else if (quo_name(category) == "language") {
img_df <- lang_img %>%
mutate(image_paths = map_chr(image_paths, system.file, package = "museumst"))
}
img_df <- img_df %>%
mutate(image_paths = map_chr(image_paths, normalizePath, mustWork = TRUE),
image = map(image_paths, magick::image_read))
pubs <- pubs %>%
inner_join(img_df, by = as_name(category))
if (is.null(img_unit)) {
img_unit <- round(nrow(pubs)/20)
message("img_unit not supplied. Using heuristic value ", img_unit)
}
pubs <- pubs %>%
mutate(reordered = fct_infreq(!!category) %>% fct_rev())
p <- ggplot(pubs, aes(reordered)) +
ggtextures::geom_isotype_bar(aes(image = image),
img_width = grid::unit(img_unit, "native"),
img_height = NULL,
nrow = 1, ncol = NA,
hjust = 0, vjust = 0.5)
} else {
pubs <- pubs %>%
mutate(reordered = fct_infreq(!!category) %>% fct_rev())
fill_by <- enquo(fill_by)
if (!is.null(fill_by)) {
p <- ggplot(pubs, aes(reordered, fill = !!fill_by)) + geom_bar()
} else {
p <- ggplot(pubs, aes(reordered)) + geom_bar()
}
}
p <- p +
scale_y_continuous(expand = expansion(mult = c(0, 0.05)),
breaks = breaks_pretty()) +
labs(y = "Number of publications", x = str_to_sentence(quo_name(category))) +
coord_flip() +
theme(panel.grid.minor = element_blank(), panel.grid.major.y = element_blank())
p
}
#' Plot number of publications at each location
#'
#' Plots points on a map, and the areas of the points are proportional to the
#' number of publications at the location. Can facet by some category like
#' method or species.
#'
#' World map will use the Robinson projection. European map uses the LAEA Europe
#' projection (EPSG:3035), and the American map uses the US National Atlas Equal Area
#' projection (EPSG:9311) and Alaska and Hawaii are moved and included. The option
#' to zoom in on Europe and the US is available because those are the two regions
#' of the world with the most publications and it's hard to see the data when
#' plotted on a world map.
#'
#' @inheritParams pubs_per_year
#' @param city_gc From geocode_inst_city
#' @param zoom Whether to plot the world map or only Europe (centered on Western
#' Europe and some Eastern European countries are partially cropped off) or only
#' the US or only northeast Asia.
#' @param ncol Number of columns in facetted plot.
#' @param label_insts Logical, whether to label institutions.
#' @param label_cities Logical, whether to label cities.
#' @param n_label Number of top cities to label, so the labels won't clutter the
#' plot.
#' @param per_year Logical, whether to do the count for each year separately.
#' This is for making animations with gganimate.
#' @param plot Whether to plot points, rectangular bins (bin2d), or hexagonal
#' bins (hexbin). The binned options are useful when there's overplotting.
#' @param bins Numeric vector of length 2, the number of bins for bin2d or hex
#' in the x and y directions. Ignored if plotting points.
#' @return A ggplot2 object
#' @importFrom rlang !!!
#' @importFrom dplyr left_join count semi_join vars
#' @importFrom ggplot2 geom_sf scale_size_area scale_color_viridis_c coord_sf
#' @importFrom scales breaks_width
#' @importFrom ggrepel geom_label_repel
#' @export
pubs_on_map <- function(pubs, city_gc,
zoom = c("world", "europe", "usa", "ne_asia"),
plot = c("point", "bin2d", "hexbin"),
facet_by = NULL, n_top = Inf,
ncol = 3, label_insts = TRUE, label_cities = FALSE,
n_label = 10,
per_year = FALSE, bins = c(70, 70)) {
zoom <- match.arg(zoom)
plot <- match.arg(plot)
.pkg_check("sf")
.pkg_check("rnaturalearth")
.pkg_check("rnaturalearthdata")
if (zoom == "usa") .pkg_check("urbnmapr")
if (plot == "hexbin") {
.pkg_check("hexbin")
}
if (per_year) {
.pkg_check("gganimate")
vars_count <- c("country", "state/province", "city", "year")
label_cities <- FALSE
label_insts <- FALSE
} else if (!is.null(facet_by)) {
vars_count <- c("country", "state/province", "city", "facets")
} else {
vars_count <- c("country", "state/province", "city")
}
# Filter continent before doing fct_lump_n
countries_use <- switch(zoom,
europe = europe_countries,
usa = c("USA", "US", "United States",
"United States of America", "Canada", "Mexico"),
ne_asia = c("China", "Taiwan", "Korea", "Japan", "Mongolia",
"Vietnam"),
world = NULL)
if (!is.null(countries_use)) {
pubs <- pubs |>
filter(country %in% countries_use)
}
if (!is.null(facet_by)) {
pubs <- pubs %>%
mutate(facets = fct_lump_n(!!sym(facet_by), n = n_top),
facets = fct_infreq(facets, ordered = TRUE))
if (!is.infinite(n_top)) {
pubs <- pubs %>%
mutate(facets = fct_relevel(facets, "Other", after = Inf))
}
}
inst_count <- pubs %>%
count(!!!syms(vars_count))
suppressWarnings(sf::st_crs(city_gc) <- 4326)
inst_count <- inst_count %>%
left_join(city_gc, by = c("country", "state/province", "city"))
country <- geometry <- NULL
if (zoom == "world") {
map_use <- rnaturalearth::ne_countries(scale = "small", returnclass = "sf")
# use Robinson projection
robin <- "+proj=robin +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m no_defs"
map_use <- sf::st_transform(map_use, robin)
inst_count <- inst_count %>%
mutate(geometry = sf::st_transform(geometry, robin))
# Work around an issue with gdal backward compatibility
suppressWarnings(sf::st_crs(one_world_small) <- 4326)
map_all <- sf::st_transform(one_world_small, robin)
} else if (zoom == "europe") {
map_use <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf")
crs_europe <- 3035
inst_count <- inst_count %>%
mutate(geometry = sf::st_transform(geometry, crs = crs_europe))
# project on European transformation
map_use <- sf::st_transform(map_use, crs = crs_europe)
suppressWarnings(sf::st_crs(one_world_medium) <- 4326)
map_all <- sf::st_transform(one_world_medium, crs = crs_europe)
} else if (zoom == "usa") {
map_use <- na_w_pop
crs_usa <- 9311
inst_count <- inst_count %>%
mutate(geometry = sf::st_transform(geometry, crs = crs_usa))
suppressWarnings(sf::st_crs(one_world_medium) <- 4326)
map_all <- sf::st_transform(one_world_medium, crs = crs_usa)
} else if (zoom == "ne_asia") {
map_use <- ne
map_all <- one_world_medium
}
n <- NULL
if (plot == "point") {
p <- ggplot() +
geom_sf(data = map_use, linetype = "dotted") +
geom_sf(data = map_all, fill = NA) +
scale_size_area(name = "Number of\npublications") +
theme(panel.border = element_blank(), axis.title = element_blank()) +
scale_color_viridis_c(name = "")
city2 <- city <- NULL
if (is.null(facet_by)) {
if (per_year) {
p <- p +
geom_sf(data = inst_count, aes(geometry = geometry, size = n, color = n,
group = city2),
alpha = 0.7, show.legend = "point")
} else {
p <- p +
geom_sf(data = inst_count, aes(geometry = geometry, size = n, color = n),
alpha = 0.7, show.legend = "point")
}
} else {
if (per_year) {
p <- p +
geom_sf(data = inst_count, aes(geometry = geometry, size = n,
group = city2,
color = facets),
alpha = 0.7, show.legend = "point")
} else {
inst_count_all <- inst_count %>%
group_by(country, `state/province`, city) %>%
summarize(n_all = sum(n)) %>%
left_join(inst_count[, c("country", "state/province", "city", "geometry")],
by = c("country", "state/province", "city"))
p <- p +
geom_sf(data = inst_count_all,
aes(geometry = geometry, size = n_all, color = "all"),
alpha = 0.5, color = "gray50", show.legend = "point") +
geom_sf(data = inst_count, aes(geometry = geometry, size = n,
color = n),
alpha = 0.7, show.legend = "point")
}
p <- p +
facet_wrap(vars(facets), ncol = ncol) #+
#theme(legend.position = "none")
}
if (zoom != "world") {
# Limit to that box
xylims_use <- switch (zoom,
europe = xylims,
usa = xylims_us,
ne_asia = xylims_ne
)
crs_use <- switch (zoom,
europe = crs_europe,
usa = crs_usa,
ne_asia = 4326
)
p <- p +
coord_sf(xlim = xylims_use[c("xmin", "xmax")], ylim = xylims_use[c("ymin", "ymax")],
crs = crs_use)
}
if (per_year) {
year <- NULL
p <- p +
gganimate::transition_states(year, state_length = 5, transition_length = 1) +
labs(title = "{closest_state}") +
gganimate::enter_fade() +
gganimate::exit_fade()
}
} else {
inst_count2 <- uncount(inst_count, n)
coords <- sf::st_coordinates(inst_count2$geometry)
colnames(coords) <- c("lon", "lat")
coords <- as_tibble(coords)
inst_count2 <- cbind(inst_count2, coords)
p <- ggplot() +
geom_sf(data = map_use, linetype = "dotted") +
geom_sf(data = map_all, fill = NA) +
#scale_fill_distiller(palette = "Blues", direction = 1) +
scale_fill_viridis_c() +
theme(panel.border = element_blank(), axis.title = element_blank())
if (zoom != "world") {
# Limit to that box
xylims_use <- if (zoom == "europe") xylims else xylims_us
crs_use <- if (zoom == "europe") crs_europe else crs_usa
p <- p +
coord_sf(xlim = xylims_use[c("xmin", "xmax")], ylim = xylims_use[c("ymin", "ymax")],
crs = crs_use)
}
if (plot == "hexbin") {
p <- p +
geom_hex(data = inst_count2, aes(lon, lat), bins = bins)
} else if (plot == "bin2d") {
p <- p +
geom_bin2d(data = inst_count2, aes(lon, lat), bins = bins)
}
if (!is.null(facet_by)) {
p <- p +
facet_wrap(vars(facets), ncol = ncol)
}
}
if (!per_year) {
if (label_cities) {
if (!is.null(facet_by)) {
inst_count <- inst_count %>%
group_by(facets)
}
inst_count <- inst_count %>%
mutate(city_rank = row_number(desc(n)),
city_label = case_when(city_rank <= n_label ~ city,
TRUE ~ ""))
p <- p +
geom_label_repel(data = inst_count, aes(geometry = geometry, label = city_label),
alpha = 0.7, stat = "sf_coordinates", max.overlaps = Inf)
} else if (label_insts) {
# Find out what the "top" institutions are
inst_sn <- pubs %>%
count(!!!syms(c(vars_count, "short_name")), name = "nn")
inst_count <- inst_count %>%
left_join(inst_sn, by = vars_count)
if (!is.null(facet_by)) {
inst_count <- inst_count %>%
group_by(facets)
}
inst_count <- inst_count %>%
mutate(inst_rank = row_number(desc(nn)),
inst_label = case_when(inst_rank <= n_label ~ short_name,
TRUE ~ ""))
p <- p +
geom_label_repel(data = inst_count, aes(geometry = geometry, label = inst_label),
alpha = 0.7, stat = "sf_coordinates", max.overlaps = Inf)
}
}
p
}
#' Plot per capita data as choropleth or bar plot
#'
#' For the entire world, Europe (for European countries tend to be smaller) and
#' states within the US.
#'
#' @param pubs A data frame with one row per publication and columns country and
#' for the US, also a column "state/province".
#' @param zoom Whether to plot the world map or only Europe (centered on Western
#' Europe and some Eastern European countries are partially cropped off) or only
#' the US.
#' @param plot Whether to plot choropleth or bar plot.
#' @param label_states If plotting the US, whether to label the states.
#' @return A ggplot2 object.
#' @importFrom ggplot2 scale_fill_distiller geom_col geom_sf_text
#' @importFrom stringr str_length
#' @export
pubs_per_capita <- function(pubs, zoom = c("world", "europe", "usa"),
plot = c("choropleth", "bar"),
label_states = TRUE) {
.pkg_check("sf")
.pkg_check("rnaturalearth")
.pkg_check("rnaturalearthdata")
zoom <- match.arg(zoom)
plot <- match.arg(plot)
if (zoom != "usa") {
if (zoom == "world") {
map_use <- rnaturalearth::ne_countries(scale = "small", returnclass = "sf")
if (plot == "choropleth") {
robin <- "+proj=robin +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m no_defs"
map_use <- sf::st_transform(map_use, robin)
map_all <- sf::st_transform(one_world_small, robin)
}
} else {
map_use <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf")
if (plot == "choropleth") {
map_use <- sf::st_transform(map_use, 3035)
map_all <- sf::st_transform(one_world_medium, 3035)
}
}
country <- per_capita <- pop_est <- country_full <- n <- `state/province` <-
`2019` <- state_name <- area <- NULL
pubs_count <- pubs %>%
mutate(country_full = case_when(country == "USA" ~ "United States",
country == "UK" ~ "United Kingdom",
TRUE ~ country)) %>%
count(country_full, country)
if (zoom == "europe") {
pubs_count <- pubs_count %>%
filter(country %in% europe_countries)
}
map_use <- map_use %>%
left_join(pubs_count, by = c("name" = "country_full")) %>%
mutate(per_capita = n/pop_est)
if (plot == "bar") {
map_use <- map_use %>%
filter(!is.na(per_capita)) %>%
mutate(area = fct_reorder(country, per_capita))
}
} else {
map_use <- na_w_pop
map_all <- sf::st_transform(one_world_medium, 9311)
pubs_count <- pubs %>%
filter(country %in% c("USA", "US", "United States", "United States of America")) %>%
count(`state/province`)
# Convert state names to abbreviations
pubs_count <- pubs_count %>%
mutate(`state/province` = case_when(
`state/province` %in% map_use$postal ~ `state/province`,
TRUE ~ map_use$postal[match(`state/province`, map_use$state_name)]
))
map_use <- map_use %>%
left_join(pubs_count,
by = c("postal" = "state/province")) %>%
mutate(per_capita = n/`2019`)
if (plot == "bar") {
map_use <- map_use %>%
filter(!is.na(per_capita)) %>%
mutate(area = fct_reorder(state_name, per_capita))
}
}
if (plot == "choropleth") {
p <- ggplot(map_use) +
geom_sf(aes(fill = log10(per_capita)), linetype = "dotted") +
geom_sf(data = map_all, fill = NA) +
scale_fill_distiller(palette = "Blues", na.value = "white", direction = 1,
name = "# pub.\nper capita\n(log10)") +
theme(panel.border = element_blank(), axis.title = element_blank())
if (zoom != "world") {
# Limit to that box
xylims_use <- if (zoom == "europe") xylims else xylims_us
crs_use <- if (zoom == "europe") 3035 else 9311
p <- p +
coord_sf(xlim = xylims_use[c("xmin", "xmax")], ylim = xylims_use[c("ymin", "ymax")],
crs = crs_use)
}
if (zoom == "usa" && label_states) {
# Label the states
state_labels <- urbnmapr::get_urbn_labels("states", sf = TRUE) %>%
filter(!state_abbv %in% c("AK", "HI"))
p <- p +
geom_sf_text(data = state_labels, aes(geometry = geometry, label = state_abbv))
}
} else {
area_lab <- if (zoom == "usa") "state" else "country"
if (zoom == "europe") {
map_use <- map_use %>%
filter(country %in% europe_countries)
}
p <- ggplot(map_use, aes(per_capita, area)) +
geom_col() +
scale_x_continuous(expand = expansion(mult = c(0, 0.05))) +
labs(y = area_lab)
}
p
}
#' Plot heatmap to show relationship between two categorical variables
#'
#' For instance, are papers for certain techniques more likely to be in certain
#' journals? Is there an association between species and journal? This is just
#' for visualization. Use `fisher.test` to see if it's significant. I still wonder
#' if I should rewrite this with ggplot2, which is more work than base R in this
#' case.
#'
#' @inheritParams pubs_per_year
#' @param row_var Variable for rows of the heatmap. Tidyeval is supported.
#' @param col_var Variable for columns of the heatmap.
#' @param ... Extra arguments to pass to `heatmap`
#' @return A base R heatmap is plotted to the current device.
#' @importFrom dplyr pull
#' @importFrom tidyr pivot_wider
#' @importFrom stats heatmap
#' @export
cat_heatmap <- function(pubs, row_var, col_var, ...) {
rv <- enquo(row_var)
cv <- enquo(col_var)
mat1 <- pubs %>%
count(!!rv, !!cv) %>%
pivot_wider(names_from = !!cv, values_from = "n")
method_mat <- as.matrix(mat1[,-1])
rownames(method_mat) <- pull(mat1, !!rv)
method_mat[is.na(method_mat)] <- 0
heatmap(method_mat, ...)
}
#' Plot histogram for each value of a logical variable
#'
#' Plots 3 histograms showing the number of publications per year for TRUE, FALSE,
#' and NA, with the histogram overlaid on top of a translucent one for all
#' values. There's one facet per row so it's easy to compare how things change
#' with time.
#'
#' @inheritParams pubs_per_year
#' @param col_use Which logical variable to plot. Tidyeval is supported.
#' @return A ggplot2 object
#' @importFrom ggplot2 geom_histogram facet_grid scale_fill_brewer
#' @export
hist_bool <- function(pubs, col_use, binwidth = 365, preprints = TRUE) {
date_published <- journal <- v <- NULL
col_use <- enquo(col_use)
if (!preprints) {
pubs <- pubs %>%
filter(!journal %in% c("bioRxiv", "arXiv"))
}
pubs <- pubs %>%
mutate(v = !!col_use)
ggplot(pubs, aes(date_published)) +
geom_histogram(aes(fill = 'all'), alpha = 0.7, fill = "gray70",
data = select(pubs, -v), binwidth = binwidth) +
geom_histogram(aes(fill = v), binwidth = binwidth) +
facet_grid(rows = vars(v)) +
scale_y_continuous(breaks = breaks_pretty(), expand = expansion(c(0, 0.05))) +
scale_x_date(breaks = breaks_pretty(10)) +
scale_fill_brewer(palette = "Set1", na.value = "gray50") +
theme(panel.grid.minor = element_blank(), legend.position = "none") +
labs(x = "date published")
}
#' Plot outlines of histogram for a logical variable
#'
#' Kind of like `hist_bool`, but instead of plotting TRUE, FALSE, and NA in 3
#' separate facets, it plots them as an outline overlaid on a translucent
#' histogram for all values. This is useful when facetting with another categorical
#' variable, such as programming language.
#'
#' @inheritParams hist_bool
#' @inheritParams pubs_per_year
#' @inheritParams pubs_on_map
#' @importFrom tidyr complete
#' @importFrom ggplot2 scale_x_continuous
#' @importFrom rlang quo_name
#' @importFrom stringr str_to_sentence
#' @export
hist_bool_line <- function(pubs, col_use, facet_by = NULL, ncol = 3, n_top = Inf,
binwidth = 365, preprints = TRUE) {
date_published <- journal <- v <- NULL
col_use <- enquo(col_use)
if (!preprints) {
pubs <- pubs %>%
filter(!journal %in% c("bioRxiv", "arXiv"))
}
pubs <- pubs %>%
mutate(v = !!col_use)
if (!is.null(facet_by)) {
pubs <- pubs %>%
mutate(facets = fct_lump_n(!!sym(facet_by), n = n_top,
ties.method = "first"),
facets = fct_infreq(facets))
if ("Other" %in% pubs$facets) {
pubs <- pubs %>%
mutate(facets = fct_relevel(facets, "Other", after = Inf))
}
pubs <- pubs %>% group_by(v, facets)
} else {
pubs <- pubs %>% group_by(v)
}
p <- ggplot(pubs, aes(date_published, after_stat(count))) +
geom_histogram(aes(fill = v), alpha = 0.7, binwidth = binwidth) +
#geom_line(aes(color = v), stat = "bin", binwidth = binwidth) +
scale_y_continuous(breaks = breaks_pretty(), expand = expansion(c(0, 0.05))) +
scale_x_date(breaks = breaks_pretty(10)) +
scale_fill_brewer(name = str_to_sentence(quo_name(col_use)),
palette = "Set1", na.value = "gray50") +
theme(panel.grid.minor = element_blank(), legend.position = "top") +
labs(y = "count", x = "date published")
if (!is.null(facet_by)) {
p <- p + facet_wrap(~ facets, ncol = ncol)
}
p
}
#' Test whether something is associated with time
#'
#' Fits a logistic regression model with glm to use year to predict proportion
#' of a logical variable is TRUE, and tests whether beta is 0.
#'
#' @inheritParams hist_bool
#' @return A glm object is returned invisibly. The summary is printed to screen
#' @importFrom dplyr group_by summarize
#' @importFrom stats glm
#' @export
test_year_bool <- function(pubs, col_use, preprints = TRUE) {
journal <- NULL
col_use <- enquo(col_use)
if (!preprints) {
pubs <- pubs %>%
filter(!journal %in% c("bioRxiv", "arXiv"))
}
pubs <- pubs %>%
mutate(bool_use = !!col_use)
out <- glm(bool_use ~ date_published, data = pubs, family = "binomial")
print(summary(out))
invisible(out)
}
#' Prequel vs current binned over time
#'
#' Plots freqpoly for prequel vs current, with an option to start both at the
#' date when the first publication in the category appeared. The point here is
#' not to compare to the distribution of everything, like in hist_bool, or to
#' compare when things started, like when I plotted a histogram of different
#' methods over time, but to compare the height of the histograms and how steeply
#' they rise and fall. So I think freqpoly may be better than blocky histograms
#' for this purposes.
#'
#' @inheritParams pubs_per_year
#' @inheritParams hist_bool
#' @param since_first Logical. Whether to plot days after the first publication
#' appeared.
#' @param do_smooth Logical. Whether to plot smoothed curve for the trend rather
#' than freqpoly.
#' @param smooth_method Method of smoothing, passed to \code{geom_smooth}.
#' @param smooth_formula Formula of smoothing, passed to \code{geom_smooth}.
#' @return A ggplot2 object
#' @importFrom ggplot2 scale_color_brewer geom_freqpoly scale_fill_discrete
#' @importFrom ggplot2 after_stat
#' @export
era_freqpoly <- function(pubs, col_use, since_first = FALSE, binwidth = 365,
preprints = TRUE, do_smooth = FALSE,
smooth_method = NULL, smooth_formula = NULL) {
journal <- date_published <- days_since_first <- NULL
col_use <- enquo(col_use)
if (!preprints) {
pubs <- pubs %>%
filter(!journal %in% c("bioRxiv", "arXiv"))
}
if (since_first) {
df_plt <- pubs %>%
group_by(!!col_use) %>%
mutate(days_since_first = as.numeric(date_published - min(date_published)))
breaks_use <- seq(-binwidth, max(df_plt$days_since_first), by = binwidth)
df_plt <- df_plt %>%
mutate(date_bin = cut(days_since_first, breaks_use, right = TRUE,
labels = FALSE)) %>%
group_by(!!col_use, date_bin, .drop = FALSE) %>%
count() %>%
mutate(x = breaks_use[date_bin+1],
is_last = x == date_bin[which.max(date_bin)])
p <- ggplot(df_plt, aes(x, n, color = !!col_use)) +
labs(x = "Days since the first publication")
} else {
df_plt <- pubs %>%
mutate(x = cut(date_published, paste(binwidth, "days"), right = TRUE,
include.lowest = TRUE)) %>%
group_by(!!col_use, x, .drop = FALSE) %>%
count() %>%
mutate(is_last = x == tail(levels(x), 1),
x = as.Date(x))
p <- ggplot(df_plt, aes(x, n, color = !!col_use)) +
labs(x = "Date published")
}
if (do_smooth) {
p <- p +
geom_smooth(data = df_plt %>% filter(!is_last), se = FALSE,
method = smooth_method, formula = smooth_formula) +
geom_point(aes(shape = is_last)) +
scale_shape_manual(values = c(16, 4))
} else {
p <- p +
geom_line(data = df_plt %>% filter(!is_last)) +
geom_point(data = df_plt %>% filter(is_last), shape = 4)
}
p <- p +
scale_color_brewer(name = str_to_sentence(quo_name(col_use)),
palette = "Set1", na.value = "gray50") +
labs(y = "Number of publications")
p
}
# From https://stackoverflow.com/a/44090582/8916916
gtable_stack <- function(g1, g2) {
g1$grobs <- c(g1$grobs, g2$grobs)
g1$layout <- transform(g1$layout, z= z-max(z), name="g2")
g1$layout <- rbind(g1$layout, g2$layout)
g1
}
gtable_select <- function (x, ...) {
matches <- c(...)
x$layout <- x$layout[matches, , drop = FALSE]
x$grobs <- x$grobs[matches]
x
}
#' Color facet strips by a variable
#'
#' @param p A ggplot object for the original facetted plot.
#' @param strip_color Categorical column in the data for the original plot to
#' use to color the facet strips. Tidyeval is used here.
#' @param palette A character vector of colors. Can be named to assign each
#' color to a value in `strip_color`.
#' @return Nothing, the plot is printed to device.
#' @importFrom ggplot2 ggplotGrob
#' @importFrom grid grid.newpage grid.draw
#' @export
plot_facets_color <- function(p, strip_color, palette) {
strip_color <- enquo(strip_color)
dummy <- ggplot(p$data, p$mapping) +
facet_wrap(vars(!!!p$facet$params$facets),
ncol = p$facet$params$ncol,
labeller = p$facet$params$labeller) +
geom_rect(aes(fill = !!strip_color), xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=Inf) +
theme_minimal() +
scale_fill_manual(values = palette)
if ("legend.position" %in% names(p$theme)) {
dummy <- dummy +
theme(legend.position = p$theme$legend.position)
}
g1 <- ggplotGrob(p)
g2 <- ggplotGrob(dummy)
# move dummy panels one cell up
panels <- grepl(pattern="panel", g2$layout$name)
strips <- grepl(pattern="strip-t", g2$layout$name)
g2$layout$t[panels] <- g2$layout$t[panels] - 1
g2$layout$b[panels] <- g2$layout$b[panels] - 1
new_strips <- gtable_select(g2, panels | strips)
# stack new strips on top of gtable
# ideally you'd remove the old strips, for now they're just covered
new_plot <- gtable_stack(g1, new_strips)
grid.newpage()
grid.draw(new_plot)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.