Nothing
#' Plot distributions of SDGs identified in text
#'
#' \code{plot_sdg} creates a (stacked) barplot of the frequency distribution of SDGs identified via \link{detect_sdg} or \link{detect_sdg_systems}.
#'
#' The function is built using \code{\link[ggplot2]{ggplot}} and can thus be flexibly extended. See examples.
#'
#' @param hits \code{data frame} as returned by \code{\link{detect_sdg}} or \code{\link{detect_sdg_systems}}. Must include columns \code{sdg} and \code{system}.
#' @param systems \code{character} vector specifying the query systems to be visualized. Values must be available in the \code{system} column of \code{hits}. \code{systems} of length greater 1 result, by default, in a stacked barplot. Defaults to \code{NULL} in which case available values are retrieved from \code{hits}.
#' @param sdgs \code{numeric} vector with integers between 1 and 17 specifying the SDGs to be visualized. Values must be available in the \code{sdg} column of \code{hits}. Defaults to \code{NULL} in which case available values are retrieved from \code{hits}.
#' @param normalize \code{character} specifying whether results should be presented as frequencies (\code{normalize = "none"}), the default, or whether the frequencies should be normalized using either the total frequencies of each system (\code{normalize = "systems"}) or the total number of documents (\code{normalize = "documents"}).
#' @param color \code{character} vector used to color the bars according to systems. The default, \code{"unibas"}, uses three colors of University of Basel's corporate design. Alternatively, \code{color} must specified using \link{color} names or color hex values. \code{color} will be interpolated to match the length of \code{systems}.
#' @param sdg_titles \code{logical} specifying whether the titles of the SDG should added to the axis annotation.
#' @param remove_duplicates \code{logical} specifying the handling of multiple hits of the same SDG for a given document and system. Defaults to \code{TRUE} implying that no more than one hit is counted per SDG, system, and document.
#' @param ... arguments passed to \code{\link[ggplot2]{geom_bar}}.
#'
#' @return The function returns a \code{\link[ggplot2]{ggplot}} object that can either be stored in an object or printed to produce the plot.
#'
#' @examples
#' \donttest{
#' # run sdg detection
#' hits <- detect_sdg_systems(projects)
#'
#' # create barplot
#' plot_sdg(hits)
#'
#' # create barplot with facets
#' plot_sdg(hits) + ggplot2::facet_wrap(~system)
#' }
#'
#' @export
plot_sdg <- function(hits,
systems = NULL,
sdgs = NULL,
normalize = "none",
color = "unibas",
sdg_titles = FALSE,
remove_duplicates = TRUE,
...) {
# check if columns present
required_columns <- c("sdg", "system")
if (any(!required_columns %in% names(hits))) {
missing <- required_columns[!required_columns %in% names(hits)]
stop(paste0("Data object must include columns [", paste0(missing, collapse = ", "), "]."))
}
# replace NULLs
if (is.null(systems)) {
systems <- hits %>%
dplyr::arrange(system) %>%
dplyr::pull(system) %>%
as.character() %>%
unique()
}
if (is.null(sdgs)) sdgs <- unique(stringr::str_extract(hits$sdg, "[:digit:]{2}") %>% as.numeric())
# check sdg and system
if (any(!sdgs %in% 1:17)) stop("sdgs can only take numbers in 1:17.")
if (any(!systems %in% hits$system)) {
stop(paste0("Data object only contains systems [", paste0(unique(hits$system), collapse = ", "), "]."))
}
# handle duplicates
duplicates <- hits %>%
dplyr::select(document, sdg, system) %>%
duplicated()
if (any(duplicates) & remove_duplicates == TRUE) {
hits <- hits %>% dplyr::filter(!duplicates)
message(paste0(sum(duplicates), " duplicate hits removed. Set remove_duplicates = FALSE to retain duplicates."))
}
# extract number of documents
n_documents <- length(levels(hits$document))
# handle colors
if (color[1] == "unibas") {
color <- c("#D2EBE9", "#A5D7D2", "#46505A")
}
if (length(color) != length(systems)) {
color <- grDevices::colorRampPalette(color)(length(systems))
}
# handle sdgs
sdgs <- paste0("SDG-", ifelse(sdgs < 10, "0", ""), sdgs) %>% sort()
# prepare data
hits <- hits %>%
dplyr::filter(
sdg %in% sdgs,
system %in% systems
) %>%
dplyr::mutate(
sdg = factor(sdg, levels = sdgs),
system = factor(system)
)
# change to titles
if (sdg_titles == TRUE) {
sdg_titles <- aurora_queries %>%
dplyr::mutate(sdg_title = stringr::str_to_title(sdg_title)) %>%
dplyr::select(sdg, sdg_title) %>%
unique() %>%
dplyr::arrange(sdg) %>%
dplyr::pull(sdg_title, sdg)
hits <- hits %>%
dplyr::mutate(sdg = factor(sdg_titles[sdg], levels = sdg_titles))
}
# get frequencies
hits <- hits %>%
dplyr::group_by(system, sdg) %>%
dplyr::summarize(n = dplyr::n()) %>%
dplyr::ungroup()
y_label <- "Frequency"
# transform to proportions
if (normalize[1] != "none") {
if (normalize[1] == "systems") {
hits <- hits %>%
dplyr::group_by(system) %>%
dplyr::mutate(n = n / sum(n)) %>%
dplyr::ungroup()
} else if (normalize[1] == "documents") {
hits <- hits %>%
dplyr::group_by(system) %>%
dplyr::mutate(n = n / n_documents) %>%
dplyr::ungroup()
} else {
stop('Argument normalize must the "none", "systems", or "documents".')
}
y_label <- "Proportion"
}
# generate plot
plot <- hits %>%
ggplot2::ggplot(mapping = ggplot2::aes(x = sdg, y = n, fill = system)) +
ggplot2::geom_bar(..., stat = "identity") +
ggplot2::scale_x_discrete(drop = FALSE) +
ggplot2::scale_fill_manual(name = "Query\nsystem", values = color) +
ggplot2::theme_minimal() +
ggplot2::theme(
axis.text.x = ggplot2::element_text(angle = 45, hjust = 1),
axis.title.x = ggplot2::element_blank()
) +
ggplot2::labs(y = y_label)
# output plot
plot
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.