text2sdg_joss: Detecting UN Sustainable Development Goals in Text

Documented in crosstab_sdg

#' Compare query systems and SDGs
#'
#' \code{crosstab_sdg} calculates cross tables (aka contingency tables) of SGSs or systems across hits identified via \link{detect_sdg}.
#'
#' \code{crosstab_sdg} determines correlations between either query systems or SDGs. The respectively other dimension will be ignored. Note that correlations between SDGs may vary between query systems.
#'
#' @param hits \code{data frame} as returned by \code{\link{detect_sdg}}. Must include columns \code{document}, \code{sdg}, \code{system}, and \code{hit}.
#' @param compare \code{character} specifying whether systems or SDGs should be cross tabulated.
#' @param systems \code{character} vector specifying the query systems to be cross tabulated. Values must be available in the \code{system} column of \code{hits}. \code{systems} of length greater 1 result, by default, in a stacked barplot. Defaults to \code{NULL} in which case available values are retrieved from \code{hits}.
#' @param sdgs \code{numeric} vector with integers between 1 and 17 specifying the SDGs to be cross tabluated. Values must be available in the \code{sdg} column of \code{hits}. Defaults to \code{NULL} in which case available values are retrieved from \code{hits}.
#'
#' @return \code{matrix} showing correlation coefficients for all pairs of query systems (if \code{compare = "systems"}) or SDGs (if \code{compare = "SDGs"}).
#'
#' @examples
#'
#' \donttest{
#' # run sdg detection
#' hits <- detect_sdg(projects)
#'
#' # create cross table of systems
#' crosstab_sdg(hits)
#'
#' # create cross table of systems
#' crosstab_sdg(hits, compare = "sdgs")
#' }
#' @export
crosstab_sdg <- function(hits,
                         compare = c("systems", "sdgs"),
                         systems = NULL,
                         sdgs = NULL) {


  # check if columns present
  required_columns = c("document", "sdg", "system")
  if(any(!required_columns %in% names(hits))){
    missing = required_columns[!required_columns %in% names(hits)]
    stop(paste0("Data object must include columns [", paste0(missing, collapse=", "),"]."))
  }

  # replace NULLs
  if(is.null(systems)) systems = unique(hits$system)
  if(is.null(sdgs)) sdgs = unique(stringr::str_extract(hits$sdg,"[:digit:]{2}") %>% as.numeric())

  # check compare
  if(!compare[[1]] %in% c("systems", "sdgs")){
    stop("compare must be either 'systems' or 'sdgs'.")
  }

  # check sdg and system
  if(any(!sdgs %in% 1:17)) stop("sdgs can only take numbers in 1:17.")
  if(any(!systems %in% hits$system)){
    stop(paste0("Data only contains systems [",paste0(unique(hits$system), collapse = ", "),"]."))
  }
  if(compare[[1]] == "systems") {
    if(length(unique(systems)) < 2){
      stop("Argument systems must have, at least, two c values.")
    }
    if(any(!systems %in% hits$system)){
      stop(paste0("Values [",paste0(system[!systems %in% hits$system],collapse=", "),"] for argument systems not found in column system of data."))
    }
  } else {
    if(length(unique(sdgs)) < 2) {
      stop("Argument sdgs must have, at least, two distinct values.")
    }
  }


  # handle duplicates
  hits <- hits %>% dplyr::distinct(document, sdg, system)

  # handle selected sdgs
  sdgs = paste0("SDG-", ifelse(sdgs < 10, "0", ""),sdgs) %>% sort()

  # prepare system labels
  labels = c("aurora" = "Aurora",
             "elsevier" = "Elsevier",
             "siris" = "SIRIS",
             "sdsn" = "SDSN",
             "ontology" = "Ontology")

  # filter and process systems
  hits = hits %>%
    dplyr::filter(sdg %in% sdgs,
                  system %in% systems) %>%
    dplyr::mutate(system = labels[system])

  # abort if no hits left
  if(nrow(hits) == 0) {
    stop(paste0("There are no hits matching the combination of sdgs = [", paste0(sdgs, collapse = ", "), "] and systems = [", paste0(systems, collapse = ", ")))
  }

  # do systems
  if(compare[[1]] == "systems") {

      # do something
      phi_dat <- tidyr::expand_grid(document = 1:length(levels(hits$document)), system = labels[systems], sdg = sdgs) %>%
        dplyr::mutate(document = as.factor(document)) %>%
        dplyr::left_join(hits %>%
                           dplyr::mutate(hit = 1) %>%
                           dplyr::select(document, system, sdg, hit),
                         by = c("document", "system", "sdg")) %>%
        dplyr::mutate(hit = dplyr::if_else(is.na(hit), 0, 1)) %>%
        dplyr::distinct() %>%
        dplyr::arrange(document, sdg) %>%
        tidyr::pivot_wider(names_from = system, values_from = hit) %>%
        `[`(,-(1))

      # do something
      correlations <- phi_dat %>%
        dplyr::select(-sdg) %>%
        cor(.) %>% suppressWarnings()

      # reorder
      correlations <-
        correlations[labels[labels %in% rownames(correlations)],
                     labels[labels %in% colnames(correlations)]]

  } else {

    # do something
    phi_dat <- tidyr::expand_grid(document = 1:length(levels(hits$document)), system = labels[systems], sdg = sdgs) %>%
      dplyr::mutate(document = as.factor(document)) %>%
      dplyr::left_join(hits %>%
                         dplyr::mutate(hit = 1),
                         dplyr::select(document, system, sdg, hit),
                       by = c("document", "system", "sdg")) %>%
      dplyr::mutate(hit = dplyr::if_else(is.na(hit), 0, 1)) %>%
      dplyr::distinct() %>%
      dplyr::arrange(document, sdg) %>%
      tidyr::pivot_wider(names_from = sdg, values_from = hit) %>%
      `[`(,-(1))

    # run correlations
    correlations <- phi_dat %>%
      dplyr::select(-system) %>%
      cor(.) %>% suppressWarnings()

  }

  # out
  correlations

}