text2sdg_joss: Detecting UN Sustainable Development Goals in Text

Documented in detect_any

#' Detect SDGs in text with own query system
#'
#' \code{detect_any} identifies SDGs in text using user provided query systems. Works like \code{\link{detect_sdg}} but uses a user specified query system instead of an existing one like \code{\link{detect_sdg}} does.
#'
#' @param text \code{character} vector or object of class \code{tCorpus} containing text in which SDGs shall be detected.
#' @param queries a data frame that must contain the following variables: a \code{character} vector with queries, a \code{integer} vector specifying which SDG each query maps to (values must be between 1 and 17) and a \code{character} with one unique value specifying the name of the used query system (can be anything as long as it is unique).
#' @param sdgs \code{numeric} vector with integers between 1 and 17 specifying the sdgs to identify in \code{text}. Defaults to \code{1:17}.
#' @param output \code{character} specifying the level of detail in the output. The default \code{"features"} returns a \code{tibble} with one row per matched query, include a variable containing the features of the query that were matched in the text. By contrast, \code{"docs"} returns an aggregated \code{tibble} with one row per matched sdg, without information on the features.
#' @param verbose \code{logical} specifying whether messages on the function's progress should be printed.
#'
#' @return The function returns a \code{tibble} containing the SDG hits found in the vector of documents. Depending on the value of \code{output} the tibble will contain all or some of the following columns:
#' \describe{
#'  \item{document}{Index of the element in \code{text} where match was found. Formatted as a factor with the number of levels matching the original number of documents.}
#'  \item{sdg}{Label of the SDG found in document.}
#'  \item{systems}{The name of the query system that produced the match.}
#'  \item{query_id}{Index of the query within the query system that produced the match.}
#'  \item{features}{Concatenated list of words that caused the query to match.}
#'  \item{hit}{Index of hit for a given system.}
#' }
#'
#' @examples
#' \donttest{
#' # create data frame with query system
#' my_queries <- tibble::tibble(system = "my_system",
#'                              query = c("theory",
#'                                        "analysis OR analyses OR analyzed",
#'                                        "study AND hypothesis"),
#'                              sdg = c(1,2,2))
#'
#' # run sdg detection with own query system
#' hits <- detect_any(projects, my_queries)
#'
#' # run sdg detection for sdg 2 only
#' hits <- detect_any(projects, my_queries, sdgs = 2)
#'}
#'
#' @export

detect_any <- function(text, queries, sdgs = NULL, output = c("features","docs"), verbose = TRUE) {


  # make corpus
  if(class(text)[1] == "character"){
    corpus = make_corpus(text)
  } else if(class(text)[1] == "tCorpus"){
    corpus = text
  } else {
    stop("Argument text must be either class character or corpustools::tCorpus.")
  }

  # replace NULLs
  if(is.null(sdgs)) sdgs = unique(stringr::str_extract(queries$sdg,"[:digit:]+") %>% as.numeric())

  #check that selected subset of sdgs is in queries
  if(all(!sdgs %in% unique(queries$sdg))) {stop("At least one of the selected SDGs needs to be present in the queries data frame.")}

  #check query data
  if (!all(names(queries) %in% c("sdg", "query", "system"))) {
    stop(paste0("Variables 'sdg', 'query' and 'system' must be present in quries dataset."))
  }

  #handle selected SDGs
  if(any(!sdgs %in% 1:17)) stop("show_sdg can only take numbers in 1:17.")
  sdgs = paste0("SDG-", ifelse(sdgs < 10, "0", ""),sdgs) %>% sort()



  #filter queries based on selected sdgs
  queries <- queries %>%
    dplyr::mutate(sdg = paste0("SDG-", ifelse(queries$sdg < 10, "0", ""),queries$sdg),
                  query_id = 1:dplyr::n()) %>%
    dplyr::filter(sdg %in% sdgs)


  # get hits
  hits = search_corpus(corpus, queries$query)

  #return empty tibble if no SDGs were detected
  if(nrow(hits) == 0) return(tibble::tibble())

  # process hits
  hits = hits %>%
    dplyr::mutate(sdg = queries$sdg[as.numeric(stringr::str_extract(hits$code, '[:digit:]+'))],
                  query_id = as.numeric(stringr::str_extract(hits$code, '[:digit:]+')),
                  system = (queries %>% dplyr::pull(system, query_id))[query_id]) %>%
    dplyr::rename(document = doc_id) %>%
    dplyr::select(document, sdg, system, query_id, feature) %>%
    dplyr::group_by(document, sdg, system, query_id) %>%
    dplyr::summarize(features = toString(unique(feature %>% stringr::str_to_lower()))) %>%
    dplyr::group_by(system) %>%
    dplyr::mutate(hit = 1:dplyr::n()) %>%
    dplyr::ungroup() %>%
    suppressMessages()

    # reduce if requested
  if(output[1] == "documents"){
    v = hits %>%
      dplyr::group_by(document, sdg, system) %>%
      dplyr::summarize(hits = dplyr::n()) %>%
      dplyr::ungroup()
  } else {
    if(output[1] != "features") stop('Argument output must be "features" or "docs"')
  }

  #convert document to factor for downstream functions
  hits <- hits %>%
    dplyr::mutate(document = factor(document, levels = 1:length(corpus$doc_id_levels)))

  #output
  hits %>%
    dplyr::arrange(document, sdg, system)


}