R/data_report.R
In manydata: A Portal for Global Governance Data

Documented in data_contrast data_evolution data_source open_codebook

#' @name report
#' @title Set of data structure exploration functions for users
#' @description The report family of functions allows users
#' to quickly get information about and compare several
#' aspects of  a package in the many packages universe,
#' and its' databases and datasets.
#' @param pkg character string of the many package to report data on.
#' Required input.
#' @param database vector of character strings of the many package to
#' report data on a specific database in a many package
#' If NULL, the function returns a summary of all databases in the many package
#' NULL by default for `data_source()` and `data_contrast()`.
#' @param dataset character string of the many package to
#' report data on a specific
#' dataset in a specific database of a many package
#' If NULL and database is specified, returns database level metadata.
#' NULL by default for `data_source()` and `data_contrast()`.
NULL

#' @name report
#' @details `data_source()` displays names of the database/datasets and
#' source material of data in a many package.
#' @importFrom purrr map
#' @importFrom stringr str_to_title
#' @return A dataframe with the data sources
#' @examples
#' \donttest{
#' data_source(pkg = "manydata")
#' }
#' @export
data_source <- function(pkg, database = NULL, dataset = NULL) {
  pkg_path <- find.package(pkg)
  data_path <- file.path(pkg_path, "data")
  #selcts all dbs
  if (!is.null(database)) {
    # Database specified, dataset unspecified
    if (is.null(dataset)) {
      tmp_env <- new.env()
      lazyLoad(file.path(data_path, "Rdata"), envir = tmp_env)
      dbs <-  mget(ls(tmp_env), tmp_env)
      dbs <- dbs[database]
      outlist <- list()
      for (i in c(seq_len(length(dbs)))) {
        assign(paste0("tabl", i), rbind(purrr::map(dbs[[i]], function(x)
          paste0(utils::capture.output(
            print(attr(x, which = "source_bib"))), sep = "", collapse = "")))
          )
        assign(paste0("tabl", i), t(get(paste0("tabl", i))))
        tmp <- get(paste0("tabl", i))
        colnames(tmp) <- "Reference"
        assign(paste0("tabl", i), tmp)
        #List output
        outlist[i] <- list(get(paste0("tabl", i)))
      }
      names(outlist) <- names(dbs)
      # Redefine outlist class to list
      class(outlist) <- "listof"
      return(outlist)
    } else {
      # Database and dataset specified
      tmp_env <- new.env()
      lazyLoad(file.path(data_path, "Rdata"), envir = tmp_env)
      db <- get(database, envir = tmp_env)
      ds <- db[[dataset]]
      tabl <- data.frame(Reference = paste0(utils::capture.output(
        print(attr(ds, which = "source_bib"))), sep = "", collapse = "")
      )
      tmp <- as.data.frame(tabl)
      colnames(tmp) <- "Reference"
      outlist <- list(tmp)
      names(outlist) <- dataset
      # Redefine outlist class to list
      class(outlist) <- "listof"
      return(outlist)
    }
  } else {
    tmp_env <- new.env()
    lazyLoad(file.path(data_path, "Rdata"), envir = tmp_env)
    dbs <-  mget(ls(tmp_env), tmp_env)
    outlist <- list()
    for (i in c(seq_len(length(dbs)))) {
      assign(paste0("tabl", i), rbind(purrr::map(dbs[[i]], function(x)
        paste0(utils::capture.output(
          print(attr(x, which = "source_bib"))), sep = "", collapse = ""
          ))))
      assign(paste0("tabl", i), t(get(paste0("tabl", i))))
      tmp <- get(paste0("tabl", i))
      colnames(tmp) <- "Reference"
      names(tmp) <- names(dbs[[i]])
      #Clear attr from object for a prettier print to console
      attr(tmp, "names") <- NULL
      assign(paste0("tabl", i), tmp)
      #Append to list output
      outlist[i] <- list(get(paste0("tabl", i)))
    }
    # Redefine outlist class to list
    class(outlist) <- "listof"
    return(outlist)
  }
}

#' @name report
#' @details `data_contrast()` displays information about databases
#' and datasets contained in them.
#' Namely the number of unique ID's, the percentage of
#' missing data, the number of observations, the number of variables,
#' the minimum beginning date and the maximum ending date as well as
#' the most direct URL to the original dataset.
#' @importFrom purrr map
#' @importFrom stringr str_to_title
#' @return A list with the desired metadata
#' to compare various datasets in a many package.
#' @examples
#' \donttest{
#' data_contrast(pkg = "manydata")
#' }
#' @export
data_contrast <- function(pkg, database = NULL, dataset = NULL) {
  pkg_path <- find.package(pkg)
  data_path <- file.path(pkg_path, "data")
  pkg_dbs <- unname(unlist(readRDS(file.path(data_path, "Rdata.rds"))))
  if (!is.null(database)) {
    if (is.null(dataset)) {
      # Database specified but not dataset
      tmp_env <- new.env()
      lazyLoad(file.path(data_path, "Rdata"), envir = tmp_env)
      dbs <-  mget(ls(tmp_env), tmp_env)
      dbs <- dbs[database]
      outlist <- list()
      for (i in c(seq_len(length(dbs)))) {
        assign(paste0("tabl", i),
               rbind(purrr::map(dbs[[i]], function(x) length(unique(x$ID))),
                     purrr::map(dbs[[i]], function(x)
                       paste0(round(sum(is.na(x)) * 100 / prod(dim(x)),
                               digits = 2), " %")),
                     purrr::map(dbs[[i]], function(x) nrow(x)),
                     purrr::map(dbs[[i]], function(x) ncol(x)),
                     purrr::map(dbs[[i]], function(x)
                       as.character(ifelse(!all(is.na(x$Beg)),
                                                   min(x$Beg, na.rm = TRUE),
                                                   NA))),
                     purrr::map(dbs[[i]], function(x)
                       as.character(ifelse(!all(is.na(x$End)),
                                                   max(x$End, na.rm = TRUE),
                                                   NA))),
                     purrr::map(dbs[[i]], function(x)
                       attr(x, which = "source_URL"))))
        assign(paste0("tabl", i), t(get(paste0("tabl", i))))
        tmp <- as.data.frame(get(paste0("tabl", i)))
        colnames(tmp) <- c("Unique ID", "Missing Data", "Rows",
                           "Columns", "Beg", "End", "URL")
        assign(paste0("tabl", i), tmp)
        # Append objects to outlist
        outlist[i] <- list(get(paste0("tabl", i)))
      }
      # Name elements in list
      names(outlist) <- database
      # Redefine outlist class to list
      class(outlist) <- "listof"
      return(outlist)
    } else {
      # Both dataset and database specified
      tmp_env <- new.env()
      lazyLoad(file.path(data_path, "Rdata"), envir = tmp_env)
      db <- get(database, envir = tmp_env)
      ds <- db[[dataset]]
      tabl <- data.frame(UniqueID = length(unique(ds$ID)),
                         Missing_Data = paste0(
                           round(sum(is.na(ds)) * 100 / prod(dim(ds)),
                                 digits = 2), " %"),
                         NObs = nrow(ds),
                         NVar = ncol(ds),
                         MinDate = as.character(
                           ifelse(!all(is.na(ds$Beg)),
                                  min(ds$Beg, na.rm = TRUE), NA)),
                         MaxDate = as.character(
                           ifelse(!all(is.na(ds$End)),
                                  max(ds$End, na.rm = TRUE), NA)),
                         URL = attr(ds, which = "source_URL"))
      tmp <- as.data.frame(tabl)
      colnames(tmp) <- c("Unique ID", "Missing Data", "Rows",
                         "Columns", "Beg", "End", "URL")
      outlist <- list(tmp)
      names(outlist) <- dataset
      # Redefine outlist class to list
      class(outlist) <- "listof"
      return(outlist)
    }
  } else {
    # Only package specified, returns package level info
    tmp_env <- new.env()
    lazyLoad(file.path(data_path, "Rdata"), envir = tmp_env)
    dbs <-  mget(ls(tmp_env), tmp_env)
    outlist <- list()
    for (i in c(seq_len(length(dbs)))) {
      assign(paste0("tabl", i),
             rbind(purrr::map(dbs[[i]], function(x) length(unique(x$ID))),
                   purrr::map(dbs[[i]], function(x)
                     paste0(round(sum(is.na(x)) * 100 / prod(dim(x)),
                                  digits = 2), " %")),
                   purrr::map(dbs[[i]], function(x) nrow(x)),
                   purrr::map(dbs[[i]], function(x) ncol(x)),
                   purrr::map(dbs[[i]], function(x)
                     as.character(ifelse(!all(is.na(x$Beg)),
                                                 min(x$Beg, na.rm = TRUE),
                                                 NA))),
                   purrr::map(dbs[[i]], function(x)
                     as.character(ifelse(!all(is.na(x$End)),
                                                 max(x$End, na.rm = TRUE),
                                                 NA))),
                   purrr::map(dbs[[i]], function(x)
                     attr(x, which = "source_URL"))))
      assign(paste0("tabl", i), t(get(paste0("tabl", i))))
      tmp <- as.data.frame(get(paste0("tabl", i)))
      colnames(tmp) <- c("Unique ID", "Missing Data", "Rows",
                         "Columns", "Beg", "End", "URL")
      assign(paste0("tabl", i), tmp)
      #Append to outlist
      outlist[i] <- list(get(paste0("tabl", i)))
    }
    # Name elements in list
    names(outlist) <- pkg_dbs
    # Redefine outlist class to list
    class(outlist) <- "listof"
    return(outlist)
  }
}

#' @name report
#' @param preparation_script Would you like to open the preparation script
#' for the dataset? By default false.
#' @importFrom utils browseURL read.csv
#' @importFrom dplyr rename
#' @importFrom janitor compare_df_cols
#' @details `data_evolution()` enables users to access the
#' differences between raw data and the data made available to them
#' in one of the 'many' packages.
#' @return Either the data comparison between raw and available data or
#' the preparation script detailing all the steps taken to prepare
#' raw data before making it available in one of the 'many' packages.
#' @examples
#' \donttest{
#' data_evolution(pkg = "manydata", database = "emperors",
#' dataset = "wikipedia")
#' #data_evolution(pkg = "manytrade", database = "agreements",
#' #dataset = "GPTAD")
#' }
#' @export
data_evolution <- function(pkg, database, dataset, preparation_script = FALSE) {
  if (length(grep(pkg, search())) == 0) {
    stop(paste0(pkg, " not found.
    Please install, if necessary, and load ", pkg, " before running 'data_evolution()'.
                You can use 'library(", pkg, ")' to load the package."))
  }
  db <- get(database)
  if (!inherits(db, "list")) {
    stop("Please declare a 'many' database")
  }
  url <- paste0("https://github.com/globalgov/", pkg, "/blob/main/data-raw/",
                database, "/", dataset)
  out <- NULL
  if (preparation_script == TRUE) {
    out <- utils::browseURL(paste0(url, "/", "prepare-", dataset, ".R"),
                            browser = getOption("browser"),
                            encodeIfNeeded = FALSE)
    message("Opened preparation script on GitHub.")
  } else {
    datacsv <- tryCatch({
      suppressWarnings(utils::read.csv(paste0("https://raw.githubusercontent.com/globalgov/",
                                              pkg, "/main/data-raw/", database, "/",
                                              dataset, "/", dataset, ".csv")))
    }, error = function(e) {
      NA_character_
    })
    if (length(datacsv) == 1) {
      message("Raw data could not be open or is not available for this dataset,
              opening preparation script instead.")
      out <- utils::browseURL(paste0(url, "/", "prepare-", dataset, ".R"),
                              browser = getOption("browser"),
                              encodeIfNeeded = FALSE)
    } else {
      out <- janitor::compare_df_cols(datacsv, db[[dataset]]) %>%
        dplyr::rename("Raw Data" = datacsv,
                      "Available Data" = "db[[dataset]]",
                      "Variables" = "column_name")
    }
  }
  out
}

#' @name report
#' @details `open_codebook()` opens the original codebook of the specified
#' dataset to allow users to look up the original coding rules.
#' Note that no original codebook might exist for certain datasets.
#' In the latter case, please refer to the
#' source URL provided with each dataset by running `manydata::data_contrast()`
#' as further information on coding rules available online.
#' @return Opens a pdf version of the original codebook of the specified
#' dataset, if available.
#' @export
open_codebook <- function(pkg, database, dataset) {
  # Check if input is null
  if (is.null(pkg) | is.null(database) | is.null(dataset)) {
    stop("Please specify a pkg, a database and a dataset for which you would
         like to open the original codebook.")
  }
  # Check if package exists
  repo <- paste0("https://api.github.com/users/globalgov/repos")
  repo <- httr::GET(repo, query = list(state = "all",
                                       per_page = 100, page = 1))
  repo <- suppressMessages(httr::content(repo, type = "text"))
  repo <- jsonlite::fromJSON(repo, flatten = TRUE)
  reponames <- repo[["name"]]
  if (!(pkg %in% reponames)) {
    stop("Please enter a valid package name.")
  }
  # Find the PDF on GitHub
  url <- paste0("https://github.com/globalgov/",
                 pkg,
                 "/raw/develop/data-raw/",
                 database,
                 "/",
                 dataset,
                 "/",
                 dataset,
                 "OriginalCodebook.pdf")
  # Open the PDF
  utils::browseURL(url, browser = getOption("browser"),
            encodeIfNeeded = FALSE)
}