#' @name report
#' @title Set of data structure exploration functions for users
#' @description The report family of functions allows users
#' to quickly get information about and compare several
#' aspects of a package in the many packages universe,
#' and its' databases and datasets.
#' @param pkg character string of the many package to report data on.
#' Required input.
#' @param database vector of character strings of the many package to
#' report data on a specific database in a many package
#' If NULL, the function returns a summary of all databases in the many package
#' NULL by default for `data_source()` and `data_contrast()`.
#' @param dataset character string of the many package to
#' report data on a specific
#' dataset in a specific database of a many package
#' If NULL and database is specified, returns database level metadata.
#' NULL by default for `data_source()` and `data_contrast()`.
NULL
#' @name report
#' @details `data_source()` displays names of the database/datasets and
#' source material of data in a many package.
#' @importFrom purrr map
#' @importFrom stringr str_to_title
#' @return A dataframe with the data sources
#' @examples
#' \donttest{
#' data_source(pkg = "manydata")
#' }
#' @export
data_source <- function(pkg, database = NULL, dataset = NULL) {
pkg_path <- find.package(pkg)
data_path <- file.path(pkg_path, "data")
#selcts all dbs
if (!is.null(database)) {
# Database specified, dataset unspecified
if (is.null(dataset)) {
tmp_env <- new.env()
lazyLoad(file.path(data_path, "Rdata"), envir = tmp_env)
dbs <- mget(ls(tmp_env), tmp_env)
dbs <- dbs[database]
outlist <- list()
for (i in c(seq_len(length(dbs)))) {
assign(paste0("tabl", i), rbind(purrr::map(dbs[[i]], function(x)
paste0(utils::capture.output(
print(attr(x, which = "source_bib"))), sep = "", collapse = "")))
)
assign(paste0("tabl", i), t(get(paste0("tabl", i))))
tmp <- get(paste0("tabl", i))
colnames(tmp) <- "Reference"
assign(paste0("tabl", i), tmp)
#List output
outlist[i] <- list(get(paste0("tabl", i)))
}
names(outlist) <- names(dbs)
# Redefine outlist class to list
class(outlist) <- "listof"
return(outlist)
} else {
# Database and dataset specified
tmp_env <- new.env()
lazyLoad(file.path(data_path, "Rdata"), envir = tmp_env)
db <- get(database, envir = tmp_env)
ds <- db[[dataset]]
tabl <- data.frame(Reference = paste0(utils::capture.output(
print(attr(ds, which = "source_bib"))), sep = "", collapse = "")
)
tmp <- as.data.frame(tabl)
colnames(tmp) <- "Reference"
outlist <- list(tmp)
names(outlist) <- dataset
# Redefine outlist class to list
class(outlist) <- "listof"
return(outlist)
}
} else {
tmp_env <- new.env()
lazyLoad(file.path(data_path, "Rdata"), envir = tmp_env)
dbs <- mget(ls(tmp_env), tmp_env)
outlist <- list()
for (i in c(seq_len(length(dbs)))) {
assign(paste0("tabl", i), rbind(purrr::map(dbs[[i]], function(x)
paste0(utils::capture.output(
print(attr(x, which = "source_bib"))), sep = "", collapse = ""
))))
assign(paste0("tabl", i), t(get(paste0("tabl", i))))
tmp <- get(paste0("tabl", i))
colnames(tmp) <- "Reference"
names(tmp) <- names(dbs[[i]])
#Clear attr from object for a prettier print to console
attr(tmp, "names") <- NULL
assign(paste0("tabl", i), tmp)
#Append to list output
outlist[i] <- list(get(paste0("tabl", i)))
}
# Redefine outlist class to list
class(outlist) <- "listof"
return(outlist)
}
}
#' @name report
#' @details `data_contrast()` displays information about databases
#' and datasets contained in them.
#' Namely the number of unique ID's, the percentage of
#' missing data, the number of observations, the number of variables,
#' the minimum beginning date and the maximum ending date as well as
#' the most direct URL to the original dataset.
#' @importFrom purrr map
#' @importFrom stringr str_to_title
#' @return A list with the desired metadata
#' to compare various datasets in a many package.
#' @examples
#' \donttest{
#' data_contrast(pkg = "manydata")
#' }
#' @export
data_contrast <- function(pkg, database = NULL, dataset = NULL) {
pkg_path <- find.package(pkg)
data_path <- file.path(pkg_path, "data")
pkg_dbs <- unname(unlist(readRDS(file.path(data_path, "Rdata.rds"))))
if (!is.null(database)) {
if (is.null(dataset)) {
# Database specified but not dataset
tmp_env <- new.env()
lazyLoad(file.path(data_path, "Rdata"), envir = tmp_env)
dbs <- mget(ls(tmp_env), tmp_env)
dbs <- dbs[database]
outlist <- list()
for (i in c(seq_len(length(dbs)))) {
assign(paste0("tabl", i),
rbind(purrr::map(dbs[[i]], function(x) length(unique(x$ID))),
purrr::map(dbs[[i]], function(x)
paste0(round(sum(is.na(x)) * 100 / prod(dim(x)),
digits = 2), " %")),
purrr::map(dbs[[i]], function(x) nrow(x)),
purrr::map(dbs[[i]], function(x) ncol(x)),
purrr::map(dbs[[i]], function(x)
as.character(ifelse(!all(is.na(x$Beg)),
min(x$Beg, na.rm = TRUE),
NA))),
purrr::map(dbs[[i]], function(x)
as.character(ifelse(!all(is.na(x$End)),
max(x$End, na.rm = TRUE),
NA))),
purrr::map(dbs[[i]], function(x)
attr(x, which = "source_URL"))))
assign(paste0("tabl", i), t(get(paste0("tabl", i))))
tmp <- as.data.frame(get(paste0("tabl", i)))
colnames(tmp) <- c("Unique ID", "Missing Data", "Rows",
"Columns", "Beg", "End", "URL")
assign(paste0("tabl", i), tmp)
# Append objects to outlist
outlist[i] <- list(get(paste0("tabl", i)))
}
# Name elements in list
names(outlist) <- database
# Redefine outlist class to list
class(outlist) <- "listof"
return(outlist)
} else {
# Both dataset and database specified
tmp_env <- new.env()
lazyLoad(file.path(data_path, "Rdata"), envir = tmp_env)
db <- get(database, envir = tmp_env)
ds <- db[[dataset]]
tabl <- data.frame(UniqueID = length(unique(ds$ID)),
Missing_Data = paste0(
round(sum(is.na(ds)) * 100 / prod(dim(ds)),
digits = 2), " %"),
NObs = nrow(ds),
NVar = ncol(ds),
MinDate = as.character(
ifelse(!all(is.na(ds$Beg)),
min(ds$Beg, na.rm = TRUE), NA)),
MaxDate = as.character(
ifelse(!all(is.na(ds$End)),
max(ds$End, na.rm = TRUE), NA)),
URL = attr(ds, which = "source_URL"))
tmp <- as.data.frame(tabl)
colnames(tmp) <- c("Unique ID", "Missing Data", "Rows",
"Columns", "Beg", "End", "URL")
outlist <- list(tmp)
names(outlist) <- dataset
# Redefine outlist class to list
class(outlist) <- "listof"
return(outlist)
}
} else {
# Only package specified, returns package level info
tmp_env <- new.env()
lazyLoad(file.path(data_path, "Rdata"), envir = tmp_env)
dbs <- mget(ls(tmp_env), tmp_env)
outlist <- list()
for (i in c(seq_len(length(dbs)))) {
assign(paste0("tabl", i),
rbind(purrr::map(dbs[[i]], function(x) length(unique(x$ID))),
purrr::map(dbs[[i]], function(x)
paste0(round(sum(is.na(x)) * 100 / prod(dim(x)),
digits = 2), " %")),
purrr::map(dbs[[i]], function(x) nrow(x)),
purrr::map(dbs[[i]], function(x) ncol(x)),
purrr::map(dbs[[i]], function(x)
as.character(ifelse(!all(is.na(x$Beg)),
min(x$Beg, na.rm = TRUE),
NA))),
purrr::map(dbs[[i]], function(x)
as.character(ifelse(!all(is.na(x$End)),
max(x$End, na.rm = TRUE),
NA))),
purrr::map(dbs[[i]], function(x)
attr(x, which = "source_URL"))))
assign(paste0("tabl", i), t(get(paste0("tabl", i))))
tmp <- as.data.frame(get(paste0("tabl", i)))
colnames(tmp) <- c("Unique ID", "Missing Data", "Rows",
"Columns", "Beg", "End", "URL")
assign(paste0("tabl", i), tmp)
#Append to outlist
outlist[i] <- list(get(paste0("tabl", i)))
}
# Name elements in list
names(outlist) <- pkg_dbs
# Redefine outlist class to list
class(outlist) <- "listof"
return(outlist)
}
}
#' @name report
#' @param preparation_script Would you like to open the preparation script
#' for the dataset? By default false.
#' @importFrom utils browseURL read.csv
#' @importFrom dplyr rename
#' @importFrom janitor compare_df_cols
#' @details `data_evolution()` enables users to access the
#' differences between raw data and the data made available to them
#' in one of the 'many' packages.
#' @return Either the data comparison between raw and available data or
#' the preparation script detailing all the steps taken to prepare
#' raw data before making it available in one of the 'many' packages.
#' @examples
#' \donttest{
#' data_evolution(pkg = "manydata", database = "emperors",
#' dataset = "wikipedia")
#' #data_evolution(pkg = "manytrade", database = "agreements",
#' #dataset = "GPTAD")
#' }
#' @export
data_evolution <- function(pkg, database, dataset, preparation_script = FALSE) {
if (length(grep(pkg, search())) == 0) {
stop(paste0(pkg, " not found.
Please install, if necessary, and load ", pkg, " before running 'data_evolution()'.
You can use 'library(", pkg, ")' to load the package."))
}
db <- get(database)
if (!inherits(db, "list")) {
stop("Please declare a 'many' database")
}
url <- paste0("https://github.com/globalgov/", pkg, "/blob/main/data-raw/",
database, "/", dataset)
out <- NULL
if (preparation_script == TRUE) {
out <- utils::browseURL(paste0(url, "/", "prepare-", dataset, ".R"),
browser = getOption("browser"),
encodeIfNeeded = FALSE)
message("Opened preparation script on GitHub.")
} else {
datacsv <- tryCatch({
suppressWarnings(utils::read.csv(paste0("https://raw.githubusercontent.com/globalgov/",
pkg, "/main/data-raw/", database, "/",
dataset, "/", dataset, ".csv")))
}, error = function(e) {
NA_character_
})
if (length(datacsv) == 1) {
message("Raw data could not be open or is not available for this dataset,
opening preparation script instead.")
out <- utils::browseURL(paste0(url, "/", "prepare-", dataset, ".R"),
browser = getOption("browser"),
encodeIfNeeded = FALSE)
} else {
out <- janitor::compare_df_cols(datacsv, db[[dataset]]) %>%
dplyr::rename("Raw Data" = datacsv,
"Available Data" = "db[[dataset]]",
"Variables" = "column_name")
}
}
out
}
#' @name report
#' @details `open_codebook()` opens the original codebook of the specified
#' dataset to allow users to look up the original coding rules.
#' Note that no original codebook might exist for certain datasets.
#' In the latter case, please refer to the
#' source URL provided with each dataset by running `manydata::data_contrast()`
#' as further information on coding rules available online.
#' @return Opens a pdf version of the original codebook of the specified
#' dataset, if available.
#' @export
open_codebook <- function(pkg, database, dataset) {
# Check if input is null
if (is.null(pkg) | is.null(database) | is.null(dataset)) {
stop("Please specify a pkg, a database and a dataset for which you would
like to open the original codebook.")
}
# Check if package exists
repo <- paste0("https://api.github.com/users/globalgov/repos")
repo <- httr::GET(repo, query = list(state = "all",
per_page = 100, page = 1))
repo <- suppressMessages(httr::content(repo, type = "text"))
repo <- jsonlite::fromJSON(repo, flatten = TRUE)
reponames <- repo[["name"]]
if (!(pkg %in% reponames)) {
stop("Please enter a valid package name.")
}
# Find the PDF on GitHub
url <- paste0("https://github.com/globalgov/",
pkg,
"/raw/develop/data-raw/",
database,
"/",
dataset,
"/",
dataset,
"OriginalCodebook.pdf")
# Open the PDF
utils::browseURL(url, browser = getOption("browser"),
encodeIfNeeded = FALSE)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.