R/compare.R

Defines functions compare_categories plot.compare_missing compare_missing plot.compare_overlap compare_overlap compare_ranges find_date compare_dimensions

Documented in compare_categories compare_dimensions compare_missing compare_overlap compare_ranges

#' Compare dimensions for 'many' data
#'
#' @details `compare_dimensions()` compares the number of observations,
#' variables, the earliest date, and the latest date in all observations
#' for datasets in a 'many' datacube.
#' @family compare_
#' @param datacube A datacube from one of the many packages.
#' @param dataset A dataset in a datacube from one of the many packages.
#' By default, "all".
#' That is, all datasets in the datacube are used.
#' To select two or more datasets, please declare them as a vector.
#' @import messydates
#' @importFrom purrr map
#' @importFrom dplyr as_tibble mutate
#' @examples
#' \donttest{
#' compare_dimensions(emperors)
#' }
#' @return
#' `compare_dimensions()` returns a tibble with information about each dataset
#' including the number of observations, the number of variables,
#' the earliest date, and the latest date in all observations.
#' @export
compare_dimensions <- function(datacube, dataset = "all") {
  Earliest_Date <- Latest_Date <- NULL
  if (any(dataset != "all")) {
    if (length(dataset) < 2) stop("Please declare 2 or more datasets for comparison.")
    datacube <- datacube[grepl(paste(dataset, collapse = "|"), names(datacube))]
  }
  names <- data.frame(Dataset = names(datacube))
  out <- do.call(rbind, lapply(datacube, function(x) {
    Observations <- nrow(x)
    Variables <- paste(names(x), collapse = ", ")
    Earliest_Date <- find_date(x, type = "earliest")
    Latest_Date <- find_date(x, type = "latest")
    cbind(Observations, Variables, Earliest_Date, Latest_Date)
  }))
  dplyr::as_tibble(cbind(names, out))
}

find_date <- function(x, type) {
  out <- dplyr::select_if(x, vapply(x, function(y) 
    class(y) == "mdate" | class(y) == "date",
    FUN.VALUE = logical(1)))
  if (type == "earliest") {
    out <- min(as.Date(as_messydate(unlist(out)), min), na.rm = TRUE)
  } else if (type == "latest") {
    out <- max(as.Date(as_messydate(unlist(out)), max), na.rm = TRUE)
  }
  messydates::as_messydate(out)
}

#' Compare ranges of variables in 'many' data
#'
#' @details `compare_ranges()` compares the number of observations, variables,
#' the earliest and latest date in each dataset in a 'many' datacube.
#' @family compare_
#' @param datacube A datacube from one of the many packages.
#' @param dataset A dataset in a datacube from one of the many packages.
#' By default, "all".
#' That is, all datasets in the datacube are used.
#' To select two or more datasets, please declare them as a vector.
#' @param variable Please declare a variable present in one or more
#' datasets in the 'many' datacube.
#' For multiple variables, please declare variable names as a vector.
#' @import messydates
#' @importFrom purrr map
#' @importFrom dplyr as_tibble
#' @importFrom stats median
#' @examples
#' \donttest{
#' compare_ranges(emperors, variable = c("Begin", "End"))
#' }
#' @return
#' `compare_ranges()` returns a tibble with information about the minimal,
#' maximal, average, and median values for selected variables in datacubes.
#' @export
compare_ranges <- function(datacube, dataset = "all", variable) {
  if (any(dataset != "all")) {
    if (length(dataset) < 2) stop("Please declare 2 or more datasets for comparison.")
    datacube <- datacube[grepl(paste(dataset, collapse = "|"), names(datacube))]
  }
  if (missing(variable)) stop("Please declare one or more variables.")
  datacube <- lapply(datacube, function(x) {
    x[grepl(paste(variable, collapse = "|"), names(x))]
  })
  names <- data.frame(
    Dataset = unlist(lapply(names(datacube), function(x) {
    rep(x, length(variable))
  })))
  out <- suppressWarnings(do.call(rbind, lapply(datacube, function(x) {
    Variable <- names(x)
    Min <- unlist(lapply(x, function(y) {
      ifelse(grepl("date", class(y), ignore.case = TRUE),
             as.character(min(as.Date(messydates::as_messydate(y), min), na.rm = TRUE)),
             as.character(min(y, na.rm = TRUE)))
    }))
    Max <- unlist(lapply(x, function(y) {
      ifelse(grepl("date", class(y), ignore.case = TRUE),
             as.character(max(as.Date(messydates::as_messydate(y), max), na.rm = TRUE)),
             as.character(max(y, na.rm = TRUE)))
    }))
    Mean <- unlist(lapply(x, function(y) {
      ifelse(grepl("date", class(y), ignore.case = TRUE),
             as.character(mean(as.Date(messydates::as_messydate(y), mean), na.rm = TRUE)),
             as.character(mean(y, na.rm = TRUE)))
    }))
    Median <- unlist(lapply(x, function(y) {
      ifelse(grepl("date", class(y), ignore.case = TRUE),
             as.character(stats::median(as.Date(messydates::as_messydate(y), median), na.rm = TRUE)),
             as.character(stats::median(y, na.rm = TRUE)))
    }))
    data.frame(cbind(Variable, Min, Max, Mean, Median))
  })))
  dplyr::as_tibble(cbind(names, out))
}

#' Compare the overlap between datasets in 'many' datacubes
#' 
#' @details `compare_overlap()` compares the overlap between "key" observations
#' in each dataset in a 'many' datacube.
#' @family compare_
#' @param datacube A datacube from one of the many packages.
#' @param dataset A dataset in a datacube from one of the many packages.
#' By default "all".
#' That is, all datasets in the datacube are used.
#' To select two or more datasets, please declare them as a vector.
#' @param key A variable key to join datasets.
#' 'manyID' by default.
#' @importFrom dplyr select rename_with as_tibble
#' @importFrom purrr map
#' @examples
#' \donttest{
#' compare_overlap(emperors, key = "ID")
#' plot(compare_overlap(emperors, key = "ID"))
#' }
#' @return
#' `compare_overlap()` returns a tibble with information about each dataset
#' and the number of overlapping observations.
#' @export
compare_overlap <- function(datacube, dataset = "all", key = "manyID") {
  name <- db_name <- NULL
  thisRequires("ggVennDiagram")
  if (any(dataset != "all")) {
    if (length(dataset) < 2) stop("Please declare 2 or more datasets for comparison.")
    datacube <- datacube[grepl(paste(dataset, collapse = "|"), names(datacube))]
  }
  db_name <- deparse(substitute(datacube))
  out <- purrr::map(datacube, key)
  out <- ggVennDiagram::Venn(out)
  out <- ggVennDiagram::process_data(out)
  out <- ggVennDiagram::venn_region(out)
  out <- dplyr::as_tibble(out) %>%
    dplyr::select(name, count) %>%
    dplyr::rename_with(.fn = ~paste0("Datasets from ", db_name),
                       .cols = name) %>%
    dplyr::rename_with(.fn = ~paste0("Overlapping Observations by ", key),
                       .cols = count)
  class(out) <- c("compare_overlap", "tbl_df", "tbl", "data.frame")
  out
}

#' @importFrom stringr word
#' @importFrom purrr map
#' @export
plot.compare_overlap <- function(x, ...) {
  thisRequires("ggVennDiagram")
  datacube <- get(stringr::word(names(x[1]),-1))
  dataset <- grep("\\.\\.", unique(unname(unlist(x[,1]))),
                  value = TRUE, invert = TRUE)
  key <- stringr::word(names(x[2]),-1)
  if (length(names(datacube)) != length(dataset)) {
    datacube <- datacube[grepl(paste(dataset, collapse = "|"), names(datacube))]
  }
  out <- purrr::map(datacube, key)
  ggVennDiagram::ggVennDiagram(out)
}

#' Compare missing observations for 'many' data
#' 
#' @details `compare_missing()` compares the missing observations for variables
#' in each dataset in a 'many' datacube.
#' @family compare_
#' @importFrom purrr map
#' @importFrom dplyr arrange mutate as_tibble
#' @param datacube A datacube from one of the many packages.
#' @param dataset A dataset in a datacube from one of the many packages.
#' NULL by default.
#' That is, all datasets in the datacube are used.
#' To select two or more datasets, please declare them as a vector.
#' @param variable Would you like to focus on one, or more, specific variables
#' present in one or more datasets in the 'many' datacube?
#' By default "all".
#' For multiple variables, please declare variable names as a vector.
#' @examples
#' \donttest{
#' compare_missing(emperors)
#' plot(compare_missing(emperors))
#' }
#' @return
#' `compare_missing()` returns a tibble with information about each dataset
#' including the number of observations, the number of variables,
#' the earliest date, and the latest date in all observations.
#' @export
compare_missing <- function(datacube, dataset = "all", variable = "all") {
  # Avoid notes
  Dataset <- Count <- Variable <- Missing <- 'Percent Missing' <- NULL
  # Select datasets if declared
  if (any(dataset != "all")) {
    if (length(dataset) < 2) stop("Please declare 2 or more datasets for comparison.")
    datacube <- datacube[grepl(paste(dataset, collapse = "|"), names(datacube))]
  }
  all_variables <- unique(unname(unlist(purrr::map(datacube, names))))
  # Select variables if declared
  if (variable[1] == "all") {
    # remove other ID variables and text variables
    ID_var <- grep("^ID$|ID$", all_variables, value = TRUE)
    all_variables <- all_variables[!all_variables %in% ID_var]
    # remove text variables
    all_variables <- grep("text", all_variables,
                          ignore.case = TRUE, value = TRUE, invert = TRUE)
  } else {
    all_variables <- all_variables[all_variables %in% variable]
  }
  # Report on datasets in datacube
  out <- purrr::map(datacube, extract_if_present, all_variables)
  for (n in names(out)) out[[n]]['Dataset'] = n
  out <- do.call(rbind, lapply(out, function(x) {
    varnames <- names(x)
    dataset <- x[['Dataset']][1]
    datatype <- unlist(lapply(x, class))
    counts   <- unlist(lapply(x, length))
    mvalues    <- unlist(lapply(x, function(z) sum(is.na(z))))
    mvaluesper <- round((mvalues / counts) * 100, 2)
    cbind(Variable = varnames, Dataset = dataset, Class = datatype,
          Count = counts, Missing = mvalues, 'Percent Missing' = mvaluesper)
  }))
  out <- dplyr::as_tibble(out) %>%
    dplyr::mutate(Count = as.numeric(Count),
                  Missing = as.numeric(Missing),
                  'Percent Missing' = as.numeric(`Percent Missing`)) %>%
    dplyr::arrange(Variable)
  class(out) <- c("compare_missing", "tbl_df", "tbl", "data.frame")
  out
}

#' @import ggplot2
#' @export
plot.compare_missing <- function(x, ...) {
  'Percent Missing' <- Variable <- Dataset <- NULL
  # Plot missing proportions for variables
  ggplot2::ggplot(x, aes(reorder(Dataset, `Percent Missing`, decreasing = TRUE),
                         reorder(Variable, `Percent Missing`))) +
    geom_tile(aes(fill = `Percent Missing`)) +
    scale_fill_gradient(low = "darkgreen", high = "red3", na.value = NA,
                        name = "Proportion\nof missing\nobservations") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90)) +
    labs(x = "datacube", y = "Variable")
}

#' Compare categories in 'many' datacubes
#' 
#' @details Confirmed values are the same in all datasets in datacube.
#' Unique values appear once in datasets in datacube.
#' Missing values are missing in all datasets in datacube.
#' Conflict values are different in the same number of datasets in datacube.
#' Majority values have the same value in multiple, but not all,
#' datasets in datacube.
#' @family compare_
#' @param datacube A datacube from one of the many packages.
#' @param dataset A dataset in a datacube from one of the many packages.
#' By default "all".
#' That is, all datasets in the datacube are used.
#' To select two or more datasets, please declare them as a vector.
#' @param key A variable key to join datasets.
#' 'manyID' by default.
#' @param variable Would you like to focus on one, or more, specific variables
#' present in one or more datasets in the 'many' datacube?
#' By default "all".
#' For multiple variables, please declare variable names as a vector.
#' @param category Would you like to focus on one specific code category?
#' By default "all" are returned.
#' Other options include "confirmed", "unique", "missing", "conflict",
#' or "majority".
#' For multiple variables, please declare categories as a vector.
#' @importFrom dplyr full_join filter_all %>% all_of group_by distinct any_vars
#' starts_with mutate tibble
#' @importFrom purrr reduce map
#' @importFrom tidyr drop_na
#' @importFrom stringr str_count str_remove_all str_split str_extract_all
#' str_replace_all
#' @examples
#' \donttest{
#' compare_categories(emperors, key = "ID")
#' compare_categories(datacube = emperors, dataset = c("wikipedia", "UNRV"),
#' key = "ID", variable = c("Beg", "End"), category = c("conflict", "unique"))
#' plot(compare_categories(emperors, key = "ID"))
#' plot(compare_categories(datacube = emperors, dataset = c("wikipedia", "UNRV"),
#' key = "ID", variable = c("Beg", "End"), category = c("conflict", "unique")))
#' }
#' @export
compare_categories <- function(datacube,
                               dataset = "all",
                               key = "manyID",
                               variable = "all",
                               category = "all") {
  # Step 1: get variables of interest
  if (length(grepl(key[1], purrr::map(datacube, names))) != length(datacube)) {
    stop("Please declare a key variable present in all datasets in the datacube.")
  }
  # Step 2: check if multiple keys for memberships' datacubes
  if (grepl("membership", deparse(substitute(datacube)), ignore.case = TRUE) &
      length(key) == 1) {
    stop("For memberships datacube please indicate two keys
         (e.g. key = c('manyID', 'CountryID'))")
  }
  # Step 3: get datasets if declared
  if (any(dataset != "all")) {
    datacube <- datacube[grepl(paste(dataset, collapse = "|"), names(datacube))]
  }
  # Step 4: inform users about duplicates
  if (length(key) == 1) {
    db_size <- sum(duplicated(unname(unlist(purrr::map(datacube, key)))))
    cat("There were", db_size,
        "matched observations by", key,
        "variable across datasets in datacube.")
  }
  # Step 5: get variable(s) of interest if declared
  all_variables <- unique(unname(unlist(purrr::map(datacube, names))))
  if (variable[1] == "all") {
    all_variables <- all_variables[!all_variables %in% key]
    # remove other ID variables and text variables
    ID_var <- grep("^ID$|ID$", all_variables, value = TRUE)
    all_variables <- all_variables[!all_variables %in% ID_var]
    # remove text variables
    all_variables <- grep("text", all_variables,
                          ignore.case = TRUE, value = TRUE, invert = TRUE)
  } else {
    all_variables <- all_variables[all_variables %in% variable]
  }
  out <- purrr::map(datacube, extract_if_present, c(key, all_variables))
  # Step 6: reduce and join data
  out <- purrr::map(out, tidyr::drop_na, dplyr::all_of(key)) %>%
    purrr::reduce(dplyr::full_join, by = key)
  # create an empty data frame in case there is multiple variables
  db <- data.frame(out[, c(key)], stringsAsFactors = TRUE)
  # Step 7: code variables
  for (var in all_variables) {
    vvars <- paste0("^", var, "$|^", var, "\\.")
    vars_to_combine <- grep(vvars, names(out), value = TRUE)
    vlb <- out[vars_to_combine]
    col <- purrr::map(datacube, var)
    col <- names(col[lengths(col) != 0])
    colnames(vlb) <- paste0(col, "$", var)
    value <- NULL
    if (length(vlb) > 1) {
      # remove string duplicates and collapse unique values (except NAs)
      value <- apply(vlb, 1, function(x) {
        paste(unique(trimws(x), incomparables = c("^NA$", NA_character_)),
              collapse = "!")
      })
      # missing
      value <- ifelse(stringr::str_count(value, "NA") ==
                        length(out[vars_to_combine]), "missing", value)
      # unique
      value <- ifelse(stringr::str_count(value, "NA") ==
                        (length(out[vars_to_combine]) - 1), "unique", value)
      # confirmed (by all datasets in datacube)
      value <- ifelse(stringr::str_count(value, "\\!") == 0 &
                        !grepl("^missing$|^unique$", value),
                      "confirmed", value)
      # confirmed (by multiple datasets in datacube, other values are NAs)
      valuec <- lapply(stringr::str_split(value, "!"), function(x) {
        x[!grepl("^NA$", x) & x != ""]
      })
      value <- ifelse(lengths(valuec) == 1 &
                        !grepl("^missing$|^unique$|^confirmed$", valuec),
                      "confirmed", value)
      # conflict (when there are no duplicates, apart from NAs possibly)
      value <- ifelse(!grepl("^missing$|^unique$|^confirmed$", value) &
                        stringr::str_count(value, "\\!") ==
                        (length(out[vars_to_combine]) - 1),
                      "conflict", value)
      # open (and close) the values to find if majority or conflict
      vl <- apply(vlb, 1, function(x) {
        x <- x[!grepl("^NA$", x) & x != ""]
        x <- unique(rle(x)$lengths)
        x
      })
      # conflict (an even number of non-NA conflicting obs)
      value <- ifelse(!grepl("^missing$|^unique$|^confirmed$|^conflict$",
                             value) & lengths(vl) == 1, "conflict", value)
      # majority (an unbalanced number of non-NA matching obs)
      value <- ifelse(!grepl("^missing$|^unique$|^confirmed$|^conflict$",
                             value), "majority", value)
    } else {
      value <- unname(unlist(out[vars_to_combine]))
      value <- ifelse(is.na(value), "missing", "unique")
    }
    # fill data frame with variable and presence in datasets
    db <- cbind(db, vlb)
    if (length(value) == nrow(db)) {
      db[, paste0(var, " (", length(vlb), ")")] <- value
    }
  }
  db <- dplyr::tibble(db[unique(colnames(db))]) %>%
    select(-dplyr::starts_with("dplyr"))
  # Step 8: filter categories if necessary
  . <- NULL
  if (any(category != "all")) {
    db <- dplyr::filter_all(db,
                            dplyr::any_vars(grepl(paste(category,
                                                        collapse = "|"), .)))
  }
  class(db) <- c("compare_categories", "tbl_df", "tbl", "data.frame")
  db
}

#' @importFrom purrr map
#' @importFrom dplyr summarise group_by mutate select %>% filter
#' @importFrom tidyr pivot_longer pivot_wider replace_na fill
#' @importFrom stats reorder
#' @import ggplot2
#' @export
plot.compare_categories <- function(x, ...) {
  Category <- Variable <- Percentage <- Missing <- NULL # to avoid notes
  # Step 1: remove extra variable level information
  db <- x[!grepl("\\$", names(x))]
  
  # Step 3: gather and reshape the data
  dbgather <- db[, -1] %>%
    tidyr::pivot_longer(cols = everything(), names_to = "Variable",
                        values_to = "Category") %>%
    dplyr::group_by(Variable, Category) %>%
    dplyr::summarise(count = n(), .groups = ) %>%
    dplyr::mutate(Percentage = count / sum(count)) %>%
    tidyr::pivot_wider(id_cols = Variable, names_from = Category,
                       values_from = Percentage) %>%
    dplyr::mutate(across(everything(), ~tidyr::replace_na(.x, 0))) %>%
    tidyr::pivot_longer(-Variable, names_to = "Category",
                        values_to = "Percentage") %>%
    dplyr::mutate(Category = factor(Category, levels = c("missing",
                                                         "conflict",
                                                         "unique",
                                                         "majority",
                                                         "confirmed")),
                  Missing = ifelse(Category == "missing",
                                   Percentage, NA_character_)) %>%
    tidyr::fill(Missing, .direction = "downup") %>%
    dplyr::filter(Percentage != 0)
  # Step 4: set colors and plot
  cols <- c(confirmed = "deepskyblue3", majority = "aquamarine3",
            unique = "khaki", conflict = "firebrick", missing = "grey90")
  # Step 5: plot
  ggplot2::ggplot(dbgather, aes(fill = Category, y = Percentage,
                                x = stats::reorder(Variable, as.numeric(Missing)))) +
    geom_bar(position = "fill", stat = "identity") +
    scale_x_discrete(guide = guide_axis(angle = 90)) +
    scale_y_reverse(labels = function(x) {
      ifelse(x == 1 | x == 0.5,
             paste0(x * 100, "%", "\n(", x * nrow(db), " obs)"),
             paste0(x * 100, "%"))
    }) +
    scale_fill_manual(values = cols) +
    geom_text(aes(label = paste0(round(Percentage*100, digits = 1),"%")), 
              position = position_stack(vjust = 0.5),
              size = 2, color = "white", angle = 90) +
    theme_minimal() +
    labs(title = deparse(substitute(datacube)),
         subtitle = paste0("Based on ", nrow(db),
                           " consolidated observations."),
         caption = "In between the parenthesis are the number of datasets in which variable is present.",
         x = "Variable")
}
globalgov/manydata documentation built on May 10, 2024, 9:48 p.m.