Nothing
#' Process raw data
#'
#' \code{processRaw} finds the actual and expected counts using the methodology
#' described by DuMouchel (1999); however, an adjustment is applied to expected
#' counts to prevent double-counting (i.e., using unique marginal ID counts
#' instead of contingency table marginal totals). Also calculates the relative
#' reporting ratio (\emph{RR}) and the proportional reporting ratio
#' (\emph{PRR}).
#'
#' @param data A data frame containing columns named: \emph{id}, \emph{var1},
#' and \emph{var2}. Possibly includes columns for stratification variables
#' identified by the substring \emph{'strat'} (e.g. \emph{strat_age}). Other
#' columns will be ignored.
#' @param stratify A logical scalar (TRUE or FALSE) specifying if stratification
#' is to be used.
#' @param zeroes A logical scalar specifying if zero counts should be included.
#' Using zero counts is only recommended for small data sets because it will
#' dramatically increase run time.
#' @param digits A whole number scalar specifying how many decimal places to
#' use for rounding \emph{RR} and \emph{PRR}.
#' @param max_cats A whole number scalar specifying the maximum number of
#' categories to allow in any given stratification variable. Used to help
#' prevent a situation where the user forgets to categorize a continuous
#' variable, such as age.
#' @param list_ids A logical scalar specifying if a column for pipe-concatenated
#' IDs should be returned.
#' @return A data frame with actual counts (\emph{N}), expected counts
#' (\emph{E}), relative reporting ratio (\emph{RR}), and proportional
#' reporting ratio (\emph{PRR}) for \emph{var1-var2} pairs. Also includes a
#' column for IDs (\emph{ids}) if \code{list_ids = TRUE}.
#'
#' @details An \emph{id} column must be included in \code{data}. If your data
#' set does not include IDs, make a column of unique IDs using \code{df$id <-
#' 1:nrow(df)}. However, unique IDs should only be constructed if the cells in
#' the contingency table are mutually exclusive. For instance, unique IDs for
#' each row in \code{data} are not appropriate with CAERS data since a given
#' report can include multiple products and/or adverse events.
#' @details Stratification variables are identified by searching for the
#' substring \emph{'strat'}. Only variables containing \emph{'strat'} (case
#' sensitive) will be used as stratification variables. \emph{PRR}
#' calculations ignore stratification, but \emph{E} and \emph{RR} calculations
#' are affected. A warning will be displayed if any stratum contains less than
#' 50 unique IDs.
#' @details If a \emph{PRR} calculation results in division by zero, \code{Inf}
#' is returned.
#' @section Warnings:
#' Use of the \code{zeroes = TRUE} option will result in a considerable
#' increase in runtime. Using zero counts is not recommended if the contingency
#' table is moderate or large in size (~500K cells or larger). However, using
#' zeroes could be useful if the optimization algorithm fails to converge
#' when estimating hyperparameters.
#' @section Warnings:
#' Any columns in \code{data} containing the substring \emph{'strat'} in the
#' column name will be considered stratification variables, so verify that you
#' do not have any extraneous columns with that substring.
#'
#' @examples
#' data.table::setDTthreads(2) #only needed for CRAN checks
#' var1 <- c("product_A", rep("product_B", 3), "product_C",
#' rep("product_A", 2), rep("product_B", 2), "product_C")
#' var2 <- c("event_1", rep("event_2", 2), rep("event_3", 2),
#' "event_2", rep("event_3", 3), "event_1")
#' strat1 <- c(rep("Male", 5), rep("Female", 3), rep("Male", 2))
#' strat2 <- c(rep("age_cat1", 5), rep("age_cat1", 3), rep("age_cat2", 2))
#' dat <- data.frame(
#' var1 = var1, var2 = var2, strat1 = strat1, strat2 = strat2,
#' stringsAsFactors = FALSE
#' )
#' dat$id <- 1:nrow(dat)
#' processRaw(dat)
#' suppressWarnings(
#' processRaw(dat, stratify = TRUE)
#' )
#' processRaw(dat, zeroes = TRUE)
#' suppressWarnings(
#' processRaw(dat, stratify = TRUE, zeroes = TRUE)
#' )
#' processRaw(dat, list_ids = TRUE)
#'
#' @references DuMouchel W (1999). "Bayesian Data Mining in Large Frequency
#' Tables, With an Application to the FDA Spontaneous Reporting System."
#' \emph{The American Statistician}, 53(3), 177-190.
#' @keywords openEBGM
#' @import data.table
#' @export
processRaw <- function(data, stratify = FALSE, zeroes = FALSE, digits = 2,
max_cats = 10, list_ids = FALSE) {
.checkInputs_processRaw(data, stratify, zeroes, list_ids)
data <- data.table::as.data.table(data)
# Actual var1/var2 combination counts
if (list_ids) {
actual <- data[, j = list(ids = paste(id, collapse = "|"),
N = .countUnique(id)), by = .(var1, var2)]
} else {
actual <- data[, j = list(N = .countUnique(id)), by = .(var1, var2)]
}
if (zeroes) {
data.table::setkeyv(actual, c("var1", "var2"))
actual <- actual[data.table::CJ(unique(var1), unique(var2))]
actual[is.na(N), N := 0L]
}
# Unstratified marginal counts
v1_marg <- data[, j = list(N_v1 = .countUnique(id)), by = .(var1)]
v2_marg <- data[, j = list(N_v2 = .countUnique(id)), by = .(var2)]
# Expected counts
if (!stratify) {
counts <- merge(actual, v1_marg, by = "var1", all.x = zeroes, sort = FALSE)
counts <- merge(counts, v2_marg, by = "var2", all.x = zeroes, sort = FALSE)
counts[, N_tot := .countUnique(data$id)]
counts[, E := (N_v1 / N_tot) * N_v2]
} else {
data <- .checkStrata_processRaw(data, max_cats) #adds 'stratum' column
v1_marg_str <- data[, j = list(N_v1_str = .countUnique(id)),
by = .(var1, stratum)]
v2_marg_str <- data[, j = list(N_v2_str = .countUnique(id)),
by = .(var2, stratum)]
strat_tot <- data[, j = list(N_tot_str = .countUnique(id)), by = .(stratum)]
counts <- merge(actual, v1_marg_str, by = "var1",
all.x = zeroes, sort = FALSE, allow.cartesian = TRUE)
counts[is.na(N_v1_str), N_v1_str := 0L]
counts <- merge(counts, v2_marg_str, by = c("var2", "stratum"),
all.x = zeroes, sort = FALSE)
counts[is.na(N_v2_str), N_v2_str := 0L]
counts <- merge(counts, strat_tot, by = "stratum",
all.x = zeroes, sort = FALSE)
counts[, E := (N_v1_str / N_tot_str) * N_v2_str]
counts <- counts[, j = list(E = sum(E, na.rm = TRUE)), by = .(var1, var2)]
counts <- merge(actual, counts, by = c("var1", "var2"),
all.x = zeroes, sort = FALSE)
counts <- merge(counts, v1_marg, by = "var1", all.x = zeroes, sort = FALSE)
counts <- merge(counts, v2_marg, by = "var2", all.x = zeroes, sort = FALSE)
counts[, N_tot := .countUnique(data$id)]
}
# Add disproportionality measures
counts[, RR := round(N / E, digits)]
counts[is.nan(RR), RR := 0]
counts[, PRR_num := N / N_v1]
counts[, PRR_den := (N_v2 - N) / (N_tot - N_v1)]
counts[, PRR := round(PRR_num / PRR_den, digits)]
if (list_ids) {
counts <- counts[, .(var1, var2, N, E, RR, PRR, ids)]
} else {
counts <- counts[, .(var1, var2, N, E, RR, PRR)]
}
counts[is.nan(PRR), PRR := Inf]
counts[, var1 := factor(var1, levels = sort(unique(var1)))]
counts[, var2 := factor(var2, levels = sort(unique(var2)))]
data.table::setorder(counts, var1, var2)
data.table::setDF(counts)
counts
}
# From examples in the utils package: '?utils::globalVariables'
if (getRversion() >= "2.15.1") {
utils::globalVariables(c(".", "id", "var1", "var2", "N", "stratum",
"N_v1_str", "N_v2_str", "E", "N_tot", "N_v1",
"N_v2", "N_tot_str", "RR", "PRR_num", "PRR_den",
"PRR", "ids"))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.