#' Summarizes veris enumerations from verisr objects
#'
#' This is the primary analysis function for veris. It conducts binomial
#' hypothesistests on veris data to enumerate the frequency of a given
#' enumeration or set of enumerations within a feature. (For example,
#' 'Malware', 'Hacking', etc within 'action').
#'
#' The 'by' parameter allows enumerating one feature by another, (for example
#' to count the frequency of each action by year).
#'
#' Unknowns are generally excluded as 'not tested'. If 'NA' is an enumeration
#' in the feature being enumerated, it must be specified with the 'na.rm'
#' parameter as whether NA should be included or not is highly dependent on
#' the hypothesis being tested.
#'
#' This function accurately enumerates single logical columns, character
#' feature columns, and features spanning multiple logical columns (such as
#' action.*). It cannot enumerate free-form text columns. It accurately
#' calculates the sample size 'n' as the number of rows (independent of the
#' number of enumerations present in the feature).
#'
#' GetenumCI() can also provide binomial confidence intervals for the
#' enumerations tested within the features. See the parameters for details.
#'
#' While getenumCI() may work on other types of dataframes, it was designed for
#' verisr dataframes and data.tables. It is not tested nor recommended for
#' any other type.
#'
#' @param veris A verisr object
#' @param enum A veris feature or enumeration to summarize
#' @param by A veris feature or enumeration to group by
#' @param na.rm A boolean of whether to include not applicable in the sample set.
#' This is REQUIRED if enum has a potential value of NA as there is no
#' 'default' method for handling NAs. Instead, it depends on the
#' hypothesis being tested.
#' @param unk A boolean referring whether to include 'unknown' in the sample.
#' The default is 'FALSE' and should rarely be overwritten.
#' @param short.names A boolean identifying whether to use the full enumeration
#' name or just the last section. (i.e. action.hacking.variety.SQLi vs
#' just SQLi.)
#' @param top Integer limiting the output to top enumerations.
#' @param ci.method A confidence interval method to use. Options are "mcmc" or "bootstrap". "bootstrap"uses the bayes process from the binom package. "mcmc" uses a binomial model based on rstan, rstanarm, brms.
#' @param cred.mass the amount of probability mass that will be contained in
#' reported credible intervals. This argument fills a similar role as
#' conf.level in \code{\link{binom.test}}.
#' @param round.freq An integer indicating how many places to round
#' the frequency value to. (default = 5)
#' @param na DEPRECIATED! Use '\code{na.rm}' parameter.
#' @param ci.level DEPRECIATED! same as \code{cred.mass}.
#' @param force getenumCI() will attempt to enforce sane confidence-based practices (such as hiding x and freq in low sample sizes). Setting force to 'TRUE' will override these best practices.
#' @param quietly When TRUE, suppress all warnings and messages. This is helpful when getenumCI is used in a larger script or markdown document.
#' @param ... A catch all for functions using arguments from previous
#' versions of getenum.
#' @return A data frame summarizing the enumeration
#' @export
#' @examples
#' tmp <- tempfile(fileext = ".dat")
#' download.file("https://github.com/vz-risk/VCDB/raw/master/data/verisr/vcdb.dat", tmp, quiet=TRUE)
#' load(tmp, verbose=TRUE)
#' library(magrittr)
#' chunk <- getenumCI(vcdb, "action.hacking.variety")
#' chunk
#' chunk <- getenumCI(vcdb, "action.hacking.variety", top=10)
#' chunk <- getenumCI(vcdb, "action.hacking.variety", by="timeline.incident.year")
#' chunk
#' chunk <- getenumCI(vcdb,
#' "action.hacking.variety",
#' by="timeline.incident.year")
#' chunk %>%
#' dplyr::select(by, enum, freq) %>%
#' tidyr::pivot_wider(names_from=enum, values_from=freq, values_fill = list(freq=0))
#' getenumCI(vcdb, "action")
#' getenumCI(vcdb, "asset.variety")
#' getenumCI(vcdb, "asset.assets.variety")
#' getenumCI(vcdb, "asset.assets.variety", ci.method="wilson")
#' getenumCI(vcdb, "asset.cloud", na.rm=FALSE)
#' getenumCI(vcdb, "action.social.variety.Phishing")
#' getenumCI(vcdb, "actor.*.motive", ci.method="wilson", na.rm=FALSE)
#' rm(vcdb)
getenumCI2020 <- function(veris,
enum,
by=NULL,
na.rm = NULL,
unk=FALSE,
short.names=TRUE,
ci.method=c(),
cred.mass=0.95,
ci.level=NULL,
round.freq=5,
na = NULL,
top = NULL,
force = FALSE,
quietly = FALSE,
...) {
lifecycle::deprecate_warn("version 2.3.2.001", "verisr::getenumCI2020()", "verisr::getenumCI2021()")
# Below this value for 'n', apply ci best practices from https://github.com/vz-risk/dbir-private/issues/43 unless force = TRUE
ci_n <- 30 # chosen semi-arbitrarily, but it is roughly where frequentist normal approximations break down.
n_floor <- 5 # chosen semi-arbitrarily. return empty dataframe
# even though the parameter is 'na.rm', we still use 'na' internally.
if (!is.null(na.rm)) {
na = !na.rm # if na.rm is set, change na to it. (na is the logical opposit of na.rm)
} else if (!is.null(na)) {
if (!quietly) { warning("'na' is depriciated. please use 'na.rm'.") }
}
# legacy veris objects are data tables, however data tables cause problems.
if (data.table::is.data.table(veris)) {
df <- as.data.frame(veris)
} else {
df <- veris
}
#
if (!is.null(ci.level)) {
cred.mass <- ci.level
}
# because we aren't keeping the 'method' and don't want to duplicate rows for each method, only 1 allowed.
if (length(ci.method) > 1) {
if (!quietly) { warning("More than one confidence interval method specified. Using first.") }
ci.method <- ci.method[1]
}
if (!is.null(ci.method) && !ci.method %in% c("mcmc", "bootstrap")) {
stop(paste0("ci.method ", ci.method, " not one of c('mcmc', 'bootstrap')."))
}
if (ci.method == "mcmc" && length(intersect(c("brms", "tidybayes"), rownames(installed.packages()))) < 2) {
ci.method <- "bootstrap"
if (!quietly) { warning("ci.method set to mcmc, but 'brms' and 'tidybayes' not both installed. updating ci.method to 'bootstrap'") }
}
if (!is.null(top) && top < 1) {
if (!quietly) { warning(paste0("Top must be 1 or greater, but is (", top, "). Setting top to NULL.")) }
top <- NULL
}
# create a list of enumerations to calculate 'enum' _by_
if (!is.null(by)) {
by_enums <- grep(paste0("^",by,"[.][A-Z0-9][^.]*$"), names(df), value=TRUE)
if (length(by_enums) > 0) {
by_type <- "multinomial"
# by_class <- "character"
} else {
by_enums <- grep(paste0("^",by,"$"), names(df), value=TRUE)
if (length(by_enums) == 1) { # could be > 0, but this should help throw errors when not functioning properly. - gdb 090116
by_enums <- unique(df[[by]])
by_type <- "single_column"
# by_class <- class(by_enums)
} else {
if (!quietly) { warning(paste0("No column matching 'by' value ", by, ". Ignoring 'by' value.")) }
by_enums <- c(NA)
by_type <- "none"
# by_class <- "character"
}
}
} else {
by_enums <- c(NA)
by_type <- "none"
# by_class <- "character"
}
# we use do.call because we don't know how many things we'll be rbinding.
# instead, we just get a list of them using lapply and then rbind
# on that list.
# TODO: Parallelize this as time scales linearly with the number of 'by_enums'
#chunk <- do.call(rbind, lapply(by_enums, function(x) { # switching as you can end up with the wrong number of columns if CI protection triggers on one subchunk and not another - GDB 201216
chunks_list <- lapply(by_enums, function(x) {
# subset DF to just the portion we're currently dealing with
if (by_type == "multinomial") {
subdf <- df[df[[x]], ]
} else if (by_type == "single_column") {
subdf <- df[df[[by]] == x, ]
} else { # catchall for by_type == 'none', i.e. keep the whole df
subdf <- df
}
# select the columns that match the enumeration and characterize it's/their type
enum_enums <- grep(paste0("^",enum,"[.][A-Z0-9][^.]*$"), names(subdf), value=TRUE)
if (length(enum_enums) > 0) {
enum_type <- "multinomial"
} else {
enum_enums <- grep(paste0("^",enum,"$"), names(subdf), value=TRUE)
if (length(enum_enums) == 1) { # could be > 0, but this should help throw errors when not functioning properly. - gdb 090116
if (is.logical(subdf[[enum_enums]])) {
enum_type <- "logical"
short_names <- gsub('^.*[.]([^.]+$)', "\\1", names(subdf))
logical_enum <- enum_enums
enum_enums <- grep(paste0("^", gsub('^(.*)[.]([^.]+$)', "\\1", logical_enum), "[.][A-Z0-9][^.]*$"), names(subdf), value=TRUE)
} else {
enum_type <- "single_column"
}
} else if (length(enum_enums) > 1) {
stop(paste0("Enum ", enum, " resolves to multiple logical columns with a single logical enumeration.",
" This is a known limitation of getenumCI. ",
" Remove the enumeration at the end and instead filter the output of getenumCI to the enumeration of interest."))
enum_type <- "multinomial"
} else {
stop(paste0("Enum ", enum, " did not resolve to any columns."))
}
}
## This entire section calculates the sample size. We do it early so we can adjust 'top' if n < ci_n
# This allows us to handle numerical/factor/character and logical enumerations
if (enum_type == "multinomial" | enum_type == "logical") {
subdf_for_n <- subdf[, enum_enums]
if (ncol(subdf_for_n) <= 0) { stop(paste(c("No columns matched feature(s) ", enum, " using regex ", paste0("^",enum,"[.][A-Z0-9][^.]*$"), collapse=" ")))}
# we remove unknowns because they should normally not be counted
if (unk == FALSE) {
if (short.names) {
subdf_for_n <-
subdf_for_n <- subdf_for_n[, enum_enums][, !grepl("^(.+[.]|)(U|[A-Za-z]{1,3} - [U|u])nknown$", names(subdf_for_n))] # if short names, bla - unknown is removed. See logical section for why. - GDB 17-01-30
} else {
subdf_for_n <- subdf_for_n[, !grepl("^(.+[.]|)(U|u)nknown$", names(subdf_for_n))] # if long names, bla - unknown is kept in sample. See logical section for why. - GDB 17-01-30
}
}
# Whether to use NAs or not depends on the hypothesis being tested so we require an answer (no default)
if (is.null(na) & any(grep("[.]NA$", names(subdf_for_n)))) { stop("'na' must be specified if any column names end in .NA")}
if (!is.null(na)) {
if (na == FALSE) {
subdf_for_n <- subdf_for_n[, !grepl(".NA$", names(subdf_for_n)), ]
}
}
# number of records that have one of our enumerations
n <- sum(rowSums(subdf_for_n, na.rm=TRUE) > 0, na.rm=TRUE)
# count of each enumeration
} else if (enum_type == "single_column") {
table_v <- table(subdf[[enum_enums]])
v <- as.integer(table_v)
names(v) <- names(table_v)
n <- sum(v, na.rm=TRUE)
# remove unknowns
if (unk == FALSE) {
n <- n - sum(v[grepl("^(.+[.]|)(U|u)nknown$", names(v))], na.rm=TRUE) # Doesn't handle `Bla - unknown` as these bastardized hierarchies shouldn't be in a single character column. - gdb 17-01-30
}
# remove NAs
if (is.null(na) & any(grepl("^(.+[.]|)NA$", names(v)))) { stop("'na' must be specified if any column names end in .NA")}
if (!is.null(na)) {
if (na == FALSE) {
n <- n - sum(v[grepl("^(.+[.]|)NA$", names(v))], na.rm=TRUE)
}
}
} else {
stop("class of 'enum' column(s) was not identified, preventing summarization.")
}
# if we aren't rocing and 'top' was set and the sample size was less than ci_n, rerun but with 'top' off
if (!force && !is.null(top) && top > 1 && n < ci_n) {
top <- NULL
if (!quietly) { warning(paste0("Parameter 'top' ignored when 'n' < ", ci_n, ". 'Top' can only be used if there is a clear break where confidence intervals between two ordered records don't overlap. Please calculate interdependance, set 'top' to the appropriate breakpoint, and use 'force=TRUE' to avoid this warning.")) }
}
# Subset to top enums
if (!is.null(top)) {
if (enum_type == "logical") {
if (!quietly) { warning(paste0("Parameter 'top' incompatible with single logical column enumeration ", enum_enums, ". Skipping filtering to top.")) }
} else {
# start by getting a count of each enumeration
if (enum_type == "single_column") {
enum_counts <- table(subdf[[enum_enums]])
} else if (enum_type == "multinomial") {
if (short.names) {
# convert to short names and keep mapping
name_mapping <- data.frame(full_name = enum_enums, stringsAsFactors = FALSE)
name_mapping[["short_name"]] <- gsub('^.*[.]([^.]+$)', "\\1", name_mapping[["full_name"]])
# iterate over unique short names and get counts for them
short_names <- unique(name_mapping$short_name)
enum_counts <- unlist(lapply(short_names, function(s) {
ret <- subdf[, name_mapping[name_mapping$short_name == s, ][["full_name"]]]
if (!is.null(ncol(ret))) { # if it's a vector and ncol is null, you can skip applying
ret <- apply(ret, MARGIN=1, any)
}
ret <- sum(ret, na.rm=TRUE)
ret
}))
names(enum_counts) <- short_names
} else {
enum_counts <- colSums(subdf[ , enum_enums]) # BUG: This does not sum same-cols in short-name mode.
}
} else {
stop("class of 'enum' column(s) was not identified, preventing filtering of top items and further processing")
}
# Remove things that should not be a top enumeration. This includes 'Other', 'Unknown', and 'na' (if na.rm=TRUE)
enum_counts <- enum_counts[!grepl("^(.+[.]|)(O|o)ther$", names(enum_counts))]
if (!is.null(unk)) {
if (unk == FALSE) {
enum_counts <- enum_counts[!grepl("^(.+[.]|)(U|u)nknown$", names(enum_counts))]
}
}
if (!is.null(na)) {
if (na == FALSE) {
enum_counts <- enum_counts[!grepl("^(.+[.]|)NA$", names(enum_counts))]
}
}
# order the enumerations and take the top ones
# top enums are the actual top enums, plus 'Other', 'Unknown', and potentially NA
top_enums <- names(enum_counts[rank(-enum_counts, ties.method="min") <= top]) # , grep("^(.+[.]|)(O|o)ther$", enum_enums, value=TRUE)
top_enums <- intersect(top_enums, names(enum_counts[enum_counts > 0])) # attempt to remove zero from top counts
if (enum_type == "multinomial" && short.names) {
top_enums <- name_mapping[name_mapping$short_name %in% top_enums, ][["full_name"]]
}
if (!is.null(unk)) {
if (unk == FALSE) {
top_enums <- c(top_enums, grep("^(.+[.]|)(U|u)nknown$", enum_enums, value=TRUE))
}
}
if (!is.null(na)) {
if (na == FALSE) {
top_enums <- c(top_enums, grep("^(.+[.]|)NA$", enum_enums, value=TRUE))
}
}
# when we assign 'other' to things not in the top enum list, make sure to not assign it to blank or NA as well
not_top_enums <- setdiff(enum_enums, c(top_enums, ''))
not_top_enums <- not_top_enums[!is.na(not_top_enums)]
if (length(not_top_enums) > 0) {
if (enum_type == "single_column") {
subdf[grepl(paste0("^", paste(not_top_enums, collapse="|")), subdf[[enum_enums]]), enum_enums] <- "Other"
} else {
if (length(intersect(not_top_enums, names(subdf))) <= 0) {
subdf[[paste0(enum, ".Other")]] <- FALSE
} else if (length(intersect(not_top_enums, names(subdf))) == 1) {
subdf[[paste0(enum, ".Other")]] <- subdf[[not_top_enums]] # this likely is assigning <enum>.Other to itself, but just in case we'll define it generically.
} else {
subdf[[paste0(enum, ".Other")]] <- unlist(apply(subdf[, not_top_enums], MARGIN=1, any))
}
enum_enums <- c(intersect(top_enums, enum_enums), paste0(enum, ".Other"))
}
}
}
}
# This allows us to handle numerical/factor/character and logical enumerations
if (enum_type == "multinomial" | enum_type == "logical") {
subdf <- subdf[, enum_enums]
if (ncol(subdf) <= 0) { stop(paste(c("No columns matched feature(s) ", enum, " using regex ", paste0("^",enum,"[.][A-Z0-9][^.]*$"), collapse=" ")))}
## 'n' now calcualted farther above to allow turning 'top' off for small 'n'.
# # we remove unknowns because they should normally not be counted
# if (unk == FALSE) {
# if (short.names) {
# subdf_for_n <- subdf[, !grepl("^(.+[.]|)(U|[A-Za-z]{1,3} - [U|u])nknown$", names(subdf))] # if short names, bla - unknown is removed. See logical section for why. - GDB 17-01-30
# } else {
# subdf_for_n <- subdf[, !grepl("^(.+[.]|)(U|u)nknown$", names(subdf))] # if long names, bla - unknown is kept in sample. See logical section for why. - GDB 17-01-30
# }
# } else {
# subdf_for_n <- subdf
# }
#
# # Whether to use NAs or not depends on the hypothesis being tested so we require an answer (no default)
# if (is.null(na) & any(grep("[.]NA$", names(subdf)))) { stop("'na' must be specified if any column names end in .NA")}
# if (!is.null(na)) {
# if (na == FALSE) {
# subdf_for_n <- subdf_for_n[, !grepl(".NA$", names(subdf_for_n)), ]
# }
# }
#
# # number of records that have one of our enumerations
# n <- sum(rowSums(subdf_for_n, na.rm=TRUE) > 0, na.rm=TRUE)
# # count of each enumeration
# if short.names, combine columns with short names. (Rather than summing same short name after calculating column sums, which double-counts in 'x'.)
if (short.names) {
short_names <- gsub('^.*[.]([^.]+$)', "\\1", names(subdf))
subdf <- do.call(cbind, lapply(unique(short_names), function(y) { # bind the list of columns returned by lapply
dups <- grep(paste0("^(",y,")$"), short_names)
if (length(dups) > 1) {
#feature <- apply(subdf[ , grep(paste0("^(",y,")$"), short_names)], MARGIN=1, any) # 'grep' selects columns with the name, apply checks if any of the row are true
feature <- apply(subdf[ , which(y == short_names)], MARGIN=1, any) # replaced 'grep' with 'which' due to some 'y' containing regex control characters.
} else {
#feature <- subdf[ , grep(paste0("^(",y,")$"), short_names)]
feature <- subdf[, which(y == short_names)] # replaced 'grep' with 'which' due to some 'y' containing regex control characters.
}
feature <- data.frame(feature)
if (ncol(feature) > 0) {
names(feature) <- y
}
feature
}))
}
v <- colSums(subdf, na.rm=TRUE) # used instead of a loop or plyr::count to compute x
} else if (enum_type == "single_column") {
table_v <- table(subdf[[enum_enums]])
v <- as.integer(table_v)
names(v) <- names(table_v)
# 'n' now calcualted above for removing 'top' if n < ci_n
# n <- sum(v, na.rm=TRUE)
# # remove unknowns
# if (unk == FALSE) {
# n <- n - sum(v[grepl("^(.+[.]|)(U|u)nknown$", names(v))], na.rm=TRUE) # Doesn't handle `Bla - unknown` as these bastardized hierarchies shouldn't be in a single character column. - gdb 17-01-30
# }
#
# # remove NAs
# if (!is.null(na)) {
# if (na == FALSE) {
# n <- n - sum(v[grepl("^(.+[.]|)NA$", names(v))], na.rm=TRUE)
# }
# }
} else {
stop("class of 'enum' column(s) was not identified, preventing summarization.")
}
# create the chunk for this 'by'
if (short.names) { # if short names, we treat `foo.Bar - unknown` as an unknown, similar to foo.bar.variety.Unknown being truncated to 'Unknown'
subchunk <- data.frame(enum=names(v), x=v, n=rep(n, length(v)), freq=v/n)
enum_subchunk <- subchunk[!grepl("^(.+[.]|)(U|[A-Za-z]{1,3} - [U|u])nknown$", subchunk$enum) & !grepl("^(.+[.]|)NA$", subchunk$enum), ]
unk_subchunk <- subchunk[grepl("^(.+[.]|)(U|[A-Za-z]{1,3} - [U|u])nknown$", subchunk$enum), ]
na_subchunk <- subchunk[grepl("^(.+[.]|)NA$", subchunk$enum), ]
} else { # if long names, we include `foo.Bar - unknown` because we wouldn't truncate, and therefor would include, foo.bar.variety.Unknown
subchunk <- data.frame(enum=names(v), x=v, n=rep(n, length(v)), freq=v/n)
enum_subchunk <- subchunk[!grepl("^(.+[.]|)(U|u)nknown$", subchunk$enum) & !grepl("^(.+[.]|)NA$", subchunk$enum), ]
unk_subchunk <- subchunk[grepl("^(.+[.]|)(U|u)nknown$", subchunk$enum), ]
na_subchunk <- subchunk[grepl("^(.+[.]|)NA$", subchunk$enum), ]
}
# n is not applicable for Unknown (and potentially na) rows so zero it out
if (unk == FALSE & nrow(unk_subchunk) > 0) {
unk_subchunk[ , c("n", "freq")] <- NA
}
if (!is.null(na)) {
if (na == FALSE & nrow(na_subchunk) > 0) {
na_subchunk[ , c("n", "freq")] <- NA
}
}
# Because we will remove 'x' and 'freq', there must be a ci.method set if n < ci_n and force != TRUE
if (!force & n < ci_n) {
if (length(ci.method) <= 0) {
if (!quietly) { warning(paste0("ci.method must be set if 'n' < ", ci_n, " and force != TRUE. Setting ci.method to 'bootstrap' for enumeration ", x, ". To avoid this warning, please set 'ci.method' to either 'mcmc' or 'bootstrap' or force=TRUE.")) }
ci.method <- "bootstrap"
}
}
# apply the confidence interval. Apply to NA's and unk separately depending on if selected. (If you try and apply CI's cart blanc to the NA/Unknowns it can error out on binding the columns)
if (!is.null(ci.method) && ci.method == "bootstrap") {
if (nrow(enum_subchunk) > 0) {
# replacing the one-liner with this because binom.bayes has a bug that errors if the ends (0 or n) are included in x and number of rows > 3
enum_subchunk[['method']] <- "bootstrap"
enum_subchunk[['lower']] <- NA
enum_subchunk[['upper']] <- NA
if (any(enum_subchunk$x != n & enum_subchunk$x != 0)) { # check if any aren't 0 or n. otherwise, calculating a CI will fail.
enum_subchunk[enum_subchunk$x != n & enum_subchunk$x != 0, c('lower', 'upper') ] <- binom::binom.bayes(as.integer(enum_subchunk[enum_subchunk$x != n & enum_subchunk$x != 0, 'x' ]), as.integer(enum_subchunk[enum_subchunk$x != n & enum_subchunk$x != 0, 'n' ], conf.level=cred.mass))[ , c(7, 8)]
}
## Commenting out the 'if' blocks as I _think_ they are unnecessary
#if (0 %in% enum_subchunk$x) {
enum_subchunk[enum_subchunk$x == 0, c('lower', 'upper')] <- binom::binom.bayes(0, n, conf.level=cred.mass)[, c(7, 8)]
#}
#if (n %in% enum_subchunk$x) {
enum_subchunk[enum_subchunk$x == n, c('lower', 'upper')] <- binom::binom.bayes(n, n, conf.level=0.95)[, c(7, 8)]
#}
# enum_subchunk <- cbind(enum_subchunk, data.frame(method="bootstrap"), binom::binom.confint(enum_subchunk$x, enum_subchunk$n, conf.level=cred.mass, methods="bayes")[ , c(5, 6)])
} else {
enum_subchunk <- cbind(enum_subchunk, data.frame(method=character(), lower=numeric(), upper=numeric()))
}
if (unk == FALSE) {
unk_subchunk <- cbind(unk_subchunk, data.frame(method=rep(NA, nrow(unk_subchunk)), lower=rep(NA, nrow(unk_subchunk)), upper=rep(NA, nrow(unk_subchunk))))
} else if (nrow(unk_subchunk) >0) {
unk_subchunk <- cbind(unk_subchunk, data.frame(method="bootstrap"), binom::binom.confint(unk_subchunk$x, unk_subchunk$n, conf.level=cred.mass, methods="bayes")[ , c(5, 6)])
} else {
unk_subchunk <- data.frame()
}
if (!is.null(na)) {
if (na == FALSE) {
na_subchunk <- cbind(na_subchunk, data.frame(method=rep(NA, nrow(na_subchunk)), lower=rep(NA, nrow(na_subchunk)), upper=rep(NA, nrow(na_subchunk))))
} else if (nrow(na_subchunk) > 0) {
na_subchunk <- cbind(na_subchunk, data.frame(method="bootstrap"), binom::binom.confint(na_subchunk$x, na_subchunk$n, conf.level=cred.mass, methods="bayes")[ , c(5, 6)])
} else {
na_subchunk <- data.frame()
}
}
} else if (!is.null(ci.method) && ci.method == "mcmc") {
# An MCMC bayes approach to the confidence interval
# First step is to join all the sections that need to be modeled. That way we don't have to compile multiple models
subchunk_to_ci <- enum_subchunk[FALSE, ] # create an initial empty dataframe
if (nrow(enum_subchunk) > 0) {
subchunk_to_ci <- rbind(subchunk_to_ci, enum_subchunk)
}
if (!is.null(unk) && (unk)) {
subchunk_to_ci <- rbind(subchunk_to_ci, unk_subchunk)
}
if (!is.null(na) && (na)) {
subchunk_to_ci <- rbind(subchunk_to_ci, na_subchunk)
}
# compile the model
if (nrow(subchunk_to_ci) > 0 ) {
# I use a simple binomial model, but other options may provide better estimates:
# Also considered family=zero_inflated_binomial()
# Also considered family=beta_binomial2 from https://cran.r-project.org/web/packages/brms/vignettes/brms_customfamilies.html
#suppressWarnings(requireNamespace('brms'))
suppressWarnings(require('rstan')) # because rstan has issues if not loaded.
m <- suppressWarnings(brms::brm(x | trials(n) ~ (1|enum),
data=subchunk_to_ci,
family = binomial(),
control = list(adapt_delta = .90, max_treedepth=10),
silent=TRUE, refresh=0, open_progress=FALSE)) # suppress most messages
mcmc <- tidybayes::spread_draws(m, b_Intercept, r_enum[enum,])
mcmc$condition_mean <- logit2prob(mcmc$b_Intercept + mcmc$r_enum)
mcmc <- tidybayes::median_qi(mcmc, .width=cred.mass)
# brms rewrites the column names. We're going to _try_ and fix that by mapping them back to the chunk enums.
# this may or may not work since we can't be _sure_ they sort the same.
# As such, this is a hack until a mapping can be extracted from the model object or the same function used internally can be used to create a mapping.
#mcmc$enum <- plyr::mapvalues(enum, sort(unique(enum)), sort(levels(chunk$enum)))
# Per https://discourse.mc-stan.org/t/brms-non-standard-variable-name-modification/11412
mcmc$enum <- plyr::mapvalues(mcmc$enum, gsub("[ \t\r\n]", ".", as.character(unique(subchunk_to_ci$enum))), as.character(unique(subchunk_to_ci$enum))) # it would be nice to not have to import plyr, but oh well. - GDB 191123
}
# separate the values back into their respective subchunks
if (nrow(enum_subchunk) > 0) {
enum_subchunk <- cbind(enum_subchunk, data.frame(method="mcmc", lower=mcmc$condition_mean.lower[match(enum_subchunk$enum, mcmc$enum)], upper=mcmc$condition_mean.upper[match(enum_subchunk$enum, mcmc$enum)]))
mcmc <- mcmc[(nrow(enum_subchunk)+1):nrow(mcmc), ] #remove the subchunk rows.
} else {
enum_subchunk <- cbind(enum_subchunk, data.frame(method=character(), lower=numeric(), upper=numeric()))
}
if (unk == FALSE) {
unk_subchunk <- cbind(unk_subchunk, data.frame(method=rep(NA, nrow(unk_subchunk)), lower=rep(NA, nrow(unk_subchunk)), upper=rep(NA, nrow(unk_subchunk))))
} else if (nrow(unk_subchunk) >0) {
unk_subchunk <- cbind(unk_subchunk, data.frame(method="mcmc", lower=mcmc$condition_mean.lower[[1]], upper=mcmc$condition_mean.upper[[1]]))
mcmc <- mcmc[2:nrow(mcmc), ] #remove the subchunk rows.
} else {
unk_subchunk <- data.frame()
}
if (!is.null(na)) {
if (na == FALSE) {
na_subchunk <- cbind(na_subchunk, data.frame(method=rep(NA, nrow(na_subchunk)), lower=rep(NA, nrow(na_subchunk)), upper=rep(NA, nrow(na_subchunk))))
} else if (nrow(na_subchunk) > 0) {
na_subchunk <- cbind(na_subchunk, data.frame(method="mcmc", lower=mcmc$condition_mean.lower[[1]], upper=mcmc$condition_mean.upper[[1]]))
} else {
na_subchunk <- data.frame()
}
}
}
# If logical (rather than multinomial), remove all rows other than the one logical
if (enum_type == "logical") {
if (short.names) {
logical_enum_end <- gsub('^(.*)[.]([^.]+$)', "\\2", logical_enum)
enum_subchunk <- enum_subchunk[enum_subchunk$enum == logical_enum_end, ]
} else {
enum_subchunk <- enum_subchunk[enum_subchunk$enum == logical_enum, ]
}
}
# recombine the portions of the subchunk
subchunk <- rbind(enum_subchunk, na_subchunk, unk_subchunk)
# add the 'by' column
subchunk <- cbind(rep(x, nrow(subchunk)), subchunk)
names(subchunk)[1] <- "by"
if (!force & n < ci_n) {
subchunk$x[!is.na(subchunk$n)] <- NA
subchunk$freq[!is.na(subchunk$n)] <- NA
}
subchunk # return
})
## Since some chunks may be below ci_min
#if (any(unlist(lapply(chunks_list, function(l) {"lower" %in% names(l)})))) {
chunks_list <- lapply(chunks_list, function(subchunk) {
if (nrow(subchunk) == 0) { # handles zero values
subchunk <- data.frame(by = factor(), x = integer(), n = integer(), freq = numeric(), method = character(), lower = numeric(), upper = numeric())
} else if (!"lower" %in% names(subchunk)) { # handles some values but not enough
subchunk$method <- "none"
subchunk$lower <- NA_real_
subchunk$upper <- NA_real_
}
subchunk
})
#}
chunk <- do.call(rbind, chunks_list) # now that we've fixed any column issues, bind the columns again
# if there was no 'by', delete the 'by' column
if (by_type != "multinomial" && by_type != "single_column") {
chunk <- chunk[ , -1]
}
# if short names, only use the bit of the enum name after the last period
if (short.names) {
if (is.character(chunk$enum)) {
# chunk$enum <- as(stringr::str_match(chunk$enum, "[^.]+$"), by_class)
# chunk$enum <- unlist(regmatches(chunk$enum, regexec("[^.]+$", chunk$enum)))
chunk$enum <- gsub('^.*[.]([^.]+$)', "\\1", chunk$enum)
}
if("by" %in% names(chunk)) {
if (is.character(chunk$by)) {
# chunk$by <- as(stringr::str_match(chunk$by, "[^.]+$"), by_class)
# chunk$by <- unlist(regmatches(chunk$by, regexec("[^.]+$", chunk$by)))
chunk$by <- gsub('^.*[.]([^.]+$)', "\\1", chunk$by)
}
}
}
if (round.freq>0) {
chunk$freq <- round(chunk$freq, round.freq)
if ("lower" %in% names(chunk)) {
chunk$lower <- round(chunk$lower, round.freq)
chunk$upper <- round(chunk$upper, round.freq)
}
}
# reorder output
# if !force and n < ci_n, order enum alphabetically, otherwise order by freq.
if (!force & any(chunk$n < ci_n, na.rm=TRUE)) {
if ("by" %in% names(chunk)) {
chunk <- chunk[order(chunk$by, as.character(chunk$enum)), ]
chunk$enum <- factor(chunk$enum, levels=rev(unique(chunk$enum)))
} else {
chunk <- chunk[order(as.character(chunk$enum)), ]
chunk$enum <- factor(chunk$enum, levels=rev(unique(chunk$enum)))
}
} else {
if ("by" %in% names(chunk)) {
chunk <- chunk[order(chunk$by, -chunk$freq), ]
chunk$enum <- factor(chunk$enum, levels=rev(unique(chunk$enum)))
} else {
chunk <- chunk[order(-chunk$freq), ]
chunk$enum <- factor(chunk$enum, levels=unique(chunk$enum))
}
}
# replace row numbers
rownames(chunk) <- seq(length=nrow(chunk))
# rownames(chunk) <- NULL
if (!force) {
if (any(chunk$n < n_floor, na.rm=TRUE)) {
if (!quietly) { warning(paste0("Removing all rows with n < ", n_floor, ". ", n_floor, " is the smallest samples size appropriate for the DBIR. Use force = TRUE to avoid this removal. If this leaves n othing but columns with no sample size, those will be removed as well.")) }
}
chunk <- chunk[is.na(chunk$n) | chunk$n >= n_floor, ]
if (all(is.na(chunk$n))) {
chunk <- chunk[!is.na(chunk$n), ]
}
}
# return
chunk
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.