#' Calculate Marginal Probabilities
#'
#' @description Calculates marginal probabilities or counts of all selected
#' columns. Option to include row-weights.
#'
#' @param data A data frame.
#' @param cols Columns on which to operate, tidy-select compatible. Columns must
#' contain only 1s and 0s (or `NA`), see [is_onezero()] for help.
#' @param weight An optional column of row-weights.
#' @param stat Either `"prob"` for probabilities or `"count"` for counts.
#' Default `"prob"`.
#' @param tidy Logical, whether or not to return results in tidy format.
#' Default `TRUE`.
#'
#' @importFrom dplyr select pull %>%
#' @importFrom tibble enframe
#' @importFrom collapse fmean
#'
#' @return Returns either a vector (if `tidy = FALSE`), or a tibble in long
#' form of the probabilities or counts of "A."
#'
#' @examples
#' marginal(
#' data = FoodSample,
#' cols = Bisque:PorkChop,
#' weight = weight,
#' tidy = TRUE
#' )
#'
#' @export
marginal <- function(
data, cols, weight, stat = "prob", tidy = TRUE
) {
# Check for correct stats -------------------------------------------------
stats.avail <- c("prob", "count")
if (!stat %in% stats.avail) {
stop(paste0(
"Stat '",
stat,
"' not recognized, please use one of 'prob' or 'count'."
))
}
# Parse out data and weights ----------------------------------------------
# In this section, the data used in the actual turf analysis is parsed out
# from the data set provided, and a vector of weights is either extracted
# from the data, or is created if not provided.
# Grab the data needed for the analysis
item.df <- select(data, {{cols}})
# Check and make sure the data is "onezero"
oz.check <- sapply(item.df, is_onezero)
bad.vars <- names(oz.check[!oz.check])
if (length(bad.vars) > 0) {
bad.vars.message <- paste0(
"The following variables do not meet the requirements of `is_onezero`:\n",
paste(bad.vars, collapse = ", ")
)
stop(bad.vars.message)
}
# Grab the names of the items
item.names <- names(item.df)
num.items <- length(item.names)
# Do weights exist? If so, grab them, if not, make them.
if (missing(weight)) {
ss <- nrow(data)
wgt.vec <- rep(1, times = ss)
} else {
wgt.df <- select(data, {{weight}})
if (ncol(wgt.df) > 1) {
stop("Can only provide one column of weights in `weight` argument.")
}
wgt.name <- names(wgt.df)
if (wgt.name %in% item.names) {
warning(paste0(
"Column '",
wgt.name,
"' was supplied as an input to both `cols` and `weights` arguments, this is likely ill-advised."
))
}
wgt.vec <- pull(wgt.df, {{weight}})
}
# Check for fully missing -------------------------------------------------
# If a variable has 100% missing values then the iteration may be skipped
# entirely and move on to the next. This will reduce iterations and avoid
# returning pairwise counts of zero when they should be NA.
all.miss <- sapply(item.df, function(x) mean(is.na(x)) == 1)
# Probabilities -----------------------------------------------------------
if (stat == "prob") {
out <- sapply(
X = item.df,
fmean,
w = wgt.vec,
na.rm = TRUE
)
out[all.miss] <- NA
if (tidy) {
out <- enframe(out, "var_a", "p")
}
}
# Counts ------------------------------------------------------------------
if (stat == "count") {
out <- sapply(
item.df,
function(x) sum(x * wgt.vec, na.rm = TRUE)
)
out[all.miss] <- NA
if (tidy) {
out <- enframe(out, "var_a", "n")
}
}
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.