R/marginal.R

Defines functions marginal

Documented in marginal

#' Calculate Marginal Probabilities
#'
#' @description Calculates marginal probabilities or counts of all selected
#' columns. Option to include row-weights.
#'
#' @param data A data frame.
#' @param cols Columns on which to operate, tidy-select compatible. Columns must
#' contain only 1s and 0s (or `NA`), see [is_onezero()] for help.
#' @param weight An optional column of row-weights.
#' @param stat Either `"prob"` for probabilities or `"count"` for counts.
#' Default `"prob"`.
#' @param tidy Logical, whether or not to return results in tidy format.
#' Default `TRUE`.
#'
#' @importFrom dplyr select pull %>%
#' @importFrom tibble enframe
#' @importFrom collapse fmean
#'
#' @return Returns either a vector (if `tidy = FALSE`), or a tibble in long
#' form of the probabilities or counts of "A."
#'
#' @examples
#' marginal(
#'     data = FoodSample,
#'     cols = Bisque:PorkChop,
#'     weight = weight,
#'     tidy = TRUE
#' )
#'
#' @export
marginal <- function(
    data, cols, weight, stat = "prob", tidy = TRUE
) {

    # Check for correct stats -------------------------------------------------

    stats.avail <- c("prob", "count")

    if (!stat %in% stats.avail) {
        stop(paste0(
            "Stat '",
            stat,
            "' not recognized, please use one of 'prob' or 'count'."
        ))
    }


    # Parse out data and weights ----------------------------------------------

    # In this section, the data used in the actual turf analysis is parsed out
    # from the data set provided, and a vector of weights is either extracted
    # from the data, or is created if not provided.

    # Grab the data needed for the analysis
    item.df <- select(data, {{cols}})

    # Check and make sure the data is "onezero"
    oz.check <- sapply(item.df, is_onezero)

    bad.vars <- names(oz.check[!oz.check])

    if (length(bad.vars) > 0) {

        bad.vars.message <- paste0(
            "The following variables do not meet the requirements of `is_onezero`:\n",
            paste(bad.vars, collapse = ", ")
        )

        stop(bad.vars.message)

    }

    # Grab the names of the items
    item.names <- names(item.df)
    num.items <- length(item.names)

    # Do weights exist? If so, grab them, if not, make them.
    if (missing(weight)) {

        ss <- nrow(data)
        wgt.vec <- rep(1, times = ss)

    } else {

        wgt.df <- select(data, {{weight}})

        if (ncol(wgt.df) > 1) {
            stop("Can only provide one column of weights in `weight` argument.")
        }

        wgt.name <- names(wgt.df)

        if (wgt.name %in% item.names) {
            warning(paste0(
                "Column '",
                wgt.name,
                "' was supplied as an input to both `cols` and `weights` arguments, this is likely ill-advised."
            ))
        }

        wgt.vec <- pull(wgt.df, {{weight}})

    }

    # Check for fully missing -------------------------------------------------

    # If a variable has 100% missing values then the iteration may be skipped
    # entirely and move on to the next. This will reduce iterations and avoid
    # returning pairwise counts of zero when they should be NA.
    all.miss <- sapply(item.df, function(x) mean(is.na(x)) == 1)


    # Probabilities -----------------------------------------------------------

    if (stat == "prob") {

        out <- sapply(
            X = item.df,
            fmean,
            w = wgt.vec,
            na.rm = TRUE
        )

        out[all.miss] <- NA

        if (tidy) {

            out <- enframe(out, "var_a", "p")

        }

    }


    # Counts ------------------------------------------------------------------

    if (stat == "count") {

        out <- sapply(
            item.df,
            function(x) sum(x * wgt.vec, na.rm = TRUE)
        )

        out[all.miss] <- NA

        if (tidy) {

            out <- enframe(out, "var_a", "n")

        }


    }

    return(out)

}
ttrodrigz/onezero documentation built on May 9, 2023, 2:59 p.m.