blocs: Estimate and Visualize Voting Blocs' Partisan Contributions

Documented in vb_discrete wtd_table

#' Discrete voting bloc analysis
#'
#' Define voting blocs along a \strong{discrete} variable and estimate their partisan
#' vote contributions.
#'
#' @param data               default data.frame to use as the source for
#'   density, turnout, and vote choice data.
#' @param data_density   data.frame of blocs' composition/density data. Must
#'   include any columns named by \code{indep} and \code{weight}.
#' @param data_turnout   data.frame of blocs' turnout data. Must include any
#'   columns named by \code{dv_turnout}, \code{indep} and
#'   \code{weight}.
#' @param data_vote      data.frame of blocs' vote choice data. Must include any
#'   columns named by \code{dv_vote3}, \code{indep}, and \code{weight}.
#' @param indep      string, column name of the independent variable defining
#'   discrete voting blocs.
#' @param dv_vote3        string, column name of the dependent variable in \code{data_vote}, coded as
#'   follows: -1 for Democrat vote choice, 0 for third-party vote, 1 for
#'   Republican vote choice, and NA for no vote.
#' @param dv_turnout     string, column name of the dependent variable flagging
#'   voter turnout in \code{data_turnout}. That column must be coded 0 =  no vote, 1 = voted.
#' @param weight     optional string naming the column of sample weights.
#' @param boot_iters integer, number of bootstrap iterations for uncertainty
#'   estimation. The default \code{FALSE} is equivalent to 0 and does not estimate
#'   uncertainty.
#' @param verbose        logical, whether to print iteration number.
#' @param check_discrete logical, whether to check if \code{indep} is a discrete variable.
#'
#' @return A \code{vbdf} object.
#' @importFrom dplyr %>%
#'
#' @export


vb_discrete <-
    function(data,
             data_density = data, data_turnout = data, data_vote = data,
             indep, dv_vote3, dv_turnout,
             weight = NULL, boot_iters = FALSE,
             verbose = FALSE, check_discrete = TRUE){

        if(dplyr::is_grouped_df(data_density)){
            stop("Voting blocs analysis does not permit grouped data frames.\n Please use split-apply-combine to analyze multiple years, or pass multiple column names to the `indep` parameter for multivariate blocs.")
        }

        stopifnot(is.data.frame(data_density))
        stopifnot(is.data.frame(data_turnout))
        stopifnot(is.data.frame(data_vote))

        if(!all(rlang::has_name(data_density, indep)))
            stop(sprintf("%s not found in data_density", indep))

        if(!all(rlang::has_name(data_turnout, indep)))
            stop(sprintf("%s not found in data_turnout", indep))
        if(!rlang::has_name(data_turnout, dv_turnout))
            stop(sprintf("%s not found in data_turnout", dv_turnout))

        if(!all(rlang::has_name(data_vote, indep)))
            stop(sprintf("%s not found in data_vote", indep))
        if(!rlang::has_name(data_vote, dv_vote3))
            stop(sprintf("%s not found in data_vote", dv_vote3))

        if( check_discrete & dplyr::n_distinct(collapse::get_vars(data_density, indep)) > 50)
            stop("More than 50 unique values detected in indep. \nIf you are sure you don't want vb_continuous(), set check_discrete = FALSE.")

        # Start with NULL weights = 1, but grab the col if present
        weight_density <- rep(1L, nrow(data_density))
        weight_turnout <- rep(1L, nrow(data_turnout))
        weight_vote    <- rep(1L, nrow(data_vote))


        if(!is.null(weight)) {
            if(rlang::has_name(data_density, weight))
                weight_density <- c(data_density[[weight]])
            else stop(sprintf("%s not found in data_density", weight))

            if(rlang::has_name(data_turnout, weight))
                weight_turnout <- c(data_turnout[[weight]])
            else stop(sprintf("%s not found in data_turnout", weight))

            if(rlang::has_name(data_vote, weight))
                weight_vote    <- c(data_vote[[weight]])
            else stop(sprintf("%s not found in data_vote", weight))

            # Check for negative weights
            if(
                any(
                    weight_density <= 0,
                    weight_turnout <= 0,
                    weight_vote <= 0
                )
            ) stop("Weights must be greater than zero.")

        }

        # Force independent variables to be discrete
        data_density <-
            data_density %>%
            collapse::ftransformv(vars = indep,
                                  FUN  =  function(x)
                                      if(is.factor(x)) x else collapse::qF(x))

        data_turnout <-
            data_turnout %>%
            collapse::ftransformv(vars = indep,
                                  FUN  =  function(x)
                                      if(is.factor(x)) x else collapse::qF(x))

        data_vote <-
            data_vote %>%
            collapse::ftransformv(vars = indep,
                                  FUN  =  function(x)
                                      if(is.factor(x)) x else collapse::qF(x))

        # Boostrap setup ----
        if(length(boot_iters) == 1){
            boot_iters_density <-
                boot_iters_turnout <-
                boot_iters_vote <- boot_iters
        } else {

            if(!all(rlang::has_name(boot_iters, c("density", "turnout", "vote"))))
                stop("If boot_iters has length greater than 1, you must name each value according to the data set:
                     'density', 'turnout', or 'vote'")

            boot_iters_density <-
                boot_iters[pmatch("density", names(boot_iters))]
            boot_iters_turnout <-
                boot_iters[pmatch("turnout", names(boot_iters))]
            boot_iters_vote    <-
                boot_iters[pmatch("vote", names(boot_iters))]
        }

        results_base <-
            collapse::get_vars(data_density, indep) %>%
            collapse::funique()

        results_list <- list()

        # Probability mass calculation ----
        # Create matrix of data-row indices for each iteration
        # For iters = 0, returns the original row indices
        itermat_density <-
            boot_mat(nrow(data_density), iters = boot_iters_density,
                     weight = weight_density)

        # Remove weights when using resampled data
        if(boot_iters_density > 0) weight_density <- NULL

        results_list$prob <-
            apply(itermat_density, 2,
                  FUN = function(ind)

                      dplyr::slice(data_density, ind) %>%
                      dplyr::ungroup() %>%
                      collapse::get_vars(indep) %>%

                      wtd_table(weight = weight_density,
                                prop = TRUE, return_tibble = TRUE) %>%

                      dplyr::rename(prob = prop)
            ) %>%
            dplyr::bind_rows(.id = "resample")

        # Turnout calculation ----
        itermat_turnout <-
            boot_mat(nrow(data_turnout), iters = boot_iters_turnout,
                     weight = weight_turnout)

        if(boot_iters_turnout > 0) weight_turnout <- NULL

        results_list$turnout <-
            apply(itermat_turnout, 2,
                  FUN = function(ind)

                      calc_turnout(dplyr::slice(data_turnout, ind),
                                   indep = indep,
                                   dv = dv_turnout, weight = weight_turnout)

            ) %>%
            dplyr::bind_rows(.id = "resample")

        # Vote choice calculation ----
        itermat_vote <-
            boot_mat(nrow(data_vote), iters = boot_iters_vote,
                     weight = weight_vote)

        if(boot_iters_vote > 0) weight_vote <- NULL

        results_list$vote <-
            apply(itermat_vote, 2,
                  FUN = function(ind)
                      calc_vote(dplyr::slice(data_vote, ind),
                                indep = indep,
                                dv = dv_vote3, weight = weight_vote)
            ) %>%
            dplyr::bind_rows(.id = "resample")


        results <-
            dplyr::full_join(results_base, results_list$prob, by = indep) %>%
            dplyr::full_join(results_list$turnout,
                             by = c("resample", indep)) %>%
            dplyr::full_join(results_list$vote, by = c("resample", indep))

        # If at least one data set not resampled
        # populate missing estimates with the original-sample results
        contains_original <- "original" %in% results$resample
        if(contains_original && !all(boot_iters == 0)){
            estim_nms <- c(prob = "density", pr_turnout = "turnout",
                           net_rep = "vote choice")
            vbdf_orig <- collapse::fsubset(results, resample == "original")

            data_orig <- stats::na.omit(unique(estim_nms[names(which(!sapply(vbdf_orig, function(x) all(is.na(x)))))]))
            estim_orig <- names(estim_nms[estim_nms == data_orig])

            warning(
                sprintf("No resampling performed for %s data.\n  Populating %s estimates assuming zero uncertainty.",
                        paste(data_orig, collapse = ", "),
                        paste(estim_orig, collapse = ", "))
            )

            # Merge original-sample estimates into resamples
            vbdf_orig <-
                dplyr::select(vbdf_orig,
                              dplyr::all_of(c(indep, estim_orig)))

            results <-
                collapse::fsubset(results, resample != "original") %>%
                dplyr::select(-dplyr::all_of(estim_orig)) %>%
                dplyr::left_join(vbdf_orig, by = indep)
        }

        # Calculate net Republican votes
        results <-
            collapse::fmutate(results,
                              resample = gsub("-0+", "-", resample),
                              net_rep = prob * pr_turnout * cond_rep
                              ) %>%
            collapse::colorderv(neworder = c("resample", indep,
                                             "prob", "pr_turnout",
                                             "pr_voterep", "pr_votedem",
                                             "cond_rep", "net_rep")) %>%
            collapse::roworderv(cols = c("resample", indep))


        out <-
            vbdf(results,
                 bloc_var = indep, var_type = "discrete")
        return(out)
    }


#' Weighted frequency table or proportions
#'
#' @param ...     vectors of class factor or character, or a list/data.frame of such vectors.
#' @param weight  optional vector of weights. The default uses uniform weights of 1.
#' @param na.rm   logical, whether to remove NA values.
#' @param prop    logical, whether to return proportions or counts. Default returns counts.
#' @param return_tibble    logical, whether to return a tibble or named vector.
#' @param normwt           logical, whether to normalize weights such that they sum to 1.
#'
#' @return a vector or tibble of counts or proportions by group
#'
#' @export

wtd_table <-
    function(...,
             weight = NULL, na.rm = FALSE,
             prop = FALSE, return_tibble = FALSE,
             normwt = FALSE){

        # Factor/character check
        if(!all( sapply(list(...), is.factor)    |
                 sapply(list(...), is.character) |
                 sapply(list(...), is.list)
        )
        ) stop("All vector inputs must be factor or character. All subsequent arguments must be fully named.")


        if(!is.null(weight)) stopifnot(is.numeric(weight))

        tabdf <- data.frame(...)
        if(normwt) weight <- weight * nrow(tabdf)/sum(weight)

        # Use weights if present, otherwise all 1
        weight_vec <- if(is.null(weight)) rep.int(1L, nrow(tabdf)) else weight

        if(na.rm){
            # Remove values where any ... is NA
            tabdf <- stats::na.omit(tabdf)
            # Remove corresponding weights
            na_ind <- unique(attr(tabdf, "na.action"))
            weight_vec <- weight_vec[- na_ind]
        }

        # Sum weights within group
        grps <- collapse::GRP(tabdf)
        out  <- collapse::fsum(weight_vec, grps)

        if(prop) out <- out / sum(out)

        if(return_tibble){
            out <- tibble::tibble(grps$groups, count = unname(out))

            if(prop) names(out)[names(out) == "count"] <- "prop"
        }

        return(out)
    }

calc_turnout <- function(data, indep, dv, weight){

    cgdf <-
        collapse::get_vars(

            collapse::fgroup_by(data, indep),

            dv
        )

    results <- collapse::fmean(cgdf, w = weight)
    out <- dplyr::rename(results, pr_turnout = {{dv}})

    return(out)
}

calc_vote <- function(data, indep, dv, weight){

    cgdf <-
        collapse::get_vars(

            collapse::fgroup_by(

                collapse::ftransform(
                    data,
                    voterep = as.numeric(get(dv) ==  1),
                    votedem = as.numeric(get(dv) == -1)
                ),

                indep),

            c("voterep", "votedem")
        )

    out <- collapse::fmean(cgdf, w = weight)
    names(out)[names(out) == "voterep"] <- "pr_voterep"
    names(out)[names(out) == "votedem"] <- "pr_votedem"

    out$cond_rep <- out$pr_voterep - out$pr_votedem

    return(out)
}