R/input_wrapper.R

Defines functions dop_transform dop_irv

Documented in dop_irv dop_transform

#' Get full distribution of preferences in each instant runoff voting round as percentage
#'
#' @description
#' Compute the preference in each round of instant runoff voting from input data, 
#' transforming the results into a tidy format for visualization. Each row represents
#' one round, with columns for each candidate's preference percentage and the election winner.
#'
#' @param x Input data. Accepts the same formats as `prefio::pref_irv()`:
#'   * A preference vector where each element represents one ballot
#'   * A data frame with a column for preference
#' @param value_type Character string specifying the output format. Either:
#'   * `"percentage"` (default): Returns vote shares as proportions (0-1)
#'   * `"count"`: Returns raw vote counts
#' @param ... Additional arguments passed to `prefio::pref_irv()`, including:
#'   * `preferences_col`: Column name containing preference orderings
#'   * `frequency_col`: Column name containing vote frequencies
#'
#' @return A tibble with the following structure:
#'   * `round`: Integer, the round number (1 to n)
#'   * One column per candidate: Numeric, the percentage of votes (0-1) that 
#'     candidate received in that round. NA values are replaced with 0 for 
#'     eliminated candidates.
#'   * `winner`: Character, the name of the eventual IRV winner (same for all rows)
#'
#' @examples
#' # Example 1: From preference vector
#' votes <- prefio::preferences(c("A > B > C", "B > A > C", "C > B > A", "A > B > C"))
#' dop_irv(votes, value_type = "count")
#' 
#' # Example 2: From data frame with custom column names
#' vote_data <- tibble::tibble(
#'   prefs = prefio::preferences(c("A > B > C", "B > C > A", "C > A > B")),
#'   counts = c(100, 75, 25)
#' )
#' dop_irv(vote_data, value_type = "percentage",
#'         preferences_col = prefs,
#'         frequency_col = counts)
#'
#' @export
dop_irv <- function(x, value_type = c("percentage", "count"), ...) {
  
  value_type <- match.arg(value_type)
  
  # Implement IRV to get round results
  irv_result <- prefio::pref_irv(x, ...)
  
  percent_df <- tibble::tibble()
  
  # Process each round
  for (i in seq_along(irv_result$rounds)) {
    
    if (value_type == "percentage") {
      # Convert to percentages
      round_pref <- irv_result$rounds[[i]] |>
        dplyr::mutate(
          pref_value = value / sum(value),
          round = i
        ) |>
        dplyr::select(-value) |>
        tidyr::pivot_wider(
          names_from = candidate, 
          values_from = pref_value
        )
    } else {
      # Keep as counts
      round_pref <- irv_result$rounds[[i]] |>
        dplyr::mutate(round = i) |>
        tidyr::pivot_wider(
          names_from = candidate, 
          values_from = value
        )
    }
  
    percent_df <- dplyr::bind_rows(percent_df, round_pref) |>
      dplyr::mutate(dplyr::across(dplyr::everything(), ~tidyr::replace_na(.x, 0)))
  }

  # Add the final winner
  percent_df <- percent_df |>
    dplyr::mutate(winner = irv_result$winner)
  
  return(percent_df)
}

#' Transform AEC distribution of preferences from long to wide format
#' 
#' @description
#' Transform AEC distribution of preferences from long to wide format, with optional scaling and normalization.
#' This function is useful for converting all distribution of preference data with similar format
#' into format ready for ternary plots. 
#' 
#' @param data A data frame containing preference or vote distribution data, with format similar to
#'  \href{https://results.aec.gov.au/27966/Website/HouseDownloadsMenu-27966-Csv.htm}{AEC Distribution of Preferences 2022}
#' @param key_cols Columns that identify unique observations, e.g., DivisionNm, CountNumber
#' @param value_col Numeric and non-negative. Column containing the numeric values to aggregate, 
#'  e.g., CalculationValue, Votes. 
#' @param item_col Column name containing the items (candidates/parties) of the election, 
#'  e.g., Party, Candidate. This column will become column names in the output wide format.
#' @param normalize Logical. If \code{TRUE} (default), normalizes values within
#'   each group to sum to 1. If \code{FALSE}, returns raw aggregated values.
#' @param scale Numeric. If \code{normalize = FALSE}, divides all values by this
#'   scale factor. Default is 1 (no scaling).
#' @param fill_value Numeric. Value to use for missing combinations after
#'   pivoting. Default is 0.
#' @param winner_col Optional character string specifying a column that indicates
#'   the winner/elected party. If provided, this column will be joined back to
#'   the output based on key columns. Useful for preserving election outcome
#'   information. Default is \code{NULL}.
#' @param winner_identifier Optional character string specifying the value in
#'   \code{winner_col} that identifies winning candidates (e.g., "Y", "Elected").
#'   Only used if \code{winner_col} is specified. Default is "Y".
#' 
#' @return A data frame in wide format with:
#'   \itemize{
#'     \item Key columns identifying each observation
#'     \item Columns for each item (candidate/party) containing aggregated/normalized values
#'     \item Winner column (if \code{winner_col} was specified)
#'   }
#' 
#' @examples
#' library(dplyr)
#' # Convert AEC 2025 Distribution of Preference data to wide format
#' data(aecdop_2025)
#' 
#' # We are interested in the preferences of Labor, Coalition, Greens and Independent. 
#' # The rest of the parties are aggregated as Other.
#' aecdop_2025 <- aecdop_2025 |>
#'  filter(CalculationType == "Preference Percent") |> 
#'  mutate(Party = case_when(
#'     !(PartyAb %in% c("LP", "ALP", "NP", "LNP", "LNQ")) ~ "Other",
#'     PartyAb %in% c("LP", "NP", "LNP", "LNQ") ~ "LNP",
#'    TRUE ~ PartyAb))
#' 
#' dop_transform(
#'   data = aecdop_2025,
#'   key_cols = c(DivisionNm, CountNumber),
#'   value_col = CalculationValue,
#'   item_col = Party,
#'   winner_col = Elected
#' )
#' @export

dop_transform <- function(data,
                          key_cols,
                          value_col,
                          item_col,
                          normalize = TRUE,
                          scale = 1,
                          fill_value = 0,
                          winner_col = NULL,
                          winner_identifier = "Y") {
  
  # Input validation
  stopifnot(is.data.frame(data))
  
  # Capture and convert column selections
  key_cols_sel <- tidyselect::eval_select(
    rlang::enquo(key_cols), 
    data
  )
  key_cols_chr <- names(key_cols_sel)
  
  value_col_chr <- rlang::as_label(rlang::ensym(value_col))
  item_col_chr <- rlang::as_label(rlang::ensym(item_col))
  
  if (!is.null(rlang::enexpr(winner_col))) {
    winner_col_chr <- rlang::as_label(rlang::ensym(winner_col))
  } else {
    winner_col_chr <- NULL
  }
  
  # Check if columns exist
  required_cols <- c(key_cols_chr, value_col_chr, item_col_chr)
  if (!is.null(winner_col_chr)) {
    required_cols <- c(required_cols, winner_col_chr)
  }
  
  missing_cols <- setdiff(required_cols, names(data))
  if (length(missing_cols) > 0) {
    stop("Columns not found in data: ", paste(missing_cols, collapse = ", "))
  }
  
  # Check if value column is numeric
  if (!is.numeric(data[[value_col_chr]])) {
    stop("'value_col' (", value_col_chr, ") must be numeric")
  }
  
  # Check for negative values
  if (any(data[[value_col_chr]] < 0, na.rm = TRUE)) {
    stop("'value_col' contains negative values. Compositional data must be non-negative.")
  }
  
  # Aggregate data
  group_cols <- c(key_cols_chr, item_col_chr)
  df_agg <- data |>
    dplyr::group_by(dplyr::across(dplyr::all_of(group_cols))) |>
    dplyr::summarise(
      aggregated_value = sum(.data[[value_col_chr]], na.rm = TRUE),
      .groups = "drop"
    )
  
  # Pivot wider
  df_wide <- df_agg |>
    tidyr::pivot_wider(
      id_cols = dplyr::all_of(key_cols_chr),
      names_from = dplyr::all_of(item_col_chr),
      values_from = aggregated_value,
      values_fill = fill_value
    )
  
  # Get item column names
  item_names <- setdiff(names(df_wide), key_cols_chr)
  
  # Apply normalization or scaling
  if (normalize) {
    df_wide <- df_wide |>
      dplyr::mutate(
        row_total = rowSums(dplyr::across(dplyr::all_of(item_names)), na.rm = TRUE),
        dplyr::across(
          dplyr::all_of(item_names),
          ~ .x / row_total
        )
      ) |>
      dplyr::select(-row_total)
    
    # Handle division by zero
    df_wide <- df_wide |>
      dplyr::mutate(
        dplyr::across(
          dplyr::all_of(item_names),
          ~ if_else(is.nan(.x) | is.infinite(.x), 0, .x)
        )
      )
    
  } else if (scale != 1) {
    df_wide <- df_wide |>
      dplyr::mutate(
        dplyr::across(dplyr::all_of(item_names), ~ .x / scale)
      )
  }
  
  # Join winner information if requested
  if (!is.null(winner_col_chr)) {
    winner_data <- data |>
      dplyr::filter(.data[[winner_col_chr]] == winner_identifier) |>
      dplyr::select(dplyr::all_of(c(key_cols_chr, item_col_chr))) |>
      dplyr::distinct() |>
      dplyr::rename(Winner = dplyr::all_of(item_col_chr))
    
    df_wide <- df_wide |>
      dplyr::left_join(winner_data, by = key_cols_chr)
  }
  
  return(df_wide)
}

Try the prefviz package in your browser

Any scripts or data that you put into this service are public.

prefviz documentation built on April 13, 2026, 5:07 p.m.