clbes: British Election Study Survey Tools

# Constants ------------------------------------------------------------------

BES_FTF_URL <- "http://www.britishelectionstudy.com/wp-content/uploads/2018/01/bes_f2f_2017_v1.2.sav"

# Functions ------------------------------------------------------------------

#' Calculate estimated election turnout by demographic characteristic
#'
#' Calculates estimated election turnout by demographic characteristic for
#' British Election Study survey data.
#'
#' @param data A dataframe containing
#'   \href{http://www.britishelectionstudy.com/}{British Election Study}
#'   internet or face to face survey data.
#' @param dgraphic The name of the variable for the demographic characteristic
#'   of interest. Must be defined between quotation marks.
#' @param vote The name of the variable which captures response from suvey
#'   respondent to the question if they voted or not in the election of
#'   interest. Must be defined between quotation marks.
#' @param wt The name of the weight variable to be used in analysis. Must
#'   be defined between quotation marks.
#' @param validated Use when survey respondent's vote registeration has not
#'   been validated by British Election Study. A boolean which if FALSE
#'   filters \code{data} to show those survey respondents who are over the age
#'   of 18 and say they are registered to vote. Removes 'Dont Know' responses.
#'   If FALSE \code{result} must be defined. Default value is TRUE.
#' @param result Use when \code{validated} is FALSE. A numeric between 0 and 1
#'   for the known turnout percentage for the election in Great Britain.
#'   Default value is 0.
#' @param sample A boolean which if TRUE returns the unweighted sample size.
#'   FALSE returns weighted sample size. Default value is FALSE.
#'   If \code {validated} is FALSE then \code{result} must be default.
#'   \code{percent} must be FALSE to show sample sizes.
#' @param percent A boolean which if FALSE does not calculate turnout
#'   percentages. Default value is TRUE.
#' @param write A boolean which if TRUE creates a CSV in working directory.
#'   Default value is FALSE.
#' @return A tibble or CSV containing turnout estimates for selected
#'   demographic.
#' @export
#'


turnout_query <-  function(data, dgraphic, vote, wt,
                           validated = TRUE, result = 0,
                           sample = FALSE, percent = TRUE,
                           write = FALSE) {


    # Create dataframe variables in function environment
    data$dgraphic <- as.character(haven::as_factor(data[[dgraphic]]))
    data$vote <- as.character(haven::as_factor(data[[vote]]))
    data$wt <- data[[wt]]


    # Filter respondents aged over and registered
    if (validated == FALSE) {
      data <- dplyr::filter(data, ageGroup > 1)
      data <- dplyr::filter(data, registered == 1 | registered == 2)
      data <- dplyr::filter(data, genElecTurnoutRetro < 3)
    }


    # Weight estimates
    if (sample == FALSE) {
      estimate <- data %>%
                  dplyr::group_by(dgraphic, vote) %>%
                  dplyr::summarise(popest = sum(wt, na.rm = TRUE))
    } else {
      estimate <- data %>%
                  dplyr::group_by(dgraphic, vote) %>%
                  dplyr::summarise(popest = n())
    }


    # Round estimate figure
    estimate$popest <- round(estimate$popest, 0)


    # Create tibble in wide form and remove NA column
    estimate <- estimate %>%
                tidyr::spread(vote, popest, fill = 0) %>%
                na.omit(estimate)


    # Add column total for adjustment calculation
    if (validated == FALSE) {
      estimate <- cltools::add_col_totals(estimate)
    }


    # Calculate turnout percentages
    if (percent == TRUE) {
      estimate <- cltools::get_row_percent(estimate, na.rm = TRUE)
    }


    # Adjustment to unvalidated turnout estimates
    if (result != 0) {
      adjust <- result / estimate$`Yes, voted`[length(estimate$`Yes, voted`)]
      estimate$`Yes, voted` <- estimate$`Yes, voted` * adjust
      estimate <- dplyr::select(estimate, dgraphic, `Yes, voted`)
    } else {
      estimate <- dplyr::select(estimate, dgraphic, dplyr::contains("voted"))
    }


    # Write CSV file
    if (write == TRUE) {
      readr::write_csv(estimate, paste("turnout_", dgraphic, ".csv"))
    } else {
      estimate
    }

}


#' Calculate estimated election turnout by demographic characteristic
#'
#' Calculates estimated election turnout by commonly asked demographic
#' characteristic for validated British Election Study 2017 face to face
#' survey data only.
#'
#' @param dgraphic The demographic characteristic of interest. Must be
#'   defined as one of the following values: age, gender, ethnicity,
#'   religion, education. Must be defined between quotation marks.
#' @param sample A boolean which if TRUE returns the unweighted sample size.
#'   FALSE returns weighted sample size. Default value is FALSE.
#'   \code{percent} must be FALSE to show sample sizes.
#' @inheritParams turnout_query
#' @return A tibble or CSV containing turnout estimates for selected
#'   demographic.
#' @export
#'


turnout_common <- function(dgraphic = c("age", "gender",
                           "ethnicity", "religion", "education"),
                           sample = FALSE, percent = TRUE,
                           write = FALSE) {


    # Download BES 2017 face to face survey
    data <- haven::read_spss(BES_FTF_URL)


    # Create and recode dataframe variables in function environment
    if (dgraphic == "age") {
      data$dgraphic <- as.character(haven::as_factor(data$y10_banded))
    }


    if (dgraphic == "gender") {
      data$dgraphic <- as.character(haven::as_factor(data$y09))
    }


    if (dgraphic == "ethnicity") {
      data$dgraphic <- dplyr::recode(as.character(data$y11), '-999' = -999,
                                     '-2' = -2, '1' = 1, '2' = 1, '3' = 1,
                                     '4' = 1, '5' = 2, '6' = 2, '7' = 2,
                                     '8' = 2,'9'= 2, '10' = 2, '11' = 2,
                                     '12' = 2, '13' = 2,'14' = 2, '15' = 2,
                                     '16' = 2, '17' = 2, '18' = 2, '19' = 1)
      data$dgraphic <- factor(data$dgraphic,
                              levels = c(1, 2, -2, -999),
                              labels = c("White", "BAME",
                                         "Refused", "Not Stated"))
      data$dgraphic <- as.character(haven::as_factor(data$dgraphic))
    }


    if (dgraphic == "tenure") {
      data$dgraphic <- dplyr::recode(as.character(data$y03),
                                     '-999' = -999, '-1' = -1, '1' = 1,
                                     '2' = 2, '3' = 3, '4' = 4, '5' = 3)
      data$dgraphic <- factor(data$dgraphic,
                              levels = c(1, 2, 3, 4, -1, -999),
                              labels = c("Own ouright", "Mortgage",
                                         "Social rent", "Private rent",
                                         "Don't Know", "Not stated"))
      data$dgraphic <- as.character(haven::as_factor(data$dgraphic))
    }


    if (dgraphic == "religion") {
      data$dgraphic <- dplyr::recode(as.character(data$y06), '0' = 0, '1' = 1,
                                     '2' = 1, '3' = 1,
                                     '4' = 1, '5' = 1, '6' = 1, '7' = 1,
                                     '8' = 1, '9'= 1, '10' = 1, '11' = 1,
                                     '12' = 2, '13' = 2, '14' = 2, '15' = 2,
                                     '16' = 2, '17' = 2, '18' = 1, '19' = 1,
                                     '-999' = 999, '-2' = -2)
      data$dgraphic <- factor(data$dgraphic,
                              levels = c(0, 1, 2, -2, -999),
                              labels = c("No Religion", "Christian",
                                         "Non-Christian", "Refused",
                                         "Not Stated"))
      data$dgraphic <- as.character(haven::as_factor(data$dgraphic))
    }


    if (dgraphic == "education") {
      data$dgraphic <- dplyr::recode(as.character(data$edlevel), '0' = 0,
                                     '1' = 1, '2' = 2, '3' = 3, '4' = 4,
                                     '5' = 4)
      data$dgraphic <- factor(data$dgraphic,
                              levels = c(0, 1, 2, 3, 4),
                              labels = c("No qualifications",
                                         "Below GCSE",
                                         "GCSE", "A-Level",
                                         "Degree or higher"))
      data$dgraphic <- as.character(haven::as_factor(data$dgraphic))
    }


    # Create validated vote variable
    data$vote <- as.character(haven::as_factor(data$validatedTurnoutBinary))


    # Weight estimates
    if (sample == FALSE) {
      estimate <- data %>%
                  dplyr::group_by(dgraphic, vote) %>%
                  dplyr::summarise(popest = sum(wt_vote_valid, na.rm = TRUE))
    } else {
      estimate <- data %>%
                  dplyr::group_by(dgraphic, vote) %>%
                  dplyr::summarise(popest = n())
    }


    # Round estimate figure
    estimate$popest <- round(estimate$popest, 0)


    # Create tibble in wide form and remove NA column
    estimate <- estimate %>%
                tidyr::spread(vote, popest, fill = 0) %>%
                na.omit(estimate)


    # Calculate turnout percentages
    if (percent == TRUE) {
      estimate <- cltools::get_row_percent(estimate, na.rm = TRUE)
    } else {
      estimate
    }


    # Remove hanging "NA" column
    estimate <- dplyr::select(estimate, dgraphic, Voted, -"<NA>")


    # Write CSV file
    if (write == FALSE) {
      estimate
    } else {
      readr::write_csv(estimate, paste("BES_2017_FTF_turnout_",
                                       dgraphic, ".csv"))
    }

}