#' Use bootstrap resampling of survey to find best weighting indicators
#'
#' @param inputfile A file containing the input survey data (a survey data
#' frame, such as that created by \code{simulate_survey}). Either a path to a
#' file, a connection, or literal data (either a single string or a raw vector).
#' Files starting with \code{http://}, \code{https://}, \code{ftp://}, or
#' \code{ftps://} will be automatically downloaded, and zipped files wll be
#' uncompressed.
#' @param outputpath Path to write a csv file of the output weighted results.
#' The new file will contain the original survey data with 2 columns added,
#' \code{weight_best}, the post-stratification weight for each row in the
#' survey using the best indicators, and \code{weight_all}, the
#' post-stratification weight for each row using all the indicators available.
#' Default will write a csv to the working directory.
#' @param n Number of bootstrap resamplings to generate in order to find the
#' best weighting indicators
#' @param response A column in \code{mysurvey} that contains the quantity to be
#' weighted, such as the response to a yes/no question as in
#' \code{simulate_survey}
#' @param response_col Response column as string
#' @param ... Weighting indicator(s) to be tested for post-stratification.
#' One or more of \code{sex}, \code{raceethnicity}, \code{age}, and/or
#' \code{education} which must be columns in the survey data frame. Both
#' \code{age} and \code{education} cannot be used for post-stratification at the
#' same time because of how the ACS tables are organized.
#' @param dots List of weighting indicator(s) as string(s)
#'
#' @return A list of the best indicators. This function also saves a csv file
#' of the output weighted results
#'
#' @details \code{choose_best_weighting} is given bare names while
#' \code{choose_best_weighting_} is given strings and is therefore suitable for
#' programming with. One column of the survey data in \code{inputfile} must be
#' \code{geography}, to indicate what ACS data to use for post-stratification
#' weighting.
#'
#' @import dplyr
#'
#' @name choose_best_weighting
#'
#' @examples
#'
#' tmp <- tempfile()
#' choose_best_weighting(system.file("extdata/examplesurvey.csv", package = "WWC"),
#' tmp, 5, response, sex, raceethnicity)
#'
#' @export
choose_best_weighting <- function(inputfile, outputpath, n, response, ...) {
# NSE magic
dots <- eval(substitute(alist(...)))
dots <- purrr::map(dots, col_name)
response_col <- col_name(substitute(response))
choose_best_weighting_(inputfile, outputpath, n, response_col, dots)
}
#' @rdname choose_best_weighting
#' @export
choose_best_weighting_ <- function(inputfile, outputpath = "./wwc_weighted.csv",
n, response_col, dots) {
.debug <- F
mysurvey <- readr::read_csv(inputfile)
# error handling for weighting indicator
force_edu <- FALSE
test_indicators(dots, force_edu)
###### added by GM to debug 'contrasts can be applied only to factors with 2 ...' -------
#
# Restore leading 0s for geography
#
geog <- as.character(mysurvey$geography)
nc <- nchar(geog)
if(any(bad <- !(nc %in% c(2,4,5)))) {
print(geog[bad])
stop("Bad number of characters in geography\n")
}
geog <- ifelse(nchar(geog)==4, paste0('0',geog), geog)
mysurvey$geography <- geog
#
# Temporary fix to problem with categorical responses
#
resp <- mysurvey[[response_col]]
if(length( unique(resp)) == 1) stop('response is a constant')
if(!is.numeric(resp)) {
resp <- as.numeric(as.factor(resp))
}
#
# use mean imputation for NAs
#
resp[is.na(resp)] <- mean(resp, na.rm = T)
mysurvey[[response_col]] <- resp
###### end of code added by GM
# exclude rows/observations/respondents who have NA for geography
mysurvey <- mysurvey[!is.na(mysurvey$geography),]
all_weights <- weight_wwc_(mysurvey, dots, force_edu = FALSE) %>%
rename(weight_all = weight)
weight_and_process <- function(dots, mysurvey, n, response_col) {
boot <- modelr::bootstrap(mysurvey, n)
boot <- purrr::map(boot$strap, weight_wwc_, dots)
ret <- purrr::map(boot, summarize_survey_, response_col)
ret <- purrr::map_df(ret, bind_rows)
ret <- ret %>%
filter(result == "Weighted") %>%
group_by(result, answer) %>%
summarise(mean = mean(value),
stddev = stats::sd(value)) %>%
ungroup
ret
}
if (length(dots) == 1) {
ret <- weight_wwc_(mysurvey, dots, force_edu = FALSE)
best_dots <- dots
} else if (length(dots) == 2) {
list_of_dots <- c(dots,
list(purrr::flatten_chr(dots)))
results <- data_frame(dots = list_of_dots) %>%
mutate(results = purrr::map(list_of_dots,
weight_and_process,
mysurvey, n, response_col)) %>%
tidyr::unnest(results, .drop = FALSE)
dots <- results %>%
top_n(-1, stddev)
best_dots <- dots$dots %>%
purrr::transpose() %>%
purrr::map(purrr::flatten_chr)
########## added by GM to fix dot list problem ------------
best_dots <- unique(unlist(best_dots))
if(.debug) browser()
########## end of code added by GM
ret <- weight_wwc_(mysurvey, best_dots, force_edu = FALSE)
} else if (length(dots) == 3) {
list_of_dots <- c(dots,
utils::combn(dots, 2, FUN = as.character,
simplify = FALSE),
list(purrr::flatten_chr(dots)))
results <- data_frame(dots = list_of_dots) %>%
mutate(results = purrr::map(list_of_dots,
weight_and_process,
mysurvey, n, response_col)) %>%
tidyr::unnest(results, .drop = FALSE)
dots <- results %>%
top_n(-1, stddev) %>%
sample_n(1)
best_dots <- dots$dots %>%
purrr::transpose() %>%
purrr::map(purrr::flatten_chr)
########## added by GM to fix dot list problem ------------
best_dots <- unique(unlist(best_dots))
if(.debug) browser()
########## end of code added by GM
ret <- weight_wwc_(mysurvey, best_dots, force_edu = FALSE)
} else {
stop("indicators can only include three of sex, raceethnicity, age, and education")
}
ret <- ret %>%
rename(weight_best = weight) %>%
inner_join(all_weights)
readr::write_csv(ret, outputpath)
message("The set of indicators best suited to this survey is")
message(cat(unlist(best_dots), sep = " "))
return(best_dots)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.