#####################################################
## report_aggregator.r
##
## tools for aggregating respondent reports, taking
## account of weights
##
#####################################################
##' aggregate a reported quantity by groups
##'
##' This function takes a quantity and aggregates it by groups,
##' using the design weights.
##'
##' @param resp.data the data
##' @param attribute.names the names of the variables that define
##' the groups for which the qoi should be aggregated
##' @param qoi the variable with quantity to aggregate
##' @param weights analysis weights; either the name of a column that has sampling weights
##' or a vector with the names of columns of the dataset that have bootstrap weights. Currently,
##' these weights must be named "boot_weight_1", "boot_weight_2", ...
##' @param qoi.name the name of the qoi
##' @param scaling.factor a factor by which weights should be multiplied before applying them. Defaults to NULL (no scaling)
##' @param dropmiss if TRUE, then drop missing values and rescale the weights to preserve their total. So, if weights
##' sum to 100, and dropping rows with missing values leads to weights that sum to 80, then the remaining rows will have
##' their weights multiplied by (100/80) to ensure the weights still add up to 100 after dropping the rows with missing values.
##' Defaults to FALSE
##' @return the aggregated reported quantities
##' @rdname report.aggregator
report.aggregator_ <- function(resp.data,
attribute.names,
qoi,
weights,
qoi.name,
scaling.factor=NULL,
dropmiss=FALSE) {
resp.data <- resp.data
wdat <- select_(resp.data, .dots=weights)
qdat <- select_(resp.data, .dots=qoi)
adat <- select_(resp.data, .dots=attribute.names)
## multiply weights by scaling factor, if one was specified
if (! is.null(scaling.factor)) {
if (is.numeric(scaling.factor)) {
# scaling factor is a number or vector, which we'll multiply each weight by
wdat <- wdat %>%
dplyr::mutate_all(~ . * scaling.factor)
} else if (is.character(scaling.factor)) {
if(! scaling.factor %in% colnames(resp.data)) {
stop(glue::glue("report.aggregator_: Can't find column '{sf}', which is supposed to have a scaling factor.",
sf = scaling.factor))
}
# scaling factor is a string, meaning it's a column name
wdat <- wdat %>%
dplyr::mutate_all(~ . * resp.data[[scaling.factor]])
}
}
df <- bind_cols(wdat, qdat, adat)
# see
# http://stackoverflow.com/questions/21208801/group-by-multiple-columns-in-dplyr-using-string-vector-input
wgt.col <- as.symbol(names(df)[1])
qoi.col <- as.symbol(names(df)[2])
weighted_sum <- function(x, w, na.rm = FALSE)
{
if(na.rm) {
idx <- (1:length(x))[!is.na(x)]
wfac <- sum(w) / sum(w[idx])
} else {
idx <- 1:length(x)
wfac <- 1
}
return(sum(x[idx]*w[idx]*wfac))
}
weighted_mean <- function (x, w, na.rm = FALSE)
{
if (na.rm) {
idx <- (1:length(x))[!is.na(x)]
}
else {
idx <- 1:length(x)
}
return(sum(x[idx] * w[idx])/sum(w[idx]))
}
## NB: this design is based on siblingsurvival::get_ind_est_from_ec
df.summ <- df %>%
group_by_at(attribute.names) %>%
dplyr::summarise_at(.vars=weights,
.funs=list(mean.qoi = ~weighted_mean(x=.data[[qoi]], w=., na.rm=dropmiss),
sum.qoi = ~weighted_sum(x=.data[[qoi]], w=., na.rm=dropmiss),
# get the sum of the weights
# NB: this includes rows w/ missingness, even if
# dropmiss=TRUE
wgt.total = ~sum(.),
wgt.inv.total = ~sum(1/.),
## this is a hack because n() generates errors due to
## plyr/dplyr import conflict (and it is hard to regulate
## import order with package infrastructure)
num.obs = ~length(.),
dropmiss = ~dropmiss))
## if we have bootstrap weights, reshape and clean things up
if(length(weights) > 1) {
## NB: assuming weight names are boot_weight_1, boot_weight_2, ...
df.summ <- df.summ %>%
tidyr::gather(starts_with('boot_weight'),
key='rawqty',
value='value') %>%
mutate(qty = stringr::str_remove(rawqty, 'boot_weight_\\d+_'),
boot_idx = as.integer(stringr::str_remove_all(rawqty, '[^\\d]'))) %>%
select(-rawqty) %>%
tidyr::spread(qty, value)
} #else {
toren <- list(~mean.qoi, ~sum.qoi, ~wgt.total, ~wgt.inv.total, ~num.obs)
newnames <- paste0(c("mean.", "sum.", "wgt.total.",
"wgt.inv.total.", "num.obs."), qoi.name)
df.summ <- dplyr::rename_(df.summ,
.dots=setNames(toren, newnames))
#}
return(df.summ)
}
#####################################################
##' @rdname report.aggregator
report.aggregator <- function(resp.data,
attribute.names=NULL,
qoi,
weights,
qoi.name=NULL,
scaling.factor=NULL,
dropmiss=FALSE) {
report.aggregator_(resp.data,
attribute.names=lazy(attribute.names, env=resp.data),
qoi=lazy(qoi, resp.data),
weights=lazy(weights, resp.data),
ifelse(is.null(qoi.name), "qoi", lazy(qoi.name)),
scaling.factor=lazy(scaling.factor, resp.data),
dropmiss)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.