#' @import dplyr
#' @export
DEFAULT_PERCENTILES <- seq(0.1, 0.9, by=0.1)
join_dfs <- function(dfs) {
if(is.null(names(dfs))) {
names(dfs) <- seq_len(length(dfs))
}
for( name in names(dfs)) {
dfs[[name]]$method <- name
dfs[[name]]$true <- as.numeric(dfs[[name]]$true)
}
joined <- do.call("rbind", dfs)
true_df <- dfs[[1]]
true_df$predicted <- true_df$raw
true_df$method <- "true"
true_df$fold <- 1
joined <- rbind(true_df, joined)
joined$fold <- factor(joined$fold)
joined
}
add_dfs <- function(joined, dfs) {
stopifnot(!is.null(names(dfs)))
for( name in names(dfs)) {
stopifnot(!(name %in% joined$method))
dfs[[name]]$method <- name
dfs[[name]]$true <- as.numeric(dfs[[name]]$true)
}
new_joined <- do.call("rbind", dfs)
rbind(joined, new_joined)
}
calculate_mse_ <- function(joined) {
joined <- dplyr::filter(joined, method != "true")
with_mse <- mutate(joined, se=(true - predicted)**2)
grouped <- group_by(with_mse, method)
summarise(grouped, mse=mean(se))
}
plot_residuals_ <- function(joined) {
joined <- dplyr::filter(joined, method != "true")
joined$residual <- joined$true - joined$predicted
ggplot2::ggplot(joined, ggplot2::aes(x=predicted, y=residual, color=method)) +
ggplot2::geom_point(alpha=0.5)
}
plot_residuals <- function(...) {
dfs <- list(...)
joined <- join_dfs(dfs)
plot_residuals_(joined)
}
plot_scatter_ <- function(joined) {
joined <- dplyr::filter(joined, method != "true")
ggplot2::ggplot(joined, ggplot2::aes(x=true, y=predicted, color=method)) +
ggplot2::geom_point(alpha=0.5)
}
plot_scatter <- function(...) {
dfs <- list(...)
joined <- join_dfs(dfs)
plot_scatter_(joined)
}
plot_density_ <- function(joined, SHOW_FOLDS=FALSE) {
p <- ggplot2::ggplot(joined, ggplot2::aes(x=predicted, color=method)) +
ggplot2::geom_density()
if (SHOW_FOLDS) {
no_true <- dplyr::filter(joined, method != "true")
p <- p + ggplot2::geom_density(no_true, mapping=ggplot2::aes(x=predicted, color=method, group=fold), alpha=0.1)
}
p
}
plot_density <- function(..., SHOW_FOLDS=FALSE) {
dfs <- list(...)
joined <- join_dfs(dfs)
plot_density_(joined, SHOW_FOLDS)
}
subtract_base <- function(df, base) {
df <- mutate(df, i=row_number())
base_df <- filter(df, method == base) %>%
ungroup() %>%
group_by(i) %>%
summarise(y=mean(y)) %>%
select(one_of("y", "i")) %>%
rename(base_y=y)
df <- merge(df, base_df, on=i)
df <- mutate(df, y=y-base_y)
df <- filter(df, method != base)
df
}
plot_cumulative <- function(df, y_label, show_cutoffs, show_folds, folded, point_count, x_label="percent_population_included", base=NULL) {
if (!is.null(base)) {
# assumes that methods are all the same size so check if they are not
method_counts <- summarise(df, count=n())$count
stopifnot(first(method_counts) == method_counts)
df <- subtract_base(df, base)
folded <- subtract_base(folded, base)
}
cut <- df %>%
slice(seq(1, n(), n() / point_count))
p <- ggplot2::ggplot(cut, ggplot2::aes(x=x, y=y, color=method)) +
ggplot2::geom_line(alpha=0.75) +
ggplot2::facet_wrap(~ threshold) +
ggplot2::labs(y = y_label) +
ggplot2::labs(x = x_label)
if ('least_squares' %in% cut$method) {
lms <- filter(ungroup(cut), method=='least_squares')
p <- p +
ggplot2::geom_line(data=lms, mapping=ggplot2::aes(x=x, y=y, color=method), size=1.75)
}
if (show_cutoffs) {
# TODO annotate plot
df <- mutate(df, absolute=quantile(raw, threshold))
threshold.df <- df %>%
filter(cumall(predicted < absolute)) %>%
filter(row_number() == n())
threshold.df[["cutoff"]] <- "consumption"
percentile.threshold.df <- df %>%
mutate(percentile_count = sum(raw < absolute)) %>%
filter(percentile_count == row_number())
percentile.threshold.df[["cutoff"]] <- "percentile"
percentile.threshold.df$percentile_count <- NULL
median.threshold.df <- df %>%
filter(cumall(predicted < median(raw))) %>%
filter(row_number() == n())
median.threshold.df[["cutoff"]] <- "median"
threshold.df <- rbind(threshold.df, percentile.threshold.df, median.threshold.df)
horizontal_mapping <- ggplot2::aes(y=y, x=0., yend=y, xend=x, color=method, linetype=cutoff)
vertical_mapping <- ggplot2::aes(y=0., x=x, yend=y, xend=x, color=method, linetype=cutoff)
p <- p +
ggplot2::geom_segment(data=threshold.df, mapping=horizontal_mapping) +
ggplot2::geom_segment(data=threshold.df, mapping=vertical_mapping)
}
if (show_folds) {
# assumes that folds are all the same size so check if they are not
fold_counts <- summarise(folded, count=n())
method_fold_stats <- summarise(fold_counts, max=max(count), min=min(count))
one_size <- method_fold_stats$max == method_fold_stats$min
stopifnot(all(one_size))
folded <- folded %>%
group_by(method, fold) %>%
mutate(i=row_number()) %>%
ungroup() %>%
group_by(method, i) %>%
summarise(ymax=max(y), ymin=min(y), x=first(x))
folded_cut <- folded %>% slice(seq(1, n(), n() / point_count))
aes_min <- ggplot2::aes(x=x, y=ymin, color=method)
aes_max <- ggplot2::aes(x=x, y=ymax, color=method)
p <- p + ggplot2::geom_line(data=folded_cut, mapping=aes_min, alpha=0.5, linetype="dashed")
p <- p + ggplot2::geom_line(data=folded_cut, mapping=aes_max, alpha=0.5, linetype="dashed")
}
p
}
plot_accuracy_ <- function(joined, BASE=NULL, THRESHOLD=DEFAULT_THRESHOLDS, SHOW_TRUE=FALSE, SHOW_CUTOFFS=FALSE, SHOW_FOLDS=FALSE, POINT_COUNT=200) {
joined <- joined[rep(seq_len(nrow(joined)), each=length(THRESHOLD)), ]
joined$threshold <- THRESHOLD
if(!SHOW_TRUE) {
joined <- dplyr::filter(joined, method!="true")
}
make_df <- function(df, folds) {
if (folds) {
grouped <- group_by(df, method, fold, threshold)
}
else {
grouped <- group_by(df, method, threshold)
}
grouped %>%
mutate(response=raw < quantile(raw, threshold)) %>%
arrange(predicted) %>%
mutate(y=cumsum(response) / n()) %>%
mutate(x=row_number() / n())
}
df <- make_df(joined, FALSE)
folded_df <- make_df(joined, TRUE)
plot_cumulative(df=df,
base=BASE,
y_label="coverage",
show_cutoffs=SHOW_CUTOFFS,
show_folds=SHOW_FOLDS,
folded=folded_df,
point_count=POINT_COUNT)
}
#' If we target N people, what fraction of the true poor would receive funds?
#' True Positives / Total Positives
#' Note that this is a reparameterization of the ROC curve
plot_accuracy <- function(..., BASE=NULL, THRESHOLD=DEFAULT_THRESHOLDS, SHOW_TRUE=FALSE, SHOW_CUTOFFS=FALSE, SHOW_FOLDS=FALSE, POINT_COUNT=200) {
dfs <- list(...)
joined <- join_dfs(dfs)
plot_accuracy_(joined, BASE, THRESHOLD, SHOW_TRUE, SHOW_CUTOFFS, SHOW_FOLDS, POINT_COUNT)
}
plot_accuracy_dollars_ <- function(joined, BASE=NULL, THRESHOLD=DEFAULT_THRESHOLDS, SHOW_TRUE=FALSE, SHOW_CUTOFFS=FALSE, SHOW_FOLDS=FALSE, POINT_COUNT=200) {
joined <- joined[rep(seq_len(nrow(joined)), each=length(THRESHOLD)), ]
joined$threshold <- THRESHOLD
if (!SHOW_TRUE) {
joined <- dplyr::filter(joined, method!="true")
}
make_df <- function(df, folds) {
if (folds) {
grouped <- group_by(df, method, fold, threshold)
}
else {
grouped <- group_by(df, method, threshold)
}
grouped %>%
mutate(response=raw < quantile(raw, threshold)) %>%
arrange(predicted) %>%
mutate(y=cumsum(response) / row_number()) %>%
mutate(x=row_number() / n())
}
df <- make_df(joined, FALSE)
folded_df <- make_df(joined, TRUE)
plot_cumulative(df=df,
base=BASE,
y_label="to_true_poor",
show_cutoffs=SHOW_CUTOFFS,
show_folds=SHOW_FOLDS,
folded=folded_df,
point_count=POINT_COUNT)
}
#' With a fixed amount of money, if we target N people, what fraction would go to the true poor?
#' True Positives / (True Positives + False Positives)
plot_accuracy_dollars <- function(..., THRESHOLD=DEFAULT_THRESHOLDS, SHOW_TRUE=FALSE, SHOW_CUTOFFS=FALSE, SHOW_FOLDS=FALSE, POINT_COUNT=200) {
dfs <- list(...)
joined <- join_dfs(dfs)
plot_accuracy_dollars_(joined, THRESHOLD, SHOW_TRUE, SHOW_CUTOFFS, SHOW_FOLDS, POINT_COUNT)
}
calculate_swf_ <- function(joined, fold=FALSE, GAMMA=2, base=NULL) {
marginal_utility <- function(log_consumption) exp(log_consumption) ^ (- GAMMA)
joined$marginal_utility <- sapply(joined$raw, marginal_utility)
if (fold) {
grouped <- group_by(joined, method, fold)
}
else {
grouped <- group_by(joined, method)
}
grouped %>%
arrange(predicted) %>%
mutate(ideal_sw=cumsum(sort(marginal_utility, decreasing=TRUE))) %>%
mutate(y=cumsum(marginal_utility) / ideal_sw)%>%
mutate(x=row_number() / n())
}
plot_swf_ <- function(joined, BASE=NULL, GAMMA=2, SHOW_FOLDS=FALSE, POINT_COUNT=200) {
joined <- dplyr::filter(joined, method!="true")
joined$threshold <- ""
df <- calculate_swf_(joined, FALSE, GAMMA=GAMMA)
folded_df <- calculate_swf_(joined, TRUE, GAMMA=GAMMA)
plot_cumulative(df=df,
base=BASE,
y_label="welfare",
show_cutoffs=FALSE,
show_folds=SHOW_FOLDS,
folded=folded_df,
point_count=POINT_COUNT)
}
plot_swf <- function(..., BASE=NULL, GAMMA=2, SHOW_FOLDS=FALSE, POINT_COUNT=200) {
dfs <- list(...)
joined <- join_dfs(dfs)
plot_swf_(joined, BASE, GAMMA, SHOW_FOLDS, POINT_COUNT)
}
calculate_reach_vs_waste_ <- function(df, folds) {
if (folds){
grouped <- group_by(df, method, fold, threshold)
}
else {
grouped <- group_by(df, method, threshold)
}
wtd.quantile <- Hmisc::wtd.quantile
grouped %>%
mutate(response1=weight*as.numeric(raw < wtd.quantile(raw, weights=weight, probs=threshold))) %>%
mutate(response2=weight*as.numeric(raw >= wtd.quantile(raw, weights=weight, probs=threshold))) %>%
arrange(predicted) %>%
mutate(y=cumsum(response1) / sum(weight)) %>%
mutate(x=cumsum(response2) / sum(weight)) %>%
mutate(percent_pop_included=cumsum(weight) / sum(weight))
}
calculate_reach_ <- function(joined, fold=FALSE, poverty_threshold=.4, target_threshold=.4, base=NULL) {
joined <- mutate(joined, threshold=poverty_threshold)
rvw <- calculate_reach_vs_waste_(joined, folds=fold)
folds <- unique(rvw$fold)
true <- filter(rvw, method == 'true')
true <- true[rep(1:nrow(true), times=length(folds)), ]
true$fold <- folds
rvw <- filter(rvw, method != 'true')
rvw <- rbind(rvw, true)
rvw <- ungroup(rvw)
reach_df <- rvw %>%
filter(percent_pop_included <= target_threshold)
if (fold){
reach_df <- group_by(reach_df, method, fold, threshold)
}
else {
reach_df <- group_by(reach_df, method, threshold)
}
reach_df <- reach_df %>%
mutate(reach=y) %>%
arrange(desc(percent_pop_included))
if (!is.null(base)) {
base_reach <- reach_df %>%
filter(method == base) %>%
group_by(fold) %>%
summarise(reach=first(reach))
reach_df <- merge(reach_df, base_reach, by='fold') %>%
mutate(reach=reach.x - reach.y) %>%
filter(method != base) %>%
group_by(method, fold)
}
reach_df <- reach_df %>%
arrange(desc(percent_pop_included)) %>%
summarise(reach=first(reach))
select(reach_df, -one_of('threshold'))
}
calculate_budget_reduction_ <- function(joined, base='least_squares', poverty_threshold=.4, target_threshold=.4) {
stopifnot(base %in% joined$method)
base_reach <- calculate_reach_(
filter(joined, method==base),
fold=FALSE,
poverty_threshold=poverty_threshold,
target_threshold=target_threshold)$reach
joined <- mutate(joined, threshold=poverty_threshold)
rvw <- calculate_reach_vs_waste_(joined, folds=FALSE)
rvw %>%
filter(y <= base_reach) %>%
arrange(desc(y), percent_pop_included) %>%
summarize(reach=first(y), percent_pop_included=first(percent_pop_included))
}
calculate_budget_reduction_swf_ <- function(joined, base='least_squares', target_threshold=.4, GAMMA=2) {
stopifnot(base %in% joined$method)
sw <- calculate_swf_(joined,
fold=FALSE,
GAMMA=GAMMA)
sw <- sw %>%
arrange(x) %>%
mutate(sw=cumsum(marginal_utility))
base_sw <- sw %>%
filter(method==base) %>%
arrange(desc(x)) %>%
filter(x <= target_threshold)
base_sw <- base_sw$sw %>% first()
sw %>%
filter(sw <= base_sw) %>%
arrange(desc(sw), x) %>%
summarise(sw=first(sw), percent_pop_included=first(x))
}
calculate_budget_to_true_poor_ <- function(joined, fold=FALSE, poverty_threshold=.4, target_threshold=.4, base=NULL) {
joined <- mutate(joined, threshold=poverty_threshold)
if (fold){
grouped <- group_by(joined, method, fold, threshold)
}
else {
grouped <- group_by(joined, method, threshold)
}
bttp <- grouped %>%
mutate(response=raw < quantile(raw, threshold)) %>%
arrange(predicted) %>%
mutate(y=cumsum(response) / row_number()) %>%
mutate(x=row_number() / n())
folds <- unique(bttp$fold)
true <- filter(bttp, method == 'true')
true <- true[rep(1:nrow(true), times=length(folds)), ]
true$fold <- folds
bttp <- filter(bttp, method != 'true')
bttp <- rbind(bttp , true)
bttp_df <- bttp %>%
filter(x < target_threshold) %>%
arrange(desc(x))
if (!is.null(base)) {
base_bttp <- bttp_df %>%
filter(method == base) %>%
group_by(fold) %>%
summarise(y=first(y))
bttp_df <- merge(bttp_df, base_bttp, by='fold') %>%
mutate(y=(y.x - y.y)/y.y) %>%
filter(method != base) %>%
group_by(method, fold)
}
bttp_df <- bttp_df %>%
arrange(desc(x)) %>%
summarise(y=first(y))
select(bttp_df, -one_of('threshold'))
}
plot_reach_vs_waste_ <- function(joined, THRESHOLD=DEFAULT_THRESHOLDS, SHOW_CUTOFFS = FALSE, SHOW_FOLDS=FALSE, POINT_COUNT=200) {
joined <- joined[rep(seq_len(nrow(joined)), each=length(THRESHOLD)), ]
joined$threshold <- THRESHOLD
joined <- dplyr::filter(joined, method!="true")
df <- calculate_reach_vs_waste_(joined, FALSE)
folded_df <- calculate_reach_vs_waste_(joined, TRUE)
plot_cumulative(df=df,
base=NULL,
x_label="number of rich targeted / N",
y_label="number of poor targeted / N",
show_cutoffs=SHOW_CUTOFFS,
show_folds=SHOW_FOLDS,
folded=folded_df,
point_count=POINT_COUNT)
}
plot_reach_vs_waste <- function(..., THRESHOLD=DEFAULT_THRESHOLDS, SHOW_CUTOFFS = FALSE, SHOW_FOLDS=FALSE, POINT_COUNT=200) {
dfs <- list(...)
joined <- join_dfs(dfs)
plot_reach_vs_waste_(joined, THRESHOLD, SHOW_CUTOFFS, SHOW_FOLDS, POINT_COUNT)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.