knitr::opts_chunk$set(fig.width=12, fig.height=8, fig.path='Figs/', warning=FALSE, message=FALSE) knitr::opts_knit$set(root.dir="../") options(width = 250)
myround <- function(x, digits=1) { if(digits < 1) stop("This is intended for the case digits >= 1.") if(length(digits) > 1) { digits <- digits[1] warning("Using only digits[1]") } tmp <- sprintf(paste("%.", digits, "f", sep=""), x) # deal with "-0.00" case zero <- paste0("0.", paste(rep("0", digits), collapse="")) tmp[tmp == paste0("-", zero)] <- zero tmp } cap <- function(x) { s <- strsplit(x, " ")[[1]] paste(toupper(substring(s, 1,1)), substring(s, 2), sep="", collapse=" ") }
library(MLlibrary) library(dplyr) library(purrr) THRESHOLD <- 0.4 all_names <- c('niger_pastoral', 'niger_agricultural', 'tanzania_2008', 'tanzania_2010', 'tanzania_2012', 'ghana_pe', 'mexico', 'south_africa_w1', 'south_africa_w2', 'south_africa_w3', 'iraq', 'brazil') countries <- strsplit(all_names, '_') %>% map(first) %>% unique() %>% map(cap) pmt_names <- c('niger_pastoral_pmt', 'niger_agricultural_pmt', 'ghana')
table_stats <- function(tables) { lapply(names(tables), function(name) { df <- tables[[name]] value_name <- colnames(df)[[2]] df$dataset <- name reshape::cast(df, dataset ~ method, value=value_name) }) }
ds_stats <- lapply(c(all_names, pmt_names), function(name) { df <- load_dataset(name) row_count <- nrow(df) col_count <- ncol(df) data.frame(dataset=name, N=row_count, K=col_count) }) ds_stats <- bind_rows(ds_stats)
get_reaches <- function(ds_names) { reaches <- lapply(ds_names, function(name) { output <- load_validation_models(name) reach_by_pct_targeted(output, threshold=THRESHOLD) }) names(reaches) <- ds_names reaches } get_reach_table <- function(reaches) { tables <- lapply(reaches, table_stat) combine_tables(tables) %>% select(dataset, N, K, ols, enet, ensemble, forest, opf) %>% rename(ols_plus_forest=opf) } get_budget_table <- function(reaches) { tables <- lapply(reaches, budget_change) combine_tables(tables) } combine_tables <- function(tables) { table_stats(tables) %>% bind_rows() %>% merge(ds_stats, by='dataset') %>% select(dataset, N, K, ols, everything()) %>% arrange(N) } difference_table <- function(reaches) { reach_table <- get_reach_table(reaches) reach_differences <- reach_table %>% mutate(reach_improvement=ensemble-ols) %>% mutate(relative_reach_improvement=(ensemble-ols)/ols) %>% select(N, K, dataset, reach_improvement, relative_reach_improvement) budget_table <- get_budget_table(reaches) %>% mutate(budget_reduction=-1 * ensemble) %>% select(dataset, budget_reduction) merge(reach_differences, budget_table, by='dataset') %>% arrange(N) } reaches <- get_reaches(all_names) reacht <- get_reach_table(reaches) difft <- difference_table(reaches)
r length(countries)
countries: r paste(countries, collapse=', ')
.r 100 * THRESHOLD
% of consumption, when targeting r 100 * THRESHOLD
% of the total population, the median difference between OLS and our top-performing method is median(reacht$ensemble - reacht$ols)
percentage points Table 1.
OLS outperforms random forests, our top-performing nonlinear method, with a median difference of median(reacht$ensemble - reacht$ols)
percentage points.r myround(100 * median(difft$relative_reach_improvement), 1)
% increase in reach. This allows for a r myround(100 * median(difft$budget_reduction), 1)
% budget saving for an equivalent reach. Table 2 print(reacht, digits=4)
print(difft, digits=4)
pmt_reaches <- get_reaches(pmt_names) pmt_table <- difference_table(pmt_reaches) print(pmt_table, digits=4)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.