library(MLlibrary)
library(dplyr)
library(purrr)
library(foreign)
library(readstata13)
library(ggplot2)
library(rpart)
library(RColorBrewer)
library(reshape2)
library(rpart.plot)
library(gridExtra)
library(xlsx)
library(scales)
library(kknn)
# folder <- 'sendhil_class'
# in_f <- function(path) paste(folder, path, sep='/')
folder <- 'sendhil_class'
in_f <- function(path) paste(folder, 'model_menu', path, sep='/')
summarize_output <- function(output, threshold) {
random <- filter(output, method=='ols')
random$method <- 'random'
random$predicted <- rnorm(nrow(random))
best <- filter(output, method=='ols')
best$method <- 'best'
best$predicted <- best$true
x <- rbind_all(list(output, random, best))
g <- group_by(x, method)
g <- mutate(g, cons_threshold_t=quantile(true, threshold)) %>%
mutate(true_poor=true <= cons_threshold_t, threshold=threshold) %>%
arrange(predicted) %>%
mutate(targeted=row_number() / n() <= threshold)
df <- summarize(g,
reach=sum(targeted * (true_poor)) / sum(true_poor),
avg_1_c=sum(targeted * 1/exp(true)) / sum(targeted)
)
ols_reach <- filter(df, method=='ols')$reach
rf_reach <- filter(df, method=='tuned_forest')$reach
to_reach <- mutate(g, reach=cumsum(true_poor) / sum(true_poor)) %>%
summarize(to_ols_reach=sum(reach <= ols_reach) / n(), to_rf_reach=sum(reach <= rf_reach) / n())
percentile <- ecdf(output$true)
df <- merge(df, to_reach, by='method')
df <- select(df, method=method, reach=reach, to_ols_reach=to_ols_reach, to_rf_reach=to_rf_reach, avg_1_c=avg_1_c)
best_1_c <- filter(df, method=='best')$avg_1_c
random_1_c <- filter(df, method=='random')$avg_1_c
df$avg_1_c_normalized <- (df$avg_1_c - random_1_c) / (best_1_c - random_1_c)
df
# df <- filter(df, method %in% c('best', 'random', 'ols', 'tuned_rf'))
# df[match(c('ols', 'tuned_rf', 'random', 'best'), df$method), ]
}
# mutate(g, reach=cumsum(true_poor) / sum(true_poor)) %>%
# summarize(sum(reach < .4186))
#
# res <- rbind_all(map(c(.05, .1, .2), ~summarize_output(output, .x)))
write_all <- function(output) {
thresholds <- c(.05, .1, .2)
for (thresh in thresholds) {
s <- summarize_output(output, thresh)
write.csv(s, paste('results_', thresh*100, '.csv', sep=''))
}
}
dataset_stats <- function(df, bz=NULL) {
stats_df <- data.frame(
n=nrow(df),
k=ncol(df),
mean_lconsumption_bzreal=mean(df$log_consumption),
median_lconsumption_bzreal=median(df$log_consumption),
median_yearly_consumption_usd=median(exp(df$log_consumption)) * .26,
pct_below_1.90usd=6,
pct_below_3.10usd=14
)
if (!is.null(bz)) {
stats_df$median_monthly_income_usd <- median(bz$renda_total) * .26
}
write.csv(stats_df, in_f('stats.csv'))
p <- ggplot(df, aes(x=exp(log_consumption) * .26)) +
geom_line(stat='density', size=1.5) +
scale_x_log10() +
xlab('yearly consumption (usd)')
ggsave(in_f('consumption_log.pdf'), plot=p)
thresholds <- c(.05, .1, .25, .5)
min_thresh <- quantile(exp(df$log_consumption) * .26, .05)
p3 <- p
for (thresh in thresholds) {
cons_thresh <- quantile(exp(df$log_consumption) * .26, thresh)
p2 <- p +
geom_vline(xintercept = cons_thresh) +
annotate('label', x=min_thresh - 400, y=.85, label=paste('$', formatC(cons_thresh, format='d', big.mark=','), sep=''), size=8)
p3 <- p3 + geom_vline(xintercept = cons_thresh)
ggsave(in_f(paste('consumption_log_', thresh, '.pdf', sep='')), plot=p2)
}
ggsave(in_f(paste('consumption_log_all.pdf')), plot=p3)
p <- ggplot(df, aes(x=exp(log_consumption) * .26)) +
geom_histogram(bins=100) +
xlim(c(0, 6e4)) +
xlab('yearly consumption (usd)')
ggsave(in_f('consumption.pdf'), plot=p)
cut_df <- select(
df,
number.of.residents=qtd_morador_domc,
sewage.type=cod_esgoto_sanit,
number.of.rooms=qtd_comodos_domc,
floor.material=cod_material_piso,
log_consumption=log_consumption)
levels(cut_df$floor.material) <- c('1'='carpet', '2'='ceramic', '3'='milled wood', '4'='cement', '5'='rough wood', '6'='earth', '7'='other')
cut_df$floor.material <- as.character(cut_df$floor.material)
levels(cut_df$sewage.type) <- c('1'='sewer', '2'='septic', '3'='pit (rudimentary)', '4'='ditch', '5'='river', '6'='other', '7'='none')
cut_df$sewage.type <- as.character(cut_df$sewage.type)
p.res <- ggplot(cut_df, aes(x=number.of.residents)) + geom_bar()
p.sew <- ggplot(cut_df, aes(x=sewage.type)) + geom_bar() + ylab('')
p.room <- ggplot(cut_df, aes(x=number.of.rooms)) + geom_bar() + ylab('')
p.floor <- ggplot(cut_df, aes(x=floor.material)) + geom_bar()
pdf(in_f('other_variable_stats.pdf'), width=12, height=10)
grid.arrange(p.res, p.sew, p.floor, p.room, ncol=2)
dev.off()
g <- group_by(cut_df, number.of.rooms) %>%
summarize(log_consumption=mean(log_consumption))
p <- ggplot(cut_df, aes(factor(number.of.rooms), log_consumption)) +
geom_boxplot() +
ylab('consumption (log)')
xlab('number of rooms')
ggsave(in_f('rooms_v_consumption_boxplot.pdf'), plot=p)
}
save_tree <- function(model, fname) {
pdf(in_f(fname))
prp(model, varlen=0)
dev.off()
}
little_trees <- function(df) {
df <- rename(
df,
number.of.bathrooms=qtd_banheiros,
floor.type=cod_material_piso,
number.of.residents=qtd_comodos_domc,
number.of.rooms=qtd_morador_domc)
model <- rpart(log_consumption ~ ., df, maxdepth=1)
save_tree(model, 'tree_1.pdf')
model <- rpart(log_consumption ~ ., df, maxdepth=2)
save_tree(model, 'tree_2.pdf')
model <- rpart(log_consumption ~ ., df, maxdepth=3)
save_tree(model, 'tree_3.pdf')
}
plot_partition <- function(new_df, colors, values=NULL) {
p <- ggplot(new_df, aes(y=qtd_comodos_domc, x=qtd_morador_domc, color=predicted_log_consumption)) +
geom_point() +
geom_jitter() +
xlab('number of residents') +
ylab('number of rooms')
if (is.null(values)) {
p <- p + scale_color_manual(values=colors)
}
else {
p <- p + scale_color_gradientn(colors=colors, values=vals)
}
p
}
partition_graph2 <- function(df) {
new_df <- df[, c('qtd_comodos_domc', 'qtd_morador_domc', 'log_consumption')]
test_id <- sample(1:nrow(new_df), 10000)
new_df <- new_df[test_id, ]
tree_model <- rpart(log_consumption ~ qtd_comodos_domc + qtd_morador_domc, df)
new_df$predicted_log_consumption <- as.factor(predict(tree_model, new_df))
levs <- as.numeric(levels(new_df$predicted_log_consumption))
colors <- brewer.pal(length(levs) + 2, "RdYlBu")
p <- plot_partition(new_df, colors[2:(length(colors)-1)])
ggsave(in_f('partition_plots/tree.pdf'), plot=p)
count <- 100
qtd_comodos_domc <- seq(min(new_df$qtd_comodos_domc), max(new_df$qtd_comodos_domc), length.out=count)
qtd_morador_domc <- seq(min(new_df$qtd_morador_domc), max(new_df$qtd_morador_domc), length.out=count)
new_df2 <- expand.grid(qtd_comodos_domc=qtd_comodos_domc, qtd_morador_domc=qtd_morador_domc)
new_df2$predicted_log_consumption <- as.factor(predict(tree_model, new_df2))
p <- plot_partition(new_df2, colors[2:(length(colors)-1)] )
ggsave(in_f('partition_plots/tree_filled.pdf'), plot=p)
min_cons <- min(new_df$log_consumption)
max_cons <- max(new_df$log_consumption)
vals <- rescale(c(min_cons, levs, max_cons))
p <- ggplot(new_df, aes(y=qtd_comodos_domc, x=qtd_morador_domc, color=log_consumption)) +
geom_point() +
geom_jitter() +
xlab('number of residents') +
ylab('number of rooms') +
scale_color_gradientn(colors=colors, values=vals)
ggsave(in_f('partition_plots/true.pdf'), p)
m <- lm(log_consumption ~ qtd_comodos_domc + qtd_morador_domc, df)
eq <- substitute(log_consumption == a + b %.% rooms + c %.% residents,
list(a = format(coef(m)[1], digits = 3),
b = format(coef(m)[2], digits = 3),
c = format(coef(m)[3], digits = 3)))
eq_str <- as.character(as.expression(eq))
new_df$predicted_log_consumption <- predict(m, new_df)
p <- plot_partition(new_df, colors, vals)
p <- p + annotate('text', y=20, x=10, label=eq_str, parse=TRUE, size=5)
ggsave(in_f('partition_plots/ols.pdf'), plot=p)
new_df2$predicted_log_consumption <- predict(m, new_df2)
p <- plot_partition(new_df2, colors, vals)
p <- p + annotate('text', y=20, x=10, label=eq_str, parse=TRUE, size=5)
ggsave(in_f('partition_plots/ols_filled.pdf'), plot=p)
# KNN
for (k in c(1, 3, 7)) {
m_k <- kknn(log_consumption ~ qtd_comodos_domc + qtd_morador_domc, train=df[-test_id, ], new_df, k=k)
new_df$predicted_log_consumption <- predict(m_k)
p <- plot_partition(new_df, colors, vals)
ggsave(in_f(paste('partition_plots/knn_', k, '.pdf', sep='')), plot=p)
m_k <- kknn(log_consumption ~ qtd_comodos_domc + qtd_morador_domc, train=df[-test_id, ], new_df2, k=k)
new_df2$predicted_log_consumption <- predict(m_k)
p <- plot_partition(new_df2, colors, vals)
ggsave(in_f(paste('partition_plots/knn_', k, '_filled.pdf', sep='')), plot=p)
}
# SPLINES
x <- model.matrix(log_consumption ~ qtd_comodos_domc + qtd_morador_domc, new_df)
y <- new_df$log_consumption
m <- polspline::polymars(y, x)
new_df$predicted_log_consumption <- predict(m, x)
p <- plot_partition(new_df, colors, vals)
ggsave(in_f('partition_plots/spline.pdf'), p)
x <- model.matrix(predicted_log_consumption ~ qtd_comodos_domc + qtd_morador_domc, new_df2)
new_d2f$predicted_log_consumption <- predict(m, x)
p <- plot_partition(new_df2, colors, vals)
ggsave(in_f('partition_plots/spline_filled.pdf'), p)
# RF
fmla <- log_consumption ~ qtd_comodos_domc + qtd_morador_domc
m_rf <- ranger::ranger(fmla, new_df, num.trees=50, write.forest=TRUE)
new_df$predicted_log_consumption <- predict(m_rf, new_df)$predictions
p <- plot_partition(new_df, colors, vals)
ggsave(in_f('partition_plots/random_forest.pdf'), p)
new_df2$predicted_log_consumption <- predict(m_rf, new_df2)$predictions
p <- plot_partition(new_df2, colors, vals)
ggsave(in_f('partition_plots/random_forest_filled.pdf'), p)
# REGRESSION TREE
m <- RWeka::M5P(fmla, new_df)
new_df$predicted_log_consumption <- predict(m, new_df)
p <- plot_partition(new_df, colors, vals)
ggsave(in_f('partition_plots/regression_tree.pdf'), p)
new_df2$predicted_log_consumption <- predict(m, new_df2)
p <- plot_partition(new_df2, colors, vals)
ggsave(in_f('partition_plots/regression_tree_filled.pdf'), p)
# # KERNEL
# x <- df[, c('qtd_comodos_domc', 'qtd_morador_domc')]
# y <- df$log_consumption
# sm <- fields::smooth.2d(Y=y, x=x, surface=FALSE)
# new_df3 <- data.frame(qtd_comodos_domc=sm$x, qtd_morador_domc=sm$y, predicted_log_consumption=sm$z)
# p <- plot_partition(new_df3, colors, vals)
# ggsave(in_f('partition_plots/kernel_filled.pdf'), plot=p)
}
partition_graph <- function(df) {
new_df <- df[, c('qtd_comodos_domc', 'qtd_morador_domc', 'log_consumption')]
new_df <- new_df[sample(1:nrow(new_df), 10000), ]
tree_model <- rpart(log_consumption ~ qtd_comodos_domc + qtd_morador_domc, df)
new_df$predicted_log_consumption <- as.factor(predict(tree_model, new_df))
levs <- as.numeric(levels(new_df$predicted_log_consumption))
colors <- brewer.pal(length(levs) + 2, "RdYlBu")
p <- ggplot(new_df, aes(y=qtd_comodos_domc, x=qtd_morador_domc, color=predicted_log_consumption)) +
geom_point() +
scale_color_manual(values=colors[2:(length(colors)-1)]) +
geom_jitter() +
xlab('number of residents') +
ylab('number of rooms')
ggsave(in_f('tree_partition.pdf'), plot=p)
count <- 100
qtd_comodos_domc <- seq(min(new_df$qtd_comodos_domc), max(new_df$qtd_comodos_domc), length.out=count)
qtd_morador_domc <- seq(min(new_df$qtd_morador_domc), max(new_df$qtd_morador_domc), length.out=count)
new_df2 <- expand.grid(qtd_comodos_domc=qtd_comodos_domc, qtd_morador_domc=qtd_morador_domc)
new_df2$predicted_log_consumption <- as.factor(predict(tree_model, new_df2))
p <- ggplot(new_df2, aes(y=qtd_comodos_domc, x=qtd_morador_domc, color=predicted_log_consumption)) +
geom_point() +
scale_color_manual(values=colors[2:(length(colors)-1)]) +
geom_jitter() +
xlab('number of residents') +
ylab('number of rooms')
ggsave(in_f('tree_partition_filled.pdf'), plot=p)
min_cons <- min(new_df$log_consumption)
max_cons <- max(new_df$log_consumption)
vals <- rescale(c(min_cons, levs, max_cons))
p <- ggplot(new_df, aes(y=qtd_comodos_domc, x=qtd_morador_domc, color=log_consumption)) +
geom_point() +
scale_color_gradientn(colors=colors, values=vals) +
geom_jitter() +
xlab('number of residents') +
ylab('number of rooms')
ggsave(in_f('covariates.pdf'), plot=p)
m <- lm(log_consumption ~ qtd_comodos_domc + qtd_morador_domc, df)
eq <- substitute(log_consumption == a + b %.% rooms + c %.% residents,
list(a = format(coef(m)[1], digits = 3),
b = format(coef(m)[2], digits = 3),
c = format(coef(m)[3], digits = 3)))
eq_str <- as.character(as.expression(eq))
new_df$predicted_log_consumption <- predict(m, new_df)
p <- ggplot(new_df, aes(y=qtd_comodos_domc, x=qtd_morador_domc, color=predicted_log_consumption)) +
geom_point() +
scale_color_gradientn(colors=colors, values=vals) +
geom_jitter() +
annotate('text', y=20, x=10, label=eq_str, parse=TRUE, size=5) +
xlab('number of residents') +
ylab('number of rooms')
ggsave(in_f('ols_partition.pdf'), plot=p)
new_df2$predicted_log_consumption <- predict(m, new_df2)
p <- ggplot(new_df2, aes(y=qtd_comodos_domc, x=qtd_morador_domc, color=predicted_log_consumption)) +
geom_point() +
scale_color_gradientn(colors=colors, values=vals) +
geom_jitter() +
annotate('text', y=20, x=10, label=eq_str, parse=TRUE, size=5) +
xlab('number of residents') +
ylab('number of rooms')
ggsave(in_f('ols_partition_filled.pdf'), plot=p)
}
tree_complexity <- function(df) {
model <- rpart(log_consumption ~ ., df, minsplit=2, cp=5e-5)
cp_df <- as.data.frame(model$cptable)
cp_df$CP <- NULL
cp_df[, 'in_sample_error'] <- cp_df[, 'rel error']
cp_df[, 'rel error'] <- NULL
cp_df <- rename(cp_df, holdout_error=xerror)
cp_df <- filter(cp_df, holdout_error < 1)
cp_df_long <- melt(cp_df, id.vars=c('nsplit', 'xstd'))
cp_df_long <- mutate(cp_df_long, ymin=value-xstd, ymax=value+xstd)
p <- ggplot(cp_df_long, aes(x=nsplit, y=value, color=variable)) +
geom_line(size=1.5) +
geom_errorbar(aes(ymin=ymin, ymax=ymax))
ggsave(in_f('tree_complexity.pdf'), plot=p)
p <- ggplot(cp_df_long, aes(x=nsplit, y=value, color=variable)) +
geom_line(size=1.5) +
geom_errorbar(aes(ymin=ymin, ymax=ymax)) +
scale_x_log10()
ggsave(in_f('tree_complexity_log.pdf'), plot=p)
}
plot_roc <- function(tree_roc, ols_roc=NULL) {
if (!is.null(ols_roc)) {
tree_roc_df <- data.frame(method='tree', sensitivity=tree_roc$sensitivities, specificity=tree_roc$specificities)
ols_roc_df <- data.frame(method='ols', sensitivity=ols_roc$sensitivities, specificity=ols_roc$specificities)
text_df <- data.frame(
method=c('ols', 'tree'),
specificity=c(.77, .72),
sensitivity=c(.8, .6),
auc=paste('auc:', c(round(ols_roc$auc, digits=3), round(tree_roc$auc, digits=3))))
roc_df <- rbind(tree_roc_df, ols_roc_df)
ggplot(roc_df, aes(x=specificity, y=sensitivity, color=method)) +
geom_line() +
scale_x_reverse() +
geom_text(data=text_df, aes(label=auc))
} else {
tree_roc_df <- data.frame(sensitivity=tree_roc$sensitivities, specificity=tree_roc$specificities)
ggplot(tree_roc_df, aes(x=specificity, y=sensitivity)) +
geom_line(size=1.5) +
scale_x_reverse() +
geom_polygon(data=rbind(tree_roc_df, data.frame(specificity=0, sensitivity=0)), alpha=0.2) +
geom_text(
data=filter(tree_roc_df, row_number() == 70),
label=paste('auc:', round(tree_roc$auc, digits=3)),
size=12,
position=position_nudge(.1, -.1))
}
}
plot_reach <- function(pred_df, consumption_threshold) {
only_tree <- filter(pred_df, method=='tree')
random <- mutate(only_tree, predicted=runif(n()), method='random')
tree_and_random <- rbind(only_tree, random)
g <- group_by(tree_and_random, method) %>%
arrange(predicted) %>%
mutate(pct_targeted=row_number() / n()) %>%
mutate(reach=cumsum(true < consumption_threshold) / sum(true < consumption_threshold))
ggplot(g, aes(x=pct_targeted, y=reach, color=method)) +
geom_line(size=1.5) +
xlab('percent targeted')
}
plot_swf <- function(pred_df, consumption_threshold) {
write('welfare = sum(1 / exp(log_consumption))', in_f('welfare.txt'))
only_tree <- filter(pred_df, method=='tree')
random <- mutate(only_tree, predicted=runif(n()), method='random')
tree_and_random <- rbind(only_tree, random)
g <- group_by(tree_and_random, method) %>%
arrange(predicted) %>%
mutate(optimal=sort(true)) %>%
mutate(pct_targeted=row_number() / n()) %>%
mutate(welfare=cumsum(1/exp(true)) / cumsum(1/exp(optimal)))
ggplot(g, aes(x=pct_targeted, y=welfare, color=method)) +
geom_line(size=1.5) +
xlab('percent targeted') +
ylab('welfare (divided by best possible)')
}
final_tree <- function(df, min_cp=1e-4, threshold=0.2) {
df <- translate_colnames(df)
fold <- split_test_train(df, test_fraction=.1)[[1]]
consumption_threshold <- quantile(fold$test$log_consumption, threshold)
full_model <- rpart(log_consumption ~ ., fold$train, cp=min_cp)
cp_df <- as.data.frame(full_model$cptable)
cp_best <- cp_df$CP[which.min(cp_df$xerror)]
model <- prune(full_model, cp=cp_best)
imp_df <- data.frame(feature=names(model$variable.importance), importance=model$variable.importance) %>%
arrange(desc(importance)) %>%
filter(row_number() < 10)
imp_df$feature <- factor(imp_df$feature, levels=rev(imp_df$feature))
p <- ggplot(imp_df, aes(x=feature, y=importance)) +
geom_bar(stat='identity') +
coord_flip()
ggsave(in_f('variable_importance.pdf'), plot=p)
pdf(in_f('full_tree.pdf'))
prp(model)
dev.off()
tree_preds <- data.frame(method='tree', predicted=predict(model, fold$test), true=fold$test$log_consumption)
tree_roc <- pROC::roc(response=tree_preds$true <= consumption_threshold, predictor=tree_preds$predicted)
ols <- lm(log_consumption ~ ., fold$train)
# ols_preds <- data.frame(method='ols', predicted=predict(ols, fold$test), true=fold$test$log_consumption)
# ols_roc <- pROC::roc(response=ols_preds$true <= consumption_threshold, predictor=ols_preds$predicted)
# pred_df <- rbind(tree_preds, ols_preds)
pred_df <- tree_preds
p <- plot_roc(tree_roc)
ggsave(in_f('roc.pdf'), plot=p)
p <- plot_reach(pred_df, consumption_threshold)
ggsave(in_f('reach.pdf'), plot=p)
p <- plot_swf(pred_df, consumption_threshold)
ggsave(in_f('welfare.pdf'), plot=p)
p <- ggplot(tree_preds, aes(x=true, y=predicted)) +
geom_point() +
geom_smooth(method=lm, se=FALSE) +
xlim(c(6, 13)) +
ylim(c(6, 13)) +
coord_fixed()
ggsave(in_f('residuals.pdf'), plot=p)
tree_preds_long <- melt(tree_preds, id.vars='method')
p <- ggplot(tree_preds_long, aes(x=value, color=variable)) +
geom_line(stat='density', size=1.5, alpha=1) +
scale_color_brewer(type='qual', palette=1)+
xlab('log(consumption) (Brazilian real)')
ggsave(in_f('residuals_density.pdf'), plot=p)
pred_df <- select(
fold$test,
number.of.bathrooms,
number.of.rooms,
wall.type,
electric.water.heater,
number.of.residents,
sewage.type,
garbage.disposal,
mail.distribution.service,
street.pavement,
household.type,
consumption.units,
number.of.bedrooms,
number.of.families,
gas.stove,
electricity.source.other)
tree_pred_df <- mutate(pred_df, log_consumption=tree_preds$predicted)
linear <- lm(log_consumption ~ ., tree_pred_df)
sink(in_f('tree_preds_linear_model.txt'))
print(summary(linear))
sink()
coef_df <- data.frame(summary(linear)$coef)
coef_df$feature <- row.names(coef_df)
coef_df <- coef_df %>%
arrange(desc(abs(Estimate))) %>%
filter(!grepl('Intercept', feature)) %>%
filter(!grepl('imput', feature)) %>%
filter(row_number() < 10) %>%
select(Feature=feature, Estimate, Std.Error=Std..Error)
coef_df <- rbind(data.frame(Feature='linear', Estimate=summary(linear)$r.squared, Std.Error=NA), coef_df)
coef_df <- rbind(coef_df, data.frame(Feature='nonlinear', Estimate=1-summary(linear)$r.squared, Std.Error=NA))
}
make <- function() {
df <- load_dataset('brazil_sendhil_all')
df$X <- NULL
cnames <- colnames(df)
categoricals <- cnames[!grepl('qtd', cnames) & !grepl('num', cnames) & TARGET_VARIABLE != cnames]
df[, categoricals] <- lapply(df[, categoricals], factor)
df <- rename(df, log_consumption=yyyyy)
df2 <- mutate(df, log_consumption=log_consumption/qtd_familia)
fpath <- paste(TARGETING_DATA_IN, 'pof2008_dom_standard.dta', sep='/')
bz <- read.dta13(fpath)
dataset_stats(df, bz)
partition_graph(df2)
tree_complexity(df)
little_trees(df)
final_tree(df)
}
forest_graph <- function(df) {
fold <- split_test_train(df, test_fraction=0.2)[[1]]
train <- fold$train
nodesizes <- c(5, 10, 20, 50, 100, 200, 350, 500)
mtries <- seq(2, 40, by=4)
parameters <- expand.grid(ns=nodesizes, mt=mtries)
forests <- mcmapply(
function(nodesize, mtry) {
print(paste(mtry, nodesize))
ranger::ranger(log_consumption ~ ., data=train, num.trees=50, mtry=mtry, min.node.size=nodesize, write.forest=TRUE)
},
parameters$ns,
parameters$mt,
SIMPLIFY=FALSE,
mc.cores=4
)
test <- fold$test
losses <- purrr::map_dbl(forests, ~mean((predict(.x, test)$predictions-test$log_consumption)^2))
colors <- rev(brewer.pal(5, "Greys")[c(1, 3, 5)])
res_df <- data.frame(min.nodesize=parameters$ns, mtry=parameters$mt, loss=losses)
vals <- rescale(c(min(losses), median(losses), max(losses)))
p <- ggplot(res_df, aes(x=min.nodesize, y=mtry, color=loss)) +
geom_point(size=8) +
scale_x_log10() +
# scale_color_gradientn(colors=colors, values=vals) +
xlab('Minimum nodesize') +
ylab('Features per iteration')
ggsave(in_f('forest_tuning.pdf'), plot=p)
}
lasso_variables <- function(df) {
trans_df <- translate_colnames(df)
x <- model.matrix(log_consumption ~ ., trans_df)
y <- df[, 'log_consumption']
model <- glmnet::cv.glmnet(x, y, alpha=1)
lambdas <- model$lambda
nzero <- model$nzero
nz_df <- data.frame(lambda=lambdas, nzero=nzero)
p <- ggplot(nz_df, aes(x=lambda, y=nzero)) +
geom_point() +
ylab('nonzero coefficients')
ggsave(in_f('lasso_nonzero.pdf'), plot=p)
change_lambdas <- lambdas[c(TRUE, (diff(nzero) > 0)) & (nzero <= 10)]
coefs <- coef(model, s=change_lambdas)
coefs <- coefs[apply(coefs > 0, 1, any), ]
colnames(coefs) <- change_lambdas
write.csv(as.matrix(coefs), in_f('lambda_features.csv'))
}
trace_regression_coefs <- function(df) {
trans_df <- translate_colnames(df)
fname <- 'regression_coef_v_lambda.pdf'
x <- model.matrix(log_consumption ~ . - 1, trans_df)
y <- df[, 'log_consumption']
lambdas <- seq(0, 1, length.out=100)
lasso_model <- glmnet::cv.glmnet(x, y, alpha=1, lambda=lambdas)
ridge_model <- glmnet::cv.glmnet(x, y, alpha=0, lambda=lambdas)
ols_model <- glmnet::glmnet(x, y, lambda=0)
feature <- 'number.of.bathrooms'
lasso_trace <- coef(lasso_model, lasso_model$lambda)[feature, ]
ridge_trace <- coef(ridge_model, ridge_model$lambda)[feature, ]
ols_trace <- rep(coef(ols_model)[feature, ], length(lambdas))
lasso_coef_names <- map(lasso_model$lambda, ~names(which(coef(lasso_model, .)[, 1] != 0)) %>% keep(~. != '(Intercept)'))
post_lasso_models <- map(lasso_coef_names, ~if (length(.) < 2) NA else glmnet::glmnet(x[, .], y, lambda=0))
post_lasso_trace <- map_dbl(post_lasso_models, ~if (is.na(.)) NA else coef(.)[feature, ])
post_lasso_trace[is.na(post_lasso_trace)] <- 0
lasso_df <- data.frame(beta_hat=lasso_trace, lambda=lasso_model$lambda, feature=feature, method='lasso')
ridge_df <- data.frame(beta_hat=ridge_trace, lambda=ridge_model$lambda, feature=feature, method='ridge')
post_lasso_df <- data.frame(beta_hat=post_lasso_trace, lambda=lasso_model$lambda, feature=feature, method='post_lasso')
ols_df <- data.frame(beta_hat=ols_trace, lambda=lasso_model$lambda, feature=feature, method='ols')
trace_df <- rbind(lasso_df, ridge_df, post_lasso_df, ols_df)
trace_df$method <- factor(trace_df$method, levels=c('ols', 'ridge', 'lasso', 'post_lasso'), ordered=T)
colors <- brewer.pal(4, "Set1")
names(colors) <- levels(trace_df$method)
max_beta <- max(c(ols_trace, post_lasso_trace, lasso_trace, ridge_trace))
p1 <- ggplot(filter(trace_df, method=='ols'), aes(x=lambda, y=beta_hat, color=method)) +
geom_point() +
scale_color_manual(values=colors) +
ylim(0, max_beta) +
ylab(expression(hat(beta))) +
xlab(expression(lambda))
p2 <- ggplot(filter(trace_df, (method=='ols' | method=='ridge')), aes(x=lambda, y=beta_hat, color=method)) +
geom_point() +
scale_color_manual(values=colors) +
ylim(0, max_beta) +
ylab(expression(hat(beta))) +
xlab(expression(lambda))
p3 <- ggplot(filter(trace_df, (method=='ols' | method=='lasso' | method=='ridge')), aes(x=lambda, y=beta_hat, color=method)) +
geom_point() +
scale_color_manual(values=colors) +
ylim(0, max_beta) +
ylab(expression(hat(beta))) +
xlab(expression(lambda))
p4 <- ggplot(filter(trace_df, (method=='ols' | method=='lasso' | method=='post_lasso')), aes(x=lambda, y=beta_hat, color=method)) +
geom_point() +
scale_color_manual(values=colors) +
ylim(0, max_beta) +
ylab(expression(hat(beta))) +
xlab(expression(lambda))
p5 <- ggplot(trace_df, aes(x=lambda, y=beta_hat, color=method)) +
geom_point() +
scale_color_manual(values=colors) +
ylim(0, max_beta) +
ylab(expression(hat(beta))) +
xlab(expression(lambda))
ggsave(in_f('beta_trace_ols.pdf'), p1)
ggsave(in_f('beta_trace_ols_ridge.pdf'), p2)
ggsave(in_f('beta_trace_ols_ridge_lasso.pdf'), p3)
ggsave(in_f('beta_trace_ols_lasso_post_lasso.pdf'), p4)
ggsave(in_f('beta_trace_ols_ridge_lasso_post_lasso.pdf'), p5)
}
regression_coefs <- function(df, filter_features='all') {
trans_df <- translate_colnames(df)
fname <- 'regression_coefs_all.pdf'
if (filter_features == 'numeric') {
trans_df <- trans_df[, sapply(trans_df, is.numeric)]
fname <- 'regression_coefs_numeric.pdf'
}
if (filter_features == 'numeric+binary') {
trans_df <- keep(trans_df, ~is.numeric(.) | nlevels(.) == 2)
fname <- 'regression_coefs_numeric_and_binary.pdf'
}
x <- model.matrix(log_consumption ~ . - 1, trans_df)
y <- df[, 'log_consumption']
lasso_model <- glmnet::cv.glmnet(x, y, alpha=1)
ridge_model <- glmnet::cv.glmnet(x, y, alpha=0)
ols_model <- glmnet::glmnet(x, y, lambda=0)
lasso_coef <- coef(lasso_model)
ridge_coef <- coef(ridge_model)
ols_coef <- coef(ols_model)
lasso_coef_names <- names(which(lasso_coef[, 1] != 0))
lasso_coef_names <- purrr::discard(lasso_coef_names, ~ . == '(Intercept)')
post_lasso_model <- glmnet::glmnet(x[, lasso_coef_names], y, lambda=0)
post_lasso_coef <- coef(post_lasso_model)
coef_dfs <- map(c(lasso_coef, ridge_coef, ols_coef, post_lasso_coef), ~data.frame(as.matrix(.)))
coef_df <- bind_cols(coef_dfs[1:3])
colnames(coef_df) <- c('lasso', 'ridge', 'ols')
coef_df$feature <- row.names(ols_coef)
post_lasso_coef_df <- coef_dfs[[4]]
colnames(post_lasso_coef_df) <- 'post.lasso'
post_lasso_coef_df$feature <- row.names(post_lasso_coef)
coef_df <- left_join(coef_df, post_lasso_coef_df, by='feature')
coef_df[is.na(coef_df)] <- 0.
coef_df_long <- melt(coef_df, id.vars='feature', value.name='coefficient')
coef_df_long <- filter(coef_df_long, feature!='(Intercept)')
coef_df_long$coefficient <- abs(coef_df_long$coefficient)
feature_order <- row.names(ols_coef)[order(abs(ols_coef[, 1]), decreasing=TRUE)]
coef_df_long$feature <- factor(coef_df_long$feature, levels=feature_order, ordered=TRUE)
p <- ggplot(coef_df_long, aes(x=feature, y=coefficient, fill=variable)) +
geom_bar(position='dodge', stat='identity') +
ylab(expression(abs(beta))) +
coord_flip()
ggsave(in_f(fname), plot=p)
}
translate_colnames <- function(df) {
rename(
df,
number.of.bathrooms=qtd_banheiros,
number.of.rooms=qtd_comodos_domc,
wall.type=cod_material_piso,
electric.water.heater=energia_17,
number.of.residents=qtd_morador_domc,
sewage.type=cod_esgoto_sanit,
garbage.disposal=cod_lixo,
mail.distribution.service=cod_servico_distribuicao,
street.pavement=cod_exist_pavim,
household.type=cod_tipo_domc,
consumption.units=qtd_uc,
number.of.bedrooms=qtd_comd_serv_dormit,
number.of.families=qtd_familia,
gas.stove=gas_18,
electricity.source.other=propria_15
)
}
# tree_stability <- function(df) {
# trans_df <- translate_colnames(df)
# N <- nrow(trans_df)
# models <- lapply(1:4, function(i) {
# sample_df <- trans_df[sample(1:N, floor(N/10)), c('log_consumption', 'gas.stove', 'electric.water.heater', 'sewage.type')]
# rpart(log_consumption ~ ., sample_df)
# })
#
# imp_df <- data.frame(feature=names(model$variable.importance), importance=model$variable.importance) %>%
# arrange(desc(importance)) %>%
# filter(row_number() < 10)
# imp_df$feature <- factor(imp_df$feature, levels=rev(imp_df$feature))
# }
results_table <- function() {
output <- load_validation_models('brazil_sendhil_all')[, -1]
ols <- filter(output, method=='ols') %>% arrange(true)
forest <- filter(output, method=='tuned_forest') %>% arrange(true)
ensemble_df <- data.frame(true=ols$true, ols=ols$predicted, forest=forest$predicted)
m <- lm(true ~ ols + forest, ensemble_df)
ensemble_table <- data.frame(
value=c(m$coefficients, summary(m)$r.squared),
row.names=c('Intercept', 'ols', 'forest', 'r^2'))
ensemble_df <- data.frame(true=ols$true, ols=ols$predicted, forest=forest$predicted)
m2 <- lm(true ~ ols + forest + ols * forest, ensemble_df)
ensemble_table <- data.frame(
value=c(m$coefficients, 0, summary(m)$r.squared),
value2=c(m2$coefficients, summary(m2)$r.squared),
row.names=c('Intercept', 'ols', 'forest', 'ols * forest', 'r^2'))
ensemble_latex <- xtable::xtable(ensemble_table)
sink(in_f('ensemble_coefficients_table.tex'))
ensemble_latex <- xtable::xtable(ensemble_table)
print(ensemble_latex)
sink()
sink(in_f('ensemble_coefficients_table.tex'))
print(ensemble_latex)
sink()
ensemble_output <- data.frame(true=ensemble_df$true, predicted=predict(m, ensemble_df), method='ensemble', fold=1)
ensemble2_output <- data.frame(true=ensemble_df$true, predicted=predict(m2, ensemble_df), method='ensemble2', fold=1)
e_output <- rbind(output, ensemble_output, ensemble2_output)
output_table <- summarize_output(e_output, .05)
row.names(output_table) <- output_table$method
ordered_table <- output_table[c('tree', 'knn', 'svm', 'ols', 'lasso', 'ridge', 'lasso_ix', 'regression_tree', 'tuned_forest', 'boosted_tree', 'ensemble', 'ensemble2'), ]
ordered_table <- ordered_table[, c('reach', 'avg_1_c', 'avg_1_c_normalized')]
colnames(ordered_table) <- c(
'% Poor Reached',
"E[u'(c)]*10^4",
"Fraction of max"
)
results_latex <- xtable::xtable(
ordered_table,
digits=8)
sink(in_f('results_table.tex'))
print(results_latex)
sink()
}
make2 <- function() {
df <- load_dataset('brazil_sendhil_all')
df$X <- NULL
cnames <- colnames(df)
categoricals <- cnames[!grepl('qtd', cnames) & !grepl('num', cnames) & TARGET_VARIABLE != cnames]
df[, categoricals] <- lapply(df[, categoricals], factor)
df <- rename(df, log_consumption=yyyyy)
df <- select(df, -contains('imput'))
df <- select(df, -num_ext_renda)
df2 <- mutate(df, log_consumption=log_consumption/qtd_familia)
lasso_variables(df)
regression_coefs(df)
regression_coefs(df, 'numeric')
regression_coefs(df, 'numeric+binary')
}
# lasso ridge, beta hat as a function of lambda one per graph
# instability in the tree
# ols, ols + ridge, ols + ridge + lasso, ols + lasso + post_lasso
# knn , splines, rfs, reg trees, kernel
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.