library(lemon) knitr::opts_chunk$set( collapse = TRUE, comment = "#>", warning = F, message = F ) knit_print.data.frame <- lemon_print
This vignette is designed for users who would like greater control over the default functions in starbility
. If you have not yet done so, please first review the main vignette.
This vignette will cover four topics:
starbility
pipelinestarbility
. (Currently, the only function automatically supported is felm
)ggplot2
plottingAs in the main vignette, we will use the diamonds
dataset included in `ggplot2
.
library(tidyverse) library(starbility) library(lfe) data("diamonds") set.seed(43) indices = sample(1:nrow(diamonds), replace=F, size=round(nrow(diamonds)/20)) diamonds = diamonds[indices,]
starbility
pipelinestarbility
proceeds in six discrete steps, corresponding to six discrete functions: create_grid
, create_felm_formulas
, create_model_estimates
, create_plot_dfs
, draw_panels
, and combine_plots
. stability_plot
wraps all of these steps for convenience, but we can also access each function individually. Alternatively, we can use the run_to
argument in stability_plot
to end execution before a given step and the run_from
argument to begin execution at a given step. The typical use case is to skip a step and implement it ourselves if we desire more flexibility or an option not yet automatically implemented.
The first step is to create a data frame with the structure of each model to be estimated. Each column corresponds to a variable, each row corresponds to a model, and each cell takes value 1 if that variable is included in the model and value 0 otherwise. I will specify run_to=2
to instruct stability_plot
to run only Step 1.
diamonds$high_clarity = diamonds$clarity %in% c('VS1','VVS2','VVS1','IF') base_controls = c( 'Diamond dimensions' = 'x + y + z' ) perm_controls = c( 'Depth' = 'depth', 'Table width' = 'table' ) perm_fe_controls = c( 'Cut FE' = 'cut', 'Color FE' = 'color' ) nonperm_fe_controls = c( 'Clarity FE (granular)' = 'clarity', 'Clarity FE (binary)' = 'high_clarity' ) grid1 = stability_plot(data = diamonds, lhs = 'price', rhs = 'carat', perm = perm_controls, base = base_controls, perm_fe = perm_fe_controls, nonperm_fe = nonperm_fe_controls, run_to=2) knitr::kable(grid1 %>% head(10)) knitr::kable(grid1 %>% tail(10))
The second step is to use the output of Step 1 to build the formulas to be estimated. starbility
contains built-in support for felm
, but if we want to use other functions, such as glm
, plm
, bife
, etc., we need to build the formulas ourselves. (An example follows later in this document). Let's try running just this step using the run_from
argument and plugging in our output from above:
grid2 = stability_plot(grid = grid1, data=diamonds, lhs='price', rhs='carat', perm=perm_controls, base=base_controls, run_from=2, run_to=3) knitr::kable(grid2 %>% head(10)) knitr::kable(grid2 %>% tail(10))
The third step is to estimate the formulas above we generated in Step 2:
grid3 = stability_plot(grid = grid2, data=diamonds, lhs='price', rhs='carat', perm=perm_controls, base=base_controls, run_from=3, run_to=4) knitr::kable(grid3 %>% head(10)) knitr::kable(grid3 %>% tail(10))
Adjusting p-values, standard errors (e.g. robust, Conley), the width of confidence intervals, etc. is simple; an example is given in the main vignette. Using models other than felm
is also simple once we have the data frame of formulas in place; an example follows later in this document.
The fourth step is to create two data frames, one for the coefficient panel (top) and one for the control panel (bottom). These are converted to tidy format to allow for easy plotting.
dfs = stability_plot(grid = grid3, data=diamonds, lhs='price', rhs='carat', perm=perm_controls, base=base_controls, run_from=4, run_to=5) coef_grid = dfs[[1]] control_grid = dfs[[2]] knitr::kable(coef_grid %>% head(10)) knitr::kable(coef_grid %>% tail(10)) knitr::kable(control_grid %>% head(10)) knitr::kable(control_grid %>% tail(10))
The fifth step is to draw the two panels using ggplot2
.
panels = stability_plot(data = diamonds, lhs='price', rhs='carat', coef_grid = coef_grid, control_grid = control_grid, run_from=5, run_to=6) panels[[1]] panels[[2]]
The final step is to combine the two panels:
stability_plot(data = diamonds, lhs='price', rhs='carat', coef_panel = panels[[1]], control_panel = panels[[2]], run_from = 6, run_to = 7)
We've recovered the original plot! Let's now explore some use cases.
Let's try implementing a logit model using glm
, modeling the effect of carat on an indicator for whether the diamond is above median price. Since glm
uses different syntax (no fixed effects!) we'll need to build our formulas by hand in Step 2.
diamonds$above_med_price = as.numeric(diamonds$price > median(diamonds$price)) base_controls = c( 'Diamond dimensions' = 'x + y + z' ) perm_controls = c( 'Depth' = 'depth', 'Table width' = 'table', 'Clarity' = 'clarity' ) lhs_var = 'above_med_price' rhs_var = 'carat' grid1 = stability_plot(data = diamonds, lhs = lhs_var, rhs = rhs_var, perm = perm_controls, base = base_controls, fe_always = F, run_to=2) # Create control part of formula base_perm = c(base_controls, perm_controls) grid1$expr = apply(grid1[,1:length(base_perm)], 1, function(x) paste(base_perm[names(base_perm)[which(x==1)]], collapse='+')) # Complete formula with LHS and RHS variables grid1$expr = paste(lhs_var, '~', rhs_var, '+', grid1$expr, sep='') knitr::kable(grid1 %>% head(10)) knitr::kable(grid1 %>% tail(10))
Our grid seems to be in order. We will also need to define a custom function for our logit model:
starb_logit = function(spec, data, rhs, ...) { spec = as.formula(spec) model = glm(spec, data=data, family='binomial', weights=data$weight) %>% broom::tidy() row = which(model$term==rhs) coef = model[row, 'estimate'] %>% as.numeric() se = model[row, 'std.error'] %>% as.numeric() p = model[row, 'p.value'] %>% as.numeric() return(c(coef, p, coef+1.96*se, coef-1.96*se)) } stability_plot(grid = grid1, data = diamonds, lhs = lhs_var, rhs = rhs_var, model = starb_logit, perm = perm_controls, base = base_controls, fe_always = F, run_from=3)
What if we want (average) marginal effects? Oh, and what if we want 99\% CIs rather than 95\% CIs? We can simply modify our estimation function accordingly. We'll pass in a new get_mfx
argument through the ...
:
library(margins) starb_logit_enhanced = function(spec, data, rhs, ...) { # Unpack ... l = list(...) get_mfx = ifelse(is.null(l$get_mfx), F, T) # Set a default to F spec = as.formula(spec) if (get_mfx) { model = glm(spec, data=data, family='binomial', weights=data$weight) %>% margins() %>% summary row = which(model$factor==rhs) coef = model[row, 'AME'] %>% as.numeric() se = model[row, 'SE'] %>% as.numeric() p = model[row, 'p'] %>% as.numeric() } else { model = glm(spec, data=data, family='binomial', weights=data$weight) %>% broom::tidy() row = which(model$term==rhs) coef = model[row, 'estimate'] %>% as.numeric() se = model[row, 'std.error'] %>% as.numeric() p = model[row, 'p.value'] %>% as.numeric() } z = qnorm(0.995) return(c(coef, p, coef+z*se, coef-z*se)) } stability_plot(grid = grid1, data = diamonds, lhs = lhs_var, rhs = rhs_var, model = starb_logit_enhanced, get_mfx = T, perm = perm_controls, base = base_controls, fe_always = F, run_from = 3)
starbility
gives some options for customizing the plot, but it can be desirable to plot from scratch for added flexibility. Let's try that below:
dfs = stability_plot(grid = grid1, data = diamonds, lhs = lhs_var, rhs = rhs_var, model = starb_logit_enhanced, get_mfx = T, perm = perm_controls, base = base_controls, fe_always = F, run_from = 3, run_to = 5) coef_grid_logit = dfs[[1]] control_grid_logit = dfs[[2]] min_space = 0.5 coef_plot = ggplot2::ggplot(coef_grid_logit, aes(x = model, y = coef, shape=p, group=p)) + geom_linerange(aes(ymin = error_low, ymax = error_high), alpha=0.75) + geom_point(size=5, aes(col=p, fill=p), alpha=1) + viridis::scale_color_viridis(discrete = TRUE, option = "D")+ scale_shape_manual(values = c(15,17,18, 19)) + theme_classic() + geom_hline(yintercept=0, linetype='dotted') + ggtitle('A custom coefficient stability plot!') + labs(subtitle="Error bars represent 99% confidence intervals") + theme(axis.text.x = element_blank(), axis.title = element_blank(), axis.ticks.x = element_blank()) + coord_cartesian(xlim=c(1-min_space, max(coef_grid_logit$model)+min_space), ylim=c(-0.1, 1.6)) + guides(fill=F, shape=F, col=F) control_plot = ggplot(control_grid_logit) + geom_point(aes(x = model, y = y, fill=value), shape=23, size=4) + scale_fill_manual(values=c('#FFFFFF', '#000000')) + guides(fill=F) + scale_y_continuous(breaks = unique(control_grid_logit$y), labels = unique(control_grid_logit$key), limits=c(min(control_grid_logit$y)-1, max(control_grid_logit$y)+1)) + scale_x_continuous(breaks=c(1:max(control_grid_logit$model))) + coord_cartesian(xlim=c(1-min_space, max(control_grid_logit$model)+min_space)) + theme_classic() + theme(panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank(), axis.title = element_blank(), axis.text.y = element_text(size=10), axis.ticks = element_blank(), axis.line = element_blank()) cowplot::plot_grid(coef_plot, control_plot, rel_heights=c(1,0.5), align='v', ncol=1, axis='b')
Manual plotting also lets us combine coefficient estimates from different models. Let's try including both logit and probit (marginal effects). That also means we'll need to define a probit function first:
starb_probit = function(spec, data, rhs, ...) { # Unpack ... l = list(...) get_mfx = ifelse(is.null(l$get_mfx), F, T) # Set a default to F spec = as.formula(spec) if (get_mfx) { model = glm(spec, data=data, family=binomial(link='probit'), weights=data$weight) %>% margins() %>% summary row = which(model$factor==rhs) coef = model[row, 'AME'] %>% as.numeric() se = model[row, 'SE'] %>% as.numeric() p = model[row, 'p'] %>% as.numeric() } else { model = glm(spec, data=data, family=binomial(link='probit'), weights=data$weight) %>% broom::tidy() row = which(model$term==rhs) coef = model[row, 'estimate'] %>% as.numeric() se = model[row, 'std.error'] %>% as.numeric() p = model[row, 'p.value'] %>% as.numeric() } z = qnorm(0.995) return(c(coef, p, coef+z*se, coef-z*se)) } probit_dfs = stability_plot(grid = grid1, data = diamonds, lhs = lhs_var, rhs = rhs_var, model = starb_probit, get_mfx = T, perm = perm_controls, base = base_controls, fe_always = F, run_from=3, run_to = 5) # We'll put the probit DFs on the left, so we need to adjust the model numbers accordingly # so the probit and logit DFs don't plot on top of one another! coef_grid_probit = probit_dfs[[1]] %>% mutate(model = model + max(coef_grid_logit$model)) control_grid_probit = probit_dfs[[2]] %>% mutate(model = model + max(control_grid_logit$model)) coef_grid = bind_rows(coef_grid_logit, coef_grid_probit) control_grid = bind_rows(control_grid_logit, control_grid_probit) panels = stability_plot(coef_grid = coef_grid, control_grid = control_grid, data = diamonds, lhs = lhs_var, rhs = rhs_var, perm = perm_controls, base = base_controls, fe_always = F, run_from = 5, run_to = 6) coef_plot = panels[[1]] + geom_vline(xintercept = 8.5, linetype='dashed', alpha=0.8) + annotate(geom='label', x = 4.25, y = 1.8, label = 'Logit models', size=6, fill='#D3D3D3', alpha=0.7) + annotate(geom='label', x = 12.75, y = 1.8, label = 'Probit models', size=6, fill='#D3D3D3', alpha=0.7) + coord_cartesian(ylim = c(-0.5, 1.9)) control_plot = panels[[2]] + geom_vline(xintercept = 8.5, linetype='dashed', alpha=0.8) cowplot::plot_grid(coef_plot, control_plot, rel_heights=c(1,0.5), align='v', ncol=1, axis='b')
Comments, criticism, suggestions, pull requests, etc. are very much appreciated. Email: arao@g.harvard.edu. Thanks to David Yanagizawa-Drott for suggesting the structure of the plot and to Ross Mattheis and Eric Karsten for very helpful feedback.
Thanks for reading!
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.