In bcallaway11/did: Treatment Effects with Multiple Periods and Groups

-- output: html_document --

Example 1: Balanced Panel Data

library(did)
# library(profvis) # not used below, but useful for seeing more details about
# where speedups are happening
packageVersion("did") # version 2.1.0

# Simulation settings passed to reset.sim()
n <- 5000
time.periods <- 8
sp <- reset.sim(n = n, time.periods = time.periods)

data <- build_sim_dataset(sp)
N <- nrow(data)
# add some more spurious columns to data (the model below only uses ~X):
data$X1 <- sample(1:25, size = N, replace = TRUE)
data$X1 <- as.factor(data$X1)
data$X2 <- rnorm(N)
data$X3 <- rnorm(N)
data$X4 <- rnorm(N)
data$X5 <- rnorm(N)

# time the currently attached (new) version: att_gt() followed by the
# dynamic aggregation
pt <- proc.time()
new_dr <- att_gt(
  yname = "Y", xformla = ~X, data = data, tname = "period", idname = "id",
  gname = "G", est_method = "dr"
)
new_dyn <- aggte(new_dr, type = "dynamic")
new_time <- proc.time() - pt

# currently on CRAN
# unload = TRUE also unloads the namespace; with a plain detach() the namespace
# stays loaded and library() would re-attach it, ignoring lib.loc
detach("package:did", unload = TRUE)
library(did, lib.loc = "~/R/old_packages")
packageVersion("did") # should be 2.0.0

# same calls, now timed against the old version
pt <- proc.time()
old_dr <- att_gt(
  yname = "Y", xformla = ~X, data = data, tname = "period", idname = "id",
  gname = "G", est_method = "dr"
)
old_dyn <- aggte(old_dr, type = "dynamic")
old_time <- proc.time() - pt

# check that estimates are the same (up to floating-point tolerance; exact
# `==` on doubles can report FALSE for differences at machine precision)
isTRUE(all.equal(new_dr$att, old_dr$att, check.attributes = FALSE))
# use the full component name `att.egt` (as in the later examples) rather than
# relying on `$` partial matching of `att`
isTRUE(all.equal(new_dyn$att.egt, old_dyn$att.egt, check.attributes = FALSE))

# compare actual times
new_time
old_time

There is may be a little bit of variation due to compiling the source file for this post, but I get about a 70% reduction in computational time with the new code (from 4.6 to 1.3 seconds).

Example 2: Unbalanced Panel Data

# switch back to the new version; unload = TRUE also unloads the namespace so
# that library() does not simply re-attach the previously loaded copy
detach("package:did", unload = TRUE)
library(did)
packageVersion("did") # version 2.1.0

# Simulation settings passed to reset.sim()
n <- 5000
time.periods <- 8
sp <- reset.sim(n = n, time.periods = time.periods)

data <- build_sim_dataset(sp)
N <- nrow(data)
# add some more spurious columns to data (the model below only uses ~X):
data$X1 <- sample(1:25, size = N, replace = TRUE)
data$X1 <- as.factor(data$X1)
data$X2 <- rnorm(N)
data$X3 <- rnorm(N)
data$X4 <- rnorm(N)
data$X5 <- rnorm(N)
data <- data[-1, ] # drop first row so the panel is no longer balanced

# time the new version on the unbalanced panel
pt <- proc.time()
new_dr <- att_gt(
  yname = "Y", xformla = ~X, data = data, tname = "period", idname = "id",
  gname = "G", est_method = "dr", allow_unbalanced_panel = TRUE
)
new_dyn <- aggte(new_dr, type = "dynamic")
new_time <- proc.time() - pt

# currently on CRAN
# unload = TRUE so that lib.loc below is actually honoured (a plain detach()
# leaves the namespace loaded and library() would re-attach it)
detach("package:did", unload = TRUE)
library(did, lib.loc = "~/R/old_packages")
packageVersion("did") # should be 2.0.0

# same calls, now timed against the old version
pt <- proc.time()
old_dr <- att_gt(
  yname = "Y", xformla = ~X, data = data, tname = "period", idname = "id",
  gname = "G", est_method = "dr", allow_unbalanced_panel = TRUE
)
old_dyn <- aggte(old_dr, type = "dynamic")
old_time <- proc.time() - pt

# check that estimates are the same (up to floating-point tolerance; exact
# `==` on doubles can report FALSE for differences at machine precision)
isTRUE(all.equal(new_dr$att, old_dr$att, check.attributes = FALSE))
# use the full component name `att.egt` (as in the later examples) rather than
# relying on `$` partial matching of `att`
isTRUE(all.equal(new_dyn$att.egt, old_dyn$att.egt, check.attributes = FALSE))

# compare actual times
new_time
old_time

Again, there may be variation due to compiling the document, but here I get about a 97% reduction in computation time (from 52 seconds to about 1.5 seconds).

Example 3: Repeated cross sections data

# switch back to the new version; unload = TRUE also unloads the namespace so
# that library() does not simply re-attach the previously loaded copy
detach("package:did", unload = TRUE)
library(did)
packageVersion("did") # version 2.1.0

# Simulation settings passed to reset.sim()
n <- 5000
time.periods <- 8
sp <- reset.sim(n = n, time.periods = time.periods)

# panel = FALSE here and in att_gt() below: repeated cross sections
data <- build_sim_dataset(sp, panel = FALSE)
N <- nrow(data)
# add some more spurious columns to data (the model below only uses ~X):
data$X1 <- sample(1:25, size = N, replace = TRUE)
data$X1 <- as.factor(data$X1)
data$X2 <- rnorm(N)
data$X3 <- rnorm(N)
data$X4 <- rnorm(N)
data$X5 <- rnorm(N)

# time the new version
pt <- proc.time()
new_dr <- att_gt(
  yname = "Y", xformla = ~X, data = data, tname = "period", idname = "id",
  gname = "G", est_method = "dr", panel = FALSE
)
new_dyn <- aggte(new_dr, type = "dynamic")
new_time <- proc.time() - pt

# currently on CRAN
# unload = TRUE so that lib.loc below is actually honoured (a plain detach()
# leaves the namespace loaded and library() would re-attach it)
detach("package:did", unload = TRUE)
library(did, lib.loc = "~/R/old_packages")
packageVersion("did") # should be 2.0.0

# same calls, now timed against the old version
pt <- proc.time()
old_dr <- att_gt(
  yname = "Y", xformla = ~X, data = data, tname = "period", idname = "id",
  gname = "G", est_method = "dr", panel = FALSE
)
old_dyn <- aggte(old_dr, type = "dynamic")
old_time <- proc.time() - pt

# check that estimates are the same (up to floating-point tolerance; exact
# `==` on doubles can report FALSE for differences at machine precision)
isTRUE(all.equal(new_dr$att, old_dr$att, check.attributes = FALSE))
isTRUE(all.equal(new_dyn$att.egt, old_dyn$att.egt, check.attributes = FALSE))

# compare actual times
new_time
old_time

In this case, I get about a 75% reduction in computation time (from 2.8 to 0.6 seconds).

Example 4: Bigger data

# switch back to the new version; unload = TRUE also unloads the namespace so
# that library() does not simply re-attach the previously loaded copy
detach("package:did", unload = TRUE)
library(did)
packageVersion("did") # version 2.1.0

# Larger simulation settings than in the earlier examples
n <- 25000
time.periods <- 20
sp <- reset.sim(n = n, time.periods = time.periods)

data <- build_sim_dataset(sp)
N <- nrow(data)
# add some more spurious columns to data (the model below only uses ~X):
data$X1 <- sample(1:25, size = N, replace = TRUE)
data$X1 <- as.factor(data$X1)
data$X2 <- rnorm(N)
data$X3 <- rnorm(N)
data$X4 <- rnorm(N)
data$X5 <- rnorm(N)

# time the new version
pt <- proc.time()
new_dr <- att_gt(
  yname = "Y", xformla = ~X, data = data, tname = "period", idname = "id",
  gname = "G", est_method = "dr"
)
new_dyn <- aggte(new_dr, type = "dynamic")
new_time <- proc.time() - pt

# currently on CRAN
# unload = TRUE so that lib.loc below is actually honoured (a plain detach()
# leaves the namespace loaded and library() would re-attach it)
detach("package:did", unload = TRUE)
library(did, lib.loc = "~/R/old_packages")
packageVersion("did") # should be 2.0.0

# same calls, now timed against the old version
pt <- proc.time()
old_dr <- att_gt(
  yname = "Y", xformla = ~X, data = data, tname = "period", idname = "id",
  gname = "G", est_method = "dr"
)
old_dyn <- aggte(old_dr, type = "dynamic")
old_time <- proc.time() - pt

# check that estimates are the same (up to floating-point tolerance; exact
# `==` on doubles can report FALSE for differences at machine precision)
isTRUE(all.equal(new_dr$att, old_dr$att, check.attributes = FALSE))
isTRUE(all.equal(new_dyn$att.egt, old_dyn$att.egt, check.attributes = FALSE))

# compare actual times
new_time
old_time