#' @title Identify controls and estimate intervention effects.
#' @description \code{idd} takes a long-form panel dataset containing event count and exposure data, obtains the optimal controls and estimates an incidence difference-in-differences model.
#'
#' @param eventvar Name of the event count variable.
#' @param popvar The person-time (or another exposure) variable.
#' @param treatvar The treatment group indicator dummy (0 for controls, 1 for treated units).
#' @param postvar The post-period indicator dummy (0 for all time points in the pre-intervention period, 1 for all time points in the post-period)
#' @param timevar The time variable (can be coded on any scale).
#' @param idvar The panel id variable.
#' @param names A variable containing unit names (optional). Changes the output of id.selected to show unit names of selected controls.
#' @param print Print results? Default=TRUE.
#' @param data A long-form panel dataset containing the supplied variables.
#' @param mult Multiplier for the rates produced by print (default=100000).
#'
#' @return Returns a list containing the following elements:
#' \code{$Resdat}: a data frame containing the results.
#' \code{$cv_errors}: a data frame containing the cross-validation errors for the k-minimizing function.
#' \code{$supp_stats}: a data frame containing supplementary statistics (average effects, relative effects, p-values etc).
#' \code{$id_controls}: a data frame containing information on the selected controls.
#' @examples
#' \dontrun{
#' data(simpanel)
#' idd.out <- idd(eventvar="y",
#' popvar="pop",
#' idvar="age",
#' timevar="time",
#' postvar="post",
#' treatvar="treat",
#' data=simpanel)
#' }
#' @export
idd <- function(eventvar, popvar, treatvar, postvar, timevar, idvar, names=NULL, print=TRUE, data, mult=100000) {
#Generate data
E <- data[,eventvar]
P <- data[,popvar]
TR <- data[,treatvar]
PO <- data[,postvar]
TI <- data[,timevar]
id <- data[,idvar]
namevar <- as.character(data[,names])
TI_c <- TI-(min(TI)-1)
rate = NULL
df <- as.data.frame(cbind(E, P, TR, PO, TI, TI_c, id))
##Prelimiary checks
##Missing data?
#Missing event data?
if (sum(is.na(df$E))>0) {
stop("Event data is missing on at least one observation, please remove units with gaps.")
}
#Missing exposure data?
if (sum(is.na(df$P))>0) {
stop("Exposure data is missing on at least one observation, please remove units with gaps.")
}
#Balanced panel?
tfirst <- doBy::summaryBy(TI~id, FUN=min, data=df, var.names=c("tmin"))
first.test <- length(unique(tfirst$tmin))
tlast <- doBy::summaryBy(TI~id, FUN=max, data=df, var.names=c("tmax"))
last.test <- length(unique(tlast$tmax))
if (first.test>1) {
stop("Data is unbalanced, please supply a balanced panel dataset.")
}
if (last.test>1) {
stop("Data is unbalanced, please supply a balanced panel dataset.")
}
#Subset pre-period.
predf <- subset(df, PO==0)
#More than 2 data points in the pre-period?
if (max(predf$TI_c)<3) {
stop("Pre-period is too short, the idd requires at least 3 data points to run.")
}
#At least 1 data point in the post-period?
if (max(df$PO)<1) {
stop("No post period found, need data before and after the intervention (code post period dummy as 0 in the before period and 1 after).")
}
#Treatment data, pre-period
X1 <- subset(predf, TR==1)
#Control data, pre-period.
X0 <- subset(predf, TR==0)
#Aggregate treatment group for use in matching later.
sumtreat <- doBy::summaryBy(E + P ~ TI, FUN=sum, data=X1, var.names=c("y", "p"))
sumtreat$inc <- sumtreat$y/sumtreat$p ##Compute incidence rate (not multiplied at this point)
X1 <- cbind(sumtreat$TI, sumtreat$inc, row.names = NULL)
X1 <- as.data.frame(X1)
X1$id <- rep(-1337, nrow(X1))
colnames(X1) <- c("time", "inc", "id")
#Generate incidence rate for controls
X0$inc <- X0$E/X0$P
X0 <- cbind(X0$TI, X0$inc, X0$id, row.names = NULL)
colnames(X0) <- c("time", "inc", "id")
#Reshape wide X1 and X0 matrices wide.
X0.w <- stats::reshape(as.data.frame(X0), idvar = "id", timevar = "time", direction = "wide")
X1.w <- stats::reshape(as.data.frame(X1), timevar = "time", direction = "wide")
#Bind data
X <- rbind(X1.w, X0.w) ##First row is treatment group (we use this knowledge later.)
#Take out fixed effects
Xprim <- X
for (i in 1:nrow(Xprim[,2:ncol(Xprim)])) {
Xprim[i,2:ncol(Xprim)] <- Xprim[i,2:ncol(Xprim)]-rowMeans(Xprim[i,2:ncol(Xprim)])
}
cl <- scale(as.matrix(Xprim[,2:ncol(Xprim)]))
#Calculate squared distances from trunit
x_ctrl <- cl[2:nrow(cl),]
x_tr <- cl[1,]
sq_dist <- apply(x_ctrl, 1, function(x) (x_tr-x)^2)
distvec <- rank(as.vector(sqrt(colSums(sq_dist))), ties.method="first")
#Gen treat output data
treatout <- doBy::summaryBy(E + P ~ TI_c, FUN=sum, data=subset(df, TR==1), var.names=c("y1", "p1"), id="PO")
###################################################
#Identify and cross-validate k nearest neighbours##
###################################################
if (isTRUE(print)) {
cat("Matching, please wait...", sep="")
pb <- utils::txtProgressBar(min = 0, max = nrow(sq_dist), style = 3)
}
cv.error <- as.list(rep(NA),nrow(sq_dist)) ##Empty store list
cv.error2 <- as.list(rep(NA), length(rankvec)) ##Empty store list
for (t in 1:nrow(sq_dist)) { #Loop over each t in T0 (pre-period)
rankvec <- rank(as.vector(sqrt(colSums(sq_dist[-t,]))), ties.method="first") #Remove 1 obs (in time) and rank and calculate Euc distance
idrank <- as.data.frame(cbind(X[-1,1], rankvec, row.names = NULL)) #Match ranking to id numbers
colnames(idrank) <- c("id", "rank") #Change column names
kneardat <- merge(df, idrank, by=c("id")) #set rank for t-training data
for (k in 1:length(rankvec)) { #Loop over all possible selections of k (1 nearest through N nearest - i.e. entire sample)
##Summarize control data for rank<=k
ctrlout <- doBy::summaryBy(E + P ~ TI_c, FUN=sum, data=subset(kneardat, rank<=k), var.names=c("y0", "p0"), id="PO")
#Compute cv error using estimation formula
outdat <- cbind(ctrlout, treatout, row.names = NULL)
predat <- subset(outdat, PO==0)
t0 <- sum(predat$y1.sum)/sum(as.numeric(predat$p1.sum))
c0 <- sum(predat$y0.sum)/sum(as.numeric(predat$p0.sum))
y0 <- outdat$y0.sum/outdat$p0.sum
y1 <- outdat$y1.sum/outdat$p1.sum
time <- outdat$TI_c
cf <- (t0-c0)+y0
effect <- y1-cf
temp <- as.data.frame(cbind(effect, time, row.names = NULL))
cv.error[[k]] <- subset(temp$effect, time==t)^2 #Store squared cv error for k
}
cv.error2[[t]] <- plyr::ldply(cv.error, data.frame) #Store for t, k
cv.error2[[t]][,2] <- 1:length(rankvec) #add reference number for k (for summary)
if (isTRUE(print)) {
utils::setTxtProgressBar(pb, t)
}
}
errordat <- plyr::ldply(cv.error2, data.frame) #Flatten list to data frame
colnames(errordat) <- c("cv", "k")
k.mean <- doBy::summaryBy(cv~k, FUN=mean, data=errordat) #Compute mean cross-validation error for each k
k.mean$cv.mean <- sqrt(k.mean$cv.mean)
k.min <- which.min(k.mean$cv.mean) #Extract best selection for k according to leave-one-out cross validation on RMSE
cv.rmse <- k.mean$cv.mean[k.min] #cv error for the matched k
cv.fullsample <- k.mean$cv.mean[nrow(k.mean)] #cv error for the entire sample (store for comparison)
#######################################
##Obtain effect estimates for best k ##
#######################################
rankvec <- rank(as.vector(sqrt(colSums(sq_dist))), ties.method="first") #Rank using all T0 obs
distance <- as.vector(sqrt(colSums(sq_dist)))
idrank <- as.data.frame(cbind(X[-1,1], rankvec, distance, row.names = NULL)) #Match ranking to id numbers
colnames(idrank) <- c("id", "rank", "euc_dist") #Change column names
ctrldat <- merge(df, idrank, by=c("id")) #set ranks for matrix of control data
#Obtain ids of k*-controls:
if (is.null(names)) {
id.selected <- subset(as.data.frame(idrank), idrank$rank<=k.min)
}
else if (!is.null(names)) { #If the names variable has been specified, supply names as well.
idx <- subset(as.data.frame(idrank), idrank$rank<=k.min)
namedat <- cbind(id, namevar, row.names = NULL)
colnames(namedat) <- c("id", "name")
id.selected <- merge(idx, namedat, by=c("id"))
colnames(id.selected) <- c("id", "rank", "euc_dist","name")
id.selected <- stats::aggregate(cbind(id, rank, euc_dist, row.names=NULL) ~ name, data=id.selected, FUN=mean)
}
#Obtain matched dataset
mtchctrl <- subset(ctrldat, rank<=k.min)
treatdat <- subset(df, TR==1)
treatdat$rank <- NA
treatdat$euc_dist <- NA
matched_data <- rbind(treatdat, mtchctrl)
matched_data$rate <- matched_data$E/matched_data$P
#Summarize control data
ctrlout <- doBy::summaryBy(E + P ~ TI, FUN=sum, data=subset(ctrldat, rank<=k.min), var.names=c("y0", "p0"), id="PO")
donorpool <- doBy::summaryBy(E + P ~ TI, FUN=sum, data=ctrldat, var.names=c("yx", "px"), id="PO")
#Summarize treat data
treatout <- doBy::summaryBy(E + P ~ TI, FUN=sum, data=subset(df, TR==1), var.names=c("y1", "p1"), id="PO")
#Combine
outdat <- cbind(ctrlout, treatout, donorpool, row.names = NULL)
#Get pre-period for means
predat <- subset(outdat, PO==0)
##compute stats: prep
t0 <- sum(as.numeric(predat$y1.sum))/sum(as.numeric(predat$p1.sum))
c0 <- sum(as.numeric(predat$y0.sum))/sum(as.numeric(predat$p0.sum))
y0 <- outdat$y0.sum/outdat$p0.sum
y1 <- outdat$y1.sum/outdat$p1.sum
time <- outdat$TI
#Get original donor pool data for comparison to matched controls
cx <- sum(as.numeric(predat$yx.sum))/sum(as.numeric(predat$px.sum))
yx <- outdat$yx.sum/outdat$px.sum
#Design effects
icc <- ICC::ICCbare(as.factor(id), rate, data=matched_data)
m <- length(unique(df$TI))
if (isTRUE(icc<0) == TRUE) { #Check if ICC estimate is negative, if so, reset to zero.
icc <- 0
}
DEFT <- sqrt(1+icc*(m-1))
m2 <- length(unique(predat$TI))+1
DEFT.t <- sqrt(1+icc*(m2-1))
#Compute time-varying effects
cf <- (t0-c0)+y0
effect <- y1-cf
cf.donor <- (t0-cx)+yx
se <- sqrt(sum(predat$y1.sum)/(sum(as.numeric(predat$p1.sum))^2)+sum(predat$y0.sum)/(sum(as.numeric(predat$p0.sum))^2)+outdat$y0.sum/(as.numeric(outdat$p0.sum^2))+outdat$y1.sum/(as.numeric(outdat$p1.sum^2)))*DEFT.t
z <- abs(effect/se)
pval <- 2*stats::pnorm(-z)
#Store time-varying effects matrix
post <- outdat$PO
y0.ctrl <- y0
y0.donors <- yx
res <- as.data.frame(cbind(y1,y0.ctrl,cf,effect,se,pval,time,post,cf.donor,y0.donors, row.names = NULL))
#Compute DD estimates for average effects
postdat <- subset(outdat, PO==1)
t1 <- sum(as.numeric(postdat$y1.sum))/sum(as.numeric(postdat$p1.sum))
c1 <- sum(as.numeric(postdat$y0.sum))/sum(as.numeric(postdat$p0.sum))
c1x <- sum(as.numeric(postdat$yx.sum))/sum(as.numeric(postdat$px.sum))
dd <- (t1-t0)-(c1-c0)
dd_se <- sqrt(sum(predat$y1.sum)/(sum(as.numeric(predat$p1.sum))^2)+sum(predat$y0.sum)/(sum(as.numeric(predat$p0.sum))^2)+sum(postdat$y1.sum)/(sum(as.numeric(postdat$p1.sum))^2)+sum(postdat$y0.sum)/(sum(as.numeric(postdat$p0.sum))^2))*DEFT
dd_z <- abs(dd/dd_se)
dd_pval <- 2*stats::pnorm(-dd_z)
dd_donor <- (t1-t0)-(c1x-cx)
#Compute cumulative effects (in events, entire T1-period - based on time-varying effect)
posteffect <- subset(res, post==1)$effect
postse <- subset(res, post==1)$se
cm.post <- sum(posteffect*as.numeric(postdat$p1.sum))
cm.se <- abs(cm.post/dd_z)
cm.events.lower <- (cm.post-cm.se*stats::qnorm(0.975))
cm.events.upper <- (cm.post+cm.se*stats::qnorm(0.975))
cm.events <- cm.post
#Compute relative effect
post.events <- sum(postdat$y1.sum)
ratio <- (post.events)/(post.events-cm.events)
lnratio <- log(ratio)
rrse <- sqrt((1/sum(predat$y1))+(1/sum(predat$y0))+(1/sum(postdat$y1))+(1/sum(postdat$y0)))*DEFT
ratio.up <- exp(lnratio+stats::qnorm(0.975)*rrse)
ratio.lo <- exp(lnratio-stats::qnorm(0.975)*rrse)
#Compute some basic stats for epidemiological tables and classic DD plotting
y_tr0 <- sum(as.numeric(predat$y1.sum))
y_tr1 <- sum(as.numeric(postdat$y1.sum))
y_c0 <- sum(as.numeric(predat$y0.sum))
y_c1 <- sum(as.numeric(postdat$y0.sum))
p_tr0 <- sum(as.numeric(predat$p1.sum))
p_tr1 <- sum(as.numeric(postdat$p1.sum))
p_c0 <- sum(as.numeric(predat$p0.sum))
p_c1 <- sum(as.numeric(postdat$p0.sum))
epitable <- as.data.frame(cbind(y_tr0,p_tr0,y_tr1,p_tr1,y_c0,p_c0,y_c1,p_c1))
#Store results
#Spit out some basic info first. See stored object for time-varying results.
if (isTRUE(print)) {
cat("\n", "Matching results", "\n",
"----------------", "\n",
"Number of potential controls: ", nrow(k.mean), "\n",
"Number of controls used (k*): ", k.min, "\n",
"Cross-validation error before matching: ", cv.fullsample, "\n",
"Cross-validation error after matching: ", cv.rmse, "\n",
"Error ratio (after/before matching): ", round(cv.rmse/cv.fullsample, 5), "\n",
"Intra-class correlation coefficient (ICC): ", icc, "\n",
"Design effect (multipler for standard errors) ", round(DEFT, 5), "\n",
"\n",
"Difference-in-differences results", "\n",
"---------------------------------", "\n",
"Average effect (rate difference, 95% CI): ",
round(dd*mult, 4), " (",
round((dd-dd_se*stats::qnorm(0.975))*mult, 4), ",",
round((dd+dd_se*stats::qnorm(0.975))*mult, 4), ")",
"\n",
"Standard error: ", round(dd_se*mult, 4), "\n",
"P-value: ", round(dd_pval, 10), "\n", "\n",
"Alternative estimators", "\n",
"----------------------", "\n",
"Cumulative effect (in events, 95% CI): ",
round(cm.events, 1), " (",
round(cm.events.lower, 1), ",",
round(cm.events.upper, 1), ")", "\n",
"Ratio effect (compared to counterfactual, 95% CI): ",
round(ratio, 4), " (",
round(ratio.lo, 4), ",",
round(ratio.up, 4), ")",
"\n",
"\n",
"Data table", "\n",
"----------", "\n",
"Treated (pre-period) || Event freq.: ", y_tr0, " | Exposure-time: ", p_tr0, " | Rate: ", (y_tr0)/(p_tr0)*mult, "\n",
"Treated (post-period) || Event freq.: ", y_tr1, " | Exposure-time: ", p_tr1, " | Rate: ", (y_tr1)/(p_tr1)*mult, "\n",
"Control (pre-period) || Event freq.: ", y_c0, " | Exposure-time: ", p_c0, " | Rate: ", (y_c0)/(p_c0)*mult, "\n",
"Control (post-period) || Event freq.: ", y_c1, " | Exposure-time: ", p_c1, " | Rate: ", (y_c1)/(p_c1)*mult, "\n", "\n",
"Notes", "\n",
"-----", "\n",
"Rates are multiplied by ", as.integer(mult), ", use mult-command to change.", "\n",
"Time-varying results are stored in object$Resdat.", sep="")
}
results <- as.list(NULL)
results[[1]] <- res
results[[2]] <- k.mean
results[[3]] <- as.data.frame(cbind(k.min,dd,dd_se,dd_pval,cm.events, cm.events.lower, cm.events.upper, ratio, ratio.lo, ratio.up, cv.rmse, dd_donor, row.names = NULL))
results[[4]] <- id.selected
results[[5]] <- epitable
results[[6]] <- matched_data
names(results) <- c("Resdat", "cv_errors", "supp_stats", "id_controls", "epitable", "matched_data")
if (isTRUE(print)) {
base::close(pb)
}
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.