Nothing
#' @importFrom Rcpp sourceCpp
NULL
# Function that pre-process the data to use with drdid, ipwdid and ordid
pre_process_drdid <- function(yname,
tname,
idname,
dname,
xformla = NULL,
data,
panel = TRUE,
normalized = TRUE,
estMethod = c("imp", "trad"),
weightsname = NULL,
boot = FALSE,
boot.type = c("weighted", "multiplier"),
nboot = NULL,
inffunc = FALSE) {
#-----------------------------------------------------------------------------
# Data pre-processing and error checking
#-----------------------------------------------------------------------------
# ####
# set estimation method
estMethod <- estMethod[1]
# set bootstrap type
boot.type <- boot.type[1]
# Flag for estMethod
if ( (estMethod!="imp") && (estMethod!="trad")) {
warning("estMethod = ",estMethod, " is not supported. Using 'imp'.")
estMethod <- "imp"
}
# Flag for boot.type
if (boot){
if ( (boot.type!="weighted") && (boot.type!="multiplier")) {
warning("boot.type = ",boot.type, " is not supported. Using 'weighted'.")
boot.type <- "weighted"
}
}
# Flag for normalized
if ( (normalized != TRUE) && (normalized != FALSE)) {
warning("normalized = ",normalized, " is not supported. Using 'TRUE'.")
normalized <- TRUE
}
# make sure dataset is a data.frame
dta <- data
# this gets around RStudio's default of reading data as tibble
if (!all( class(dta) == "data.frame")) {
#warning("class of data object was not data.frame; converting...")
dta <- as.data.frame(dta)
}
# Flag for yname
if ( !is.element(yname, base::colnames(dta))) {
stop("yname = ",yname, " could not be found in the data provided.")
}
# Flag for tname
if ( !is.element(tname, base::colnames(dta))) {
stop("tname = ",tname, " could not be found in the data provided.")
}
# Flag for dname
if ( !is.element(dname, base::colnames(dta))) {
stop("dname = ",dname, " could not be found in the data provided.")
}
# Flag for idname
if ( !is.element(idname, base::colnames(dta))) {
stop("idname = ",idname, " could not be found in the data provided.")
}
#
# if(panel){
# if ( !is.element(idname, base::colnames(dta))) {
# stop("idname = ",idname, " could not be found in the data provided.")
# }
# }
# Flag for weightsname
if(!is.null(weightsname)){
if ( !is.element(weightsname, base::colnames(dta))) {
stop("weightsname = ",weightsname, " could not be found in the data provided.")
}
}
# Flag for covariates
#if (!is.null(xformla)) {
# if(!all(is.element(BMisc::rhs.vars(xformla), base::colnames(dta)))){
# stop("Some of the covariate names could not be found in the data provided. Double check the covariate names in xformla.")
# }
#}
# set weights if null
base::ifelse(is.null(weightsname), w <- rep(1,nrow(dta)), w <- dta[,weightsname])
dta$w <- w
# make sure time periods are numeric
if (! (is.numeric(dta[, tname])) ) {
stop("data[, tname] must be numeric. Please convert it.")
}
# make sure dname is numeric
if (! (is.numeric(dta[, dname])) ) {
stop("data[, dname] must be numeric. Please convert it.")
}
# make sure id is numeric
if (! is.null(idname)){
# make sure id is numeric
if (! (is.numeric(dta[, idname])) ) {
stop("data[, idname] must be numeric. Please convert it.")
}
# Check if idname is unique by tname
n_id_year = base::all( base::table(dta[, idname], dta[, tname]) <= 1)
if (! n_id_year) stop("The value of idname must be the unique (by tname)")
# Check if dname is unique by idname
checkTreatmentUniqueness(dta, idname, dname)
}
# figure out the time periods
# list of dates from smallest to largest
tlist <- unique(dta[,tname])[base::order(unique(dta[,tname]))]
if ( length(tlist)!=2) {
stop("drdid package only covers the cases with two time periods (pre and post) and two treatment groups (d=1 if treated at post, and d=0 if not treated in both pre and post).
See package `did' for the cases with multiple groups and/or multiple time periods.")
}
# list of groups from smallest to largest
glist <- unique(dta[,dname])[base::order(unique(dta[,dname]))]
if ( length(glist)!=2) {
stop("drdid only work for case with two time periods and two treatment groups (d=1 if treated at post, and d=0 if not treated in both pre and post).
See package `did' for the cases with multiple groups and/or multiple time periods.")
}
# put in blank xformla if no covariates
if (is.null(xformla)) {
xformla <- ~1
}
# # If repeated cross section, allow for null idname
# if(is.null(idname) & (panel == FALSE)){
# dta$id <- seq(1:nrow(dta))
# idname <- "id"
# }
# sort data with respect to id and time
dta <- dta[base::order(dta[,idname], dta[,tname]),]
# If panel, make it a balanced data set
if (panel) {
dta <- BMisc::makeBalancedPanel(dta, idname, tname)
}
# Outcome variable will be denoted by y
dta$y <- dta[, yname]
# group will be denoted by D
dta$D <- dta[, dname]
# Post dummy will be denoted by post
dta$post <- as.numeric(dta[,tname] == tlist[2])
# matrix of covariates
covariates <- stats::model.matrix(xformla,
stats::model.frame(xformla,
data = dta,
na.action=na.pass) )
#check if covariates and group are time invariant in the panel data case.
# matrix of covariates for pre-period and post periods
covariates_pre <- stats::model.matrix(xformla,
data=subset(dta, dta$post==0))
covariates_post <- stats::model.matrix(xformla,
data=subset(dta, dta$post==1))
d_pre <- subset(dta$D, dta$post==0)
d_post <- subset(dta$D, dta$post==1)
w_pre <- subset(dta$w, dta$post==0)
w_post <- subset(dta$w, dta$post==1)
if (panel) {
if (!all(covariates_pre==covariates_post)) {
stop("Error: covariates should be time invariant, and there should be no missing data.")
}
if (!all(d_pre==d_post)) {
stop("Error: group indicator must be time invariant.")
}
if (!all(w_pre==w_post)) {
stop("Error: weights must be time invariant.")
}
}
# check against very small groups
gsize <- stats::aggregate(dta$D,
by=list(dta$D),
function(x) length(x)/length(tlist))
# how many in each group before give warning
# 5 is just a buffer, could pick something else, but seems to work fine
reqsize <- length(BMisc::rhs.vars(xformla)) + 5
# which groups to warn about
gsize <- subset(gsize,
gsize$x < reqsize) # x is name of column from gsize
# warn if some groups are small
if (nrow(gsize) > 0) {
stop(paste0(" Either treatment or the comparison group in your dataset is very small. Inference is not feasible "))
}
#-----------------------------------------------------------------------------
# setup data in panel case
if (panel) {
# make it a balanced data set
dta <- BMisc::makeBalancedPanel(dta, idname, tname)
# Only use this smaller dataset
dta <- as.data.frame(cbind(y = dta$y,
D = dta$D,
post = dta$post,
w = dta$w,
covariates))
# Check if we have missing values
missing_X_flag <- base::anyNA(dta[,-c(1:4)])
missing_D_flag <- base::anyNA(dta$D)
missing_post_flag <- base::anyNA(dta$post)
missing_y_flag <- base::anyNA(dta$y)
missing_w_flag <- base::anyNA(dta$w)
# Warning messages
if(missing_X_flag) warning("Some covariates are missing (NA). We are dropping those observations")
if(missing_D_flag) warning("Some treatment indicators are missing (NA). We are dropping those observations")
if(missing_post_flag) warning("Some post-treatment indicators are missing (NA). We are dropping those observations")
if(missing_w_flag) warning("Some weights are missing (NA). We are dropping those observations")
if(missing_y_flag) warning("Some outcomes are missing (NA). We are dropping those observations")
# Remove NAs
dta <- dta[stats::complete.cases(dta), ]
covariates <- base::as.matrix(dta[,-c(1:4)])
# Drop collinear covariates
qr.covariates <- base::qr(covariates,
tol=1e-6,
LAPACK = FALSE)
rnk_covariates <- qr.covariates$rank
keep_x <- qr.covariates$pivot[seq_len(rnk_covariates)]
# Check if any covariates were dropped due to collinearity
dropped_covariates <- setdiff(colnames(covariates), colnames(covariates[, keep_x]))
if (length(dropped_covariates) > 0) {
warning("The following covariates were dropped due to collinearity: ", paste(dropped_covariates, collapse = ", "))
}
covariates <- covariates[, keep_x]
out <- list(y1 = subset(dta$y, dta$post==1),
y0 = subset(dta$y, dta$post==0),
D = subset(dta$D, dta$post==1),
covariates = subset(covariates, dta$post==1),
# Normalize weights
i.weights = subset(dta$w, dta$post==1)/mean(subset(dta$w, dta$post==1)),
panel = panel,
estMethod = estMethod,
normalized = normalized,
boot = boot,
boot.type = boot.type,
nboot = nboot,
inffunc = inffunc
)
}
else {
# Only use this smaller dataset
dta <- as.data.frame(cbind(y = dta$y,
D = dta$D,
post = dta$post,
w = dta$w,
covariates))
# Check if we have missing values
missing_X_flag <- base::anyNA(dta[,-c(1:4)])
missing_D_flag <- base::anyNA(dta$D)
missing_post_flag <- base::anyNA(dta$post)
missing_y_flag <- base::anyNA(dta$y)
missing_w_flag <- base::anyNA(dta$w)
# Warning messages
if(missing_X_flag) warning("Some covariates are missing (NA). We are dropping those observations")
if(missing_D_flag) warning("Some treatment indicators are missing (NA). We are dropping those observations")
if(missing_post_flag) warning("Some post-treatment indicators are missing (NA). We are dropping those observations")
if(missing_w_flag) warning("Some weights are missing (NA). We are dropping those observations")
if(missing_y_flag) warning("Some outcomes are missing (NA). We are dropping those observations")
# Remove NAs
dta <- dta[stats::complete.cases(dta), ]
covariates <- base::as.matrix(dta[,-c(1:4)])
# Drop collinear covariates
qr.covariates <- base::qr(covariates,
tol=1e-6,
LAPACK = FALSE)
rnk_covariates <- qr.covariates$rank
keep_x <- qr.covariates$pivot[seq_len(rnk_covariates)]
covariates <- covariates[,keep_x]
out <- list(y = dta$y,
D = dta$D,
post = dta$post,
covariates = covariates,
# Normalize weights
i.weights = dta$w/mean(dta$w),
panel = panel,
estMethod = estMethod,
normalized = normalized,
boot = boot,
boot.type = boot.type,
nboot = nboot,
inffunc = inffunc
)
}
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.