#' Calculates balance statistics for matching.
#'
#' @param df data frame with data to match.
#' @param formu the formula used to estimate propensity scores using logistic
#' regression.
#' @param formu.Y (optional) the formula used to estimate a multiple regression model with
#' covariates. Used to estimate covariate importance.
#' @param index.treated a vector of integers corresponding to the rows in \code{df}
#' for the matched treated units. That is, \code{index.treated[i]} is matched
#' to \code{index.control[i]}.
#' @param index.control a vector of integers corresponding to the rows in \code{df}
#' for the matched control units. That is, \code{index.treated[i]} is matched
#' to \code{index.control[i]}.
#' @param exact.covs character vector of covariates to match exactly on. For
#' quantitative variables with more than two unique values, matches will
#' be made within quintiles.
#' @param tolerance the maximum standard deviation the difference between matches
#' on a quantitative covariate can be to be considered matched.
#' @param n.levels maximum number of unique values in a covariate to attempt to
#' match exactly. If the number of unique values is greater than \code{n.levels},
#' then the covariate will be cut using quantiles and exact matches will
#' be estimated within each quantile.
#' @param ... other parameters passed to \code{\link{Match}}.
#' @return An object of class \code{MatchBalance} with the following elements:
#' \describe{
#' \item{ps.out}{Results from the logistic regression model for treatment.}
#' \item{Y.out}{Results from the multiple regression model for Y.}
#' \item{covariate.matched}{a named numeric vector containing the percent
#' of observations with "perfect" match for each covariate.}
#' \item{covariate.balance}{a named numeric vector containing the percent
#' of covariates with "perfect" match for each observation.}
#' \item{covariate.importance}{a data frame with covariate importance.
#' Covariate importance is estimated using the sum of the absolute
#' t-statistic from both glm calls for treatment and outcome.}
#' \item{index.treated}{vector containing observation row number from original
#' data frame for treated observations.}
#' \item{index.control}{vector containing observatino row number from original
#' data frame for control observations.}
#' }
#' @export
MatchBalance <- function(df, formu, formu.Y, exact.covs,
index.treated, index.control,
tolerance = 0.25,
n.levels = 2, ...) {
n.strata <- 5
if(is.logical(class(df[,all.vars(formu)[1]]))) {
df[,all.vars(formu)[1]] <- as.integer(df[,all.vars(formu)[1]])
}
lr.out <- glm(formu, family=binomial, data=df)
glm.out <- NULL
if(!missing(formu.Y)) {
glm.out <- glm(formu.Y, data=lr.out$data)
}
var.imp <- data.frame(Treat=summary(lr.out)$coefficients[,'z value'])
if(!missing(formu.Y)) {
var.imp$Y <- summary(glm.out)$coefficients[,'t value']
var.imp$Total <- abs(var.imp$Treat) + abs(var.imp$Y)
} else {
var.imp$Total <- abs(var.imp$Treat)
}
var.imp <- var.imp[-1,] # Remove intercept
var.imp <- var.imp[order(var.imp$Total, decreasing=TRUE),,drop=FALSE]
if(!missing(index.treated) & !missing(index.control)) {
if(!missing(exact.covs)) {
stop('Specifying exact.covs is not supported when also specifying
index.treated and index.control')
}
if(length(index.treated) != length(index.control) &
length(index.treated) != nrow(df)) {
stop('length(index.treated) != length(index.control) != nrow(df)')
}
} else {
exact <- list()
if(!missing(exact.covs)) {
for(i in exact.covs) {
if(is.numeric(df[,i]) & length(unique(df[,i])) > n.levels) {
q <- quantile(df[,i], seq(0, 1, 1/n.strata))
if(length(unique(q)) != (n.strata+5)) {
# breaks would not be unique so we'll treat as a qualitative covariate
exact[[i]] <- df[,i]
} else {
exact[[i]] <- cut(df[,i], q, include.lowest=TRUE,
labels=letters[1:n.strata])
}
} else {
exact[[i]] <- df[,i]
}
}
}
if(length(exact) > 0) {
match.out <- Matchby(Tr = df[,all.vars(formu)[1]],
X = fitted(lr.out),
by = exact,
print.level = 0, ... )
} else {
match.out <- Match(Tr = df[,all.vars(formu)[1]],
X = fitted(lr.out),
... )
}
index.treated <- match.out$index.treated
index.control <- match.out$index.control
}
df.matrix <- model.matrix(formu, data=df)
df.matrix <- df.matrix[,-1] # Remove the intercept
df.cov.bal <- data.frame(row.names=1:length(index.treated))
for(i in seq_len(ncol(df.matrix))) {
if(is.factor(df.matrix[,i]) |
is.character(df.matrix[,i]) |
is.logical(df.matrix[,i])) {
# This should not occur. Left over from using data.frame
df.cov.bal[,colnames(df.matrix)[i]] <-
df.matrix[index.treated,i] == df.matrix[index.control,i]
} else if(is.numeric(df.matrix[,i])) {
diff <- abs(df.matrix[index.treated,i] - df.matrix[index.control,i])
df.cov.bal[,colnames(df.matrix)[i]] <- diff / sd(df.matrix[,i])
} else {
warning(paste0('Unknown column type for ', names(df)[i]))
}
}
col.percents <- apply(df.cov.bal[,1:(ncol(df.cov.bal))] < tolerance, 2, sum) /
nrow(df.cov.bal) * 100
row.percents <- apply(df.cov.bal[,1:(ncol(df.cov.bal))] < tolerance, 1, sum) /
ncol(df.cov.bal) * 100
results <- list(
df = df,
df.matrix = df.matrix,
ps.out = lr.out,
Y.out = glm.out,
covariate.matched = col.percents,
observation.matched = row.percents,
covariate.balance = df.cov.bal,
covariate.importance = var.imp,
index.treated = index.treated,
index.control = index.control,
tolerance = tolerance
)
class(results) <- 'MatchBalance'
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.