#' summarizeBias
#'
#' Function to summarize bias of estimators across
#' different simulations.
#'
#' @param result Result data frame
#' @param estNames Column name(s) of results data frame that are estimators
#' @param truthName Column name of results data frame that is the true ATE
#' @param ... Other arguments
#'
#' @return A vector with the ordered bias of the estimators
summarizeBias <- function(result, estNames, truthName = "truth", ...){
allBias <- apply(result[,c(estNames)], 2, function(x){
median(abs(x/result[,truthName]-1),na.rm=TRUE)
})
ord <- order(allBias)
allBias[ord]
}
#' summarizeCoverage
#'
#' Function to summarize coverage of confidence intervals across all
#' simulations
#'
#' @param result Result data frame
#' @param cilNames Column name(s) of results data frame that are lower confidence bounds
#' @param ciuNames Column name(s) of results data frame that are upper confidence bounds
#' @param truthName Column name of results data frame that is the true ATE
#' @param ... Other arguments
#'
#' @return A vector with the coverage of the estimators
summarizeCoverage <- function(result, cilNames, ciuNames,
truthName = "truth", ...){
coverage <- rep(NA, length(cilNames))
truth <- result[,truthName]
for(i in 1:length(cilNames)){
cil <- result[,cilNames[i]]
ciu <- result[,ciuNames[i]]
coverage[i] <- mean(cil < truth & ciu > truth, na.rm= TRUE)
}
names(coverage) <- cilNames
coverage
}
#' plotBias
#'
#' Function to plot the median percent bias for a given set of estimators
#' stratified by a given set of covariates
#' @param result The result data frame
#' @param estNames The names of the estimators to plot
#' @param strat The name of the variable on which to stratify
#' @param truthName The column name containing the true ATE
#' @param pointOptions A list of arguments to be passed to points
#' @param legendOptions A list of arguments passed to legend. pointOptions are added
#' automatically
#' @param ylim y-axis limits for the plot. If null, looks for range and adds 5% to it
#' @param estLabels Vector of names to label estimators in legend (default is to use estNames)
#' @param summary A list with two entries named "fn" and "options". This is the summary
#' function used to summarize percent difference. Defaults to median (i.e., \code{function = quantile},
#' \code{options = list(probs = 0.5)}). Assumes that the function will play nicely with the unnamed
#' argument passed in
#' @param pctTruth Report error as absolute percent of true ATE? If \code{FALSE}, error is reported as
#' absolute units of information (i.e., inverse of variance of EIF)
#' @param ... Additional arguments passed to plot
plotBias <- function(result, estNames, strat, truthName = "truth",
pointOptions = list(pch = 1:length(estNames)),
estLabels = estNames,
legendOptions = list(x = "topleft", legend = estLabels),
ylim = NULL,
pctTruth = TRUE,
summary = list(fn = "quantile", options = list(probs=0.5)),
...){
nEst <- length(estNames)
if(is.numeric(result[1,strat])){
strata <- unique(sort(result[,strat]))
}else{
strata <- unique(result[,strat])
}
if(pctTruth){
biasList <- lapply(split(strata, 1:length(strata)), function(s){
apply(result[result[,strat]==s,c(estNames)], 2, function(x){
do.call(summary$fn,
args = c(list(abs(x/result[result[,strat]==s,truthName]-1)),
summary$options))
})
})
}else{
biasList <- lapply(split(strata, 1:length(strata)), function(s){
apply(result[result[,strat]==s,c(estNames)], 2, function(x){
do.call(summary$fn,
args = c(list(x=abs(sqrt(result$n[result[,strat]==s])*(x-result[result[,strat]==s,truthName])/sqrt(result$varEffIC[result[,strat]==s]))),
summary$options))
})
})
}
# get some limit information
thisRange <- range(unlist(biasList))
# add 5% to range to get y-limits
if(is.null(ylim)){
ylim <- thisRange + c(-0.05,0.05)*thisRange
}
# xlimits
xlim <- c(1,length(strata))
# x-axis labels
xaxt_lab <- strata
# set up empty plot
plot(0,0,pch=NULL,xlim=xlim,ylim=ylim,
xaxt = "n",
ylab = ifelse(pctTruth,
expression(paste("|",(hat(psi) - psi[0])/psi[0],"|")),
expression(paste("|",n^{1/2},(hat(psi) - psi[0]),"| / Var")^{1/2}*"(EIF)")
),bty="n",
...)
# add x-axis
axis(side = 1, at = 1:length(strata), label=strata)
# add results in a loop
for(i in 1:length(strata)){
do.call(points, c(list(x=rep(i,nEst),y=biasList[[i]]),
pointOptions))
}
# add a legend
do.call(legend, c(legendOptions, pointOptions))
}
#' plotCoverage
#'
#' Function to plot the coverage for a given set of estimators
#' stratified by a given set of covariates
#' @param result The result data frame
#' @param cilNames Column name(s) of results data frame that are lower confidence bounds
#' @param ciuNames Column name(s) of results data frame that are upper confidence bounds
#' @param strat The name of the variable on which to stratify
#' @param truthName The column name containing the true ATE
#' @param pointOptions A list of arguments to be passed to points
#' @param legendOptions A list of arguments passed to legend. pointOptions are added
#' automatically
#' @param ylim y-axis limits for the plot. If null, looks for range and adds 5% to it
#' @param estLabels Vector of names to label estimators in legend (default is to use estNames)
#' @param ... Additional arguments passed to plot
plotCoverage <- function(result, cilNames, ciuNames, strat, truthName = "truth",
pointOptions = list(pch = 1:length(cilNames)),
estLabels = cilNames,
legendOptions = list(x = "topleft", legend = estLabels),
ylim = NULL,
...){
nEst <- length(cilNames)
if(is.numeric(result[1,strat])){
strata <- unique(sort(result[,strat]))
}else{
strata <- unique(result[,strat])
}
covList <- lapply(split(strata, 1:length(strata)), function(s){
coverage <- rep(NA, length(cilNames))
truth <- result[result[,strat]==s,truthName]
for(i in 1:length(cilNames)){
cil <- result[result[,strat]==s,cilNames[i]]
ciu <- result[result[,strat]==s,ciuNames[i]]
coverage[i] <- mean(cil < truth & ciu > truth, na.rm = TRUE)
}
names(coverage) <- cilNames
coverage
})
# get some limit information
thisRange <- range(unlist(covList))
# add 5% to range to get lower y-limits, upper y-limit is 1
if(is.null(ylim)){
ylim <- c(thisRange[1]-0.05, 1)
}
# xlimits
xlim <- c(1,length(strata))
# x-axis labels
xaxt_lab <- strata
# set up empty plot
plot(0,0,pch=NULL,xlim=xlim,ylim=ylim,
xaxt = "n",
ylab = "Coverage of nominal 95% CI",bty="n",
...)
# add x-axis
axis(side = 1, at = 1:length(strata), label=strata)
# add results in a loop
for(i in 1:length(strata)){
do.call(points, c(list(x=rep(i,nEst),y=covList[[i]]),
pointOptions))
}
# add a legend
do.call(legend, c(legendOptions, pointOptions))
# add a dashed line at 0.95
abline(h = 0.95, lty = 3)
}
#' findBestSims
#'
#' Locate the simulations where a particular estimator is performing the best
#'
#' @param result The results data frame
#' @param estName The estimator that you want to find good simulations for
#' @param compEst Define best by comparing to other estimators or by being better
#' than a certain threshold in absolute error.
#' @param threshBest The threshold that defines a best simulation (in pct of truth or
#' Z-score depend on value of \code{pctTruth})
#' @param compEstNames The estimators that you want to compare the chosen estimator
#' to in order to define good (e.g., all other estimators)
#' @param nSims The maximum number of simulations to return
#' @param pctTruth Base on bias as percent of truth or in units of information
#'
findBestSims <- function(result, estName, compEst = TRUE, compEstNames, nSims = 100,
pctTruth = TRUE, threshBest = 0.05,
truthName = "truth", ...){
# find |est - truth / truth| for all estimators
if(pctTruth){
allBias <- apply(result[,c(estName,compEstNames)], 2, function(x){
abs(x/result[,truthName]-1)
})
}else{
result <- result[!is.na(result$varEffIC),]
allBias <- apply(result[,c(estName,compEstNames)], 2, function(x){
abs(sqrt(result$n)*(x-result[,truthName])/sqrt(result$varEffIC))
})
}
# if defining best by comparing to other estimators
if(compEst){
# get rank for each estimator
allRanks <- apply(allBias, 1, rank, na.last = NA)
if(is.list(allRanks)){
allRanks <- Reduce(rbind, allRanks)
}
# get all simulations where estName wins
bestSims <- which(allRanks[1,] == 1)
# stop if it never wins
if(is.null(bestSims)){
stop("estName never was best estimator!")
}
# restrict to only these results
bestResult <- allBias[bestSims,]
# compute difference between estName and next closest est
nCompEst <- length(compEstNames)
nextClosest <- apply(bestResult, 1, function(x){
nextBest <- which.min(x[1]/x[2:nCompEst]) + 1
x[nextBest]
})
# order of results from biggest to smallest
tmpInd <- order(-nextClosest)
# output
out <- list()
# number of times it was the best
out$pctBest <- length(bestSims)/length(result[,1])
# the best nSim rows of result
if(length(tmpInd) > nSims){
out$bestResults <- result[bestSims[tmpInd],][1:nSims,]
}else{
out$bestResults <- result[bestSims[tmpInd],]
}
return(out)
}else{
# if defining best by being smaller than threshBest
tmpInd <- which(allBias[,1] < threshBest)
# output
out <- list()
# number of times it was best
out$pctBest <- length(tmpInd)/length(result[,1])
out$bestResults <- result[tmpInd,]
return(out)
}
}
#' findWorstSims
#'
#' Locate the simulations where a particular estimator is performing the worst
#'
#' @param result The results data frame
#' @param estName The estimator that you want to find bad simulations for
#' @param compEstNames The estimators that you want to compare the chosen estimator
#' to in order to define bad (e.g., all other estimators)
#' @param nSims The maximum number of simulations to return
#' @param pctTruth Base on bias as percent of truth or in units of information
#' @param compEst Define best by comparing to other estimators or by being better
#' than a certain threshold in absolute error.
#' @param threshWorst The threshold that defines a best simulation (in pct of truth or
#' Z-score depend on value of \code{pctTruth})
findWorstSims <- function(result, estName, compEstNames, nSims = 1000,
pctTruth = TRUE, compEst = TRUE, threshWorst = 4,
truthName = "truth", ...){
if(pctTruth){
allBias <- apply(result[,c(estName,compEstNames)], 2, function(x){
abs(x/result[,truthName]-1)
})
}else{
result <- result[!is.na(result$varEffIC),]
allBias <- apply(result[,c(estName,compEstNames)], 2, function(x){
abs(sqrt(result$n)*(x-result[,truthName])/sqrt(result$varEffIC))
})
}
if(compEst){
# get rank for each estimator
allRanks <- apply(allBias, 1, rank)
# get all simulations where estName wins
worstSims <- which(allRanks[1,] == (length(compEstNames)+1))
# stop if it never wins
if(is.null(worstSims)){
stop("estName never was worst estimator!")
}
# restrict to only these results
worstResult <- allBias[worstSims,]
# compute difference between estName and next closest est
nCompEst <- length(compEstNames)
nextClosest <- apply(worstResult, 1, function(x){
nextWorst <- which.min(x[1]/x[2:nCompEst]) + 1
x[nextWorst]
})
# order of results from biggest to smallest
tmpInd <- order(-nextClosest)
# output
out <- list()
# number of times it was the best
out$pctWorst <- length(worstSims)/length(result[,1])
# the best nSim rows of result
if(length(tmpInd) > nSims){
out$worstResults <- result[worstSims[tmpInd],][1:nSims,]
}else{
out$worstResults <- result[worstSims[tmpInd],]
}
return(out)
}else{
# if defining best by being larger than threshWorst
tmpInd <- which(allBias[,1] > threshWorst)
# output
out <- list()
# number of times it was best
out$pctWorst <- length(tmpInd)/length(result[,1])
out$worstResults <- result[tmpInd,]
return(out)
}
}
#' plotSamplingDist
#'
#' Plot the centered, scaled (by sqrt(var(EIF)/n)) sampling distribution
#' of a given set of estimators.
#' @param result Result data frame
#' @param estNames Column name(s) of results data frame that are estimators
#' @param truthName Column name of results data frame that is the true ATE
#' @param incAsymDist Whether to also include the asymptotic distribution
#' @param legendOptions A list of arguments passed to legend.
#' @param densityOptions A list of options passed to \code{density}
#' @param ylim y-axis limits for the plot. If null, looks for range and adds 5% to it
#' @param estLabels Vector of names to label estimators in legend (default is to use estNames)
#' @param ... Other arguments passed to plot
plotSamplingDist <- function(result, estNames,
estLabels = estNames,
legendOptions = list(x = "topleft", legend = c(estLabels,"N(0,1)")),
incAsymDist = TRUE,
densityOptions = list(n=2^12),
truthName = "truth",
ylim = NULL,
xlim = NULL,
...){
nEst <- length(estNames)
# find z for each estimator
zList <- vector(mode = "list", length = length(nEst))
for(i in 1:nEst){
tmp <- sqrt(result$n)*(result[,estNames[i]]-result$truth)/sqrt(result$varEffIC)
zList[[i]] <- tmp[!is.na(tmp)]
}
# call density
densList <- lapply(zList, function(x){
do.call(density, c(list(x=x, bw=1.06*sd(x)*length(x)^(-1/5)),densityOptions))
})
# make empty plot
if(is.null(xlim)){
xlim <- range(unlist(lapply(densList, function(x){x$x})))
}
if(is.null(ylim)){
ylim <- range(unlist(lapply(densList, function(x){x$y})))
}
plot(0,0,pch="",xlim=xlim,ylim=ylim,bty="n",ylab = "sqrt(n)*(est-truth)/sd(EIF)")
# add densities in loop
for(i in 1:nEst){
lines(densList[[i]], lwd = 2, col = i)
# tmp <- zList[[i]]
# hist(tmp[abs(tmp)<5],freq = FALSE, add = TRUE, col = i,breaks=20)
}
# add reference if it's there
if(incAsymDist){
trueCol <- rgb(1,0,0,0.25)
xSeq <- seq(xlim[1],xlim[2],length=length(densList[[1]]$x))
lines(y=dnorm(xSeq),x=xSeq,
col = trueCol,lty = 3,lwd=2)
}
do.call(legend, c(legendOptions, list(lty = c(rep(1,nEst),3),lwd = 2, col = c(1:nEst,trueCol))))
}
#' plotDiff
#'
#' Plot the difference between the scaled version of two estimators. The estimator
#' names are entered in \code{estNames} and the difference between absolute value of
#' Z-scores for estNames[1] - estNames[2] is plotted.
#'
#' @param result Result data frame
#' @param estNames Column names of results data frame that are the two estimators to be compared
#' @param truthName Column name of results data frame that is the true ATE
#' @param ylim y-axis limits for the plot. If null, looks for range and adds 5% to it
#' @param estLabels Vector of names to label estimators in legend (default is to use estNames)
#' @param ... Other arguments passed to plot
plotDiff <- function(result, estNames,
estLabels = estNames,
truthName = "truth",
ylim = NULL,
xlim = NULL,
...){
nEst <- length(estNames) # should be 2
if(nEst != 2) stop("length of estNames should be 2")
# find z for each estimator
zList <- vector(mode = "list", length = length(nEst))
for(i in 1:nEst){
# returning absolute value of standardized estimator
tmp <- abs(sqrt(result$n)*(result[,estNames[i]]-result[,truthName])/sqrt(result$varEffIC))
# returning percent error
# tmp <- abs(result[,estNames[i]]/result[,truthName]-1)
# get rid of NAs
zList[[i]] <- tmp[!is.na(tmp)]
}
diffEst <- Reduce("-",zList)
# quantile to get range of points
xlim <- quantile(diffEst, probs = c(0.01,0.99))
# compute P(diffEst < x) for sequence of x's
xSeq <- seq(xlim[1],xlim[2], length=1000)
pDiff <- rep(NA,1000)
for(i in 1:1000){
pDiff[i] <- sum(diffEst < xSeq[i])/length(diffEst)
}
# make empty plot
if(is.null(ylim)){
ylim <- c(0,1)
}
plot(pDiff ~ xSeq,lwd=2,pch="",xlim=xlim,ylim=ylim,bty="n",
xlab = "x",type = "l",...)
# add plot to show where CDF cross 0
cP <- sum(diffEst < 0)/length(diffEst)
segments(x0=0,y0=0,y1=cP,lty=3)
segments(x0=par()$usr[1],x1=0,y0=cP,lty=3)
text(x = par()$usr[1], y = cP, adj=c(0,0),ifelse(round(cP,2)==0,"<0.01",round(cP,2)),xpd=TRUE)
# add plot to show where CDF crosses -1
cP <- sum(diffEst < -1)/length(diffEst)
segments(x0=-1,y0=0,y1=cP,lty=3)
segments(x0=par()$usr[1],x1=-1,y0=cP,lty=3)
text(x = par()$usr[1], y = cP, adj=c(0,0), ifelse(round(cP,2)==0,"<0.01",round(cP,2)),xpd=TRUE)
# add plot to show where CDF crosses 1
cP <- sum(diffEst < 1)/length(diffEst)
segments(x0=1,y0=0,y1=cP,lty=3)
segments(x0=par()$usr[1],x1=1,y0=cP,lty=3)
text(x = par()$usr[1], y = cP, adj=c(0,0),ifelse(round(cP,2)==0,"<0.01",round(cP,2)),xpd=TRUE)
# add text on y-axis
# add a rug plot
rug(diffEst,col="gray75")
# add density
dens <- density(x=diffEst, bw=1.06*sd(diffEst)*length(diffEst)^(-1/5),n=2^12)
densScale <- (dens$y - min(dens$y))/diff(range(dens$y))
lines(y=densScale, x=dens$x, col="gray75")
axis(side = 4, col.axis="gray75","p(abs(Z-HAL) - abs(Z-GLM) < x)",at=seq(0,1,0.2),
labels = round(diff(range(dens$y))*seq(0,1,0.2),2),col.lab="gray75",col="gray75")
}
# # names of estimators/confidence intervals
# colNames_allOut <- colnames(allOut)
# # names of all estimators
# # cilNames <- colNames_allOut[grep("\\.cil", colNames_allOut)]
# # ciuNames <- colNames_allOut[grep("\\.ciu", colNames_allOut)]
# # compEstNames <- colNames_allOut[grep("\\.est", colNames_allOut)]
# # names of selected estimators
# estNames <- c("SL.hal.est","slFull.est","slDrop.est","SL.step.interaction.est","SL.glm.est")
# ciuNames <- c("SL.hal.ciuOrc","slFull.ciuOrc","slDrop.ciuOrc","SL.step.interaction.ciuOrc","SL.glm.ciuOrc")
# cilNames <- c("SL.hal.cilOrc","slFull.cilOrc","slDrop.cilOrc","SL.step.interaction.cilOrc","SL.glm.cilOrc")
# # bias
# summarizeBias(result = allOut, estNames = estNames)
# # coverage
# summarizeCoverage(result = allOut, cilNames = cilNames, ciuNames = ciuNames)
# # by sample size
# summarizeBias(result = allOut[allOut$n==100,], estNames = estNames)
# summarizeBias(result = allOut[allOut$n==500,], estNames = estNames)
# summarizeBias(result = allOut[allOut$n==1000,], estNames = estNames)
# summarizeBias(result = allOut[allOut$n==2000,], estNames = estNames)
# summarizeCoverage(result = allOut[allOut$n==100,], cilNames = cilNames, ciuNames = ciuNames)
# summarizeCoverage(result = allOut[allOut$n==500,], cilNames = cilNames, ciuNames = ciuNames)
# summarizeCoverage(result = allOut[allOut$n==1000,], cilNames = cilNames, ciuNames = ciuNames)
# summarizeCoverage(result = allOut[allOut$n==2000,], cilNames = cilNames, ciuNames = ciuNames)
# # by dimension
# summarizeBias(result = allOut[allOut$D==1,], estNames = estNames)
# summarizeBias(result = allOut[allOut$D==2,], estNames = estNames)
# summarizeBias(result = allOut[allOut$D==3,], estNames = estNames)
# summarizeBias(result = allOut[allOut$D==4,], estNames = estNames)
# summarizeBias(result = allOut[allOut$D==5,], estNames = estNames)
# summarizeBias(result = allOut[allOut$D==6,], estNames = estNames)
# summarizeBias(result = allOut[allOut$D==7,], estNames = estNames)
# summarizeBias(result = allOut[allOut$D==8,], estNames = estNames)
# # debugging code for function
# result <- allOut
# truthName <- "truth"
# # find DGMs with actual confounding
# indConf <- ((allOut$obsATE - allOut$truth)/allOut$truth) > 2
# # summary for bias
# y <- log(abs((allOut$SL.hal.est - allOut$truth)/allOut$truth))
# grbg <- step(glm(y ~ n + D + minObsG0 + mainTermsG0 + bivTermsG0 + triTermsG0 + mainTermsQ0 + bivTermsQ0 +
# triTermsQ0 + r2 + nCorr1 + nCorr2 + nCorr3 + nCorr4, data = allOut))
# summary(grbg)
# # summary for coverage
# y <- as.numeric(allOut$SL.hal.cil < allOut$truth & allOut$SL.hal.ciu > allOut$truth)
# grbg <- step(glm(y ~ n + D + minObsG0 + mainTermsG0 + bivTermsG0 + triTermsG0 + mainTermsQ0 + bivTermsQ0 +
# triTermsQ0 + r2 + nCorr4, data = allOut))
# summary(grbg)
# # where do full and drop SL differ
# grbg <- (allOut$slFull.est - allOut$slDrop.est)/allOut$slFull.est
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.