#' Bootstrapped robust linear model
#'
#' Run a \code{\link[MASS]{rlm}} model \code{nBoot} times, then include the bootstrapped parameter estimates
#' (and summary of quantiles thereof) in the rlm output. Also includes out-of-sample
#' delta-R-squared. If parameter distributions are *anywhere near* decision thresholds,
#' using \code{nBoot}>2000 (or even much higher) is recommended.
#'
#' Wraps \code{rlm()} or \code{lm()} and then bootstraps [fits models to datasets sampled with replacement]
#' that model \code{nBoot} times. Then fits models to
#' nBoot random 80 percent of data and tests the delta R-squared of each numeric or logical parameter
#' when predicting the out-of-sample 20 percent.
#'
#' @return
#' An augmented \code{rlm} or \code{lm} object that includes
#' several new items: \code{$bootSummary}, \code{$boots} (all bootstrapped parameters),
#' \code{$bootQs} (quantiles
#' of bootstrapped parameters), \code{$dRsq} (all out-of-sample proportional reduction of error),
#' \code{$dRsqQs} (quantiles of out-of-sample proportional reduction of error),
#' and \code{$results} (strings, formatted
#' for RMarkdown, including whole-sample slope, bootstrapped CI, and median out-of-sample dRSq).
#'
#' For an explanation specific objects see \code{comment(model$boots)},
#' \code{comment(model$dRsq)}, or \code{comment(model$bootSummary)}.
#'
#' @param formIn Model formula, as with \code{lm()} or \code{rlm()}
#' @param datIn Data, as with \code{lm()} or \code{rlm()}
#' @param nBoot Number of resamples [with replacement]
#' @param useLM Override the standard \code{rlm()} implementation to use basic \code{lm()} instead
#'
#' @export
#'
#' @examples
#' dat <- data.frame(x=rnorm(50))
#' dat$y <- dat$x + rnorm(50)
#' dat$z <- dat$y - rnorm(50)
#' dat$z[2] <- NA # the function is robust to NAs
#' m <- tef_rlm_boot(y~x*z,dat)
#' m$bootSummary # to get a summary of the fit[s]
#' comment(m$bootSummary) # to get an explanation of the summary data frame
#'
tef_rlm_boot <- function(formIn,datIn,nBoot=500,useLM=F){
# to do:
# - see line 146, regarding drsq_oos central tendency calculation
# - get an overall model rsq_oos
if(!is.numeric(nBoot)){cat('Your number of bootstraps must be numeric.')}
if(nBoot<2){(cat('Your number of bootstraps must be positive'))}
nBoot <- round(nBoot)
if(useLM){fitReg <- lm
}else{
require(MASS)
fitReg <- rlm}
m <- suppressWarnings({fitReg(formIn,datIn,maxit=1E4)})
m$call <- match.call()
## bootstrapped fit:
{
boots <- replicate(nBoot,
suppressWarnings({
curCoef <- NA
while(!is.numeric(curCoef)){ # ensures robustness to pathological resampling
try({
curRLM <- suppressWarnings({fitReg(formIn,datIn[sample(nrow(datIn),replace=T),],maxit=50)})
curCoef <- curRLM$coefficients
},silent=T)
}
return(
curCoef=curCoef
)
})
)
m$boots <- data.frame( t(boots) )
colnames(m$boots) <- names(m$coefficients)
comment(m$boots) <- paste('Parameter estimates from',nBoot,
'MASS::rlm fits to data resampled with replacement.')
m$bootQs <-apply(na.omit(data.frame(t(m$boots))),1,quantile,probs=c(.025,.25,.5,.75,.975))
}
# # # #
## out-of-sample delta R squared:
{
oos_dRsq <- replicate(nBoot,{
outVar <- as.character(formIn[2])
shuffleInds <- sample(nrow(datIn))
trainInds <- shuffleInds[1:(round(nrow(datIn))*.8)]
testInds <- shuffleInds[((round(nrow(datIn))*.8)+1):nrow(datIn)]
sse_raw <- sum((datIn[testInds,outVar]-mean(datIn[testInds,outVar],na.rm=T))^2,na.rm=T)
fullMod_preds <- predict(suppressWarnings({fitReg(formIn,datIn[trainInds,],maxit=50)}),
datIn[testInds,])
sse_fullMod <- sum((datIn[testInds,outVar]-fullMod_preds)^2,na.rm=T)
dRsq <- c(fullMod = (sse_raw-sse_fullMod)/sse_raw)
parNames <- attr(terms(m),'term.labels')
if(length(parNames)>1){tilde<- '~'}else{tilde <- '~ 1'}
for (dropVar in 1:length(parNames)){
curForm <- formula(paste(outVar,tilde,paste(parNames[(1:length(parNames))!=dropVar],collapse='+')))
curMod_preds <- predict(suppressWarnings({fitReg(curForm,datIn[trainInds,],maxit=50)}),
datIn[testInds,])
sse_curMod <- sum((datIn[testInds,outVar]-curMod_preds)^2,na.rm=T)
dRsq[parNames[dropVar]] <- (sse_curMod - sse_fullMod)/sse_curMod
}
dRsq
})
m$dRsq <- data.frame(t(oos_dRsq))
comment(m$dRsq) <- paste('Parameter estimates from MASS::rlm fits to',nBoot,
'random 80% subsamples of data (without replacement).',
'Fits using all predictors were compared to fits dropping',
'each predictor in turn. Out-of-sample delta R-squared was',
'calculated on each remaining 20% of data by comparing',
'full parameter set fits to drop-one parameter set fits.')
m$dRsqQs <-apply(na.omit(data.frame(t(m$dRsq))),1,quantile,probs=c(.025,.25,.5,.75,.975))
}
## ## ##
# # # #
## summary:
{
outDF <- data.frame(summary(m)$coefficients)[,1:3]
outDF$ci025 <- m$bootQs["2.5%", ]
outDF$ci975 <- m$bootQs["97.5%", ]
outDF$bootP <- NA
for (curCoef in 1:ncol(m$boots)) {
curECDF <- ecdf(m$boots[, curCoef])
outDF[curCoef, "bootP"] <- min(curECDF(0),
1 - curECDF(0)) * 2
} ; rm(curCoef)
outDF[, c(1, 2, 4, 5)] <- signif(outDF[, c(1, 2, 4, 5)], 3)
outDF[, 3] <- round(outDF[,3], 2)
outDF[,6] <- round(outDF[,6], 3)
outDF$dRsq_oos[1] <- NA
for(curCoef in colnames(m$dRsqQs)){
if(any(rownames(outDF)==curCoef)){
outDF[curCoef,'dRsq_oos'] <- round(m$dRsqQs['50%',curCoef],4)}}
# # # better, but needs testing:
# apply(na.omit(data.frame(t(m$dRsq))),1,mean,trim = .49)
comment(outDF) <- paste('Summary of rlm_boot. The first 3 columns are from the standard rlm fit.',
'ci025 and ci975 are the 2.5% and 97.5% quantiles of parameter estimates to bootstrapped fits.',
'BootP indicates the quantile (calculated from the parameter distributions) associated with a',
'0 slope for a given parameter, multiplied by 2 to imitate a conventional 2-tailed p value.',
'dRsq_oos is, for numeric predictors, the median out-of-sample prediction error reduction when including a',
'given parameter in the full model. This is calculated as a proportional',
'reduction of squared error in out-of-sample prediction',
'relative to a model including all parameters except the specific parameter in question. ')
m$bootSummary <- outDF
formattedResults <- c()
if(useLM){bname='Estimate'}else{bname='Value'}
for (curCoef in 1:ncol(m$boots)) {
coefName <- colnames(m$boots)[curCoef]
if(is.na(outDF[coefName,c('dRsq_oos')])){dr2str <- ''}else{dr2str <- paste(', dR^2^~oos~ =',outDF[coefName,c('dRsq_oos')])}
formattedResults <- c(formattedResults,
paste0('*b* = ',outDF[coefName,bname],', CI = [',
paste(outDF[coefName,c('ci025','ci975')],collapse=','),']',dr2str)
)
}
names(formattedResults) <- colnames(m$boots)
m$results <- formattedResults
}
return(m)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.