#' Validating a bootstrap linear regression model
#'
#' @param reg linear model.
#' @param data a data.frame of values if formula is complex with I() function (Optional).
#' @param plot enable or disable the display of graphical analysis.
#' @param verbose enable or disable the display of the commented analysis.
#' @param conf.level confidence level for validation of the model.
#' @param alpha minimal value accepted for validation of the model and his coefficients.
#' @param iter number of iterations.
#'
#' @details bootreg allows to validate a model by bootstrap.
#' @details It will draw several times in the values of the model in order to test its robustness.
#' @details As an output, we can have a validation or not of the model (analysis=F), or we can have a table expressing the variability of the coefficients, their p-values and their maximum fluctuation values.
#' @return A graph of analysis.
#' @export
#'
#' @examples
#' data(mtcars);
#' corrigraph(mtcars);
#' reg<- lm(cyl~disp+hp,data=mtcars);
#' bootreg(reg, verbose=TRUE, plot=TRUE)
bootreg <- function(reg, data=c(), plot=TRUE, verbose=TRUE, conf.level=0.95, alpha=0.05, iter=1000) {
if (length(data) > 1) {
dt <- data
} else {
dt <- reg$model
}
# Control the presence of $
if (any(unlist(sapply(formula(reg), function(x) grepl("\\$", x))))) {
options(warn = 0)
warning("Warning! The character '$' has been detected in the formula. Prefer the syntax lm(Y~x, data=data) over lm(data$Y~data$x), which may cause bootreg() to fail.")
}
numind <- nrow(dt)
indices <- 1:numind # num if individuals
enregistrement <- 0
erreur <- c()
predictions <- c()
verity <- c()
for (i in 1:iter) {
indices_training <- sample(indices, size=numind, replace=TRUE)
indices_test <- setdiff(indices, indices_training)
if (length(indices_test) > 0) {
training <- dt[indices_training,]
test <- dt[indices_test,]
formula_ <- formula(reg)
oldw <- getOption("warn")
options(warn = -1)
reg1 <- lm(formula_=formula_, data=training)
options(warn = oldw)
if (enregistrement == 0) {
pval_mdl <- pf(summary(reg1)$fstatistic[1], summary(reg1)$fstatistic[2], summary(reg1)$fstatistic[3], lower.tail=FALSE)
names(pval_mdl) <- "p-value of model"
if (length(c(pval_mdl, summary(reg1)[[4]][,4])) == (length(reg$coefficients)+1)) {
enregistrement <- enregistrement + 1
p_values <- c(pval_mdl, summary(reg1)[[4]][,4]) # p_values correspond to the Pr values retrieved for each variable and each bootstrap cycle (here 1000)
# For extracting p-values on glm() : put this : coef(summary(model2))[,4]
coeff <- reg1$coefficients
oldw <- getOption("warn")
options(warn = -1)
predictions <- c(predictions, predict(reg1, dt)[indices_test])
options(warn = oldw)
if (length(all.vars(reg$terms)) == length(names(get_all_vars(formula(reg$terms), dt)))) {
verity <- c(verity, test[[names(get_all_vars(formula(reg$terms), dt))[1]]])
} else {
Ynames <- trimws(strsplit(deparse(reg$call[[2]]), "[~+]")[[1]][1])
verity <- c(verity, test[[Ynames]])
}
}
} else {
pval_mdl <- pf(summary(reg1)$fstatistic[1], summary(reg1)$fstatistic[2], summary(reg1)$fstatistic[3], lower.tail=FALSE)
if (length(c(pval_mdl, summary(reg1)[[4]][,4])) == (length(reg$coefficients)+1)) {
p_values <- rbind(p_values, c(pval_mdl, summary(reg1)[[4]][,4]))
coeff <- rbind(coeff, reg1$coefficients) # Coeff corresponds to the coefficients retrieved for each variable and each bootstrap cycle (here 1000)
oldw <- getOption("warn")
options(warn = -1)
predictions <- c(predictions, predict(reg1, dt)[indices_test])
options(warn = oldw)
if (length(all.vars(reg$terms)) == length(names(get_all_vars(formula(reg$terms), dt)))) {
verity <- c(verity, test[[names(get_all_vars(formula(reg$terms), dt))[1]]])
} else {
Ynames <- trimws(strsplit(deparse(reg$call[[2]]), "[~+]")[[1]][1])
verity <- c(verity, test[[Ynames]])
}
}
}
}
}
coeff <- na.omit(coeff)
p_values <- na.omit(p_values)
predverity <- data.frame(predictions, verity)
predverity <- na.omit(predverity)
predictions <- predverity[,1]
verity <- predverity[,2]
confiance <- function(x, conf.level=0.99) { # threshold
temp <- sort(x)
valeur_seuil <- round(length(x) * conf.level)
temp <- temp[valeur_seuil]
return(temp)
}
mode <- function(x) {
densite <- density(x)
mode <- densite$x[which(densite$y == max(densite$y))]
return(mode)
}
apply(coeff, 2, median) -> coeff_median
if (plot == TRUE) {
boxplot_Pr <- function(x, main="") {
my_min <- min(c(apply(x, 2, quantile)[2,]), 0.0009)
if (my_min <= 0) {
my_min <- 1e-20
}
boxplot(x, log="y", ylim=c(my_min, 1), main=main)
abline(h=0.05, col="red", lwd=2)
abline(h=0.01, col="orange", lwd=2)
abline(h=0.001, col="green", lwd=2)
}
layout(matrix(1:3, 1, 3))
boxplot_Pr(p_values, main="Distribution of the p-values of\nthe model and its coefficients")
apply(coeff, 2, function(x) {(x - median(x)) / median(x) * 100}) -> percent_coeff
boxplot(percent_coeff, main="Fluctuation of coefficients (in %)")
abs(predictions - verity) -> temp
by(temp, verity, confiance, conf.level=conf.level) -> CONF
by(temp, verity, mean) -> MOY
x <- as.numeric(names(MOY))
plot(as.numeric(names(MOY)), CONF, type="l", lwd=3, col="red",
xlim=c(min(x), max(x)), ylim=c(0, max(CONF)),
xlab="Experimental values", ylab="Predictions",
main="Average prediction error (black) and\nmaximum error in the confidence interval (red)")
points(as.numeric(names(MOY)), MOY, type="l", lwd=2, col="black")
}
apply(p_values, 2, median) -> p.values_median
apply(p_values, 2, confiance, conf.level=conf.level) -> p.values_max
coeff_model <- c(NA, reg$coefficients)
coeff_median <- c(NA, coeff_median)
coeff_IC <- c(NA, apply(coeff, 2, int.ech, conf.level=conf.level))
synth <- rbind(p.values_median, p.values_max, coeff_model, coeff_median, coeff_IC)
synth <- t(synth)
synth <- data.frame(synth)
synth <- cbind(data.frame(rownames(synth)), synth)
rownames(synth)[1] <- "Model"
if (verbose == TRUE) {
return(synth)
} else {
if (max(synth[,2]) > alpha) {
return(FALSE)
} else {
return(TRUE)
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.