#' @title Valid Inference for Model-based Gradient Boosting Models
#' @param obj mboost object
#' @param method character; if possible, choose custom method. See details.
#' @param vars numeric vector; a single numeric value or vector of numeric values for the
#' variance used in the linear model (preferably the true variance or an estimation from a
#' consistent estimator). If NULL, the empirical response variance is used, which will result
#' in rather conservative inference.
#' @param varForSampling variance used for generate new samples.
#' Defaults to the first entry of \code{vars} if not given.
#' @param B numeric; number of samples drawn for inference.
#' @param alpha numeric; significance level for p-value / size of selective interval
#' (\code{1 - alpha}).
#' @param ncore numeric; number of cores to use (via \code{mclapply})
#' @param refit.mboost function; this is needed if \code{obj} was created by a direct call
#' to \code{mboost_fit}.
#' In this case, \code{refit.mboost} should be a function of the response,
#' refitting the exact model
#' as given by \code{obj} with the response vector.
#' @param Ups list of residual matrix produces by \code{\link{getUpsilons}}.
#' @param checkBL logical; if \code{TRUE} checks whether base-learner only include linear,
#' group and spline base-learner (which are currently supported)
#' @param vT list of test vectors as produced by \code{\link{getTestvector}}.
#' @param computeCI logical; whether or not to compute selective confidence intervals
#' @param returnSamples logical; whether or not (default = FALSE) to only return
#' the samples produced. Per default, p-values (and intervals) are calculated using the
#' samples using \code{\link{format_iboost_res}}.
#' @param which numeric; selects only certain base-learner, for which inference is conducted.
#' @param ... Further arguments passed to the sampling method.
#'
#' @import parallel mboost intervals selectiveInference
#' @importFrom intervals Intervals
#' @importFrom methods as
#' @importFrom stats as.formula dnorm pnorm qnorm rnorm runif uniroot
#' @importFrom utils setTxtProgressBar txtProgressBar
#' @importFrom msm ptnorm
#'
#' @export
#' @details iboost provides inference for L_2-Boosting models fitted with \code{mboost}
#' with linear, group or spline base-learner based on
#' Ruegamer and Greven (2018) when \code{method = unifsamp},
#' Yang et al. (2016) when \code{method = impsamp},
#' Tibshirani et al. (2016) when \code{method = analytic},
#' Loftus and Taylor (2015) when \code{method = slice} and
#' two variations of the \code{unifsamp} approach when \code{method} is
#' \code{normalsamp} or \code{normaladjsamp}.
#' Only the methods \code{impsamp} and \code{slice} can be used for testing
#' group effects or whole spline functions.
#'
#' @references
#' Ruegamer, D. and Greven, S. (2018),
#' Valid Inferece for L2-Boosting,
#' arXiv e-prints arXiv:1805.01852.
#'
#' Yang, F., Barber, R. F., Jain, P. and Lafferty, J. (2016),
#' Selective inference for group-sparse linear models,
#' Advances in Neural Information Processing Systems, pp. 2469-2477.
#'
#' Tibshirani, R. J., Taylor, J., Lockhart, R. & Tibshirani, R. (2016),
#' Exact post-selection inference for sequential regression procedures,
#' Journal of the American Statistical Association 111(514), 600-620.
#'
#' Loftus, J. R. & Taylor, J. E. (2015),
#' Selective inference in regression models with groupsof variables,
#' arXiv e-prints arXiv:1511.01478.
#'
#' @return Returns an object of class \code{iboost} or, if \code{length(vars)>1}, a
#' list of \code{iboost} objects for each variance.
#' An \code{iboost} object is a list containing the following items
#' \itemize{
#' \item \code{dist}: a list obtained by the sampling procedure
#' including \code{rB}, the sampled values, \code{logvals}, logical values whether
#' the corresponding \code{rB} yields to a congruent model with the initial model fit,
#' \code{obsval}, the actual observed value
#' in the initial model fit and corresponding \code{weights} of the importance sampling
#' procedure.
#' \item \code{method}: name of the method used
#' \item \code{alpha}: alpha level used for the confidence interval limits
#' \item \code{vT}: the test vector(s)
#' \item \code{yorg}: original response value
#' \item \code{resDF}: a data.frame consisting of the \code{lower} and
#' \code{upper} confidence interval limits, the observed value \code{mean},
#' the calculated p-value \code{pval} and the truncation limits of the effect
#' \code{lowtrunc} and \code{uptrunc}.
#' \item \code{var}: the variance used for inference calculation
#' \item \code{dur}: total duration of sampling in seconds
#'
#' }
#' @description Function computes selective p-values (and confidence intervals)
#' for \code{\link[mboost]{mboost}} objects. Currently \code{iboost} supports Gaussian
#' family models (L2-Boosting) with linear, group and spline base-learners.
#' @examples
#'
#' if(require("mboost")){
#'
#' set.seed(0)
#'
#' n <- 200
#' x1 <- rnorm(n)
#' x2 <- rnorm(n) + 0.25 * x1
#' x3 <- rnorm(n)
#' eta <- 3 * sin(x1) + x2^2
#' y <- scale(eta + rnorm(n), scale = FALSE)
#'
#' spline1 <- bbs(x1, knots = 20, df = 4)
#' knots.x2 <- quantile(x2, c(0.25, 0.5, 0.75))
#' spline2 <- bbs(x2, knots = knots.x2, df = 4)
#' spline3 <- bbs(x3, knots = 20, df = 4)
#'
#' data <- data.frame(y=y, x1=x1, x2=x2, x3=x3)
#'
#' mod1 <- mboost(y ~ spline1 + spline2 + spline3,
#' control=boost_control(mstop = 73), offset = 0,
#' data = data)
#'
#' # calculate p-values and intervals for model with
#' # fixed stopping iteration:
#' # this is done with only B = 100 samples for
#' # demonstrative purposes and should be increased
#' # for actual research questions
#' res <- iboost(mod1, method = "impsamp", B = 100)
#'
#' # do the same with crossvalidation
#' \dontrun{
#'
#' fixFolds <- cv(weights = model.weights(mod1),
#' type = "kfold", B = 10)
#' cvr <- cvrisk(mod1, folds = fixFolds, papply = lapply)
#' modf <- mod1[mstop(cvr)]
#'
#' # define corresponding refit function
#' modFun <- function(y){
#'
#' mod <- mboost_fit(response = y,
#' blg = blList,
#' offset = 0,
#' control = boost_control(mstop = 73))
#' cvr <- cvrisk(mod, folds = fixFolds, papply = lapply)
#' return(mod[mstop(cvr)])
#' }
#'
#' # this will take a while
#' (res <- iboost(modf, refit.mboost = modFun, method = "impsamp", B = 1000))
#'
#' }
#' }
#'
iboost <- function(obj,
method = c("unifsamp",
"impsamp",
"analytic",
"slice",
"linesearch",
"normalsamp",
"normaladjsamp"
),
vars = NULL,
varForSampling = NULL,
B = 1000,
alpha = 0.05,
ncore = 1,
refit.mboost = NULL,
Ups = NULL,
checkBL = TRUE,
vT = NULL,
computeCI = TRUE,
returnSamples = FALSE,
which = NULL,
...)
{
# get method
method <- match.arg(method)
####### checks #######
# check model class
if(all(class(obj) != "mboost"))
stop("obj must be of class mboost.")
# check that offset is (almost) zero
if(abs(obj$offset) > 0.000001)
stop(paste0("Please refit the model by substracting",
" the offset from the response before calling mboost / glmboost."))
# check alpha level
if(alpha >= 1 | alpha <= 0.001)
stop("Please provide an alpha value in between 0.001 and 1.")
# check that family == Gaussian
if(obj$family@name != "Squared Error (Regression)")
stop("Inference for families other than Gaussian is currently not available.")
# check for group base-learners and restrict method
nrcol <- if(class(obj)[1] == "glmboost") rle(obj$assign)$lengths else
unlist(lapply(extract(obj, "design"), "ncol"))
if(any(nrcol > 1) & method %in% c("analytic",
"normalsamp") & is.null(vT))
stop("Method analytic not available for group base-learner.")
# check for not supported base-learners
learners <- gsub("(.*)\\(.*\\)","\\1",names(obj$baselearner))
if(checkBL && any(sapply(learners, function(x) !x %in% c("bbs","bols"))))
stop(paste0("Inference is currently restricted to model with",
" linear and b-spline base-learner only."))
# check model call and data argument
if(is.null(refit.mboost) && (is.null(obj$call) || class(obj$call$data)!="name") &&
method!="analytic")
stop("Need data object as parameter in initial model call.")
# check variances
if(is.null(vars)) vars <- var(obj$response) * (length(obj$response)-1)/length(obj$response)
if(is.null(varForSampling)) varForSampling <- vars[1]
####### prepare inference #######
sel <- sort(unique(selected(obj)))
# function definition for the check of congruent selection sets
is_congruent <- function(mod) setequal(selected(mod), sel)
y <- obj$response
n <- length(y)
# testvector(s)
if(is.null(vT)) vT <- getTestvector(obj)
if(!is.null(which)) vT <- vT[which]
# check if method complies with vT
if(any(sapply(vT, NROW)>1) & !method%in%c("impsamp", "slice"))
{
method <- "impsamp"
warning(paste0("The supplied method does not allow for inference ",
"of grouped variable parameters.\n",
"method has changed to impsamp.\n",
"If linear effects are given and you wish",
"to use a different method, please use the which argument to only select",
"effects of linear base-learners.")
)
}
if(is.null(refit.mboost)){
refit.mboost <- function(newY){
call <- obj$call
olddat <- eval(call$data, parent.frame())
yname <- all.vars(as.formula(obj$call))[1]
olddat[[yname]] <- newY
assign(as.character(call$data), olddat)
if(as.character(as.list(call)[[1]]) == "glmboost.formula")
stop("Please provide a refit.mboost function.")
eval(call)
}
}
var <- varForSampling
####### call distribution helper functions #######
tstart <- Sys.time()
res <- switch (method,
analytic = polyh_inf(obj, vT, var, alpha, Ups = Ups),
slice = mclapply(lapply(vT, t), function(v) getVloVup(mod = obj, v = v, Ups = Ups),
mc.cores = ncore),
# linesearch = mclapply(vT, function(vt)
# infsamp(refit.mboost, y, vT = vt, is_congruent, B, var, ...),
# mc.cores = ncore),
normalsamp = mclapply(vT, function(vt)
normalsamp(refit.mboost, y, vT = vt, is_congruent, B, var, ...),
mc.cores = ncore),
normaladjsamp = mclapply(vT, function(vt)
normaladjsamp(refit.mboost, y, vT = vt, is_congruent, B, var),
mc.cores = ncore),
impsamp = mclapply(vT, function(vt)
impsamp(refit.mboost, y, vT = vt, is_congruent, B, var, ...),
mc.cores = ncore),
unifsamp = mclapply(vT, function(vt)
unifsamp(refit.mboost, y, vT = vt, is_congruent, B, var, ...),
mc.cores = ncore)
)
dur <- as.numeric(difftime(Sys.time(), tstart, units = "secs"))
####### format results #######
if(returnSamples) return(list(res = res,
alpha = alpha,
method = method,
vT = vT,
this_var = vars[i],
computeCI = computeCI,
y = y))
ret <- vector("list", length(vars))
for(i in 1:length(vars)){
ret[[i]] <- format_iboost_res(res = res,
alpha = alpha,
method = method,
vT = vT,
this_var = vars[i],
computeCI = computeCI,
y = y)
ret[[i]]$dur <- dur
}
if(length(ret)==1) ret <- ret[[1]]
return(ret)
}
#' @title Function to calculate selective inference based on sampled values
#'
#' @description This function calculates p-values and confidence intervals
#' based on sampled values and weights obtained by the \code{\link{iboost}} function.
#'
#' @details The specific use of this function is to (internally) compute inference based
#' on samples in the \code{\link{iboost}} function, to recalculate inference for
#' given samples \code{dist} from an \code{iboost} object or to calculate inference in
#' the first place when \code{\link{iboost}} has been ran with
#' argument \code{returnSamples = TRUE}.
#' In the first and third case, use \code{format_iboost_res}, otherwise \code{format_iboost}.
#'
#' @examples
#' if(require("mboost")){
#'
#' set.seed(0)
#'
#' n <- 200
#' x1 <- rnorm(n)
#' x2 <- rnorm(n) + 0.25 * x1
#' x3 <- rnorm(n)
#' eta <- 3 * sin(x1) + x2^2
#' y <- scale(eta + rnorm(n), scale = FALSE)
#'
#' spline1 <- bbs(x1, knots = 20, df = 4)
#' knots.x2 <- quantile(x2, c(0.25, 0.5, 0.75))
#' spline2 <- bbs(x2, knots = knots.x2, df = 4)
#' spline3 <- bbs(x3, knots = 20, df = 4)
#'
#' data <- data.frame(y=y, x1=x1, x2=x2, x3=x3)
#'
#' mod1 <- mboost(y ~ spline1 + spline2 + spline3,
#' control=boost_control(mstop = 73), offset = 0,
#' data = data)
#'
#' # calculate p-values and intervals for model with
#' # fixed stopping iteration:
#' # this is done with only B = 100 samples for
#' # demonstrative purposes and should be increased
#' # for actual research questions
#' res <- iboost(mod1, method = "impsamp", B = 100)
#'
#' # recalculate inference for different variance or alpha level
#' format_iboost(res, alpha = 0.1, this_var = var(y)*5)
#' }
#' @param res list of sampled values either directly obtained in the \code{\link{iboost}}
#' call or when using \code{\link{iboost}} with \code{returnSamples = TRUE}.
#' @param alpha numeric; level for confidence interval(s)
#' @param method character; method used for sampling
#' @param vT list of test vector(s)
#' @param this_var numeric; variance to be used for inference
#' @param computeCI logical; whether or not to compute confidence intervals
#' @param fac numeric; used in inverse search for confidence interval limits
#' @param y vector; response vector
#' @export
#' @rdname format_iboost
#' @aliases format_iboost
#'
format_iboost_res <- function(
res,
alpha,
method,
vT,
this_var,
computeCI,
fac = 2,
y
){
resDF <- list()
if(method=="unifsampTest"){
method <- "unifsamp"
computeCI <- FALSE
}
for(j in 1:length(res)){
if(method %in% c("linesearch", "analytic")){
res[[j]] <- sort(res[[j]])
vlo <- res[[j]][1]
vup <- res[[j]][2]
}else if(method == "slice"){ # slice
vlo <- res[[j]]$ul[[1]]
vup <- res[[j]]$ul[[2]]
}else if(method %in% c("normalsamp", "normaladjsamp", "impsamp", "unifsamp", "impsamp")){
# if("extrcase" %in% names(attributes(res[[j]]))){
# resDF[[j]] <- data.frame(lower = NA, mean = res[[j]]$obsval, upper = NA, pval = 0,
# lowtrunc = NA, uptrunc = NA)
# next
# }
if(sum(res[[j]]$logvals) == 0){
warning("No sample survived. Most likely reason: truncated space very small.")
resDF[[j]] <- data.frame(lower = NA, mean = res[[j]]$obsval, upper = NA, pval = NA,
lowtrunc = NA, uptrunc = NA)
next
}
vlo <- min(res[[j]]$rBs)
vup <- max(res[[j]]$rBs)
mu <- res[[j]]$obsval
l1 <- res[[j]]$logvals
survr <- res[[j]]$rBs[l1]
l2 <- survr > mu
survr_gr <- survr[l2]
survr_le <- survr[!l2]
pval <- sum(res[[j]]$weights(var = this_var)[l2]) /
sum(res[[j]]$weights(var = this_var))
vt <- vT[[j]]
vTv <- as.numeric(tcrossprod(vT[[j]]))
if(computeCI){
# define binary search and get confidence intervaĺ
if(method %in% c("normalsamp", "unifsamp", "normaladjsamp"))
{
w <- res[[j]]$weights(var = this_var)
if(all(w<1e-7)) w <- w / max(w)
ftlo <- function(t) sum(exp(survr_gr * t / (this_var * vTv)) * w[l2]) /
sum(exp(survr * t / (this_var * vTv)) * w)
ftup <- function(t) sum(exp(survr_le * t / (this_var * vTv)) * w[!l2]) /
sum(exp(survr * t / (this_var * vTv)) * w)
if(length(survr_gr)==0){
ci <- c(-Inf, Inf)
}else{
# check admissible values
testvals <- seq(vlo - 4*sqrt(this_var * vTv),
vup + 4*sqrt(this_var * vTv),
l = 1000)
flovals <- sapply(testvals, ftlo)
fupvals <- sapply(testvals, ftup)
ll <- min(which(!is.na(flovals) &
!is.nan(flovals)))
lu <- max(which(!is.na(flovals) &
!is.nan(flovals)))
low <- try(uniroot(f = function(x) ftlo(x) - alpha/2,
interval = testvals[c(ll,lu)],
extendInt = "upX")$root)
up <- try(uniroot(f = function(x) ftup(x) - alpha/2,
interval = testvals[c(ll,lu)],
extendInt = "downX")$root)
if(class(low)=="try-error") low <- -Inf
if(class(up)=="try-error") up <- Inf
ci <- c(low, up)
}
}else{
# impsamp
w <- res[[j]]$weights(var = this_var)
if(all(w<1e-7)) w <- w / max(w)
ft <- function(t) sum(exp(survr_gr * t / this_var) * w[l2]) /
sum(exp(survr * t / this_var) * w)
ibins <- function(d, fac){
uniroot(f = function(x) ft(x) - d,
interval = c(0, fac * mu),
extendInt = "upX")
}
low <- try(ibins(alpha, fac = fac)$root)
if(class(low)=="try-error") low <- 0
ci <- c(max(low, 0), Inf)
}
}else{
ci <- c(NA, NA)
}
# return two-sided p-value
if(method != "impsamp") pval <- 2*min(1-pval,pval)
resDF[[j]] <- data.frame(lower = ci[1], mean = mu, upper = ci[2], pval = pval,
lowtrunc = vlo, uptrunc = vup)
}
if(method %in% c("slice", "analytic", "linesearch")){
df <- nrow(vT[[j]])
lin <- df==1
if(lin){
vTv <- as.numeric(tcrossprod(vT[[j]]))
mu <- as.numeric(vT[[j]] %*% y)
sqvTv <- sqrt(vTv)
sd <- sqrt(this_var)*sqvTv
gridrange = c(-fac,fac)*50
pv <- selectiveInference:::tnorm.surv(mu, mean = 0, sd = sd, a = vlo, b = vup)
if(mu<=0) pv <- 1 - pv
xg = seq(gridrange[1]*sd, gridrange[2]*sd, length = 100)
fun = function(x) { selectiveInference:::tnorm.surv(z = mu, mean = x, sd = sd,
a = vlo, b = vup, bits = NULL) }
if(computeCI)
ci = selectiveInference:::grid.search(xg, fun, alpha/2,
1-alpha/2, 100, 2) else
ci = c(NA, NA)
}else{
R <- vT[[j]] %*% y
mu <- sqrt(sum(R^2))
pv <- selectiveInference:::TC_surv(TC = mu, sigma = sqrt(this_var),
df = df, E = Intervals(c(vlo,vup)))
ci <- c(NA, NA)
}
resDF[[j]] <- data.frame(lower = ci[1], mean = mu, upper = ci[2],
pval = 2*min(pv, 1-pv),
lowtrunc = vlo, uptrunc = vup)
}
}
resDF <- do.call("rbind", resDF)
if(is.null(vT)) rownames(resDF) <- names(res[[1]]$dist)
ret <- list(dist = res,
method = method,
alpha = alpha,
vT = vT,
yorg = y,
resDF = resDF,
var = this_var)
class(ret) <- "iboost"
return(ret)
}
#' @export
#' @param obj an iboost object
#' @rdname format_iboost_res
#'
format_iboost <- function(
obj,
alpha = 0.05,
method = obj$method,
vT = obj$vT,
this_var = obj$var,
computeCI = TRUE,
fac = 2,
y = obj$yorg)
{
format_iboost_res(obj$dist, alpha, method, vT, this_var, computeCI, fac, y)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.