### calcDistMax.R ---
#----------------------------------------------------------------------
## author: Brice Ozenne
## created: jun 21 2017 (16:44)
## Version:
## last-updated: jun 20 2019 (10:28)
## By: Brice Ozenne
## Update #: 630
#----------------------------------------------------------------------
##
### Commentary:
##
### Change Log:
#----------------------------------------------------------------------
##
### Code:
## * documentation
#' @title Adjust the p.values Using the Quantiles of the Max Statistic
#' @description Adjust the p.values using the quantiles of the max statistic.
#' @name calcDistMax
#'
#' @param statistic [numeric vector] the observed Wald statistic.
#' Each statistic correspond to a null hypothesis (i.e. a coefficient) that one wish to test.
#' @param iid [matrix] zero-mean iid decomposition of the coefficient used to compute the statistic.
#' @param df [numeric] the degree of freedom defining the multivariate Student's t distribution.
#' If \code{NULL} the multivariate Gaussian distribution will be used instead.
#' @param iid.previous [matrix, EXPERIMENTAL] zero-mean iid decomposition of previously tested coefficient.
#' @param quantile.compute [logical] should the rejection quantile be computed?
#' @param quantile.previous [numeric, EXPERIMENTAL] rejection quantiles of the previously tested hypotheses. If not \code{NULL} the values should correspond the variable in to the first column(s) of the argument \code{iid.previous}.
#' @param method [character] the method used to compute the p-values.
#' @param alpha [numeric 0-1] the significance cutoff for the p-values.
#' When the p-value is below, the corresponding link will be retained.
#' @param cpus [integer >0] the number of processors to use.
#' If greater than 1, the computation of the p-value relative to each test is performed in parallel.
#' @param cl [cluster] a parallel socket cluster generated by \code{parallel::makeCluster}
#' that has been registered using \code{registerDoParallel}.
#' @param n.sim [integer >0] the number of bootstrap simulations used to compute each p-values.
#' Disregarded when the p-values are computed using numerical integration.
#' @param n.repmax [integer >0] the maximum number of rejection for each bootstrap sample before switching to a new bootstrap sample.
#' Only relevant when conditioning on a previous test.
#' Disregarded when the p-values are computed using numerical integration.
#' @param trace [logical] should the execution of the function be traced?
#'
#' @return A list containing
#' \itemize{
#' \item p.adjust: the adjusted p-values.
#' \item z: the rejection threshold.
#' \item Sigma: the correlation matrix between the test statistic.
#' \item correctedLevel: the alpha level corrected for conditioning on previous tests.
#' }
#'
#' @examples
#' library(mvtnorm)
#'
#' set.seed(10)
#' n <- 100
#' p <- 4
#' link <- letters[1:p]
#' n.sim <- 1e3 # number of bootstrap simulations
#'
#' #### test - not conditional ####
#' X.iid <- rmvnorm(n, mean = rep(0,p), sigma = diag(1,p))
#' colnames(X.iid) <- link
#' statistic <- setNames(1:p,link)
#'
#'
#' r1 <- calcDistMaxIntegral(statistic = statistic, iid = X.iid,
#' trace = FALSE, alpha = 0.05, df = 1e6)
#'
#' r3 <- calcDistMaxBootstrap(statistic = statistic, iid = X.iid,
#' method = "residual",
#' trace = FALSE, alpha = 0.05, n.sim = n.sim)
#'
#' r4 <- calcDistMaxBootstrap(statistic = statistic, iid = X.iid,
#' method = "wild",
#' trace = FALSE, alpha = 0.05, n.sim = n.sim)
#'
#' rbind(integration = c(r1$p.adjust, quantile = r1$z),
#' bootResidual = c(r3$p.adjust, quantile = r3$z),
#' bootWild = c(r4$p.adjust, quantile = r4$z))
#'
#' #### test - conditional ####
#' \dontrun{
#' Z.iid <- rmvnorm(n, mean = rep(0,p+1), sigma = diag(1,p+1))
#' seqQuantile <- qmvnorm(p = 0.95, delta = rep(0,p+1), sigma = diag(1,p+1),
#' tail = "both.tails")$quantile
#'
#' r1c <- calcDistMaxIntegral(statistic = statistic, iid = X.iid,
#' iid.previous = Z.iid, quantile.previous = seqQuantile,
#' trace = FALSE, alpha = 0.05, df = NULL)
#'
#' r3c <- calcDistMaxBootstrap(statistic = statistic, iid = X.iid,
#' iid.previous = Z.iid, quantile.previous = seqQuantile, method = "residual",
#' trace = FALSE, alpha = 0.05, n.sim = n.sim)
#'
#' r4c <- calcDistMaxBootstrap(statistic = statistic, iid = X.iid,
#' iid.previous = Z.iid, quantile.previous = seqQuantile, method = "wild",
#' trace = FALSE, alpha = 0.05, n.sim = n.sim)
#'
#' rbind(integration = c(r1c$p.adjust, quantile = r1c$z),
#' bootResidual = c(r3c$p.adjust, quantile = r3c$z),
#' bootWild = c(r4c$p.adjust, quantile = r4c$z))
#' }
#' @concept modelsearch
#' @concept post-selection inference
## * calcDistMaxIntegral
#' @rdname calcDistMax
#' @export
calcDistMaxIntegral <- function(statistic, iid, df,
iid.previous = NULL, quantile.previous = NULL,
quantile.compute = lava.options()$search.calc.quantile.int,
alpha, cpus = 1, cl = NULL, trace){
## ** normalize arguments
p.iid <- NCOL(iid)
n <- NROW(iid)
conditional <- length(quantile.previous)
if(length(quantile.previous)>1){
stop("Can only condition on one previous step \n")
}
if(is.null(df)){
distribution.statistic <- "gaussian"
}else{
distribution.statistic <- "student"
}
iid.all <- cbind(iid,iid.previous)
index.new <- 1:NCOL(iid)
index.previous <- setdiff(1:NCOL(iid.all),index.new)
p.iid.all <- NCOL(iid.all)
## ** Compute the correlation matrix between the test statistics
# center to be under the null
# scale since we want the distribution of the Wald statistic (i.e. statistic with unit variance)
iid.statistic <- scale(iid.all, center = TRUE, scale = TRUE)
Sigma.statistic <- stats::cov(iid.statistic, use = "pairwise.complete.obs")
out <- list(p.adjust = NULL, z = NULL, Sigma = Sigma.statistic[index.new,index.new,drop=FALSE])
## ** Definition of the functions used to compute the quantiles
warperQ <- function(alpha){
.calcQmaxIntegration(alpha = alpha, p = p.iid,
Sigma = Sigma.statistic[index.new,index.new,drop=FALSE],
df = df, distribution = distribution.statistic)
}
warperP <- function(index){
.calcPmaxIntegration(statistic = statistic[index], p = p.iid,
Sigma = Sigma.statistic[index.new,index.new,drop=FALSE],
df = df, distribution = distribution.statistic)
}
## ** correction for conditioning on the previous steps
if(conditional==TRUE){
out$correctedLevel <- calcType1postSelection(1-alpha, quantile.previous = quantile.previous,
mu = rep(0,p.iid.all), Sigma = Sigma.statistic,
distribution = distribution.statistic,
df = df)
alpha <- 1-out$correctedLevel
}else{
out$correctedLevel <- NA
}
if(quantile.compute){
out$z <- warperQ(alpha)
}else{
out$z <- NA
}
## ** start parallel computation
init.cpus <- (cpus > 1 && is.null(cl))
if(init.cpus){
## define cluster
if(trace>0){
cl <- parallel::makeCluster(cpus, outfile = "")
}else{
cl <- parallel::makeCluster(cpus)
}
## link to foreach
doParallel::registerDoParallel(cl)
}
## ** Computation
if(trace > 0){ cat("Computation of multivariate student probabilities to adjust the p.values \n") }
if(cpus > 1){
## *** parallel computations
if(trace>0){
pb <- utils::txtProgressBar(max = length(index.new), style = 3)
}
## export package
parallel::clusterCall(cl, fun = function(x){
suppressPackageStartupMessages(requireNamespace("mvtnorm", quietly = TRUE))
})
value <- NULL # [:for CRAN check] foreach
out$p.adjust <- foreach::`%dopar%`(
foreach::foreach(value = 1:length(statistic),
.export = c(".calcPmaxIntegration"),
.combine = "list"),
{
if(trace>0){utils::setTxtProgressBar(pb, value)}
return(warperP(index.new[value]))
})
if(trace>0){close(pb)}
}else{
## *** sequential computations
if(trace>0){
out$p.adjust <- pbapply::pblapply(1:length(statistic), function(iStat){ warperP(index.new[iStat]) })
}else{
out$p.adjust <- lapply(1:length(statistic), function(iStat){ warperP(index.new[iStat]) })
}
}
out$error <- stats::setNames(unlist(lapply(out$p.adjust, function(x){attr(x,"error")})), names(statistic))
out$p.adjust <- stats::setNames(unlist(out$p.adjust), names(statistic))
## ** end parallel computation
if(init.cpus){
parallel::stopCluster(cl)
}
## ** export
return(out)
}
## * calcDistMaxBootstrap
#' @rdname calcDistMax
#' @export
calcDistMaxBootstrap <- function(statistic, iid, iid.previous = NULL, quantile.previous = NULL,
method, alpha, cpus = 1, cl = NULL, n.sim, trace, n.repmax = 100){
## ** normalize arguments
n <- NROW(iid)
conditional <- length(quantile.previous)>0
if(length(quantile.previous)>1){
stop("Can only condition on one previous step \n")
}
iid.all <- cbind(iid,iid.previous)
index.new <- 1:NCOL(iid)
index.previous <- setdiff(1:NCOL(iid.all),index.new)
## ** Function used for the simulations
warperBoot <- .bootMaxDist
## ** Compute the correlation matrix between the test statistics
# center to be under the null
# scale since we want the distribution of the Wald statistic (i.e. statistic with unit variance)
iid.statistic <- scale(iid.all, center = TRUE, scale = TRUE)
Sigma.statistic <- stats::cov(iid.statistic, use = "pairwise.complete.obs")
## ** start parallel computation
init.cpus <- (cpus > 1 && is.null(cl))
if(init.cpus){
## define cluster
if(trace>0){
cl <- parallel::makeCluster(cpus, outfile = "")
}else{
cl <- parallel::makeCluster(cpus)
}
## link to foreach
doParallel::registerDoParallel(cl)
}
## ** Computation
if(trace > 0){ cat("Bootsrap simulations to get the 95% quantile of the max statistic: ") }
if(cpus>1){
n.simCpus <- rep(round(n.sim/cpus),cpus)
n.simCpus[1] <- n.sim-sum(n.simCpus[-1])
i <- NULL # [:for CRAN check] foreach
distMax <- foreach::`%dopar%`(
foreach::foreach(i = 1:cpus, .packages = c("MASS"),
.export = "calcDistMax",
.combine = "c"),{
replicate(n.simCpus[i],
warperBoot(iid = iid.all, sigma = Sigma.statistic,
n = n, method = method,
index.new = index.new, index.previous = index.previous,
quantile.previous = quantile.previous, n.repmax = n.repmax))
})
}else{
if(trace>0){
distMax <- pbapply::pbsapply(1:n.sim, warperBoot, method = method,
iid = iid.all, sigma = Sigma.statistic, n = n,
index.new = index.new, index.previous = index.previous,
quantile.previous = quantile.previous, n.repmax = n.repmax)
}else{
distMax <- sapply(1:n.sim, warperBoot, method = method,
iid = iid.all, sigma = Sigma.statistic, n = n,
index.new = index.new, index.previous = index.previous,
quantile.previous = quantile.previous, n.repmax = n.repmax)
}
}
if(trace > 0){ cat("done \n") }
## ** end parallel calculation
if(init.cpus){
parallel::stopCluster(cl)
}
## ** export
out <- list()
out$z <- stats::quantile(distMax, probs = 1-alpha, na.rm = TRUE)
out$p.adjust <- sapply(abs(statistic), function(x){mean(distMax>x,na.rm=TRUE)})
out$Sigma <- Sigma.statistic
out$correctedLevel <- NA
return(out)
}
## * .calcQmaxIntegration: numerical integration to compute the critical threshold
.calcQmaxIntegration <- function(alpha, p, Sigma, df, distribution){
if(distribution == "gaussian"){
if(p==1){
q.alpha <- stats::qnorm(1-alpha, mean = 0, sd = 1)
}else{
q.alpha <- mvtnorm::qmvnorm(1-alpha,
mean = rep(0,p),
corr = Sigma,
tail = "both.tails")$quantile
}
}else if(distribution == "student"){
if(p==1){
q.alpha <- stats::qt(1-alpha, df = df)
}else{
q.alpha <- mvtnorm::qmvt(1-alpha,
delta = rep(0,p),
corr = Sigma,
df = df,
tail = "both.tails")$quantile
}
}
return(q.alpha)
}
## * .calcPmaxIntegration_firstStep: numerical integration to compute the p.values
.calcPmaxIntegration <- function(statistic, p, Sigma, df, distribution){
value <- abs(statistic)
if(!is.na(value)){
if(distribution == "gaussian"){
if(p==1){
out <- stats::pnorm(value, mean = 0, sd = Sigma)-stats::pnorm(-value, mean = 0, sd = Sigma)
attr(out, "error") <- 0
}else{
out <- mvtnorm::pmvnorm(lower = -value, upper = value,
mean = rep(0, p), corr = Sigma)
}
}else if(distribution == "student"){
if(p==1){
out <- stats::pt(value, df = df)-stats::pt(-value, df = df)
attr(out, "error") <- 0
}else{
out <- mvtnorm::pmvt(lower = -value, upper = value,
delta = rep(0, p), corr = Sigma, df = df)
}
}
return(1-out)
}else{
return(NA)
}
}
## * .bootMaxDist: bootstrap simulation
.bootMaxDist <- function(iid, sigma, n, method,
index.new, index.previous, quantile.previous, n.repmax,
...){
iRep <- 0
cv <- FALSE
while(iRep < n.repmax && cv == FALSE){
## ** resample to obtain a new influence function
if(method == "residual"){
iid.sim <- MASS::mvrnorm(n,rep(0,NCOL(sigma)),sigma)
}else if(method == "wild"){
e <- stats::rnorm(n,mean=0,sd=1)
iid.sim <- sapply(1:NCOL(sigma),function(x){e*iid[,x]})
}
if(!is.null(quantile.previous)){
iid.previous <- iid.sim[,index.previous]
test.previous <- apply(iid.previous,2,function(x){sqrt(n)*mean(x)/stats::sd(x)})
max.previous <- max(abs(test.previous))
if(max.previous<quantile.previous){
iRep <- iRep + 1
}else{
iid.sim <- iid.sim[,index.new]
cv <- TRUE
}
}else{
cv <- TRUE
}
}
## ** compute the bootstrap test statistic
if(cv){
Test <- apply(iid.sim,2,function(x){sqrt(n)*mean(x)/stats::sd(x)})
}else{
Test <- NA
}
return(max(abs(Test)))
}
#----------------------------------------------------------------------
### calcDistMax.R ends here
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.