Nothing
#---------------------------------------------------------------
# Adaptive P-Value Thresholding For Multiple Testing With Side Information
# Lihua Lei & William Fithian,
# "AdaPT: An interactive procedure for multiple testing with side information"
# Available from http://arxiv.org/abs/1609.06035
#---------------------------------------------------------------
fdp_hat <- function(A, R){
(1 + A) / pmax(1, R)
}
compute_lfdr_mix <- function(pvals, dist, params,
type = "over"){
pix <- pmax(params$pix, 1e-5)
mux <- params$mux
if (dist$family$family == "Gamma"){
mux <- pmax(mux, 1 + 1e-5)
}
type <- type[1]
if (type == "over"){
lfdr <- (pix * dist$h(1, mux) + 1 - pix) /
(pix * dist$h(pvals, mux) + 1 - pix)
} else if (type == "raw"){
lfdr <- (1 - pix) / (pix * dist$h(pvals, mux) + 1 - pix)
}
return(lfdr)
}
compute_threshold_mix <- function(dist, params, lfdr_lev,
type = "over"){
pix <- pmax(params$pix, 1e-5)
mux <- params$mux
if (dist$family$family == "Gamma"){
mux <- pmax(mux, 1 + 1e-5)
}
if (lfdr_lev == 0 || lfdr_lev == 1){
return(rep(lfdr_lev, length(pix)))
}
type <- type[1]
if (type == "over"){
val1 <- dist$h(1, mux) / lfdr_lev +
(1 - pix) / pix * (1 - lfdr_lev) / lfdr_lev
} else if (type == "raw"){
val1 <- (1 - pix) / pix * (1 - lfdr_lev) / lfdr_lev
}
val2 <- (log(val1) + dist$A(mux) - dist$A(dist$mustar)) /
(dist$eta(mux) - dist$eta(dist$mustar))
dist$ginv(val2)
}
create_stamps <- function(nmasks, nfits, nms){
fit_stamps <- c(seq(0, nmasks, floor(nmasks / nfits)))[1:nfits]
stamps <- data.frame(stamp = fit_stamps, type = "fit",
stringsAsFactors = FALSE)
if (!is.null(nms)){
ms_inds <- seq(1, nfits, floor(nfits / nms))[1:nms]
stamps[ms_inds, 2] <- "ms"
}
stamps <- rbind(stamps, data.frame(stamp = nmasks, type = "end"))
return(stamps)
}
check_pkgs <- function(models){
if (class(models) == "adapt_model"){
models <- list(models)
}
models_names <- sapply(models, function(model){model$name})
if ("gam" %in% models_names){
ind <- which(models_names == "gam")
tmp <- sapply(models, function(model){is.null(model$algo)})
if (any(tmp) && !requireNamespace("mgcv", quietly = TRUE)){
stop("'mgcv' package not found. Please install.")
}
}
if ("glmnet" %in% models_names){
ind <- which(models_names == "glmnet")
tmp <- sapply(models, function(model){is.null(model$algo)})
if (any(tmp) && !requireNamespace("glmnet", quietly = TRUE)){
stop("'glmnet' package not found. Please install.")
}
}
return(invisible())
}
#' Adaptive P-value Thresholding
#'
#' \code{adapt} is a framework allowing for arbitrary exponential families for computing E-steps and arbitrary algorithms for fitting M-steps.
#'
#' \code{x} should have a type compatible to the fitting functions in \code{models}. For GLM and GAM, \code{x} should be a data.frame. For glmnet, \code{x} should be a matrix.
#'
#' \code{models} could either be an \code{adapt_model} object, if a single model is used, or a list of \code{adapt_model} objects, each of which corresponding to a model. Each element should be generated by \code{\link{gen_adapt_model}}. For glm/gam/glmnet, one can use the shortcut by running \code{\link{gen_adapt_model}} with name = "glm" or "gam" or "glmnet" but without specifying \code{pifun}, \code{mufun}, \code{pifun_init} and \code{mufun_init}. See examples below.
#'
#' \code{nfits} is the number of model fitting steps plus \code{nms}, the model selection steps, if \code{models} contains multiple \code{adapt_model} objects. Suppose M is the number of masked p-values at the initial step, then the model is updated at the initial step and at every time when [M/\code{nfits}] more p-values are revealed. If \code{nms > 0}, model selection is performed at the initial step an at every time when [M/\code{nms}] more p-values are revealed. Between two consecutive model selection steps, the model selected from the last step is used for model fitting. For example, when M = 10000, nfits = 10 and nms = 2, model selection will be performed at the initial step and when 5000 p-values are revealed, while the model fitting will be performed when 1000, 2000, 3000, 4000, 6000, 7000, 8000, 9000 p-values are revealed.
#'
#' \code{verbose} has three elements: \code{print}, \code{fit} and \code{ms}. If \code{print = TRUE}, the progress of the main procedure is outputted to the console, in the form of "alpha = 0.05: FDPhat 0.0333, Number of Rej. 30" (where the numbers are made up for illustration). If \code{fit = TRUE}, a progress bar for the model fitting is outputted to the console. Similarly, if \code{ms = TRUE}, a progress bar for the model selection is outputted to the console.
#'
#' For ultra-large scale problems (n > 10^5), it is recommended to keep \code{alphas} short because the output \code{s} is of size n x \code{length(alphas)}.
#' is \code{length(alphas)}.
#'
#' The output \code{qvals} gives the q-values of each hypothesis. \code{qvals[i]} is defined as the minimum target FDR level such that \code{pvals[i]} is rejected. For hypotheses with p-values above s0, the q-values are set to be Inf because they are never rejected by AdaPT for whatever alpha.
#'
#' The output \code{order} gives the order of (the indices of) p-values being revealed, i.e. being in the region (s, 1-s). The latter hypotheses appeared in \code{order} have smaller q-values (i.e. are more likely to be rejected).
#'
#' @param x covariates (i.e. side-information). Should be compatible to \code{models}. See Details
#' @param pvals a vector of values in [0, 1]. P-values
#' @param models an object of class "\code{adapt_model}" or a list of objects of class "adapt_model". See Details
#' @param dist an object of class "\code{\link{gen_exp_family}}". \code{\link{beta_family}()} as default
#' @param s0 a vector of values in [0, 0.5). Initial threshold.
#' @param alphas a vector of values in (0, 1). Target FDR levels.
#' @param params0 a list in the form of list(pix = , mux = ). Initial guess of pi(x) and mu(x). NULL as default
#' @param nfits a positive integer. Number of model-fitting steps. See Details
#' @param nms a non-negative integer. Number of model selection steps. See Details
#' @param niter_fit a positive integer. Number of EM iterations in model fitting
#' @param tol a positive scalar. EM algorithm stops when pi(x) and mu(x) in consecutive steps differ by at most 'tol' for each element
#' @param niter_ms a positive integer. Number of EM iterations in model selection
#' @param cr a string. The criterion for model selection with BIC as default. Also support AIC, AICC and HIC
#' @param verbose a list of logical values in the form of list(print = , fit = , ms = ). Each element indicates whether the relevant information is outputted to the console. See Details
#'
#' @return
#' \item{nrejs}{a vector of integers. Number of rejections for each alpha}
#' \item{rejs}{a list of vector of integers. The set of indices of rejections for each alpha}
#' \item{s}{a matrix of size \code{length(pvals) X length(alphas)}. Threshold curves for each alpha}
#' \item{params}{a list. Each element is a list in the form of \code{list(pix = , mux = , alpha = , nmasks =)}, recording the parameter estimates, the achieved alpha and the number of masked p-values. To avoid massive storage cost, it only contains the information when a new target FDR level is achieved. As a result, it might be shorter than \code{nfits}.}
#' \item{qvals}{a vector of values in [0, 1]U{Inf}. Q-values. See Details}
#' \item{order}{a permutation of \code{1:length(pvals)}. Indices of hypotheses arranged in the order of reveal. See Details}
#' \item{alphas}{same as the input \code{alphas}}
#' \item{dist}{same as the input \code{dist}}
#' \item{models}{a list of \code{adapt_model} objects of length \code{params}. The model used in each fitting step. As in \code{params}, it only contains the model when a new target FDR level is achieved and each element corresponds to an element of \code{params}.}
#' \item{info}{a list of length \code{nfits}. Each element is a list recording extra information in each fitting step, e.g. degree of freedom (df) and variable importance (vi). As in \code{params}, it only contains the model information when a new target FDR level is achieved and each element corresponds to an element of \code{params}.}
#' \item{args}{a list including the other inputs \code{nfits}, \code{nms}, \code{niter_fit}, \code{niter_ms}, \code{tol}, \code{cr}}.
#'
#' @examples
#' \donttest{
#' # Load estrogen data
#' data(estrogen)
#' pvals <- as.numeric(estrogen$pvals)
#' x <- data.frame(x = as.numeric(estrogen$ord_high))
#' dist <- beta_family()
#'
#' # Subsample the data for convenience
#' inds <- (x$x <= 5000)
#' pvals <- pvals[inds]
#' x <- x[inds,,drop = FALSE]
#'
#' # Generate models for function adapt
#' library("splines")
#' formulas <- paste0("ns(x, df = ", 6:10, ")")
#' models <- lapply(formulas, function(formula){
#' piargs <- muargs <- list(formula = formula)
#' gen_adapt_model(name = "glm", piargs = piargs, muargs = muargs)
#' })
#'
#' # Run adapt
#' res <- adapt(x = x, pvals = pvals, models = models,
#' dist = dist, nfits = 10)
#' }
#'
#' @export
adapt <- function(x, pvals, models,
dist = beta_family(),
s0 = rep(0.45, length(pvals)),
alphas = seq(0.01, 1, 0.01),
params0 = list(pix = NULL, mux = NULL),
nfits = 20, nms = 1,
niter_fit = 10, tol = 1e-4,
niter_ms = 20, cr = "BIC",
verbose = list(print = TRUE, fit = FALSE, ms = TRUE)
){
Mstep_type <- "unweighted"
lfdr_type <- "over"
## Check if 'pvals' is a vector of values in [0, 1]
if (!is.numeric(pvals) || min(pvals) < 0 || max(pvals) > 1){
stop("Invalid p-values")
}
## Check if the size of 'x' matches that of 'pvals'
if (nrow(x) != length(pvals)){
stop("'x' must have the same rows as the length of 'pvals'")
}
## Check if 'dist' is of class 'exp_family'
if (class(dist)[1] != "exp_family"){
stop("\'dist\' must be of class \'exp_family\'.")
}
## Check if necessary packages are installed.
check_pkgs(models)
## When a single model is provided, set 'nms' to be NULL
if (class(models) == "adapt_model"){
model <- models
nms <- NULL
} else if (is.list(models)){
types <- sapply(models, class)
if (any(types != "adapt_model")){
stop("All models should be of class \'adapt_model\'.")
}
if (!is.integer(nms) || nms <= 0){
nms <- 1
} else if (nms > nfits){
warning("Model selection cannot be more frequent than model fitting. Set \'nms\' to \'nfits\'")
nms <- nfits
}
if (length(models) == 1){
model <- models[[1]]
nms <- NULL
}
}
## Create time stamps when model is fitted or model selection is performed
nmasks <- sum(pvals <= s0) + sum(pvals >= 1 - s0)
stamps <- create_stamps(nmasks, nfits, nms)
## Create root arguments to simplify fitting and model selection
fit_args_root <- list(
x = x, pvals = pvals, dist = dist,
niter = niter_fit, tol = tol,
verbose = verbose$fit, type = Mstep_type
)
if (any(stamps[, 2] == "ms")){
ms_args_root <- list(
x = x, pvals = pvals, dist = dist, models = models,
cr = cr, niter = niter_ms, tol = tol,
verbose = verbose$ms, type = Mstep_type
)
}
## Initialization
n <- length(pvals)
params <- params0
s <- s0
A <- sum(pvals >= 1 - s)
R <- sum(pvals <= s)
minfdp <- fdp_hat(A, R) # initial FDPhat
## Remove the alphas greater than the initial FDPhat, except the smallest one among them
alphas <- sort(alphas)
if (min(alphas) >= minfdp){
warning("Task completed! Initial \'s0\' has guaranteed FDR control for all alpha's in \'alphas\'.")
alphaind <- 0
} else if (max(alphas) < minfdp){
alphaind <- length(alphas)
} else {
alphaind <- max(which(alphas <= minfdp))
}
m <- length(alphas)
nrejs_return <- rep(0, m) # number of rejections
s_return <- matrix(0, n, m) # threshold
params_return <- list() # parameters (including pix and mux)
model_list <- list() # all selected models
info_list <- list() # other information (df, vi, etc.)
reveal_order <- which((pvals > s) & (pvals < 1 - s)) # the order to be revealed
if (length(reveal_order) > 0){
init_pvals <- pvals[reveal_order]
reveal_order <- reveal_order[order(init_pvals, decreasing = TRUE)]
}
fdp_return <- c(rep(Inf, length(reveal_order)), minfdp) # fdphat along the whole path
if (m > alphaind){
nrejs_return[(alphaind + 1):m] <- R
s_return[, (alphaind + 1):m] <- s0
}
## alphas <- alphas[1:m]
for (i in 1:(nrow(stamps) - 1)){
if (alphaind == 0){
if (verbose$print){
cat("Task completed!")
}
break
}
alpha <- alphas[alphaind]
# mask <- (pvals <= s) | (pvals >= 1 - s)
mask <- rep(TRUE, n)
mask[reveal_order] <- FALSE
nmasks <- sum(mask)
A <- sum(pvals >= 1 - s)
R <- sum(pvals <= s)
start <- stamps[i, 1]
end <- stamps[i + 1, 1]
nreveals <- min(nmasks, end - start) # number of hypotheses to be revealed
type <- stamps[i, 2] # type of model update
## Model selection or model fitting
if (type == "ms"){
ms_args <- c(
list(s = s, params0 = params),
ms_args_root
)
## Use "EM_mix_ms" from "EM-mix-ms.R"
ms_res <- do.call(EM_mix_ms, ms_args)
params <- ms_res$params
model <- ms_res$model
modinfo <- ms_res$info
} else if (type == "fit"){
fit_args <- c(
list(s = s, params0 = params, model = model),
fit_args_root
)
## Use "EM_mix" from "EM-mix.R"
fit_res <- do.call(EM_mix, fit_args)
params <- fit_res$params
modinfo <- fit_res$info
}
if (length(params_return) == 0 ||
alpha < params_return[[length(params_return)]]$alpha){
params <- c(params,
list(alpha = alpha, nmasks = nmasks))
params_return <- append(params_return, list(params))
model_list <- append(model_list, list(model))
info_list <- append(info_list, modinfo)
}
## Estimate local FDR
lfdr <- compute_lfdr_mix(
pmin(pvals, 1 - pvals),
dist, params, lfdr_type)
## Find the top "nreveals" hypotheses with highest lfdr
lfdr[!mask] <- -Inf
inds <- order(lfdr, decreasing = TRUE)[1:nreveals]
reveal_order <- c(reveal_order, inds)
## Shortcut to calculate FDPhat after revealing the hypotheses one by one
Adecre <- cumsum(pvals[inds] >= 1 - s[inds])
Rdecre <- cumsum(pvals[inds] <= s[inds])
fdp <- fdp_hat(A - Adecre, R - Rdecre)
fdp_return <- c(fdp_return, fdp)
fdp <- pmin(fdp, minfdp)
## Calculate the current minimum FDPhat
minfdp <- min(fdp)
while (alphaind > 0){# check if lower FDR level is achieved
alpha <- alphas[alphaind]
if (any(fdp <= alpha)){
breakpoint <- which(fdp <= alpha)[1]
lfdr_lev <- lfdr[inds[breakpoint]]
snew <- compute_threshold_mix(dist, params, lfdr_lev, lfdr_type)
snew <- pmin(s, snew)
## Sanity check to avoid rounding errors
tmp_pvals <- pvals[inds[1:breakpoint]]
tmp_pvals <- pmin(tmp_pvals, 1 - tmp_pvals)
tmp_inds <- which(tmp_pvals <= snew[inds[1:breakpoint]])
if (length(tmp_inds) > 0){
snew[inds[tmp_inds]] <- pmin(snew[inds[tmp_inds]], tmp_pvals[tmp_inds] - 1e-15)
}
s_return[, alphaind] <- snew
fdpnew <- fdp[breakpoint]
Rnew <- sum(pvals <= snew)
nrejs_return[alphaind] <- Rnew
if (verbose$print){
cat(paste0(
"alpha = ", alpha, ": FDPhat ",
round(fdpnew, 4), ", Number of Rej. ",
Rnew, "\n"))
}
alphaind <- alphaind - 1
} else {
break
}
}
if (alphaind == 0){ # check again to save computation
if (verbose$print){
cat("Task completed!\n")
}
break
}
## Update s(x)
final_lfdr_lev <- lfdr[tail(inds, 1)]
snew <- compute_threshold_mix(dist, params, final_lfdr_lev, lfdr_type)
s <- pmin(s, snew)
## Sanity check to avoid rounding errors
tmp_pvals <- pvals[inds]
tmp_pvals <- pmin(tmp_pvals, 1 - tmp_pvals)
tmp_inds <- which(tmp_pvals <= snew[inds])
if (length(tmp_inds) > 0){
s[inds[tmp_inds]] <- pmin(s[inds[tmp_inds]], tmp_pvals[tmp_inds] - 1e-15)
}
}
remain_inds <- (1:n)[-reveal_order]
if (length(remain_inds) > 0){
tmp_pvals <- pvals[remain_inds]
tmp_pvals <- pmin(tmp_pvals, 1 - tmp_pvals)
remain_reveal_order <- remain_inds[order(tmp_pvals, decreasing = TRUE)]
reveal_order <- c(reveal_order, remain_reveal_order)
fdp_return <- c(fdp_return, rep(minfdp, length(remain_inds)))
}
rejs_return <- apply(s_return, 2, function(s){which(pvals <= s)})
qvals <- rep(1, n)
qvals[reveal_order] <- ifelse(pvals[reveal_order] <= s0[reveal_order],
cummin(fdp_return[1:n]), rep(Inf, n))
args <- list(nfits = nfits, nms = nms,
niter_fit = niter_fit, niter_ms = niter_ms,
tol = tol, cr = cr)
res <- structure(
list(nrejs = nrejs_return,
rejs = rejs_return,
s = s_return,
params = params_return,
qvals = qvals,
order = reveal_order,
alphas = alphas,
dist = dist,
models = model_list,
info = info_list,
args = args),
class = "adapt")
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.