
#---------------------------------------------------------------
# Adaptive P-Value Thresholding For Multiple Testing With Side Information
# Lihua Lei & William Fithian,
#   "AdaPT: An interactive procedure for multiple testing with side information"
# Available from http://arxiv.org/abs/1609.06035
#---------------------------------------------------------------

fdp_hat <- function(A, R){
    (1 + A) / pmax(1, R)
}
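
## fdp_hat() implements the AdaPT estimator FDPhat = (1 + A) / max(1, R):
## R = #{p_i <= s(x_i)} counts candidate rejections and A = #{p_i >= 1 - s(x_i)}
## counts mirror p-values, so that 1 + A serves as a conservative estimate of
## the number of false rejections among the R candidates. A minimal
## illustrative sketch (wrapped in `if (FALSE)` so it never runs when this file
## is sourced; toy numbers only):
if (FALSE) {
    A <- 3   # #{p_i >= 1 - s(x_i)}: p-values in the mirror region
    R <- 60  # #{p_i <= s(x_i)}: candidate rejections
    fdp_hat(A, R)  # (1 + 3) / 60, about 0.067; AdaPT stops once this is <= alpha
}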


compute_lfdr_mix <- function(pvals, dist, params,
                             type = "over"){
    pix <- pmax(params$pix, 1e-5)
    mux <- params$mux
    if (dist$family$family == "Gamma"){
        mux <- pmax(mux, 1 + 1e-5)
    }
    type <- type[1]
    if (type == "over"){
        lfdr <- (pix * dist$h(1, mux) + 1 - pix) /
            (pix * dist$h(pvals, mux) + 1 - pix)
    } else if (type == "raw"){
        lfdr <- (1 - pix) / (pix * dist$h(pvals, mux) + 1 - pix)
    }
    return(lfdr)
}
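
## compute_lfdr_mix() evaluates the two-groups local fdr under the working
## mixture model f(p | x) = pix * h(p, mux) + (1 - pix), where h is the
## alternative density from 'dist'. The "over" variant replaces the null
## density 1 in the numerator by h(1, mux), which can only increase the lfdr
## relative to the "raw" version. A self-contained sketch, not run and not part
## of the package; dbeta(p, 1/mu, 1) is only a toy stand-in for dist$h, not
## necessarily the parametrization used by beta_family():
if (FALSE) {
    p   <- c(0.001, 0.01, 0.2, 0.6)
    pix <- rep(0.3, 4)                           # estimated pi(x)
    mux <- rep(2, 4)                             # estimated mu(x)
    h   <- function(p, mu) dbeta(p, 1 / mu, 1)   # toy alternative density
    lfdr_over <- (pix * h(1, mux) + 1 - pix) / (pix * h(p, mux) + 1 - pix)
    lfdr_raw  <- (1 - pix) / (pix * h(p, mux) + 1 - pix)
    cbind(p, lfdr_over, lfdr_raw)                # smaller p => smaller lfdr
}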

compute_threshold_mix <- function(dist, params, lfdr_lev,
                                  type = "over"){
    pix <- pmax(params$pix, 1e-5)
    mux <- params$mux
    if (dist$family$family == "Gamma"){
        mux <- pmax(mux, 1 + 1e-5)
    }
    if (lfdr_lev == 0 || lfdr_lev == 1){
        return(rep(lfdr_lev, length(pix)))
    }

    type <- type[1]
    if (type == "over"){
        val1 <- dist$h(1, mux) / lfdr_lev +
            (1 - pix) / pix * (1 - lfdr_lev) / lfdr_lev
    } else if (type == "raw"){
        val1 <- (1 - pix) / pix * (1 - lfdr_lev) / lfdr_lev
    }

    val2 <- (log(val1) + dist$A(mux) - dist$A(dist$mustar)) /
        (dist$eta(mux) - dist$eta(dist$mustar))
    dist$ginv(val2)
}
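
## compute_threshold_mix() inverts the "over" lfdr at a given level: it first
## solves for the alternative-density value val1 = h(p, mux) at which the lfdr
## equals lfdr_lev, then maps val1 back to a p-value via the exponential-family
## representation implied by 'dist' and the code above, namely
##   log h(p, mu) = (eta(mu) - eta(mustar)) * g(p) - (A(mu) - A(mustar)),
## so that p = ginv(val2). A minimal numerical cross-check, not run; it uses a
## toy decreasing density in place of dist$h and inverts the lfdr directly with
## uniroot() instead of the closed form:
if (FALSE) {
    pix <- 0.3; mux <- 2; lfdr_lev <- 0.25
    h <- function(p) dbeta(p, 1 / mux, 1)          # toy alternative density
    lfdr <- function(p) (pix * h(1) + 1 - pix) / (pix * h(p) + 1 - pix)
    ## the threshold is the p-value at which the "over" lfdr hits lfdr_lev
    s <- uniroot(function(p) lfdr(p) - lfdr_lev, c(1e-12, 1))$root
    c(threshold = s, lfdr_at_threshold = lfdr(s))  # second entry ~ 0.25
}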

create_stamps <- function(nmasks, nfits, nms){
    fit_stamps <- c(seq(0, nmasks, floor(nmasks / nfits)))[1:nfits]
    stamps <- data.frame(stamp = fit_stamps, type = "fit",
                         stringsAsFactors = FALSE)
    if (!is.null(nms)){
        ms_inds <- seq(1, nfits, floor(nfits / nms))[1:nms]
        stamps[ms_inds, 2] <- "ms"
    }
    stamps <- rbind(stamps, data.frame(stamp = nmasks, type = "end"))
    return(stamps)
}
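
## create_stamps() lays out the update schedule used in the main loop of
## adapt(): one "fit" stamp roughly every floor(nmasks / nfits) revealed
## p-values, with nms of these stamps upgraded to model-selection ("ms") steps,
## plus a final "end" stamp. A small illustration, not run, matching the
## example in the documentation of adapt() below:
if (FALSE) {
    create_stamps(nmasks = 10000, nfits = 10, nms = 2)
    ## "ms" stamps at 0 and 5000 revealed p-values, "fit" stamps at
    ## 1000, ..., 4000 and 6000, ..., 9000, and an "end" stamp at 10000.
}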

check_pkgs <- function(models){
    if (inherits(models, "adapt_model")){
        models <- list(models)
    }

    models_names <- sapply(models, function(model){model$name})

    if ("gam" %in% models_names){
        ind <- which(models_names == "gam")
        tmp <- sapply(models[ind], function(model){is.null(model$algo)})
        if (any(tmp) && !requireNamespace("mgcv", quietly = TRUE)){
            stop("'mgcv' package not found. Please install.")
        }
    }
    if ("glmnet" %in% models_names){
        ind <- which(models_names == "glmnet")
        tmp <- sapply(models[ind], function(model){is.null(model$algo)})
        if (any(tmp) && !requireNamespace("glmnet", quietly = TRUE)){
            stop("'glmnet' package not found. Please install.")
        }
    }
    return(invisible())
}

#' Adaptive P-value Thresholding
#'
#' \code{adapt} is a framework that allows arbitrary exponential families for computing the E-steps and arbitrary algorithms for fitting the M-steps.
#'
#' \code{x} should have a type compatible with the fitting functions in \code{models}. For GLM and GAM, \code{x} should be a data.frame. For glmnet, \code{x} should be a matrix.
#'
#' \code{models} can either be an \code{adapt_model} object, if a single model is used, or a list of \code{adapt_model} objects, each corresponding to a candidate model. Each element should be generated by \code{\link{gen_adapt_model}}. For glm/gam/glmnet, one can use the shortcut of calling \code{\link{gen_adapt_model}} with name = "glm", "gam" or "glmnet" but without specifying \code{pifun}, \code{mufun}, \code{pifun_init} and \code{mufun_init}. See examples below.
#'
#' \code{nfits} is the total number of model-updating steps, which includes the \code{nms} model selection steps when \code{models} contains multiple \code{adapt_model} objects. Suppose M is the number of masked p-values at the initial step; then the model is updated at the initial step and each time [M/\code{nfits}] more p-values are revealed. If \code{nms > 0}, model selection is performed at the initial step and each time [M/\code{nms}] more p-values are revealed. Between two consecutive model selection steps, the model selected at the last selection step is used for model fitting. For example, when M = 10000, nfits = 10 and nms = 2, model selection is performed at the initial step and when 5000 p-values have been revealed, while model fitting is performed when 1000, 2000, 3000, 4000, 6000, 7000, 8000 and 9000 p-values have been revealed.
#'
#' \code{verbose} has three elements: \code{print}, \code{fit} and \code{ms}. If \code{print = TRUE}, the progress of the main procedure is printed to the console, in the form of "alpha = 0.05: FDPhat 0.0333, Number of Rej. 30" (where the numbers are made up for illustration). If \code{fit = TRUE}, a progress bar for the model fitting is printed to the console. Similarly, if \code{ms = TRUE}, a progress bar for the model selection is printed to the console.
#'
#' For ultra-large scale problems (n > 10^5), it is recommended to keep \code{alphas} short, because the output \code{s} is a matrix of size n x \code{length(alphas)}.
#'
#' The output \code{qvals} gives the q-value of each hypothesis. \code{qvals[i]} is defined as the minimum target FDR level at which \code{pvals[i]} is rejected. For hypotheses with p-values above s0, the q-values are set to Inf because they are never rejected by AdaPT at any alpha.
#'
#' The output \code{order} gives the order in which (the indices of) the p-values are revealed, i.e. enter the region (s, 1-s). Hypotheses appearing later in \code{order} have smaller q-values (i.e. they are more likely to be rejected).
#'
#' @param x covariates (i.e. side information). Should be compatible with \code{models}. See Details
#' @param pvals a vector of values in [0, 1]. P-values
#' @param models an object of class "\code{adapt_model}" or a list of objects of class "adapt_model". See Details
#' @param dist an object of class "\code{exp_family}", as generated by \code{\link{gen_exp_family}}. \code{\link{beta_family}()} as default
#' @param s0 a vector of values in [0, 0.5). Initial threshold.
#' @param alphas a vector of values in (0, 1). Target FDR levels.
#' @param params0 a list in the form of list(pix = , mux = ). Initial guess of pi(x) and mu(x). NULL as default
#' @param nfits a positive integer. Number of model-fitting steps. See Details
#' @param nms a non-negative integer. Number of model selection steps. See Details
#' @param niter_fit a positive integer. Number of EM iterations in model fitting
#' @param tol a positive scalar. EM algorithm stops when pi(x) and mu(x) in consecutive steps differ by at most 'tol' for each element
#' @param niter_ms a positive integer. Number of EM iterations in model selection
#' @param cr a string. The criterion for model selection, with BIC as default. AIC, AICC and HIC are also supported
#' @param verbose a list of logical values in the form of list(print = , fit = , ms = ). Each element indicates whether the corresponding information is printed to the console. See Details
#'
#' @return
#' \item{nrejs}{a vector of integers. Number of rejections for each alpha}
#' \item{rejs}{a list of integer vectors. The set of indices of rejections for each alpha}
#' \item{s}{a matrix of size \code{length(pvals) X length(alphas)}. Threshold curves for each alpha}
#' \item{params}{a list. Each element is a list in the form of \code{list(pix = , mux = , alpha = , nmasks =)}, recording the parameter estimates, the achieved alpha and the number of masked p-values. To avoid massive storage cost, it only contains the information when a new target FDR level is achieved. As a result, it might be  shorter than \code{nfits}.}
#' \item{qvals}{a vector of values in [0, 1]U{Inf}. Q-values. See Details}
#' \item{order}{a permutation of \code{1:length(pvals)}. Indices of hypotheses arranged in the order of reveal. See Details}
#' \item{alphas}{same as the input \code{alphas}}
#' \item{dist}{same as the input \code{dist}}
#' \item{models}{a list of \code{adapt_model} objects of the same length as \code{params}. The model used in each fitting step. As in \code{params}, it only contains the model when a new target FDR level is achieved and each element corresponds to an element of \code{params}.}
#' \item{info}{a list of the same length as \code{params}. Each element is a list recording extra information from a fitting step, e.g. the degrees of freedom (df) and variable importance (vi). As in \code{params}, it only contains the information from steps at which a new target FDR level is achieved and each element corresponds to an element of \code{params}.}
#' \item{args}{a list including the other inputs \code{nfits}, \code{nms}, \code{niter_fit}, \code{niter_ms}, \code{tol}, \code{cr}}.
#'
#' @examples
#' \donttest{
#' # Load estrogen data
#' data(estrogen)
#' pvals <- as.numeric(estrogen$pvals)
#' x <- data.frame(x = as.numeric(estrogen$ord_high))
#' dist <- beta_family()
#'
#' # Subsample the data for convenience
#' inds <- (x$x <= 5000)
#' pvals <- pvals[inds]
#' x <- x[inds,,drop = FALSE]
#'
#' # Generate models for function adapt
#' library("splines")
#' formulas <- paste0("ns(x, df = ", 6:10, ")")
#' models <- lapply(formulas, function(formula){
#'     piargs <- muargs <- list(formula = formula)
#'     gen_adapt_model(name = "glm", piargs = piargs, muargs = muargs)
#' })
#'
#' # Run adapt
#' res <- adapt(x = x, pvals = pvals, models = models,
#'              dist = dist, nfits = 10)
#' }
#'
#' @export
adapt <- function(x, pvals, models,
                  dist = beta_family(),
                  s0 = rep(0.45, length(pvals)),
                  alphas = seq(0.01, 1, 0.01),
                  params0 = list(pix = NULL, mux = NULL),
                  nfits = 20, nms = 1,
                  niter_fit = 10, tol = 1e-4,
                  niter_ms = 20, cr = "BIC",
                  verbose = list(print = TRUE, fit = FALSE, ms = TRUE)
                  ){

    Mstep_type <- "unweighted"
    lfdr_type <- "over"

    ## Check if 'pvals' is a vector of values in [0, 1]
    if (!is.numeric(pvals) || min(pvals) < 0 || max(pvals) > 1){
        stop("Invalid p-values")
    }

    ## Check if the size of 'x' matches that of 'pvals'
    if (nrow(x) != length(pvals)){
        stop("'x' must have the same rows as the length of 'pvals'")
    }

    ## Check if 'dist' is of class 'exp_family'
    if (!inherits(dist, "exp_family")){
        stop("\'dist\' must be of class \'exp_family\'.")
    }

    ## Check if necessary packages are installed.
    check_pkgs(models)

    ## When a single model is provided, set 'nms' to be NULL
    if (inherits(models, "adapt_model")){
        model <- models
        nms <- NULL
    } else if (is.list(models)){
        if (!all(sapply(models, inherits, what = "adapt_model"))){
            stop("All models should be of class \'adapt_model\'.")
        }
        if (!is.numeric(nms) || nms <= 0){
            nms <- 1
        } else if (nms > nfits){
            warning("Model selection cannot be more frequent than model fitting. Set \'nms\' to \'nfits\'")
            nms <- nfits
        }
        if (length(models) == 1){
            model <- models[[1]]
            nms <- NULL
        }
    }

    ## Create time stamps when model is fitted or model selection is performed
    nmasks <- sum(pvals <= s0) + sum(pvals >= 1 - s0)
    stamps <- create_stamps(nmasks, nfits, nms)

    ## Create root arguments to simplify fitting and model selection
    fit_args_root <- list(
        x = x, pvals = pvals, dist = dist,
        niter = niter_fit, tol = tol,
        verbose = verbose$fit, type = Mstep_type
        )
    if (any(stamps[, 2] == "ms")){
        ms_args_root <- list(
            x = x, pvals = pvals, dist = dist, models = models,
            cr = cr, niter = niter_ms, tol = tol,
            verbose = verbose$ms, type = Mstep_type
            )
    }

    ## Initialization
    n <- length(pvals)
    params <- params0
    s <- s0
    A <- sum(pvals >= 1 - s)
    R <- sum(pvals <= s)
    minfdp <- fdp_hat(A, R) # initial FDPhat

    ## Remove the alphas greater than the initial FDPhat, except the smallest one among them
    alphas <- sort(alphas)
    if (min(alphas) >= minfdp){
        warning("Task completed! Initial \'s0\' has guaranteed FDR control for all alpha's in \'alphas\'.")
        alphaind <- 0
    } else if (max(alphas) < minfdp){
        alphaind <- length(alphas)
    } else {
        alphaind <- max(which(alphas <= minfdp))
    }

    m <- length(alphas)
    nrejs_return <- rep(0, m) # number of rejections
    s_return <- matrix(0, n, m) # threshold
    params_return <- list() # parameters (including pix and mux)
    model_list <- list() # all selected models
    info_list <- list() # other information (df, vi, etc.)
    reveal_order <- which((pvals > s) & (pvals < 1 - s)) # the order to be revealed
    if (length(reveal_order) > 0){
        init_pvals <- pvals[reveal_order]
        reveal_order <- reveal_order[order(init_pvals, decreasing = TRUE)]
    }
    fdp_return <- c(rep(Inf, length(reveal_order)), minfdp) # fdphat along the whole path

    if (m > alphaind){
        nrejs_return[(alphaind + 1):m] <- R
        s_return[, (alphaind + 1):m] <- s0
    }
    ## alphas <- alphas[1:m]

    for (i in 1:(nrow(stamps) - 1)){
        if (alphaind == 0){
            if (verbose$print){
                cat("Task completed!")
            }
            break
        }

        alpha <- alphas[alphaind]
        # mask <- (pvals <= s) | (pvals >= 1 - s)
        mask <- rep(TRUE, n)
        mask[reveal_order] <- FALSE
        nmasks <- sum(mask)
        A <- sum(pvals >= 1 - s)
        R <- sum(pvals <= s)
        start <- stamps[i, 1]
        end <- stamps[i + 1, 1]
        nreveals <- min(nmasks, end - start) # number of hypotheses to be revealed
        type <- stamps[i, 2] # type of model update

        ## Model selection or model fitting
        if (type == "ms"){
            ms_args <- c(
                list(s = s, params0 = params),
                ms_args_root
                )
            ## Use "EM_mix_ms" from "EM-mix-ms.R"
            ms_res <- do.call(EM_mix_ms, ms_args)
            params <- ms_res$params
            model <- ms_res$model
            modinfo <- ms_res$info
        } else if (type == "fit"){
            fit_args <- c(
                list(s = s, params0 = params, model = model),
                fit_args_root
                )
            ## Use "EM_mix" from "EM-mix.R"
            fit_res <- do.call(EM_mix, fit_args)
            params <- fit_res$params
            modinfo <- fit_res$info
        }

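        ## Record the parameter estimates, the selected model and its extra
        ## information only while working towards a new (smaller) target FDR
        ## level, to limit storage (see the 'params' entry of the return value).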
        if (length(params_return) == 0 ||
            alpha < params_return[[length(params_return)]]$alpha){
            params <- c(params,
                        list(alpha = alpha, nmasks = nmasks))
            params_return <- append(params_return, list(params))
            model_list <- append(model_list, list(model))
            info_list <- append(info_list, modinfo)
        }

        ## Estimate local FDR
        lfdr <- compute_lfdr_mix(
            pmin(pvals, 1 - pvals),
            dist, params, lfdr_type)
        ## Find the top "nreveals" hypotheses with highest lfdr
        lfdr[!mask] <- -Inf
        inds <- order(lfdr, decreasing = TRUE)[1:nreveals]
        reveal_order <- c(reveal_order, inds)
        ## Shortcut to calculate FDPhat after revealing the hypotheses one by one
        Adecre <- cumsum(pvals[inds] >= 1 - s[inds])
        Rdecre <- cumsum(pvals[inds] <= s[inds])
        fdp <- fdp_hat(A - Adecre, R - Rdecre)
        fdp_return <- c(fdp_return, fdp)
        fdp <- pmin(fdp, minfdp)
        ## Calculate the current minimum FDPhat
        minfdp <- min(fdp)

        while (alphaind > 0){# check if lower FDR level is achieved
            alpha <- alphas[alphaind]
            if (any(fdp <= alpha)){
                breakpoint <- which(fdp <= alpha)[1]
                lfdr_lev <- lfdr[inds[breakpoint]]
                snew <- compute_threshold_mix(dist, params, lfdr_lev, lfdr_type)
                snew <- pmin(s, snew)

                ## Sanity check to avoid rounding errors
                tmp_pvals <- pvals[inds[1:breakpoint]]
                tmp_pvals <- pmin(tmp_pvals, 1 - tmp_pvals)
                tmp_inds <- which(tmp_pvals <= snew[inds[1:breakpoint]])
                if (length(tmp_inds) > 0){
                    snew[inds[tmp_inds]] <- pmin(snew[inds[tmp_inds]], tmp_pvals[tmp_inds] - 1e-15)
                }

                s_return[, alphaind] <- snew

                fdpnew <- fdp[breakpoint]
                Rnew <- sum(pvals <= snew)
                nrejs_return[alphaind] <- Rnew
                if (verbose$print){
                    cat(paste0(
                        "alpha = ", alpha, ": FDPhat ",
                        round(fdpnew, 4), ", Number of Rej. ",
                        Rnew, "\n"))
                }

                alphaind <- alphaind - 1
            } else {
                break
            }
        }

        if (alphaind == 0){ # check again to save computation
            if (verbose$print){
                cat("Task completed!\n")
            }
            break
        }

        ## Update s(x)
        final_lfdr_lev <- lfdr[tail(inds, 1)]
        snew <- compute_threshold_mix(dist, params, final_lfdr_lev, lfdr_type)
        s <- pmin(s, snew)

        ## Sanity check to avoid rounding errors
        tmp_pvals <- pvals[inds]
        tmp_pvals <- pmin(tmp_pvals, 1 - tmp_pvals)
        tmp_inds <- which(tmp_pvals <= snew[inds])
        if (length(tmp_inds) > 0){
            s[inds[tmp_inds]] <- pmin(s[inds[tmp_inds]], tmp_pvals[tmp_inds] - 1e-15)
        }
    }

    remain_inds <- (1:n)[-reveal_order]
    if (length(remain_inds) > 0){
        tmp_pvals <- pvals[remain_inds]
        tmp_pvals <- pmin(tmp_pvals, 1 - tmp_pvals)
        remain_reveal_order <- remain_inds[order(tmp_pvals, decreasing = TRUE)]
        reveal_order <- c(reveal_order, remain_reveal_order)
        fdp_return <- c(fdp_return, rep(minfdp, length(remain_inds)))
    }

    rejs_return <- apply(s_return, 2, function(s){which(pvals <= s)})

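    ## Q-values: the running minimum of FDPhat along the reveal path for
    ## p-values no larger than the initial threshold s0 (the only ones AdaPT
    ## can ever reject), and Inf for the rest; see the Details section.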
    qvals <- rep(1, n)
    qvals[reveal_order] <- ifelse(pvals[reveal_order] <= s0[reveal_order],
                                  cummin(fdp_return[1:n]), rep(Inf, n))

    args <- list(nfits = nfits, nms = nms,
                 niter_fit = niter_fit, niter_ms = niter_ms,
                 tol = tol, cr = cr)

    res <- structure(
        list(nrejs = nrejs_return,
             rejs = rejs_return,
             s = s_return,
             params = params_return,
             qvals = qvals,
             order = reveal_order,
             alphas = alphas,
             dist = dist,
             models = model_list,
             info = info_list,
             args = args),
        class = "adapt")
    return(res)
}
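
## A small usage sketch, not run, continuing the example in the roxygen block
## above: rejecting the hypotheses with q-values below a target level should
## agree with the stored rejection set for that level, since the q-value of a
## hypothesis is the smallest alpha at which AdaPT rejects it.
if (FALSE) {
    res <- adapt(x = x, pvals = pvals, models = models,
                 dist = dist, nfits = 10)
    res$nrejs[res$alphas == 0.1]        # number of rejections at alpha = 0.1
    sum(res$qvals <= 0.1)               # should essentially match the line above
}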
