GSEABenchmarkeR: Reproducible GSEA Benchmarking

Documented in metaFC runDE runEA

#' Differential expression analysis for datasets of a compendium
#' 
#' This function applies selected methods for differential expression (DE)
#' analysis to selected datasets of an expression data compendium.
#' 
#' DE studies typically report a gene as differentially expressed if the
#' corresponding DE p-value, corrected for multiple testing, satisfies the
#' chosen significance level.  Enrichment methods that work directly on the
#' list of DE genes are then substantially influenced by the multiple testing
#' correction.
#' 
#' An example is the frequently used over-representation analysis (ORA), which
#' assesses the overlap between the DE genes and a gene set under study based
#' on the hypergeometric distribution (see Appendix A of the
#' \code{EnrichmentBrowser} vignette for an introduction).
#' 
#' ORA is inapplicable if there are few genes satisfying the significance
#' threshold, or if almost all genes are DE.
#' 
#' Using \code{padj.method="flexible"} accounts for these cases by applying
#' multiple testing correction in dependence on the degree of differential
#' expression:
#' 
#' \itemize{ \item the correction method from Benjamini and Hochberg (BH) is
#' applied if it renders >= 1\% and <= 25\% of all measured genes as DE, \item
#' the p-values are left unadjusted, if the BH correction results in < 1\% DE
#' genes, and \item the more stringent Bonferroni correction is applied, if the
#' BH correction results in > 25\% DE genes.  }
#' 
#' Note that resulting p-values should not be used for assessing the
#' statistical significance of DE genes within or between datasets.  They are
#' solely used to determine which genes are included in the analysis with ORA -
#' where the flexible correction ensures that the fraction of included genes is
#' roughly in the same order of magnitude across datasets.
#' 
#' Alternative stratgies could also be applied - such as taking a constant
#' number of genes for each dataset or excluding ORA methods in general from
#' the assessment.
#' 
#' @param exp.list Experiment list.  A \code{list} of datasets, each being of
#' class \code{\linkS4class{SummarizedExperiment}}.
#' @param de.method Differential expression method.  See documentation of
#' \code{\link{deAna}}.
#' @param padj.method Method for adjusting p-values to multiple testing.  For
#' available methods see the man page of the stats function \code{p.adjust}.
#' Defaults to 'flexible', which applies a dataset-specific correction
#' strategy. See details.
#' @param parallel Parallel computation mode.  An instance of class
#' \code{\linkS4class{BiocParallelParam}}.  See the vignette of the
#' \code{BiocParallel} package for switching between serial, multi-core, and
#' grid execution.  Defaults to \code{NULL}, which then uses the first element
#' of \code{BiocParallel::registered()} for execution.  If not changed by the
#' user, this accordingly defaults to multi-core execution on the local host.
#' @param alpha Statistical significance level. Defaults to 0.05.
#' @param beta Absolute log2 fold change cut-off. Defaults to 1 (2-fold).
#' @param out.dir Character.  Determines the output directory where DE results
#' for each dataset are written to.  Defaults to \code{NULL}, which then writes
#' to a subdir named 'de' in \code{tools::R_user_dir("GSEABenchmarkeR")}.
#' @param max.na Integer. Determines for which genes a meta fold change is 
#' computed. Per default, excludes genes for which the fold change is not 
#' annotated in >= 1/3 of the datasets in \code{exp.list}.
#' @param ...  Additional arguments passed to \code{EnrichmentBrowser::deAna}.
#' @return \code{runDE} returns \code{exp.list} with DE measures annotated to
#' the \code{\link{rowData}} slot of each dataset, \code{writeDE} writes to file,
#' and \code{plotDEDistribution} plots to a graphics device.
#' @author Ludwig Geistlinger <Ludwig.Geistlinger@@sph.cuny.edu>
#' @seealso \code{loadEData} to load a specified expression data compendium.
#' @examples
#' 
#'     # reading user-defined expression data from file
#'     data.dir <- system.file("extdata/myEData", package="GSEABenchmarkeR")
#'     edat <- loadEData(data.dir)
#' 
#'     # differential expression analysis
#'     edat <- runDE(edat)
#' 
#'     # visualization of per-dataset DE distribution
#'     plotDEDistribution(edat)
#' 
#'     # calculating meta fold changes across datasets 
#'     mfcs <- metaFC(edat, max.na=0) 
#' 
#'     # writing DE results to file
#'     out.dir <- tempdir()
#'     out.dir <- file.path(out.dir, "de")
#'     if(!file.exists(out.dir)) dir.create(out.dir)
#'  
#'     writeDE(edat, out.dir)    
#' 
#' @export runDE
runDE <- function(exp.list, 
    de.method=c("limma", "edgeR", "DESeq2"), 
    padj.method="flexible", parallel=NULL, ...)
{
    flex <- padj.method == "flexible"
    if(flex) padj.method <- "none"
    
    PCOL <- EnrichmentBrowser::configEBrowser("PVAL.COL")
    ADJP.COL <- EnrichmentBrowser::configEBrowser("ADJP.COL")
    .de <- function(i, ...)
    { 
        se <- EnrichmentBrowser::deAna(i, de.method=de.method, 
                                            padj.method=padj.method, ...)

        if(flex)
        {
            padj <- p.adjust(rowData(se)[,PCOL], method="BH")
            fracSigP <- mean(padj < 0.05, na.rm=TRUE)
            if(fracSigP > 0.25) 
                padj <- p.adjust(rowData(se)[,PCOL], method="bonf") 
            if(fracSigP > 0.01) rowData(se)[,ADJP.COL] <- padj
            else rowData(se)[,ADJP.COL] <- rowData(se)[,PCOL]
        }
        return(se)
    }
   
    exp.list <- .iter(exp.list, .de, ..., parallel=parallel)
    return(exp.list)
}

#' @rdname runDE
#' @export
metaFC <- function(exp.list, max.na=round(length(exp.list) / 3))
{
    all.names <- unique(unlist(lapply(exp.list, names)))
    glen <- length(all.names)

    # fold changes
    FC.COL <- EnrichmentBrowser::configEBrowser("FC.COL")
    fcs <- vapply(exp.list, function(d) 
                    rowData(d,use.names=TRUE)[all.names,FC.COL], 
                    numeric(glen))
    rownames(fcs) <- all.names

    # filter out genes not measured in >= 1/3 of datasets
    few.nas <- rowSums(t(apply(fcs, 1, is.na))) <= max.na
    fcs <- fcs[few.nas,]

    # weights ~ sample size
    weights <- vapply(exp.list, ncol, integer(1))
    weights <- sqrt(weights) 

    # weighted mean
    wfcs <- apply(fcs, 1, weighted.mean, w=weights, na.rm=TRUE)  
    wfcs <- wfcs[order(abs(wfcs), decreasing=TRUE)]
    return(wfcs)
}

#
# fraction of DE genes according to p-value and fold change threshold
#
.fractDE <- function(se, alpha=0.05, beta=1, freq=c("rel", "abs"))
{
    freq <- match.arg(freq)
    FC.COL <- EnrichmentBrowser::configEBrowser("FC.COL")
    ADJP.COL <- EnrichmentBrowser::configEBrowser("ADJP.COL")

    fract.p <- sum(rowData(se)[,ADJP.COL] < alpha)
    fract.fc <- sum(abs(rowData(se)[,FC.COL]) > beta)
    
    if(freq == "rel")
    { 
        fract.p <- fract.p / nrow(se) * 100
        fract.fc <- fract.fc / nrow(se) * 100
    }
    res <- c(fract.p, fract.fc)
    names(res) <- c("p", "fc")
    return(res)
}


#' Application of enrichment methods to multiple datasets
#' 
#' This function applies selected methods for enrichment analysis to selected
#' datasets of a compendium.
#' 
#' 
#' @param exp.list Experiment list.  A \code{list} of datasets, each being of
#' class \code{\linkS4class{SummarizedExperiment}}. In case of just one dataset
#' a single \code{\linkS4class{SummarizedExperiment}} is also allowed. 
#' See the documentation of \code{\link{sbea}} for required minimal annotations.
#' @param methods Methods for enrichment analysis.  This can be either \itemize{ 
#' \item a character vector with method names chosen from \code{\link{sbeaMethods}}
#' and \code{\link{nbeaMethods}}, 
#' \item a user-defined function implementing a method for enrichment analysis, or
#' \item a named list, containing pre-defined and/or user-defined enrichment methods.
#' See examples.}
#' @param gs Gene sets, i.e. a list of character vectors of gene IDs.
#' @param perm Number of permutations of the sample group assignments. 
#' Defaults to 1000. Can also be an integer vector matching
#' the length of \code{methods} to assign different numbers of permutations for
#' different methods.
#' @param parallel Parallel computation mode.  An instance of class
#' \code{\linkS4class{BiocParallelParam}}.  See the vignette of the
#' \code{BiocParallel} package for switching between serial, multi-core, and
#' grid execution.  Defaults to \code{NULL}, which then uses the first element
#' of \code{BiocParallel::registered()} for execution.  If not changed by the
#' user, this accordingly defaults to multi-core execution on the local host.
#' @param save2file Logical. Should results be saved to file for subsequent
#' benchmarking?  Defaults to \code{FALSE}.
#' @param out.dir Character.  Determines the output directory where results are
#' saved to.  Defaults to \code{NULL}, which then writes to
#' \code{tools::R_user_dir("GSEABenchmarkeR")} in case \code{save2file}
#' is set to \code{TRUE}.
#' @param ...  Additional arguments passed to the selected enrichment methods.
#' @return A list with an entry for each method applied.  Each method entry is
#' a list with an entry for each dataset analyzed.  Each dataset entry is a
#' list of length 2, with the first element being the runtime and the second
#' element being the gene set ranking, as obtained from applying the respective
#' method to the respective dataset.
#' @author Ludwig Geistlinger <Ludwig.Geistlinger@@sph.cuny.edu>
#' @seealso \code{\link{sbea}} and \code{\link{nbea}}
#' for carrying out set- and network-based enrichment analysis.
#' 
#' \code{\linkS4class{BiocParallelParam}} and \code{\link{register}} for
#' configuration of parallel computation.
#' @examples
#' 
#'     # loading three datasets from the GEO2KEGG compendium
#'     geo2kegg <- loadEData("geo2kegg", nr.datasets=3)
#' 
#'     # only considering the first 1000 probes for demonstration
#'     geo2kegg <- lapply(geo2kegg, function(d) d[1:1000,]) 
#'
#'     # preprocessing and DE analysis for two of the datasets
#'     geo2kegg <- maPreproc(geo2kegg[2:3])
#'     geo2kegg <- runDE(geo2kegg)
#' 
#'     # getting a subset of human KEGG gene sets
#'     gs.file <- system.file("extdata/hsa_kegg_gs.gmt", package="EnrichmentBrowser")
#'     kegg.gs <- EnrichmentBrowser::getGenesets(gs.file)
#' 
#'     # applying two methods to two datasets 
#'     res <- runEA(geo2kegg, methods=c("ora", "camera"), gs=kegg.gs, perm=0)
#' 
#'     # applying a user-defined enrichment method
#'     dummySBEA <- function(se, gs) 
#'     {
#'          sig.ps <- sample(seq(0, 0.05, length=1000), 5)
#'          nsig.ps <- sample(seq(0.1, 1, length=1000), length(gs)-5)
#'          ps <- sample(c(sig.ps, nsig.ps), length(gs))
#'          names(ps) <- names(gs)
#'          return(ps)
#'     }  
#'     res <- runEA(geo2kegg, methods=dummySBEA, gs=kegg.gs)
#'
#'     # applying a mix of pre-defined and user-defined methods
#'     methods <- list(camera = "camera", dummySBEA = dummySBEA)
#'     res <- runEA(geo2kegg, methods, gs=kegg.gs, perm=0)   
#'
#'
#' @export runEA
runEA <- function(exp.list, methods, gs, perm=1000,
    parallel=NULL, save2file=FALSE, out.dir=NULL, ...)
{
    # setup
    methods <- .methods2list(methods)
    nr.meth <- length(methods)
    .eaPkgs(methods)

    # singleton call?
    if(is(exp.list, "SummarizedExperiment"))
    {
        did <- metadata(exp.list)$dataId
        exp.list <- list(exp.list)
        names(exp.list) <- ifelse(is.null(did), "se", did)
    }

    show.progress <- interactive() && nr.meth > 2
    if(show.progress) pb <- txtProgressBar(0, nr.meth, style=3)

    if(length(perm) != nr.meth) perm <- rep(perm[1], nr.meth)
    names(perm) <- names(methods)
  
    for(i in seq_along(exp.list))
        if(is.null(metadata(exp.list[[i]])$dataId))
            metadata(exp.list[[i]])$dataId <- names(exp.list)[i]

    res <- lapply(names(methods), 
        function(m, ...)
        {
            if(show.progress) setTxtProgressBar(pb, match(m, names(methods)))
            .iter(exp.list, .ea, method=methods[m], perm=perm[m], ..., parallel=parallel)
        },
        gs=gs, save2file=save2file, out.dir=out.dir, ...
    )
   
    if(show.progress) close(pb) 
    names(res) <- names(methods)
    return(res)
}

.methods2list <- function(methods)
{
    if(is.function(methods)) methods <- list(method = methods)
    else if(is.character(methods)) names(methods) <- methods
    else if(is.list(methods))
    {
        if(is.null(names(methods))) 
            stop(paste("\'methods\' need to be given as",
                    "function, character vector, or named list"))
    }
    methods
}

# check on availability of ea packages
.eaPkgs <- function(ea.methods)
{
    ea.methods <- names(ea.methods)
    sbea.pkgs <- EnrichmentBrowser::configEBrowser("SBEA.PKGS")
    nbea.pkgs <- EnrichmentBrowser::configEBrowser("NBEA.PKGS")
    ea.pkgs <- c(sbea.pkgs, nbea.pkgs)
    for(m in ea.methods)
        if(m %in% names(ea.pkgs)) 
            EnrichmentBrowser::isAvailable(ea.pkgs[m], type="software")
}

# execute indiviual methods
.ea <- function(se, method, gs, perm=1000, save2file=FALSE, out.dir=NULL, ...)
{
    id <- metadata(se)$dataId
    res <- .execEA(method[[1]], se, gs, perm, ...)
    ti <- res$ti
    res <- res$res

    method <- names(method)
    if(is(res, "try-error"))
    {
        message(paste(method, "could not be evaluated on", id))
        message(res)
        message("Returning NULL")
        return(NULL)
    }

    res <- EnrichmentBrowser::gsRanking(res, signif.only=FALSE)
    if(save2file) .save2file(res, out.dir, method, id, ti[3])
    list(runtime = ti[3], ranking = res)
}

.execEA <- function(method, se, gs, perm, ...)
{
    f <- file()
    sink(file=f) ## silence

    # execute
    suppressMessages(
    if(is.function(method) || method %in% EnrichmentBrowser::sbeaMethods())
    {
        ti <- system.time(
            res <- try(
                EnrichmentBrowser::sbea(method, se, gs, perm=perm, ...), 
            silent=TRUE)
        )
    }
    else
    { 
        ti <- system.time(
            res <- try(
                EnrichmentBrowser::nbea(method, se, gs, perm=perm, ...),
            silent=TRUE)
        )
    })
    sink() ## undo silencing
    close(f)
    list(res = res, ti = ti)
}

.save2file <- function(res, out.dir, method, id, ti=NULL)
{
    if(is.null(out.dir)) 
            out.dir <- tools::R_user_dir("GSEABenchmarkeR")
    out.dir <- file.path(out.dir, method)
    if(!file.exists(out.dir)) dir.create(out.dir, recursive=TRUE)

    out.file <- file.path(out.dir, paste0(id, ".rds"))
    saveRDS(res, file=out.file)
    if(!is.null(ti)) cat(ti, file=sub("rds$", "txt", out.file))
}