multiGSEA: A unified interface to a plethora of gene set enrichment analysis methods

Documented in geneSetDb geneSetsStats logFC p.matrix result result.MultiGSEAResult resultNames results tabulateResults

#' Makes MultiGSEAResult objects compatible with old GNE multiGSEA Explorer App
#'
#' This is not documented or exported on purpose and will be removed in the near
#' future without warning, so: don't use it.
#' @noRd
downgradeObject <- function(x) {
  stopifnot(is(x, 'MultiGSEAResult'))
  x@results <- sapply(names(x@results), function(res) {
    out <- result(x, res, as.dt=TRUE)
    setattr(out, 'rawresult', FALSE)
  }, simplify=FALSE)
  x
}

#' Combines two MultiGSEAResult objects together.
#'
#' This would be useful when you want to add a GSEA result to an already
#' existing one. `append` would be more appropriate, but ...
#'
#' When would you want to do that? Imagine a shiny app that drives multiGSEA.
#' You might want to present the results of each analysis as they come "online",
#' so you would run them independently and make them available to the user
#' immediately after they each finish (ie. in combination with the promises
#' package).
#'
#' @param x A `MultiGSEAResult` object
#' @param y A `MultiGSEAResult` object
#' @param rename.x A named vector that used to match resultNames(x) and remane
#'   them to something different. `names(rename.x)` should match whatever you
#'   want to change in `resultNames(x)`, and the values are the new names of
#'   the result.
#' @param rename.y Same as `rename.x`, but for the results in `y`.
#' @param ... more things
#'
#' @importMethodsFrom BiocGenerics combine
#' @exportMethod combine
#' @return A combined `MultiGSEAResult` object
#' @examples
#' mg1 <- exampleMultiGSEAResult()
#' mg2 <- exampleMultiGSEAResult()
#' mgc <- combine(mg1, mg2)
setMethod("combine", c(x = "MultiGSEAResult", y = "MultiGSEAResult"),
function(x, y, rename.x = NULL, rename.y = NULL, ...) {
  if (!isTRUE(all.equal(geneSetDb(x), geneSetDb(y)))) {
    stop("Can't combine MultiGSEAResults with different internal GeneSetDb's")
  }
  if (!isTRUE(all.equal(logFC(x), logFC(y)))) {
    stop("Can't combine MultiGSEAResults with different internal logFC stats")
  }

  xnames <- .replace(resultNames(x), rename.x)
  ynames <- .replace(resultNames(y), rename.y)
  rnames <- make.unique(c(xnames, ynames))

  out <- x
  out@results <- setNames(c(x@results, y@results), rnames)
  out
})

#' Fetches the GeneSetDb from MultiGSEAResult
#'
#' @export
#'
#' @rdname MultiGSEAResult-utilities
#' @param x \code{MultiGSEAResult}
#' @return The \code{GeneSetDb}
#'
#' @examples
#' vm <- exampleExpressionSet(do.voom=TRUE)
#' gdb <- exampleGeneSetDb()
#' mg <- multiGSEA(gdb, vm, vm$design, 'tumor', methods=NULL)
#' geneSetDb(mg)
geneSetDb <- function(x) {
  stopifnot(is(x, 'MultiGSEAResult'))
  x@gsd
}

#' @rdname geneSet
setMethod("geneSet", c(x="MultiGSEAResult"),
function(x, i, j, active.only=TRUE, with.feature.map=FALSE, ...,
         collection = NULL, name = NULL, as.dt=FALSE) {
  if (!isTRUE(active.only)) {
    warning("active.only set to TRUE for geneSet,MultiGSEAResult")
    active.only <- TRUE
  }
  gdb <- geneSetDb(x)
  gs <- geneSet(gdb, i, j, active.only=TRUE, with.feature.map=with.feature.map,
                ..., collection = collection, name = name, as.dt=TRUE)
  lfc <- logFC(x, as.dt=TRUE)[list(gs$feature_id), on='feature_id']
  stopifnot(all(lfc$feature_id == gs$feature_id))
  lfc$feature_id <- NULL
  # Columns that appear in the geneset and also appear in the `logFC` data.frame
  # will be renamed to avoid collision. The duplicated columns used to be
  # removed with a preference to keep the columns in `gs`, however we might
  # have a `logFC` column in the geneset database, which would then override
  # the `logFC` column from the differential expression analysis.
  rename <- intersect(colnames(lfc), colnames(gs))
  if (length(rename)) setnames(gs, rename, paste0(rename, ".gs"))
  out <- cbind(gs, lfc)
  if (!as.dt) setDF(out)
  out
})

setMethod("geneSets", c(x="MultiGSEAResult"),
function(x, ..., as.dt=FALSE) {
  geneSets(geneSetDb(x), active.only=TRUE, as.dt=as.dt)
})

# Here's your chance to vectorize this. Once that's done push this code down
# into geneSetUrl,GeneSetDb
setMethod("geneSetURL", c(x="MultiGSEAResult"), function(x, i, j, ...) {
  geneSetURL(geneSetDb(x), i, j, ...)
})

setMethod("geneSetCollectionURLfunction", "MultiGSEAResult",
function(x, i, ...) {
  geneSetCollectionURLfunction(geneSetDb(x), i, ...)
})

#' @rdname featureIds
setMethod("featureIds", c(x="MultiGSEAResult"),
function(x, i, j, value=c('feature_id', 'x.id', 'x.idx'),
         active.only=TRUE, ...) {
  value <- match.arg(value)
  if (!isTRUE(active.only)) {
    warning("The featureIds() accessor for a MultiGSEAResult enforces ",
            "`active.only` to be set to TRUE")
  }
  featureIds(geneSetDb(x), i, j, value=value, active.only=TRUE, ...)
})

#' Summarizes useful statistics per gene set from a MultiGSEAResult
#'
#' This function calculates the number of genes that move up/down for the
#' given contrasts, as well as mean and trimmed mean of the logFC and
#' t-statistics. Note that the statistics calculated and returned here are
#' purely a function of the statistics generated at the gene-level stage
#' of the analysis.
#'
#' @export
#'
#' @param x A `MultiGSEAResult` object
#' @param feature.min.logFC used with `feature.max.padj` to identify
#'   the individual features that are to be considered differentially
#'   expressed.
#' @param feature.max.padj used with `feature.min.logFC` to identify
#'   the individual features that are to be considered differentially
#'   expressed.
#' @param trim The amount to trim when calculated trimmed `t` and
#'   `logFC` statistics for each geneset.
#'
#' @return A data.table with statistics at the gene set level across the
#'   prescribed contrast run on `x`. These statistics are independent
#'   of any particular GSEA method, but rather summarize aggregate shifts
#'   of the gene sets individual features. The columns included in the output
#'   are summarized below:
#'
#' * `n.sig`: The number of individual features whose `abs(logFC)` and padj
#'    thersholds satisfy the criteria of the `feature.min.logFC` and
#'    `feature.max.padj` parameters of the original [multiGSEA()] call
#' * `n.neutral`: The number of individual features whose abs(logFC) and padj
#'    thersholds do not satisfy the `feature.*` criteria named above.
#' * `n.up, n.down`: The number of individual features with `logFC > 0` or
#'   `logFC < 0`, respectively, irrespective of the `feature.*` thresholds
#'    referenced above.
#' * `n.sig.up, n.sig.down`: The number of individual features that pass the
#'   `feature.*` thresholds and have logFC > 0 or logFC < 0, respectively.
#' * `mean.logFC, mean.logFC.trim`: The mean (or trimmed mean) of the individual
#'   logFC estimates for the features in the gene set. The amount of trim is
#'   specified in the `trim` parameter of the [multiGSEA()] call.
#' * `mean.t, mean.t.trim`: The mean (or trimmed mean) of the individual
#'   t-statistics for the features in the gene sets. These are `NA` if the input
#'   expression object was a `DGEList`.
#'
#' @examples
#' vm <- exampleExpressionSet(do.voom=TRUE)
#' gdb <- exampleGeneSetDb()
#' mg <- multiGSEA(gdb, vm, vm$design, 'tumor')
#' head(geneSetsStats(mg))
geneSetsStats <- function(x, feature.min.logFC=1, feature.max.padj=0.10,
                          trim=0.10, reannotate.significance = FALSE,
                          as.dt=FALSE) {
  stopifnot(is(x, 'MultiGSEAResult'))
  lfc <- logFC(x, as.dt=TRUE)

  # reannotate.significance was added to better accommodate annotations that
  # were passed in the xmeta. data.frame through the multiGSEA call. You
  # should revisit this when you revisit how data.frame input support superseds
  # the xmeta. hacks we have in place now.
  annotate.lfc <- !all(c('significant', 'direction') %in% names(lfc))
  annotate.lfc <- annotate.lfc || reannotate.significance

  is.ttest <- "logFC" %in% names(lfc)
  if (annotate.lfc) {
    lfc <- within(lfc, {
      if (is.ttest) {
        significant <- abs(logFC) >= feature.min.logFC &
          !is.na(padj) &
          padj <= feature.max.padj
        direction <- ifelse(logFC > 0, 'up', 'down')
      } else {
        # This is an ANOVA
        significant <- !is.na(padj) & padj <= feature.max.padj
        direction <- rep("ambiguous", length(padj))
      }
    })
  }

  gs <- geneSets(x, as.dt=TRUE)
  do.by <- key(gs)

  out <- gs[, {
    fids <- featureIds(x, .BY[[1L]], .BY[[2L]])
    stats <- lfc[fids, on='feature_id']
    up <- stats$direction == 'up'
    down <- stats$direction == "down"
    is.sig <- stats$significant
    t.nona <- stats$t[!is.na(stats$t)]
    if (is.ttest) {
      mean.logFC <- mean(stats$logFC, na.rm=TRUE)
      mean.logFC.trim <- mean(stats$logFC, na.rm=TRUE, trim=trim)
      mean.t <- mean(stats$t, na.rm=TRUE)
      mean.t.trim <- mean(stats$t, na.rm=TRUE, trim=trim)
    } else {
      mean.logFC <- rep(NA_real_, .N)
      mean.logFC.trim <- rep(NA_real_, .N)
      mean.t <- rep(NA_real_, .N)
      mean.t.trim <- rep(NA_real_, .N)
    }
    list(n.sig=sum(is.sig), n.neutral=sum(!is.sig),
         n.up=sum(up), n.down=sum(down),
         n.sig.up=sum(up & is.sig), n.sig.down=(sum(down & is.sig)),
         mean.logFC=mean.logFC,
         mean.logFC.trim=mean.logFC.trim,
         mean.t=mean.t,
         mean.t.trim=mean.t.trim)
  }, by=do.by]
  setkeyv(out, do.by)

  if (!as.dt) setDF(out)
  out
}

#' Extract the individual fold changes statistics for elements in the
#' expression object.
#'
#' @export
#' @param x A [MultiGSEAResult()]
#' @template asdt-param
#' @return The log fold change `data.table``
#'
#' @examples
#' vm <- exampleExpressionSet(do.voom=TRUE)
#' gdb <- exampleGeneSetDb()
#' mg <- multiGSEA(gdb, vm, vm$design, 'tumor', methods=NULL)
#' lfc <- logFC(mg)
logFC <- function(x, as.dt=FALSE) {
  stopifnot(is(x, 'MultiGSEAResult'))
  out <- x@logFC
  if (!as.dt) out <- setDF(copy(out))
  out
}

#' Helper funtion: returns method names that were not run on a MultiGSEAResult
#'
#' @noRd
invalidMethods <- function(x, names, as.error=FALSE) {
  stopifnot(is(x, 'MultiGSEAResult'))
  stopifnot(is.character(names))
  if (length(names) == 0 && FALSE) {
    warning("No mehod `names` passed to invalidMethods", immediate.=TRUE)
  }
  bad.names <- setdiff(names, resultNames(x))
  if (length(bad.names) && as.error) {
    stop("Illegal result names queried: ", paste(bad.names, collapse=','))
  }
  bad.names
}

#' Interrogate the results of a multiGSEA analysis stored in a MultiGSEAResult
#'
#' @description
#' The `resultNames`, `result`, and `results` functions enable
#' you to explore the results of the analysis run with \code{\link{multiGSEA}}.
#'
#' The results that are stored within a `MultiGSEAResult` object have a
#' more or less 1:1 mapping with the values passed as `methods`, parameter
#' of the [multiGSEA()] call.
#'
#' @details
#' The product of an indivdual GSEA is consumed by the corresponding
#' `do.<METHOD>` function and converted into a data.table of results that
#' is internally stored.
#'
#' Use the `resultNames()` function to identify which results are available
#' for interrogation. The `result()` function returns the statistics of
#' one individual result, and the `results()` function combines the results
#' from the specified methods into an arbitrarily wide data.table with
#' `method`-suffixed column names.
#'
#' Use the `tabulateResults()` function to create a summary table that
#' tallies the number of significant genesets per collection, per method at
#' the specified FDR thresholds.
#'
#' @export
#' @rdname results
#'
#' @examples
#' ## Refer to the examples in ?multiGSEA
resultNames <- function(x) {
  stopifnot(is(x, 'MultiGSEAResult'))
  names(x@results)
}

#' @export
#' @rdname results
result <- function(x, ...) {
  UseMethod("result", x)
}

#' @export
#' @rdname results
#' @param x MultiGSEAResult
#' @param name the names of the results desired
#' @param stats.only logical, set to `FALSE` if you want to return all
#'   (column-wise) data for each result. By default only the pvalues,
#'   adjusted pvalues, and rank are returned.
#' @param rank.by the statistic to use to append a `rank` column for the
#'   geneset result. By default we rank by pvalue calculated by the GSEA
#'   method. You can rank the results based on the trimmed mean of the logFC's
#'   calculated for all of the features in the geneset (`"logFC"`), or the
#'   trimmed t-statistics of the these features (`"t"`).
#' @param add.suffix If `TRUE`, adds `.name` as a suffix to the
#'   columns of the `method`-specific statistics returned, ie. the
#'   `pval` column from the `"camera"` result will be turned to
#'   `pval.camera`.
#'
#' @return a data.table with the results from the requested method.
result.MultiGSEAResult <- function(x, name = NULL, stats.only=FALSE,
                                   rank.by=c('pval', 't', 'logFC'),
                                   add.suffix=FALSE, as.dt=FALSE, ...) {
  stopifnot(is(x, 'MultiGSEAResult'))
  if (is.null(resultNames(x)) || length(resultNames(x)) == 0) {
    return(results(x, name, as.dt=as.dt))
  }
  if (is.null(name)) name <- resultNames(x)[1L]
  ## no methods run?
  stopifnot(isSingleCharacter(name))
  invalidMethods(x, name, as.error=TRUE)
  stopifnot(isSingleLogical(stats.only))
  rank.by <- match.arg(rank.by)
  stopifnot(isSingleLogical(add.suffix))

  gs <- geneSets(x, as.dt=TRUE)
  base.colnames <- colnames(gs)
  out <- copy(gs)

  res <- local({
    # The <name> of the GSEA methods should not have a "." in them. If there
    # is a ".", we take this to mean (for now) some modification of the
    # method named up to that dot, for instatnce "goseq.up" is "goseq" method.
    # We can have other "."s appear in the future. For instance, someone might
    # want to run camera twice with different values for inter.gene.cor. In
    # this case, one "name" could be "camera.igc01" or something. This part
    # isn't formal, and will be more formalized (and changed) in a future
    # release.
    fnname <- sub("\\..*$", "", name)
    fnname <- paste0('mgres.', fnname)
    fnfetch <- getFunction(fnname)
    r <- fnfetch(x@results[[name]], geneSetDb(x))
    kosher <- isTRUE(
      all.equal(
        out[, key(out), with = FALSE],
        r[, key(out), with = FALSE],
        check.attributes = FALSE))
    if (!isTRUE(kosher)) {
      stop("Unexpected geneset ordering in `", name, "` result")
    }
    ## All mgres.* function must return a pval and padj column
    if (!all(c('pval', 'padj') %in% colnames(r))) {
      stop(fnname, " does not provide pval and/or padj column")
    }

    if (stats.only) {
      ## Select any column that starts with pval or padj
      res.cols <- names(r)[grepl('^(pval\\.?|padj\\.?)', names(r))]
    } else {
      res.cols <- setdiff(names(r), names(out))
    }
    r <- r[, res.cols, with=FALSE]
    r
  })

  for (col in names(res)) {
    out[, (col) := res[[col]]]
  }

  missing.pvals <- is.na(out[['pval']])
  n.missing <- sum(missing.pvals)
  if (any(missing.pvals)) {
    msg <- sprintf("%d missing pvalues for active genesets from %s",
                   n.missing, name)
    warning(msg, immediate.=TRUE)
  }

  out[, padj.by.collection := p.adjust(pval, 'BH'), by='collection']

  ranks <- switch(rank.by,
                  logFC=rank(-abs(out$mean.logFC.trim), ties.method="min"),
                  t=rank(-abs(out$mean.t.trim), ties.method="min"),
                  pval=rank(out[['pval']], ties.method="min"))
  out[, rank := ranks]

  if (add.suffix) {
    rename.cols <- setdiff(colnames(out), base.colnames)
    setnames(out, rename.cols, paste(rename.cols, name, sep='.'))
  }

  # let's get the pvalue related columns up front
  upfront <- c(
    "collection", "name", "active", "N", "n",
    names(out)[grepl('^(pval\\.?|padj\\.?)', names(out))])
  upfront <- intersect(upfront, names(out))
  setcolorder(out, upfront)

  if (!as.dt) setDF(out)
  out
}

#' @export
#' @rdname results
#'
#' @param names The results you want to cbind together for the output, will
#'   default to all of them.
results <- function(x, names=resultNames(x), stats.only=TRUE,
                    rank.by=c('pval', 'logFC', 't'),
                    add.suffix=length(names) > 1L, as.dt=FALSE) {
  stopifnot(is(x, 'MultiGSEAResult'))
  if (is.null(resultNames(x)) || length(resultNames(x)) == 0L) {
    ## No methods were run, you can only return geneset stats
    if (!is.null(names) || length(names) > 0L) {
      ## User is asking for something that is not there
      stop("No GSEA methods were run, you can only get set statistics")
    }
    warning("No GSEA methods were run, only geneset statistics have ",
            "been returned.", immediate.=TRUE)
    out <- geneSets(x, as.dt=as.dt)
  } else {
    invalidMethods(x, names)
    stopifnot(isSingleLogical(stats.only))
    rank.by <- match.arg(rank.by)
    if (length(names) > 1L && add.suffix == FALSE) {
      add.suffix <- TRUE
      warning("Forcing method suffix to generated result columns",
              immediate.=TRUE)
    }

    ## Idiomatic data.table, non-idiomatic R
    out <- copy(geneSets(x, as.dt=TRUE))
    for (name in names) {
      res <- result(x, name, stats.only, rank.by, add.suffix, as.dt=TRUE)
      for (col in setdiff(names(res), names(out))) {
        out[, (col) := res[[col]]]
      }
    }

    # let's get the pvalue related columns up front
    upfront <- c(
      "collection", "name", "active", "N", "n",
      names(out)[grepl('^(pval\\.?|padj\\.?)', names(out))])
    upfront <- intersect(upfront, names(out))
    setcolorder(out, upfront)

    if (!as.dt) setDF(out)
  }
  out
}

#' Summary of geneset level results at a specified FDR
#'
#' Generates a table to indicate the number of genesets per collection that
#' pass a given FDR. The table provides separate groups of rows for each of
#' the `methods` run in the [multiGSEA()] call that generated that
#' generated `x`.
#'
#' @export
#' @rdname results
#'
#' @param x A [MultiGSEAResult()] object.
#' @param names the names of the GSEA methods to be reported. By default,
#'   this function will display results for all methods.
#' @param max.p The maximum padj value to consider a result significant
#' @param p.col use padj or padj.by.collection?
#'
#' @return a data.table that summarizes the significant results per method
#'   per collection for the GSEA that was run
tabulateResults <- function(x, names=resultNames(x), max.p=0.20,
                            p.col=c('padj', 'padj.by.collection', 'pval'),
                            as.dt=FALSE) {
  stopifnot(is(x, 'MultiGSEAResult'))
  invalidMethods(x, names)
  stopifnot(isSingleNumeric(max.p))
  p.col <- match.arg(p.col)
  res <- lapply(names, function(wut) {
    r <- result(x, wut, as.dt=TRUE)
    ## some results (like goseq) don't have just "padj" or "pval" columns,
    ## because it has pval.over and pval.under, so let's just grab the first
    ## pval or padj "hit"
    idx <- grep(p.col, names(r))[1L]
    if (is.na(idx)) {
      r$pcol <- rep(NA, nrow(r))
    } else {
      r$pcol <- r[[idx]]
    }
    ## r$pcol <- r[[p.col]]
    pcol <- NULL # silence R CMD check NOTEs
    r[, {
      list(method=wut, geneset_count=length(pcol),
           sig_count=sum(pcol <= max.p, na.rm=TRUE),
           sig_up=sum(pcol <= max.p & mean.logFC.trim > 0, na.rm=TRUE),
           sig_down=sum(pcol <= max.p & mean.logFC.trim < 0, na.rm=TRUE))
    }, by='collection']
  })
  out <- rbindlist(res)
  if (!as.dt) setDF(out)
  out
}


setMethod("show", "MultiGSEAResult", function(object) {
  msg <- paste("multiGSEA result (max FDR by collection set to 20%)",
               "---------------------------------------------------", sep='\n')
  cat(msg, "\n")
  if (length(resultNames(object)) == 0) {
    cat("No GSEA methods were run, only geneset level statistics calculated")
  } else {
    base::print.data.frame(as.data.frame(tabulateResults(object, max.p=0.20)))
  }
  cat("\n")
})

#' Assembles a matrix of nominal or adjusted pvalues from a multiGSEA result
#'
#' You might want a matrix of pvalues (or FDRs) for the gene sets across all
#' GSEA methods you tried. I think I did, once, so here it is.
#'
#' @export
#' @param x A [MultiGSEAResult()] object.
#' @param names the entries from `resultNames(x)` that you want to include
#'   in the matrix. By default we take all of them.
#' @param pval Are we testing pvalues or adjusted pvalues?
#' @return A matrix of the desired pvalues for all genesets
#'
#' @examples
#' # vm <- exampleExpressionSet(do.voom=TRUE)
#' # gdb <- exampleGeneSetDb()
#' # mg <- multiGSEA(gdb, vm, vm$design, 'tumor', methods=c('cameraPR'))
#' mg <- exampleMultiGSEAResult()
#' pm <- p.matrix(mg)
p.matrix <- function(x, names=resultNames(x),
                     pcol=c('padj', 'padj.by.collection', 'pval')) {
  stopifnot(is(x, 'MultiGSEAResult'))
  invalidMethods(x, names)
  pcol <- match.arg(pcol)
  res <- results(x, names, add.suffix=TRUE, as.dt=TRUE)
  regex <- sprintf('^%s\\.', pcol)
  col.idx <- grep(regex, names(res))
  if (pcol != 'padj.by.collection') {
    col.idx <- setdiff(col.idx, grep('padj.by.collection', names(res)))
  }
  out <- as.matrix(res[, col.idx, with=FALSE])
  colnames(out) <- sub(regex, '', colnames(out))
  out
}
lianos/multiGSEA documentation built on Nov. 17, 2020, 1:26 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
lianos/multiGSEA
A unified interface to a plethora of gene set enrichment analysis methods

R/MultiGSEAResult-methods.R
In lianos/multiGSEA: A unified interface to a plethora of gene set enrichment analysis methods

Defines functions p.matrix tabulateResults results result.MultiGSEAResult result resultNames invalidMethods logFC geneSetsStats geneSetDb downgradeObject

Documented in geneSetDb geneSetsStats logFC p.matrix result result.MultiGSEAResult resultNames results tabulateResults

R Package Documentation

Browse R Packages

We want your feedback!

lianos/multiGSEA A unified interface to a plethora of gene set enrichment analysis methods

R/MultiGSEAResult-methods.R In lianos/multiGSEA: A unified interface to a plethora of gene set enrichment analysis methods

Defines functions p.matrix tabulateResults results result.MultiGSEAResult result resultNames invalidMethods logFC geneSetsStats geneSetDb downgradeObject

Documented in geneSetDb geneSetsStats logFC p.matrix result result.MultiGSEAResult resultNames results tabulateResults

R Package Documentation

Browse R Packages

We want your feedback!

lianos/multiGSEA
A unified interface to a plethora of gene set enrichment analysis methods

R/MultiGSEAResult-methods.R
In lianos/multiGSEA: A unified interface to a plethora of gene set enrichment analysis methods