#' @title Perform Gene Set Enrichment Analysis
#'
#' @description A simple interface for performing gene set enrichment
#' analysis on multiple collections of gene-level statistics.
#'
#' @param gene_sets Gene set data encoded as an n x m binary matrix,
#' where n is the number of gene sets and m is the number of genes:
#' \code{gene_sets[i,j] = 1} if and only if gene i is included in gene
#' set j, and otherwise \code{gene_sets[i,j] = 0}. The rows and
#' columns should be named. Only the rows of \code{gene_sets} matching
#' the rows of \code{Z} will be used in the enrichment analysis.
#'
#' @param Z Matrix of gene-level statistics such as z-scores, with
#' rows corresponding to genes. An enrichment analysis is performed
#' for each column of \code{Z}. The rows and columns should be named;
#' rows of \code{Z} matching the columns of \code{gene_sets} will
#' be used in the enrichment analysis.
#'
#' @param verbose When \code{verbose = TRUE}, information about the
#' progress of the gene set enrichment analysis is printed to the
#' console.
#'
#' @param eps The lower bound for calculating p-values; smaller values
#' of \code{eps} may give more accurate p-values at the possible cost
#' of slightly longer computation. Passed as the \code{eps} input to
#' \code{\link[fgsea]{fgsea}}.
#'
#' @param nproc Number of workers used to run gene set enrichment
#' analysis. Passed as the \code{nproc} input to
#' \code{\link[fgsea]{fgsea}}.
#'
#' @param \dots Additional arguments passed to
#' \code{\link[fgsea]{fgsea}}.
#'
#' @return The return value is a list containing four n x k matrices
#' of gene set enrichment analysis results, where n is the number of
#' gene sets and k is the number of columns in \code{Z}. The matrices
#' give the p-values (pval), enrichment scores (ES), normalized
#' enrichment scores (NES), and expected errors (log2err). See
#' \code{\link[fgsea]{fgseaMultilevel}} for more information about
#' these outputs.
#'
#' @examples
#' # Load the human gene sets and results of a differential
#' # expression analysis.
#' data(gene_sets_human)
#' data(pbmc_facs_z)
#'
#' # Perform the gene set enrichment analysis using fgsea. This step
#' # may take a little while.
#' gsea_res <- perform_gsea(gene_sets_human$gene_sets,pbmc_facs_z)
#'
#' # Create an interactive plot for exploring the results of the gene
#' # set enrichment analysis.
#' gsea_plotly(gsea_res,gene_sets_human$gene_set_info,"B",
#' file = "gsea_b_cells.html",title = "B cells")
#'
#' @import Matrix
#' @importFrom fgsea fgsea
#'
#' @export
#'
perform_gsea <- function (gene_sets, Z, verbose = TRUE, eps = 1e-32,
nproc = 0, ...) {
# Check the "gene_sets" and "Z" inputs.
if (!(is.matrix(gene_sets) | inherits(gene_sets,"Matrix")) & is.matrix(Z) &
!is.null(rownames(gene_sets)) & !is.null(colnames(gene_sets)) &
!is.null(rownames(Z)) & !is.null(colnames(Z)))
stop("Input arguments \"gene_sets\" and \"Z\" should be matrices with ",
"named rows and columns")
# Get the number of gene sets (n) and the number of columns in Z (k).
n <- ncol(gene_sets)
k <- ncol(Z)
# Align the gene and gene set data.
out <- align_by_rownames(gene_sets,Z)
gene_sets <- out$A
Z <- out$Z
if (nrow(Z) == 0)
stop("Failed to align gene_sets and Z by row names.")
if (verbose)
cat("Using statistics from",nrow(Z),"genes for gene set enrichment",
"analysis.\n")
# Convert the gene sets binary adjacency matrix into the format
# accepted by fgsea.
if (verbose)
cat("Converting gene set data for fgsea.\n")
pathways <- matrix2pathways(gene_sets)
# Initialize the outputs.
out <- list(pval = matrix(0,n,k),
log2err = matrix(0,n,k),
ES = matrix(0,n,k),
NES = matrix(0,n,k))
rownames(out$pval) <- colnames(gene_sets)
rownames(out$log2err) <- colnames(gene_sets)
rownames(out$ES) <- colnames(gene_sets)
rownames(out$NES) <- colnames(gene_sets)
colnames(out$pval) <- colnames(Z)
colnames(out$log2err) <- colnames(Z)
colnames(out$ES) <- colnames(Z)
colnames(out$NES) <- colnames(Z)
# Run the gene set enrichment analysis for each column of Z.
if (verbose) {
cat("Computing enrichment statistics for",n,"gene sets and\n")
cat(k,"collections of gene-level statistics:\n")
}
for (i in 1:k) {
if (verbose) {
if (i > 1)
cat(", ")
cat(colnames(Z)[i])
}
# Perform gene set enrichment analysis using fgsea.
ans <- suppressWarnings(fgsea(pathways,Z[,i],eps = eps,nproc = nproc,...))
class(ans) <- "data.frame"
# Refactor the outputs as a data frame with one row per gene set,
# and the following columns: pval, log2err, ES and NES. The
# outputs are set to mising (NA) whenever the expected error
# (log2err) is also NA.
rownames(ans) <- ans$pathway
ans <- ans[c("pval","log2err","ES","NES")]
ans <- ans[colnames(gene_sets),]
ans[is.na(ans$log2err),] <- NA
# Store the fgsea results.
out$pval[,i] <- ans$pval
out$log2err[,i] <- ans$log2err
out$ES[,i] <- ans$ES
out$NES[,i] <- ans$NES
}
if (verbose)
cat("\n")
# Output the results of the gene set enrichment analyses.
return(out)
}
# This function aligns the gene set data (A) with the gene statistics
# (Z) by their row names to prepare these data for a gene set
# enrichment analysis.
align_by_rownames <- function (A, Z) {
x <- rownames(A)
y <- rownames(Z)
ids <- intersect(x,y)
i <- match(ids,x)
j <- match(ids,y)
A <- A[i,]
Z <- Z[j,]
return(list(A = A,Z = Z))
}
# Recover a list of gene sets from an n x m adjacency matrix, A, in
# which n is the number of genes and m is the number of gene sets;
# A[i,j] = 1 if and only if gene i is included in gene set j. This
# function is used to prepare the "pathways" fgsea input from a
# collection of gene sets encoded as a (sparse) matrix. For this
# function to work, the rows and columns of A must be named.
matrix2pathways <- function (A) {
n <- ncol(A)
out <- vector("list",n)
names(out) <- colnames(A)
genes <- rownames(A)
for (i in 1:n)
out[[i]] <- genes[A[,i] == 1]
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.