Nothing
######################################################################
#
# author: Ludwig Geistlinger
# date: 25 Feb 2011
#
#
# THE ENRICHMENT BROWSER
#
# update: 05 June 2014, eBrowser2.0 all-in-one wrapper function
#
######################################################################
# CONSTANTS:
# TODO resolve:
# options:
# 1) par-like style --> current solution
#
# 2) methods/functions for getting/setting, e.g. group(obj), adjp(obj), ...
#
# 3) convert them all to function parameters
.ebrowser_config_cache <- new.env(parent=emptyenv())
#' Configuring the EnrichmentBrowser
#'
#' Function to get and set configuration parameters determining the default
#' behavior of the EnrichmentBrowser
#'
#' Important colData, rowData, and result column names: \itemize{ \item
#' SMPL.COL: colData column storing the sample IDs (default: "SAMPLE") \item
#' GRP.COL: colData column storing binary group assignment (default: "GROUP")
#' \item BLK.COL: colData column defining paired samples or sample blocks
#' (default: "BLOCK")
#'
#' \item PRB.COL: rowData column storing probe/feature IDs ("PROBEID",
#' read-only) \item EZ.COL: rowData column storing gene ENTREZ IDs ("ENTREZID",
#' read-only) \item SYM.COL: rowData column storing gene symbols ("SYMBOL",
#' read-only) \item GN.COL: rowData column storing gene names ("GENENAME",
#' read-only)
#'
#' \item FC.COL: rowData column storing (log2) fold changes of differential
#' expression between sample groups (default: "FC") \item ADJP.COL: rowData
#' column storing adjusted (corrected for multiple testing) p-values of
#' differential expression between sample groups (default: "ADJ.PVAL")
#'
#' \item GS.COL: result table column storing gene set IDs (default: "GENE.SET")
#' \item PVAL.COL: result table column storing gene set significance (default:
#' "PVAL") \item PMID.COL: gene table column storing PUBMED IDs ("PUBMED",
#' read-only) }
#'
#' Important URLs (all read-only): \itemize{ \item NCBI.URL:
#' \url{http://www.ncbi.nlm.nih.gov/} \item PUBMED.URL:
#' \url{http://www.ncbi.nlm.nih.gov/pubmed/} \item GENE.URL:
#' \url{http://www.ncbi.nlm.nih.gov/gene/} \item KEGG.URL:
#' \url{http://www.genome.jp/dbget-bin/} \item KEGG.GENE.URL:
#' \url{http://www.genome.jp/dbget-bin/www_bget?} \item KEGG.SHOW.URL:
#' \url{http://www.genome.jp/dbget-bin/show_pathway?} \item GO.SHOW.URL:
#' \url{http://amigo.geneontology.org/amigo/term/} }
#'
#' Default output directory: \itemize{ \item EBROWSER.HOME:
#' \code{tools::R_user_dir("EnrichmentBrowser")} \item OUTDIR.DEFAULT:
#' \code{file.path(EBROWSER.HOME, "results")} }
#'
#' Gene set size: \itemize{ \item GS.MIN.SIZE: minimum number of genes per gene
#' set (default: 5) \item GS.MAX.SIZE: maximum number of genes per gene set
#' (default: 500) }
#'
#' Result appearance: \itemize{ \item RESULT.TITLE: (default: "Table of
#' Results") \item NR.SHOW: maximum number of entries to show (default: 20) }
#'
#' @aliases config.ebrowser
#' @param key Configuration parameter.
#' @param value Value to overwrite the current value of key.
#' @return If is.null(value) this returns the value of the selected
#' configuration parameter. Otherwise, it updates the selected parameter with
#' the given value.
#' @author Ludwig Geistlinger <Ludwig.Geistlinger@@sph.cuny.edu>
#' @examples
#'
#' # getting config information
#' configEBrowser("GS.MIN.SIZE")
#'
#' # setting config information
#' # WARNING: this is for advanced users only!
#' # inappropriate settings will impair EnrichmentBrowser's functionality
#' configEBrowser(key="GS.MIN.SIZE", value=3)
#'
#' # restoring default config settings
#' configEBrowser()
#'
#' @export configEBrowser
configEBrowser <- function(key, value=NULL)
{
.key_readonly <- c(
"PRB.COL", "EZ.COL", "PMID.COL",
"NCBI.URL", "PUBMED.URL", "GENE.URL", "KEGG.URL", "KEGG.GENE.URL",
"KEGG.SHOW.URL", "GO.SHOW.URL", "SBEA.PKGS", "NBEA.PKGS")
if(missing(key)) .setupConfig()
else if(is.null(value)) .ebrowser_config_cache[[key]]
else if(!(key %in% .key_readonly)) .ebrowser_config_cache[[key]] <- value
}
##
# check the output directory
##
.checkOutDir <- function(out.dir)
{
if(!file.exists(dirname(out.dir)))
stop(paste0("Not a valid output directory path \'",out.dir,"\'"))
if(!file.exists(out.dir)){
message(paste("Output directory", out.dir,
"does not exist.\nThus, the directory is going to be created."))
dir.create(out.dir)
}
}
##
##
# eBrowser functionality
##
#' Seamless navigation through enrichment analysis results
#'
#' This is the all-in-one wrapper function to perform the standard enrichment
#' analysis pipeline implemented in the EnrichmentBrowser package.
#'
#' Given flat gene expression data, the data is read in and subsequently
#' subjected to chosen enrichment analysis methods.
#'
#' The results from different methods can be combined and investigated in
#' detail in the default browser.
#'
#' *On data type and normalization:*
#'
#' Normalization of high-throughput expression data is essential to make
#' results within and between experiments comparable. Microarray (intensity
#' measurements) and RNA-seq (read counts) data exhibit typically distinct
#' features that need to be normalized for. This function wraps commonly used
#' functionality from limma for microarray normalization and from EDASeq for
#' RNA-seq normalization. For specific needs that deviate
#' from standard normalizations, the user should always refer to more
#' specific functions/packages. See also the limma's user guide
#' \url{http://www.bioconductor.org/packages/limma} for definition and
#' normalization of the different expression data types.
#'
#' Microarray data is expected to be single-channel. For two-color arrays, it
#' is expected here that normalization within arrays has been already carried
#' out, e.g. using \code{\link{normalizeWithinArrays}} from limma.
#'
#' RNA-seq data is expected to be raw read counts. Please note that
#' normalization for downstream DE analysis, e.g. with edgeR and DESeq2, is not
#' ultimately necessary (and in some cases even discouraged) as many of these
#' tools implement specific normalization approaches. See the vignette of
#' EDASeq, edgeR, and DESeq2 for details.
#'
#'
#' @param meth Enrichment analysis method(s). See \code{\link{sbeaMethods}} and
#' \code{\link{nbeaMethods}} for currently supported enrichment analysis
#' methods. See also \code{\link{sbea}} and \code{\link{nbea}} for details.
#' @param exprs Expression matrix. A tab separated text file containing
#' the expression values (microarray: intensity measurements, RNA-seq: read counts).
#' Columns = samples/subjects; rows = features/probes/genes; NO headers, row or
#' column names. Alternatively, this can be a
#' \code{\linkS4class{SummarizedExperiment}}, assuming the expression matrix in
#' the \code{\link{assays}} slot. See details.
#' @param cdat Column (phenotype) data. A tab separated text file containing annotation
#' information for the samples in either *two or three* columns. NO headers,
#' row or column names. The number of rows/samples in this file should match
#' the number of columns/samples of the expression matrix. The 1st column is
#' reserved for the sample IDs; The 2nd column is reserved for a *BINARY* group
#' assignment. Use '0' and '1' for unaffected (controls) and affected (cases)
#' sample class, respectively. For paired samples or sample blocks a third
#' column is expected that defines the blocks. If 'exprs' is a
#' \code{\linkS4class{SummarizedExperiment}}, the 'cdat' argument can be left
#' unspecified, which then expects group and optional block assignments in
#' respectively named columns 'GROUP' (mandatory) and 'BLOCK' (optional) in the
#' \code{\link{colData}} slot.
#' @param rdat Row (feature) data. A tab separated text file containing annotation
#' information for the features. In case of probe level data:
#' exactly *TWO* columns; 1st col = probe/feature IDs; 2nd col = corresponding
#' gene ID for each feature ID in 1st col. In case of gene level data: the
#' gene IDs newline-separated (i.e. just *one* column). It is recommended to
#' use *ENTREZ* gene IDs (to benefit from downstream visualization and
#' exploration functionality of the EnrichmentBrowser). NO headers, row or
#' column names. The number of rows (features/probes/genes) in this file
#' should match the number of rows/features of the expression matrix.
#' Alternatively, this can also be the ID of a recognized platform such as
#' 'hgu95av2' (Affymetrix Human Genome U95 chip) or 'ecoli2' (Affymetrix E.
#' coli Genome 2.0 Array).
#' If 'exprs' is a \code{\linkS4class{SummarizedExperiment}}, the 'rdat' argument
#' can be left unspecified, which then expects probe and corresponding Entrez
#' Gene IDs in respectively named columns 'PROBEID' and 'ENTREZID' in the
#' \code{\link{rowData}} slot.
#' @param org Organism under investigation in KEGG three letter code, e.g.
#' \sQuote{hsa} for \sQuote{Homo sapiens}. See also
#' \code{\link{kegg.species.code}} to convert your organism of choice to KEGG
#' three letter code.
#' @param data.type Expression data type. Use 'ma' for microarray and 'rseq'
#' for RNA-seq data. If NA, data.type is automatically guessed. If the
#' expression values in 'exprs' are decimal numbers they are assumed to be
#' microarray intensities. Whole numbers are assumed to be RNA-seq read
#' counts. Defaults to NA.
#' @param norm.method Determines whether and how the expression data should be
#' normalized. For available microarray normalization methods see the man page
#' of the limma function \code{\link{normalizeBetweenArrays}}. For available
#' RNA-seq normalization methods see the man page of the EDASeq function
#' \code{betweenLaneNormalization}. Defaults to 'quantile', i.e.
#' normalization is carried out so that quantiles between arrays/lanes/samples
#' are equal. Use 'none' to indicate that the data is already normalized and
#' should not be normalized by ebrowser. See the man page of
#' \code{\link{normalize}} for details.
#' @param de.method Determines which method is used for per-gene differential
#' expression analysis. See the man page of \code{\link{deAna}} for details.
#' Defaults to 'limma', i.e. differential expression is calculated based on the
#' typical limma \code{\link{lmFit}} procedure. This can also be 'none' to
#' indicate that DE analysis has already been carried out and should not be
#' overwritten by \code{ebrowser} (applies only when \code{exprs} is given as a
#' \code{\linkS4class{SummarizedExperiment}}).
#' @param gs Gene sets. Either a list of gene sets (character vectors of gene
#' IDs) or a text file in GMT format storing all gene sets under investigation.
#' @param grn Gene regulatory network. Either an absolute file path to a
#' tabular file or a character matrix with exactly *THREE* cols; 1st col = IDs
#' of regulating genes; 2nd col = corresponding regulated genes; 3rd col =
#' regulation effect; Use '+' and '-' for activation/inhibition.
#' @param perm Number of permutations of the sample group assignments.
#' Defaults to 1000. Can also be an integer vector matching
#' the length of 'meth' to assign different numbers of permutations for
#' different methods.
#' @param alpha Statistical significance level. Defaults to 0.05.
#' @param beta Log2 fold change significance level. Defaults to 1 (2-fold).
#' @param comb Logical. Should results be combined if more then one enrichment
#' method is selected? Defaults to FALSE.
#' @param browse Logical. Should results be displayed in the browser for
#' interactive exploration? Defaults to TRUE.
#' @param nr.show Number of gene sets to show. As default all statistical
#' significant gene sets are displayed. Note that this only influences the
#' number of gene sets for which additional visualization will be provided
#' (typically only of interest for the top / signifcant gene sets). Selected
#' enrichment methods and resulting flat gene set rankings still include the
#' complete number of gene sets under study.
#' @param out.dir Output directory. If \code{NULL}, defaults to a
#' timestamp-generated subdirectory of \code{configEBrowser("OUTDIR.DEFAULT")}.
#' @param report.name Character. Name of the HTML report. Defaults to \code{"index"}.
#' @param ... Additional arguments passed on to the individual building blocks.
#' @return None, writes an HTML report and, if selected, opens the browser to
#' explore results.
#
#' If not instructed otherwise (via argument \code{out.dir}),
#' the main HTML report and associated files are written to
#' \code{configEBrowser("OUTDIR.DEFAULT")}.
#' See \code{?configEBrowser} to change the location.
#' If \code{browse=TRUE}, the HTML report will automatically be opened in
#' the default browser.
#' @author Ludwig Geistlinger <Ludwig.Geistlinger@@sph.cuny.edu>
#' @seealso \code{\link{readSE}} to read expression data from file;
#' \code{\link{probe2gene}} to transform probe to gene level expression;
#' \code{\link{kegg.species.code}} maps species name to KEGG code.
#' \code{\link{getGenesets}} to retrieve gene set databases such as GO or KEGG;
#' \code{\link{compileGRN}} to construct a GRN from pathway databases;
#' \code{\link{sbea}} to perform set-based enrichment analysis;
#' \code{\link{nbea}} to perform network-based enrichment analysis;
#' \code{\link{combResults}} to combine results from different methods;
#' \code{\link{eaBrowse}} for exploration of resulting gene sets
#' @references Limma User's guide:
#' \url{http://www.bioconductor.org/packages/limma}
#' @examples
#'
#' # expression data from file
#' exprs.file <- system.file("extdata/exprs.tab", package="EnrichmentBrowser")
#' cdat.file <- system.file("extdata/colData.tab", package="EnrichmentBrowser")
#' rdat.file <- system.file("extdata/rowData.tab", package="EnrichmentBrowser")
#'
#' # getting all human KEGG gene sets
#' # hsa.gs <- getGenesets(org="hsa", db="kegg")
#' gs.file <- system.file("extdata/hsa_kegg_gs.gmt", package="EnrichmentBrowser")
#' hsa.gs <- getGenesets(gs.file)
#'
#' # output destination
#' out.dir <- configEBrowser("OUTDIR.DEFAULT")
#'
#' # set-based enrichment analysis
#' ebrowser( meth="ora", perm=0,
#' exprs=exprs.file, cdat=cdat.file, rdat=rdat.file,
#' gs=hsa.gs, org="hsa", nr.show=3,
#' out.dir=out.dir, report.name="oraReport")
#'
#' # compile a gene regulatory network from KEGG pathways
#' hsa.grn <- compileGRN(org="hsa", db="kegg")
#'
#' # network-based enrichment analysis
#' ebrowser( meth="ggea",
#' exprs=exprs.file, cdat=cdat.file, rdat=rdat.file,
#' gs=hsa.gs, grn=hsa.grn, org="hsa", nr.show=3,
#' out.dir=out.dir, report.name="ggeaReport")
#'
#' # combining results
#' ebrowser( meth=c("ora", "ggea"), perm=0, comb=TRUE,
#' exprs=exprs.file, cdat=cdat.file, rdat=rdat.file,
#' gs=hsa.gs, grn=hsa.grn, org="hsa", nr.show=3,
#' out.dir=out.dir, report.name="combReport")
#'
#' @export ebrowser
ebrowser <- function(
meth, exprs, cdat, rdat, org, data.type=c(NA, "ma", "rseq"),
norm.method="quantile", de.method="limma",
gs, grn=NULL, perm=1000, alpha=0.05, beta=1,
comb=FALSE, browse=TRUE, nr.show=-1, out.dir=NULL, report.name="index", ...)
{
isAvailable("ReportingTools", type="software")
GRP.COL <- configEBrowser("GRP.COL")
FC.COL <- configEBrowser("FC.COL")
ADJP.COL <- configEBrowser("ADJP.COL")
EZ.COL <- configEBrowser("EZ.COL")
METHODS <- c(sbeaMethods(), nbeaMethods())
is.method <- (meth %in% METHODS) | is.function(meth)
if(!all(is.method)) stop("No such method: ", meth[!is.method])
if(any(nbeaMethods() %in% meth))
if(is.null(grn))
stop("\'grn\' must be not null")
if(is.null(out.dir))
{
out.dir <- configEBrowser("OUTDIR.DEFAULT")
stamp <- format(Sys.time(), "%a_%b%d_%Y_%H%M%S")
out.dir <- file.path(out.dir, stamp)
}
else out.dir <- path.expand(out.dir)
if(!file.exists(out.dir)) dir.create(out.dir, recursive=TRUE)
# execution
# read expression data
data.type <- match.arg(data.type)
if(is.character(exprs))
{
message("Read expression data ...")
se <- readSE( assay.file=exprs,
cdat.file=cdat, rdat.file=rdat, data.type=data.type )
}
else
{
if(is(exprs, "ExpressionSet")) exprs <- as(exprs, "SummarizedExperiment")
se <- exprs
if(is.na(data.type))
data.type <- .detectDataType(assay(se))
metadata(se)$dataType <- data.type
}
# normalize?
if(norm.method != 'none')
{
message("Normalize ...")
se <- normalize(se, norm.method=norm.method)
}
# probe 2 gene if data.type = ma
# ... and it's not already a gene level se
if(metadata(se)$dataType == "ma")
{
has.pcol <- configEBrowser("PRB.COL") %in% colnames(rowData(se))
anno <- metadata(se)$annotation
has.anno <- ifelse(length(anno), nchar(anno) > 3, FALSE)
if(has.pcol || has.anno)
{
message("Transform probe expression to gene expression ...")
geneSE <- probe2gene(se)
}
else geneSE <- se
}
else geneSE <- se
if(de.method != "none")
{
message("DE analysis ...")
geneSE <- deAna(geneSE, de.method=de.method)
}
if(missing(org)) org <- metadata(geneSE)$annotation
else metadata(geneSE)$annotation <- org
nr.meth <- length(meth)
if(length(perm) != nr.meth) perm <- rep(perm[1], nr.meth)
if(comb) res.list <- vector("list", length=nr.meth)
for(i in seq_len(nr.meth))
{
m <- meth[i]
message(paste("Execute", toupper(m), "..."))
out.file <- file.path(out.dir, paste0(m, ".txt"))
if(m %in% nbeaMethods())
res <- nbea( method=m, se=geneSE, gs=gs,
grn=grn, alpha=alpha, beta=beta, perm=perm[i], ... )
else res <- sbea( method=m, se=geneSE,
gs=gs, alpha=alpha, perm=perm[i], ... )
write.table(res$res.tbl, file=out.file,
quote=FALSE, row.names=FALSE, sep="\t")
# produce html reports, if desired
if(browse) eaBrowse(res, nr.show,
graph.view=grn, html.only=TRUE, out.dir=out.dir)
# link gene statistics
sam.file <- file.path(out.dir, "samt.RData")
if(m == "samgs" && file.exists(sam.file))
{
samt <- round(get(load(sam.file)), digits=2)
rowData(geneSE)[names(s2n), "SAM.T"] <- unname(samt)
}
s2n.file <- file.path(out.dir, "gsea_s2n.RData")
if(m == "gsea" && file.exists(s2n.file))
{
s2n <- round(get(load(s2n.file)), digits=2)
rowData(geneSE)[names(s2n),"GSEA.S2N"] <- unname(s2n)
}
if(comb) res.list[[i]] <- res
}
# write genewise differential expression
gt.file <- file.path(out.dir, "de.txt")
message("Annotating genes ...")
gt <- .getGeneAnno(names(geneSE), org)
gt <- cbind(gt, rowData(geneSE))
gt <- .sortGeneTable(gt)
ind <- gt[,configEBrowser("EZ.COL")]
geneSE <- geneSE[ind, ]
rowData(geneSE) <- gt
message(paste("Genewise differential expression written to", gt.file))
write.table(gt, file=gt.file, row.names=FALSE, quote=FALSE, sep="\t")
if(comb)
{
# combine results (average ranks, ztrans p-vals)
message("Combine results ...")
if(nr.meth == 1)
message(paste("Only one method given, \'comb\' ignored."))
else
{
# combine results in a specified out file
out.file <- file.path(out.dir, "comb.txt")
res <- combResults(res.list=res.list)
write.table(res$res.tbl,
file=out.file, quote=FALSE, row.names=FALSE, sep="\t")
# produce html report, if desired
if(browse) eaBrowse(res, nr.show,
graph.view=grn, html.only=TRUE, out.dir=out.dir)
}
}
message(paste("Your output files are in", out.dir))
# create INDEX.html
if(browse)
{
# plot DE
if(length(geneSE) > 1000)
{
message("Restricting global view to the 1000 most significant genes")
geneSE <- geneSE[1:1000,]
}
vs <- .viewSet(geneSE, out.prefix=file.path(out.dir, "global"))
message("Produce html report ...")
.createIndex(meth, comb, out.dir, report.name)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.