R/getGenesets.R

Defines functions .detectGSType .createGS .gsList2Collect .prepGS .parseGMT .extractTitle .makeGSNames .makeGSC .getGenesByPwy .extractPwys .extractKeggGS .dwnldSelectedKeggGS .makeKeggGSC .dwnldAllKeggGS .keggCollections .keggSpecies .getKEGG .goCollections .goSpecies .getGOFromBiomart .getGOFromGODB .getGO .msigdbCollections .msigdbSpecies .getMSigDb .enrichrSpecies .enrichrLibs .org2enrichr .getEnrichrLib .getEnrichr writeGMT showAvailableCollections showAvailableSpecies getGenesets

Documented in getGenesets showAvailableCollections showAvailableSpecies writeGMT

##############################################################################
# 
# Author: ludwig geistlinger
# Date: 16 June 2010
#
# Getting gene sets for enrichment analysis
#
# Update, 05 May 2014: KEGG gene set getter for organism of choice 
# Update, 10 March 2015: including getter for GO genesets
# Update, 24 April 2018: unified getGenesets fassade for gene sets from
#                        different DBs
# Update, Dec 2019: including getter for MSigDB and Enrichr       
###############################################################################

#' @name getGenesets
#'
#' @title Definition of gene sets according to different sources
#' 
#' @description Functionality for retrieving gene sets for an organism under
#' investigation from databases such as GO and KEGG. Parsing and writing a list
#' of gene sets from/to a flat text file in GMT format is also supported.
#'
#' The GMT (Gene Matrix Transposed) file format is a tab delimited file format
#' that describes gene sets.  In the GMT format, each row represents a gene
#' set.  Each gene set is described by a name, a description, and the genes in
#' the gene set. See references.
#'
#' @aliases get.go.genesets get.kegg.genesets parse.genesets.from.GMT
#' @param org An organism in (KEGG) three letter code, e.g. \sQuote{hsa} for
#' \sQuote{Homo sapiens}. Alternatively, this can also be a text file storing 
#' gene sets in GMT format. See details.
#' @param db Database from which gene sets should be retrieved. Currently, 
#' either 'go' (default), 'kegg', 'msigdb', or 'enrichr'. 
#' @param gene.id.type Character. Gene ID type of the returned gene sets.
#' Defaults to \code{"ENTREZID"}. See \code{\link{idTypes}} for available
#' gene ID types.
#' @param cache Logical.  Should a locally cached version used if available?
#' Defaults to \code{TRUE}.
#' @param return.type Character. Determines whether gene sets are returned
#' as a simple list of gene sets (each being a character vector of gene IDs), or
#' as an object of class \code{\linkS4class{GeneSetCollection}}.
#' @param ... Additional arguments for individual gene set databases. 
#' For \code{db = "GO"}:
#' \itemize{
#' \item onto: Character. Specifies one of the three GO ontologies: 'BP'
#' (biological process), 'MF' (molecular function), 'CC' (cellular component).
#' Defaults to 'BP'.
#' \item evid: Character. Specifies one or more GO evidence code(s) such as 
#' IEP (inferred from expression pattern) or TAS (traceable author statement).
#' Defaults to \code{NULL} which includes all annotations, i.e. does not filter by
#' evidence codes. See references for a list of available evidence codes. 
#' \item hierarchical: Logical. Incorporate hierarchical relationships between
#' GO terms ('is_a' and 'has_a') when collecting genes annotated to a GO term?
#' If set to \code{TRUE}, this will return all genes annotated to a GO term
#' *or to one of its child terms* in the GO ontology.
#' Defaults to \code{FALSE}, which will then only collect genes directly
#' annotated to a GO term. 
#' \item mode: Character. Determines in which way the gene 
#' sets are retrieved. This can be either 'GO.db' or 'biomart'. 
#' The 'GO.db' mode creates the gene sets based on BioC annotation packages - 
#' which is fast, but represents not
#' necessarily the most up-to-date mapping. In addition, this option is only
#' available for the currently supported model organisms in BioC.  The
#' 'biomart' mode downloads the mapping from BioMart - which can be time
#' consuming, but allows to select from a larger range of organisms and
#' contains the latest mappings.  Defaults to 'GO.db'.}
#' For \code{db = "msigdb":} \itemize{ \item cat: Character. 
#' MSigDB collection category: 'H' (hallmark), 
#' 'C1' (genomic position), 'C2' (curated databases), 'C3' (binding site motifs),
#' 'C4' (computational cancer), 'C5' (Gene Ontology), 'C6' (oncogenic), 
#' 'C7' (immunologic), 'C8' (cell type). See references. 
#' \item subcat: Character. MSigDB collection subcategory. Depends on the
#' chosen MSigDB collection category. For example, 'MIR' to obtain microRNA targets
#' from the 'C3' collection. See references.}
#' For \code{db = "enrichr"}: \itemize{ \item lib: Character. Enrichr gene set 
#' library. For example, 'Genes_Associated_with_NIH_Grants' to obtain gene sets 
#' based on associations with NIH grants. See references.}
#' @param gs A list of gene sets (character vectors of gene IDs).
#' @param gmt.file Gene set file in GMT format. See details.
#' @return For \code{getGenesets}: a list of gene sets (vectors of gene IDs).
#' For \code{writeGMT}: none, writes to file.
#'
#' For \code{showAvailableSpecies} and \code{showAvailableCollections}: 
#' a \code{\linkS4class{DataFrame}}, displaying supported species and
#' available gene set collections for a gene set database of choice.
#' @author Ludwig Geistlinger
#' @seealso the \code{GO.db} package for GO2gene mapping used in
#' 'GO.db' mode, and the biomaRt package for general queries to BioMart. 
#' 
#' \code{\link{keggList}} and \code{\link{keggLink}} for accessing the KEGG REST
#' server.
#'
#' \code{msigdbr::msigdbr} for obtaining gene sets from the MSigDB.
#' @references GO: \url{http://geneontology.org/}
#' 
#' GO evidence codes: \url{http://geneontology.org/docs/guide-go-evidence-codes/}
#'
#' KEGG Organism code: \url{http://www.genome.jp/kegg/catalog/org_list.html}
#'
#' MSigDB: \url{http://software.broadinstitute.org/gsea/msigdb/collections.jsp}
#'
#' Enrichr: \url{https://maayanlab.cloud/Enrichr/#stats}
#'
#' GMT file format:
#' \url{http://www.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats}
#' @examples
#' 
#'     # (1) Typical usage for gene set enrichment analysis with GO:
#'     # Biological process terms based on BioC annotation (for human)
#'     go.gs <- getGenesets(org = "hsa", db = "go")
#'     
#'     # eq.:  
#'     # go.gs <- getGenesets(org = "hsa", db = "go", onto = "BP", mode = "GO.db")
#'     \donttest{
#'     # Alternatively:
#'     # downloading from BioMart 
#'     # this may take a few minutes ...
#'     go.gs <- getGenesets(org = "hsa", db = "go", mode = "biomart")
#'
#'     # list supported species for obtaining gene sets from GO 
#'     showAvailableSpecies(db = "go")
#'     }
#'     # (2) Defining gene sets according to KEGG  
#'     kegg.gs <- getGenesets(org = "hsa", db = "kegg")
#'     \donttest{
#'     # list supported species for obtaining gene sets from KEGG 
#'     showAvailableSpecies(db = "kegg")
#'
#'     # (3) Obtaining *H*allmark gene sets from MSigDB
#'     hall.gs <- getGenesets(org = "hsa", db = "msigdb", cat = "H")
#'
#'     # list supported species for obtaining gene sets from MSigDB
#'     showAvailableSpecies(db = "msigdb")
#'
#'     # list available gene set collections in the MSigDB
#'     showAvailableCollections(db = "msigdb") 
#'
#'     # (4) Obtaining gene sets from Enrichr
#'     tfppi.gs <- getGenesets(org = "hsa", db = "enrichr", 
#'                             lib = "Transcription_Factor_PPIs")
#'
#'     # list supported species for obtaining gene sets from Enrichr
#'     showAvailableSpecies(db = "enrichr")
#'
#'     # list available Enrichr gene set libraries
#'     showAvailableCollections(org = "hsa", db = "enrichr")        
#'     }
#'     # (6) parsing gene sets from GMT
#'     gmt.file <- system.file("extdata/hsa_kegg_gs.gmt",
#'                             package = "EnrichmentBrowser")
#'     gs <- getGenesets(gmt.file)     
#'     
#'     # (7) writing gene sets to file
#'     writeGMT(gs, gmt.file)
#' 
NULL

#' @export
#' @rdname getGenesets
getGenesets <- function(org, 
    db = c("go", "kegg", "msigdb", "enrichr"), 
    gene.id.type = "ENTREZID",
    cache = TRUE,
    return.type = c("list", "GeneSetCollection"),
    ...)
{
    stopifnot(length(org) == 1 && is.character(org))
    return.type <- match.arg(return.type)
    if(file.exists(org)) gs <- .parseGMT(org, return.type)
    else
    {
        if(!grepl("^[a-z]{3,4}$", org))
            stop(paste("\'org\' must be an organism in KEGG three letter code",
                        "or a file in GMT format"))

        db <- match.arg(db)
        if(db == "go") gs <- .getGO(org, gene.id.type, cache, return.type, ...)
        else if(db == "kegg") 
            gs <- .getKEGG(org, gene.id.type, cache, return.type)
        else if(db == "msigdb") 
            gs <- .getMSigDb(org, gene.id.type, cache, return.type, ...)
        else gs <- .getEnrichr(org, gene.id.type, cache, return.type, ...)
    }
    return(gs)
}

#' @export
#' @rdname getGenesets
showAvailableSpecies <- function(db = c("go", "kegg", "msigdb", "enrichr"),
                                 cache = TRUE)
{
    db <- match.arg(db)
    if(db == "kegg") .keggSpecies(cache)
    else if(db == "go") .goSpecies(cache)
    else if(db == "msigdb") .msigdbSpecies(cache)
    else .enrichrSpecies()
}

#' @export
#' @rdname getGenesets
showAvailableCollections <- function(org, 
                                     db = c("go", "kegg", "msigdb", "enrichr"),
                                     cache = TRUE)
{
    db <- match.arg(db)
    if(db == "kegg") .keggCollections()
    else if(db == "go") .goCollections()
    else if(db == "msigdb") .msigdbCollections()
    else .enrichrLibs(.org2enrichr(org), cache)
}

# write genesets to file in GMT format
#' @export
#' @rdname getGenesets
writeGMT <- function(gs, gmt.file)
{
    ## collapse geneset members to one tab separated string 
    gs.strings <- vapply(gs, 
        function(x) paste(x, collapse="\t"),
        character(1))
    
    ## paste an not annotated second column (artifact of gmt format)
    ann <- paste(names(gs), rep(NA,length(gs)), sep="\t")
    
    ## paste all together
    all <- paste(ann, gs.strings, sep="\t")
    
    ## collapse all together to a single newline separated string
    all.str <- paste(all, collapse="\n")
    all.str <- paste(all, "\n", sep="")
    
    ## write the gs in gmt format
    cat(all.str, file=gmt.file, sep="")
}


#
# ENRICHR
#
.getEnrichr <- function(org, gene.id.type, cache, return.type, lib)
{
    # convert organism
    eorg <- .org2enrichr(org)
    
    # only show available gene set libraries?
    libs <- suppressMessages(.enrichrLibs(eorg, cache))

    gs.tag <- "gs"
    if(return.type == "GeneSetCollection") gs.tag <- paste0(gs.tag, "c")
    gsc.name <- paste(org, "enrichr", lib, gene.id.type, gs.tag, sep =".")
 
    # should a cached version be used?
    if(cache)
    {
        gs <- .getResourceFromCache(gsc.name)
        if(!is.null(gs)) return(gs)
    }
    
    # obtain the gene set library
    if(!(lib %in% libs[,"libraryName"])) stop("Not a valid gene set library") 
    gs <- .getEnrichrLib(eorg, lib)    

    # ID mapping
    from <- ifelse(org == "sce", "GENENAME", "SYMBOL")
    
    if(return.type == "GeneSetCollection")
    {
        ct <- ComputedCollection()
        titles <- vapply(names(gs), .extractTitle, character(1))
        it <- if(org == "sce") GenenameIdentifier() else SymbolIdentifier()
        gs <- .makeGSC(gs, titles, org, ct, it)        
    }
    
    if(gene.id.type != from)
    {
        suppressMessages(
            gs <- idMap(gs, org, from = from, to = gene.id.type)
        )
    }
    .cacheResource(gs, gsc.name)
    return(gs)
}

.getEnrichrLib <- function(eorg, lib)
{
    getURL <- NULL
    isAvailable("RCurl")               
    
    eurl <- paste0("https://maayanlab.cloud/", eorg, 
                    "/geneSetLibrary?mode=text&libraryName=", lib)
    if(.Platform$OS.type == "windows") gs <- getURL(eurl, ssl.verifypeer = FALSE)
    else gs <- getURL(eurl)
    gs <- unlist(strsplit(gs, "\n"))
    gs <- lapply(gs, function(x) unlist(strsplit(x, "\t")))
    names(gs) <- vapply(gs, function(x) x[1], character(1))
    names(gs) <- gsub(" ", "_", names(gs))
    ids <- paste0("ER", seq_along(gs))
    names(gs) <- paste(ids, names(gs), sep = "_")
    gs <- lapply(gs, function(x) x[-c(1,2)])
    .trim <- function(x) vapply(x, 
                                function(y) unlist(strsplit(y, ","))[1], 
                                character(1), USE.NAMES=FALSE)
    gs <- lapply(gs, .trim)
    lapply(gs, function(s) s[!is.na(s)])
}

.org2enrichr <- function(org)
{
    sorgs <- .enrichrSpecies()[,"kegg"] 
    if(!(org %in% sorgs)) stop("Organism not supported") 
    eorgs <- c("Worm", "Fly", "Fish", "", "Yeast")
    names(eorgs) <- sorgs
    eorg <- eorgs[org]
    paste0(eorg, "Enrichr")
}

.enrichrLibs <- function(eorg, cache)
{
    selibs <- paste(eorg, "elibs", sep = ".")
    if(cache)
    {
        elibs <- .getResourceFromCache(selibs)
        if(!is.null(elibs)) return(elibs)
    }
    eurl <- paste0("https://maayanlab.cloud/", eorg, "/datasetStatistics")
    GET <- fromJSON <- NULL
    isAvailable("httr", type = "software")
    dbs <- GET(eurl)
    dbs <- dbs$content
    dbs <- intToUtf8(dbs)
    isAvailable("rjson", type = "software")
    dbs <- fromJSON(dbs)
    dfSAF <- getOption("stringsAsFactors")
    options(stringsAsFactors = FALSE)
    dbs <- lapply(dbs$statistics, function(x) do.call(cbind.data.frame, x))
    dbs <- do.call(rbind.data.frame, dbs)
    options(stringsAsFactors = dfSAF)
    cols <- c("libraryName", "link", "numTerms", "geneCoverage", "genesPerTerm")
    dbs <- dbs[,cols]
    dbs <- DataFrame(dbs)
    .cacheResource(dbs, selibs)
    dbs
}

.enrichrSpecies <- function()
{
    sorgs <- c("cel", "dme", "dre", "hsa", "sce")
    ind <- match(sorgs, SPECIES[,"kegg"])
    DataFrame(SPECIES[ind, c("kegg", "tax", "common")])
}

#
# MSigDB
#
.getMSigDb <- function(org, gene.id.type, cache, return.type,
                        cat = c("H", paste0("C", 1:8)), 
                        subcat = NA)
{
    isAvailable("msigdbr", type = "software")
    cat <- cat[1]
    supp.cats <- unique(msigdbr::msigdbr_collections()$gs_cat)
    if(!is.character(cat) || !(cat %in% supp.cats))
        stop(gettextf("'cat' should be one of %s", 
                      paste(dQuote(supp.cats), collapse = ", ")), domain = NA)
 
    gs.tag <- "gs"
    if(return.type == "GeneSetCollection") gs.tag <- paste0(gs.tag, "c")

    gsc.name <- paste(org, "msigdb", cat, 
                      subcat, gene.id.type, gs.tag, sep = ".")
    # should a cached version be used?
    if(cache)
    {
        gs <- .getResourceFromCache(gsc.name)
        if(!is.null(gs)) return(gs)
    }

    ind <- match(org, SPECIES[,"kegg"])
    morg <- SPECIES[ind, "tax"]

    if(!(morg %in% msigdbr::msigdbr_species()$species_name)) 
        stop("Organism not supported")

    df <- msigdbr::msigdbr(morg, cat, subcat)
    gs <- split(as.character(df$entrez_gene), df$gs_id)
    gs.names <- unique(df$gs_name)
    gs.ids <- unique(df$gs_id)
    gs.names <- paste(gs.ids, gs.names, sep = "_")
    names(gs.names) <- gs.ids
    names(gs) <- unname(gs.names[names(gs)])
    
    if(return.type == "GeneSetCollection")
    {
        ct <- BroadCollection(category = tolower(cat), 
                                subCategory = tolower(subcat))
        titles <- vapply(names(gs), .extractTitle, character(1))
        gs <- .makeGSC(gs, titles, org, ct)        
    }
    
    if(gene.id.type != "ENTREZID")
    {
        suppressMessages( 
            gs <- idMap(gs, org, from = "ENTREZID", to = gene.id.type)
        )
    }
    .cacheResource(gs, gsc.name)
    return(gs)
}

.msigdbSpecies <- function(cache)
{
    sms <- "msigdb.species" 
    if(cache)
    {
        morgs <- .getResourceFromCache(sms)
        if(!is.null(morgs)) return(morgs)
    }
    isAvailable("msigdbr", type = "software")
    ms <- msigdbr::msigdbr_species()$species_name
    ms[ms == "Canis lupus familiaris"] <- "Canis familiaris"
    ms[ms == "Schizosaccharomyces pombe 972h-"] <- "Schizosaccharomyces pombe"
    ind <- match(ms, SPECIES[,"tax"])
    morgs <- SPECIES[ind, c("kegg", "tax", "common")]
    morgs <- DataFrame(morgs)
    .cacheResource(morgs, sms)
    return(morgs)
}

.msigdbCollections <- function()
{
    isAvailable("msigdbr", type = "software")
    DataFrame(msigdbr::msigdbr_collections())    
#    msdb.colls <- "msigdb.collects" 
#    if(cache)
#    {
#        colls <- .getResourceFromCache(msdb.colls)
#        if(!is.null(colls)) return(colls)
#    }
#    df <- msigdbr::msigdbr()[,c("gs_cat", "gs_subcat")]
#    df <- as.data.frame(unique(df))
#    df <- df[do.call(order, df),]
#    rownames(df) <- NULL
#    colnames(df) <- sub("^gs_", "", colnames(df))
#    df <- DataFrame(df)
#    .cacheResource(df, msdb.colls)
#    return(df)
}

#
# GO
#
.getGO <- function(org, gene.id.type, cache, return.type, 
                   onto = c("BP", "MF", "CC"),
                   evid = NULL,
                   hierarchical = FALSE,
                   mode = c("GO.db", "biomart"))
{
    onto <- match.arg(onto)

    gs.tag <- "gs"
    if(return.type == "GeneSetCollection") gs.tag <- paste0(gs.tag, "c")
    hier.tag <- ifelse(hierarchical, "hier", "")
    gsc.name <- paste(org, "go", tolower(onto),
                      gene.id.type, gs.tag, hier.tag, sep = ".")
    
    # should a cached version be used?
    if(cache)
    {
        gs <- .getResourceFromCache(gsc.name)
        if(!is.null(gs)) return(gs)
    }

    # from GO.db or from biomart?
    mode <- match.arg(mode)
    if(mode == "GO.db")
        gs <- .getGOFromGODB(org, onto, evid, hierarchical, return.type)
    else gs <- .getGOFromBiomart(org, onto, evid, hierarchical, return.type)

    if(gene.id.type != "ENTREZID")
    {
        suppressMessages( 
            gs <- idMap(gs, org, from = "ENTREZID", to = gene.id.type)
        )
    }
    .cacheResource(gs, gsc.name)
    return(gs)
}

.getGOFromGODB <- function(org, onto, evid, hierarchical, return.type)
{
    orgpkg <- .getAnnoPkg(org)
    GO2descr <- AnnotationDbi::as.list(GO.db::GOTERM)
    org.terms <- AnnotationDbi::keys(orgpkg, keytype = "GO")
    org.terms <- sort(org.terms)
    htag <- ifelse(hierarchical, "ALL", "")
    kt <- paste0("GO", htag)
    
    suppressMessages(
        dbmap <- AnnotationDbi::select(orgpkg, 
                                       keys = org.terms,
                                       columns = "ENTREZID",
                                       keytype = kt)
    )
    ot <- paste0("ONTOLOGY", htag)
    dbmap <- subset(dbmap, dbmap[[ot]] %in% onto)
    if(!is.null(evid))
    { 
        et <- paste0("EVIDENCE", htag)
        dbmap <- subset(dbmap, dbmap[[et]] %in% evid)
    }
    gs <- split(dbmap$ENTREZID, dbmap[[kt]])

    gs <- gs[intersect(names(gs), names(GO2descr))]
    GO2descr <- GO2descr[names(gs)]
    GO2descr <- vapply(GO2descr, AnnotationDbi::Term, character(1))

    if(return.type == "GeneSetCollection")
    {
        if(is.null(evid)) evid <- NA_character_
        ct <- GOCollection(evidenceCode = evid, ontology = onto)
        gs <- .makeGSC(gs, GO2descr, org, ct)        
    }
    else names(gs) <- paste(names(gs), gsub(" ", "_", GO2descr), sep = "_")
    return(gs)
}

.getGOFromBiomart <- function(org, onto, evid, hierarchical, return.type)
{
    if(hierarchical) stop("hierarchical GO gene set definition is currently ",
                          "supported in GO.db mode only")
    useMart <- listDatasets <- useDataset <- getBM <- NULL
    isAvailable("biomaRt", type = "software")
    # setting mart
    ensembl <- useMart("ensembl")
    ds <- listDatasets(ensembl)[,"dataset"]
    ds <- grep(paste0("^", org), ds, value=TRUE)
    ensembl <- useDataset(ds, mart=ensembl)

    message("Downloading mapping from BioMart ...")
    message("This may take a few minutes ...")
    
    GO2descr <- getBM(mart=ensembl,
        attributes=c("go_id", "name_1006", "namespace_1003"))
    GO2descr <- GO2descr[GO2descr$go_id != "", ]
    GO2descr <- GO2descr[order(GO2descr[,"go_id"]),]
    .getOnto <- function(x)
    {
        spl <- unlist(strsplit(x, "_"))
        onto <- paste(substring(spl,1,1), collapse="")         
        return(toupper(onto))
    }
    ontos <- vapply(GO2descr[,3], .getOnto, character(1)) 
    GO2descr <- GO2descr[ontos==onto,1:2]
   
    attrs <- c("entrezgene_id", "go_id", "go_linkage_type") 
    gene2GO <- getBM(attributes = attrs, mart = ensembl)
    gene2GO <- gene2GO[apply(gene2GO, 1 , function(r) all(r != "")), ]
    gene2GO <- gene2GO[order(gene2GO[,"go_id"]),]
    gene2GO <- gene2GO[gene2GO[,"go_id"] %in% GO2descr[,"go_id"],]
    if(!is.null(evid)) gene2GO <- subset(gene2GO, go_linkage_type %in% evid)
    gs <- lapply(GO2descr[,"go_id"], 
        function(g) gene2GO[gene2GO[,"go_id"] == g, "entrezgene_id"])
    gs <- lapply(gs, function(s) sort(as.character(s)))

    ids <- GO2descr[,1]
    GO2descr <- GO2descr[,2] 
    names(gs) <- names(GO2descr) <- ids
    if(return.type == "GeneSetCollection")
    {
        if(is.null(evid)) evid <- NA_character_
        ct <- GOCollection(evidenceCode = evid, ontology = onto)
        gs <- .makeGSC(gs, GO2descr, org, ct)        
    }
    else names(gs) <- paste(ids, gsub(" ", "_", unname(GO2descr)), sep="_")
    return(gs)
}

.goSpecies <- function(cache)
{
    if(cache)
    {
        gorgs <- .getResourceFromCache("go.orgs")
        if(!is.null(gorgs)) return(gorgs)
    }
    useMart <- listDatasets <- NULL
    isAvailable("biomaRt", type = "software")
    ensembl <- useMart("ensembl")
    ds <- listDatasets(ensembl)

    dat <- ds[,"dataset"]
    desc <- ds[,"description"]
    kegg <- substring(dat, 1, 3)
    tax <- sub("_gene_ensembl$", "", dat)
    tax1 <- substring(tax, 1, 1)
    tax1 <- toupper(tax1)
    tax1 <- paste0(tax1, ". ")
    tax2 <- sub("^[a-z]", "", tax)
    tax <- paste0(tax1, tax2)
    common <- vapply(desc, 
                     function(x) unlist(strsplit(x, " genes "))[1],
                     character(1),
                     USE.NAMES = FALSE)

    gorgs <- DataFrame(kegg = kegg, tax = tax, common = common)
    .cacheResource(gorgs, "go.orgs")
    return(gorgs)
}

.goCollections <- function()
{
    onto <- c("BP", "MF", "CC")
    desc <- c("Biological Process", "Molecular Function", "Cellular Component")
    DataFrame(onto = onto, desc = desc)
}

#
# KEGG
#

.getKEGG <- function(pwys, gene.id.type, cache, return.type)
{
    is.org <- length(pwys) == 1 && grepl("^[a-z]{3,4}$", pwys)
    # download all gs of organism
    if(is.org) gs <- .dwnldAllKeggGS(pwys, gene.id.type, cache, return.type)
    # download selected ids
    else gs <- .dwnldSelectedKeggGS(pwys)
    return(gs)
}

.keggSpecies <- function(cache) 
{
    if(cache)
    {
        korgs <- .getResourceFromCache("kegg.orgs")
        if(!is.null(korgs)) return(korgs)
    }
    korgs <- KEGGREST::keggList("organism")
    .fspl <- function(x)
    {
        spl <- unlist(strsplit(x, " \\("))
        if(length(spl) < 2) spl <- c(spl, "")
        else spl[2] <- sub("\\)$", "", spl[2])
        unname(spl[1:2])
    }

    spl <- vapply(korgs[,"species"], .fspl, character(2), USE.NAMES = FALSE) 
    korgs <- cbind(korgs[,c("organism")], t(spl))
    colnames(korgs) <- c("kegg", "tax", "common")
    korgs <- DataFrame(korgs)
    .cacheResource(korgs, "kegg.orgs")
    return(korgs)
}

.keggCollections <- function()
    message("KEGG gene sets are currently returned as a single collection")


.dwnldAllKeggGS <- function(org, gene.id.type, cache, return.type)
{
    gs.tag <- "gs"
    if(return.type == "GeneSetCollection") gs.tag <- paste0(gs.tag, "c")
    gsc.name <- paste(org, "kegg", gene.id.type, gs.tag, sep = ".") 
    
    # should a cached version be used?
    if(cache)
    {
        gs <- .getResourceFromCache(gsc.name)
        if(!is.null(gs)) return(gs)
    }
    pwys <- KEGGREST::keggList("pathway", org)
    pwys <- pwys[sort(names(pwys))]
    pwy2gene <- KEGGREST::keggLink(org, "pathway")
    names(pwy2gene) <- sub("^path:", "", names(pwy2gene))

    org.start <- paste0("^", org, ":")
    .getGenes <- function(pwy)
    { 
        genes <- pwy2gene[names(pwy2gene) == pwy]
        genes <- sub(org.start, "", genes)
        genes <- unname(sort(genes))
        return(genes)
    }
    gs <- lapply(names(pwys), .getGenes)
        
    # entrez id?
    id.type <- .detectGeneIdType(gs[[1]][1])
    if(is.na(id.type) || id.type != "entrez") 
    {
        map.k2e <- try(KEGGREST::keggConv("ncbi-geneid", org), silent = TRUE)
        if(is(map.k2e, "try-error"))
        {
            message("NCBI Gene ID mapping is not available for: ", org)
            message("Returning KEGG native gene IDs ...")
        }
        else
        {
            names(map.k2e) <- sub(org.start, "", names(map.k2e))
            map.k2e <- sub("^ncbi-geneid:", "", map.k2e)
            gs <- lapply(gs, function(s) unname(map.k2e[s]))
        }
    }

    if(return.type == "GeneSetCollection") gs <- .makeKeggGSC(gs, pwys, org)
    else names(gs) <- .makeGSNames(names(pwys), pwys)

    if(gene.id.type != "ENTREZID")
    {
        suppressMessages( 
            gs <- idMap(gs, org, from = "ENTREZID", to = gene.id.type)
        )
    }
    .cacheResource(gs, gsc.name)
    return(gs)
}

.makeKeggGSC <- function(gs, pwys, org)
{
    ids <- sub("^path:", "", names(pwys))
    titles <- vapply(unname(pwys), 
        function(title) unlist(strsplit(title, " - "))[1], 
        character(1), USE.NAMES=FALSE)
    names(gs) <- names(titles) <- ids 
    ct <- KEGGCollection()
    gs <- .makeGSC(gs, titles, org, ct)
    return(gs)
}

.dwnldSelectedKeggGS <- function(pwys)
{
    .dwnld <- function(pwy)
    { 
        info <- KEGGREST::keggLink(paste0("path:", pwy))
        org <- sub("[0-9]+$", "", pwy)
        genes <- grep(paste0("^", org, ":"), info, value=TRUE)
        genes <- sub("^[a-z]{3,4}:", "", genes)
        genes <- unname(sort(genes))
        return(genes)
    }
    gs <- lapply(pwys, .dwnld) 
    .makeTitle <- function(pwy)
    {
        ti <- paste0("map", sub("^[a-z]{3,4}", "", pwy))
        ti <- KEGGREST::keggList(ti)
        return(ti)
    }
    titles <- vapply(pwys, .makeTitle, character(1))
    names(titles) <- pwys
    pwys <- titles
    names(gs) <- .makeGSNames(names(pwys), pwys)
    return(gs)
}

# only preferred over '.dwnldKeggGS' 
# when pathway kgmls have already been download
.extractKeggGS <- function(pwys, gmt.file=NULL)
{
    # read in & parse pathways
    if(is.character(pwys)) pwys <- .extractPwys(pwys)
    
    # get pathway annotations
    nn <- vapply(pwys, getName, character(1))
    tt <- vapply(pwys, getTitle, character(1))
    
    # extract genesets
    gs <- lapply(pwys, 
        function(pwy)
        {
            genes <- .getGenesByPwy(pwy)
            genes <- sub("^[a-z]{3,4}:", "", genes)
            genes <- sort(genes)
            return(genes)
        })

    names(gs) <- .makeGSNames(nn, tt)
    
    if(!is.null(gmt.file)) writeGMT(gs, gmt.file=gmt.file)
    return(gs)
}

## .extractPwys from zip archive and parse KGML files
.extractPwys <- function(pwy.zip)
{
    pwy.dir <- dirname(pwy.zip)
    unzip(pwy.zip, exdir=pwy.dir, junkpaths=TRUE)
    pwy.files <- list.files(pwy.dir, pattern="*.xml", full.names=TRUE)
    pwys <- sapply(pwy.files, parseKGML)
    ## clean up
    for(f in pwy.files) file.remove(f)
    return(pwys)
}

.getGenesByPwy <- function(pwy)
{
    ts <- vapply(nodes(pwy), getType, character(1))
    genes <- unique(unlist(lapply(nodes(pwy)[ts == "gene"], getName)))
    return(genes)
}

#
# UTILS
#

# make GeneSetCollection from gene set list
.makeGSC <- function(gs, titles, org, ctype, it = EntrezIdentifier())
{
    gs <- lapply(gs, function(s) s[!is.na(s)])
    gs <- lapply(gs, unique)
    .makeGeneSet <- function(s)
    {
        sname <- gsub("[<>:\\?\\|\"\\*\\/]", "", s)
        is.idcoll <- !is(ctype, "ComputedCollection") && !is(ctype, "BroadCollection")
        if(is.idcoll) ctype@ids <- s 
        gset <- GeneSet(setName=sname,
                    geneIds=gs[[s]],
                    type=it,
                    collectionType=ctype,
                    shortDescription=unname(titles[s]),
                    organism=org)
        return(gset)
    }
    gs <- lapply(names(gs), .makeGeneSet)
    gs <- GeneSetCollection(gs)
}


# build first gmt column: the ID (format: <pwy.nr>_<pwy.title>)
.makeGSNames <- function(ids, titles)
{
    ids <- sub("^path:", "", ids)
    titles <- vapply(titles, 
        function(title) unlist(strsplit(title, " - "))[1], 
        character(1))
    titles <- sub("^ +", "", titles)
    titles <- sub(" +$", "", titles)
    titles <- gsub(" ", "_", titles)
    ids <- paste(ids, titles, sep="_")
    return(ids)
}


.extractTitle <- function(n)
{
    spl <- unlist(strsplit(n, "_"))
    paste(spl[-1], collapse = " ")
}

.parseGMT <- function(gmt.file, return.type)
{
    content <- readLines(gmt.file, warn=FALSE)
    le <- length(content)
    gs <- vector("list", length=le)
    gs.names <- vector("character", length=le)
    for(i in seq_len(le))
    {
        line <- content[i]
        spl <- unlist(strsplit(line, "\t"))
        gs.names[i] <- spl[1]
        gs[[i]] <- spl[-c(1,2)]
    }
    if(return.type == "GeneSetCollection")
    {
        spl <- lapply(gs.names, function(s) unlist(strsplit(s, "_"))) 
        ids <- vapply(spl, function(s) s[1], character(1))
        if(length(unique(ids)) < length(ids)) ids <- paste0(ids, seq_along(ids))
        .getTitle <- function(s)
        {
            descr <- "" 
            if(length(s) > 1) descr <- paste(s[2:length(s)], collapse=" ")
            return(descr)
        }
        titles <- vapply(spl, .getTitle, character(1))
        names(gs) <- names(titles) <- ids

        ct <- ComputedCollection()
        gs <- .makeGSC(gs, titles, NA_character_, ct)        
    }
    names(gs) <- gs.names
    return(gs)
}

# prepare gene sets as gene set collection from a variety of input formats
.prepGS <- function(gsets)
{
    if(!is(gsets, "GeneSetCollection"))
	{	
        if(!is.list(gsets))
        {
		    # gmt file or char vector of set names/ids
		    if(class(gsets) != "character")
                stop(paste(
                    "Invalid input: \'gsets\' need to be either a filename",
                    "(GMT file), a character vector (gene set IDs), a named",
                    "list of character vectors, or a GeneSetCollection.", 
                    "See the man page for details."
                ))
            else
            {
			    # gmt file
			    if(file.exists(gsets[1])) 
                    gsets <- getGenesets(gsets)
			    # char vector
			    else gsets <- .createGS(gsets)
		    }
        }
        gsets <- .gsList2Collect(gsets)
	}
    return(gsets)
}

# coerces a list of gene sets (char vecs) into a GeneSetCollection
.gsList2Collect <- function(gs.list)
{
    # gs type
    first <- names(gs.list)[1]
	gs.type <- .detectGSType(first)
    ctype <- paste0(gs.type, "Collection")
    args <- list()
    if(gs.type == "GO") args <- list(evidenceCode=NA_character_)
    ctype <- do.call(ctype, args)
    
    # org
    spl <- unlist(strsplit(first, "_")) 
    org <- ifelse(gs.type == "KEGG", sub("[0-9]+$", "", spl[1]), NA_character_)
       
    # ids & titles
    spl <- lapply(names(gs.list), function(s) unlist(strsplit(s, "_"))) 
    ids <- vapply(spl, function(s) s[1], character(1))
    if(length(unique(ids)) < length(ids)) ids <- paste0(ids, seq_along(ids))
    .getTitle <- function(s)
    {
        descr <- "" 
        if(length(s) > 1) descr <- paste(s[2:length(s)], collapse=" ")
        return(descr)
    }
    titles <- vapply(spl, .getTitle, character(1))
    names(gs.list) <- names(titles) <- ids

    gsc <- .makeGSC(gs.list, titles, org, ctype)        
    return(gsc)
}

# create gene sets from a list of gene set ids such as
# 'hsa00010' (KEGG ID) or 'GO:0000002' (GO ID)
.createGS <- function(gs.ids)
{
	gs.type <- .detectGSType(gs.ids[1])
	if(gs.type == "GO") gs <- .getGO(gs.ids)
	else if(gs.type == "KEGG") gs <- .getKEGG(gs.ids)
	else stop(paste("Automatic gene set recognition", 
        "is currently only supported for GO and KEGG"))
	return(gs)
}

# currently supported: GO, KEGG, user.def
.detectGSType <- function(gs.id)
{
	if(substring(gs.id, 1, 3) == "GO:") return("GO")
	else if(grepl("^[a-z]{3,4}[0-9]{5}", gs.id)) return("KEGG")
	else return("Computed") 
}
lgeistlinger/EnrichmentBrowser documentation built on Oct. 29, 2023, 5:08 p.m.