R/feature_processing.R

Defines functions initOmicsDataStructure rankFeatures getIDMappingDatabase getOrganisms mapIDType getMappedFeatures getMultiOmicsFeatures getMetaboliteMapping getGeneMapping getFeatures

Documented in getFeatures getGeneMapping getIDMappingDatabase getMappedFeatures getMetaboliteMapping getMultiOmicsFeatures getOrganisms initOmicsDataStructure mapIDType rankFeatures

#' Wrapper to extract features (nodes) from given pathways.
#'
#' Function to extract the features (nodes) from a given list of pathways. These
#' pathways have to be compiled with the
#' \code{\link[graphite:pathways]{pathways}} function. Features can only be
#' extracted for \'proteins\' or \'metabolites\'. Features will by default be
#' mapped to gene symbols.
#'
#' @param pathway A pathway created with \code{\link[graphite:pathways]{pathways}} command.
#' @param which Mode to extract the features, either \'proteins\' or
#'   \'metabolites\'.
#' @param org String specifying the organism, which is necessary for featureID
#'   mapping. Default: human
#' @param returntype String that specifies the returning ID type. Default:
#'   SYMBOL Options (genes/proteins): SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ
#'   Options (metabolites): HMDB, CAS, DTXCID, DTXSID, SID, CID, ChEBI, KEGG, Drugbank
#'
#' @return Feature list with gene symbols (genes/proteins) or CHEBI IDs
#'   (metabolites)
#'
#' @examples
#' pathways <- graphite::pathways("hsapiens", "kegg")[[1]]
#' getFeatures(pathways)
#' \donttest{
#' pathways <- graphite::pathways("mmusculus", "kegg")[[1]]
#' getFeatures(pathways, which = "metabolites", org = "mmusculus", returntype = "HMDB")
#'
#' pathways <- graphite::pathways("mmusculus", "kegg")
#' getFeatures(pathways, which = "proteins", org = "mmusculus", returntype = "SYMBOL")
#' }
#'
#' @importFrom graphite nodes
#'
#' @export
getFeatures <- function(pathway, which = "proteins", org = "hsapiens", returntype = "SYMBOL") {
    
    if (which != "proteins" && which != "metabolites") {
        stop("Only 'proteins' and 'metabolites' are supported for 'which'.",
            call. = FALSE
        )
    }

    org <- tolower(org)

    ## check for the correct organism
    if (!(org %in% getOrganisms())) {
        stop("Please insert a correct organism name! Use getOrganisms() 
              to see all supported organisms.",
            call. = FALSE
        )
    }

    ## extract the features (genes/proteins or metabolites) from the pathway nodes.
    features <- nodes(pathway, which = which)

    if (length(features) == 0) {
        return(list())
    }

    if (which == "proteins") {

        ## extract the keytype of the ID format
        kt <- gsub(":.*", "", features[1])
        mapped <- gsub("[A-Z]+:", "", features)

        ## get the mapping from the keytype to the user-specific type
        mapped <- getGeneMapping(
            features = mapped, keytype = kt,
            org = org, returntype = returntype
        )
    }

    ## its special for metabolites, because sometimes there are different
    ## identifiers used in the same pathway
    if (which == "metabolites") {
        chebi <- mapIDType(
            features = features, keytype = "CHEBI",
            maptype = "ChEBI", returntype = returntype
        )

        kegg <- mapIDType(
            features = features, keytype = "KEGGCOMP",
            maptype = "KEGG", returntype = returntype
        )

        pubchem <- mapIDType(
            features = features, keytype = "PUBCHEM",
            maptype = "CID", returntype = returntype
        )

        mapped <- c(chebi, kegg, pubchem)
    }

    return(mapped)
}



#' Mapping between pathway encoded genes/proteins and different ID formats.
#'
#' Function to retrieve the gene/protein identifier mapping. Ongoing from
#' genes/proteins retrieved from pathway definitions, which often include two or
#' more ID formats or a format that is not present in your omics measurement,
#' this function maps those IDs to a given format. Depending on the organism,
#' additional packages have to be installed.
#'
#' @param features List of identifiers to be mapped.
#' @param keytype String specifying the ID type, e.g., "ENTREZID" or "UNIPROT".
#' @param org String that defines the organism. Default: hsapiens
#'   Options: see \code{\link[multiGSEA]{getOrganisms}}
#' @param returntype String that specifies the returning ID type. Default:
#'   SYMBOL, Options: SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ
#'
#' @return List containing mapped gene/protein IDs.
#'
#' @examples
#' features <- graphite::nodes(graphite::pathways("hsapiens", "kegg")[[1]])
#' features <- gsub("ENTREZID:", "", features)
#' keytype <- "ENTREZID"
#' getGeneMapping(features, keytype)
#'
#' getGeneMapping(features, keytype, returntype = "UNIPROT")
#' \donttest{
#' features <- graphite::nodes(graphite::pathways("rnorvegicus", "reactome")[[1]])
#' features <- gsub("UNIPROT:", "", features)
#' getGeneMapping(features, keytype = "UNIPROT", org = "rnorvegicus")
#'
#' getGeneMapping(features,
#'     keytype = "UNIPROT",
#'     org = "rnorvegicus",
#'     returntype = "ENSEMBL"
#' )
#' }
#'
#' @importFrom AnnotationDbi select
#'
#' @export
getGeneMapping <- function(features, keytype, org = "hsapiens", returntype = "SYMBOL") {
    
    org <- tolower(org)

    ## check for the correct organism
    if (!(org %in% getOrganisms())) {
        stop("Please insert a correct organism name! Use getOrganisms()
              to see all supported organisms.",
            call. = FALSE
        )
    }

    db <- getIDMappingDatabase(org)

    supportedIDs <- c("SYMBOL", "ENTREZID", "UNIPROT", "ENSEMBL", "REFSEQ")
    if (org != "dmelanogaster" && !returntype %in% supportedIDs) {
        stop("Insert one of the following IDs to be returned (returntype): 
              SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ.",
            call. = FALSE
        )
    }

    supportedIDs <- c(supportedIDs, "FLYBASE", "FLYBASECG")
    if (org == "dmelanogaster" && !returntype %in% supportedIDs) {
        stop("Insert one of the following IDs to be returned (returntype): 
              SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ, FLYBASE, FLYBASECG.",
            call. = FALSE
        )
    }

    ## design the columns field such that we create a triple ID mapping between
    ## ENTREZIDs, UNIPROT, and gene symbols
    if (keytype == "UNIPROT") {
        col <- unique(c("SYMBOL", "ENTREZID", returntype))
    }
    else {
        col <- unique(c("SYMBOL", "UNIPROT", returntype))
    }
    
    ## run the actual mapping of IDS and return a list of the user-given type
    map <- tryCatch(
        {
            map <- select(db, keys = features, columns = col, keytype = keytype)
            m <- match(unique(map[[keytype]]), map[[keytype]])
            map <- map[m, ]
            map[[returntype]][!is.na(map[[returntype]])]
        },
        error = function(cond) {
            return(list())
        }
    )
    
    return(map)
    
}



#' Mapping between pathway encoded metabolites and different metabolite ID
#' formats.
#'
#' Function to retrieve the metabolite identifier mapping. Ongoing from
#' metabolites retrieved from pathway definitions, which often include two or
#' more ID formats, this function maps those IDs to a given format. The complete
#' mapping table based on \href{https://comptox.epa.gov/dashboard}{Comptox
#' Dashboard}, \href{https://pubchem.ncbi.nlm.nih.gov/}{PubChem},
#' \href{https://hmdb.ca/}{HMDB}, and \href{https://www.ebi.ac.uk/chebi}{ChEBI}
#' is provided in the AnnotationHub package metaboliteIDmapping.
#'
#' @param features List of identifiers to be mapped.
#' @param keytype String specifying the ID type, e.g., "ChEBI" or "KEGGCOMP".
#'
#' @param returntype String that specifies the returning ID type.
#'        Default: HMDB
#'        Options: HMDB, CAS, DTXCID, DTXSID, SID, CID, ChEBI, KEGG, Drugbank
#'
#' @return List containing mapped gene/protein IDs.
#'
#' @examples
#' features <- graphite::nodes(graphite::pathways("hsapiens", "kegg")[[1]], which = "metabolites")
#' features <- gsub("KEGGCOMP:", "", features)
#' keytype <- "KEGG"
#'
#' getMetaboliteMapping(features, keytype)
#'
#' getMetaboliteMapping(features, keytype = "KEGG", returntype = "CID")
#' @export
getMetaboliteMapping <- function(features, keytype, returntype = "HMDB") {
    
    ## check for the correct metabolite mapping format
    supportedIDs <- c("HMDB", "ChEBI", "KEGG", "CAS", "DTXCID",
                      "DTXSID", "SID", "CID", "Drugbank")
    if (!returntype %in% supportedIDs) {
        stop( "Insert one of the following IDs to be returned (returntype):
              HMDB, CAS, ChEBI, KEGG, SID, CID, DTXCID, DTXSID, Drugbank, Name",
              call. = FALSE
        )
    }
    
    ## load the mapping table which is deposited in the 
    ## metaboliteIDmapping package.
    if (!requireNamespace( "metaboliteIDmapping", quietly = TRUE)) {
        stop( "The necessary package metabolieIDmapping is not installed.",
              call. = FALSE
        )
    }
    
    
    ## run the actual mapping of IDS and return a list of the user-given type
    map <- tryCatch(
        {
            ## find all rows with a given feature and keytype
            ## extract all unique mappings to a returntype
            ## i.e., there can be more than one ID that is mapped
            ## to one input feature
            m <- lapply( unique( features), function(x){
                
                rows <- which( metaboliteIDmapping::metabolitesMapping[[ keytype]] %in% x)
                unique( metaboliteIDmapping::metabolitesMapping[ rows,][[ returntype]])
                
            })
            
            m <- unlist( m)
            m <- m[ !is.na( m)]
            
        },
        error = function(cond) {
            return(list())
        }
    )
    
    map <- map[!is.na(map)]
    
    return(map)
    
}



#' Collect feature mapping for user given databases and omics layer.
#'
#' The functions makes use of the graphite R package to collect pathways from
#' user specified databases. Depending on the omics layer specified, the
#' function extracts either annotated genes/proteins (for transcriptome,
#' proteome layer) or metabolites (for metabolite layer). The data structure
#' that is returned is mandatory to calculate the multi-omics pathway
#' enrichment.
#'
#' @param dbs List of databases that should be queried for pathways. Default:
#'   all available databases
#' @param layer List of omics layer that should be addressed. Default: all three
#'   layer (transcriptome, proteome, metabolome)
#' @param returnTranscriptome String specifying the returned gene ID format.
#'   Default: SYMBOL Options: SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ
#' @param returnProteome String specifying the returned protein ID format.
#'   Default: SYMBOL Options: SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ
#' @param returnMetabolome String specifying the returned metabolite ID format.
#'   Default: HMDB Options: HMDB, CAS, DTXCID, DTXSID, SID, CID, ChEBI, KEGG, Drugbank
#' @param organism String specifying the organism of interest. This has direct
#'   influence on the available pathway databases. Default: "hsapiens"
#'   Options: see \code{\link[multiGSEA]{getOrganisms}}
#' @param useLocal Boolean to use local pathway/feature descriptions. In case
#'   useLocal is set to FALSE, pathway definitions and feature extraction
#'   will be recalculated. This could take several minutes depending on the
#'   database used. Pathbank, for example, contains nearly 50000 pathway
#'   definition that have to be re-mapped. useLocal has no effect when
#'   pathway definitions are retrieved for the first time. Default: TRUE
#'
#' @return Nested list with extracted and mapped pathway features.
#'
#' @examples
#'
#' getMultiOmicsFeatures(
#'     dbs = c("kegg"),
#'     layer = c("transcriptome", "proteome"),
#'     organism = "hsapiens"
#' )
#' \donttest{
#' getMultiOmicsFeatures(
#'     dbs = c("kegg", "reactome"),
#'     layer = c("transcriptome", "metabolome"),
#'     organism = "mmusculus"
#' )
#'
#' getMultiOmicsFeatures(
#'     dbs = c("reactome"),
#'     layer = c("proteome"),
#'     organism = "rnorvegicus",
#'     returnProteome = "ENTREZID"
#' )
#' }
#' @importFrom graphite pathwayDatabases pathways
#' @importFrom magrittr %>%
#' @importFrom dplyr filter pull
#' @importFrom rlang .data
#'
#' @export
getMultiOmicsFeatures <- function(dbs = c("all"), layer = c("all"),
                                  returnTranscriptome = "SYMBOL",
                                  returnProteome = "SYMBOL",
                                  returnMetabolome = "HMDB",
                                  organism = "hsapiens",
                                  useLocal = TRUE) {
    
    layers <- c("all", "metabolome", "proteome", "transcriptome")
    organism <- tolower(organism)
    
    returnTranscriptome <- toupper(returnTranscriptome)
    returnProteome <- toupper(returnProteome)
    returnMetabolome <- toupper(returnMetabolome)
    
    ## check for the correct transcriptome mapping format
    supportedIDs <- c("SYMBOL", "ENTREZID", "UNIPROT", "ENSEMBL", "REFSEQ")
    if (!returnTranscriptome %in% supportedIDs) {
        stop("Insert one of the following IDs to be returned (returnTranscriptome):
              SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ",
             call. = FALSE
        )
    }
    
    ## check for the correct proteome mapping format
    if (!returnProteome %in% supportedIDs) {
        stop("Insert one of the following IDs to be returned (returnProteome):
              SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ",
             call. = FALSE
        )
    }
    
    ## check for the correct metabolite mapping format
    supportedIDs <- c("HMDB", "ChEBI", "KEGG", "CAS", "DTXCID",
                      "DTXSID", "SID", "CID", "Drugbank")
    if (!returnMetabolome %in% supportedIDs) {
        stop("Insert one of the following IDs to be returned (returnMetabolome):
              HMDB, CAS, ChEBI, KEGG, SID, CID, DTXCID, DTXSID, Drugbank",
            call. = FALSE
        )
    }

    ## check if the given organism is supported
    if (!(organism %in% getOrganisms())) {
        stop("You entered an organism that is not supported! 
              Use getOrganisms() to get a list of all suported organisms.",
            call. = FALSE
        )
    }

    if (sum(tolower(layer) %in% layers) != length(layer)) {
        stop("You entered wrong input for the omics layer specification.
              Options are: all, transcriptome, proteome, metabolome, or a combination thereof.",
            call. = FALSE
        )
    }

    pDBs <- pathwayDatabases()
    dbs0 <- pDBs %>%
        filter(.data$species == organism) %>%
        pull(.data$database)
    databases <- c("all", as.vector(dbs0))

    if (sum(tolower(dbs) %in% databases) != length(dbs)) {
        stop("You entered wrong input for the omics layer specification.
              Options are: all, kegg, pathbank, reactome, biocarta, nci, panther, 
              smpdb, pharmgkb, or a combination thereof.",
            call. = FALSE
        )
    }

    if ("all" %in% dbs) dbs <- as.vector(dbs0)

    if ("all" %in% layer) {
          layer <- c("transcriptome", "proteome", "metabolome")
      }

    pathways <- lapply(dbs, function(x) {
        pathways(organism, x)
    })
    names(pathways) <- dbs

    features <- list()
    if ("transcriptome" %in% layer) {
        features$transcriptome <- getMappedFeatures(
            pathways = pathways,
            organism = organism,
            returnID = returnTranscriptome,
            useLocal = useLocal
        )
        
        ## adapt for duplicated pathways
        names( features$transcriptome) <- rename_duplicates( names( features$transcriptome))
                                                                    
    }

    if ("proteome" %in% layer) {
        if ("transcriptome" %in% layer && returnProteome == returnTranscriptome) {
            features$proteome <- features$transcriptome
        } else {
            features$proteome <- getMappedFeatures(
                pathways = pathways,
                organism = organism,
                returnID = returnProteome,
                useLocal = useLocal
            )
            
            ## adapt for duplicated pathways
            names( features$proteome) <- rename_duplicates( names( features$proteome))
            
        }
    }


    if ("metabolome" %in% layer) {
        features$metabolome <- getMappedFeatures(
            pathways = pathways,
            organism = organism,
            returnID = returnMetabolome,
            which = "metabolites",
            useLocal = useLocal
        )
        
        ## adapt for duplicated pathways
        names( features$metabolome) <- rename_duplicates( names( features$metabolome))
        
    }

    return(features)
    
}



#' Wrapper to get feature mappings.
#'
#' Feature mappings will be used from hard disk in case they have been
#' mapped before and `useLocal` is not set to be FALSE.
#' In other cases, a feature extraction will be done and the results are
#' stored for a following occasion.
#'
#' @param pathways List of pathway definitions.
#' @param returnID String specifying the returned ID format.
#' @param organism String defining the organism of analysis.
#' @param which Mode to extract the features, either \'proteins\' or
#'   \'metabolites\'.
#' @param useLocal Boolean specifying whether or not to use the local
#'              preprocessed mapping.
#'
#' @return List of mapped features for an omics layer.
getMappedFeatures <- function(pathways, returnID = "SYMBOL", organism = "hsapiens", which = "proteins", useLocal = TRUE) {
    
    feat <- unlist(lapply(names(pathways), function(db) {
        ap <- archivePath(paste0(organism, "_", db, "_", returnID))

        if (file.exists(ap) && useLocal) {
            loadLocal(ap)
        } else {
            tmp <- lapply(pathways[[db]], function(p) {
                getFeatures(
                    pathway = p, org = organism,
                    which = which,
                    returntype = returnID
                )
            })

            header <- rep(paste0("(", toupper(db), ") "), length(pathways[[db]]))
            names(tmp) <- paste0(header, names(tmp))
            saveRDS(tmp, file = ap)
            tmp
        }
    }), recursive = FALSE)

    return(feat)
    
}



#' Helper function to map only a subset of metabolite IDs
#'
#' This helper function becomes necessary since there are sometimes multiple ID
#' formats used in a single pathway definition.
#'
#' @param features List of metabolite feature IDs of the pathway.
#' @param keytype String specifying the ID format in pathway definition.
#' @param maptype String specifying the corresponding ID format in multiGSEA.
#' @param returntype String identifying the ID type that should be mapped.
#'
#' @return List of mapped metabolite IDs.
mapIDType <- function(features, keytype = "CHEBI", maptype = "ChEBI", returntype = "HMDB") {
    
    mapped <- c()
    ids <- gsub(paste0(keytype, ":"), "", features[grep(keytype, features)])
    if (returntype != maptype) {
        mapped <- c(mapped, getMetaboliteMapping(
            features = ids,
            keytype = maptype,
            returntype = returntype
        ))
    } else {
        mapped <- c(mapped, ids)
    }

    return(mapped)
    
}



#' Get list of supported organisms
#'
#' Get a list of organisms that are covered in our workflow through a supporting
#' `AnnotationDBi` package. Without such a package we would not be able to map
#' transcript and protein identifier between different formats. All the
#' organisms that are listed here have at lest kegg and or reactome pathway
#' annotation that can be queried by `graphite`.
#'
#' @return List of supported organisms
#'
#' @examples
#' getOrganisms()
#' @export
getOrganisms <- function() {
    
    orglist <- c(
        "hsapiens", "rnorvegicus", "mmusculus", "sscrofa",
        "btaurus", "celegans", "dmelanogaster", "drerio",
        "ggallus", "xlaevis", "cfamiliaris"
    )

    return(orglist)
    
}



#' Get the correct ID mapping database
#'
#' Check by means of the given organism name if the required `AnnotationDbi`
#' package is installed. Select the ID mapping table based on the organism name
#' and return it.
#'
#' @param organism String that defines the organism.
#'
#' @return AnnotationDbi database for ID mapping.
getIDMappingDatabase <- function(organism) {
    
    map <- c(
        hsapiens = "org.Hs.eg.db", rnorvegicus = "org.Rn.eg.db",
        mmusculus = "org.Mm.eg.db", sscrofa = "org.Ss.eg.db",
        btaurus = "org.Bt.eg.db", celegans = "org.Ce.eg.db",
        dmelanogaster = "org.Dm.eg.db", drerio = "org.Dr.eg.db",
        ggallus = "org.Gg.eg.db", xlaevis = "org.Xl.eg.db",
        cfamiliaris = "org.Cf.eg.db"
    )

    stopifnot(organism %in% names(map))
    pkg <- map[[organism]]

    if (!requireNamespace(pkg, quietly = TRUE)) {
        stop(paste0("The necessary package ", pkg, " is not installed."),
            call. = FALSE
        )
    }

    return(get(pkg, envir = getNamespace(pkg)))
    
}



#' Pre-rank features prior to calculating enrichment scores.
#'
#' Rank features based on the direction of their fold change and their magnitude
#' implicated through their assigned p-value.
#'
#' @param logFC Vector containing the log-transformed fold changes of features.
#' @param pvalues Vector containing the p-values associated with those logFCs.
#' @param base Integer specifying the base of the logarithm. Default: 10
#'
#' @return Vector of pre-ranked features, still unsorted
#'
#' @examples
#' logFC <- rnorm(10)
#' pvalues <- runif(10)
#' rankFeatures(logFC, pvalues)
#' @export
rankFeatures <- function(logFC, pvalues, base = 10) {
    
    return(sign(logFC) * -log(pvalues, base = base))
    
}



#' Create an empty data structure for measured omics features
#'
#' This function creates a data structure of nested but empty lists. One list
#' for each omics layer. By default all three supported omics layer are used to
#' create a data structures with three empty sublists: transcriptome, proteome,
#' and metabolome.
#'
#' @param layer List specifying the omics layer which should be created
#'
#' @return List with length(layer) empty sublists
#'
#' @examples
#' initOmicsDataStructure()
#' initOmicsDataStructure(c("transcriptome", "proteome"))
#' initOmicsDataStructure(c("Transcriptome", "Metabolome"))
#' @export
initOmicsDataStructure <- function(layer = c("transcriptome", "proteome", "metabolome")) {
    
    l <- c()
    layer <- tolower(layer)
    if (length(grep("trans*", layer)) > 0) l <- c(l, "transcriptome")
    if (length(grep("prote*", layer)) > 0) l <- c(l, "proteome")
    if (length(grep("metabo*", layer)) > 0) l <- c(l, "metabolome")

    if (length(l) != length(layer)) {
        stop("Not all your omics layer could be mapped to 
              'transcriptome', 'proteome', or 'metabolome'. ",
            call. = TRUE
        )
    }

    empty_structure <- rep(list(list()), length(l))
    names(empty_structure) <- l

    return(empty_structure)

}

Try the multiGSEA package in your browser

Any scripts or data that you put into this service are public.

multiGSEA documentation built on Nov. 8, 2020, 8:15 p.m.