#' Load gene set data
#'
#' @inheritParams WebGestaltR
#' @param isMultiOmics Boolean if loading gene sets for multiomics. Defaults to \code{FALSE}.
#' @return A list of \code{geneSet}, \code{geneSetDes}, \code{geneSetDag}, \code{geneSetNet}, \code{standardId}.
#' \describe{
#' \item{geneSet}{Gene set: A data frame with columns of "geneSet", "description", "genes"}
#' \item{geneSetDes}{Description: A data frame with columns of two columns of gene set ID and description}
#' \item{geneSetDag}{DAG: A edge list data frame of two columns of parent and child. Or a list of data frames if multilple databases are given.}
#' \item{geneSetNet}{Network: A edge list data frame of two columns connecting nodes. Or a list of data frames if multilple databases are given.}
#' \item{standardId}{The standard ID of the gene set}
#' }
#'
#' @importFrom dplyr select distinct filter %>%
#' @importFrom httr modify_url
#' @export
loadGeneSet <- function(organism = "hsapiens", enrichDatabase = NULL, enrichDatabaseFile = NULL, enrichDatabaseType = NULL, enrichDatabaseDescriptionFile = NULL,
cache = NULL, hostName = "https://www.webgestalt.org/", isMultiOmics = FALSE) {
# TODO: multiple custom database ID types?
geneSet <- NULL ## gene sets
geneSetDes <- NULL ## gene set description file
geneSetDag <- list() ## gene set DAG file
geneSetNet <- list() ## gene set network file
standardId <- NULL
if (organism != "others" && !is.null(enrichDatabaseFile) && is.null(enrichDatabaseType)) {
stop("The ID type should be given in enrichDatabaseType for custom GMT files, e.g. genesymbol.")
}
# necessary because loop with length used below. enrichDatabase will skip NULL
if (!is.vector(enrichDatabaseFile)) {
enrichDatabaseFile <- list(enrichDatabaseFile)
}
if (!is.vector(enrichDatabaseDescriptionFile)) {
enrichDatabaseDescriptionFile <- list(enrichDatabaseDescriptionFile)
}
if (length(enrichDatabaseFile) != length(enrichDatabaseDescriptionFile)) {
stop("The number of custom database and its description files should be equal. Use NULL for placeholder.")
}
if (organism != "others") { # supported organism
geneSetInfo <- listGeneSet(organism = organism, hostName = hostName, cache = cache)
# load build-in databases
for (enrichDb in enrichDatabase) {
if (is.null(enrichDb) || enrichDb == "others") {
next
} # just for backward compatibility
if (!enrichDb %in% geneSetInfo$name) {
warning("Database ", enrichDb, " is not supported")
next
}
# get the ID type of the enriched database, such as entrezgene or phosphsiteSeq
thisStandardId <- filter(geneSetInfo, .data$name == enrichDb)[[1, "idType"]]
if (!is.null(standardId) && standardId != thisStandardId && !isMultiOmics) {
stop("Databases have inconsistent ID types. Mixed gene annotation databases with phosphosite databases?")
}
standardId <- thisStandardId
######### Read GMT file from the existing database###########
if (startsWith(hostName, "file://")) {
gmtPath <- removeFileProtocol(file.path(hostName, "geneset", paste0(paste(organism, enrichDb, standardId, sep = "_"), ".gmt")))
thisGeneSet <- readGmt(gmtPath)
thisGeneSet$database <- enrichDb # add a column for database source
geneSet <- rbind(geneSet, thisGeneSet)
} else {
gmtUrl <- modify_url(file.path(hostName, "api", "geneset"), query = list(organism = organism, database = enrichDb, standardId = standardId, fileType = "gmt"))
thisGeneSet <- readGmt(gmtUrl, cache = cache)
thisGeneSet$database <- enrichDb
oriCnt <- nrow(thisGeneSet)
thisGeneSet <- thisGeneSet %>% filter(!(.data$geneSet %in% unique(!!geneSet$geneSet)))
if (nrow(thisGeneSet) < oriCnt) {
warning(paste("Duplicate gene set names in", enrichDb, "have been ignored."))
}
geneSet <- rbind(geneSet, thisGeneSet)
}
######### Read the description file#############
# geneSetDes <- rbind(geneSetDes, .loadGeneSetData(hostName, organism, enrichDb, standardId, "des", cache))
thisGeneSetDes <- .loadGeneSetData(hostName, organism, enrichDb, standardId, "des", cache)
if (!is.null(thisGeneSetDes) && !is.null(geneSetDes)) {
thisGeneSetDes <- thisGeneSetDes %>% filter(!(.data$geneSet %in% unique(!!geneSetDes$geneSet)))
}
geneSetDes <- rbind(geneSetDes, thisGeneSetDes)
########### Try to load the DAG file#################
# assignment considering possible return of NULL
# list[] <- NULL will delete the element
geneSetDag[enrichDb] <- list(.loadGeneSetData(hostName, organism, enrichDb, standardId, "dag", cache))
########### Try to load the network file if the gene sets are generated from the network##########
geneSetNet[enrichDb] <- list(.loadGeneSetData(hostName, organism, enrichDb, standardId, "net", cache))
}
# load local database files
for (i in seq_along(enrichDatabaseFile)) {
enrichDbFile <- enrichDatabaseFile[[i]]
if (is.null(enrichDbFile)) {
next
}
thisGeneSet <- idMapping(organism = organism, dataType = "gmt", inputGeneFile = enrichDbFile, sourceIdType = enrichDatabaseType, targetIdType = NULL, mappingOutput = FALSE, cache = cache, hostName = hostName)
thisStandardId <- thisGeneSet$standardId # should be just enrichDatabaseType here
if (!is.null(standardId) && standardId != thisStandardId && !isMultiOmics) {
stop("Databases have inconsistent ID types. Mixed gene annotation databases with phosphosite databases?")
}
standardId <- thisStandardId
thisGeneSet <- thisGeneSet$mapped %>%
select(.data$geneSet, .data$description, gene = standardId) %>%
distinct()
# load local description files
enrichDbDesFile <- enrichDatabaseDescriptionFile[[i]]
if (!is.null(enrichDbDesFile)) {
thisGeneSetDes <- .loadEnrichDatabaseDescriptionFile(thisGeneSet, enrichDbDesFile)
if (!is.null(thisGeneSetDes) && !is.null(geneSetDes)) {
thisGeneSetDes <- thisGeneSetDes %>% filter(!(.data$geneSet %in% unique(!!geneSetDes$geneSet)))
}
geneSetDes <- rbind(geneSetDes, thisGeneSetDes)
}
fileName <- gsub(".gmt", "", basename(enrichDbFile), fixed = TRUE)
thisGeneSet$database <- fileName
oriCnt <- nrow(thisGeneSet)
thisGeneSet <- thisGeneSet %>% filter(!(.data$geneSet %in% unique(!!geneSet$geneSet)))
if (nrow(thisGeneSet) < oriCnt) {
warning(paste("Duplicate gene set names in", fileName, "have been ignored."))
}
geneSet <- rbind(geneSet, thisGeneSet)
geneSetDag[fileName] <- list(NULL) # correct way to assign NULL to list element
geneSetNet[fileName] <- list(NULL)
}
} else { # custom organisms
for (i in seq_along(enrichDatabaseFile)) {
enrichDbFile <- enrichDatabaseFile[[i]]
if (is.null(enrichDbFile)) {
next
}
thisGeneSet <- readGmt(enrichDbFile)
enrichDbDesFile <- enrichDatabaseDescriptionFile[[i]]
if (!is.null(enrichDbDesFile)) {
thisGeneSetDes <- .loadEnrichDatabaseDescriptionFile(thisGeneSet, enrichDbDesFile)
if (!is.null(thisGeneSetDes) && !is.null(geneSetDes)) {
thisGeneSetDes <- thisGeneSetDes %>% filter(!(.data$geneSet %in% unique(!!geneSetDes$geneSet)))
}
geneSetDes <- rbind(geneSetDes, thisGeneSetDes)
}
fileName <- gsub(".gmt", "", basename(enrichDbFile), fixed = TRUE)
thisGeneSet$database <- fileName
oriCnt <- nrow(thisGeneSet)
thisGeneSet <- thisGeneSet %>% filter(!(.data$geneSet %in% unique(!!geneSet$geneSet)))
if (nrow(thisGeneSet) < oriCnt) {
warning(paste("Duplicate gene set names in", enrichDb, "have been ignored."))
}
geneSet <- rbind(geneSet, thisGeneSet)
geneSetDag[fileName] <- list(NULL)
geneSetNet[fileName] <- list(NULL)
}
}
if (is.null(geneSet)) {
stop(enrichDatabaseError(type = "empty"))
}
# unlist if just one database, for backward compatibility
if (length(geneSetDag) == 1) {
geneSetDag <- geneSetDag[[1]]
}
if (length(geneSetNet) == 1) {
geneSetNet <- geneSetNet[[1]]
}
if (length(unique(geneSet$database)) == 1) {
# remove database column for single source
geneSet$database <- NULL
}
re <- list(geneSet = geneSet, geneSetDes = geneSetDes, geneSetDag = geneSetDag, geneSetNet = geneSetNet, standardId = standardId)
return(re)
}
#' @importFrom httr content
#' @importFrom readr read_tsv
.loadGeneSetData <- function(hostName, organism, database, standardId, fileType, cache = NULL) {
# read gene set files from API or returns NULL
if (startsWith(hostName, "file://")) {
geneSetPath <- removeFileProtocol(file.path(hostName, "geneset", paste(paste(organism, database, standardId, sep = "_"), fileType, sep = ".")))
if (file.exists(geneSetPath)) {
geneSetData <- read_tsv(geneSetPath, col_names = FALSE, col_types = "cc", quote = "")
} else {
geneSetData <- NULL
}
} else {
geneSetUrl <- file.path(hostName, "api", "geneset")
response <- cacheUrl(geneSetUrl, cache = cache, query = list(organism = organism, database = database, standardId = standardId, fileType = fileType))
if (response$status_code == 200) {
geneSetData <- read_tsv(content(response), col_names = FALSE, col_types = "cc", quote = "")
} else {
geneSetData <- NULL
}
}
if (!is.null(geneSetData) && fileType == "des") {
colnames(geneSetData) <- c("geneSet", "description")
geneSetData <- geneSetData %>% distinct(.data$geneSet, .keep_all = TRUE)
}
return(geneSetData)
}
#' @importFrom readr read_tsv
#' @importFrom tools file_ext
.loadEnrichDatabaseDescriptionFile <- function(geneSet, enrichDatabaseDescriptionFile) {
if (file_ext(enrichDatabaseDescriptionFile) != "des") {
warning(descriptionFileError("format"))
return(NULL)
} else {
geneSetDes <- read_tsv(enrichDatabaseDescriptionFile, col_names = c("geneSet", "description"), col_types = "cc", quote = "") %>%
distinct(geneSet, .keep_all = TRUE)
if (ncol(geneSetDes) != 2) {
warning(descriptionFileError("columnNum"))
return(NULL)
} else {
if (length(intersect(unique(geneSet$geneSet), geneSetDes$geneSet)) < 0.6 * length(unique(geneSet[, 1]))) {
warning(descriptionFileError("overlap"))
return(NULL)
} else {
return(geneSetDes)
}
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.