R/loadGeneSet.R
In WebGestaltR: Gene Set Analysis Toolkit WebGestaltR

Documented in loadGeneSet

#' Load gene set data
#'
#' @inheritParams WebGestaltR
#'
#' @return A list of \code{geneSet}, \code{geneSetDes}, \code{geneSetDag}, \code{geneSetNet}, \code{standardId}.
#' \describe{
#'  \item{geneSet}{Gene set: A data frame with columns of "geneSet", "description", "genes"}
#'  \item{geneSetDes}{Description: A data frame with columns of two columns of gene set ID and description}
#'  \item{geneSetDag}{DAG: A edge list data frame of two columns of parent and child. Or a list of data frames if multilple databases are given.}
#'  \item{geneSetNet}{Network: A edge list data frame of two columns connecting nodes. Or a list of data frames if multilple databases are given.}
#'  \item{standardId}{The standard ID of the gene set}
#' }
#'
#' @importFrom dplyr select distinct filter %>%
#' @importFrom httr modify_url
#' @export
loadGeneSet <- function(organism="hsapiens", enrichDatabase=NULL, enrichDatabaseFile=NULL, enrichDatabaseType=NULL, enrichDatabaseDescriptionFile=NULL, cache=NULL, hostName="https://www.webgestalt.org/") {
	# TODO: multiple custom database ID types?
	geneSet <- NULL    ##gene sets
	geneSetDes <- NULL ##gene set description file
	geneSetDag <- list() ##gene set DAG file
	geneSetNet <- list() ##gene set network file
	standardId <- NULL

	if (organism != "others" && !is.null(enrichDatabaseFile) && is.null(enrichDatabaseType)) {
		stop("The ID type should be given in enrichDatabaseType for custom GMT files, e.g. genesymbol.")
	}
	# necessary because loop with length used below. enrichDatabase will skip NULL
	if (!is.vector(enrichDatabaseFile)) {
		enrichDatabaseFile = list(enrichDatabaseFile)
	}
	if (!is.vector(enrichDatabaseDescriptionFile)) {
		enrichDatabaseDescriptionFile = list(enrichDatabaseDescriptionFile)
	}
	if (length(enrichDatabaseFile) != length(enrichDatabaseDescriptionFile)) {
		stop("The number of custom database and its description files should be equal. Use NULL for placeholder.")
	}
	if (organism != "others") {  # supported organism
		geneSetInfo <- listGeneSet(organism=organism, hostName=hostName, cache=cache)
		#  load build-in databases
		for (enrichDb in enrichDatabase) {
			if (is.null(enrichDb) || enrichDb == "others") { next }  # just for backward compatibility
			if (!enrichDb %in% geneSetInfo$name) {
				warning("Database ", enrichDb, " is not supported")
				next
			}
			# get the ID type of the enriched database, such as entrezgene or phosphsiteSeq
			thisStandardId <- filter(geneSetInfo, .data$name==enrichDb)[[1, "idType"]]
			if (!is.null(standardId) && standardId != thisStandardId) {
				stop("Databases have inconsistent ID types. Mixed gene annotation databases with phosphosite databases?")
			}
			standardId <- thisStandardId

			#########Read GMT file from the existing database###########
			if (startsWith(hostName, "file://")) {
				gmtPath <- removeFileProtocol(file.path(hostName, "geneset", paste0(paste(organism, enrichDb, standardId, sep="_"), ".gmt")))
				thisGeneSet <- readGmt(gmtPath)
				thisGeneSet$database <- enrichDb # add a column for database source
				geneSet <- rbind(geneSet, thisGeneSet)
			} else {
				gmtUrl <- modify_url(file.path(hostName, "api", "geneset"), query=list(organism=organism, database=enrichDb, standardId=standardId, fileType="gmt"))
				thisGeneSet <-  readGmt(gmtUrl, cache=cache)
				thisGeneSet$database <- enrichDb
				oriCnt <- nrow(thisGeneSet)
				thisGeneSet <- thisGeneSet %>% filter(!(.data$geneSet %in% unique(!!geneSet$geneSet)))
				if (nrow(thisGeneSet) < oriCnt) {
				  warning(paste("Duplicate gene set names in", enrichDb, "have been ignored."))
				}
				geneSet <- rbind(geneSet, thisGeneSet)
			}

			#########Read the description file#############
			#geneSetDes <- rbind(geneSetDes, .loadGeneSetData(hostName, organism, enrichDb, standardId, "des", cache))
			thisGeneSetDes <- .loadGeneSetData(hostName, organism, enrichDb, standardId, "des", cache)
			if (!is.null(thisGeneSetDes) && !is.null(geneSetDes)) {
			  thisGeneSetDes <- thisGeneSetDes %>% filter(!(.data$geneSet %in% unique(!!geneSetDes$geneSet)))
			}
			geneSetDes <- rbind(geneSetDes, thisGeneSetDes)

			###########Try to load the DAG file#################
			# assignment considering possible return of NULL
			# list[] <- NULL will delete the element
			geneSetDag[enrichDb] <- list(.loadGeneSetData(hostName, organism, enrichDb, standardId, "dag", cache))

			###########Try to load the network file if the gene sets are generated from the network##########
			geneSetNet[enrichDb] <- list(.loadGeneSetData(hostName, organism, enrichDb, standardId, "net", cache))
		}

		# load local database files
		for (i in 1:length(enrichDatabaseFile)) {
			enrichDbFile <- enrichDatabaseFile[[i]]
			if (is.null(enrichDbFile)) { next }

			thisGeneSet <- idMapping(organism=organism, dataType="gmt", inputGeneFile=enrichDbFile, sourceIdType=enrichDatabaseType, targetIdType=NULL, mappingOutput=FALSE, cache=cache, hostName=hostName)
			thisStandardId <- thisGeneSet$standardId  # should be just enrichDatabaseType here
			if (!is.null(standardId) && standardId != thisStandardId) {
				stop("Databases have inconsistent ID types. Mixed gene annotation databases with phosphosite databases?")
			}
			standardId <- thisStandardId

			thisGeneSet <- thisGeneSet$mapped %>% select(.data$geneSet, .data$description, gene=standardId) %>% distinct()

			# load local description files
			enrichDbDesFile <- enrichDatabaseDescriptionFile[[i]]
			if (!is.null(enrichDbDesFile)) {
				thisGeneSetDes <- .loadEnrichDatabaseDescriptionFile(thisGeneSet, enrichDbDesFile)
				if (!is.null(thisGeneSetDes) && !is.null(geneSetDes)) {
				  thisGeneSetDes <- thisGeneSetDes %>% filter(!(.data$geneSet %in% unique(!!geneSetDes$geneSet)))
				}
				geneSetDes <- rbind(geneSetDes, thisGeneSetDes)
			}

			fileName <- gsub(".gmt", "", basename(enrichDbFile), fixed=TRUE)
			thisGeneSet$database <- fileName
			oriCnt <- nrow(thisGeneSet)
			thisGeneSet <- thisGeneSet %>% filter(!(.data$geneSet %in% unique(!!geneSet$geneSet)))
			if (nrow(thisGeneSet) < oriCnt) {
			  warning(paste("Duplicate gene set names in", fileName, "have been ignored."))
			}
			geneSet <- rbind(geneSet, thisGeneSet)

			geneSetDag[fileName] <- list(NULL) # correct way to assign NULL to list element
			geneSetNet[fileName] <- list(NULL)
		}
	} else { # custom organisms
		for (i in 1:length(enrichDatabaseFile)) {
			enrichDbFile <- enrichDatabaseFile[[i]]
			if (is.null(enrichDbFile)) { next }
			thisGeneSet <- readGmt(enrichDbFile)

			enrichDbDesFile <- enrichDatabaseDescriptionFile[[i]]
			if (!is.null(enrichDbDesFile)) {
				thisGeneSetDes <- .loadEnrichDatabaseDescriptionFile(thisGeneSet, enrichDbDesFile)
				if (!is.null(thisGeneSetDes) && !is.null(geneSetDes)) {
				  thisGeneSetDes <- thisGeneSetDes %>% filter(!(.data$geneSet %in% unique(!!geneSetDes$geneSet)))
				}
				geneSetDes <- rbind(geneSetDes, thisGeneSetDes)
			}
			fileName <- gsub(".gmt", "", basename(enrichDbFile), fixed=TRUE)
			thisGeneSet$database <- fileName
			oriCnt <- nrow(thisGeneSet)
			thisGeneSet <- thisGeneSet %>% filter(!(.data$geneSet %in% unique(!!geneSet$geneSet)))
			if (nrow(thisGeneSet) < oriCnt) {
				warning(paste("Duplicate gene set names in", enrichDb, "have been ignored."))
			}
			geneSet <- rbind(geneSet, thisGeneSet)

			geneSetDag[fileName] <- list(NULL)
			geneSetNet[fileName] <- list(NULL)
		}
	}
	if (is.null(geneSet)) { stop(enrichDatabaseError(type="empty")) }

	# unlist if just one database, for backward compatibility
	if (length(geneSetDag) == 1) { geneSetDag <- geneSetDag[[1]] }
	if (length(geneSetNet) == 1) { geneSetNet <- geneSetNet[[1]] }

	if (length(unique(geneSet$database)) == 1) {
		# remove database column for single source
		geneSet$database<- NULL
	}
	re <- list(geneSet=geneSet, geneSetDes=geneSetDes, geneSetDag=geneSetDag, geneSetNet=geneSetNet,standardId=standardId)
	return(re)
}

#' @importFrom httr content
#' @importFrom readr read_tsv
.loadGeneSetData <- function(hostName, organism, database, standardId, fileType, cache=NULL) {
	# read gene set files from API or returns NULL
	if (startsWith(hostName, "file://")) {
		geneSetPath <- removeFileProtocol(file.path(hostName, "geneset", paste(paste(organism, database, standardId, sep="_"), fileType, sep=".")))
		if (file.exists(geneSetPath)) {
			geneSetData <- read_tsv(geneSetPath, col_names=FALSE, col_types="cc", quote="")
		} else {
			geneSetData <- NULL
		}
	} else {
		geneSetUrl <- file.path(hostName,"api","geneset")
		response <- cacheUrl(geneSetUrl, cache=cache, query=list(organism=organism, database=database, standardId=standardId, fileType=fileType))
		if (response$status_code == 200) {
			geneSetData <- read_tsv(content(response), col_names=FALSE, col_types="cc", quote="")
		} else {
			geneSetData <- NULL
		}
	}
	if (!is.null(geneSetData) && fileType == "des") {
		colnames(geneSetData) <- c("geneSet", "description")
		geneSetData <- geneSetData %>% distinct(.data$geneSet, .keep_all=TRUE)
	}
	return(geneSetData)
}

#' @importFrom readr read_tsv
#' @importFrom tools file_ext
.loadEnrichDatabaseDescriptionFile <- function(geneSet, enrichDatabaseDescriptionFile) {
	if (file_ext(enrichDatabaseDescriptionFile) != "des") {
		warning(descriptionFileError("format"))
		return(NULL)
	} else {
		geneSetDes <- read_tsv(enrichDatabaseDescriptionFile, col_names=c("geneSet", "description"), col_types="cc", quote="") %>%
			distinct(geneSet, .keep_all=TRUE)
		if (ncol(geneSetDes)!=2) {
			warning(descriptionFileError("columnNum"))
			return(NULL)
		} else {
			if (length(intersect(unique(geneSet$geneSet), geneSetDes$geneSet)) < 0.6 * length(unique(geneSet[,1]))) {
				warning(descriptionFileError("overlap"))
				return(NULL)
			} else {
				return(geneSetDes)
			}
		}
	}
}