#' @importFrom qs qread
loadRemotePreProcessedData <- function(default, file=NULL, path=NULL) {
link <- file.path("https://compbio.imm.medicina.ulisboa.pt/public/cTRAP",
default)
if (is.null(file)) file <- default
if (!is.null(path)) file <- file.path(path, file)
file <- downloadIfNotFound(link, file)
table <- qread(file)
return(table)
}
#' Load table with drug descriptors
#'
#' @param source Character: source of compounds used to calculate molecular
#' descriptors (\code{NCI60} or \code{CMap})
#' @param type Character: load \code{2D} or \code{3D} molecular descriptors
#' @param file Character: filepath to drug descriptors (automatically downloaded
#' if file does not exist)
#' @inheritParams loadExpressionDrugSensitivityAssociation
#'
#' @family functions for drug set enrichment analysis
#' @return Data table with drug descriptors
#' @export
#'
#' @examples
#' loadDrugDescriptors()
loadDrugDescriptors <- function(source=c("NCI60", "CMap"), type=c("2D", "3D"),
file=NULL, path=NULL) {
source <- match.arg(source)
type <- match.arg(type)
default <- "compound_descriptors_%s_%s.qs"
default <- sprintf(default, source, type)
loadRemotePreProcessedData(default, file, path)
}
# Load drug set prepared from compound descriptors
loadDrugSet <- function(source=c("NCI60", "CMap"), type=c("2D", "3D"),
file=NULL, path=NULL) {
source <- match.arg(source)
type <- match.arg(type)
default <- "drug_set_%s_%s.qs"
default <- sprintf(default, source, type)
loadRemotePreProcessedData(default, file, path)
}
#' Calculate evenly-distributed bins
#'
#' @param numbers Numeric
#' @param maxBins Numeric: maximum number of bins for numeric columns
#' @param k Numeric: constant; the higher the constant, the smaller the bin size
#' (check \code{minpts})
#' @param minPoints Numeric: minimum number of points in a bin (if \code{NULL},
#' the minimum number of points is the number of non-missing values divided by
#' \code{maxBins} divided by \code{k})
#' @inheritDotParams binr::bins -x -target.bins -minpts
#'
#' @importFrom binr bins bins.getvals
#'
#' @return Factor containing the respective group of each element in
#' \code{numbers}
#' @keywords internal
calculateEvenlyDistributedBins <- function(numbers, maxBins=15, k=5,
minPoints=NULL, ..., ids=NULL) {
nas <- is.na(numbers)
if (all(nas)) return(NULL)
numbers <- round(numbers[!nas])
if (max(numbers) - min(numbers) == 0) return(setNames(numbers, ids[!nas]))
if (is.null(minPoints)) {
minPoints <- round( length(numbers) / maxBins / k )
if (minPoints == 0) return(NULL)
}
bin <- bins(numbers, target.bins=maxBins, minpts=minPoints, ...)
# Replace labels of single number intervals
names(bin$binct) <- gsub("\\[([-]?[0-9]*), \\1\\]", "\\1", names(bin$binct))
# Suppress warnings to avoid integer64 overflow warnings
factors <- suppressWarnings(
cut(numbers, bins.getvals(bin), labels=names(bin$binct)))
if (!is.null(ids)) names(factors) <- ids[!nas]
return(factors)
}
#' Prepare drug sets from a table with compound descriptors
#'
#' Create a list of drug sets for each character and numeric column. For each
#' character column, drugs are split across that column's unique values (see
#' argument \code{maxUniqueElems}). For each numeric column, drugs are split
#' across evenly-distributed bins.
#'
#' @param table Data frame: drug descriptors
#' @param id Integer or character: index or name of the identifier column
#' @param maxUniqueElems Numeric: ignore character columns with more unique
#' elements than \code{maxUniqueElems}
#' @inheritParams calculateEvenlyDistributedBins
#'
#' @importFrom pbapply pblapply
#'
#' @family functions for drug set enrichment analysis
#' @return Named list of characters: named drug sets with respective compound
#' identifiers as list elements
#' @export
#'
#' @examples
#' descriptors <- loadDrugDescriptors("NCI60")
#' prepareDrugSets(descriptors)
prepareDrugSets <- function(table, id=1, maxUniqueElems=15, maxBins=15, k=5,
minPoints=NULL) {
# Remove elements with no ID
valid <- !is.na(table[[id]])
table <- table[valid, ]
# Prepare sets from character columns if # unique values <= maxUniqueElems
isCharacter <- sapply(table, is.character)
uniqueElems <- sapply(lapply(table[ , isCharacter, with=FALSE], unique),
length)
sets <- names(uniqueElems[uniqueElems <= maxUniqueElems])
subtable <- table[ , sets, with=FALSE]
res <- lapply(subtable, function(x, ids) split(ids, x, drop=TRUE),
ids=table[[id]])
nonCharacterTable <- table[ , !isCharacter, with=FALSE]
res2 <- pblapply(nonCharacterTable, calculateEvenlyDistributedBins,
ids=table[[id]], maxBins=maxBins, k=k, minPoints=minPoints)
res2 <- Filter(length, res2)
res2 <- lapply(res2, function(x) split(names(x), x, drop=TRUE))
res <- c(res, res2)
symbol <- ": "
names(res) <- paste0(names(res), symbol)
res <- unlist(res, recursive=FALSE)
names(res) <- gsub(paste0(symbol, "."), symbol, names(res), fixed=TRUE)
# Inherit input attributes
attributes(res) <- c(attributes(res),
attributes(table)[c("compoundInfo", "source", "type")])
class(res) <- c("drugSets", class(res))
return(res)
}
#' Prepare stats' compound information
#'
#' @inheritParams matchStatsWithDrugSetsID
#'
#' @return List containing stats' compound info
#' @keywords internal
prepareStatsCompoundInfo <- function(stats) {
statsCompoundInfo <- attr(stats, "compoundInfo")
if (is.vector(stats)) {
statsInfo <- data.table("id"=names(stats), "values"=stats)
statsIDcol <- colnames(statsInfo)[[1]]
} else if (is(stats, "referenceComparison")) {
statsInfo <- as.table(stats, clean=FALSE)
statsIDcol <- colnames(statsInfo)[[1]]
} else if (!is.null(statsCompoundInfo)) {
if (is(stats, "data.table") && !is(statsCompoundInfo, "data.table")) {
statsCompoundInfo <- data.table(statsCompoundInfo)
}
statsIDcol <- colnames(stats)[[1]]
stats[[statsIDcol]] <- as.character(stats[[statsIDcol]])
statsCompoundInfo[["id"]] <- as.character(statsCompoundInfo[["id"]])
statsInfo <- merge(stats, statsCompoundInfo, by.x=statsIDcol, by.y="id",
all.x=TRUE)
} else {
msg <- paste(
"Argument 'stats' needs to be a vector or a 'referenceComparison'",
"object or have an attribute called 'compoundInfo'")
stop(msg)
}
return(list("statsInfo"=statsInfo, "statsIDcol"=statsIDcol))
}
#' Get drug sets' compound info
#'
#' @inheritParams matchStatsWithDrugSetsID
#'
#' @return List containing drug sets' compound info
#' @keywords internal
prepareSetsCompoundInfo <- function(sets) {
setsCompoundInfo <- attr(sets, "compoundInfo")
setsIDcol <- "id"
if (is.null(setsCompoundInfo)) {
setsCompoundInfo <- data.table("id"=unique(unlist(sets)))
} else if (!setsIDcol %in% colnames(setsCompoundInfo)) {
setsIDcol <- colnames(setsCompoundInfo)[[1]]
}
return(list("setsCompoundInfo"=setsCompoundInfo, "setsIDcol"=setsIDcol))
}
#' Match identifiers between data and drug sets
#'
#' @inheritParams analyseDrugSetEnrichment
#' @importFrom data.table data.table
#' @keywords internal
#'
#' @return Statistic values from input data and corresponding identifiers as
#' names (if no match is found, the original identifier from argument
#' \code{stats} is used)
matchStatsWithDrugSetsID <- function(sets, stats, col="values",
keyColSets=NULL, keyColStats=NULL) {
res <- prepareStatsCompoundInfo(stats)
statsInfo <- res$statsInfo
statsIDcol <- res$statsIDcol
res <- prepareSetsCompoundInfo(sets)
setsCompoundInfo <- res$setsCompoundInfo
setsIDcol <- res$setsIDcol
checkIfIDwasReplacedAfterMerging <- function(statsIDcol, data) {
keys <- attr(data, "keys")
if (keys$key1 == statsIDcol) statsIDcol <- keys$key2
return(statsIDcol)
}
# Return statistical values with corresponding identifier (or original
# identifier if no match is found)
df <- mergeDatasets(data1=statsInfo, key1=keyColStats,
data2=setsCompoundInfo, key2=keyColSets,
all.y=TRUE, removeKey2ColNAs=TRUE)
if (!setsIDcol %in% colnames(df)) setsIDcol <- paste0(setsIDcol, ".1")
res <- setNames(df[[col]], df[[setsIDcol]])
if (!statsIDcol %in% colnames(df)) statsIDcol <- paste0(statsIDcol, ".2")
statsIDcol <- checkIfIDwasReplacedAfterMerging(statsIDcol, df)
names(res)[is.na(names(res))] <- df[[statsIDcol]][is.na(names(res))]
return(res)
}
processStats <- function(stats, col) {
isRank <- endsWith(col, "_rank")
stats <- sort(stats, decreasing=!isRank)
stats <- stats[unique(names(stats))]
stats <- ifelse(!isRank, `+`, `-`)(stats)
return(stats)
}
prepareStatsCol <- function(col, stats) {
if (is.vector(stats)) {
col <- "values"
} else if (is.null(col)) {
cols <- paste0(c("rankProduct", "spearman", "pearson", "GSEA"), "_rank")
col <- cols[cols %in% colnames(stats)][[1]]
if (!col %in% colnames(stats)) {
stop("no suitable column to analyse; explicitly set argument 'col'")
} else {
msg <- sprintf(
paste("Ordering results by column '%s'; to manually select",
"column to order by, please set argument 'col'"), col)
message(msg)
}
} else if (!col %in% colnames(stats)) {
stop(sprintf("specified column '%s' not found", col))
}
return(col)
}
#' Analyse drug set enrichment
#'
#' @param stats Named numeric vector or either a \code{similarPerturbations} or
#' a \code{targetingDrugs} object (obtained after running
#' \code{\link{rankSimilarPerturbations}} or
#' \code{\link{predictTargetingDrugs}}, respectively)
#' @param sets Named list of characters: named sets containing compound
#' identifiers (obtain drug sets by running \code{prepareDrugSets()})
#' @param col Character: name of the column to use for statistics (only required
#' if class of \code{stats} is either \code{similarPerturbations} or
#' \code{targetingDrugs})
#' @inheritParams fgsea::fgseaSimple
#' @inheritDotParams fgsea::fgseaSimple -pathways -stats -nperm -maxSize
#' @param keyColSets Character: column from \code{sets} to compare with column
#' \code{keyColStats} from \code{stats}; automatically selected if \code{NULL}
#' @param keyColStats Character: column from \code{stats} to compare with column
#' \code{keyColSets} from \code{sets}; automatically selected if \code{NULL}
#'
#' @importFrom fgsea fgsea
#'
#' @family functions for drug set enrichment analysis
#' @return Enrichment analysis based on GSEA
#' @export
#'
#' @examples
#' descriptors <- loadDrugDescriptors()
#' drugSets <- prepareDrugSets(descriptors)
#'
#' # Analyse drug set enrichment in ranked targeting drugs for a differential
#' # expression profile
#' data("diffExprStat")
#' gdsc <- loadExpressionDrugSensitivityAssociation("GDSC")
#' predicted <- predictTargetingDrugs(diffExprStat, gdsc)
#'
#' analyseDrugSetEnrichment(drugSets, predicted)
analyseDrugSetEnrichment <- function(sets, stats, col=NULL, nperm=10000,
maxSize=500, ..., keyColSets=NULL,
keyColStats=NULL) {
message("Matching compounds with those available in drug sets...")
col <- prepareStatsCol(col, stats)
stats <- matchStatsWithDrugSetsID(
sets=sets, stats=stats, col=col,
keyColSets=keyColSets, keyColStats=keyColStats)
stats <- processStats(stats, col)
message("Performing enrichment analysis...")
gseaRes <- suppressWarnings(
fgsea(sets, stats, nperm=nperm, maxSize=maxSize, ...))
gseaRes <- gseaRes[order(gseaRes$padj), ]
colnames(gseaRes)[[1]] <- "descriptor"
return(gseaRes)
}
#' Plot drug set enrichment
#'
#' @inheritParams analyseDrugSetEnrichment
#' @param selectedSets Character: drug sets to plot (if \code{NULL}, plot all)
#'
#' @family functions for drug set enrichment analysis
#' @return List of GSEA plots per drug set
#' @export
#'
#' @importFrom pbapply pblapply
#'
#' @examples
#' descriptors <- loadDrugDescriptors()
#' drugSets <- prepareDrugSets(descriptors)
#'
#' # Analyse drug set enrichment in ranked targeting drugs for a differential
#' # expression profile
#' data("diffExprStat")
#' gdsc <- loadExpressionDrugSensitivityAssociation("GDSC")
#' predicted <- predictTargetingDrugs(diffExprStat, gdsc)
#'
#' plotDrugSetEnrichment(drugSets, predicted)
plotDrugSetEnrichment <- function(sets, stats, col="rankProduct_rank",
selectedSets=NULL, keyColSets=NULL,
keyColStats=NULL) {
message("Matching compounds with those available in drug sets...")
col <- prepareStatsCol(col, stats)
stats <- matchStatsWithDrugSetsID(
sets=sets, stats=stats, col=col,
keyColSets=keyColSets, keyColStats=keyColStats)
stats <- processStats(stats, col)
message("Plotting enrichment analysis...")
plotGSEAperSet <- function(k, sets, stats, col) {
plot <- plotGSEA(stats, sets[[k]], title=names(sets)[[k]])
return(plot)
}
if (!is.null(selectedSets)) sets <- sets[selectedSets]
plots <- pblapply(seq(sets), plotGSEAperSet, sets, stats, col)
names(plots) <- names(sets)
return(plots)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.