cTRAP: Identification of candidate causal perturbations from differential gene expression data

Documented in analyseDrugSetEnrichment calculateEvenlyDistributedBins loadDrugDescriptors matchStatsWithDrugSetsID plotDrugSetEnrichment prepareDrugSets prepareSetsCompoundInfo prepareStatsCompoundInfo

#' @importFrom qs qread
loadRemotePreProcessedData <- function(default, file=NULL, path=NULL) {
    link <- file.path("https://compbio.imm.medicina.ulisboa.pt/public/cTRAP",
                      default)
    if (is.null(file)) file <- default
    if (!is.null(path)) file <- file.path(path, file)
    file  <- downloadIfNotFound(link, file)
    table <- qread(file)
    return(table)
}

#' Load table with drug descriptors
#'
#' @param source Character: source of compounds used to calculate molecular
#'   descriptors (\code{NCI60} or \code{CMap})
#' @param type Character: load \code{2D} or \code{3D} molecular descriptors
#' @param file Character: filepath to drug descriptors (automatically downloaded
#'   if file does not exist)
#' @inheritParams loadExpressionDrugSensitivityAssociation
#'
#' @family functions for drug set enrichment analysis
#' @return Data table with drug descriptors
#' @export
#'
#' @examples
#' loadDrugDescriptors()
loadDrugDescriptors <- function(source=c("NCI60", "CMap"), type=c("2D", "3D"),
                                file=NULL, path=NULL) {
    source  <- match.arg(source)
    type    <- match.arg(type)
    default <- "compound_descriptors_%s_%s.qs"
    default <- sprintf(default, source, type)
    
    loadRemotePreProcessedData(default, file, path)
}

# Load drug set prepared from compound descriptors
loadDrugSet <- function(source=c("NCI60", "CMap"), type=c("2D", "3D"),
                        file=NULL, path=NULL) {
    source  <- match.arg(source)
    type    <- match.arg(type)
    default <- "drug_set_%s_%s.qs"
    default <- sprintf(default, source, type)
    
    loadRemotePreProcessedData(default, file, path)
}

#' Calculate evenly-distributed bins
#'
#' @param numbers Numeric
#' @param maxBins Numeric: maximum number of bins for numeric columns
#' @param k Numeric: constant; the higher the constant, the smaller the bin size
#'   (check \code{minpts})
#' @param minPoints Numeric: minimum number of points in a bin (if \code{NULL},
#'   the minimum number of points is the number of non-missing values divided by
#'   \code{maxBins} divided by \code{k})
#' @inheritDotParams binr::bins -x -target.bins -minpts
#'
#' @importFrom binr bins bins.getvals
#'
#' @return Factor containing the respective group of each element in
#'   \code{numbers}
#' @keywords internal
calculateEvenlyDistributedBins <- function(numbers, maxBins=15, k=5,
                                           minPoints=NULL, ..., ids=NULL) {
    nas     <- is.na(numbers)
    if (all(nas)) return(NULL)

    numbers <- round(numbers[!nas])
    if (max(numbers) - min(numbers) == 0) return(setNames(numbers, ids[!nas]))
    if (is.null(minPoints)) {
        minPoints <- round( length(numbers) / maxBins / k )
        if (minPoints == 0) return(NULL)
    }
    bin <- bins(numbers, target.bins=maxBins, minpts=minPoints, ...)

    # Replace labels of single number intervals
    names(bin$binct) <- gsub("\\[([-]?[0-9]*), \\1\\]", "\\1", names(bin$binct))
    # Suppress warnings to avoid integer64 overflow warnings
    factors <- suppressWarnings(
        cut(numbers, bins.getvals(bin), labels=names(bin$binct)))
    if (!is.null(ids)) names(factors) <- ids[!nas]
    return(factors)
}

#' Prepare drug sets from a table with compound descriptors
#'
#' Create a list of drug sets for each character and numeric column. For each
#' character column, drugs are split across that column's unique values (see
#' argument \code{maxUniqueElems}). For each numeric column, drugs are split
#' across evenly-distributed bins.
#'
#' @param table Data frame: drug descriptors
#' @param id Integer or character: index or name of the identifier column
#' @param maxUniqueElems Numeric: ignore character columns with more unique
#'   elements than \code{maxUniqueElems}
#' @inheritParams calculateEvenlyDistributedBins
#'
#' @importFrom pbapply pblapply
#'
#' @family functions for drug set enrichment analysis
#' @return Named list of characters: named drug sets with respective compound
#'   identifiers as list elements
#' @export
#'
#' @examples
#' descriptors <- loadDrugDescriptors("NCI60")
#' prepareDrugSets(descriptors)
prepareDrugSets <- function(table, id=1, maxUniqueElems=15, maxBins=15, k=5,
                            minPoints=NULL) {
    # Remove elements with no ID
    valid <- !is.na(table[[id]])
    table <- table[valid, ]

    # Prepare sets from character columns if # unique values <= maxUniqueElems
    isCharacter <- sapply(table, is.character)
    uniqueElems <- sapply(lapply(table[ , isCharacter, with=FALSE], unique),
                          length)
    sets <- names(uniqueElems[uniqueElems <= maxUniqueElems])
    subtable <- table[ , sets, with=FALSE]

    res <- lapply(subtable, function(x, ids) split(ids, x, drop=TRUE),
                  ids=table[[id]])

    nonCharacterTable <- table[ , !isCharacter, with=FALSE]
    res2 <- pblapply(nonCharacterTable, calculateEvenlyDistributedBins,
                     ids=table[[id]], maxBins=maxBins, k=k, minPoints=minPoints)
    res2 <- Filter(length, res2)
    res2 <- lapply(res2, function(x) split(names(x), x, drop=TRUE))

    res <- c(res, res2)
    symbol <- ": "
    names(res) <- paste0(names(res), symbol)
    res <- unlist(res, recursive=FALSE)
    names(res) <- gsub(paste0(symbol, "."), symbol, names(res), fixed=TRUE)

    # Inherit input attributes
    attributes(res) <- c(attributes(res),
                         attributes(table)[c("compoundInfo", "source", "type")])
    class(res) <- c("drugSets", class(res))
    return(res)
}

#' Prepare stats' compound information
#'
#' @inheritParams matchStatsWithDrugSetsID
#'
#' @return List containing stats' compound info
#' @keywords internal
prepareStatsCompoundInfo <- function(stats) {
    statsCompoundInfo <- attr(stats, "compoundInfo")
    if (is.vector(stats)) {
        statsInfo  <- data.table("id"=names(stats), "values"=stats)
        statsIDcol <- colnames(statsInfo)[[1]]
    } else if (is(stats, "referenceComparison")) {
        statsInfo  <- as.table(stats, clean=FALSE)
        statsIDcol <- colnames(statsInfo)[[1]]
    } else if (!is.null(statsCompoundInfo)) {
        if (is(stats, "data.table") && !is(statsCompoundInfo, "data.table")) {
            statsCompoundInfo <- data.table(statsCompoundInfo)
        }
        statsIDcol                <- colnames(stats)[[1]]
        stats[[statsIDcol]]       <- as.character(stats[[statsIDcol]])
        statsCompoundInfo[["id"]] <- as.character(statsCompoundInfo[["id"]])
        statsInfo <- merge(stats, statsCompoundInfo, by.x=statsIDcol, by.y="id",
                           all.x=TRUE)
    } else {
        msg <- paste(
            "Argument 'stats' needs to be a vector or a 'referenceComparison'",
            "object or have an attribute called 'compoundInfo'")
        stop(msg)
    }
    return(list("statsInfo"=statsInfo, "statsIDcol"=statsIDcol))
}

#' Get drug sets' compound info
#'
#' @inheritParams matchStatsWithDrugSetsID
#'
#' @return List containing drug sets' compound info
#' @keywords internal
prepareSetsCompoundInfo <- function(sets) {
    setsCompoundInfo <- attr(sets, "compoundInfo")
    setsIDcol <- "id"
    if (is.null(setsCompoundInfo)) {
        setsCompoundInfo <- data.table("id"=unique(unlist(sets)))
    } else if (!setsIDcol %in% colnames(setsCompoundInfo)) {
        setsIDcol <- colnames(setsCompoundInfo)[[1]]
    }
    return(list("setsCompoundInfo"=setsCompoundInfo, "setsIDcol"=setsIDcol))
}

#' Match identifiers between data and drug sets
#'
#' @inheritParams analyseDrugSetEnrichment
#' @importFrom data.table data.table
#' @keywords internal
#'
#' @return Statistic values from input data and corresponding identifiers as
#'   names (if no match is found, the original identifier from argument
#'   \code{stats} is used)
matchStatsWithDrugSetsID <- function(sets, stats, col="values",
                                     keyColSets=NULL, keyColStats=NULL) {
    res        <- prepareStatsCompoundInfo(stats)
    statsInfo  <- res$statsInfo
    statsIDcol <- res$statsIDcol

    res              <- prepareSetsCompoundInfo(sets)
    setsCompoundInfo <- res$setsCompoundInfo
    setsIDcol        <- res$setsIDcol

    checkIfIDwasReplacedAfterMerging <- function(statsIDcol, data) {
        keys <- attr(data, "keys")
        if (keys$key1 == statsIDcol) statsIDcol <- keys$key2
        return(statsIDcol)
    }

    # Return statistical values with corresponding identifier (or original
    # identifier if no match is found)
    df  <- mergeDatasets(data1=statsInfo, key1=keyColStats,
                         data2=setsCompoundInfo, key2=keyColSets,
                         all.y=TRUE, removeKey2ColNAs=TRUE)

    if (!setsIDcol %in% colnames(df))  setsIDcol <- paste0(setsIDcol, ".1")
    res <- setNames(df[[col]], df[[setsIDcol]])
    
    if (!statsIDcol %in% colnames(df)) statsIDcol <- paste0(statsIDcol, ".2")
    statsIDcol <- checkIfIDwasReplacedAfterMerging(statsIDcol, df)
    names(res)[is.na(names(res))] <- df[[statsIDcol]][is.na(names(res))]
    return(res)
}

processStats <- function(stats, col) {
    isRank <- endsWith(col, "_rank")
    stats  <- sort(stats, decreasing=!isRank)
    stats  <- stats[unique(names(stats))]
    stats  <- ifelse(!isRank, `+`, `-`)(stats)
    return(stats)
}

prepareStatsCol <- function(col, stats) {
    if (is.vector(stats)) {
        col <- "values"
    } else if (is.null(col)) {
        cols <- paste0(c("rankProduct", "spearman", "pearson", "GSEA"), "_rank")
        col  <- cols[cols %in% colnames(stats)][[1]]
        if (!col %in% colnames(stats)) {
            stop("no suitable column to analyse; explicitly set argument 'col'")
        } else {
            msg <- sprintf(
                paste("Ordering results by column '%s'; to manually select",
                      "column to order by, please set argument 'col'"), col)
            message(msg)
        }
    } else if (!col %in% colnames(stats)) {
        stop(sprintf("specified column '%s' not found", col))
    }
    return(col)
}

#' Analyse drug set enrichment
#'
#' @param stats Named numeric vector or either a \code{similarPerturbations} or
#'   a \code{targetingDrugs} object (obtained after running
#'   \code{\link{rankSimilarPerturbations}} or
#'   \code{\link{predictTargetingDrugs}}, respectively)
#' @param sets Named list of characters: named sets containing compound
#'   identifiers (obtain drug sets by running \code{prepareDrugSets()})
#' @param col Character: name of the column to use for statistics (only required
#'   if class of \code{stats} is either \code{similarPerturbations} or
#'   \code{targetingDrugs})
#' @inheritParams fgsea::fgseaSimple
#' @inheritDotParams fgsea::fgseaSimple -pathways -stats -nperm -maxSize
#' @param keyColSets Character: column from \code{sets} to compare with column
#'   \code{keyColStats} from \code{stats}; automatically selected if \code{NULL}
#' @param keyColStats Character: column from \code{stats} to compare with column
#'   \code{keyColSets} from \code{sets}; automatically selected if \code{NULL}
#'
#' @importFrom fgsea fgsea
#'
#' @family functions for drug set enrichment analysis
#' @return Enrichment analysis based on GSEA
#' @export
#'
#' @examples
#' descriptors <- loadDrugDescriptors()
#' drugSets <- prepareDrugSets(descriptors)
#'
#' # Analyse drug set enrichment in ranked targeting drugs for a differential
#' # expression profile
#' data("diffExprStat")
#' gdsc      <- loadExpressionDrugSensitivityAssociation("GDSC")
#' predicted <- predictTargetingDrugs(diffExprStat, gdsc)
#'
#' analyseDrugSetEnrichment(drugSets, predicted)
analyseDrugSetEnrichment <- function(sets, stats, col=NULL, nperm=10000,
                                     maxSize=500, ..., keyColSets=NULL,
                                     keyColStats=NULL) {
    message("Matching compounds with those available in drug sets...")
    col   <- prepareStatsCol(col, stats)
    stats <- matchStatsWithDrugSetsID(
        sets=sets, stats=stats, col=col,
        keyColSets=keyColSets, keyColStats=keyColStats)
    stats <- processStats(stats, col)

    message("Performing enrichment analysis...")
    gseaRes <- suppressWarnings(
        fgsea(sets, stats, nperm=nperm, maxSize=maxSize, ...))
    gseaRes <- gseaRes[order(gseaRes$padj), ]
    colnames(gseaRes)[[1]] <- "descriptor"
    return(gseaRes)
}

#' Plot drug set enrichment
#'
#' @inheritParams analyseDrugSetEnrichment
#' @param selectedSets Character: drug sets to plot (if \code{NULL}, plot all)
#'
#' @family functions for drug set enrichment analysis
#' @return List of GSEA plots per drug set
#' @export
#'
#' @importFrom pbapply pblapply
#'
#' @examples
#' descriptors <- loadDrugDescriptors()
#' drugSets <- prepareDrugSets(descriptors)
#'
#' # Analyse drug set enrichment in ranked targeting drugs for a differential
#' # expression profile
#' data("diffExprStat")
#' gdsc      <- loadExpressionDrugSensitivityAssociation("GDSC")
#' predicted <- predictTargetingDrugs(diffExprStat, gdsc)
#'
#' plotDrugSetEnrichment(drugSets, predicted)
plotDrugSetEnrichment <- function(sets, stats, col="rankProduct_rank",
                                  selectedSets=NULL, keyColSets=NULL,
                                  keyColStats=NULL) {
    message("Matching compounds with those available in drug sets...")
    col   <- prepareStatsCol(col, stats)
    stats <- matchStatsWithDrugSetsID(
        sets=sets, stats=stats, col=col,
        keyColSets=keyColSets, keyColStats=keyColStats)
    stats <- processStats(stats, col)

    message("Plotting enrichment analysis...")
    plotGSEAperSet <- function(k, sets, stats, col) {
        plot <- plotGSEA(stats, sets[[k]], title=names(sets)[[k]])
        return(plot)
    }
    if (!is.null(selectedSets)) sets <- sets[selectedSets]
    plots <- pblapply(seq(sets), plotGSEAperSet, sets, stats, col)
    names(plots) <- names(sets)
    return(plots)
}

nuno-agostinho/cTRAP documentation built on Jan. 2, 2025, 12:11 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

nuno-agostinho/cTRAP
Identification of candidate causal perturbations from differential gene expression data

R/drugSetEnrichment.R
In nuno-agostinho/cTRAP: Identification of candidate causal perturbations from differential gene expression data

Defines functions plotDrugSetEnrichment analyseDrugSetEnrichment prepareStatsCol processStats matchStatsWithDrugSetsID prepareSetsCompoundInfo prepareStatsCompoundInfo prepareDrugSets calculateEvenlyDistributedBins loadDrugSet loadDrugDescriptors loadRemotePreProcessedData

Documented in analyseDrugSetEnrichment calculateEvenlyDistributedBins loadDrugDescriptors matchStatsWithDrugSetsID plotDrugSetEnrichment prepareDrugSets prepareSetsCompoundInfo prepareStatsCompoundInfo

R Package Documentation

Browse R Packages

We want your feedback!

nuno-agostinho/cTRAP Identification of candidate causal perturbations from differential gene expression data

R/drugSetEnrichment.R In nuno-agostinho/cTRAP: Identification of candidate causal perturbations from differential gene expression data

Defines functions plotDrugSetEnrichment analyseDrugSetEnrichment prepareStatsCol processStats matchStatsWithDrugSetsID prepareSetsCompoundInfo prepareStatsCompoundInfo prepareDrugSets calculateEvenlyDistributedBins loadDrugSet loadDrugDescriptors loadRemotePreProcessedData

Documented in analyseDrugSetEnrichment calculateEvenlyDistributedBins loadDrugDescriptors matchStatsWithDrugSetsID plotDrugSetEnrichment prepareDrugSets prepareSetsCompoundInfo prepareStatsCompoundInfo

R Package Documentation

Browse R Packages

We want your feedback!

nuno-agostinho/cTRAP
Identification of candidate causal perturbations from differential gene expression data

R/drugSetEnrichment.R
In nuno-agostinho/cTRAP: Identification of candidate causal perturbations from differential gene expression data