R/compare.R
In cTRAP: Identification of candidate causal perturbations from differential gene expression data

Documented in as.table.referenceComparison chunkVector compareAgainstReference compareAgainstReferencePerMethod correlateAgainstReference performGSEAagainstReference prepareGSEAgenesets processByChunks rankColumns

#' Assign vector elements into chunks
#'
#' @param x Vector of elements
#' @param nElems Numeric: number of chunks
#'
#' @keywords internal
#'
#' @return List of chunks with the original vector elements divided
chunkVector <- function(x, nElems) {
    groups <- ceiling(length(x)/nElems)
    split(x, factor(sort(rank(x) %% groups)))
}

processChunk <- function(chunk, data, FUN, ..., progress) {
    zscores <- loadCMapZscores(data[ , chunk], verbose=FALSE)
    processCall(zscores, chunk, FUN, ..., progress=progress)
}

processCall <- function(data, cols, calledFUN, ..., progress) {
    data <- unclass(data)
    gc()

    setFUNprogress <- function(col, data, ..., progress, calledFUN) {
        res <- calledFUN(col, data, ...)
        setpb(progress, getpb(progress) + 1)
        return(res)
    }

    # Suppress warnings to avoid "Cannot compute exact p-value with ties"
    res <- suppressWarnings(lapply(cols, setFUNprogress, data, ...,
                                   progress=progress, calledFUN=calledFUN))
    # res <- plyr::compact(res) # in case of NULL elements for GSEA
    return(res)
}

#' Process data column by chunks
#'
#' Columns will be processed per chunk if argument \code{data} is a
#' \code{perturbationChanges} object containing a file path instead of a data
#' matrix. Otherwise, the data will be processed as a single chunk.
#'
#' For instance, loading a chunk of 10000 CMap pertubations requires ~1GB of RAM
#' compared to loading the whole dataset.
#'
#' @param data Data matrix or \code{perturbationChanges} object
#' @param FUN Function: function to run for each chunk
#' @param ... Arguments passed to \code{FUN}
#' @param chunkSize Integer: number of columns to load on-demand (a higher value
#'   increases RAM usage, but decreases running time)
#'
#' @importFrom pbapply startpb getpb setpb closepb
#'
#' @return Results of running \code{FUN}
#' @keywords internal
processByChunks <- function(data, FUN, ..., chunkSize=10000) {
    pb <- startpb(max=ncol(data))
    loadFromFile <- is.character(data)
    if (loadFromFile && !file.exists(data)) {
        msg <- "%s not found: has the CMap z-scores file been moved or deleted?"
        stop(sprintf(msg, data))
    }

    if (loadFromFile) {
        chunks <- chunkVector(colnames(data), chunkSize)
        res <- lapply(chunks, processChunk, data, FUN, ..., progress=pb)
        res <- unlist(res, recursive=FALSE, use.names=FALSE)
    } else {
        res <- processCall(data, colnames(data), FUN, ..., progress=pb)
    }
    closepb(pb)
    return(res)
}

#' Rank columns in a dataset
#'
#' @details The rank product's rank is calculated if more than one method is
#'   ranked.
#'
#' @note The first column of \code{data} and \code{rankingInfo} must contain
#'   common identifiers.
#'
#' @param table Data table: data; first column must be identifiers
#' @param rankingInfo Data table: boolean values of which rows to rank based on
#'   columns (column names to be ranked must exactly match those available in
#'   argument \code{table}); first column must be identifiers
#' @param sort Boolean: sort data based on rank product's rank (if multiple
#'   methods are available) or by available ranks
#' @inheritParams compareAgainstReference
#'
#' @importFrom data.table setkeyv
#'
#' @return Data table with the contents of \code{table} and extra columns with
#'   respective rankings
#' @keywords internal
rankColumns <- function(table, rankingInfo, rankByAscending=TRUE, sort=FALSE) {
    setkeyv(table, colnames(table)[[1]])
    colsToRank <- colnames(rankingInfo)[-1]
    rankedCols <- NULL
    for(col in colsToRank) {
        toRank     <- rankingInfo[[col]]
        rowsToRank <- rankingInfo[[1]][toRank]
        dataToRank <- table[rowsToRank][[col]]
        if (rankByAscending) dataToRank <- -dataToRank
        ranked     <- rank(dataToRank, na.last="keep")
        newCol     <- paste0(gsub("(.*)_.*$", "\\1", col), "_rank")
        table[rowsToRank, newCol] <- ranked
        rankedCols <- c(rankedCols, newCol)
    }
    if (length(rankedCols) > 1) {
        # Calculate rank product's rank
        ranks    <- table[rowsToRank, rankedCols, with=FALSE]
        rankProd <- apply(ranks, 1, prod) ^ (1 / ncol(ranks))
        table[rowsToRank, "rankProduct_rank"] <- rank(rankProd, na.last="keep")
        sortingCol <- "rankProduct_rank"
    } else {
        sortingCol <- rankedCols
    }
    if (sort) table <- table[order(table[[sortingCol]])]
    return(table)
}

#' Correlate against data columns
#'
#' @inheritParams rankSimilarPerturbations
#' @param pAdjust Character: method to use for p-value adjustment
#'
#' @importFrom stats p.adjust cor.test
#' @importFrom data.table data.table
#'
#' @return Data frame with correlation results per data column
#'
#' @keywords internal
correlateAgainstReference <- function(diffExprGenes, reference, method,
                                      pAdjust="BH") {
    # Subset based on intersecting genes
    genes <- intersect(names(diffExprGenes), rownames(reference))
    diffExprGenes <- diffExprGenes[genes]
    reference     <- reference[genes, ]

    # Correlate per data column
    corPerColumn <- function(k, data, diffExprGenes, method) {
        cor.test(data[ , k], diffExprGenes, method=method)
    }
    cors <- processByChunks(reference, corPerColumn,
                            diffExprGenes=diffExprGenes, method=method)
    cor  <- sapply(cors, "[[", "estimate")
    pval <- sapply(cors, "[[", "p.value")
    qval <- p.adjust(pval, pAdjust)
    names(cor) <- names(pval) <- names(qval) <- colnames(reference)

    res  <- data.table(names(cor), cor, pval, qval)
    cols <- sprintf("%s_%s", method, c("coef", "pvalue", "qvalue"))
    names(res) <- c("identifier", cols)
    attr(res, "colsToRank") <- cols[[1]]
    return(res)
}

#' Prepare GSEA gene sets
#'
#' @inheritParams compareAgainstReferencePerMethod
#' @return List of gene sets
#' @keywords internal
prepareGSEAgenesets <- function(input, geneSize) {
    input <- prepareGeneInput(input)
    isGeneset <- isTRUE(attr(input, "isGeneset"))
    if(isGeneset) {
        geneset <- list("custom"=input)
        geneSize <- c(length(input), 0)
    } else {
        # Check if length of input is too small for geneSize
        genes <- names(input)
        if (length(geneSize) == 1) {
            isGeneNumberLow <- floor(length(genes)/2) < geneSize
        } else if (length(geneSize) == 2) {
            isGeneNumberLow <- length(genes) < sum(geneSize)
        }

        if (isGeneNumberLow) {
            geneSize <- floor(length(genes)/2)
            msg <- paste(
                "'input' contains a smaller number of genes than available for",
                "'geneSize'; 'geneSize' parameter was reduced to", geneSize)
            warning(msg, immediate.=TRUE)
        }
        if (length(geneSize) == 1) geneSize <- rep(geneSize, 2)

        ordered  <- order(input, decreasing=TRUE)
        topGenes <- genes[head(ordered, geneSize[[1]])]
        if (geneSize[[1]] <= 0) topGenes <- NULL

        bottomGenes <- genes[tail(ordered, geneSize[[2]])]
        if (geneSize[[2]] <= 0) bottomGenes <- NULL

        geneset <- list("top"=topGenes, "bottom"=bottomGenes)
    }
    attr(geneset, "geneSize") <- unique(geneSize)
    return(geneset)
}

#' Perform gene set enrichment (GSA) against data columns
#'
#' @inheritParams rankSimilarPerturbations
#' @inheritParams fgsea::fgsea
#'
#' @importFrom fgsea fgsea
#' @importFrom data.table data.table
#' @importFrom dplyr bind_rows
#'
#' @return Data frame containing gene set enrichment analysis (GSEA) results per
#' data column
#' @keywords internal
performGSEAagainstReference <- function(reference, geneset) {
    # Calculate GSEA per data column
    gseaPerColumn <- function(k, data, geneset) {
        signature        <- data[ , k]
        names(signature) <- rownames(data)
        signature        <- sort(signature)
        score            <- fgsea(pathways=geneset, stats=signature,
                                  minSize=15, maxSize=500, nperm=1)
        return(score)
    }
    gsa <- processByChunks(reference, gseaPerColumn, geneset)
    gsaRes <- bind_rows(gsa)

    calcWTCS <- all(sort(unique(gsaRes$pathway)) == c("bottom", "top"))
    if (calcWTCS) {
        # Weighted connectivity score (WTCS) as per CMap paper (page e8)
        isTop  <- gsaRes$pathway == "top"
        top    <- gsaRes[["ES"]][isTop]
        bottom <- gsaRes[["ES"]][!isTop]
        wtcs   <- ifelse(sign(top) != sign(bottom), (top - bottom) / 2, 0)
        score  <- wtcs
    } else {
        score  <- gsaRes[["ES"]]
    }
    if (length(score) == 0) score <- NA
    results <- data.table("identifier"=colnames(reference), "GSEA"=score)
    attr(results, "colsToRank") <- "GSEA"
    return(results)
}

#' Compare single method
#'
#' @param input \code{Named numeric vector} of differentially expressed genes
#'   whose names are gene identifiers and respective values are a statistic that
#'   represents significance and magnitude of differentially expressed genes
#'   (e.g. t-statistics); or \code{character} of gene symbols composing a gene
#'   set that is tested for enrichment in reference data (only used if
#'   \code{method} includes \code{gsea})
#' @param geneSize Numeric: number of top up-/down-regulated genes to use as
#'   gene sets to test for enrichment in reference data; if a 2-length numeric
#'   vector, the first index is the number of top up-regulated genes and the
#'   second index is the number of down-regulated genes used to create gene
#'   sets; only used if \code{method} includes \code{gsea} and if \code{input}
#'   is not a gene set
#' @param method Character: one or more methods to compare data
#'   (\code{spearman}, \code{pearson} or \code{gsea})
#' @param reference Data matrix or \code{perturbationChanges} object (CMap
#'   perturbations; see \code{\link{prepareCMapPerturbations}()})
#' @param cellLines Integer: number of unique cell lines
#' @param cellLineMean Boolean: add a column with the mean score across cell
#'   lines? If \code{cellLineMean = "auto"} (default), the mean score will be
#'   added when data for more than one cell line is available.
#' @param rankByAscending Boolean: rank values based on their ascending
#'   (\code{TRUE}) or descending (\code{FALSE}) order?
#' @param rankPerCellLine Boolean: rank results based on both individual cell
#'   lines and mean scores across cell lines (\code{TRUE}) or based on mean
#'   scores alone (\code{FALSE})? If \code{cellLineMean = FALSE}, individual
#'   cell line conditions are always ranked.
#'
#' @importFrom utils head tail
#' @importFrom R.utils capitalize
#'
#' @keywords internal
#' @return Data frame containing the results per method of comparison
compareAgainstReferencePerMethod <- function(method, input, reference,
                                             geneSize=150, cellLines=NULL,
                                             cellLineMean="auto",
                                             rankPerCellLine=FALSE) {
    startTime <- Sys.time()

    # Check immediately if there is something wrong with the geneset(s)
    if (method == "gsea") geneset <- prepareGSEAgenesets(input, geneSize)

    type <- attr(reference, "type")
    if (is.null(type)) type <- "comparisons"
    compareMsg <- paste(c(ncol(reference),
                          attr(reference, "source"), type), collapse=" ")

    if (method %in% c("spearman", "pearson")) {
        methodStr <- paste0(capitalize(method), "'s correlation")
        if (is.null(cellLines) || cellLines == 0) {
            msg <- methodStr
        } else {
            msg <- sprintf("%s cell line%s; %s", cellLines,
                           ifelse(cellLines == 1, "", "s"), methodStr)
        }
        message(sprintf("Correlating against %s (%s)...", compareMsg, msg))
        geneset  <- NULL
        rankedRef <- correlateAgainstReference(input, reference, method)
    } else if (method == "gsea") {
        if (is.null(cellLines) || cellLines == 0) {
            msg <- compareMsg
        } else {
            cellLinesMsg <- sprintf("(%s cell line%s)...", cellLines,
                                    ifelse(cellLines == 1, "", "s"))
            msg <- paste(compareMsg, cellLinesMsg)
        }
        message(sprintf("Performing GSEA against %s...", msg))
        geneSize  <- attr(geneset, "geneSize")
        rankedRef <- performGSEAagainstReference(reference, geneset)
    }
    colsToRank <- attr(rankedRef, "colsToRank")

    # Set whether to calculate the mean value across cell lines
    if (cellLineMean == "auto") cellLineMean <- cellLines > 1

    # Retrieve ranking information
    if (is(reference, "perturbationChanges")) {
        cellLine <- parseCMapID(rankedRef[["identifier"]], cellLine=TRUE)
        names(cellLine) <- rankedRef[["identifier"]]
    } else {
        cellLine <- rankedRef[["identifier"]]
        names(cellLine) <- cellLine
    }

    metadata <- attr(reference, "metadata")
    if (cellLineMean) {
        aggregated  <- calculateCellLineMean(rankedRef, cellLine, metadata,
                                             rankPerCellLine)
        rankedRef   <- aggregated$reference
        rankingInfo <- aggregated$rankingInfo
        metadata    <- aggregated$metadata
    } else {
        rankingInfo <- data.table(names(cellLine), TRUE)
    }
    names(rankingInfo) <- c("cTRAP_id", colsToRank)

    # Inherit information of interest
    attr(rankedRef, "rankingInfo") <- rankingInfo
    attr(rankedRef, "metadata")    <- metadata
    attr(rankedRef, "geneset")     <- geneset

    # Report run settings and time
    diffTime <- format(round(Sys.time() - startTime, 2))
    msg      <- "Comparison against %s using '%s' %sperformed in %s\n"
    extra    <- ifelse(method == "gsea",
                       sprintf("(gene size of %s) ", geneSize), "")
    message(sprintf(msg, compareMsg, method, extra, diffTime))
    return(rankedRef)
}

prepareGeneInput <- function(input) {
    if (is.null(names(input)) && is.character(input)) {
        isGeneset   <- TRUE
        geneSymbols <- input
    } else if (!is.null(names(input)) && is.numeric(input)) {
        isGeneset   <- FALSE
        geneSymbols <- names(input)
    } else {
        stop("argument 'input' must be a named numeric vector or a character",
             " vector with gene symbols.")
    }
    attr(input, "isGeneset")   <- isGeneset
    attr(input, "geneSymbols") <- geneSymbols
    return(input)
}

#' Compare multiple methods and rank reference accordingly
#'
#' @inheritParams compareAgainstReferencePerMethod
#'
#' @importFrom data.table :=
#'
#' @keywords internal
#' @return List of data frame containing the results per methods of comparison
compareAgainstReference <- function(input, reference,
                                    method=c("spearman", "pearson", "gsea"),
                                    geneSize=150, cellLines=NULL,
                                    cellLineMean="auto", rankByAscending=TRUE,
                                    rankPerCellLine=FALSE) {
    startTime <- Sys.time()
    # Check if any of supplied methods are supported
    supported <- c("spearman", "pearson", "gsea")
    method <- unique(method)
    method <- method[method %in% supported]
    if (length(method) == 0) {
        stop(paste(
            "Argument 'method' must contain one of the following supported",
            "comparison methods:", paste(supported, collapse=", ")))
    }
    input <- prepareGeneInput(input)
    if (attr(input, "isGeneset")) {
        if (!"gsea" %in% method) {
            msg <- paste("Method 'gsea' is automatically performed if argument",
                         "'input' is a gene set.")
            warning(msg)
        }
        method <- "gsea"
    }

    # Summary of intersecting genes
    genes       <- intersect(attr(input, "geneSymbols"), rownames(reference))
    intersected <- length(genes)
    total       <- length(input)
    message(sprintf(paste(
        "Subsetting data based on %s intersecting genes",
        "(%s%% of the %s input genes)..."),
        intersected, round(intersected / total * 100, 0), total))
    if (intersected == 0) {
        stop("No intersecting genes found. Check if argument 'input' is a",
             " named numeric vector or a character vector with gene symbols.")
    }

    names(method) <- method
    res <- lapply(method, compareAgainstReferencePerMethod, input=input,
                  reference=reference, geneSize=geneSize, cellLines=cellLines,
                  cellLineMean=cellLineMean, rankPerCellLine=rankPerCellLine)

    # Rank columns
    rankingInfo <- Reduce(merge, lapply(res, attr, "rankingInfo"))
    replaceNAsWithFALSE <- function(DT) {
        for (i in names(DT)) DT[is.na(get(i)), (i):=FALSE]
        return(DT)
    }
    rankingInfo <- replaceNAsWithFALSE(rankingInfo)
    merged <- Reduce(merge, res)
    ranked <- rankColumns(merged, rankingInfo, rankByAscending=rankByAscending,
                          sort=TRUE)

    # Inherit metadata
    attr(ranked, "metadata") <- Reduce(merge, lapply(res, attr, "metadata"))
    attr(ranked, "geneInfo")        <- attr(reference, "geneInfo")
    attr(ranked, "compoundInfo")    <- attr(reference, "compoundInfo")
    attr(ranked, "zscoresFilename") <- attr(reference, "zscoresFilename")

    # Inherit from input
    attr(ranked, "input")         <- input
    attr(ranked, "rankingInfo")   <- rankingInfo
    attr(ranked, "geneset")       <- c(attr(res[["gsea"]], "pathways"), # legacy
                                       attr(res[["gsea"]], "geneset"))
    attr(ranked, "runtime")       <- Sys.time() - startTime

    class(ranked) <- c("referenceComparison", class(ranked))
    return(ranked)
}

# referenceComparison object ---------------------------------------------------

convertToTable <- function(x, clean=TRUE) {
    metadata <- attr(x, "metadata")
    isMetadataUseful <- !is.null(metadata) && nrow(metadata) > 0
    if (isMetadataUseful) {
        nonCellID <- "non_cell_id"

        summaryID <- parseCMapID(metadata$sig_id, cellLine=FALSE)
        metadata[[nonCellID]] <- summaryID
        metadataSubset <- metadata[unique(match(summaryID, summaryID)), ]
        metadataSubset[ , c("sig_id", "distil_id")] <- NULL

        x[[nonCellID]] <- parseCMapID(x[[1]], cellLine=FALSE)
        res <- merge(x, metadataSubset, all.x=TRUE, by=nonCellID)
        res[[nonCellID]] <- NULL
    } else {
        res <- x
    }

    geneInfo <- attr(x, "geneInfo")
    isGeneInfoUseful <- !is.null(geneInfo) && any(
        res[["pert_iname"]] %in% geneInfo[["pr_gene_symbol"]])
    if (isGeneInfoUseful) {
        res <- merge(res, geneInfo, by.x="pert_iname",
                     by.y="pr_gene_symbol", all.x=TRUE)
        # Place "pert_iname" column after "pert_id" one
        m <- match("pert_id", colnames(res))
        res <- res[ , c(2:m, 1, (m+1):(ncol(res))), with=FALSE]
    }

    compoundInfo <- attr(x, "compoundInfo")
    if (is(x, "similarPerturbations")) {
        compoundCol1 <- compoundCol2 <- "pert_iname"
    } else {
        compoundCol1 <- "compound"
        compoundCol2 <- names(compoundInfo)[[1]]
    }
    isCompoundInfoUseful <- !is.null(compoundInfo) && any(
        res[[compoundCol1]] %in% compoundInfo[[compoundCol2]])
    if (isCompoundInfoUseful) {
        res[[compoundCol1]] <- as.character(res[[compoundCol1]])
        compoundInfo[[compoundCol2]] <- as.character(
            compoundInfo[[compoundCol2]])
        res <- merge(res, compoundInfo, by.x=compoundCol1, by.y=compoundCol2,
                     all.x=TRUE)
        pos <- match("pert_id", colnames(res))
        if (!is.na(pos)) {
            # Place "pert_iname" column after "pert_id"
            res <- res[ , c(2:pos, 1, (pos + 1):(ncol(res))), with=FALSE]
        }
    }
    if (clean) {
        hideCols <- c(colnames(res)[endsWith(colnames(res), "value") |
                                        endsWith(colnames(res), "value_rank")],
                      "pert_dose", "pert_dose_unit",
                      "pert_time", "pert_time_unit")
        hideCols <- hideCols[hideCols %in% colnames(res)]
        if (length(hideCols) > 0) res <- res[ , -hideCols, with=FALSE]
    }
    res <- data.table(res)

    # Reorder table based on original ordering
    if (all(x[[1]] %in% res[[1]])) {
        res <- res[match(x[[1]], res[[1]]), , drop=FALSE]
    }
    return(res)
}

#' Cross Tabulation and Table Creation
#'
#' @param x \code{referenceComparison} object
#' @param ... Extra parameters not currently used
#' @param clean Boolean: only show certain columns (to avoid redundancy)?
#'
#' @family functions related with the ranking of CMap perturbations
#' @family functions related with the prediction of targeting drugs
#' @return Complete table with metadata based on a \code{targetingDrugs} object
#' @export
as.table.referenceComparison <- function(x, ..., clean=TRUE) {
    return(convertToTable(x, clean=clean))
}

Any scripts or data that you put into this service are public.

cTRAP documentation built on Nov. 8, 2020, 10:58 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

cTRAP
Identification of candidate causal perturbations from differential gene expression data

R/compare.R
In cTRAP: Identification of candidate causal perturbations from differential gene expression data

Defines functions as.table.referenceComparison convertToTable compareAgainstReference prepareGeneInput compareAgainstReferencePerMethod performGSEAagainstReference prepareGSEAgenesets correlateAgainstReference rankColumns processByChunks processCall processChunk chunkVector

Documented in as.table.referenceComparison chunkVector compareAgainstReference compareAgainstReferencePerMethod correlateAgainstReference performGSEAagainstReference prepareGSEAgenesets processByChunks rankColumns

Try the cTRAP package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cTRAP Identification of candidate causal perturbations from differential gene expression data

R/compare.R In cTRAP: Identification of candidate causal perturbations from differential gene expression data

Defines functions as.table.referenceComparison convertToTable compareAgainstReference prepareGeneInput compareAgainstReferencePerMethod performGSEAagainstReference prepareGSEAgenesets correlateAgainstReference rankColumns processByChunks processCall processChunk chunkVector

Documented in as.table.referenceComparison chunkVector compareAgainstReference compareAgainstReferencePerMethod correlateAgainstReference performGSEAagainstReference prepareGSEAgenesets processByChunks rankColumns

Try the cTRAP package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cTRAP
Identification of candidate causal perturbations from differential gene expression data

R/compare.R
In cTRAP: Identification of candidate causal perturbations from differential gene expression data