R/diggit.r

Defines functions reaNULL rea areaEnrich associateEvents

Documented in areaEnrich associateEvents rea reaNULL

#' Use 'aREA' to calculate the enrichment between each genomic event - 
#' VIPER inferred protein pair. 
#' 
#' Requires pre-computed VIPER scores and a binary events matrix. Will use only 
#' samples in both event and VIPER matrices.
#'  
#' @param vipermat Pre-computed VIPER scores with samples as columns and proteins
#'  as rows 
#' @param events.mat Binary 0/1 events matrix with samples as columns and genes 
#' or events as rows
#' @param whitelist Only compute associations for events in this list
#' @param min.events Only compute enrichment if the number of samples with these
#'  events is GTE to this
#' @param event.type Name of the event type being analyzed
#' @param verbose whether to print extra progress statements
#' @return A matrix of aREA scores, dimensions are nrow(events.mat) x nrow(vipermat) 
#' @keywords internal
associateEvents <- function(vipermat, events.mat, min.events = NA, 
                            whitelist = NA, 
                            event.type = c("Amplifications","Deletions", 
                                            "Mutations","Fusions", NA),
                            verbose) {
    event.type <- match.arg(event.type)
    if (is.null(events.mat)) {
        message("Null ", event.type, " matrix, skipping..")
        return(NULL)
    }
    if (dim(events.mat)[1] == 0 | dim(events.mat)[2] == 0) {
        message("Not enough ", event.type,", skipping")
        return(NULL)
    }
    
    # subset the two matrices to common samples
    common.samples <- intersect(colnames(vipermat), colnames(events.mat))
    vipermat <- vipermat[, common.samples]
    events.mat <- events.mat[, common.samples]
    
    # include only whitelist items
    if (!(is.na(whitelist))) {
        events.mat <- events.mat[intersect(rownames(events.mat), whitelist), ]
    }
    # filter to minmum number of somatic events
    if (!is.na(min.events)) {
        events.mat <- events.mat[apply(events.mat, 1, sum, na.rm = TRUE) >= min.events,]
    }
    # test again after removing low freuquency events
    if (is.null(dim(events.mat))) {
        message("Not enough ", event.type,", skipping")
        return(NULL)
    } else if (dim(events.mat)[1] == 0 | dim(events.mat)[2] == 0) {
        message("Not enough ", event.type,", skipping")
        return(NULL)
    }
    
    nes <- areaEnrich(events.mat, vipermat, event.type, verbose = verbose)
    nes
}


#' aREA.enrich Compute aREA enrichment between all pairwise combinations of 
#' VIPER proteins and gene-level events
#' 
#' @param events.mat A Binary 0/1 matrix with columns as samples, and rows as 
#' proteins
#' @param vipermat A VIPER network of inferred activity scores with columns as 
#' samples, and rows as proteins
#' @param event.type Name of the event type for printing purposes
#' @param verbose whether to print extra progress statements
#' @return A matrix of enrichment scores with rows as event/gene names and 
#' columns as VIPER protein names
#' @keywords internal
areaEnrich <- function(events.mat, vipermat, event.type, verbose) {
    
    # Convert mutations into a regulon-like object
    events.regulon <- apply(events.mat, 1, function(x) {
        l <- names(x[which(x == TRUE)])
        return(l)
    })
    
    # Calculate raw enrichment scores: each mutation against each TF 
    # columns are TFs, rownames are mutations
    es <- rea(t(vipermat), events.regulon, event.type = event.type, verbose = verbose)
    # Analytical correction
    dnull <- reaNULL(events.regulon)
    
    # Calculate pvalue of ES
    pval <- t(vapply(seq_len(length(dnull)), function(i, es, dnull) {
        dnull[[i]](es[i, ])$p.value
    }, es = es$groups, dnull = dnull, FUN.VALUE = numeric(ncol(es$groups))))
    
    # Convert the pvalues into Normalized Enrichment Scores
    nes <- qnorm(pval/2, lower.tail = FALSE) * es$ss
    # print (dim(nes)) print (length(events.regulon))
    rownames(nes) <- names(events.regulon)
    colnames(nes) <- rownames(vipermat)
    dimnames(pval) <- dimnames(nes)
    nes[is.na(nes)] <- 0
    # columns are TFs, rows are genomic events
    nes
}


#' This function calculates an Enrichment Score of Association based on how the
#' features rank on the samples sorted by a specific gene
#' 
#' @importFrom utils setTxtProgressBar txtProgressBar
#' @param eset Numerical matrix
#' @param regulon A list with genomic features as its names and samples as its 
#' entries, indicating presence of event
#' @param minsize The minimum number of events to use when calculating enrichment
#' @param maxsize The maximum number of events to use when calculating enrichment 
#' @param event.type Type of event being analyzed
#' @param verbose whether to print extra progress statements
#' @return A list containing two elements:
#' \describe{
#' \item{groups}{Regulon-specific NULL model containing the enrichment scores}
#' \item{ss}{Direction of the regulon-specific NULL model}
#' }
#' @keywords internal
rea <- function(eset, regulon, minsize = 1, maxsize = Inf, event.type = NA, verbose) {
    # Filter for minimum sizes
    sizes <- vapply(regulon, length, FUN.VALUE = numeric(1))
    regulon <- regulon[sizes >= minsize]
    sizes <- vapply(regulon, length, FUN.VALUE = numeric(1))
    regulon <- regulon[sizes <= maxsize]
    
    temp <- unique(unlist(regulon))
    tw <- rep(1, length(temp))
    names(tw) <- temp
    
    # Calculations
    t <- eset
    t2 <- apply(t, 2, rank)/(nrow(t) + 1) * 2 - 1
    t1 <- abs(t2) * 2 - 1
    t1[t1 == (-1)] <- 1 - (1/length(t1))
    
    
    # tnorm
    t1 <- qnorm(t1/2 + 0.5)
    t2 <- qnorm(t2/2 + 0.5)
    
    if(is.na(event.type)) {
        event.type = "genomic events"
    }
    
    # Print some progress bar
    if(verbose) {
        message("\nComputing associations of ", length(regulon)," ", 
                event.type," with ", ncol(eset), " regulators")
        message("Process started at ", date())
        pb <- txtProgressBar(max = length(regulon), style = 3) 
    }
    
    temp <- lapply(seq_len(length(regulon)), function(i,regulon,t1,t2,tw,pb) {
        hitsamples <- regulon[[i]]
        hitsamples <- intersect(hitsamples, rownames(t1))
        
        # Mumbo-jumbo
        pos <- match(hitsamples, rownames(t1))
        
        heretw <- tw[match(hitsamples, names(tw))]
        
        sum1 <- matrix(heretw, 1, length(hitsamples)) %*% t2[pos, ]
        ss <- sign(sum1)
        ss[ss == 0] <- 1
        if(verbose) { setTxtProgressBar(pb, i) }
        sum2 <- matrix(0 * heretw, 1, length(hitsamples)) %*% t1[pos, ]
        return(list(es = as.vector(abs(sum1) + sum2 * (sum2 > 0))/sum(heretw), 
                    ss = ss))
    }, regulon = regulon, pb = pb, t1 = t1, t2 = t2, tw = tw)
    names(temp) <- names(regulon)
    
    if(verbose) { message("\nProcess ended at ", date()) }
    es <- t(vapply(temp, function(x) x$es,
                   FUN.VALUE = numeric(length(temp[[1]][[1]]))))
    ss <- t(vapply(temp, function(x) x$ss, 
                   FUN.VALUE = numeric(length(temp[[1]][[1]]))))
    colnames(es) <- colnames(ss) <- colnames(eset)
    return(list(groups = es, ss = ss))
}


#' This function generates the NULL model function, which computes the 
#' normalized enrichment score and associated p-value
#' 
#' @param regulon A list with genomic features as its names and samples as its 
#' entries
#' @param minsize Minimum number of event (or size of regulon)  
#' @param maxsize Maximum number of event (or size of regulon)  
#' @return A list of functions to compute NES and p-value
#' @keywords internal
reaNULL <- function(regulon, minsize = 1, maxsize = Inf) {
    # Filter for minimum sizes
    sizes <- vapply(regulon, length, FUN.VALUE = numeric(1))
    regulon <- regulon[sizes >= minsize]
    sizes <- vapply(regulon, length, FUN.VALUE = numeric(1))
    regulon <- regulon[sizes <= maxsize]
    # complete list of all genes in any regulon
    temp <- unique(unlist(regulon))
    # list of all genes, weighted by 1
    tw <- rep(1, length(temp))
    names(tw) <- temp
    lapply(regulon, function(x, tw) {
        ww <- tw[match(x, names(tw))]
        ww <- ww/max(ww)
        # now it's a constant?
        ww <- sqrt(sum(ww^2))
        return(function(x, alternative = "two.sided") {
            x <- x * ww
            p <- switch(pmatch(alternative, c("two.sided", "less", "greater")), 
                        pnorm(abs(x), lower.tail = FALSE) * 2, 
                        pnorm(x, lower.tail = TRUE), pnorm(x,lower.tail = FALSE))
            list(nes = x, p.value = p)
        })
    }, tw = tw)
}
califano-lab/MOMA documentation built on June 7, 2020, 7:17 a.m.