R/differentialFisherUMI4C.R

Defines functions fisherUMI4C

Documented in fisherUMI4C

#' Differential UMI4C contacts using Fisher's Exact test
#'
#' Using the UMIs inside \code{query_regions} performs Fisher's Exact test to
#' calculate significant differences between contact intensities.
#' @param umi4c UMI4C object as generated by \code{makeUMI4C} or the
#' \code{UMI4C} constructor.
#' @param grouping Name of the column in colData used to merge the samples or
#' replicates. If none available or want to add new groupings, run 
#' \code{\link{addGrouping}}. Default: "condition".
#' @param query_regions \code{GenomicRanges} object or \code{data.frame}
#' containing the region coordinates used to perform the differential analysis.
#' @param resize Width in base pairs for resizing the \code{query_regions}.
#' Default: no resizing.
#' @param window_size If \code{query_regions} are not defined, wil bin region in
#'  \code{window_size} bp and perform the analysis using this windows.
#' @param filter_low Either the minimum median UMIs requiered to perform
#' Fisher's Exact test or \code{FALSE} for performing the test in all windows.
#' @param padj_method Method for adjusting p-values. See
#' \code{\link[stats]{p.adjust}} for the different methods.
#' @param padj_threshold Numeric indicating the adjusted p-value threshold to
#' use to define significant differential contacts.
#' @return Calculates statistical differences between UMI-4C experiments.
#' @details This function calculates the
#'  overlap of fragment ends with either the provided \code{query_regions} or
#'  the binned region using \code{window_size}. The resulting number of UMIs in
#'  each \code{query_region} will be the \emph{sum} of UMIs in all overlapping
#'  fragment ends. As a default, will filter out those regions whose median
#'  UMIs are lower than \code{filter_low}.
#'
#' Finally, a contingency table for each \code{query_reegions} or \code{window}
#' that passed the \code{filter_low} filter is created as follows:
#' \tabular{rcc}{
#'      \tab \emph{query_region} \tab \emph{region}\cr
#'     \emph{Reference} \tab n1 \tab N1-n1\cr
#'     \emph{Condition} \tab n2 \tab N2-n2
#'     }
#' and the Fisher's Exact test is performed. Obtained p-values are then
#' converted to adjusted p-values using \code{padj_method} and the results list
#' is added to the \code{UMI4C} object.
#' @examples
#' data("ex_ciita_umi4c")
#' ex_ciita_umi4c <- addGrouping(ex_ciita_umi4c, grouping="condition")
#' 
#' # Perform differential test
#' enh <- GRanges(c("chr16:10925006-10928900", "chr16:11102721-11103700"))
#' umi_dif <- fisherUMI4C(ex_ciita_umi4c, query_regions = enh, 
#'                        filter_low = 20, resize = 5e3)
#' resultsUMI4C(umi_dif)                        
#' @export
fisherUMI4C <- function(umi4c,
    grouping = "condition",
    query_regions,
    resize = NULL,
    window_size = 5e3,
    filter_low = 50,
    padj_method = "fdr",
    padj_threshold = 0.05) {
    
    umi4c_ori <- umi4c
    
    if (!is.null(grouping)) {
      umi4c <- groupsUMI4C(umi4c_ori)[[grouping]]
    }
    
    if (length(colnames(assay(umi4c))) != 2) stop("You have more than two groups, so differential analysis can not
                                              be performed. Please try setting a 'normalize' variable in makeUMI4C
                                              that will divide your samples in two groups.")

    if (missing(query_regions)) {
        query_regions <- unlist(GenomicRanges::tile(metadata(umi4c)$region,
            width = window_size
        ))
        # Remove query regions overlapping with bait
        query_regions <- IRanges::subsetByOverlaps(query_regions,
            GenomicRanges::resize(bait(umi4c), width = 3e3, fix = "center"),
            invert = TRUE
        )
    } else {
        if (is(query_regions, "data.frame")) query_regions <- regioneR::toGRanges(query_regions)
    }

    if (ncol(mcols(query_regions)) == 0) {
          query_regions$id <- paste0("region_", seq_len(length(query_regions)))
      } else if (length(unique(mcols(query_regions)[1])) == length(query_regions)) {
          colnames(mcols(query_regions))[, 1] <- "id"
      } else {
          query_regions$id <- paste0("region_", seq_len(length(query_regions)))
      }

    totals <- colSums(assay(umi4c))
    # Reorder to make ref_umi4c reference
    totals <- totals[c(
        which(names(totals) == metadata(umi4c)$ref_umi4c),
        which(names(totals) != metadata(umi4c)$ref_umi4c)
    )]

    # Resize query regions if value provided
    if (!is.null(resize)) {
        query_regions <- GenomicRanges::resize(query_regions,
            width = resize,
            fix = "center"
        )
    }

    # Find overlaps between fragment ends and query regions
    ols <- GenomicRanges::findOverlaps(rowRanges(umi4c), query_regions)

    # Sum UMIs
    fends_split <- GenomicRanges::split(
        rowRanges(umi4c)$id_contact[queryHits(ols)],
        query_regions$id[subjectHits(ols)]
    )
    fends_summary <- lapply(
        fends_split,
        function(x) {
            data.frame(
                umis_ref = sum(assay(umi4c)[x, which(colnames(assay(umi4c)) == metadata(umi4c)$ref_umi4c)],
                    na.rm = TRUE
                ),
                umis_cond = sum(assay(umi4c)[x, which(colnames(assay(umi4c)) != metadata(umi4c)$ref_umi4c)],
                    na.rm = TRUE
                )
            )
        }
    )
    fends_summary <- do.call(rbind, fends_summary)
    fends_summary$query_id <- names(fends_split)

    counts <- fends_summary[, c(3, 1, 2)]

    # Filter regions with low UMIs to avoid multiple testing
    if (filter_low) {
        median <- apply(fends_summary[, c(1, 2)], 1, median) >= filter_low
        fends_summary <- fends_summary[median, ]

        if (nrow(fends_summary) == 0) stop("Your filter_low value is too high and removes all query_regions. Try setting a lower value or setting it to 'FALSE'")
    }

    mat_list <- lapply(
        seq_len(nrow(fends_summary)),
        function(x) {
            matrix(c(
                as.vector(t(fends_summary[x, c(2, 1)])),
                totals[2] - fends_summary[x, 2],
                totals[1] - fends_summary[x, 1]
            ),
            ncol = 2,
            dimnames = list(
                c("cond", "ref"),
                c("query", "region")
            )
            )
        }
    )

    fends_summary$pvalue <- unlist(lapply(
        mat_list,
        function(x) stats::fisher.test(x)$p.value
    ))
    fends_summary$odds_ratio <- unlist(lapply(
        mat_list,
        function(x) stats::fisher.test(x)$estimate
    ))
    fends_summary$log2_odds_ratio <- log2(fends_summary$odds_ratio)
    fends_summary$padj <- stats::p.adjust(fends_summary$pval, method = padj_method)

    fends_summary$sign <- FALSE
    fends_summary$sign[fends_summary$padj <= padj_threshold] <- TRUE


    umi4c_ori@results <- S4Vectors::SimpleList(
        test = "Fisher's Exact Test",
        ref = names(totals)[1],
        padj_threshold = padj_threshold,
        results = fends_summary[, -c(1, 2)],
        query = query_regions,
        counts = counts,
        grouping = grouping
    )

    return(umi4c_ori)
}
Pasquali-lab/UMI4Cats documentation built on March 23, 2024, 9:42 p.m.