R/find_dmrs.R

Defines functions find_dmrs

Documented in find_dmrs

#' Find Differentially Methylated Regions across whole genome
#'
#' A function to define DMR for an entire genome.
#'
#' @param tumor_table A matrix of beta-values (percentage) from tumor samples.
#' @param control_table A matrix of beta-values (percentage) from
#' normal/control samples.
#' @param auc_vector A numeric vector generated by [compute_AUC].
#' @param reference_table A data.frame reporting the location of CpG sites
#' (chromosome, genomic_coordinate and chromosome arm).
#' @param ncores Number of parallel processes to use for parallel computing.
#' @param max_distance Maximum distance between sites within same DMRs.
#' Splits long DMRs (default: no split).
#' @param min_sites Minimum required number of CpG sites within a DMR (default = 5).
#' @param pt_start Transition probability of the HSLM. Default is 0.05.
#' @param normdist Distance normalization parameter of the HSLM. Default is 1e5.
#' @param ratiosd Fraction between the standard deviation of AUC values of
#' differentially methylated sites and the total standard deviation of AUC
#' values. Default is 0.4.
#' @param mu Expected mean (AUC) for hypo-methylated state (1-mu is the
#' expected mean for hyper-methylated state). Default is 0.25.
#' @param use_trunc Use truncated normal distribution (DEBUGGING ONLY).
#' Default is TRUE.
#' @return A data.frame reporting genomic location, number of CpG sites,
#' methylation state, average beta difference (tumor vs. control), p-value and
#' adjusted (Benjamini-Hochberg) p-value (fdr) of discovered DMRs.
#' @importFrom stats mad median p.adjust sd wilcox.test
#' @importFrom foreach "%dopar%"
#' @examples
#' auc <- compute_AUC(tumor_example, control_example)
#' dmr_set <- find_dmrs(tumor_example, control_example, auc, reference_example, min_sites = 10)
#' @export
find_dmrs <- function(tumor_table, control_table, auc_vector, reference_table,
                      ncores = 1, max_distance = Inf, min_sites = 5,
                      pt_start = 0.05, normdist = 1e5, ratiosd = 0.4, mu = .25,
                      use_trunc = TRUE){
    message(sprintf("[%s] Find Differentially Methylated Regions", Sys.time()))

    # check parameters
    ncores <- as.integer(ncores)
    use_trunc <- as.logical(use_trunc)
    reference_table <- as.data.frame(reference_table)
    tumor_table <- as.matrix(tumor_table)
    control_table <- as.matrix(control_table)
    system_cores <- parallel::detectCores()

    assertthat::assert_that(ncores < system_cores)
    assertthat::assert_that(ratiosd > 0, ratiosd < 1)
    assertthat::assert_that(normdist > 1)
    assertthat::assert_that(pt_start > 0, pt_start < 1)
    assertthat::assert_that(mu >= 0, mu <= .35)

    assertthat::assert_that(ncol(reference_table) >= 3)
    assertthat::assert_that(nrow(tumor_table) == nrow(control_table))
    assertthat::assert_that(nrow(tumor_table) == nrow(reference_table))
    assertthat::assert_that(nrow(tumor_table) == length(auc_vector))

    diff_range_t <- diff(range(tumor_table, na.rm = TRUE))
    diff_range_c <- diff(range(control_table, na.rm = TRUE))
    assertthat::assert_that(diff_range_t > 1, diff_range_t <= 100, msg = "For computation efficiency, convert tumor table to percentage values.")
    assertthat::assert_that(diff_range_c > 1, diff_range_c <= 100, msg = "For computation efficiency, convert control table to percentage values.")

    tumor_table <- round(tumor_table)
    control_table <- round(control_table)
    storage.mode(tumor_table) <- "integer"
    storage.mode(control_table) <- "integer"

    # remove NA rows
    idx_not_NA <- which(!is.na(auc_vector))
    tumor_table <- tumor_table[idx_not_NA,, drop = FALSE]
    control_table <- control_table[idx_not_NA,, drop = FALSE]
    auc_vector <- auc_vector[idx_not_NA]
    reference_table <- reference_table[idx_not_NA,, drop = FALSE]

    # sort data
    idx <- order(reference_table[[1]], reference_table[[2]])
    reference_table <- reference_table[idx, ]
    tumor_table <- tumor_table[idx, ]
    control_table <- control_table[idx, ]
    auc_vector <- auc_vector[idx]

    cl <- parallel::makeCluster(ncores)
    tumor_beta_mean   <- parallel::parApply(cl, tumor_table, 1, mean, na.rm = TRUE)
    control_beta_mean <- parallel::parApply(cl, control_table, 1, mean, na.rm = TRUE)
    parallel::stopCluster(cl)

    mean_beta_diff <- tumor_beta_mean - control_beta_mean
    auc_sd <- sd(auc_vector, na.rm = TRUE)

    chromosomes_df <- unique(reference_table[c(1,3)])

    cl <- parallel::makeCluster(ncores)
    doParallel::registerDoParallel(cl)

    all_dmrs <- foreach::foreach(i = seq_len(nrow(chromosomes_df)), .combine = rbind, .packages = c("Rockermeth")) %dopar% {

        chr <- chromosomes_df[[1]][i]
        arm <- chromosomes_df[[2]][i]
        idx_chr <- which(reference_table[[1]] == chr & reference_table[[3]] == arm)
        message("## Process chromosome ", chr, ".", arm)

        t_beta_mean_subset <- tumor_beta_mean[idx_chr]
        c_beta_mean_subset <- control_beta_mean[idx_chr]
        mean_beta_diff_subset <- mean_beta_diff[idx_chr]
        auc_subset <- auc_vector[idx_chr]
        coordinates <- reference_table[[2]][idx_chr]

        # 1) compute methylation states (1,2,3)
        meth_states <- Rockermeth::meth_state_finder(auc_subset, coordinates, auc_sd, pt_start,
                                         normdist, ratiosd, mu, use_trunc)

        # 2) find segments
        dmrs <- Rockermeth::segmentator(t_beta_mean_subset, c_beta_mean_subset,
                            meth_states, coordinates, max_distance)

        dmrs <- tibble::add_column(dmrs, chr=chr, .before="start")
        return(dmrs)
    }
    parallel::stopCluster(cl)

    message("# Correct p-values for multiple testing")
    dmrs_idx <- with(all_dmrs, which(nsites >= min_sites & state != 2))
    all_dmrs <- tibble::add_column(all_dmrs, fdr = NA)
    all_dmrs$fdr[dmrs_idx] <- p.adjust(all_dmrs$p_value[dmrs_idx], "fdr")

    message("  Total DMRs: ", sum(all_dmrs$state != 2))
    message("  Valid DMRs: ", length(dmrs_idx))
    message(sprintf("[%s] Done", Sys.time()))
    return(all_dmrs)

}
cgplab/ROCkerMeth documentation built on March 27, 2022, 9:57 p.m.