#' Find Differentially Methylated Regions across whole genome
#'
#' A function to define DMR for an entire genome.
#'
#' @param tumor_table A matrix of beta-values (percentage) from tumor samples.
#' @param control_table A matrix of beta-values (percentage) from
#' normal/control samples.
#' @param auc_vector A numeric vector generated by [compute_AUC].
#' @param reference_table A data.frame reporting the location of CpG sites
#' (chromosome, genomic_coordinate and chromosome arm).
#' @param ncores Number of parallel processes to use for parallel computing.
#' @param max_distance Maximum distance between sites within same DMRs.
#' Splits long DMRs (default: no split).
#' @param min_sites Minimum required number of CpG sites within a DMR (default = 5).
#' @param pt_start Transition probability of the HSLM. Default is 0.05.
#' @param normdist Distance normalization parameter of the HSLM. Default is 1e5.
#' @param ratiosd Fraction between the standard deviation of AUC values of
#' differentially methylated sites and the total standard deviation of AUC
#' values. Default is 0.4.
#' @param mu Expected mean (AUC) for hypo-methylated state (1-mu is the
#' expected mean for hyper-methylated state). Default is 0.25.
#' @param use_trunc Use truncated normal distribution (DEBUGGING ONLY).
#' Default is TRUE.
#' @return A data.frame reporting genomic location, number of CpG sites,
#' methylation state, average beta difference (tumor vs. control), p-value and
#' adjusted (Benjamini-Hochberg) p-value (fdr) of discovered DMRs.
#' @importFrom stats mad median p.adjust sd wilcox.test
#' @importFrom foreach "%dopar%"
#' @examples
#' auc <- compute_AUC(tumor_example, control_example)
#' dmr_set <- find_dmrs(tumor_example, control_example, auc, reference_example, min_sites = 10)
#' @export
find_dmrs <- function(tumor_table, control_table, auc_vector, reference_table,
ncores = 1, max_distance = Inf, min_sites = 5,
pt_start = 0.05, normdist = 1e5, ratiosd = 0.4, mu = .25,
use_trunc = TRUE){
message(sprintf("[%s] Find Differentially Methylated Regions", Sys.time()))
# check parameters
ncores <- as.integer(ncores)
use_trunc <- as.logical(use_trunc)
reference_table <- as.data.frame(reference_table)
tumor_table <- as.matrix(tumor_table)
control_table <- as.matrix(control_table)
system_cores <- parallel::detectCores()
assertthat::assert_that(ncores < system_cores)
assertthat::assert_that(ratiosd > 0, ratiosd < 1)
assertthat::assert_that(normdist > 1)
assertthat::assert_that(pt_start > 0, pt_start < 1)
assertthat::assert_that(mu >= 0, mu <= .35)
assertthat::assert_that(ncol(reference_table) >= 3)
assertthat::assert_that(nrow(tumor_table) == nrow(control_table))
assertthat::assert_that(nrow(tumor_table) == nrow(reference_table))
assertthat::assert_that(nrow(tumor_table) == length(auc_vector))
diff_range_t <- diff(range(tumor_table, na.rm = TRUE))
diff_range_c <- diff(range(control_table, na.rm = TRUE))
assertthat::assert_that(diff_range_t > 1, diff_range_t <= 100, msg = "For computation efficiency, convert tumor table to percentage values.")
assertthat::assert_that(diff_range_c > 1, diff_range_c <= 100, msg = "For computation efficiency, convert control table to percentage values.")
tumor_table <- round(tumor_table)
control_table <- round(control_table)
storage.mode(tumor_table) <- "integer"
storage.mode(control_table) <- "integer"
# remove NA rows
idx_not_NA <- which(!is.na(auc_vector))
tumor_table <- tumor_table[idx_not_NA,, drop = FALSE]
control_table <- control_table[idx_not_NA,, drop = FALSE]
auc_vector <- auc_vector[idx_not_NA]
reference_table <- reference_table[idx_not_NA,, drop = FALSE]
# sort data
idx <- order(reference_table[[1]], reference_table[[2]])
reference_table <- reference_table[idx, ]
tumor_table <- tumor_table[idx, ]
control_table <- control_table[idx, ]
auc_vector <- auc_vector[idx]
cl <- parallel::makeCluster(ncores)
tumor_beta_mean <- parallel::parApply(cl, tumor_table, 1, mean, na.rm = TRUE)
control_beta_mean <- parallel::parApply(cl, control_table, 1, mean, na.rm = TRUE)
parallel::stopCluster(cl)
mean_beta_diff <- tumor_beta_mean - control_beta_mean
auc_sd <- sd(auc_vector, na.rm = TRUE)
chromosomes_df <- unique(reference_table[c(1,3)])
cl <- parallel::makeCluster(ncores)
doParallel::registerDoParallel(cl)
all_dmrs <- foreach::foreach(i = seq_len(nrow(chromosomes_df)), .combine = rbind, .packages = c("Rockermeth")) %dopar% {
chr <- chromosomes_df[[1]][i]
arm <- chromosomes_df[[2]][i]
idx_chr <- which(reference_table[[1]] == chr & reference_table[[3]] == arm)
message("## Process chromosome ", chr, ".", arm)
t_beta_mean_subset <- tumor_beta_mean[idx_chr]
c_beta_mean_subset <- control_beta_mean[idx_chr]
mean_beta_diff_subset <- mean_beta_diff[idx_chr]
auc_subset <- auc_vector[idx_chr]
coordinates <- reference_table[[2]][idx_chr]
# 1) compute methylation states (1,2,3)
meth_states <- Rockermeth::meth_state_finder(auc_subset, coordinates, auc_sd, pt_start,
normdist, ratiosd, mu, use_trunc)
# 2) find segments
dmrs <- Rockermeth::segmentator(t_beta_mean_subset, c_beta_mean_subset,
meth_states, coordinates, max_distance)
dmrs <- tibble::add_column(dmrs, chr=chr, .before="start")
return(dmrs)
}
parallel::stopCluster(cl)
message("# Correct p-values for multiple testing")
dmrs_idx <- with(all_dmrs, which(nsites >= min_sites & state != 2))
all_dmrs <- tibble::add_column(all_dmrs, fdr = NA)
all_dmrs$fdr[dmrs_idx] <- p.adjust(all_dmrs$p_value[dmrs_idx], "fdr")
message(" Total DMRs: ", sum(all_dmrs$state != 2))
message(" Valid DMRs: ", length(dmrs_idx))
message(sprintf("[%s] Done", Sys.time()))
return(all_dmrs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.