R/find_informative_regions.R

Defines functions find_informative_regions

Documented in find_informative_regions

#' Select informative regions (extended)
#'
#' This function generates a list of informative regions to estimate the purity
#' or the tumor content of a set of tumor samples.
#'
#' A new parameter, named \code{control_costraints}, is required force
#' selection of sites where upper/lower quartiles of control scores are below
#' beta-values given by \code{control_costraints}.  Regions are divided into
#' \code{hyper} and \code{hypo} depending on their level of methylation with
#' respect to the average beta-score of normal samples.
#'
#' @param tumor_table A matrix of beta-values of tumor samples.
#' @param control_table A matrix of beta-values of control/normal samples.
#' @param auc A data.frame with AUC scores generated by \code{get_AUC}.
#' @param cores Number of parallel processes.
#' @param max_regions Maximum number of regions to retrieve (half hyper-, half
#' hypo-methylated) (default=20).
#' @param percentiles A vector of length 2. Min and max percentiles to
#' select sites with beta values outside hypo- and hyper-ranges (default = c(0,100);
#' i.e. only min and max beta should be outside of ranges).
#' @param hyper_range A vector of length 2 with minimum lower and upper values
#' required to select hyper-methylated informative sites.
#' @param hypo_range A vector of length 2 with minimum lower and upper values
#' required to select hypo-methylated informative sites.
#' @param method How to select sites: "even" (half hyper-, half hypo-methylated sites),
#' "top" (highest AUC irregardless of hyper or hypomethylation), "hyper" (hyper-methylated sites only),
#' "hypo" (hypo-methylated, sites only).
#' @param control_costraints To select a site, "first quartile"/"third quartile" of control data must be above/below these beta-values.
#' @param full_info Return all informative sites (for debugging purposes).
#' @return A data.frame reporing region names (chr_position) and type ("hyper" and "hypo") of informative regions.
#' @importFrom stats quantile
#' @export
#' @examples
#' reduced_data <- reduce_to_regions(bs_seq_toy_matrix, bs_seq_toy_sites, cpg_islands[1:1000,])
#' auc_data <- get_AUC(reduced_data[,1:10], reduced_data[,11:20])
#' info_regions <- find_informative_regions(reduced_data[,1:10], reduced_data[,11:20], auc_data)
find_informative_regions <- function(tumor_table, control_table, auc, cores=1,
                                     max_regions = 20, percentiles = c(0,100),
                                     hyper_range = c(min = .40, max = .90), hypo_range = c(min = .10, max = .60),
                                     control_costraints = c(.30, .70),
                                     method = c("even", "top", "hyper", "hypo"), full_info=FALSE){

    message(sprintf("[%s] # Find informative regions #", Sys.time()))

    # check parameters
    assertthat::assert_that(is.matrix(tumor_table))
    assertthat::assert_that(is.matrix(control_table))
    assertthat::assert_that(!is.null(rownames(tumor_table)))
    assertthat::assert_that(identical(rownames(tumor_table), rownames(control_table)))
    assertthat::assert_that(nrow(tumor_table) == nrow(auc))

    assertthat::assert_that(is.matrix(tumor_table))
    assertthat::assert_that(is.matrix(control_table))
    assertthat::assert_that(nrow(tumor_table) == nrow(auc))

    assertthat::assert_that(is.numeric(cores))
    assertthat::assert_that(is.numeric(max_regions))
    assertthat::assert_that(is.numeric(hyper_range))
    assertthat::assert_that(is.numeric(hypo_range))
    assertthat::assert_that(is.numeric(control_costraints))
    assertthat::assert_that(is.numeric(percentiles))

    assertthat::assert_that(length(hyper_range) == 2)
    assertthat::assert_that(length(hypo_range) == 2)
    assertthat::assert_that(length(control_costraints) == 2)
    assertthat::assert_that(length(percentiles) == 2)

    method <- match.arg(method)
    if (method == "even")
        assertthat::assert_that(max_regions %% 2 == 0, msg="method is set to 'even' but max_regions is not even")

    assertthat::assert_that(is.logical(full_info))

    message(sprintf("- Method: %s", method))
    message(sprintf("- Number of regions to retrieve: %i", max_regions))
    message(sprintf("- Hyper-methylated regions range: %.2f-%.2f", hyper_range[1], hyper_range[2]))
    message(sprintf("- Hypo-methylated regions range: %.2f-%.2f", hypo_range[1], hypo_range[2]))
    message(sprintf("- Control constraints: %.2f-%.2f", control_costraints[1], control_costraints[2]))
    message(sprintf("- Percentiles: %g-%g", percentiles[1], percentiles[2]))

    # minimum and maximum beta per region
    cl <- parallel::makeCluster(cores)
    message(sprintf("[%s] Compute min-/max- beta scores...", Sys.time()))
    min_beta <- suppressWarnings(parallel::parApply(cl, tumor_table, 1, quantile, probs = percentiles[1]/100, na.rm = TRUE))
    max_beta <- suppressWarnings(parallel::parApply(cl, tumor_table, 1, quantile, probs = percentiles[2]/100, na.rm = TRUE))

    message(sprintf("[%s] Compute control interquartiles...", Sys.time()))
    lower_quart <- suppressWarnings(parallel::parApply(cl, control_table, 1, quantile, probs = .25, na.rm = TRUE))
    upper_quart <- suppressWarnings(parallel::parApply(cl, control_table, 1, quantile, probs = .75, na.rm = TRUE))

    message(sprintf("[%s] Select regions...", Sys.time()))
    diff_meth_regions_df <- cbind(auc,
                                  data.frame(Max_beta = max_beta,
                                             Min_beta = min_beta,
                                             Lower_Quart = lower_quart,
                                             Upper_Quart = upper_quart))
    hyper_regions_idx <- with(diff_meth_regions_df, which(AUC > .80 & Min_beta < hyper_range[1] & Max_beta > hyper_range[2] & Upper_Quart < control_costraints[1]))
    hypo_regions_idx  <- with(diff_meth_regions_df, which(AUC < .20 & Min_beta < hypo_range[1]  & Max_beta > hypo_range[2]  & Lower_Quart > control_costraints[2]))
    regions_type <- rep("no_differences", nrow(auc))
    regions_type[hyper_regions_idx] <- "hyper"
    regions_type[hypo_regions_idx] <- "hypo"
    diff_meth_regions_df$Region_type <- regions_type
    diff_meth_regions_df$AUC[diff_meth_regions_df$eegions_type == "Hypo"] <- 1-diff_meth_regions_df$AUC
    diff_meth_regions_df <- diff_meth_regions_df[order(diff_meth_regions_df$AUC, decreasing = TRUE),]

    hyper_regions_df <- diff_meth_regions_df[diff_meth_regions_df$Region_type == "hyper",]
    hypo_regions_df  <- diff_meth_regions_df[diff_meth_regions_df$Region_type == "hypo",]
    message(sprintf("* Total hyper-methylated regions = %i", nrow(hyper_regions_df)))
    message(sprintf("* Total hypo-methylated regions = %i", nrow(hypo_regions_df)))

    if (full_info) {
        regions <- rbind(hyper_regions_df, hypo_regions_df)
        columns <- c("Probe", "Region_type",
                     "AUC", "Max_beta", "Min_beta", "Upper_Quart", "Lower_Quart")
        regions <- regions[,columns]
    } else {
        if (method == "even") {
            regions <- rbind(head(hyper_regions_df[hyper_regions_df$Keep_site,],max_regions/2),
                             head(hypo_regions_df[hypo_regions_df$Keep_site,],max_regions/2))
        } else if (method == "top") {
            regions <- rbind(hyper_regions_df, hypo_regions_df)
            regions <- regions[order(regions$AUC, decreasing = TRUE),]
            regions <- head(regions,max_regions)
        } else if (method == "hyper") {
            regions <- head(hyper_regions_df,max_regions)
        } else if (method == "hypo") {
            regions <- head(hypo_regions_df,max_regions)
        }
        regions <- regions[c("Probe", "Region_type")]
        message(sprintf("* Retrieved hyper-methylated regions = %i", sum(regions$Region_type=="hyper")))
        message(sprintf("* Retrieved hypo-methylated regions = %i", sum(regions$Region_type=="hypo")))
    }
    rownames(regions) <- NULL
    message(sprintf("[%s] Done", Sys.time()))
    return(regions)
}
cgplab/PAMES documentation built on Dec. 4, 2022, 6:35 p.m.