R/find_informative_sites.R

Defines functions remove_close_sites find_informative_sites

Documented in find_informative_sites remove_close_sites

#' Discover informative CpG sites
#'
#' This function generates a set of informative CpG sites to estimate
#' the purity or the tumor content of a set of tumor samples.
#'
#' A new parameter, named \code{control_costraints}, is required to force the
#' selection of sites with upper/lower quartiles of control scores are below
#' beta-values given by \code{control_costraints}. Sites are divided into
#' \code{hyper} and \code{hypo} depending on their level of methylation with
#' respect to the average beta-score of normal samples.
#'
#' @param tumor_table A matrix of beta-values of tumor samples.
#' @param control_table A matrix of beta-values of control/normal samples.
#' @param auc A data.frame with AUC scores generated by \code{get_AUC}.
#' @param ref_table A data.frame with first two columns reporting genomic location (chromosome, genomic_coordinates).
#' @param cores Number of parallel processes.
#' @param max_sites Maximum number of sites to retrieve (half hyper-, half
#' hypo-methylated) (default=20).
#' @param min_distance Exclude sites located at less than `min_distance` from higher-ranking site (default = 1e6 bps).
#' @param percentiles A vector of length 2. Min and max percentiles to
#' select sites with beta values outside hypo- and hyper-ranges (default = c(0,100);
#' i.e. only min and max beta should be outside of ranges).
#' @param hyper_range A vector of length 2 with minimum lower and upper values
#' required to select hyper-methylated informative sites.
#' @param hypo_range A vector of length 2 with minimum lower and upper values
#' required to select hypo-methylated informative sites.
#' @param method How to select sites: "even" (half hyper-, half hypo-methylated sites),
#' "top" (highest AUC irregardless of hyper or hypomethylation), "hyper" (hyper-methylated sites only),
#' "hypo" (hypo-methylated, sites only).
#' @param control_costraints To select a site, "first quartile"/"third quartile" of control data must be above/below these beta-values.
#' @param full_info Return all informative sites with a column reporting wether to use a site or not (for debugging purposes).
#' @return A data.frame reporting probe names and type ("hyper" and "hypo") of informative sites.
#' @importFrom stats quantile
#' @importFrom utils head
#' @export
#' @examples
#' ## WARNING: The following code doesn't retrieve any informative site
#' ## It just shows how to use the tool
#' auc_data <- get_AUC(tumor_toy_data, control_toy_data)
#' info_sites <- find_informative_sites(tumor_toy_data, control_toy_data, auc_data, illumina27k_hg19)
find_informative_sites <- function(tumor_table, control_table, auc, ref_table, cores=1,
                                   max_sites = 20, min_distance = 1e6, percentiles = c(0, 100),
                                   hyper_range = c(min = .40, max = .90), hypo_range = c(min = .10, max = .60),
                                   control_costraints = c(.30, .70),
                                   method = c("even", "top", "hyper", "hypo"), full_info=FALSE){

    message(sprintf("[%s] # Find informative sites #", Sys.time()))

    # check parameters
    assertthat::assert_that(is.matrix(tumor_table))
    assertthat::assert_that(is.matrix(control_table))
    assertthat::assert_that(is.data.frame(auc))
    assertthat::assert_that(!is.null(rownames(tumor_table)))
    assertthat::assert_that(identical(rownames(tumor_table), rownames(control_table)))
    assertthat::assert_that(nrow(tumor_table) == nrow(auc))
    assertthat::assert_that(ncol(ref_table) >= 2)
    assertthat::assert_that(nrow(tumor_table) == nrow(ref_table))

    assertthat::assert_that(is.numeric(cores))
    assertthat::assert_that(is.numeric(max_sites))
    assertthat::assert_that(is.numeric(min_distance))
    assertthat::assert_that(is.logical(full_info))
    assertthat::assert_that(is.numeric(hyper_range))
    assertthat::assert_that(is.numeric(hypo_range))
    assertthat::assert_that(is.numeric(control_costraints))
    assertthat::assert_that(is.numeric(percentiles))

    assertthat::assert_that(length(hyper_range) == 2)
    assertthat::assert_that(length(hypo_range) == 2)
    assertthat::assert_that(length(control_costraints) == 2)
    assertthat::assert_that(length(percentiles) == 2)

    method <- match.arg(method)
    if (method == "even")
        assertthat::assert_that(max_sites %% 2 == 0, msg="method is set to 'even' but 'max_sites' is not even")

    message(sprintf("- Method: %s", method))
    message(sprintf("- Minimum distance between sites: %g bps", min_distance))
    message(sprintf("- Number of sites to retrieve: %i", max_sites))
    message(sprintf("- Hyper-methylated sites range: %.2f-%.2f", hyper_range[1], hyper_range[2]))
    message(sprintf("- Hypo-methylated sites range: %.2f-%.2f", hypo_range[1], hypo_range[2]))
    message(sprintf("- Control constraints: %.2f-%.2f", control_costraints[1], control_costraints[2]))
    message(sprintf("- Percentiles: %g-%g", percentiles[1], percentiles[2]))

    # minimum and maximum beta per site
    cl <- parallel::makeCluster(cores)
    message(sprintf("[%s] Compute min-/max- beta scores...", Sys.time()))
    min_beta <- suppressWarnings(parallel::parApply(cl, tumor_table, 1, quantile, probs = percentiles[1]/100, na.rm = TRUE))
    max_beta <- suppressWarnings(parallel::parApply(cl, tumor_table, 1, quantile, probs = percentiles[2]/100, na.rm = TRUE))

    message(sprintf("[%s] Compute control interquartiles...", Sys.time()))
    lower_quart <- suppressWarnings(parallel::parApply(cl, control_table, 1, quantile, probs = .25, na.rm = TRUE))
    upper_quart <- suppressWarnings(parallel::parApply(cl, control_table, 1, quantile, probs = .75, na.rm = TRUE))
    parallel::stopCluster(cl)

    message(sprintf("[%s] Find sites...", Sys.time()))
    diff_meth_sites_df <- cbind(auc,
                                data.frame(Max_beta = max_beta,
                                           Min_beta = min_beta,
                                           Upper_Quart = upper_quart,
                                           Lower_Quart = lower_quart,
                                           Chromosome = ref_table[[1]],
                                           Genomic_Coordinate = ref_table[[2]]))
    hyper_sites_idx <- with(diff_meth_sites_df, which(AUC > .80 & Min_beta < hyper_range[1] & Max_beta > hyper_range[2] & Upper_Quart < control_costraints[1]))
    hypo_sites_idx  <- with(diff_meth_sites_df, which(AUC < .20 & Min_beta < hypo_range[1]  & Max_beta > hypo_range[2]  & Lower_Quart > control_costraints[2]))
    site_type <- rep("no_differences", nrow(auc))
    site_type[hyper_sites_idx] <- "hyper"
    site_type[hypo_sites_idx] <- "hypo"
    diff_meth_sites_df$Site_type <- site_type
    diff_meth_sites_df$AUC[diff_meth_sites_df$Site_type == "Hypo"] <- 1-diff_meth_sites_df$AUC
    diff_meth_sites_df <- diff_meth_sites_df[order(diff_meth_sites_df$AUC, decreasing = TRUE),]

    hyper_sites_df <- diff_meth_sites_df[diff_meth_sites_df$Site_type == "hyper",]
    hypo_sites_df  <- diff_meth_sites_df[diff_meth_sites_df$Site_type == "hypo",]
    message(sprintf("* Total hyper-methylated sites = %i", nrow(hyper_sites_df)))
    message(sprintf("* Total hypo-methylated sites = %i", nrow(hypo_sites_df)))

    # avoid sites too close to previously selected sites
    hyper_sites_df$Keep_site <- remove_close_sites(hyper_sites_df, max_sites, min_distance)
    hypo_sites_df$Keep_site <- remove_close_sites(hypo_sites_df, max_sites, min_distance)

    if (full_info) {
        sites <- rbind(hyper_sites_df, hypo_sites_df)
        columns <- c("Probe", "Site_type", "Chromosome", "Genomic_Coordinate",
                     "AUC", "Max_beta", "Min_beta", "Upper_Quart",
                     "Lower_Quart","Keep_site")
        sites <- sites[,columns]
    } else {
        # filter sites and take selected number
        if (method == "even") {
            sites <- rbind(head(hyper_sites_df[hyper_sites_df$Keep_site,],max_sites/2),
                           head(hypo_sites_df[hypo_sites_df$Keep_site,],max_sites/2))
        } else if (method == "top") {
            sites <- rbind(hyper_sites_df, hypo_sites_df)
            sites <- sites[order(sites$AUC, decreasing = TRUE),]
            sites <- head(sites,max_sites)
        } else if (method == "hyper") {
            sites <- head(hyper_sites_df,max_sites)
        } else if (method == "hypo") {
            sites <- head(hypo_sites_df,max_sites)
        }
        sites <- sites[c("Probe", "Site_type", "Chromosome", "Genomic_Coordinate")]
        message(sprintf("* Retrieved hyper-methylated sites = %i", sum(sites$Site_type=="hyper")))
        message(sprintf("* Retrieved hypo-methylated sites = %i", sum(sites$Site_type=="hypo")))
    }
    rownames(sites) <- NULL
    message(sprintf("[%s] Done", Sys.time()))
    return(sites)
}

#' Remove close sites
#'
#' @param sites_df A data.frame
#' @param min_distanc A number
#' @keywords internal
#' @importFrom stats dist
#' @importFrom utils adist
remove_close_sites <- function(sites_df, max_sites, min_distance){

    keep_sites <- logical(nrow(sites_df))
    kept_sites <- 0
    i <- 1
    if (nrow(sites_df) > 0){
        diff_chr <- as.matrix(adist(sites_df$Chromosome)) > 0
        above_distance <- as.matrix(dist(sites_df$Genomic_Coordinate)) >= min_distance

        keep_sites[1] <- TRUE
        kept_sites <- kept_sites + 1
        i <- i + 1
    }
    while (kept_sites < max_sites & i <= nrow(sites_df)){
        check_all <- all(diff_chr[i, seq_len(i-1)] | above_distance[i, seq_len(i-1)])
        check_all <- check_all[!is.na(check_all)]
        if (length(check_all) > 0 & isTRUE(all(check_all))) {
            keep_sites[i] <- TRUE
            kept_sites <- kept_sites + 1
        }
        i <- i + 1
    }
    return(keep_sites)
}
cgplab/PAMES documentation built on Dec. 4, 2022, 6:35 p.m.