#' Discover informative CpG sites
#'
#' This function generates a set of informative CpG sites to estimate
#' the purity or the tumor content of a set of tumor samples.
#'
#' A new parameter, named \code{control_costraints}, is required to force the
#' selection of sites with upper/lower quartiles of control scores are below
#' beta-values given by \code{control_costraints}. Sites are divided into
#' \code{hyper} and \code{hypo} depending on their level of methylation with
#' respect to the average beta-score of normal samples.
#'
#' @param tumor_table A matrix of beta-values of tumor samples.
#' @param control_table A matrix of beta-values of control/normal samples.
#' @param auc A data.frame with AUC scores generated by \code{get_AUC}.
#' @param ref_table A data.frame with first two columns reporting genomic location (chromosome, genomic_coordinates).
#' @param cores Number of parallel processes.
#' @param max_sites Maximum number of sites to retrieve (half hyper-, half
#' hypo-methylated) (default=20).
#' @param min_distance Exclude sites located at less than `min_distance` from higher-ranking site (default = 1e6 bps).
#' @param percentiles A vector of length 2. Min and max percentiles to
#' select sites with beta values outside hypo- and hyper-ranges (default = c(0,100);
#' i.e. only min and max beta should be outside of ranges).
#' @param hyper_range A vector of length 2 with minimum lower and upper values
#' required to select hyper-methylated informative sites.
#' @param hypo_range A vector of length 2 with minimum lower and upper values
#' required to select hypo-methylated informative sites.
#' @param method How to select sites: "even" (half hyper-, half hypo-methylated sites),
#' "top" (highest AUC irregardless of hyper or hypomethylation), "hyper" (hyper-methylated sites only),
#' "hypo" (hypo-methylated, sites only).
#' @param control_costraints To select a site, "first quartile"/"third quartile" of control data must be above/below these beta-values.
#' @param full_info Return all informative sites with a column reporting wether to use a site or not (for debugging purposes).
#' @return A data.frame reporting probe names and type ("hyper" and "hypo") of informative sites.
#' @importFrom stats quantile
#' @importFrom utils head
#' @export
#' @examples
#' ## WARNING: The following code doesn't retrieve any informative site
#' ## It just shows how to use the tool
#' auc_data <- get_AUC(tumor_toy_data, control_toy_data)
#' info_sites <- find_informative_sites(tumor_toy_data, control_toy_data, auc_data, illumina27k_hg19)
find_informative_sites <- function(tumor_table, control_table, auc, ref_table, cores=1,
max_sites = 20, min_distance = 1e6, percentiles = c(0, 100),
hyper_range = c(min = .40, max = .90), hypo_range = c(min = .10, max = .60),
control_costraints = c(.30, .70),
method = c("even", "top", "hyper", "hypo"), full_info=FALSE){
message(sprintf("[%s] # Find informative sites #", Sys.time()))
# check parameters
assertthat::assert_that(is.matrix(tumor_table))
assertthat::assert_that(is.matrix(control_table))
assertthat::assert_that(is.data.frame(auc))
assertthat::assert_that(!is.null(rownames(tumor_table)))
assertthat::assert_that(identical(rownames(tumor_table), rownames(control_table)))
assertthat::assert_that(nrow(tumor_table) == nrow(auc))
assertthat::assert_that(ncol(ref_table) >= 2)
assertthat::assert_that(nrow(tumor_table) == nrow(ref_table))
assertthat::assert_that(is.numeric(cores))
assertthat::assert_that(is.numeric(max_sites))
assertthat::assert_that(is.numeric(min_distance))
assertthat::assert_that(is.logical(full_info))
assertthat::assert_that(is.numeric(hyper_range))
assertthat::assert_that(is.numeric(hypo_range))
assertthat::assert_that(is.numeric(control_costraints))
assertthat::assert_that(is.numeric(percentiles))
assertthat::assert_that(length(hyper_range) == 2)
assertthat::assert_that(length(hypo_range) == 2)
assertthat::assert_that(length(control_costraints) == 2)
assertthat::assert_that(length(percentiles) == 2)
method <- match.arg(method)
if (method == "even")
assertthat::assert_that(max_sites %% 2 == 0, msg="method is set to 'even' but 'max_sites' is not even")
message(sprintf("- Method: %s", method))
message(sprintf("- Minimum distance between sites: %g bps", min_distance))
message(sprintf("- Number of sites to retrieve: %i", max_sites))
message(sprintf("- Hyper-methylated sites range: %.2f-%.2f", hyper_range[1], hyper_range[2]))
message(sprintf("- Hypo-methylated sites range: %.2f-%.2f", hypo_range[1], hypo_range[2]))
message(sprintf("- Control constraints: %.2f-%.2f", control_costraints[1], control_costraints[2]))
message(sprintf("- Percentiles: %g-%g", percentiles[1], percentiles[2]))
# minimum and maximum beta per site
cl <- parallel::makeCluster(cores)
message(sprintf("[%s] Compute min-/max- beta scores...", Sys.time()))
min_beta <- suppressWarnings(parallel::parApply(cl, tumor_table, 1, quantile, probs = percentiles[1]/100, na.rm = TRUE))
max_beta <- suppressWarnings(parallel::parApply(cl, tumor_table, 1, quantile, probs = percentiles[2]/100, na.rm = TRUE))
message(sprintf("[%s] Compute control interquartiles...", Sys.time()))
lower_quart <- suppressWarnings(parallel::parApply(cl, control_table, 1, quantile, probs = .25, na.rm = TRUE))
upper_quart <- suppressWarnings(parallel::parApply(cl, control_table, 1, quantile, probs = .75, na.rm = TRUE))
parallel::stopCluster(cl)
message(sprintf("[%s] Find sites...", Sys.time()))
diff_meth_sites_df <- cbind(auc,
data.frame(Max_beta = max_beta,
Min_beta = min_beta,
Upper_Quart = upper_quart,
Lower_Quart = lower_quart,
Chromosome = ref_table[[1]],
Genomic_Coordinate = ref_table[[2]]))
hyper_sites_idx <- with(diff_meth_sites_df, which(AUC > .80 & Min_beta < hyper_range[1] & Max_beta > hyper_range[2] & Upper_Quart < control_costraints[1]))
hypo_sites_idx <- with(diff_meth_sites_df, which(AUC < .20 & Min_beta < hypo_range[1] & Max_beta > hypo_range[2] & Lower_Quart > control_costraints[2]))
site_type <- rep("no_differences", nrow(auc))
site_type[hyper_sites_idx] <- "hyper"
site_type[hypo_sites_idx] <- "hypo"
diff_meth_sites_df$Site_type <- site_type
diff_meth_sites_df$AUC[diff_meth_sites_df$Site_type == "Hypo"] <- 1-diff_meth_sites_df$AUC
diff_meth_sites_df <- diff_meth_sites_df[order(diff_meth_sites_df$AUC, decreasing = TRUE),]
hyper_sites_df <- diff_meth_sites_df[diff_meth_sites_df$Site_type == "hyper",]
hypo_sites_df <- diff_meth_sites_df[diff_meth_sites_df$Site_type == "hypo",]
message(sprintf("* Total hyper-methylated sites = %i", nrow(hyper_sites_df)))
message(sprintf("* Total hypo-methylated sites = %i", nrow(hypo_sites_df)))
# avoid sites too close to previously selected sites
hyper_sites_df$Keep_site <- remove_close_sites(hyper_sites_df, max_sites, min_distance)
hypo_sites_df$Keep_site <- remove_close_sites(hypo_sites_df, max_sites, min_distance)
if (full_info) {
sites <- rbind(hyper_sites_df, hypo_sites_df)
columns <- c("Probe", "Site_type", "Chromosome", "Genomic_Coordinate",
"AUC", "Max_beta", "Min_beta", "Upper_Quart",
"Lower_Quart","Keep_site")
sites <- sites[,columns]
} else {
# filter sites and take selected number
if (method == "even") {
sites <- rbind(head(hyper_sites_df[hyper_sites_df$Keep_site,],max_sites/2),
head(hypo_sites_df[hypo_sites_df$Keep_site,],max_sites/2))
} else if (method == "top") {
sites <- rbind(hyper_sites_df, hypo_sites_df)
sites <- sites[order(sites$AUC, decreasing = TRUE),]
sites <- head(sites,max_sites)
} else if (method == "hyper") {
sites <- head(hyper_sites_df,max_sites)
} else if (method == "hypo") {
sites <- head(hypo_sites_df,max_sites)
}
sites <- sites[c("Probe", "Site_type", "Chromosome", "Genomic_Coordinate")]
message(sprintf("* Retrieved hyper-methylated sites = %i", sum(sites$Site_type=="hyper")))
message(sprintf("* Retrieved hypo-methylated sites = %i", sum(sites$Site_type=="hypo")))
}
rownames(sites) <- NULL
message(sprintf("[%s] Done", Sys.time()))
return(sites)
}
#' Remove close sites
#'
#' @param sites_df A data.frame
#' @param min_distanc A number
#' @keywords internal
#' @importFrom stats dist
#' @importFrom utils adist
remove_close_sites <- function(sites_df, max_sites, min_distance){
keep_sites <- logical(nrow(sites_df))
kept_sites <- 0
i <- 1
if (nrow(sites_df) > 0){
diff_chr <- as.matrix(adist(sites_df$Chromosome)) > 0
above_distance <- as.matrix(dist(sites_df$Genomic_Coordinate)) >= min_distance
keep_sites[1] <- TRUE
kept_sites <- kept_sites + 1
i <- i + 1
}
while (kept_sites < max_sites & i <= nrow(sites_df)){
check_all <- all(diff_chr[i, seq_len(i-1)] | above_distance[i, seq_len(i-1)])
check_all <- check_all[!is.na(check_all)]
if (length(check_all) > 0 & isTRUE(all(check_all))) {
keep_sites[i] <- TRUE
kept_sites <- kept_sites + 1
}
i <- i + 1
}
return(keep_sites)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.