#' Build a List of Precision-Recall Curve Objects
#'
#' @description Given a directory of best-performing results files from one of
#' the simulation functions (\code{\link{WriteDMRcateResults}},
#' \code{\link{WriteProbeLassoResults}},
#' \code{\link{WriteBumphunterResults}}, or results from the \code{Comb-p}
#' method in \code{Python}), import the raw data files and construct PR-curve
#' objects via the \code{\link[PRROC]{pr.curve}} function.
#'
#' @param bestResultsDir The name of the directory where the method results from
#' the best-performing parameter settings are stored. For the full design we
#' have included (\code{delta = c(0.025, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4)} and
#' \code{seed = c(100, 210, 330, 450, 680)}), this directory should contain
#' 35 \code{.RDS} files per method.
#' @param delta A treatment size corresponding to one of the simulations with
#' completed results files in the \code{bestResultsDir} directory.
#' @param seed A seed value corresponding to one of the simulations with
#' completed results files in the \code{bestResultsDir} directory.
#' @param beta_mat A beta value matrix for selected methylation samples from a
#' 450k methylation array with CPG IDs as the row names and sample IDs as
#' the column names. An example is given in the \code{betaVals_mat} data set.
#' @param AclustCPG_df A data frame of beta values and CpG information for
#' clusters of CpGs over a 450k methylation array. The rows correspond to the
#' CPGs. The columns have information on the cluster number, chromosome,
#' cluster start and end locations, and the beta values for each subject
#' grouped by some clinical indicator (e.g. case v. control). An example is
#' given in the \code{startEndCPG_df} data set.
#' @param CPGs_df An annotation table that indicates locations of CpGs.
#' This data frame has CPG IDs as the rows with matching chromosome and
#' location info in the columns. Specifically, the columns are: \code{ILMNID}
#' - the CPG ID; \code{chr} - the chromosome label; and \code{MAPINFO} -
#' the chromosome location. An example is given in the \code{cpgLocation_df}
#' data set. This data set is only necessary if the results directory
#' contains Comb-p results with the specified \code{delta} and \code{seed}
#' values.
#' @param min.cpgs The minimum number of CPGs necessary to consider a result
#' significant. Defaults to 5. This argument is only required if the results
#' directory contains Comb-p results with the specified \code{delta} and
#' \code{seed} values.
#'
#' @return A list of PR-curve objects, to be plotted via the
#' \code{\link{PlotPRCurve}} function.
#'
#' @importFrom stats aggregate
#' @importFrom stats as.formula
#' @importFrom PRROC pr.curve
#'
#' @export
#'
#' @examples
#' \dontrun{
#' BuildPRcurve(
#' bestResultsDir = "best_cases_results/",
#' delta = 0.4,
#' seed = 100
#' )
#' }
BuildPRcurve <- function(bestResultsDir,
delta = c(0.025, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4),
seed = c(100, 210, 330, 450, 680),
beta_mat = betaVals_mat,
AclustCPG_df = startEndCPG_df,
CPGs_df = cpgLocation_df,
min.cpgs = 5){
### Gold Standard ###
treatment_ls <- SimulateData(beta_mat = beta_mat,
Aclusters_df = AclustCPG_df,
delta_num = delta,
seed_int = seed)
trueClusters_df <- treatment_ls$simAclusters_df
### List Results Files ###
fileNames_char <- list.files(bestResultsDir)
targetNames_char <- paste0("delta", delta, "_seed", seed)
correctFiles_idx <- grep(targetNames_char, fileNames_char)
correctNames_char <- fileNames_char[correctFiles_idx]
names(correctNames_char) <- sapply(
strsplit(correctNames_char, "_"),
function(x){
gsub(pattern = "Results", replacement = "", x[1])
}
)
### Load and Clean Appropriate Files ###
allRes_ls <- lapply(correctNames_char, function(x){
res_ls <- readRDS(paste0(bestResultsDir, x))
if(grepl("Combp", x)){
results_df <- StandardizeOutput(
methodOut_df = res_ls[[1]],
method = "Comb_p",
cpgLocation_df = CPGs_df
)
# The raw Comb-p results were filtered to > 1, not > 4.
results_df <- results_df[results_df$dmr.n.cpgs >= min.cpgs, ]
res_ls[[1]] <- results_df
}
CleanResults(dmrResults_ls = res_ls,
Aclusters_df = trueClusters_df)
})
### Build PR Curves ###
allPRs_ls <- lapply(allRes_ls, function(cleanDMR_df){
if(!is.null(cleanDMR_df$dmr.n.cpgs)){
# This is because ProbeLasso randomly sucks for small delta.
x_df <- cleanDMR_df[, c("aclust.order", "dmr.pval", "actual")]
x_df$status <- ifelse(x_df$actual == "positive", 1, 0)
x_df$dmr.pval[is.na(x_df$dmr.pval)] <- 1
# take min pvalue of dmrs
agg_fmla <- as.formula("dmr.pval ~ aclust.order + actual + status")
x_df <- aggregate(agg_fmla, data = x_df, FUN = min)
# PR curve
pr.curve(scores.class0 = 1 - x_df$dmr.pval,
weights.class0 = x_df$status,
curve = TRUE)
} else {
NULL
}
})
### Return ###
attr(allPRs_ls, "delta") <- delta
attr(allPRs_ls, "repl") <- which(c(100, 210, 330, 450, 680) %in% seed)
allPRs_ls
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.