inst/examples/clustering-imputation/synthetic/melissa_cpg_eval_rep.R

# ------------------------------------------
# Set working directory and load libraries
# ------------------------------------------
if (interactive()) {cur.dir <- dirname(parent.frame(2)$ofile); setwd(cur.dir)}
R.utils::sourceDirectory("../../lib", modifiedOnly = FALSE)
suppressPackageStartupMessages(library(BPRMeth))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(ROCR))
set.seed(123)

melissa_cpg_analysis <- function(opts, sim){
    # Initialize lists
    melissa_prof = melissa_mean = eval_perf <- vector("list", length = length(opts$cpg_train_prcg))
    # Load synthetic data
    io <- list(data_file = paste0("high_noise_encode_data_", sim, ".rds"),
               data_dir = "../../local-data/melissa/synthetic/imputation/coverage/raw/data-sims/")
    obj <- readRDS(paste0(io$data_dir, io$data_file))
    i <- 1
    # Iterate
    for (m in opts$cpg_train_prcg) {
        # Partition to training and test sets
        dt <- partition_dataset(X = obj$synth_data$X, region_train_prcg = opts$region_train_prcg,
                                cpg_train_prcg = m, is_synth = TRUE)
        # Using methylation profiles
        prof_obj <- melissa_vb(X = dt$train, K = opts$K, basis = opts$basis_prof,
                               vb_max_iter = opts$vb_max_iter, epsilon_conv = opts$epsilon_conv,
                               is_kmeans = opts$is_kmeans, vb_init_nstart = opts$vb_init_nstart,
                               vb_init_max_iter = opts$vb_init_max_iter, is_parallel = opts$is_parallel,
                               no_cores = opts$no_cores, is_verbose = FALSE)
        # Using methylation rate
        mean_obj <- melissa_vb(X = dt$train, K = opts$K, basis = opts$basis_mean,
                               vb_max_iter = opts$vb_max_iter, epsilon_conv = opts$epsilon_conv,
                               is_kmeans = opts$is_kmeans, vb_init_nstart = opts$vb_init_nstart,
                               vb_init_max_iter = opts$vb_init_max_iter, is_parallel = opts$is_parallel,
                               no_cores = opts$no_cores, is_verbose = FALSE)
        melissa_prof[[i]] <- prof_obj
        melissa_mean[[i]] <- mean_obj
        # Evalute model performance
        eval_prof <- eval_performance(obj = prof_obj, test = dt$test)
        eval_mean <- eval_performance(obj = mean_obj, test = dt$test)
        eval_perf[[i]] <- list(eval_prof = eval_prof, eval_mean = eval_mean)

        ##----------------------------------------------------------------------
        message("Computing AUC...")
        ##----------------------------------------------------------------------
        pred_prof <- prediction(eval_prof$pred_obs, eval_prof$act_obs)
        # roc_prof <- performance(pred_prof, "tpr", "fpr")
        auc_prof <- performance(pred_prof, "auc")
        auc_prof <- unlist(auc_prof@y.values)
        message(auc_prof)

        pred_mean <- prediction(eval_mean$pred_obs, eval_mean$act_obs)
        # roc_mean <- performance(pred_mean, "tpr", "fpr")
        auc_mean <- performance(pred_mean, "auc")
        auc_mean <- unlist(auc_mean@y.values)
        message(auc_mean)
        i <- i + 1 # Increase counter
    }
    obj <- list(melissa_prof = melissa_prof, melissa_rate = melissa_mean,
                eval_perf = eval_perf, opts = opts)
    return(obj)
}

##------------------------
# Load synthetic data
##------------------------
io <- list(data_file = paste0("raw/data-sims/high_noise_encode_data_1.rds"),
           out_dir = "../../local-data/melissa/synthetic/imputation/coverage/")
obj <- readRDS(paste0(io$out_dir, io$data_file))
opts                   <- obj$opts       # Get options
opts$delta_0           <- rep(3, opts$K) # Dirichlet prior
opts$alpha_0           <- 0.5            # Gamma prior
opts$beta_0            <- 10             # Gamma prior
opts$data_train_prcg   <- 0.1            # % of data to keep fully for training
opts$region_train_prcg <- 1              # % of regions kept for training
opts$cpg_train_prcg    <- seq(.1,.9,.1)  # % of CpGs kept for training in each region
opts$is_kmeans         <- TRUE           # Use K-means for initialization
opts$vb_max_iter       <- 300            # Maximum VB iterations
opts$epsilon_conv      <- 1e-4           # Convergence threshold for VB
opts$vb_init_nstart    <- 10             # Mini VB restarts
opts$vb_init_max_iter  <- 20             # Mini VB iteratiions
opts$is_parallel       <- TRUE           # Use parallelized version
opts$no_cores          <- 4              # Number of cores
rm(obj)

# Parallel analysis
no_cores_out <- BPRMeth:::.parallel_cores(no_cores = opts$total_sims,
                                          is_parallel = TRUE)
print(date())
obj <- parallel::mclapply(X = 1:opts$total_sims, FUN = function(sim)
    melissa_cpg_analysis(opts = opts, sim = sim), mc.cores = no_cores_out)
print(date())

##----------------------------------------------------------------------
message("Storing results...")
##----------------------------------------------------------------------
saveRDS(obj, file = paste0(io$out_dir, "high_noise_encode_melissa_K", opts$K,
                           "_rbf", opts$basis_prof$M,
                           "_dataTrain", opts$data_train_prcg,
                           "_regionTrain", opts$region_train_prcg,
                           "_clusterVar", opts$cluster_var, ".rds") )
andreaskapou/Melissa documentation built on June 12, 2020, 5:54 p.m.