#' Estimate the mean predictive accuracy and mean protein importance over all
#' the simulated datasets
#' @details This function fits the classification model,
#' in order to classify the subjects in each simulated training dataset (the
#' output of \code{\link{simulateDataset}}).
#' Then the fitted model is validated on the (simulated) validation set (the
#' output of \code{\link{simulateDataset}}).
#' Two performance measures are reported:
#'
#' (1) the mean predictive accuracy : The function trains classifier on each
#' simulated training dataset and reports the predictive accuracy of the trained
#' classifier on the validation data (output of \code{\link{simulateDataset}} function).
#' Then these predictive accuracies are averaged over all the simulation.
#'
#' (2) the mean protein importance : It represents the importance of a protein
#' in separating different groups. It is estimated on each simulated training
#' dataset using function `varImp` from package caret. Please refer to the help
#' file of `varImp` about how each classifier calculates the protein importance.
#' Then these importance values for each protein are averaged over all the simulation.
#'
#' The list of classification models trained on each simulated dataset,
#' the predictive accuracy on the validation set predicted by the corresponding
#' classification model and the importance value for all the proteins estimated
#' by the corresponding classification model are also reported.
#'
#' @param simulations A list of simulated datasets. It should be the name of the
#' output of \code{\link{simulateDataset}} function.
#' @param classifier A string specifying which classifier to use. This function
#' uses function `train' from package caret. The options are 1) rf (random
#' forest calssifier, default option). 2) nnet (neural network), 3) svmLinear
#' (support vector machines with linear kernel), 4) logreg (logistic regression),
#' and 5) naive_bayes (naive_bayes).
#' @param top_K the number of proteins selected as important features
#' (biomarker candidates). All the proteins are ranked in descending order based
#' on its importance to separate different groups and the `top_K` proteins are
#' selected as important features.
#' @param parallel Default is FALSE. If TRUE, parallel computation is performed.
#'
#' @return \emph{num_proteins} is the number of simulated proteins. It should be
#' the same as one of the output from \emph{simulateDataset}, called \emph{num_proteins}
#' @return \emph{num_samples} is a vector with the number of simulated samples
#' in each condition. It should be the same as one of the output from
#' \emph{simulateDataset}, called \emph{num_samples}
#' @return \emph{mean_predictive_accuracy} is the mean predictive accuracy over
#' all the simulated datasets, which have same `num_proteins' and `num_samples'.
#' @return \emph{mean_feature_importance} is the mean protein importance vector
#' over all the simulated datasets, the length of which is `num_proteins'.
#' @return \emph{predictive_accuracy} is a vector of predictive accuracy on each
#' simulated dataset.
#' @return \emph{feature_importance} is a matrix of feature importance, where
#' rows are proteins and columns are simulated datasets.
#' @author Ting Huang, Meena Choi, Sumedh Sankhe, Olga Vitek
#'
#' @examples data(OV_SRM_train)
#' data(OV_SRM_train_annotation)
#'
#' # num_simulations = 10: simulate 10 times
#' # protein_rank = "mean", protein_select = "high", and protein_quantile_cutoff = 0.0:
#' # select the proteins with high mean abundance based on the protein_quantile_cutoff
#' # expected_FC = "data": fold change estimated from OV_SRM_train
#' # simulate_validation = FALSE: use input OV_SRM_train as validation set
#' # valid_samples_per_group = 50: 50 samples per condition
#' simulated_datasets <- simulateDataset(data = OV_SRM_train,
#' annotation = OV_SRM_train_annotation,
#' log2Trans = FALSE,
#' num_simulations = 10,
#' samples_per_group = 50,
#' protein_rank = "mean",
#' protein_select = "high",
#' protein_quantile_cutoff = 0.0,
#' expected_FC = "data",
#' list_diff_proteins = NULL,
#' simulate_validation = FALSE,
#' valid_samples_per_group = 50)
#'
#' # run classification on simulated datasets without parallel computation
#' classification_results <- designSampleSizeClassification(simulations = simulated_datasets,
#' parallel = FALSE)
#'
#' # the number of simulated proteins
#' classification_results$num_proteins
#'
#' # a vector with the number of simulated samples in each condition
#' classification_results$num_samples
#'
#' # the mean predictive accuracy over all the simulated datasets,
#' # which have same 'num_proteins' and 'num_samples'
#' classification_results$mean_predictive_accuracy
#'
#' # the mean protein importance vector over all the simulated datasets,
#' # the length of which is 'num_proteins'.
#' head(classification_results$mean_feature_importance)
#'
#' @importFrom caret train trainControl varImp predict.train
#' @importFrom BiocParallel bplapply
#' @importFrom stats rnorm predict
#' @importFrom utils sessionInfo read.table write.table txtProgressBar setTxtProgressBar
#' @export
designSampleSizeClassification <- function(simulations,
                                           classifier = "rf",
                                           top_K = 10,
                                           parallel = FALSE, ...) {
    ## Log-file handling: each processing step writes status messages to `conn`.
    dots <- list(...)
    session <- dots$session
    conn <- .find_log(...)
    func <- as.list(sys.call())[[1]]

    res <- .catch_faults({
        ## 1. Input must be the output of simulateDataset().
        ##    Use scalar `||` (short-circuit) in a scalar `if`, not vectorized `|`.
        if (!is.element("simulation_train_Xs", names(simulations)) ||
            !is.element("simulation_train_Ys", names(simulations))) {
            stop("CALL_", func, "Please use 'simulateDataset' first. Then use ",
                 "output of simulateDataset function as input in ",
                 "designSampleSizeClassification.")
        }

        ## 2. Validate the classifier option (membership test instead of
        ##    element-wise comparison against the whole vector).
        valid_classifiers <- c("rf", "nnet", "svmLinear", "logreg", "naive_bayes")
        if (!is.element(classifier, valid_classifiers)) {
            stop("CALL_", func, "`classifier` should be one of 'rf', 'nnet',",
                 "'svmLinear', 'logreg', and 'naive_bayes'. Please check it.")
        }
        .status(sprintf("classifier : %s", classifier), log = conn$con,
                func = func)

        ## 3. Validate the top_K option: required, and bounded by the number of
        ##    simulated proteins. Scalar `||` here as well.
        if (is.null(top_K)) {
            stop("CALL_", func,
                 "top_K is required. Please provide the value for top_K.")
        } else if (top_K < 0 || top_K > simulations$num_proteins) {
            ## Single-line message: the original sprintf embedded a raw newline
            ## and indentation spaces into the user-facing error text.
            stop("CALL_", func,
                 sprintf("_top_K should be between 0 and the total number of protein (%s). Please check the value for top_K",
                         simulations$num_proteins))
        }
        .status(sprintf("top_K = %s", top_K), log = conn$con)

        ## Train one classifier per simulated training set, then evaluate each
        ## trained model on the shared (simulated) validation set.
        .status("Start to train classifier...", log = conn$con)
        iter <- length(simulations$simulation_train_Xs)  # number of simulations
        num_proteins <- simulations$num_proteins
        num_samples <- simulations$num_samples
        valid_x <- simulations$valid_X
        valid_y <- simulations$valid_Y

        ## Hyper-parameter grid for caret::train; logistic regression has
        ## nothing to tune, so the grid stays NULL.
        tunegrid <- NULL
        if (classifier != "logreg") {
            tunegrid <- .tuning_params(classifier = classifier, ...)
        }

        ## One argument list shared by the serial and parallel code paths,
        ## so the two branches cannot drift apart.
        fit_args <- list(seq_len(iter), .classification_performance,
                         classifier = classifier,
                         train_x_list = simulations$simulation_train_Xs,
                         train_y_list = simulations$simulation_train_Ys,
                         valid_x = valid_x, valid_y = valid_y,
                         top_K = top_K, tunegrid = tunegrid)
        if (parallel) {
            .status("Using parallel backend", log = conn$con)
            ## bplapply dispatches one simulation per worker.
            results <- do.call(bplapply, fit_args)
        } else {
            results <- do.call(lapply, fit_args)
        }

        ## Predictive accuracy of each simulation (vapply enforces one numeric
        ## accuracy per simulation instead of growing a vector with c()).
        PA <- vapply(results, function(r) as.numeric(r$acc), numeric(1))

        ## Per-simulation 0/1 indicator for each protein: 1 when the protein
        ## was selected among the top_K important features in that simulation.
        ## Columns are built up front instead of cbind-ing inside a loop.
        protein_names <- names(valid_x)
        imp_cols <- lapply(results, function(r) {
            ifelse(protein_names %in% r$f_imp, 1, 0)
        })
        FI <- data.frame(proteins = protein_names, imp_cols,
                         stringsAsFactors = FALSE)
        sim_labels <- paste0("simulation", seq_along(results))
        names(FI) <- c("proteins", sim_labels)
        names(PA) <- sim_labels

        ## Report that training and validation are done
        .status("Finish to train classifier and to check the performance.",
                log = conn$con)

        ## Mean predictive accuracy over all simulations
        meanPA <- mean(PA)
        ## Selection frequency of each protein across simulations.
        ## drop = FALSE keeps FI[, -1] a data.frame even with a single
        ## simulation column (otherwise rowSums() would fail on a vector).
        meanFI <- rowSums(FI[, -1, drop = FALSE], na.rm = TRUE)
        names(meanFI) <- FI$proteins
        ## Most frequently selected proteins first
        meanFI <- sort(meanFI, decreasing = TRUE)
        .status("Report the mean predictive accuracy and mean feature importance.",
                log = conn$con)

        list(num_proteins = num_proteins,                # number of proteins
             num_samples = num_samples,                  # samples per group
             mean_predictive_accuracy = meanPA,          # mean accuracy
             mean_feature_importance = meanFI,           # mean importance vector
             predictive_accuracy = PA,                   # per-simulation accuracy
             feature_importance = FI                     # protein x simulation matrix
        )
    }, conn = conn, session = session)

    class(res) <- c("list", "ssclassification")
    return(res)
}