#' Random subsampling
#'
#' @param sample_idx Typically a numeric vector with the sample indices to be
#' partitioned. A character vector with sample IDs can also be used.
#' @param iterations An integer, the number of iterations in the random subsampling
#' @param test_size A number between 0 and 1. The fraction of samples to be
#' included in the test set on each iteration.
#' @param keep_together Either `NULL` or a factor with the same length as `sample_idx`.
#' `keep_together` can be used to ensure that groups of samples are kept
#' together in all iterations (either in training or in test, but never split).
#' A typical use case for this is when you have sample replicates and you want
#' to keep all replicates together to prevent overoptimistic results (having
#' one sample in the train subset and its replicate in the test subset would
#' make the prediction artificially easy).
#' Another use case for this is when you have a longitudinal study and you
#' want to keep some subjects in the same train or test group, because you
#' want to use some information in a longitudinal way (e.g. a multilevel plsda model).
#' @param balance_in_train Either `NULL` or a factor with the same length as `sample_idx`.
#' `balance_in_train` can be used to force that, on each iteration, the train
#' partition contains the same number of samples of the given factor levels.
#' For instance, if we have a dataset with 40 samples of class "A" and 20 samples
#' of class "B", using a `test_size = 0.25`, we can force to always have 15
#' samples of class "A" and 15 samples of class "B" in the training subset
#' (the floor of 0.75 times the size of the smaller class).
#' This is beneficial to those algorithms that require the training groups
#' to be balanced.
#'
#' @return A list of length equal to `iterations`. Each element of the list is
#' a list with two entries (`training` and `test`) containing the `sample_idx`
#' values that will belong to each subset.
#'
#' @examples
#' random_subsampling(1:100, iterations = 4, test_size = 0.25)
#'
#' subject_id <- c("Alice", "Bob", "Charlie", "Eve")
#' random_subsampling(1:4, iterations = 2, test_size = 0.25, keep_together = subject_id)
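#'
#' ## A sketch of `balance_in_train`, matching the example in the parameter
#' ## documentation above (40 samples of "A", 20 of "B"; each training set
#' ## gets 15 samples of each class):
#' label <- rep(c("A", "B"), times = c(40, 20))
#' random_subsampling(1:60, iterations = 2, test_size = 0.25, balance_in_train = label)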
#'
#' @export
random_subsampling <- function(sample_idx,
iterations = 10L,
test_size = 0.25,
keep_together = NULL,
balance_in_train = NULL) {
resample <- function(x, ...) x[sample.int(length(x), ...)]
num_samples <- length(sample_idx)
output <- vector("list", iterations)
if (is.null(keep_together)) {
keep_together <- sample_idx
}
keep_together <- as.factor(keep_together)
stopifnot(length(keep_together) == num_samples)
if (!is.null(balance_in_train)) {
balance_in_train <- as.factor(balance_in_train)
# Check keep_together & balance consistency
stopifnot(length(balance_in_train) == num_samples)
for (lev in levels(keep_together)) {
samples_in_lev <- which(keep_together == lev)
if (length(unique(balance_in_train[samples_in_lev])) > 1) {
stop(glue::glue(
"Can't balance samples and keep samples together",
" for samples of keep_together == {lev}, because they",
" belong to more than one balancing group: {paste(unique(balance_in_train[samples_in_lev]), collapse = ', ')}"
))
}
}
keep_balance <- data.frame(
keep_together = keep_together,
balance_in_train = balance_in_train,
stringsAsFactors = TRUE
)
keep_balance <- unique(keep_balance)
smaller_balancing_group <- min(table(keep_balance$balance_in_train))
num_groups_train <- max(floor(smaller_balancing_group * (1 - test_size)), 1)
for (i in seq_len(iterations)) {
groups_train <- keep_balance %>%
dplyr::group_by(balance_in_train) %>%
dplyr::sample_n(num_groups_train) %>%
dplyr::ungroup() %>%
dplyr::pull(keep_together)
train_samples <- sample_idx[keep_together %in% groups_train]
test_samples <- base::setdiff(sample_idx, train_samples)
output[[i]] <- list(
training = train_samples,
test = test_samples
)
if (length(train_samples) == 0 || length(test_samples) == 0) {
rlang::abort(
message = c(
"Too few samples in a random subsampling split",
"i" = glue::glue("Train samples: {length(train_samples)}"),
"i" = glue::glue("Test samples: {length(test_samples)}"),
"i" = sprintf("Please %s the test_subset", ifelse(length(test_samples) == 0, "increase", "decrease"))
)
)
}
}
} else {
groups_keep_together <- unique(keep_together)
num_groups_test <- floor(length(groups_keep_together) * test_size)
for (i in seq_len(iterations)) {
groups_test <- resample(x = groups_keep_together, size = num_groups_test)
test_samples <- sample_idx[keep_together %in% groups_test]
train_samples <- base::setdiff(sample_idx, test_samples)
output[[i]] <- list(
training = train_samples,
test = test_samples
)
if (length(train_samples) == 0 || length(test_samples) == 0) {
rlang::abort(
message = c(
"Too few samples in a random subsampling split",
"i" = glue::glue("Train samples: {length(train_samples)}"),
"i" = glue::glue("Test samples: {length(test_samples)}"),
"i" = sprintf("Please %s the test_subset", ifelse(length(test_samples) == 0, "increase", "decrease"))
)
)
}
}
}
return(output)
}
#' Split samples for double cross-validation
#'
#' @param dataset An [nmr_dataset_family] like object
#' @inheritParams random_subsampling
#' @inheritParams nmr_data_analysis
#'
#' @return A list with two elements: `outer` and `inner`. Each of those elements
#' is a list with as many entries as iterations given in `external_val` and
#' `internal_val`, respectively, and each entry includes the indices of the
#' samples that will belong to the train and test subsets.
#' @noRd
split_double_cv <- function(dataset, keep_together = NULL,
external_val = list(iterations = 16L, test_size = 0.25),
internal_val = list(iterations = 10L, test_size = 0.25)) {
if (is.null(keep_together)) {
keep_together_data <- seq_len(dataset$num_samples)
} else {
keep_together_data <- as.factor(nmr_meta_get_column(dataset, keep_together))
}
sample_idx <- seq_len(dataset$num_samples)
outer_cv_loop <- random_subsampling(
sample_idx = sample_idx,
iterations = external_val$iterations,
test_size = external_val$test_size,
keep_together = keep_together_data
)
outer_cv_iterations <- purrr::imap(outer_cv_loop, function(outer, outer_idx) {
list(
outer_iter = outer_idx,
outer_train = outer$training,
outer_test = outer$test
)
})
names(outer_cv_iterations) <- as.character(purrr::map_int(outer_cv_iterations, "outer_iter"))
inner_cv_iterations <- purrr::flatten(
purrr::imap(outer_cv_loop, function(outer, outer_idx) {
calib_idx <- outer$training
blind_idx <- outer$test
inner_cv <- random_subsampling(
sample_idx = calib_idx,
iterations = internal_val$iterations,
test_size = internal_val$test_size,
keep_together = keep_together_data[calib_idx]
)
purrr::imap(inner_cv, function(inner_iter, inner_iter_idx) {
list(
outer_iter = outer_idx,
inner_iter = inner_iter_idx,
inner_train_idx = inner_iter$training,
inner_test_idx = inner_iter$test
)
})
})
)
names(inner_cv_iterations) <-
paste0(
as.character(purrr::map_int(inner_cv_iterations, "outer_iter")),
"_",
as.character(purrr::map_int(inner_cv_iterations, "inner_iter"))
)
list(
outer = outer_cv_iterations,
inner = inner_cv_iterations
)
}
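# A sketch of the returned structure (assumes an nmr_dataset_peak_table
# object named `peak_table`; the names follow the imap()/paste0() logic above):
#   cv <- split_double_cv(peak_table,
#       external_val = list(iterations = 2, test_size = 0.25),
#       internal_val = list(iterations = 2, test_size = 0.25))
#   names(cv$outer) # "1" "2"
#   names(cv$inner) # "1_1" "1_2" "2_1" "2_2"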
#' Split dataset in matrices, and call a function
#'
#' @param train_test_subset A list with two elements. The first element has the
#' training indices and the second the test indices
#' @inheritParams nmr_data_analysis
#' @inheritParams nmr_data_analysis_method
#'
#' @param ... Additional arguments passed to `train_evaluate_model`.
#'
#'
#' @return Whatever `train_evaluate_model` returns
#' @noRd
split_build_perform <- function(train_test_subset,
dataset,
y_column,
identity_column = NULL,
train_evaluate_model,
...) {
x_all <- nmr_data(dataset)
y_all <- nmr_meta_get_column(dataset, column = y_column)
if (is.null(y_all)) {
rlang::abort(
c(
"y_column not found",
"x" = sprintf('Column "%s" does not exist in the dataset', y_column)
)
)
}
if (!is.null(identity_column)) {
identity_all <- nmr_meta_get_column(dataset, column = identity_column)
} else {
identity_all <- NULL
}
# Split
train_idx <- train_test_subset[[1]]
test_idx <- train_test_subset[[2]]
x_train <- x_all[train_idx, , drop = FALSE]
y_train <- y_all[train_idx]
if (!is.null(identity_all)) {
identity_train <- identity_all[train_idx]
} else {
identity_train <- NULL
}
x_test <- x_all[test_idx, , drop = FALSE]
y_test <- y_all[test_idx]
if (!is.null(identity_all)) {
identity_test <- identity_all[test_idx]
} else {
identity_test <- NULL
}
# Build & performance:
train_evaluate_model(
x_train = x_train, y_train = y_train, identity_train = identity_train,
x_test = x_test, y_test = y_test, identity_test = identity_test, ...
)
}
#' Do cross-validation
#' @param train_test_subsets A list whose length is the number of cross-validation iterations.
#' Each list item should have two elements: first the train indices, second the test indices
#'
#' @inheritParams nmr_data_analysis
#' @inheritParams new_nmr_data_analysis_method
#' @param train_evaluate_model_args_iter A named list. The names are valid `train_evaluate_model` arguments. Each list element is a vector.
#'
#' @return A list of the same length as `train_test_subsets`, with the corresponding `train_evaluate_model` output for each iteration
#' @noRd
do_cv <- function(dataset, y_column, identity_column, train_evaluate_model,
train_test_subsets, train_evaluate_model_args_iter = NULL, ..., .enable_parallel = TRUE) {
if (.enable_parallel) {
fun <- BiocParallel::bpmapply
} else {
fun <- mapply
}
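# Both mapply() and BiocParallel::bpmapply() iterate jointly over
# `train_test_subsets` and over each vector in `train_evaluate_model_args_iter`,
# while `MoreArgs` is passed unchanged to every split_build_perform() call.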
output <- do.call(
what = fun,
args = c(
list(
FUN = split_build_perform,
train_test_subset = train_test_subsets
),
train_evaluate_model_args_iter,
list(
MoreArgs = list(
dataset = dataset,
y_column = y_column,
identity_column = identity_column,
train_evaluate_model = train_evaluate_model,
...
),
SIMPLIFY = FALSE
)
)
)
names(output) <- names(train_test_subsets)
output
}
#' Data analysis
#'
#' Data analysis in AlpsNMR can be performed on both [nmr_dataset_1D] full spectra
#' as well as [nmr_dataset_peak_table] peak tables.
#'
#' The workflow consists of a double cross-validation strategy using random
#' subsampling for splitting into train and test sets. The classification model
#' and the metric to choose the best model can be customized (see
#' [new_nmr_data_analysis_method()]), but for now only a PLSDA classification
#' model with a best area under ROC curve metric is implemented (see
#' the examples here and [plsda_auroc_vip_method]).
#'
#' @param dataset An [nmr_dataset_family] object
#' @param y_column A string with the name of the y column (present in the
#' metadata of the dataset)
#' @param identity_column `NULL` or a string with the name of the identity column (present in the
#' metadata of the dataset).
#' @param external_val,internal_val A list with two elements: `iterations` and `test_size`.
#' See [random_subsampling] for further details
#' @param data_analysis_method An [nmr_data_analysis_method] object
#' @param .enable_parallel Set to `FALSE` to disable parallelization.
#' @return A list with the following elements:
#'
#' - `train_test_partitions`: A list with the indices used in train and test on each of the cross-validation iterations
#' - `inner_cv_results`: The output returned by `train_evaluate_model` on each inner cross-validation
#' - `inner_cv_results_digested`: The output returned by `choose_best_inner`.
#' - `outer_cv_results`: The output returned by `train_evaluate_model` on each outer cross-validation
#' - `outer_cv_results_digested`: The output returned by `train_evaluate_model_digest_outer`.
#' @examples
#' # Data analysis for a table of integrated peaks
#'
#' ## Generate an artificial nmr_dataset_peak_table:
#' ### Generate artificial metadata:
#' num_samples <- 32 # use an even number in this example
#' num_peaks <- 20
#' metadata <- data.frame(
#' NMRExperiment = as.character(1:num_samples),
#' Condition = rep(c("A", "B"), times = num_samples / 2)
#' )
#'
#' ### The matrix with peaks
#' peak_means <- runif(n = num_peaks, min = 300, max = 600)
#' peak_sd <- runif(n = num_peaks, min = 30, max = 60)
#' peak_matrix <- mapply(function(mu, sd) rnorm(num_samples, mu, sd),
#' mu = peak_means, sd = peak_sd
#' )
#' colnames(peak_matrix) <- paste0("Peak", 1:num_peaks)
#'
#' ## Artificial differences depending on the condition:
#' peak_matrix[metadata$Condition == "A", "Peak2"] <-
#' peak_matrix[metadata$Condition == "A", "Peak2"] + 70
#'
#' peak_matrix[metadata$Condition == "A", "Peak6"] <-
#' peak_matrix[metadata$Condition == "A", "Peak6"] - 60
#'
#' ### The nmr_dataset_peak_table
#' peak_table <- new_nmr_dataset_peak_table(
#' peak_table = peak_matrix,
#' metadata = list(external = metadata)
#' )
#'
#' ## We will use a double cross validation, splitting the samples with random
#' ## subsampling both in the external and internal validation.
#' ## The classification model will be a PLSDA, exploring at maximum 3 latent
#' ## variables.
#' ## The best model will be selected based on the area under the ROC curve
#' methodology <- plsda_auroc_vip_method(ncomp = 3)
#' model <- nmr_data_analysis(
#' peak_table,
#' y_column = "Condition",
#' identity_column = NULL,
#' external_val = list(iterations = 3, test_size = 0.25),
#' internal_val = list(iterations = 3, test_size = 0.25),
#' data_analysis_method = methodology
#' )
#' ## Area under ROC for each outer cross-validation iteration:
#' model$outer_cv_results_digested$auroc
#' ## Rank Product of the Variable Importance in the Projection
#' ## (Lower means more important)
#' sort(model$outer_cv_results_digested$vip_rankproducts)
#'
#' @export
nmr_data_analysis <- function(dataset,
y_column,
identity_column,
external_val,
internal_val,
data_analysis_method,
.enable_parallel = TRUE) {
train_evaluate_model <- data_analysis_method[["train_evaluate_model"]]
train_evaluate_model_params_inner <- data_analysis_method[["train_evaluate_model_params_inner"]]
choose_best_inner <- data_analysis_method[["choose_best_inner"]]
train_evaluate_model_params_outer <- data_analysis_method[["train_evaluate_model_params_outer"]]
train_evaluate_model_digest_outer <- data_analysis_method[["train_evaluate_model_digest_outer"]]
# Prepare double cross-validation splits:
train_test_blind_subsets <- split_double_cv(dataset,
keep_together = identity_column,
external_val = external_val,
internal_val = internal_val
)
# These are ALL the inner cross-validation iterations:
train_test_subsets_inner <- purrr::map(
train_test_blind_subsets$inner,
~ .[c("inner_train_idx", "inner_test_idx")]
)
# We run the train_evaluate_model function for each of the inner CV.
inner_cv_results <- do.call(
what = do_cv,
args = c(
list(
dataset = dataset,
y_column = y_column,
identity_column = identity_column,
train_evaluate_model = train_evaluate_model,
train_test_subsets = train_test_subsets_inner,
.enable_parallel = .enable_parallel
),
train_evaluate_model_params_inner
)
)
# We choose the best hyper-parameters for each inner cross-validation:
inner_cv_results_digested <- choose_best_inner(inner_cv_results)
# Prepare the indices for the final outer cv models:
train_test_subsets_outer <- purrr::map(
train_test_blind_subsets$outer,
~ .[c("outer_train", "outer_test")]
)
# Compute the outer cv models
outer_cv_results <- do.call(
what = do_cv,
args = c(
list(
dataset = dataset,
y_column = y_column,
identity_column = identity_column,
train_evaluate_model = train_evaluate_model,
train_test_subsets = train_test_subsets_outer,
train_evaluate_model_args_iter = inner_cv_results_digested$train_evaluate_model_args,
.enable_parallel = .enable_parallel
),
train_evaluate_model_params_outer
)
)
# Digest the results:
outer_cv_results_dig <- train_evaluate_model_digest_outer(outer_cv_results)
# Give output:
list(
train_test_partitions = train_test_blind_subsets,
inner_cv_results = inner_cv_results,
inner_cv_results_digested = inner_cv_results_digested,
outer_cv_results = outer_cv_results,
outer_cv_results_digested = outer_cv_results_dig
)
}
get_test_accuracy <- function(model, x_test, y_test) {
pred <- stats::predict(model, newdata = x_test, dist = "max.dist")
# Class predictions with the maximum-distance rule, using all model components:
y_test_pred <- pred$class$max.dist[, model$ncomp]
conf_mat <- table(REAL = y_test, PRED = y_test_pred)
# Overall accuracy: correctly classified samples over all test samples
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
accuracy
}
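# For example, table(REAL = c("A", "B", "A"), PRED = c("A", "B", "B")) has
# 2 correct predictions on its diagonal out of 3 test samples, so the
# accuracy is 2/3.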
train_models_with_only_vip_features <- function(x_train, y_train, x_test, y_test, ncomp, important_vips, relevant_vips) {
# important_vips is a more stringent subset of relevant_vips
if (length(important_vips) == 0) {
cli::cli_warn(
c(
"No VIPs are ranked as important",
"i" = "Try increasing the number of bootstrap iterations"
)
)
if (length(relevant_vips) == 0) {
cli::cli_warn(
c(
"bp_VIP_analysis: no relevant_vips found",
"i" = "You may try increasing the number of bootstraps"
)
)
}
return(list(vips_model = NULL, vips_CR = 0))
}
if (length(important_vips) < ncomp) {
cli::cli_warn(
c(
"Number of important vips ({length(important_vips)}) smaller than requested ncomp ({ncomp})",
"i" = "Attempting to use relevant vips (less stringent criteria)",
"i" = "You may want to consider reducing ncomp"
)
)
if (length(relevant_vips) < ncomp) {
cli::cli_warn(
c(
"Number of relevant vips ({length(relevant_vips)}) smaller than requested ncomp ({ncomp})",
"i" = "Can't compute sub-model using only VIPs",
"i" = "You may want to consider reducing ncomp"
)
)
return(list(vips_model = NULL, vips_CR = 0))
}
x_train_reduced <- as.matrix(x_train[, relevant_vips, drop = FALSE])
x_test_reduced <- as.matrix(x_test[, relevant_vips, drop = FALSE])
} else {
x_train_reduced <- as.matrix(x_train[, important_vips, drop = FALSE])
x_test_reduced <- as.matrix(x_test[, important_vips, drop = FALSE])
}
# Fit PLS model
vips_model <- plsda_build(
x = x_train_reduced,
y = y_train,
identity = NULL,
ncomp = ncomp
)
# Measure the classification rate (CR) of the fold
vips_CR <- get_test_accuracy(vips_model, x_test_reduced, y_test)
list(vips_model = vips_model, vips_CR = vips_CR)
}
#' Bootstrap and permutation over PLS-VIP
#'
#' Bootstrap and permutation over PLS-VIP on AlpsNMR can be performed on both
#' [nmr_dataset_1D] full spectra as well as [nmr_dataset_peak_table] peak tables.
#'
#' Use of the bootstrap and permutation methods for a more robust
#' variable importance in the projection metric for partial least
#' squares regression
#'
#' @name bp_VIP_analysis
#' @param dataset An [nmr_dataset_family] object
#' @param train_index Set of indices used to generate the bootstrap datasets
#' @param y_column A string with the name of the y column (present in the
#' metadata of the dataset)
#' @param ncomp Number of components used in the plsda models
#' @param nbootstrap Number of bootstrap datasets
#' @return A list with the following elements:
#'
#' - `important_vips`: A list with the important VIPs selected
#' - `relevant_vips`: List of VIPs with some relevance
#' - `pls_vip`: PLS-VIPs of every bootstrap
#' - `pls_vip_perm`: PLS-VIPs of every bootstrap with permuted variables
#' - `pls_vip_means`: Means of the normalized PLS-VIP differences
#' - `pls_vip_score_diff`: Differences of `pls_vip` and `pls_vip_perm`
#' - `pls_models`: PLS models of the different bootstraps
#' - `pls_perm_models`: PLS permuted models of the different bootstraps
#' - `classif_rate`: Classification rate of the bootstrap models
#' - `general_model`: PLS model trained with all train data
#' - `general_CR`: Classification rate of the `general_model`
#' - `vips_model`: PLS model trained with the VIP selection over all train data
#' - `vips_CR`: Classification rate of the `vips_model`
#' - `error`: Expected error under a t distribution
#' - `lower_bound`: Lower bound of the confidence interval
#' - `upper_bound`: Upper bound of the confidence interval
#'
#' @importFrom stats qt
#' @importFrom stats wilcox.test
#' @examples
#' # Data analysis for a table of integrated peaks
#'
#' ## Generate an artificial nmr_dataset_peak_table:
#' ### Generate artificial metadata:
#' num_samples <- 32 # use an even number in this example
#' num_peaks <- 20
#' metadata <- data.frame(
#' NMRExperiment = as.character(1:num_samples),
#' Condition = rep(c("A", "B"), times = num_samples / 2)
#' )
#'
#' ### The matrix with peaks
#' peak_means <- runif(n = num_peaks, min = 300, max = 600)
#' peak_sd <- runif(n = num_peaks, min = 30, max = 60)
#' peak_matrix <- mapply(function(mu, sd) rnorm(num_samples, mu, sd),
#' mu = peak_means, sd = peak_sd
#' )
#' colnames(peak_matrix) <- paste0("Peak", 1:num_peaks)
#'
#' ## Artificial differences depending on the condition:
#' peak_matrix[metadata$Condition == "A", "Peak2"] <-
#' peak_matrix[metadata$Condition == "A", "Peak2"] + 70
#'
#' peak_matrix[metadata$Condition == "A", "Peak6"] <-
#' peak_matrix[metadata$Condition == "A", "Peak6"] - 60
#'
#' ### The nmr_dataset_peak_table
#' peak_table <- new_nmr_dataset_peak_table(
#' peak_table = peak_matrix,
#' metadata = list(external = metadata)
#' )
#'
#' ## We will use a double cross validation, splitting the samples with random
#' ## subsampling both in the external and internal validation.
#' ## The classification model will be a PLSDA, exploring at maximum 3 latent
#' ## variables.
#' ## The best model will be selected based on the area under the ROC curve
#' methodology <- plsda_auroc_vip_method(ncomp = 3)
#' model <- nmr_data_analysis(
#' peak_table,
#' y_column = "Condition",
#' identity_column = NULL,
#' external_val = list(iterations = 1, test_size = 0.25),
#' internal_val = list(iterations = 3, test_size = 0.25),
#' data_analysis_method = methodology
#' )
#' ## Area under ROC for each outer cross-validation iteration:
#' model$outer_cv_results_digested$auroc
#'
#' ## The number of components for the bootstrap models is selected
#' ncomps <- model$outer_cv_results$`1`$model$ncomp
#' train_index <- model$train_test_partitions$outer$`1`$outer_train
#'
#' # Bootstrap and permutation for VIP selection
#' bp_VIPS <- bp_VIP_analysis(peak_table, # Data to be analyzed
#' train_index,
#' y_column = "Condition",
#' ncomp = ncomps,
#' nbootstrap = 10
#' )
#'
#' @export
bp_VIP_analysis <- function(dataset,
train_index,
y_column,
ncomp,
nbootstrap = 300) {
# Extract data and split for train and test
x_all <- dataset$peak_table
y_all <- nmr_meta_get_column(dataset, column = y_column)
x_train <- x_all[train_index, , drop = FALSE]
y_train <- y_all[train_index]
# For checking performance
x_test <- x_all[-train_index, , drop = FALSE]
y_test <- y_all[-train_index]
if (length(unique(y_train)) == 1) {
stop("Only one class in train set, please increase number of samples")
}
if (length(unique(y_test)) == 1) {
stop("Only one class in test set, please increase number of samples")
}
num_features <- n <- ncol(x_all)
if (ncomp > num_features) {
cli::cli_abort("ncomp ({ncomp}) can't be larger than num_features ({num_features})")
}
names <- colnames(x_all)
# some checks
if (length(names) == 0) {
stop("Error in bp_VIP_analysis, the dataset peak_table doesn't have colnames.")
}
pls_vip <- matrix(nrow = num_features, ncol = nbootstrap, dimnames = list(names, NULL))
pls_vip_perm_score <- matrix(nrow = num_features, ncol = nbootstrap, dimnames = list(names, NULL))
pls_vip_score_diff <- matrix(nrow = num_features, ncol = nbootstrap, dimnames = list(names, NULL))
pls_models <- list()
pls_perm_models <- list()
CR <- list()
# Bootstrap with replacement nbootstraps datasets
res <- BiocParallel::bplapply(
seq_len(nbootstrap),
function(i, x_train, y_train, ncomp) {
num_features <- ncol(x_train)
index <- sample(seq_len(nrow(x_train)), nrow(x_train), replace = TRUE)
x_train_boots <- x_train[index, ]
y_train_boots <- y_train[index]
# if y_train_boots only have one class, plsda models give error
if (length(unique(y_train_boots)) == 1) {
# Replace the first element for the first element of another class
for (i in seq_along(y_train)) {
if (y_train[i] != y_train_boots[1]) {
y_train_boots[1] <- y_train[i]
x_train_boots[1, ] <- x_train[i, ]
break
}
}
}
# Rename rownames, because if they are repeated, plsda fails
rownames(x_train_boots) <- paste0("Sample", seq_len(nrow(x_train_boots)))
# Fit PLS model
model <-
plsda_build(
x = x_train_boots,
y = y_train_boots,
identity = NULL,
ncomp = ncomp
)
# VIPs per component extraction
pls_vip_comps <- plsda_vip(model)
# Sum contributions of VIPs to each component
pls_vip <- sqrt(rowSums(pls_vip_comps^2) / ncomp)
# Measure the classification rate (CR) of the bootstrap model
CR <- get_test_accuracy(model, x_test, y_test)
pls_vip_perm <- matrix(nrow = n, ncol = n, dimnames = list(names, NULL))
# Permutation of variables
for (j in seq_len(num_features)) {
random_pos <- sample(seq_len(num_features), 1)
x_train_boots_perm <- x_train_boots
x_train_boots_perm[, j] <- x_train_boots[, random_pos]
# Refit model with permuted variables
model_perm <-
plsda_build(
x = x_train_boots_perm,
y = y_train_boots,
identity = NULL,
ncomp = ncomp
)
# VIPs per component extraction
pls_vip_comps_perm <- plsda_vip(model_perm)
# Sum contributions of VIPs to each component
pls_vip_perm[, j] <- sqrt(rowSums(pls_vip_comps_perm^2) / ncomp)
}
# bootstrapped and randomly permuted PLS-VIPs
pls_vip_perm_score <- colSums(pls_vip_perm) / n
# bootstrapped and randomly permuted difference
pls_vip_score_diff <- pls_vip - pls_vip_perm_score
list(
pls_vip = pls_vip,
pls_vip_perm_score = pls_vip_perm_score,
pls_vip_score_diff = pls_vip_score_diff,
model = model,
model_perm = model_perm,
CR = CR
)
},
x_train = x_train,
y_train = y_train,
ncomp = ncomp
)
pls_vip <- do.call(cbind, purrr::map(res, "pls_vip"))
rownames(pls_vip) <- names
pls_vip_perm_score <- do.call(cbind, purrr::map(res, "pls_vip_perm_score"))
rownames(pls_vip_perm_score) <- names
pls_vip_score_diff <- do.call(cbind, purrr::map(res, "pls_vip_score_diff"))
rownames(pls_vip_score_diff) <- names
pls_models <- purrr::map(res, "model")
pls_perm_models <- purrr::map(res, "model_perm")
CR <- purrr::map(res, "CR")
# Normalization of the difference vector for each variable to
# its corresponding standard deviation and construct
# 95% confidence intervals around the differences
boots_vip <- matrix(nrow = num_features, dimnames = list(names))
boots_vip_sd <- matrix(nrow = num_features, dimnames = list(names))
error <- matrix(nrow = num_features, dimnames = list(names))
lower_bound <- matrix(nrow = num_features, dimnames = list(names))
upper_bound <- matrix(nrow = num_features, dimnames = list(names))
for (k in seq_len(num_features)) {
element <- pls_vip_score_diff[k, ] / sd(pls_vip_score_diff[k, ])
boots_vip[k] <- sum(element) / nbootstrap
boots_vip_sd[k] <- sqrt(sum((element - boots_vip[k])^2) / (nbootstrap - 1))
error[k] <- qt(0.975, df = nbootstrap - 1) * boots_vip_sd[k]
lower_bound[k] <- boots_vip[k] - error[k]
upper_bound[k] <- boots_vip[k] + error[k]
}
important_vips <- names[lower_bound > qt(0.975, df = nbootstrap - 1)]
relevant_vips <- names[lower_bound > 0]
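# Selection criteria: "important" VIPs have a confidence-interval lower
# bound above the 97.5% t quantile (about 1.97 for nbootstrap = 300),
# while "relevant" VIPs only need a lower bound above zero.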
# Checking performance
# Fit PLS model
general_model <-
plsda_build(
x = x_train,
y = y_train,
identity = NULL,
ncomp = ncomp
)
# Measure the classification rate (CR) of the fold
general_CR <- get_test_accuracy(general_model, x_test, y_test)
vips_results <- train_models_with_only_vip_features(
x_train, y_train, x_test, y_test,
ncomp, important_vips, relevant_vips
)
vips_model <- vips_results$vips_model
vips_CR <- vips_results$vips_CR
# To return it ordered by mean of the normalized vectors
orden <- order(boots_vip, decreasing = TRUE)
# Return important VIPs and classification performance
list(
important_vips = important_vips,
relevant_vips = relevant_vips,
pls_vip = pls_vip[orden, , drop = FALSE],
pls_vip_perm = pls_vip_perm_score[orden, , drop = FALSE],
pls_vip_means = boots_vip[orden, , drop = FALSE],
pls_vip_score_diff = pls_vip_score_diff[orden, , drop = FALSE],
pls_models = pls_models,
pls_perm_models = pls_perm_models,
classif_rate = CR,
general_model = general_model,
general_CR = general_CR,
vips_model = vips_model,
vips_CR = vips_CR,
error = error[orden, , drop = FALSE],
lower_bound = lower_bound[orden, , drop = FALSE],
upper_bound = upper_bound[orden, , drop = FALSE]
)
}
#' K-fold bootstrap and permutation over PLS-VIP
#'
#' Bootstrap and permutation over PLS-VIP on AlpsNMR can be performed on both
#' [nmr_dataset_1D] full spectra as well as [nmr_dataset_peak_table] peak tables.
#'
#' Use of the bootstrap and permutation methods for a more robust
#' variable importance in the projection metric for partial least
#' squares regression, in a k-fold cross validation
#'
#' @name bp_kfold_VIP_analysis
#' @param dataset An [nmr_dataset_family] object
#' @param y_column A string with the name of the y column (present in the
#' metadata of the dataset)
#' @param k Number of folds, recommended between 4 and 10
#' @param ncomp Number of components for the bootstrap models
#' @param nbootstrap Number of bootstrap datasets
#' @return A list with the following elements:
#'
#' - `important_vips`: A list with the important VIPs selected
#' - `relevant_vips`: List of VIPs with some relevance
#' - `wilcoxon_vips`: List of VIPs that pass a Wilcoxon test
#' - `vip_means`: Means of the VIP scores
#' - `vip_score_plot`: Plot of the VIP scores
#' - `kfold_results`: Results of the k [bp_VIP_analysis]
#' - `kfold_index`: List of index partitions of the folds
#'
#' @export
#' @examples
#' # Data analysis for a table of integrated peaks
#' set.seed(42)
#' ## Generate an artificial nmr_dataset_peak_table:
#' ### Generate artificial metadata:
#' num_samples <- 64 # use an even number in this example
#' num_peaks <- 10
#' metadata <- data.frame(
#' NMRExperiment = as.character(1:num_samples),
#' Condition = sample(rep(c("A", "B"), times = num_samples / 2), num_samples)
#' )
#'
#' ### The matrix with peaks
#' peak_means <- runif(n = num_peaks, min = 300, max = 600)
#' peak_sd <- runif(n = num_peaks, min = 30, max = 60)
#' peak_matrix <- mapply(function(mu, sd) rnorm(num_samples, mu, sd),
#' mu = peak_means, sd = peak_sd
#' )
#' colnames(peak_matrix) <- paste0("Peak", 1:num_peaks)
#' rownames(peak_matrix) <- paste0("Sample", 1:num_samples)
#'
#' ## Artificial differences depending on the condition:
#' peak_matrix[metadata$Condition == "A", "Peak2"] <-
#' peak_matrix[metadata$Condition == "A", "Peak2"] + 70
#'
#' peak_matrix[metadata$Condition == "A", "Peak6"] <-
#' peak_matrix[metadata$Condition == "A", "Peak6"] - 60
#'
#' ### The nmr_dataset_peak_table
#' peak_table <- new_nmr_dataset_peak_table(
#' peak_table = peak_matrix,
#' metadata = list(external = metadata)
#' )
#'
#' ## We will use the bootstrap and permutation method for VIP selection
#' ## in a k-fold cross validation
#' bp_results <- bp_kfold_VIP_analysis(peak_table, # Data to be analyzed
#' y_column = "Condition", # Label
#' k = 2,
#' ncomp = 1,
#' nbootstrap = 5
#' )
#'
#' message("Selected VIPs are: ", bp_results$important_vips)
#'
bp_kfold_VIP_analysis <- function(dataset,
y_column,
k = 4,
ncomp = 3,
nbootstrap = 300) {
if (k <= 1) {
stop("k must be an integer greater than 1")
}
# Extract data and split for train and test
x_all <- dataset$peak_table
y_all <- nmr_meta_get_column(dataset, column = y_column)
if (length(unique(y_all)) == 1) {
stop("Only one class in data set, at least two needed")
}
# Randomly shuffle the samples before splitting
x <- seq_len(length(y_all))
index <- sample(x, replace = FALSE)
x_all <- x_all[index, ]
y_all <- y_all[index]
# Split data for k-fold
k_fold_split <- split(x, x %% k)
k_fold_index <- list()
for (i in seq_len(k)) {
k_fold_index[[i]] <- seq_len(length(y_all))[-k_fold_split[[i]]]
}
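# For example, with 6 samples and k = 2, split(1:6, 1:6 %% 2) yields the
# folds {2, 4, 6} and {1, 3, 5}; each k_fold_index entry then holds the
# training indices (every sample outside the corresponding fold).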
# bp_VIP_analysis is already parallelized.
results <- lapply(
k_fold_index, function(index, dataset = dataset, y_column = y_column,
ncomp = ncomp, nbootstrap = nbootstrap) {
bp_VIP_analysis(
dataset,
index,
y_column = y_column,
ncomp = ncomp,
nbootstrap = nbootstrap
)
},
dataset = dataset, y_column = y_column,
ncomp = ncomp, nbootstrap = nbootstrap
)
# Mean of the vips of the different folds for the plot
means <- purrr::map(results, "pls_vip_means")
# means <- sapply(results, "[", "pls_vip_means")
names_order <- sort(rownames(means[[1]]))
ordered_means <- matrix(nrow = k, ncol = length(names_order), dimnames = list(NULL, names_order))
for (i in seq_len(k)) {
ordered_means[i, ] <- means[[i]][order(rownames(means[[i]]))]
}
ordered_means <- colSums(ordered_means) / k
vip_means <- ordered_means[order(ordered_means, decreasing = TRUE)]
error <- results[[1]]$error
# Selection based on the means (deprecated, now using the intersection of the VIPs)
# important_vips <- vip_means[vip_means-2*error > 0]
# relevant_vips <- vip_means[vip_means-error > 0]
## Wilcoxon test
num_var <- dim(results[[1]]$pls_vip)[1]
wt <- matrix(nrow = k, ncol = num_var)
wt_vips <- list()
for (i in seq_len(k)) {
for (j in seq_len(num_var)) {
x <- results[[i]]$pls_vip[j, ]
y <- results[[i]]$pls_vip_perm[j, ]
# wt_object <- wilcox.test(x, y, paired = TRUE, alternative = "two.sided")
wt_object <- wilcox.test(x, y, paired = TRUE, alternative = "greater")
wt[i, j] <- wt_object$p.value
}
wt_vips[[i]] <- rownames(results[[i]]$pls_vip)[wt[i, ] < 0.05]
}
# Plot of the scores
x <- seq_len(length(vip_means))
p <- ggplot2::ggplot() +
ggplot2::geom_segment(
mapping = ggplot2::aes(
x = x,
y = vip_means - error,
xend = x,
yend = vip_means + error
),
arrow = NULL
) +
ggplot2::geom_point(
mapping = ggplot2::aes(x = x, y = vip_means),
shape = 21,
fill = "white"
) +
ggplot2::geom_hline(yintercept = qt(0.975, df = nbootstrap - 1)) +
ggplot2::ggtitle("BP-VIP") +
ggplot2::labs(x = "Variables", y = "Scores") +
ggplot2::theme_bw()
# Reporting the intersection of the vips of the different folds
list(
important_vips = Reduce(intersect, purrr::map(results, "important_vips")),
relevant_vips = Reduce(intersect, purrr::map(results, "relevant_vips")),
wilcoxon_vips = unique(unlist(wt_vips)),
vip_means = vip_means,
vip_score_plot = p,
kfold_results = results,
kfold_index = k_fold_index
)
}
#' Plot vip scores of bootstrap
#'
#' @param vip_means VIP mean values from the bootstraps
#' @param error Tolerated error, calculated in the bootstrap
#' @param nbootstrap Number of bootstraps performed
#' @param plot A boolean that indicates whether the results are plotted
#'
#' @return A plot of the results or a ggplot object
#' @export
#' @examples
#' # Data analysis for a table of integrated peaks
#'
#' ## Generate an artificial nmr_dataset_peak_table:
#' ### Generate artificial metadata:
#' num_samples <- 64 # use an even number in this example
#' num_peaks <- 20
#' metadata <- data.frame(
#' NMRExperiment = as.character(1:num_samples),
#' Condition = rep(c("A", "B"), times = num_samples / 2)
#' )
#'
#' ### The matrix with peaks
#' peak_means <- runif(n = num_peaks, min = 300, max = 600)
#' peak_sd <- runif(n = num_peaks, min = 30, max = 60)
#' peak_matrix <- mapply(function(mu, sd) rnorm(num_samples, mu, sd),
#' mu = peak_means, sd = peak_sd
#' )
#' colnames(peak_matrix) <- paste0("Peak", 1:num_peaks)
#'
#' ## Artificial differences depending on the condition:
#' peak_matrix[metadata$Condition == "A", "Peak2"] <-
#' peak_matrix[metadata$Condition == "A", "Peak2"] + 70
#'
#' peak_matrix[metadata$Condition == "A", "Peak6"] <-
#' peak_matrix[metadata$Condition == "A", "Peak6"] - 60
#'
#' ### The nmr_dataset_peak_table
#' peak_table <- new_nmr_dataset_peak_table(
#' peak_table = peak_matrix,
#' metadata = list(external = metadata)
#' )
#'
#' ## We will use the bootstrap and permutation method for VIP selection
#' ## in a k-fold cross validation
#' # bp_results <- bp_kfold_VIP_analysis(peak_table, # Data to be analyzed
#' # y_column = "Condition", # Label
#' # k = 3,
#' # ncomp = 1,
#' # nbootstrap = 10)
#'
#' # message("Selected VIPs are: ", bp_results$important_vips)
#'
#' # plot_vip_scores(bp_results$kfold_results[[1]]$vip_means,
#' # bp_results$kfold_results[[1]]$error[1],
#' # nbootstrap = 10)
#'
plot_vip_scores <- function(vip_means, error, nbootstrap, plot = TRUE) {
# Plot of the scores
x <- seq_len(length(vip_means))
vip_score_plot <- ggplot2::ggplot() +
ggplot2::geom_segment(
mapping = ggplot2::aes(
x = x,
y = vip_means - error,
xend = x,
yend = vip_means + error
),
arrow = NULL
) +
ggplot2::geom_point(
mapping = ggplot2::aes(x = x, y = vip_means),
shape = 21,
fill = "white"
) +
ggplot2::geom_hline(yintercept = qt(0.975, df = nbootstrap - 1)) +
ggplot2::ggtitle("BP-VIP") +
ggplot2::labs(x = "Variables", y = "Scores") +
ggplot2::theme_bw()
if (plot) {
vip_score_plot
} else {
return(vip_score_plot)
}
}
#' Create method for NMR data analysis
#'
#' @param train_evaluate_model A function. The `train_evaluate_model` must have the following signature:
#'
#' function(x_train, y_train, identity_train, x_test, y_test, identity_test, ...)
#'
#' The `x_train` and `y_train` (and their test counterparts) are self-explanatory.
#'
#' The `identity_` arguments are expected to be factors. They can be used for
#' instance with a callback that uses [mixOmics::plsda] in a `multilevel` approach
#' for longitudinal studies. In those studies the `identity` would be an
#' identifier of the subject.
#'
#' The `...` arguments are free to be defined for each `train_evaluate_model`.
#'
#' @param train_evaluate_model_params_inner,train_evaluate_model_params_outer A list with additional
#' arguments to pass to `train_evaluate_model` either in the inner cv loop or in the outer cv loop.
#'
#' @param choose_best_inner A function with a single argument:
#'
#' function(inner_cv_results)
#'
#' The argument is a list of `train_evaluate_model` outputs.
#' The return value must be a list with at least an element named `train_evaluate_model_args`.
#' `train_evaluate_model_args` must be a named list.
#'
#' - Each element must be named as one of the `train_evaluate_model` arguments.
#' - Each element must be a vector as long as the number of outer cross-validations.
#' - The values of each vector must be the values that the `train_evaluate_model`
#' argument must take on each outer cross-validation iteration.
#'
#' Additional list elements can be returned and will be given back to the user.
#'
#' @param train_evaluate_model_digest_outer A function with a single argument:
#'
#' function(outer_cv_results)
#'
#' The argument is a list of `train_evaluate_model` outputs in outer cross-validation.
#' The return value is returned by `nmr_data_analysis`
#'
#' @return An object encapsulating the method dependent functions that can be used with [nmr_data_analysis]
#' @name nmr_data_analysis_method
#' @export
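#' @examples
#' # A minimal sketch of a custom method (illustrative only: the callbacks
#' # below are placeholders, not a useful model):
#' toy_method <- new_nmr_data_analysis_method(
#'     train_evaluate_model = function(x_train, y_train, identity_train,
#'                                     x_test, y_test, identity_test, ...) {
#'         list(n_train = nrow(x_train), n_test = nrow(x_test))
#'     },
#'     train_evaluate_model_params_inner = list(),
#'     choose_best_inner = function(inner_cv_results) {
#'         list(train_evaluate_model_args = list())
#'     },
#'     train_evaluate_model_params_outer = list(),
#'     train_evaluate_model_digest_outer = function(outer_cv_results) {
#'         outer_cv_results
#'     }
#' )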
#'
new_nmr_data_analysis_method <- function(train_evaluate_model,
train_evaluate_model_params_inner,
choose_best_inner,
train_evaluate_model_params_outer,
train_evaluate_model_digest_outer) {
out <- list(
train_evaluate_model = train_evaluate_model,
train_evaluate_model_params_inner = train_evaluate_model_params_inner,
choose_best_inner = choose_best_inner,
train_evaluate_model_params_outer = train_evaluate_model_params_outer,
train_evaluate_model_digest_outer = train_evaluate_model_digest_outer
)
class(out) <- "nmr_data_analysis_method"
out
}
#' Permutation test
#'
#' Permute the response column and fit models using the settings from an nmr_data_analysis_method
#'
#' @param nPerm number of permutations
#'
#' @inheritParams nmr_data_analysis
#'
#' @return A one-column matrix with the test statistic (the mean outer cross-validation AUC) of each permuted model
#' @name permutation_test_model
#' @export
#' @examples
#' # Data analysis for a table of integrated peaks
#'
#' ## Generate an artificial nmr_dataset_peak_table:
#' ### Generate artificial metadata:
#' num_samples <- 32 # use an even number in this example
#' num_peaks <- 20
#' metadata <- data.frame(
#' NMRExperiment = as.character(1:num_samples),
#' Condition = rep(c("A", "B"), times = num_samples / 2)
#' )
#'
#' ### The matrix with peaks
#' peak_means <- runif(n = num_peaks, min = 300, max = 600)
#' peak_sd <- runif(n = num_peaks, min = 30, max = 60)
#' peak_matrix <- mapply(function(mu, sd) rnorm(num_samples, mu, sd),
#' mu = peak_means, sd = peak_sd
#' )
#' colnames(peak_matrix) <- paste0("Peak", 1:num_peaks)
#'
#' ## Artificial differences depending on the condition:
#' peak_matrix[metadata$Condition == "A", "Peak2"] <-
#' peak_matrix[metadata$Condition == "A", "Peak2"] + 70
#'
#' peak_matrix[metadata$Condition == "A", "Peak6"] <-
#' peak_matrix[metadata$Condition == "A", "Peak6"] - 60
#'
#' ### The nmr_dataset_peak_table
#' peak_table <- new_nmr_dataset_peak_table(
#' peak_table = peak_matrix,
#' metadata = list(external = metadata)
#' )
#'
#' methodology <- plsda_auroc_vip_method(ncomp = 3)
#' model <- nmr_data_analysis(
#' peak_table,
#' y_column = "Condition",
#' identity_column = NULL,
#' external_val = list(iterations = 3, test_size = 0.25),
#' internal_val = list(iterations = 3, test_size = 0.25),
#' data_analysis_method = methodology
#' )
#'
#' p <- permutation_test_model(peak_table,
#' y_column = "Condition",
#' identity_column = NULL,
#' external_val = list(iterations = 3, test_size = 0.25),
#' internal_val = list(iterations = 3, test_size = 0.25),
#' data_analysis_method = methodology,
#' nPerm = 10
#' )
#'
permutation_test_model <- function(dataset, y_column, identity_column, external_val, internal_val,
data_analysis_method, nPerm = 50) {
permMatrix <- BiocParallel::bplapply(
seq_len(nPerm),
function(p, dataset, y_column, identity_column, external_val, internal_val, data_analysis_method) {
dataset_perm <- dataset
y_all <- nmr_meta_get_column(dataset, column = y_column)
YPerm <- sample(y_all)
dataset_perm[["metadata"]][["external"]][[y_column]] <- YPerm
# print(sum(y_test!=y_all))
permMod <- nmr_data_analysis(
dataset_perm,
y_column = y_column,
identity_column = identity_column,
external_val = external_val,
internal_val = internal_val,
data_analysis_method = data_analysis_method,
.enable_parallel = FALSE
)
# We use the mean of the AUCs of all the outer CVs as the test statistic
test_stat <- mean(permMod$outer_cv_results_digested$auroc$auc)
test_stat
},
dataset = dataset,
y_column = y_column,
identity_column = identity_column,
external_val = external_val,
internal_val = internal_val,
data_analysis_method = data_analysis_method
)
permMatrix <- matrix(unlist(permMatrix), ncol = 1)
return(permMatrix)
}
#' Permutation test plot
#'
#' Plot the permutation test using the actual model and the permuted models
#'
#' @param nmr_data_analysis_model A nmr_data_analysis_model
#' @param permMatrix A permutation fitness outcome from permutation_test_model
#' @param xlab optional x-label
#' @param xlim optional x-range
#' @param ylim optional y-range
#' @param breaks optional custom histogram breaks (defaults to "Sturges")
#' @param main optional plot title (or TRUE for autoname)
#'
#' @importFrom graphics axis hist lines text
#' @importFrom stats median pt sd ecdf na.omit
#' @return A plot with the comparison between the actual model versus the permuted models
#' @name permutation_test_plot
#' @export
#' @examples
#' # Data analysis for a table of integrated peaks
#'
#' ## Generate an artificial nmr_dataset_peak_table:
#' ### Generate artificial metadata:
#' num_samples <- 32 # use an even number in this example
#' num_peaks <- 20
#' metadata <- data.frame(
#' NMRExperiment = as.character(1:num_samples),
#' Condition = rep(c("A", "B"), times = num_samples / 2)
#' )
#'
#' ### The matrix with peaks
#' peak_means <- runif(n = num_peaks, min = 300, max = 600)
#' peak_sd <- runif(n = num_peaks, min = 30, max = 60)
#' peak_matrix <- mapply(function(mu, sd) rnorm(num_samples, mu, sd),
#' mu = peak_means, sd = peak_sd
#' )
#' colnames(peak_matrix) <- paste0("Peak", 1:num_peaks)
#'
#' ## Artificial differences depending on the condition:
#' peak_matrix[metadata$Condition == "A", "Peak2"] <-
#' peak_matrix[metadata$Condition == "A", "Peak2"] + 70
#'
#' peak_matrix[metadata$Condition == "A", "Peak6"] <-
#' peak_matrix[metadata$Condition == "A", "Peak6"] - 60
#'
#' ### The nmr_dataset_peak_table
#' peak_table <- new_nmr_dataset_peak_table(
#' peak_table = peak_matrix,
#' metadata = list(external = metadata)
#' )
#'
#' methodology <- plsda_auroc_vip_method(ncomp = 3)
#' model <- nmr_data_analysis(
#' peak_table,
#' y_column = "Condition",
#' identity_column = NULL,
#' external_val = list(iterations = 3, test_size = 0.25),
#' internal_val = list(iterations = 3, test_size = 0.25),
#' data_analysis_method = methodology
#' )
#'
#' p <- permutation_test_model(peak_table,
#' y_column = "Condition",
#' identity_column = NULL,
#' external_val = list(iterations = 3, test_size = 0.25),
#' internal_val = list(iterations = 3, test_size = 0.25),
#' data_analysis_method = methodology,
#' nPerm = 10
#' )
#'
#' permutation_test_plot(model, p)
#'
permutation_test_plot <- function(nmr_data_analysis_model,
permMatrix,
xlab = "AUCs",
xlim,
ylim = NULL,
breaks = "Sturges",
main = "Permutation test") {
h0 <- permMatrix[, 1]
if (missing(xlim)) {
xlim <- c(0, 1)
}
h <- hist(permMatrix, breaks, xlim = xlim, ylim = ylim, axes = FALSE, xlab = xlab, freq = FALSE, main = main)
h2 <- max(h$density) * .75
axis(1, pos = 0)
axis(2, pos = 0, las = 1)
model_auc <- mean(nmr_data_analysis_model$outer_cv_results_digested$auroc$auc)
lines(rep(model_auc, 2), c(0, h2))
p <- ecdf(h0)(model_auc) # Empirical
# p1Stud=pt((model_auc-(mean(h0)/2))/sd(h0/2),(length(h0)-1)) # Students
# warning: sometimes the auc of the permutation is NaN
h0_median <- apply(permMatrix, 2, function(x) median(na.omit(x)))
pP <- ifelse(model_auc < h0_median, p, 1 - p)
if (pP < 1 / length(h0)) {
text(model_auc, h2, pos = 2, labels = paste("p<", signif(1 / length(h0), 4), sep = ""))
} else {
text(model_auc, h2, pos = 2, labels = paste("p=", signif(pP, 4), sep = ""))
}
}
#' Models stability plot
#'
#' Plot stability among models of the external cross validation
#'
#' @param model A nmr_data_analysis_model
#'
#' @return A plot of models stability
#' @name models_stability_plot_plsda
#' @export
#' @examples
#' # Data analysis for a table of integrated peaks
#'
#' ## Generate an artificial nmr_dataset_peak_table:
#' ### Generate artificial metadata:
#' num_samples <- 32 # use an even number in this example
#' num_peaks <- 20
#' metadata <- data.frame(
#' NMRExperiment = as.character(1:num_samples),
#' Condition = rep(c("A", "B"), times = num_samples / 2)
#' )
#'
#' ### The matrix with peaks
#' peak_means <- runif(n = num_peaks, min = 300, max = 600)
#' peak_sd <- runif(n = num_peaks, min = 30, max = 60)
#' peak_matrix <- mapply(function(mu, sd) rnorm(num_samples, mu, sd),
#' mu = peak_means, sd = peak_sd
#' )
#' colnames(peak_matrix) <- paste0("Peak", 1:num_peaks)
#'
#' ## Artificial differences depending on the condition:
#' peak_matrix[metadata$Condition == "A", "Peak2"] <-
#' peak_matrix[metadata$Condition == "A", "Peak2"] + 70
#'
#' peak_matrix[metadata$Condition == "A", "Peak6"] <-
#' peak_matrix[metadata$Condition == "A", "Peak6"] - 60
#'
#' ### The nmr_dataset_peak_table
#' peak_table <- new_nmr_dataset_peak_table(
#' peak_table = peak_matrix,
#' metadata = list(external = metadata)
#' )
#'
#' methodology <- plsda_auroc_vip_method(ncomp = 3)
#' model <- nmr_data_analysis(
#' peak_table,
#' y_column = "Condition",
#' identity_column = NULL,
#' external_val = list(iterations = 3, test_size = 0.25),
#' internal_val = list(iterations = 3, test_size = 0.25),
#' data_analysis_method = methodology
#' )
#'
#' # models_stability_plot_plsda(model)
#'
models_stability_plot_plsda <- function(model) {
# Loadings of the models
ex_n <- length(model$outer_cv_results)
max_ncomp <- 0
for (n in seq_len(ex_n)) {
if (model$outer_cv_results[[n]]$model$ncomp > max_ncomp) {
max_ncomp <- model$outer_cv_results[[n]]$model$ncomp
}
}
loadings_sp <-
matrix(nrow = ex_n * max_ncomp, ncol = ex_n * max_ncomp)
labelsX <- list()
for (n in seq_len(max_ncomp)) {
for (m in seq_len(max_ncomp)) {
for (i in seq_len(ex_n)) {
for (j in seq_len(ex_n)) {
tryCatch(
{
loadings_sp[(n - 1) * ex_n + i, (m - 1) * ex_n + j] <-
(
model$outer_cv_results[[i]]$model$loadings$X[, n] %*% model$outer_cv_results[[j]]$model$loadings$X[, m]
)[1, 1]
labelsX[[(n - 1) * ex_n + i]] <-
paste("M", i, "- LV", n)
},
error = function(e) {
}
)
}
}
}
}
# deleting rows and cols that are all NA
loadings_sp <-
loadings_sp[, colSums(is.na(loadings_sp)) != nrow(loadings_sp)]
loadings_sp <-
loadings_sp[rowSums(is.na(loadings_sp)) != ncol(loadings_sp), ]
# rotate results
loadings_sp <- t(loadings_sp[nrow(loadings_sp):1, , drop = FALSE])
melted_loadings_sp <- reshape2::melt(loadings_sp)
ggplot2::ggplot(melted_loadings_sp, ggplot2::aes(
x = .data[["Var1"]], y = .data[["Var2"]], fill =
.data[["value"]]
)) +
ggplot2::geom_tile() +
ggplot2::coord_equal() +
ggplot2::theme_bw() +
ggplot2::scale_fill_distiller(
palette = "Blues",
direction = 1,
na.value = "white"
) +
ggplot2::guides(fill = "none") + # removing legend for `fill`
ggplot2::labs(title = "Stability among models") + # using a title instead
ggplot2::geom_text(
ggplot2::aes(
label = round(.data[["value"]], digits = 2),
color = ifelse(.data[["value"]] > 0.5, 1, 0)
),
size = 2.5,
show.legend = FALSE
) +
ggplot2::scale_x_discrete(limits = unlist(labelsX)) +
ggplot2::scale_y_discrete(limits = rev(unlist(labelsX))) +
ggplot2::labs(
x = "Model - Latent variable",
y = "Model - Latent variable"
) +
ggplot2::theme(
axis.text.x = ggplot2::element_text(
angle = 45,
vjust = 1,
hjust = 1
),
text = ggplot2::element_text(size = 10)
)
}
#' Models stability plot
#'
#' Plot stability among models of the external cross validation
#'
#' @param bp_results bp_kfold_VIP_analysis results
#'
#' @return A plot of models stability
#' @name models_stability_plot_bootstrap
#' @export
#' @examples
#' # Data analysis for a table of integrated peaks
#'
#' ## Generate an artificial nmr_dataset_peak_table:
#' ### Generate artificial metadata:
#' num_samples <- 64 # use an even number in this example
#' num_peaks <- 20
#' metadata <- data.frame(
#' NMRExperiment = as.character(1:num_samples),
#' Condition = rep(c("A", "B"), times = num_samples / 2)
#' )
#'
#' ### The matrix with peaks
#' peak_means <- runif(n = num_peaks, min = 300, max = 600)
#' peak_sd <- runif(n = num_peaks, min = 30, max = 60)
#' peak_matrix <- mapply(function(mu, sd) rnorm(num_samples, mu, sd),
#' mu = peak_means, sd = peak_sd
#' )
#' colnames(peak_matrix) <- paste0("Peak", 1:num_peaks)
#'
#' ## Artificial differences depending on the condition:
#' peak_matrix[metadata$Condition == "A", "Peak2"] <-
#' peak_matrix[metadata$Condition == "A", "Peak2"] + 70
#'
#' peak_matrix[metadata$Condition == "A", "Peak6"] <-
#' peak_matrix[metadata$Condition == "A", "Peak6"] - 60
#'
#' ### The nmr_dataset_peak_table
#' peak_table <- new_nmr_dataset_peak_table(
#' peak_table = peak_matrix,
#' metadata = list(external = metadata)
#' )
#'
#' ## We will use the bootstrap and permutation method for VIP selection
#' ## in a k-fold cross validation
#' # bp_results <- bp_kfold_VIP_analysis(peak_table, # Data to be analyzed
#' # y_column = "Condition", # Label
#' # k = 3,
#' # nbootstrap = 10)
#'
#' # message("Selected VIPs are: ", bp_results$importarn_vips)
#'
#' # models_stability_plot_bootstrap(bp_results)
#'
models_stability_plot_bootstrap <- function(bp_results) {
# Loadings of the models
n_models <- length(bp_results$kfold_results)
max_ncomp <- bp_results$kfold_results[[1]]$general_model$ncomp
loadings_sp <-
matrix(nrow = n_models * max_ncomp, ncol = n_models * max_ncomp)
labelsX <- list()
for (n in seq_len(max_ncomp)) {
for (m in seq_len(max_ncomp)) {
for (i in seq_len(n_models)) {
for (j in seq_len(n_models)) {
tryCatch(
{
loadings_sp[(n - 1) * n_models + i, (m - 1) * n_models + j] <-
(
bp_results$kfold_results[[i]]$general_model$loadings$X[, n] %*% bp_results$kfold_results[[j]]$general_model$loadings$X[, m]
)[1, 1]
labelsX[[(n - 1) * n_models + i]] <-
paste("M", i, "- LV", n)
},
error = function(e) {
}
)
}
}
}
}
# deleting rows and cols that are all NA
loadings_sp <-
loadings_sp[, colSums(is.na(loadings_sp)) != nrow(loadings_sp)]
loadings_sp <-
loadings_sp[rowSums(is.na(loadings_sp)) != ncol(loadings_sp), ]
# rotate results
loadings_sp <- t(loadings_sp[nrow(loadings_sp):1, , drop = FALSE])
melted_loadings_sp <- reshape2::melt(loadings_sp)
ggplot2::ggplot(melted_loadings_sp, ggplot2::aes(
x = .data[["Var1"]], y = .data[["Var2"]], fill =
.data[["value"]]
)) +
ggplot2::geom_tile() +
ggplot2::coord_equal() +
ggplot2::theme_bw() +
ggplot2::scale_fill_distiller(
palette = "Blues",
direction = 1,
na.value = "white"
) +
ggplot2::guides(fill = "none") + # removing legend for `fill`
ggplot2::labs(title = "Stability among models") + # using a title instead
ggplot2::geom_text(
ggplot2::aes(
label = round(.data[["value"]], digits = 2),
color = ifelse(.data[["value"]] > 0.5, 1, 0)
),
size = 2.5,
show.legend = FALSE
) +
ggplot2::scale_x_discrete(limits = unlist(labelsX)) +
ggplot2::scale_y_discrete(limits = rev(unlist(labelsX))) +
ggplot2::labs(
x = "Model - Latent variable",
y = "Model - Latent variable"
) +
ggplot2::theme(
axis.text.x = ggplot2::element_text(
angle = 45,
vjust = 1,
hjust = 1
),
text = ggplot2::element_text(size = 10)
)
}
#' Bootstrap plot predictions
#'
#' @param bp_results bp_kfold_VIP_analysis results
#' @param dataset An [nmr_dataset_family] object
#' @param y_column A string with the name of the y column (present in the
#' metadata of the dataset)
#' @param plot A boolean that indicates whether the results are plotted
#'
#' @name plot_bootstrap_multimodel
#' @return A plot of the results or a ggplot object
#' @importFrom stats predict
#' @importFrom mixOmics mixOmics
#' @export
#' @examples
#' # Data analysis for a table of integrated peaks
#'
#' ## Generate an artificial nmr_dataset_peak_table:
#' ### Generate artificial metadata:
#' num_samples <- 64 # use an even number in this example
#' num_peaks <- 20
#' metadata <- data.frame(
#' NMRExperiment = as.character(1:num_samples),
#' Condition = rep(c("A", "B"), times = num_samples / 2)
#' )
#'
#' ### The matrix with peaks
#' peak_means <- runif(n = num_peaks, min = 300, max = 600)
#' peak_sd <- runif(n = num_peaks, min = 30, max = 60)
#' peak_matrix <- mapply(function(mu, sd) rnorm(num_samples, mu, sd),
#' mu = peak_means, sd = peak_sd
#' )
#' colnames(peak_matrix) <- paste0("Peak", 1:num_peaks)
#'
#' ## Artificial differences depending on the condition:
#' peak_matrix[metadata$Condition == "A", "Peak2"] <-
#' peak_matrix[metadata$Condition == "A", "Peak2"] + 70
#'
#' peak_matrix[metadata$Condition == "A", "Peak6"] <-
#' peak_matrix[metadata$Condition == "A", "Peak6"] - 60
#'
#' ### The nmr_dataset_peak_table
#' peak_table <- new_nmr_dataset_peak_table(
#' peak_table = peak_matrix,
#' metadata = list(external = metadata)
#' )
#'
#' ## We will use the bootstrap and permutation method for VIP selection
#' ## in a k-fold cross validation
#' # bp_results <- bp_kfold_VIP_analysis(peak_table, # Data to be analyzed
#' # y_column = "Condition", # Label
#' # k = 3,
#' # nbootstrap = 10)
#'
#' # message("Selected VIPs are: ", bp_results$importarn_vips)
#'
#' # plot_bootstrap_multimodel(bp_results, peak_table, "Condition")
#'
plot_bootstrap_multimodel <- function(bp_results, dataset, y_column, plot = TRUE) {
n_models <- length(bp_results$kfold_results)
ncomp <- bp_results$kfold_results[[1]]$general_model$ncomp
# Extract data and split for train and test
x_all <- dataset$peak_table
y_all <- nmr_meta_get_column(dataset, column = y_column)
te_data <- data.frame()
for (i in seq_len(n_models)) {
# Predictions of test set
predictions <- predict(bp_results$kfold_results[[i]]$general_model, newdata = x_all[-bp_results$kfold_index[[i]], , drop = FALSE])
# Individuals plot
if (ncomp == 1) {
te_data <- rbind(te_data, data.frame(
x = predictions$variates[, 1],
label = paste("test ", y_all[-bp_results$kfold_index[[i]]])
))
} else {
te_y <- predictions$variates[, 2]
te_data <- rbind(te_data, data.frame(
x = predictions$variates[, 1],
y = te_y,
label = y_all[-bp_results$kfold_index[[i]]],
group = "test "
))
}
}
# Individuals plot
if (ncomp == 1) {
# This is needed if the model only has one component
plsda_plot <- ggplot2::ggplot(data = te_data, ggplot2::aes(.data[["x"]], fill = .data[["label"]])) +
ggplot2::geom_histogram(
alpha = .5, bins = 10,
position = "identity"
) +
ggplot2::ggtitle("PLS-DA") +
ggplot2::labs(x = "Latent variable 1") +
ggplot2::theme_bw()
} else {
plsda_plot <- ggplot2::ggplot(
data = te_data,
ggplot2::aes(
shape = .data[["group"]],
col = .data[["label"]]
)
) +
ggplot2::geom_hline(
yintercept = 0, linetype = "dashed",
color = "black", size = 0.5
) +
ggplot2::geom_vline(
xintercept = 0, linetype = "dashed",
color = "black", size = 0.5
) +
ggplot2::geom_point(ggplot2::aes(.data[["x"]], .data[["y"]]), size = 1.5) +
ggplot2::ggtitle("PLS-DA") +
ggplot2::labs(
y = "Latent variable 2",
x = "Latent variable 1"
) +
ggplot2::theme_bw()
}
if (plot) {
plsda_plot
} else {
return(plsda_plot)
}
}