Nothing
#' Analyze data with Sparse Partial Least Squares Discriminant Analysis
#' (sPLS-DA).
#'
#' @param data A matrix or data frame containing the variables. Columns not
#' specified by \code{group_col} or \code{group_col2} are assumed to be continuous
#' variables for analysis.
#' @param group_col A string specifying the column name that contains the first group
#' information. If \code{group_col2} is not provided, an overall analysis will
#' be performed.
#' @param group_col2 A string specifying the second grouping column. Default is
#' \code{NULL}.
#' @param colors A vector of colors for the groups or treatments. If
#' \code{NULL}, a random palette (using \code{rainbow}) is generated based on
#' the number of groups.
#' @param pdf_title A string specifying the file name for saving the PDF output.
#' Default is \code{NULL} which generates figures in the current graphics device.
#' @param ellipse Logical. Whether to draw a 95\% confidence ellipse on the
#' figures. Default is \code{FALSE}.
#' @param bg Logical. Whether to draw the prediction background in the figures.
#' Default is \code{FALSE}.
#' @param conf_mat Logical. Whether to print the confusion matrix for the
#' classifications. Default is \code{FALSE}.
#' @param var_num Numeric. The number of variables to be used in the PLS-DA model.
#' @param cv_opt Character. Option for cross-validation method: either
#' "loocv" or "Mfold". Default is \code{NULL}.
#' @param fold_num Numeric. The number of folds to use if \code{cv_opt} is
#' "Mfold". Default is 5.
#' @param scale Character. Option for data transformation; if set to
#' \code{"log2"}, a log2 transformation is applied to the continuous
#' variables. Default is \code{NULL}.
#' @param comp_num Numeric. The number of components to calculate in the sPLS-DA
#' model. Default is 2.
#' @param pch_values A vector of integers specifying the plotting characters
#' (pch values) to be used in the plots.
#' @param style Character. If set to \code{"3D"} or \code{"3d"} and
#' \code{comp_num} equals 3, a 3D plot is generated using the
#' \code{plot3D} package. Default is \code{NULL}.
#' @param roc Logical. Whether to compute and plot the ROC curve for the model.
#' Default is \code{FALSE}.
#' @param verbose A logical value indicating whether to print additional
#' informational output to the console. When \code{TRUE}, the function will
#' display progress messages, and intermediate results when
#' \code{FALSE} (the default), it runs quietly.
#' @param seed An integer specifying the seed for reproducibility (default is 123).
#' @description
#' This function conducts Sparse Partial Least Squares Discriminant Analysis
#' (sPLS-DA) on the provided data. It uses the specified \code{group_col} (and
#' optionally \code{group_col2}) to define class labels while assuming the remaining
#' columns contain continuous variables. The function supports a log2
#' transformation via the \code{scale} parameter and generates a series of plots,
#' including classification plots, scree plots, loadings plots, and VIP score
#' plots. Optionally, ROC curves are produced when \code{roc} is \code{TRUE}.
#' Additionally, cross-validation is supported via LOOCV or Mfold methods. When
#' both \code{group_col} and \code{group_col2} are provided and differ, the function
#' analyzes each treatment level separately.
#'
#' @return Plots consisting of the classification figures, component figures
#' with Variable of Importance in Projection (VIP) scores, and classifications
#' based on VIP scores greater than 1. ROC curves and confusion matrices are also
#' produced if requested.
#' @details
#' When \code{verbose} is set to \code{TRUE}, additional diagnostic plots (e.g., VIP plots, ROC Plots, Cross-Validation Plots)
#' are printed to the console. These plots provide extra insight into the model's performance
#' but can be suppressed by keeping \code{verbose = FALSE}.
#'
#' @examples
#' # Loading Sample Data
#' data_df <- ExampleData1[,-c(3)]
#' data_df <- dplyr::filter(data_df, Group != "ND", Treatment != "Unstimulated")
#'
#' cyt_splsda(data_df, pdf_title = NULL,
#' colors = c("black", "purple"), bg = FALSE, scale = "log2",
#' conf_mat = FALSE, var_num = 25, cv_opt = NULL, comp_num = 2,
#' pch_values = c(16, 4), style = NULL, ellipse = TRUE,
#' group_col = "Group", group_col2 = "Treatment", roc = FALSE, verbose = FALSE)
#'
#' @export
#' @importFrom mixOmics splsda background.predict perf vip auroc plotIndiv plotLoadings
#' @import ggplot2
#' @importFrom plot3D scatter3D
#' @importFrom reshape2 melt
#' @importFrom caret confusionMatrix
cyt_splsda <- function(data, group_col = NULL, group_col2 = NULL, colors = NULL,
pdf_title, ellipse = FALSE, bg = FALSE, conf_mat = FALSE,
var_num, cv_opt = NULL, fold_num = 5, scale = NULL,
comp_num = 2, pch_values, style = NULL, roc = FALSE,
verbose = FALSE, seed = 123) {
# If one factor is missing, use the provided column for
# both grouping and treatment.
if (!is.null(group_col) && is.null(group_col2)) {
if (verbose) cat("No second grouping column provided; performing overall analysis.\n")
group_col2 <- group_col
}
if(is.null(group_col) && !is.null(group_col2)) {
stop("No first grouping column provided; must provide the first grouping column.")
}
if (is.null(group_col) && is.null(group_col2)) {
stop("At least one grouping column must be provided.")
}
# Optionally apply log2 transformation
if (!is.null(scale) && scale == "log2") {
data <- data.frame(
data[, c(group_col, group_col2)],
log2(data[, !(names(data) %in% c(group_col, group_col2))])
)
if (verbose) cat("Results based on log2 transformation:\n")
} else if (is.null(scale)) {
if (verbose) cat("Results based on no transformation:\n")
}
# Extract the grouping variable from your data (using group_col or group_col2)
# Extract grouping variable(s)
if (group_col == group_col2) {
group_vec <- data[[group_col]]
} else {
# Combine the two grouping columns into a composite factor
group_vec <- data[[group_col2]]
}
# Now perform the check for pch_values:
if (is.null(pch_values)) {
stop("Please enter a vector of pch values, e.g. c(16, 4).")
}
if (group_col == group_col2) {
if (length(pch_values) < length(unique(data[[group_col]]))) {
stop("Please ensure the number of pch values provided (", length(pch_values),
") is at least equal to the number of unique groups (", length(unique(data[[group_col]])),
") from the grouping column.")
}
} else {
# When group_col and group_col2 differ, use the levels of group_col for pch
if (length(pch_values) < length(unique(data[[group_col]]))) {
stop("Please ensure the number of pch values provided (", length(pch_values),
") is at least equal to the number of unique groups (", length(unique(data[[group_col]])),
") from the first grouping column.")
}
}
# Generate a color palette if not provided (based on the
# grouping variable levels in the entire dataset)
if (is.null(colors)) {
num_groups <- length(unique(data[[group_col]]))
colors <- rainbow(num_groups)
}
if(!is.null(pdf_title)){
pdf(file = pdf_title, width = 8.5, height = 8)
}
# Case 1: Only one factor provided (both columns are the same)
if (group_col == group_col2) {
overall_analysis <- "Overall Analysis"
# Remove the factor column from predictors and keep only numeric columns
the_data_df <- data[, !(names(data) %in% c(group_col))]
the_data_df <- the_data_df[, sapply(the_data_df, is.numeric)]
the_groups <- as.vector(data[[group_col]])
if (length(unique(the_groups)) < 2) {
stop("The grouping variable must have at least two levels for PLS-DA.
Please provide an appropriate grouping column.")
}
cytokine_splsda <- mixOmics::splsda(the_data_df, the_groups,
scale = TRUE, ncomp = comp_num,
keepX = rep(var_num, comp_num)
)
splsda_predict <- predict(cytokine_splsda, the_data_df, dist = "max.dist")
prediction1 <- cbind(original = the_groups, splsda_predict$class$max.dist)
accuracy1 <- (sum(prediction1[, 1] == prediction1[, 3]) /
length(prediction1[, 1]))
acc1 <- 100 * signif(accuracy1, digits = 2)
if(bg){
bg_maxdist <- mixOmics::background.predict(cytokine_splsda,
comp.predicted = 2,
dist = "max.dist",
xlim = c(-15,15),
ylim = c(-15,15),
resolution = 200
)
}
group_factors <- seq_len(length(levels(factor(the_groups))))
plot_args <- list(cytokine_splsda,
ind.names = NA, legend = TRUE, col = colors,
pch = pch_values, pch.levels = group_factors,
title = paste(
overall_analysis, "With Accuracy:",
acc1, "%"
),
legend.title = group_col
)
if (ellipse) plot_args$ellipse <- TRUE
if (bg) plot_args$background <- bg_maxdist
do.call(mixOmics::plotIndiv, plot_args)
if (!is.null(style) && comp_num == 3 && (tolower(style) == "3d")) {
cytokine_scores <- cytokine_splsda$variates$X
plot3D::scatter3D(cytokine_scores[, 1], cytokine_scores[, 2],
cytokine_scores[, 3],
pch = pch_values, col = colors,
xlab = "Component 1", ylab = "Component 2",
zlab = "Component 3",
main = paste("3D Plot:", overall_analysis),
theta = 20, phi = 30, bty = "g", colkey = FALSE
)
}
# If roc = TRUE, compute and plot ROC curve for the overall model
if (roc) {
roc_obj <- mixOmics::auroc(
object = cytokine_splsda, newdata = the_data_df,
outcome.test = the_groups,
plot = TRUE, roc.comp = comp_num,
title = paste0("ROC Curve:", overall_analysis), print = FALSE
)
}
# Cross-validation methods
if (!is.null(cv_opt)) {
if (cv_opt == "loocv") {
set.seed(seed)
loocv_results <- mixOmics::perf(cytokine_splsda, validation = "loo")
loocv_error_rate <- loocv_results$error.rate$overall[
"comp2",
"max.dist"
]
loocv_acc <- 100 * signif(1 - loocv_error_rate, digits = 2)
if (verbose) cat("LOOCV Accuracy: ", paste0(loocv_acc, "%"), "\n")
error_rates <- loocv_results$error.rate$overall[, "max.dist"]
error_df <- as.data.frame(error_rates)
error_df$Component <- rownames(error_df)
error_df <- reshape2::melt(error_df,
id.vars = "Component",
variable.name = "Distance",
value.name = "ErrorRate"
)
a <- ggplot2::ggplot(error_df, ggplot2::aes(x = Component, y = ErrorRate,
color = Distance, group = 1)) +
ggplot2::geom_line() +
ggplot2::geom_point(size = 3) +
ggplot2::labs(title = paste("LOOCV Error Rate:", overall_analysis),
x = "Number of Components", y = "Error Rate") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
ggplot2::scale_color_manual(values = "red", labels = "max.dist")
print(a)
} else if (cv_opt == "Mfold") {
set.seed(seed)
fold_results <- mixOmics::perf(cytokine_splsda,
validation = "Mfold",
folds = fold_num, nrepeat = 1000
)
fold_error_rate <- fold_results$error.rate$overall[
"comp2",
"max.dist"
]
fold_acc <- 100 * signif(1 - fold_error_rate, digits = 2)
if (verbose) cat("Mfold Accuracy: ", paste0(fold_acc, "%"), "\n")
error_rates <- fold_results$error.rate$overall[, "max.dist"]
error_df <- as.data.frame(error_rates)
error_df$Component <- rownames(error_df)
error_df <- reshape2::melt(error_df,
id.vars = "Component",
variable.name = "Distance",
value.name = "ErrorRate"
)
a <- ggplot2::ggplot(error_df, ggplot2::aes(x = Component, y = ErrorRate,
color = Distance, group = 1)) +
ggplot2::geom_line() +
ggplot2::geom_point(size = 3) +
ggplot2::labs(title = paste("Mfold Error Rate:", overall_analysis),
x = "Number of Components", y = "Error Rate") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
ggplot2::scale_color_manual(values = "red", labels = "max.dist")
print(a)
}
}
# Loadings plot for each component
for (comp in 1:comp_num) {
mixOmics::plotLoadings(cytokine_splsda,
comp = comp, contrib = "max", method = "mean",
size.name = 1, size.legend = 1, legend.color = colors,
title = paste("Component", comp, ":", overall_analysis),
size.title = 1, legend.title = group_col
)
}
# VIP scores and plot for PLS-DA with VIP > 1
all_vip_scores <- mixOmics::vip(cytokine_splsda)
for (comp in 1:comp_num) {
vscore <- as.data.frame(all_vip_scores[, comp, drop = FALSE])
vscore$metabo <- rownames(vscore)
vscore$comp <- vscore[, 1]
bar <- vscore[, c("metabo", "comp")]
bar <- bar[order(bar$comp, decreasing = TRUE), ]
a <- ggplot2::ggplot(bar, ggplot2::aes(x = metabo, y = comp)) +
ggplot2::geom_bar(stat = "identity", position = "dodge") +
ggplot2::scale_y_continuous(limits = c(0, max(bar$comp))) +
ggplot2::geom_hline(yintercept = 1, color = "grey") +
ggplot2::scale_x_discrete(limits = factor(bar$metabo)) +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1, size = 15)) +
ggplot2::labs(x = "", y = "VIP score") +
ggplot2::ggtitle(paste("Component", comp)) +
ggplot2::theme(panel.grid = ggplot2::element_blank(),
panel.background = ggplot2::element_rect(color = "black", fill = "transparent"))
print(a)
}
# PLS-DA on VIP > 1: Subset predictors with VIP > 1
condt_variable <- all_vip_scores[, 1] > 1
keep_x <- sum(condt_variable)
the_data_mat <- the_data_df[, condt_variable, drop = FALSE]
if(keep_x < 2){
if(verbose){
warning("Only ", keep_x, " variable has VIP > 1. Skipping VIP > 1 PLS-DA Model.")
}
if (conf_mat == TRUE) {
if (verbose) cat(paste0("Confusion Matrix for PLS-DA Comparison: ", overall_analysis, "\n"))
# Confusion Matrix for main model
cm <- caret::confusionMatrix(
data = as.factor(prediction1[, 3]), # predicted
reference = as.factor(prediction1[, 1]) # actual
)
if(verbose){
print(cm$table)
cat("Accuracy:", signif(cm$overall["Accuracy"], 2), "\n")
# Check if binary or multi-class
if (nlevels(as.factor(prediction1[, 1])) == 2) {
cat("Sensitivity:", signif(cm$byClass["Sensitivity"], 2), "\n")
cat("Specificity:", signif(cm$byClass["Specificity"], 2), "\n")
} else {
cat("\nPer-Class Sensitivity:\n")
print(signif(cm$byClass[, "Sensitivity"], 2))
cat("\nPer-Class Specificity:\n")
print(signif(cm$byClass[, "Specificity"], 2))
macro_sens <- mean(cm$byClass[, "Sensitivity"], na.rm = TRUE)
macro_spec <- mean(cm$byClass[, "Specificity"], na.rm = TRUE)
cat("\nMacro-Averaged Sensitivity:", signif(macro_sens, 2), "\n")
cat("Macro-Averaged Specificity:", signif(macro_spec, 2), "\n")
}
}
}
}
else{
cytokine_splsda2 <- mixOmics::splsda(the_data_mat, the_groups,
scale = TRUE, ncomp = comp_num,
keepX = rep(keep_x, comp_num)
)
splsda_predict2 <- predict(cytokine_splsda2, the_data_mat,
dist = "max.dist"
)
prediction2 <- cbind(original = the_groups, splsda_predict2$class$max.dist)
accuracy2 <- (sum(prediction2[, 1] == prediction2[, 3]) /
length(prediction2[, 1]))
acc2 <- 100 * signif(accuracy2, digits = 2)
# Create a grid of values
if(bg){
bg_maxdist2 <- mixOmics::background.predict(cytokine_splsda2,
comp.predicted = 2,
dist = "max.dist",
xlim = c(-15,15),
ylim = c(-15,15),
resolution = 200
)
}
plot_args2 <- list(cytokine_splsda2,
ind.names = NA, legend = TRUE, col = colors,
pch = pch_values, pch.levels = group_factors,
title = paste(
overall_analysis, "(VIP>1)",
"With Accuracy:", acc2, "%"
),
legend.title = group_col
)
if (ellipse) plot_args2$ellipse <- TRUE
if (bg) plot_args2$background <- bg_maxdist2
do.call(mixOmics::plotIndiv, plot_args2)
if (!is.null(style) && comp_num == 3 && (tolower(style) == "3d")) {
cytokine_scores2 <- cytokine_splsda2$variates$X
plot3D::scatter3D(cytokine_scores2[, 1], cytokine_scores2[, 2],
cytokine_scores2[, 3],
pch = pch_values, col = colors,
xlab = "Component 1", ylab = "Component 2",
zlab = "Component 3",
main = paste("3D Plot:", overall_analysis, "(VIP>1)"),
theta = 20, phi = 30, bty = "g", colkey = FALSE
)
}
# Loadings plot for each component with VIP > 1
for (comp in 1:comp_num) {
mixOmics::plotLoadings(cytokine_splsda2,
comp = comp, contrib = "max", method = "mean",
size.name = 1, size.legend = 1, legend.color = colors,
title = paste("Component", comp, "(VIP > 1):", overall_analysis),
size.title = 1, legend.title = group_col
)
}
if (!is.null(cv_opt)) {
if (cv_opt == "loocv") {
set.seed(seed)
loocv_results2 <- mixOmics::perf(cytokine_splsda2, validation = "loo")
loocv_error_rate2 <- loocv_results2$error.rate$overall[
"comp2",
"max.dist"
]
loocv_acc2 <- 100 * signif(1 - loocv_error_rate2, digits = 2)
if (verbose) cat(paste0("LOOCV Accuracy (VIP>1): ", loocv_acc2, "%\n"))
error_rates2 <- loocv_results2$error.rate$overall[, "max.dist"]
error_df2 <- as.data.frame(error_rates2)
error_df2$Component <- rownames(error_df2)
error_df2 <- reshape2::melt(error_df2,
id.vars = "Component",
variable.name = "Distance",
value.name = "ErrorRate"
)
a <- ggplot2::ggplot(error_df2, ggplot2::aes(x = Component, y = ErrorRate,
color = Distance, group = 1)) +
ggplot2::geom_line() +
ggplot2::geom_point(size = 3) +
ggplot2::labs(title = paste("LOOCV Error Rate (VIP>1):", overall_analysis),
x = "Number of Components", y = "Error Rate") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
ggplot2::scale_color_manual(values = "red", labels = "max.dist")
print(a)
} else if (cv_opt == "Mfold") {
set.seed(seed)
fold_results2 <- mixOmics::perf(cytokine_splsda2,
validation = "Mfold",
folds = fold_num, nrepeat = 1000
)
fold_error_rate2 <- fold_results2$error.rate$overall[
"comp2",
"max.dist"
]
fold_acc2 <- 100 * signif(1 - fold_error_rate2, digits = 2)
if (verbose) cat(paste0("Mfold Accuracy (VIP>1): ", fold_acc2, "%\n"))
error_rates2 <- fold_results2$error.rate$overall[, "max.dist"]
error_df2 <- as.data.frame(error_rates2)
error_df2$Component <- rownames(error_df2)
error_df2 <- reshape2::melt(error_df2,
id.vars = "Component",
variable.name = "Distance",
value.name = "ErrorRate"
)
a <- ggplot2::ggplot(error_df2, ggplot2::aes(x = Component, y = ErrorRate,
color = Distance, group = 1)) +
ggplot2::geom_line() +
ggplot2::geom_point(size = 3) +
ggplot2::labs(title = paste("Mfold Error Rate (VIP>1):", overall_analysis),
x = "Number of Components", y = "Error Rate") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
ggplot2::scale_color_manual(values = "red", labels = "max.dist")
print(a)
}
}
if (conf_mat == TRUE) {
if (verbose) cat(paste0("Confusion Matrix for PLS-DA Comparison: ", overall_analysis, "\n"))
# Confusion Matrix for main model
cm <- caret::confusionMatrix(
data = as.factor(prediction1[, 3]), # predicted
reference = as.factor(prediction1[, 1]) # actual
)
if(verbose){
print(cm$table)
cat("Accuracy:", signif(cm$overall["Accuracy"], 2), "\n")
# Check if binary or multi-class
if (nlevels(as.factor(prediction1[, 1])) == 2) {
cat("Sensitivity:", signif(cm$byClass["Sensitivity"], 2), "\n")
cat("Specificity:", signif(cm$byClass["Specificity"], 2), "\n")
} else {
cat("\nPer-Class Sensitivity:\n")
print(signif(cm$byClass[, "Sensitivity"], 2))
cat("\nPer-Class Specificity:\n")
print(signif(cm$byClass[, "Specificity"], 2))
macro_sens <- mean(cm$byClass[, "Sensitivity"], na.rm = TRUE)
macro_spec <- mean(cm$byClass[, "Specificity"], na.rm = TRUE)
cat("\nMacro-Averaged Sensitivity:", signif(macro_sens, 2), "\n")
cat("Macro-Averaged Specificity:", signif(macro_spec, 2), "\n")
}
}
if (verbose) cat(paste0("Confusion Matrix for PLS-DA Comparison with VIP > 1: ", overall_analysis, "\n"))
# Confusion Matrix for VIP>1 model
cm_vip <- caret::confusionMatrix(
data = as.factor(prediction2[, 3]), # predicted
reference = as.factor(prediction2[, 1]) # actual
)
if (verbose){
print(cm_vip$table)
cat("Accuracy:", signif(cm_vip$overall["Accuracy"], 2), "\n")
if (nlevels(as.factor(prediction2[, 1])) == 2) {
cat("Sensitivity:", signif(cm_vip$byClass["Sensitivity"], 2), "\n")
cat("Specificity:", signif(cm_vip$byClass["Specificity"], 2), "\n")
} else {
cat("\nPer-Class Sensitivity:\n")
print(signif(cm_vip$byClass[, "Sensitivity"], 2))
cat("\nPer-Class Specificity:\n")
print(signif(cm_vip$byClass[, "Specificity"], 2))
macro_sens_vip <- mean(cm_vip$byClass[, "Sensitivity"], na.rm = TRUE)
macro_spec_vip <- mean(cm_vip$byClass[, "Specificity"], na.rm = TRUE)
cat("\nMacro-Averaged Sensitivity:", signif(macro_sens_vip, 2), "\n")
cat("Macro-Averaged Specificity:", signif(macro_spec_vip, 2), "\n")
}
}
}
# If roc = TRUE, compute and plot ROC curve for the overall model of VIP > 1
if (roc) {
roc_obj2 <- mixOmics::auroc(
object = cytokine_splsda2, newdata = the_data_mat,
outcome.test = the_groups,
plot = TRUE, roc.comp = comp_num,
title = paste0("ROC Curve (VIP>1):", overall_analysis), print = FALSE
)
}
}
} else {
# Case 2: Both group and treatment columns are provided and they differ.
levels_vec <- unique(data[[group_col2]])
for (i in seq_along(levels_vec)) {
current_level <- levels_vec[i]
overall_analysis <- current_level
condt <- data[[group_col2]] == current_level
the_data_df <- data[condt, -which(names(data) %in% c(
group_col,
group_col2
))]
the_data_df <- the_data_df[, sapply(the_data_df, is.numeric)]
the_groups <- as.vector(data[condt, group_col])
if (length(unique(the_groups)) < 2) {
stop("The grouping variable must have at least two levels for PLS-DA.
Please provide an appropriate grouping column.")
}
cytokine_splsda <- mixOmics::splsda(the_data_df, the_groups,
scale = TRUE, ncomp = comp_num,
keepX = rep(var_num, comp_num)
)
splsda_predict <- predict(cytokine_splsda, the_data_df,
dist = "max.dist"
)
prediction1 <- cbind(
original = the_groups,
splsda_predict$class$max.dist
)
accuracy1 <- (sum(prediction1[, 1] == prediction1[, 3]) /
length(prediction1[, 1]))
acc1 <- 100 * signif(accuracy1, digits = 2)
# Create a grid of values
if(bg){
bg_maxdist <- mixOmics::background.predict(cytokine_splsda,
comp.predicted = 2,
dist = "max.dist",
xlim = c(-15,15),
ylim = c(-15,15),
resolution = 200
)
}
group_factors <- seq_len(length(levels(factor(the_groups))))
plot_args <- list(cytokine_splsda,
ind.names = NA, legend = TRUE, col = colors,
pch = pch_values[group_factors], pch.levels = pch_values[group_factors],
title = paste(
overall_analysis, "With Accuracy:",
acc1, "%"
),
legend.title = group_col
)
if (ellipse) plot_args$ellipse <- TRUE
if (bg) plot_args$background <- bg_maxdist
do.call(mixOmics::plotIndiv, plot_args)
if (!is.null(style) && comp_num == 3 && (tolower(style) == "3d")) {
cytokine_scores <- cytokine_splsda$variates$X
plot3D::scatter3D(cytokine_scores[, 1], cytokine_scores[, 2],
cytokine_scores[, 3],
pch = pch_values, col = colors,
xlab = "Component 1", ylab = "Component 2",
zlab = "Component 3",
main = paste("3D Plot:", overall_analysis),
theta = 20, phi = 30, bty = "g", colkey = FALSE
)
}
# If roc = TRUE, compute and plot ROC curve for the overall model
if (roc) {
roc_obj <- mixOmics::auroc(
object = cytokine_splsda, newdata = the_data_df, outcome.test =
the_groups,
plot = TRUE, roc.comp = comp_num,
title = paste0("ROC Curve:", overall_analysis), print = FALSE
)
}
if (!is.null(cv_opt)) {
if (cv_opt == "loocv") {
set.seed(seed)
loocv_results <- mixOmics::perf(cytokine_splsda, validation = "loo")
loocv_error_rate <- loocv_results$error.rate$overall[
"comp2",
"max.dist"
]
loocv_acc <- 100 * signif(1 - loocv_error_rate, digits = 2)
if (verbose) cat(paste0(current_level, " LOOCV Accuracy: ", loocv_acc, "%\n"))
error_rates <- loocv_results$error.rate$overall[, "max.dist"]
error_df <- as.data.frame(error_rates)
error_df$Component <- rownames(error_df)
error_df <- reshape2::melt(error_df,
id.vars = "Component",
variable.name = "Distance",
value.name = "ErrorRate"
)
a <- ggplot2::ggplot(error_df, ggplot2::aes(x = Component, y = ErrorRate,
color = Distance, group = 1)) +
ggplot2::geom_line() +
ggplot2::geom_point(size = 3) +
ggplot2::labs(title = paste("LOOCV Error Rate:", overall_analysis),
x = "Number of Components", y = "Error Rate") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
ggplot2::scale_color_manual(values = "red", labels = "max.dist")
print(a)
} else if (cv_opt == "Mfold") {
set.seed(seed)
fold_results <- mixOmics::perf(cytokine_splsda,
validation = "Mfold",
folds = fold_num, nrepeat = 1000
)
fold_error_rate <- fold_results$error.rate$overall[
"comp2",
"max.dist"
]
fold_acc <- 100 * signif(1 - fold_error_rate, digits = 2)
if (verbose) cat(paste0(current_level, " Mfold Accuracy: ", fold_acc, "%\n"))
error_rates <- fold_results$error.rate$overall[, "max.dist"]
error_df <- as.data.frame(error_rates)
error_df$Component <- rownames(error_df)
error_df <- reshape2::melt(error_df,
id.vars = "Component",
variable.name = "Distance",
value.name = "ErrorRate"
)
a <- ggplot2::ggplot(error_df, ggplot2::aes(x = Component, y = ErrorRate,
color = Distance, group = 1)) +
ggplot2::geom_line() +
ggplot2::geom_point(size = 3) +
ggplot2::labs(title = paste("Mfold Error Rate:", overall_analysis),
x = "Number of Components", y = "Error Rate") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
ggplot2::scale_color_manual(values = "red", labels = "max.dist")
print(a)
}
}
# Loadings plot for each component
for (comp in 1:comp_num) {
mixOmics::plotLoadings(cytokine_splsda,
comp = comp, contrib = "max", method = "mean",
size.name = 1, size.legend = 1, legend.color = colors,
title = paste("Component", comp, ":", overall_analysis),
size.title = 1, legend.title = group_col
)
}
all_vip_scores <- mixOmics::vip(cytokine_splsda)
for (comp in 1:comp_num) {
vscore <- as.data.frame(all_vip_scores[, comp, drop = FALSE])
vscore$metabo <- rownames(vscore)
vscore$comp <- vscore[, 1]
bar <- vscore[, c("metabo", "comp")]
bar <- bar[order(bar$comp, decreasing = TRUE), ]
a <- ggplot2::ggplot(bar, aes(x = metabo, y = comp)) +
ggplot2::geom_bar(stat = "identity", position = "dodge") +
ggplot2::scale_y_continuous(limits = c(0, max(bar$comp))) +
ggplot2::geom_hline(yintercept = 1, color = "grey") +
ggplot2::scale_x_discrete(limits = factor(bar$metabo)) +
ggplot2::theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 15)) +
ggplot2::labs(x = "", y = "VIP score") +
ggplot2::ggtitle(paste("Component", comp)) +
ggplot2::theme(
panel.grid = element_blank(),
panel.background = element_rect(
color = "black",
fill = "transparent"
)
)
print(a)
}
condt_variable <- all_vip_scores[, 1] > 1
keep_x <- sum(condt_variable)
the_data_mat <- the_data_df[, condt_variable, drop = FALSE]
cytokine_splsda2 <- mixOmics::splsda(the_data_mat, the_groups,
scale = TRUE, ncomp = comp_num,
keepX = rep(keep_x, comp_num)
)
splsda_predict2 <- predict(cytokine_splsda2, the_data_mat,
dist = "max.dist"
)
prediction2 <- cbind(
original = the_groups,
splsda_predict2$class$max.dist
)
accuracy2 <- (sum(prediction2[, 1] == prediction2[, 3]) /
length(prediction2[, 1]))
acc2 <- 100 * signif(accuracy2, digits = 2)
# Create a grid of values
if(bg){
bg_maxdist2 <- mixOmics::background.predict(cytokine_splsda2,
comp.predicted = 2,
dist = "max.dist",
xlim = c(-15,15),
ylim = c(-15,15),
resolution = 200
)
}
plot_args2 <- list(cytokine_splsda2,
ind.names = NA, legend = TRUE, col = colors,
pch = pch_values[group_factors], pch.levels = pch_values[group_factors],
title = paste(
overall_analysis, "(VIP>1)",
"With Accuracy:", acc2, "%"
),
legend.title = group_col
)
if (ellipse) plot_args2$ellipse <- TRUE
if (bg) plot_args2$background <- bg_maxdist2
do.call(mixOmics::plotIndiv, plot_args2)
if (!is.null(style) && comp_num == 3 && (tolower(style) == "3d")) {
cytokine_scores2 <- cytokine_splsda2$variates$X
plot3D::scatter3D(cytokine_scores2[, 1], cytokine_scores2[, 2],
cytokine_scores2[, 3],
pch = pch_values, col = colors,
xlab = "Component 1", ylab = "Component 2",
zlab = "Component 3",
main = paste("3D Plot:", overall_analysis, "(VIP>1)"),
theta = 20, phi = 30, bty = "g", colkey = FALSE
)
}
# Loadings plot for each component with VIP > 1
# Loadings plot for each component
for (comp in 1:comp_num) {
mixOmics::plotLoadings(cytokine_splsda2,
comp = comp, contrib = "max", method = "mean",
size.name = 1, size.legend = 1, legend.color = colors,
title = paste("Component", comp, "(VIP > 1):", overall_analysis),
size.title = 1, legend.title = group_col
)
}
if (roc) {
roc_obj2 <- mixOmics::auroc(
object = cytokine_splsda2, newdata = the_data_mat,
outcome.test = the_groups,
plot = TRUE, roc.comp = comp_num,
title = paste0("ROC Curve (VIP>1):", overall_analysis), print = FALSE
)
}
if (!is.null(cv_opt)) {
if (cv_opt == "loocv") {
set.seed(seed)
loocv_results2 <- mixOmics::perf(cytokine_splsda2, validation = "loo")
loocv_error_rate2 <- loocv_results2$error.rate$overall[
"comp2",
"max.dist"
]
loocv_acc2 <- 100 * signif(1 - loocv_error_rate2, digits = 2)
if(verbose) cat(paste0(current_level, " LOOCV Accuracy (VIP>1): ", loocv_acc2, "%\n"))
error_rates2 <- loocv_results2$error.rate$overall[, "max.dist"]
error_df2 <- as.data.frame(error_rates2)
error_df2$Component <- rownames(error_df2)
error_df2 <- reshape2::melt(error_df2,
id.vars = "Component",
variable.name = "Distance",
value.name = "ErrorRate"
)
a <- ggplot2::ggplot(error_df2, aes(
x = Component, y = ErrorRate,
color = Distance, group = 1
)) +
ggplot2::geom_line() +
ggplot2::geom_point(size = 3) +
ggplot2::labs(
title = paste("LOOCV Error Rate (VIP>1):", overall_analysis),
x = "Number of Components",
y = "Error Rate"
) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_color_manual(values = "red", labels = "max.dist")
print(a)
} else if (cv_opt == "Mfold") {
set.seed(seed)
fold_results2 <- mixOmics::perf(cytokine_splsda2,
validation = "Mfold",
folds = fold_num, nrepeat = 1000
)
fold_error_rate2 <- fold_results2$error.rate$overall[
"comp2",
"max.dist"
]
fold_acc2 <- 100 * signif(1 - fold_error_rate2, digits = 2)
if(verbose) cat(paste0(current_level, " Mfold Accuracy (VIP>1): ", fold_acc2, "%\n"))
error_rates2 <- fold_results2$error.rate$overall[, "max.dist"]
error_df2 <- as.data.frame(error_rates2)
error_df2$Component <- rownames(error_df2)
error_df2 <- reshape2::melt(error_df2,
id.vars = "Component",
variable.name = "Distance",
value.name = "ErrorRate"
)
a <- ggplot2::ggplot(error_df2, aes(
x = Component, y = ErrorRate,
color = Distance, group = 1
)) +
ggplot2::geom_line() +
ggplot2::geom_point(size = 3) +
ggplot2::labs(
title = paste("Mfold Error Rate (VIP>1):", overall_analysis),
x = "Number of Components",
y = "Error Rate"
) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_color_manual(values = "red", labels = "max.dist")
print(a)
}
}
if (conf_mat == TRUE) {
if (verbose) cat(paste0("Confusion Matrix for PLS-DA Comparison: ", current_level, "\n"))
# Confusion Matrix for main model
cm <- caret::confusionMatrix(
data = as.factor(prediction1[, 3]), # predicted
reference = as.factor(prediction1[, 1]) # actual
)
if (verbose) {
print(cm$table)
cat("Accuracy:", signif(cm$overall["Accuracy"], 2), "\n")
if (nlevels(as.factor(prediction1[, 1])) == 2) {
cat("Sensitivity:", signif(cm$byClass["Sensitivity"], 2), "\n")
cat("Specificity:", signif(cm$byClass["Specificity"], 2), "\n")
} else {
cat("\nPer-Class Sensitivity:\n")
print(signif(cm$byClass[, "Sensitivity"], 2))
cat("\nPer-Class Specificity:\n")
print(signif(cm$byClass[, "Specificity"], 2))
macro_sens <- mean(cm$byClass[, "Sensitivity"], na.rm = TRUE)
macro_spec <- mean(cm$byClass[, "Specificity"], na.rm = TRUE)
cat("\nMacro-Averaged Sensitivity:", signif(macro_sens, 2), "\n")
cat("Macro-Averaged Specificity:", signif(macro_spec, 2), "\n")
}
}
if (verbose) cat(paste0("Confusion Matrix for PLS-DA Comparison with VIP > 1: ", current_level, "\n"))
# Confusion Matrix for VIP>1 model
cm_vip <- caret::confusionMatrix(
data = as.factor(prediction2[, 3]), # predicted
reference = as.factor(prediction2[, 1]) # actual
)
if (verbose) {
print(cm_vip$table)
cat("Accuracy:", signif(cm_vip$overall["Accuracy"], 2), "\n")
if (nlevels(as.factor(prediction2[, 1])) == 2) {
cat("Sensitivity:", signif(cm_vip$byClass["Sensitivity"], 2), "\n")
cat("Specificity:", signif(cm_vip$byClass["Specificity"], 2), "\n")
} else {
cat("\nPer-Class Sensitivity:\n")
print(signif(cm_vip$byClass[, "Sensitivity"], 2))
cat("\nPer-Class Specificity:\n")
print(signif(cm_vip$byClass[, "Specificity"], 2))
macro_sens_vip <- mean(cm_vip$byClass[, "Sensitivity"], na.rm = TRUE)
macro_spec_vip <- mean(cm_vip$byClass[, "Specificity"], na.rm = TRUE)
cat("\nMacro-Averaged Sensitivity:", signif(macro_sens_vip, 2), "\n")
cat("Macro-Averaged Specificity:", signif(macro_spec_vip, 2), "\n")
}
}
}
}
}
if(!is.null(pdf_title)){
dev.off()
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.