Nothing
#' Main function to carry out Immunaut Analysis
#'
#' This function performs clustering and dimensionality reduction analysis on a dataset using user-defined settings.
#' It handles various preprocessing steps, dimensionality reduction via t-SNE, multiple clustering methods, and
#' generates associated plots based on user-defined or default settings.
#'
#' @param dataset A data frame representing the dataset on which the analysis will be performed. The dataset must
#' contain numeric columns for dimensionality reduction and clustering.
#' @param settings A named list containing settings for the analysis. If NULL, defaults will be used. The settings list may contain:
#'
#' \describe{
#' \item{fileHeader}{A data frame mapping the original column names to remapped column names. Used for t-SNE input preparation.}
#' \item{selectedColumns}{Character vector of columns to be used for the analysis. Defaults to NULL.}
#' \item{cutOffColumnSize}{Numeric. The maximum size of the dataset in terms of columns. Defaults to 50,000.}
#' \item{excludedColumns}{Character vector of columns to exclude from the analysis. Defaults to NULL.}
#' \item{groupingVariables}{Character vector of columns to use for grouping the data during analysis. Defaults to NULL.}
#' \item{colorVariables}{Character vector of columns to use for coloring in the plots. Defaults to NULL.}
#' \item{preProcessDataset}{Character vector of preprocessing methods to apply (e.g., scaling, normalization). Defaults to NULL.}
#' \item{fontSize}{Numeric. Font size for plots. Defaults to 12.}
#' \item{pointSize}{Numeric. Size of points in plots. Defaults to 1.5.}
#' \item{theme}{Character. The ggplot2 theme to use (e.g., "theme_gray"). Defaults to "theme_gray".}
#' \item{colorPalette}{Character. Color palette for plots (e.g., "RdPu"). Defaults to "RdPu".}
#' \item{aspect_ratio}{Numeric. The aspect ratio of plots. Defaults to 1.}
#' \item{clusterType}{Character. The clustering method to use. Options are "Louvain", "Hierarchical", "Mclust", "Density". Defaults to "Louvain".}
#' \item{removeNA}{Logical. Whether to remove rows with NA values. Defaults to FALSE.}
#' \item{datasetAnalysisGrouped}{Logical. Whether to perform grouped dataset analysis. Defaults to FALSE.}
#' \item{plot_size}{Numeric. The size of the plot. Defaults to 12.}
#' \item{knn_clusters}{Numeric. The number of clusters for KNN-based clustering. Defaults to 250.}
#' \item{perplexity}{Numeric. The perplexity parameter for t-SNE. Defaults to NULL (automatically determined).}
#' \item{exaggeration_factor}{Numeric. The exaggeration factor for t-SNE. Defaults to NULL.}
#' \item{max_iter}{Numeric. The maximum number of iterations for t-SNE. Defaults to NULL.}
#' \item{theta}{Numeric. The Barnes-Hut approximation parameter for t-SNE. Defaults to NULL.}
#' \item{eta}{Numeric. The learning rate for t-SNE. Defaults to NULL.}
#' \item{clustLinkage}{Character. Linkage method for hierarchical clustering. Defaults to "ward.D2".}
#' \item{clustGroups}{Numeric. The number of groups for hierarchical clustering. Defaults to 9.}
#' \item{distMethod}{Character. Distance metric for clustering. Defaults to "euclidean".}
#' \item{minPtsAdjustmentFactor}{Numeric. Adjustment factor for the minimum points in DBSCAN clustering. Defaults to 1.}
#' \item{epsQuantile}{Numeric. Quantile to compute the epsilon parameter for DBSCAN clustering. Defaults to 0.9.}
#' \item{assignOutliers}{Logical. Whether to assign outliers in the clustering step. Defaults to TRUE.}
#' \item{excludeOutliers}{Logical. Whether to exclude outliers from clustering. Defaults to TRUE.}
#' \item{legendPosition}{Character. Position of the legend in plots (e.g., "right", "bottom"). Defaults to "right".}
#' \item{datasetAnalysisClustLinkage}{Character. Linkage method for dataset-level analysis. Defaults to "ward.D2".}
#' \item{datasetAnalysisType}{Character. Type of dataset analysis (e.g., "heatmap"). Defaults to "heatmap".}
#' \item{datasetAnalysisRemoveOutliersDownstream}{Logical. Whether to remove outliers during downstream dataset analysis (e.g., machine learning). Defaults to FALSE.}
#' \item{datasetAnalysisSortColumn}{Character. The column used to sort dataset analysis results. Defaults to "cluster".}
#' \item{datasetAnalysisClustOrdering}{Numeric. The order of clusters for analysis. Defaults to 1.}
#' \item{anyNAValues}{Logical. Whether the dataset contains NA values. Defaults to FALSE.}
#' \item{categoricalVariables}{Logical. Whether the dataset contains categorical variables. Defaults to FALSE.}
#' \item{resolution_increments}{Numeric vector. The resolution increments to be used for Louvain clustering. Defaults to \code{c(0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5)}.}
#' \item{min_modularities}{Numeric vector. The minimum modularities to test for clustering. Defaults to \code{c(0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9)}.}
#' \item{target_clusters_range}{Numeric vector. The range of acceptable clusters to identify. Defaults to \code{c(3, 6)}.}
#' \item{pickBestClusterMethod}{Character. The method to use for picking the best clustering result ("Modularity", "Silhouette", or "SIMON"). Defaults to "Modularity".}
#' \item{weights}{List. Weights for evaluating clusters based on \code{AUROC}, \code{modularity}, and \code{silhouette}. Defaults to \code{list(AUROC = 0.5, modularity = 0.3, silhouette = 0.2)}. These weights are applied to help choose the most relevant clusters based on user goals:
#' \describe{
#' \item{\code{AUROC}}{Weight for predictive performance (area under the receiver operating characteristic curve). Prioritize this when predictive accuracy is the main goal. For predictive analysis, a recommended configuration could be \code{list(AUROC = 0.8, modularity = 0.1, silhouette = 0.1)}.}
#' \item{\code{modularity}}{Weight for modularity score, which indicates the strength of clustering. Higher modularity suggests that clusters are well-separated. To prioritize well-separated clusters, use a configuration like \code{list(AUROC = 0.4, modularity = 0.4, silhouette = 0.2)}.}
#' \item{\code{silhouette}}{Weight for silhouette score, a measure of cohesion within clusters. Useful when cluster cohesion and interpretability are desired. For balanced clusters, a suggested configuration is \code{list(AUROC = 0.4, modularity = 0.3, silhouette = 0.3)}.}
#' }}
#' }
#'
#' @importFrom dplyr select filter group_by summarise_all rename
#' @importFrom rlang is_null
#' @importFrom stats hclust dist na.omit
#' @importFrom mclust Mclust
#' @importFrom fpc dbscan
#'
#' @return A list containing the following:
#' \itemize{
#' \item \code{tsne_calc}: The t-SNE results object.
#' \item \code{tsne_clust}: The clustering results.
#' \item \code{dataset}: A list containing the original dataset, the preprocessed dataset, and a dataset with machine learning-ready data.
#' \item \code{clusters}: The final cluster assignments.
#' \item \code{settings}: The list of settings used for the analysis.
#' }
#'
#' @examples
#' \donttest{
#' data <- matrix(runif(2000), ncol=20)
#' settings <- list(clusterType = "Louvain",
#' resolution_increments = c(0.05, 0.1),
#' min_modularities = c(0.3, 0.5))
#' result <- immunaut(data.frame(data), settings)
#' print(result$clusters)
#' }
#'
#' @export
immunaut <- function(dataset, settings = list()){
# Check if dataset is empty
if(is_var_empty(dataset) == TRUE){
print("Dataset is empty")
return(NULL)
}
# Check if settings is empty
if(is_var_empty(settings) == TRUE){
print("Settings is empty")
return(NULL)
}
# Check if dataset is a data.frame
if(is.data.frame(dataset) == FALSE){
print("Dataset is not a data.frame")
return(NULL)
}
# Check if settings is a list
if(is.list(settings) == FALSE){
print("Settings is not a data.frame")
return(NULL)
}
# Check if settings is a data.frame
if(is.data.frame(settings$fileHeader) == FALSE){
print("settings$fileHeader is not a data.frame! Please use 'immunaut::generate_file_header' function.")
return(NULL)
}
if(is_var_empty(settings$selectedColumns) == TRUE){
settings$selectedColumns = NULL
}
if(is_var_empty(settings$cutOffColumnSize) == TRUE){
settings$cutOffColumnSize = 50000
}
if(is_var_empty(settings$excludedColumns) == TRUE){
settings$excludedColumns = NULL
}
if(is_var_empty(settings$groupingVariables) == TRUE){
settings$groupingVariables = NULL
}
if(is_var_empty(settings$colorVariables) == TRUE){
settings$colorVariables = NULL
}
if(is_var_empty(settings$preProcessDataset) == TRUE){
settings$preProcessDataset = NULL
}
if(is_var_empty(settings$fontSize) == TRUE){
settings$fontSize <- 12
}
if(is_var_empty(settings$pointSize) == TRUE){
settings$pointSize <- 1.5
}
if(is_var_empty(settings$theme) == TRUE){
settings$theme <- "theme_gray"
}
if(is_var_empty(settings$colorPalette) == TRUE){
settings$colorPalette <- "RdPu"
}
if(is_var_empty(settings$aspect_ratio) == TRUE){
settings$aspect_ratio <- 1
}
if(is_var_empty(settings$clusterType) == TRUE){
settings$clusterType <- "Louvain"
}
## Louvain Specific START
if(is_var_empty(settings$resolution_increments) == TRUE){
settings$resolution_increments <- c(0.44)
}
if(is_var_empty(settings$min_modularities) == TRUE){
settings$min_modularities <- c(0.8)
}
if(is_var_empty(settings$target_clusters_range) == TRUE){
settings$target_clusters_range <- c(3, 6)
}
if(is_var_empty(settings$pickBestClusterMethod) == TRUE){
settings$pickBestClusterMethod <- "Modularity" ## Modularity, Silhouette, Overall, SIMON
}
if(is_var_empty(settings$selectedPartitionSplit) == TRUE){
settings$selectedPartitionSplit <- 0.7
}
if(is_var_empty(settings$selectedPackages) == TRUE){
settings$selectedPackages <- c("rf", "RRF", "RRFglobal", "gcvEarth", "cforest", "nb")
}
if(is_var_empty(settings$trainingTimeout) == TRUE){
settings$trainingTimeout <- 360
}
if(is_var_empty(settings$weights) == TRUE){
settings$weights <- list(AUROC = 0.5, modularity = 0.3, silhouette = 0.2)
}
## Louvain Specific END
if(is_var_empty(settings$removeNA) == TRUE){
settings$removeNA = FALSE
}
if(is_var_empty(settings$seed) == TRUE){
settings$seed = 1337
}
if(is_var_empty(settings$datasetAnalysisGrouped) == TRUE){
settings$datasetAnalysisGrouped = FALSE
}
if(is_var_empty(settings$plot_size) == TRUE){
settings$plot_size <- 12
}
if(is_var_empty(settings$knn_clusters) == TRUE){
settings$knn_clusters <- 60
}
if(is_var_empty(settings$exaggeration_factor) == TRUE){
settings$exaggeration_factor <- NULL
}
if(is_var_empty(settings$perplexity) == TRUE){
settings$perplexity <- 50
}
if(is_var_empty(settings$max_iter) == TRUE){
settings$max_iter <- 700
}
if(is_var_empty(settings$theta) == TRUE){
settings$theta <- 0.1
}
if(is_var_empty(settings$eta) == TRUE){
settings$eta <- 500
}
if(is_var_empty(settings$clustLinkage) == TRUE){
settings$clustLinkage = "ward.D2"
}
if(is_var_empty(settings$clustGroups) == TRUE){
settings$clustGroups = 3
}
## OUTLIER DETECTION START
if(is_var_empty(settings$distMethod) == TRUE){
settings$distMethod = "euclidean"
}
if(is_var_empty(settings$minPtsAdjustmentFactor) == TRUE){
settings$minPtsAdjustmentFactor = 1
}
if(is_var_empty(settings$epsQuantile) == TRUE){
settings$epsQuantile = 0.9
}
if(is_var_empty(settings$assignOutliers) == TRUE){
settings$assignOutliers = TRUE
}
if(is_var_empty(settings$excludeOutliers) == TRUE){
settings$excludeOutliers = TRUE
}
## OUTLIER DETECTION END
if(is_var_empty(settings$legendPosition) == TRUE){
settings$legendPosition = "right"
}
## dataset analysis settings
if(is_var_empty(settings$datasetAnalysisClustLinkage) == TRUE){
settings$datasetAnalysisClustLinkage = "ward.D2"
}
if(is_var_empty(settings$datasetAnalysisType) == TRUE){
settings$datasetAnalysisType = "heatmap"
}
if(is_var_empty(settings$datasetAnalysisRemoveOutliersDownstream) == TRUE){
settings$datasetAnalysisRemoveOutliersDownstream = FALSE
}
if(is_var_empty(settings$datasetAnalysisSortColumn) == TRUE){
settings$datasetAnalysisSortColumn = "cluster"
}
if(is_var_empty(settings$datasetAnalysisClustOrdering) == TRUE){
settings$datasetAnalysisClustOrdering = 1
}
if(is_var_empty(settings$anyNAValues) == TRUE){
settings$anyNAValues <- FALSE
}
if(is_var_empty(settings$categoricalVariables) == TRUE){
settings$categoricalVariables <- FALSE
}
settings$fileHeader$remapped = as.character(settings$fileHeader$remapped)
settings$fileHeader$original = as.character(settings$fileHeader$original)
if(!is.null(settings$groupingVariables)){
settings$groupingVariables <- settings$fileHeader %>% filter(remapped %in% settings$groupingVariables)
settings$groupingVariables <- settings$groupingVariables$remapped
}
# If no columns are selected, select by cut of size
if(is_null(settings$selectedColumns)) {
settings$selectedColumns <- utils::tail(selectedColumns$remapped, n=settings$cutOffColumnSize)
}
# Remove grouping variables from selectedColumns and excludedColumns
if(!is_null(settings$groupingVariables)) {
if(is_null(settings$selectedColumns)) {
settings$selectedColumns <- setdiff(settings$selectedColumns, settings$groupingVariables)
}
if(is_null(settings$excludedColumns)) {
settings$excludedColumns <- setdiff(settings$excludedColumns, settings$groupingVariables)
}
if(is_null(settings$colorVariables)) {
settings$colorVariables <- setdiff(settings$colorVariables, settings$groupingVariables)
}
}
# Remove any excluded columns from selected columns
if(!is_null(settings$excludedColumns)) {
## Remove excluded from selected columns
settings$selectedColumns <- setdiff(settings$selectedColumns, settings$excludedColumns)
# settings$selectedColumns <- settings$selectedColumns[settings$selectedColumns %!in% settings$excludedColumns]
}
# 0. Remove any undefined columns from initial dataset
dataset_filtered <- dataset[, names(dataset) %in% c(settings$selectedColumns, settings$groupingVariables)]
# 1. Cast all non numeric values to NA
vars_to_cast <- c(settings$colorVariables, settings$groupingVariables)
if (is.null(vars_to_cast)){
vars_to_cast <- character(0)
}else{
message(paste("==> Casting to NA: ", vars_to_cast))
}
num_test <- dataset_filtered %>% select(where(is.numeric))
for (groupVariable in settings$groupingVariables) {
if(groupVariable %in% names(num_test)){
dataset_filtered[[groupVariable]] <- paste("g",dataset_filtered[[groupVariable]],sep="_")
}
}
dataset_filtered <- castAllStringsToNA(dataset_filtered, vars_to_cast)
# 2. preProcessResample
if(!is.null(settings$preProcessDataset) && length(settings$preProcessDataset) > 0){
preProcessMapping <- preProcessResample(dataset_filtered,
settings$preProcessDataset,
settings$groupingVariables,
settings$groupingVariables,
settings)
dataset_filtered <- preProcessMapping$datasetData
}
# 3. remove NA is any left
if(settings$removeNA == TRUE){
message("===> INFO: Removing NA Values")
dataset_filtered <- na.omit(dataset_filtered)
}
# 4. calculate_tsne
set.seed(settings$seed)
tsne_calc <- calculate_tsne(dataset_filtered, settings)
# 5. cluster tsne
message("===> INFO: Clustering using: ", settings$clusterType)
set.seed(settings$seed)
if(settings$clusterType == "Louvain"){
tsne_clust_tmp <- list()
iteration <- 1
for (res_increment in settings$resolution_increments) {
for (min_modularity in settings$min_modularities) {
tmp <- cluster_tsne_knn_louvain(tsne_calc$info.norm, tsne_calc$tsne.norm, settings, res_increment, min_modularity)
if (tmp$num_clusters < min(settings$target_clusters_range) ||
tmp$num_clusters > max(settings$target_clusters_range)) {
message("+++++++++++> INFO: Skipping cluster with ", tmp$num_clusters, " clusters")
next
}
tmp$initial_res_increment <- res_increment
tmp$initial_min_modularity <- min_modularity
tsne_clust_tmp[[iteration]] <- tmp
iteration <- iteration + 1
}
}
if(settings$pickBestClusterMethod == "Modularity"){
tsne_clust <- pick_best_cluster_modularity(tsne_clust_tmp)
}else if(settings$pickBestClusterMethod == "Silhouette"){
tsne_clust <- pick_best_cluster_silhouette(tsne_clust_tmp)
}else if(settings$pickBestClusterMethod == "Overall"){
tsne_clust <- pick_best_cluster_overall(tsne_clust_tmp, tsne_calc)
}else if(settings$pickBestClusterMethod == "SIMON"){
best_cluster <- pick_best_cluster_simon(dataset, tsne_clust_tmp, tsne_calc, settings)
tsne_clust <- best_cluster$tsne_clust
}else{
tsne_clust <- pick_best_cluster_modularity(tsne_clust_tmp)
}
}else if(settings$clusterType == "Hierarchical"){
tsne_clust <- cluster_tsne_hierarchical(tsne_calc$info.norm, tsne_calc$tsne.norm, settings)
}else if(settings$clusterType == "Mclust"){
tsne_clust <- cluster_tsne_mclust(tsne_calc$info.norm, tsne_calc$tsne.norm, settings)
}else if(settings$clusterType == "Density"){
tsne_clust <- cluster_tsne_density(tsne_calc$info.norm, tsne_calc$tsne.norm, settings)
}else{
tsne_clust_tmp <- list()
iteration <- 1
for (res_increment in settings$resolution_increments) {
for (min_modularity in settings$min_modularities) {
tmp <- cluster_tsne_knn_louvain(tsne_calc$info.norm, tsne_calc$tsne.norm, settings, res_increment, min_modularity)
if (tmp$num_clusters < min(settings$target_clusters_range) ||
tmp$num_clusters > max(settings$target_clusters_range)) {
message("+++++++++++> INFO: Skipping cluster with ", tmp$num_clusters, " clusters")
next
}
tmp$initial_res_increment <- res_increment
tmp$initial_min_modularity <- min_modularity
tsne_clust_tmp[[iteration]] <- tmp
iteration <- iteration + 1
}
}
if(settings$pickBestClusterMethod == "Modularity"){
tsne_clust <- pick_best_cluster_modularity(tsne_clust_tmp)
}else if(settings$pickBestClusterMethod == "Silhouette"){
tsne_clust <- pick_best_cluster_silhouette(tsne_clust_tmp)
}else if(settings$pickBestClusterMethod == "Overall"){
tsne_clust <- pick_best_cluster_overall(tsne_clust_tmp, tsne_calc)
}else if(settings$pickBestClusterMethod == "SIMON"){
best_cluster <- pick_best_cluster_simon(dataset, tsne_clust_tmp, tsne_calc, settings)
tsne_clust <- best_cluster$tsne_clust
}else{
tsne_clust <- pick_best_cluster_modularity(tsne_clust_tmp)
}
}
## Get clusters from tsne_clust$info.norm[["pandora_cluster"]] and add it to original dataset in "dataset" variable
dataset_with_clusters <- dataset
if(nrow(dataset_with_clusters) == nrow(tsne_clust$info.norm)){
dataset_with_clusters$pandora_cluster <- tsne_clust$info.norm$pandora_cluster
}
dataset_filtered_with_clusters <- dataset_filtered
if(nrow(dataset_filtered_with_clusters) == nrow(tsne_clust$info.norm)){
dataset_filtered_with_clusters$pandora_cluster <- tsne_clust$info.norm$pandora_cluster
}
dataset_ml <- dataset
if(nrow(dataset_ml) == nrow(tsne_clust$info.norm)){
dataset_ml$pandora_cluster <- tsne_clust$info.norm$pandora_cluster
## Should we remove any outliers, if any detected?
dataset_ml <- remove_outliers(dataset_ml, settings)
dataset_ml <- dplyr::rename(dataset_ml, immunaut = pandora_cluster)
dataset_ml <- dataset_ml[, c("immunaut", setdiff(names(dataset_ml), "immunaut"))]
}
results <- list(
tsne_calc = tsne_calc,
tsne_clust = tsne_clust,
dataset = list(
original = dataset,
preprocessed = dataset_filtered,
dataset_ml = dataset_ml
),
clusters = tsne_clust$info.norm$pandora_cluster,
settings = settings
)
return(results)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.