```{r setup}
knitr::opts_chunk$set(
  collapse   = TRUE,
  comment    = "#>",
  fig.width  = 7,
  fig.height = 4
)
library(multivarious)
library(dplyr)
library(tibble)
library(ggplot2) # needed for the plots
library(knitr)   # needed for kable()
```
```{css, echo=FALSE}
h1 {
  font-size: 1.8em;                 /* adjust as needed */
  margin-top: 1.5em;
  margin-bottom: 0.8em;
  border-bottom: 1px solid #dee2e6; /* optional: add a subtle separator */
  padding-bottom: 0.3em;
}
h2 {
  font-size: 1.4em;                 /* adjust as needed */
  margin-top: 1.2em;
  margin-bottom: 0.6em;
}
/* Add rules for h3, h4 etc. if they are used and need adjustment */
```
# 1. Why classify after projection?

Once a dimensionality-reduction model (PCA, PLS, CCA, …) is fitted, every new sample can be projected into the low-dimensional latent space. Running a classifier there – instead of on thousands of noisy raw variables – yields

* fewer parameters & smaller models,
* immunity to collinearity,
* freedom to use partial data (ROI, missing sensors),
* a clean separation between unsupervised decomposition and supervised prediction.

The `classifier()` S3 family supplied by `multivarious` provides that glue: you hand it any projector (`multiblock_biprojector`, `discriminant_projector`, …) plus class labels, and it returns a ready-to-use predictor object.

# 2. Iris demo – LDA → `discriminant_projector` → k-NN

```r
data(iris)
X   <- as.matrix(iris[, 1:4])
grp <- iris$Species

# Fit classical linear discriminant analysis and wrap it
if (!requireNamespace("MASS", quietly = TRUE)) {
  stop("MASS package required for the LDA example")
}

# 1. Define the pre-processing step
preproc_def <- prep(center())

# 2. Initialise the pre-processor on the data LDA was trained on (X)
Xp <- init_transform(preproc_def, X)

# Assuming discriminant_projector, prep, center, scores are available
lda_fit <- MASS::lda(X, grouping = grp)

disc_proj <- multivarious::discriminant_projector(
  v       = lda_fit$scaling,        # loadings (p × d)
  s       = Xp %*% lda_fit$scaling, # scores   (n × d)
  sdev    = lda_fit$svd,            # singular values
  labels  = grp,
  preproc = preproc_def             # pass the *initialised* pre-processor
)
print(disc_proj)
```
```r
scores_df <- as_tibble(scores(disc_proj)[, 1:2],
                       .name_repair = ~ c("LD1", "LD2")) |>
  mutate(Species = iris$Species)

ggplot(scores_df, aes(LD1, LD2, colour = Species)) +
  geom_point(size = 2, alpha = .7) +
  stat_ellipse(level = .9, linewidth = .3) +
  theme_minimal() +
  ggtitle("Iris – first two LDA components")
```
```r
set.seed(42)
train_id <- sample(seq_len(nrow(X)), size = 0.7 * nrow(X))
test_id  <- setdiff(seq_len(nrow(X)), train_id)

# Assuming the classifier function is available
clf_knn <- multivarious::classifier(
  x        = disc_proj,
  labels   = grp[train_id],
  new_data = X[train_id, ], # training data provides the reference scores
  knn      = 3
)
print(clf_knn)
```
```r
pred_knn <- predict(clf_knn,
                    new_data  = X[test_id, ],
                    metric    = "cosine",
                    prob_type = "knn_proportion")
head(pred_knn$prob, 3)
print(paste("Overall Accuracy:", mean(pred_knn$class == grp[test_id])))

# Assuming rank_score and topk are available
rk  <- rank_score(pred_knn$prob, grp[test_id])
tk2 <- topk(pred_knn$prob, grp[test_id], k = 2)
tibble(
  prank_mean = mean(rk$prank),
  top2_acc   = mean(tk2$topk)
)
```
```r
cm <- table(
  Truth     = grp[test_id],
  Predicted = pred_knn$class
)

# Heat map
cm_df <- as.data.frame(cm)
ggplot(cm_df, aes(Truth, Predicted, fill = Freq)) +
  geom_tile(colour = "grey80") +
  geom_text(aes(label = Freq), colour = "white", size = 4) +
  scale_fill_gradient(low = "#4575b4", high = "#d73027",
                      name = "Count", limits = c(0, 15)) +
  scale_y_discrete(limits = rev(levels(cm_df$Predicted))) +
  theme_minimal(base_size = 12) +
  coord_equal() +
  ggtitle("k-NN (k = 3) confusion matrix – test set") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Pretty table as well
knitr::kable(cm, caption = "Confusion matrix (counts)")
```
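Per-class recall falls straight out of the same table with base R:

```r
# Per-class recall: the diagonal of the row-normalised confusion matrix
diag(prop.table(cm, margin = 1))
```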
```r
# Check if randomForest is installed
if (requireNamespace("randomForest", quietly = TRUE)) {
  # Assuming the rf_classifier.projector method is available
  rf_clf <- rf_classifier( # using the generic here
    x      = disc_proj,
    labels = grp[train_id],
    # pass scores directly if the method requires it, or let it call scores(x)
    scores = scores(disc_proj)[train_id, ]
  )
  pred_rf <- predict(rf_clf, new_data = X[test_id, ])
  print(paste("RF Accuracy:", mean(pred_rf$class == grp[test_id])))
} else {
  cat("randomForest package not installed. Skipping RF example.\n")
}
```
The RF sees only two input variables (the LDA components – with three species, LDA yields at most K − 1 = 2 discriminants). That keeps the trees shallow and speeds up training.
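You can confirm this directly from the projector:

```r
# iris: p = 4 predictors, K = 3 classes, so min(p, K - 1) = 2 components
ncol(scores(disc_proj))
```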
Suppose that in deployment we measure only the Sepal variables (columns 1–2). A partial projection keeps the classifier happy:
```r
sepal_cols <- 1:2

# Classifier whose reference scores come from the Sepal columns only
clf_knn_sepal <- multivarious::classifier(
  x        = disc_proj,
  labels   = grp[train_id],
  new_data = X[train_id, sepal_cols],
  colind   = sepal_cols, # indicate which columns of the full space were used
  knn      = 3
)

# Predict with the dedicated Sepal classifier; no colind is needed here
# because clf_knn_sepal already expects Sepal-only data
pred_sepal <- predict(clf_knn_sepal, new_data = X[test_id, sepal_cols])
print(paste("Accuracy (Sepal only):", mean(pred_sepal$class == grp[test_id])))
```
Accuracy drops a bit – as expected when using fewer features.
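To make the trade-off concrete, put the two accuracies side by side (exact values depend on the random split):

```r
tibble(
  features = c("all four", "Sepal only"),
  accuracy = c(mean(pred_knn$class   == grp[test_id]),
               mean(pred_sepal$class == grp[test_id]))
)
```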
`feature_importance()` can rank variable groups via a simple "leave-block-out" score drop:
```r
blocks <- list(
  Sepal = 1:2,
  Petal = 3:4
)

# Assuming feature_importance is available
fi <- feature_importance(
  clf_knn,
  new_data      = X[test_id, ],
  true_labels   = grp[test_id], # the correct test-set labels
  blocks        = blocks,
  fun           = rank_score,   # performance metric
  fun_direction = "lower_is_better",
  approach      = "marginal"    # marginal drop when a block is removed
)
print(fi)
```
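The same idea can be cross-checked by hand with nothing more than the partial-projection classifier shown earlier – a minimal sketch, using plain accuracy instead of `rank_score` for simplicity (with only two blocks, keeping one block is equivalent to leaving the other out):

```r
# Fit a k-NN classifier on each block by itself and compare test accuracy;
# a block that scores well on its own carries most of the class signal
block_acc <- sapply(blocks, function(cols) {
  clf_b <- multivarious::classifier(
    x        = disc_proj,
    labels   = grp[train_id],
    new_data = X[train_id, cols],
    colind   = cols,
    knn      = 3
  )
  pred_b <- predict(clf_b, new_data = X[test_id, cols])
  mean(pred_b$class == grp[test_id])
})
block_acc
```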