knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) rm(list = ls())
Load the package
library(systemsseRology) set.seed(0) # random seed for reproducibility
Read the data from an Excel file. Z-score the data and assign the label in the variable y.
library(readxl) data <- as.data.frame(read_excel("../data/RSTR_data_NatureMedicine_subset.xlsx")) X <- as.matrix(as.data.frame(lapply(data[, 5:ncol(data)], as.numeric))) # z-score the data to make features comparable X <- scale(X, center = TRUE, scale = TRUE) y <- factor(data$y, levels = unique(data$y))
The variable df_features contains information about the features, mainly used for visualization routines. Here, we assign a label (removing underscores) and extract the antigen.
df_features <- data.frame(name = colnames(X)) df_features$label <- gsub("_", " ", df_features$name) df_features$antigen <- gsub("_.*", "", df_features$name) df_features$antigen <- factor(df_features$antigen) df_features$feature_class <- rep(NA, length = nrow(df_features)) df_features$feature_class[which(grepl("IgG|IgA|IgM", df_features$name))] <- "titer" df_features$feature_class[which(grepl("FcR", df_features$name))] <- "FcR" df_features$feature_class <- factor(df_features$feature_class)
Example PCA and PLS-DA (using the package ropls) and visualization.
library(RColorBrewer) # Define colors my_colors = list(group = c(RSTR = "red", LTB = "blue")) my_colors$antigen <- colorRampPalette(brewer.pal(name = "Dark2", n = 8))(nlevels(df_features$antigen)) names(my_colors$antigen) <- unique(df_features$antigen) my_colors$feature_class <- colorRampPalette(brewer.pal(name = "Set1", n = 3))(nlevels(df_features$feature_class)) names(my_colors$feature_class) <- unique(df_features$feature_class) # general options for plotting opts_plot <- list(df_features = df_features, loading_alpha = 1, # transparency for the loadings score_alpha = 1, # transparency for the scores LV_ind = c(1,2), # which LVs to plot color_features = "antigen", # according to which property (defined in df_features) the features should be color-coded colors = my_colors, y_name = "group") # Perform a simple PCA using the interface function pca_ropls model_pca <- pca_ropls(X) plt_scores_pca <- visualize_ropls_scores(model_pca, y, options = opts_plot) print(plt_scores_pca) # Perform a PLS-DA and plot the scores and loadings model <- train_ropls(X, y) plt_scores <- visualize_ropls_scores(model, y, options = opts_plot) print(plt_scores) plt_loadings <- visualize_ropls_loadings(model, options = opts_plot) print(plt_loadings) # set additional options required to color code enrichment in the bar plot of the loadings opts_plot$X <- X opts_plot$y <- y opts_plot$LV_ind <- 1 opts_plot$mark_enrichment <- TRUE plt_loadings_bar <- visualize_ropls_loadings_bar(model, options = opts_plot) print(plt_loadings_bar) # print the results of 5-fold cv for the ropls method without any # feature selection. if validate() is passed a method with no feature # selector it will default to selecting all features and set rf_trials = 0 method = list(train = train_ropls, predict = predict_ropls, score = score_accuracy) opts = list(n_folds = 5, pt_trials = 0) return_vals <- validate(X, y, method, opts) print(paste("Performance in 5-fold cv:", round(return_vals$cv_score, digits = 2), "accuracy"))
Feature selection and then PLS-DA.
opts_sel <- list(n_trials = 10, threshold = 0.8, return_count = FALSE) sel_features <- select_lasso(X, y) print(sel_features) # repeat this selection 10 times and pick the features that are chosen in # more than 80% of the repetitions sel_features <- select_repeat(X, y, selector = select_lasso, options = opts_sel) X_sel <- X[, sel_features] # Perform a PLS-DA using the selected features and plot the scores and loadings model <- train_ropls(X_sel, y) # The model only has one latent variable. For visualization purposes we fix it to be two dimensional. ropls::getSummaryDF(model) opts_model <- list(n_LV = 2) model <- train_ropls(X_sel, y, options = opts_model) opts_plot$LV_ind <- c(1,2) plt_scores <- visualize_ropls_scores(model, y, options = opts_plot) print(plt_scores) plt_loadings <- visualize_ropls_loadings(model, options = opts_plot) print(plt_loadings) # set additional options required to color code enrichment in the bar plot of the loadings opts_plot$X <- X_sel opts_plot$y <- y opts_plot$LV_ind <- 1 opts_plot$mark_enrichment <- TRUE plt_loadings_bar <- visualize_ropls_loadings_bar(model, options = opts_plot) print(plt_loadings_bar)
Analyze model results. First we check the accuracy obtained on the whole data set.
y_pred <- predict_ropls(model, X_sel) acc <- score_accuracy(y, y_pred) print(paste("Performance on full data set:", round(acc, digits = 2), "accuracy")) select <- function(X, y) { return(select_repeat(X, y, selector = select_lasso, options = opts_sel)) } method = list(select = select, train = train_ropls, predict = predict_ropls, score = score_accuracy) opts = list(n_folds = 5, rf_trials = 0, pt_trials = 0) return_vals <- validate(X, y, method, opts) print(paste("Performance in 5-fold cv:", round(return_vals$cv_score, digits = 2), "accuracy"))
Permutation testing to assess model significance using two different approaches. 1) Size-matched random features 2) Permuted labels
opts$rf_trials <- 3 opts$pt_trials <- 3 opts$compare_pred <- "y_perm" #whether for the 'permuted labels' the predicted label is compared to the original ("y"), or the permuted vector of labels ("y_perm") return_vals <- validate(X, y, method, opts) repeated_vals <- validate_repeat(X, y, method, opts, n_trials = 3)
An example for a regression, i.e., y being continuous.
rm(list = ls()) library(readxl) data <- as.data.frame(read_excel("../data/RSTR_data_NatureMedicine_subset.xlsx")) X <- as.matrix(as.data.frame(lapply(data[, 5:(ncol(data) - 1)], as.numeric))) X <- scale(X, center = TRUE, scale = TRUE) y <- data[,ncol(data)]
The variable df_features contains information about the features, mainly used for visualization routines. Here, we assign a label (removing underscores) and extract the antigen.
df_features <- data.frame(name = colnames(X)) df_features$label <- gsub("_", " ", df_features$name) df_features$antigen <- gsub("_.*", "", df_features$name) df_features$antigen <- factor(df_features$antigen) df_features$feature_class <- rep(NA, length = nrow(df_features)) df_features$feature_class[which(grepl("IgG|IgA|IgM", df_features$name))] <- "titer" df_features$feature_class[which(grepl("FcR", df_features$name))] <- "FcR" df_features$feature_class <- factor(df_features$feature_class)
Example PCA and PLS-R (using the package ropls) and visualization.
library(RColorBrewer) # Define colors my_colors = list(y = c(low = "red", high = "blue")) my_colors$antigen <- colorRampPalette(brewer.pal(name = "Dark2", n = 8))(nlevels(df_features$antigen)) names(my_colors$antigen) <- unique(df_features$antigen) my_colors$feature_class <- colorRampPalette(brewer.pal(name = "Set1", n = 3))(nlevels(df_features$feature_class)) names(my_colors$feature_class) <- unique(df_features$feature_class) # general options for plotting opts_plot <- list(df_features = df_features, loading_alpha = 1, # transparency for the loadings score_alpha = 1, # transparency for the scores LV_ind = c(1,2), # which LVs to plot color_features = "antigen", colors = my_colors, y_name = "y") # Perform a simple PCA using the interface function pca_ropls model_pca <- pca_ropls(X) plt_scores_pca <- visualize_ropls_scores(model_pca, y, options = opts_plot) print(plt_scores_pca) # Perform a PLS-DA and plot the scores and loadings model <- train_ropls(X, y) plt_scores <- visualize_ropls_scores(model, y, options = opts_plot) print(plt_scores) plt_loadings <- visualize_ropls_loadings(model, options = opts_plot) print(plt_loadings) # set additional options required to color code enrichment in the bar plot of the loadings opts_plot$LV_ind <- 1 plt_loadings_bar <- visualize_ropls_loadings_bar(model, options = opts_plot) print(plt_loadings_bar)
y_pred <- predict_ropls(model, X) r2 <- score_r2(y, y_pred) print(paste("Performance on full data set:", r2, "R^2")) method = list(select = select_lasso, train = train_ropls, predict = predict_ropls, score = score_r2) opts = list(n_folds = 5, rf_trials = 0, pt_trials = 0) return_vals <- validate(X, y, method, opts) print(paste("Performance in 5-fold cv:", return_vals$cv_score, "R^2"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.