library(OmicSelector)
options(width = 138)
knitr::opts_chunk$set(comment = '', fig.width = 11.5, fig.height = 7)
library(kableExtra)

dane = OmicSelector_load_datamix(use_smote_not_rose = T) # load mixed_*.csv files
train = dane[[1]]
test = dane[[2]]
valid = dane[[3]]
train_smoted = dane[[4]]
trainx = dane[[5]]
trainx_smoted = dane[[6]] # get the objects from the list to make the code more readable
mixed = data.table::fread("mixed.csv")
de = OmicSelector_differential_expression_ttest(trainx, train$Class)
metody = OmicSelector_get_benchmark_methods("benchmark.csv") # gets the modelling methods used in the benchmark
par(mfrow = c(2, 2))
for (i in 1:length(metody)) {
  temp = OmicSelector_get_benchmark("benchmark.csv") # loads the benchmark
  temp2 = dplyr::select(temp, starts_with(paste0(metody[i], "_")))
  boxplot(temp[, paste0(metody[i], "_train_Accuracy")],
          temp[, paste0(metody[i], "_test_Accuracy")],
          temp[, paste0(metody[i], "_valid_Accuracy")],
          main = paste0("Method: ", metody[i]),
          names = c("Training", "Testing", "Validation"),
          ylab = "Accuracy", ylim = c(0.5, 1))
  tempids = c(match(paste0(metody[i], "_train_Accuracy"), colnames(temp)),
              match(paste0(metody[i], "_test_Accuracy"), colnames(temp)),
              match(paste0(metody[i], "_valid_Accuracy"), colnames(temp)))
}
par(mfrow = c(1, 1))
(metaindex = harmonic mean of all three accuracy metrics: training, testing and validation)
acc = OmicSelector_best_signature_proposals(benchmark_csv = "benchmark.csv", without_train = F) # generates the benchmark sorted by metaindex
best_signatures = acc[1:3, ] # get the top 3 methods
OmicSelector_table(best_signatures[, c("metaindex", "method", "miRy")])
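For intuition, the metaindex can be reproduced by hand: it is the harmonic mean of the three accuracies, which penalizes a single weak dataset more strongly than the arithmetic mean would. A minimal sketch with hypothetical accuracy values (the psych::harmonic.mean() call mirrors the one used in the model benchmark further below):

# Illustration only; the accuracy values are hypothetical.
accs = c(train = 0.95, test = 0.90, valid = 0.85)
psych::harmonic.mean(accs) # ~0.898, slightly below mean(accs) = 0.90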
Performance of those signatures:
for (i in 1:3) {
  # cat(paste0("\n\n## ", acc$method[i], "\n\n"))
  par(mfrow = c(1, 2))
  # acc = OmicSelector_best_signature_proposals_meta11("benchmark.csv")
  metody = OmicSelector_get_benchmark_methods("benchmark.csv")
  ktory_set = match(acc$method[i], OmicSelector_get_benchmark("benchmark.csv")$method) # row of this signature in the benchmark
  do_ktorej_kolumny = which(colnames(acc) == "metaindex") # needed by the barplot below
  barplot(as.numeric(acc[i, 1:do_ktorej_kolumny]))
  for (ii in 1:length(metody)) {
    temp = OmicSelector_get_benchmark("benchmark.csv") %>%
      dplyr::select(starts_with(paste0(metody[ii], "_t")), starts_with(paste0(metody[ii], "_v")))
    ROCtext = paste0("Training AUC ROC: ", round(temp[ktory_set, 1], 2),
                     " (95%CI: ", round(temp[ktory_set, 2], 2), "-", round(temp[ktory_set, 3], 2), ")")
    temp = temp[, -c(1:3)] # drop the AUC columns; keep accuracy, sensitivity and specificity
    temp2 = as.numeric(temp[ktory_set, ])
    temp3 = matrix(temp2, nrow = 3, byrow = T)
    colnames(temp3) = c("Accuracy", "Sensitivity", "Specificity")
    rownames(temp3) = c("Training", "Testing", "Validation")
    temp3 = t(temp3)
    plot1 = barplot(temp3, beside = T, ylim = c(0, 1),
                    xlab = paste0(ROCtext, "\nBlack - accuracy, blue - sensitivity, green - specificity"),
                    width = 0.85, col = c("black", "blue", "green"), legend = F,
                    args.legend = list(x = "topright", bty = "n", inset = c(0, -0.25)),
                    cex.lab = 0.7, main = paste0(acc$method[i], " - ", metody[ii]), font.lab = 2)
    ## Add percentage labels at the top of the bars
    text(x = plot1, y = as.numeric(temp3),
         label = paste0(round(as.numeric(temp[ktory_set, ]) * 100, 1), "%"),
         pos = 3, cex = 0.6, col = "red")
  }
  par(mfrow = c(1, 1))
}
overlap = OmicSelector_signature_overlap(acc$method[1:3], "benchmark.csv")
attr(overlap, "intersections")
(metaindex = mean of the testing and validation accuracy, training excluded)
acc = OmicSelector_best_signature_proposals(benchmark_csv = "benchmark.csv", without_train = T) # generates the benchmark sorted by metaindex
best_signatures = acc[1:3, ] # get the top 3 methods
OmicSelector_table(best_signatures[, c("metaindex", "method", "miRy")])
Performance of those signatures:
for (i in 1:3) {
  # cat(paste0("\n\n## ", acc$method[i], "\n\n"))
  par(mfrow = c(1, 2))
  # acc = OmicSelector_best_signature_proposals_meta11("benchmark.csv")
  metody = OmicSelector_get_benchmark_methods("benchmark.csv")
  ktory_set = match(acc$method[i], OmicSelector_get_benchmark("benchmark.csv")$method) # row of this signature in the benchmark
  do_ktorej_kolumny = which(colnames(acc) == "metaindex") # needed by the barplot below
  barplot(as.numeric(acc[i, 1:do_ktorej_kolumny]))
  for (ii in 1:length(metody)) {
    temp = OmicSelector_get_benchmark("benchmark.csv") %>%
      dplyr::select(starts_with(paste0(metody[ii], "_t")), starts_with(paste0(metody[ii], "_v")))
    ROCtext = paste0("Training AUC ROC: ", round(temp[ktory_set, 1], 2),
                     " (95%CI: ", round(temp[ktory_set, 2], 2), "-", round(temp[ktory_set, 3], 2), ")")
    temp = temp[, -c(1:3)] # drop the AUC columns; keep accuracy, sensitivity and specificity
    temp2 = as.numeric(temp[ktory_set, ])
    temp3 = matrix(temp2, nrow = 3, byrow = T)
    colnames(temp3) = c("Accuracy", "Sensitivity", "Specificity")
    rownames(temp3) = c("Training", "Testing", "Validation")
    temp3 = t(temp3)
    plot1 = barplot(temp3, beside = T, ylim = c(0, 1),
                    xlab = paste0(ROCtext, "\nBlack - accuracy, blue - sensitivity, green - specificity"),
                    width = 0.85, col = c("black", "blue", "green"), legend = F,
                    args.legend = list(x = "topright", bty = "n", inset = c(0, -0.25)),
                    cex.lab = 0.7, main = paste0(acc$method[i], " - ", metody[ii]), font.lab = 2)
    ## Add percentage labels at the top of the bars
    text(x = plot1, y = as.numeric(temp3),
         label = paste0(round(as.numeric(temp[ktory_set, ]) * 100, 1), "%"),
         pos = 3, cex = 0.6, col = "red")
  }
  par(mfrow = c(1, 1))
}
overlap = OmicSelector_signature_overlap(acc$method[1:3], "benchmark.csv")
attr(overlap, "intersections")
(metaindex = mean of sensitivity and specificity in the validation dataset)
acc = OmicSelector_best_signature_proposals_meta11(benchmark_csv = "benchmark.csv") # generates the benchmark sorted by metaindex
best_signatures = acc[1:3, ] # get the top 3 methods
OmicSelector_table(best_signatures[, c("metaindex", "method", "miRy")])
Performance of those signatures:
for (i in 1:3) {
  # cat(paste0("\n\n## ", acc$method[i], "\n\n"))
  par(mfrow = c(1, 2))
  # acc = OmicSelector_best_signature_proposals_meta11("benchmark.csv")
  metody = OmicSelector_get_benchmark_methods("benchmark.csv")
  ktory_set = match(acc$method[i], OmicSelector_get_benchmark("benchmark.csv")$method) # row of this signature in the benchmark
  do_ktorej_kolumny = which(colnames(acc) == "metaindex") # needed by the barplot below
  barplot(as.numeric(acc[i, 1:do_ktorej_kolumny]))
  for (ii in 1:length(metody)) {
    temp = OmicSelector_get_benchmark("benchmark.csv") %>%
      dplyr::select(starts_with(paste0(metody[ii], "_t")), starts_with(paste0(metody[ii], "_v")))
    ROCtext = paste0("Training AUC ROC: ", round(temp[ktory_set, 1], 2),
                     " (95%CI: ", round(temp[ktory_set, 2], 2), "-", round(temp[ktory_set, 3], 2), ")")
    temp = temp[, -c(1:3)] # drop the AUC columns; keep accuracy, sensitivity and specificity
    temp2 = as.numeric(temp[ktory_set, ])
    temp3 = matrix(temp2, nrow = 3, byrow = T)
    colnames(temp3) = c("Accuracy", "Sensitivity", "Specificity")
    rownames(temp3) = c("Training", "Testing", "Validation")
    temp3 = t(temp3)
    plot1 = barplot(temp3, beside = T, ylim = c(0, 1),
                    xlab = paste0(ROCtext, "\nBlack - accuracy, blue - sensitivity, green - specificity"),
                    width = 0.85, col = c("black", "blue", "green"), legend = F,
                    args.legend = list(x = "topright", bty = "n", inset = c(0, -0.25)),
                    cex.lab = 0.7, main = paste0(acc$method[i], " - ", metody[ii]), font.lab = 2)
    ## Add percentage labels at the top of the bars
    text(x = plot1, y = as.numeric(temp3),
         label = paste0(round(as.numeric(temp[ktory_set, ]) * 100, 1), "%"),
         pos = 3, cex = 0.6, col = "red")
  }
  par(mfrow = c(1, 1))
}
overlap = OmicSelector_signature_overlap(acc$method[1:3], "benchmark.csv")
attr(overlap, "intersections")
By default, this is performed for the top 6 signatures that achieved the best accuracy in training, testing and validation.
suppressMessages(library(PairedData))
suppressMessages(library(profileR))
require(gridExtra)
acc = OmicSelector_best_signature_proposals(benchmark_csv = "benchmark.csv", without_train = F) # change here the default method of signature selection
for (i in 1:length(metody)) {
  pd = paired(as.numeric(acc[1:6, paste0(metody[i], "_train_Accuracy")]),
              as.numeric(acc[1:6, paste0(metody[i], "_test_Accuracy")]))
  colnames(pd) = c("Train Accuracy", "Test Accuracy")
  plot2 = OmicSelector_profileplot(pd, Method.id = acc$method[1:6], standardize = F)
  pd = paired(as.numeric(acc[1:6, paste0(metody[i], "_train_Accuracy")]),
              as.numeric(acc[1:6, paste0(metody[i], "_valid_Accuracy")]))
  colnames(pd) = c("Train Accuracy", "Valid Accuracy")
  plot3 = OmicSelector_profileplot(pd, Method.id = acc$method[1:6], standardize = F)
  pd = paired(as.numeric(acc[1:6, paste0(metody[i], "_test_Accuracy")]),
              as.numeric(acc[1:6, paste0(metody[i], "_valid_Accuracy")]))
  colnames(pd) = c("Test Accuracy", "Valid Accuracy")
  plot4 = OmicSelector_profileplot(pd, Method.id = acc$method[1:6], standardize = F)
  grid.arrange(arrangeGrob(plot2, plot3, ncol = 2, nrow = 1, top = metody[i]))
  grid.arrange(arrangeGrob(plot4, ncol = 1, nrow = 1, top = metody[i]))
}
Accuracy on the testing set plotted against accuracy on the validation set, for the top 6 methods:
library(dplyr)
library(reshape2) # for melt()
library(limma)    # for strsplit2()
library(ggplot2)
acc2 = acc[1:6, ] # get the top 6 methods
accmelt = melt(acc2, id.vars = "method") %>%
  filter(variable != "metaindex") %>%
  filter(variable != "miRy")
accmelt = cbind(accmelt, strsplit2(accmelt$variable, "_")) # split e.g. "glm_test_Accuracy" into its parts
acctest = accmelt$value[accmelt$`2` == "test"]
accvalid = accmelt$value[accmelt$`2` == "valid"]
accmeth = accmelt$method[accmelt$`2` == "test"]
plot5 = ggplot(mapping = aes(x = as.numeric(acctest), y = as.numeric(accvalid), shape = accmeth)) +
  geom_point() +
  scale_x_continuous(name = "Accuracy on test set", limits = c(0.5, 1)) +
  scale_y_continuous(name = "Accuracy on validation set", limits = c(0.5, 1)) +
  theme_bw()
grid.arrange(arrangeGrob(plot5, ncol = 1, nrow = 1))
By default, we choose the set that achieved the best mean accuracy across training, testing and validation.
acc1 = OmicSelector_best_signature_proposals(benchmark_csv = "benchmark.csv", without_train = F) # here you can customize how the best signature is selected
best_signatures = acc1[1:3, ] # get the top 3 methods
Best set:
OmicSelector_table(best_signatures[1, c("metaindex", "method", "miRy")])
The differential-expression results for the selected miRNAs should serve as a sanity check.
selected_miRNAs = OmicSelector_get_features_from_benchmark(benchmark_csv = "benchmark.csv", best_signatures$method[1])
best_de = OmicSelector_best_signature_de(selected_miRNAs)
OmicSelector_table(best_de)
OmicSelector_vulcano_plot(selected_miRNAs = de$miR, DE = de, only_label = selected_miRNAs)
OmicSelector_heatmap(x = dplyr::select(mixed, dplyr::all_of(selected_miRNAs)),
                     rlab = data.frame(Class = mixed$Class, Mix = mixed$mix),
                     zscore = F, margins = c(10, 10), expression_name = "value")
OmicSelector_heatmap(x = dplyr::select(mixed, dplyr::all_of(selected_miRNAs)),
                     rlab = data.frame(Class = mixed$Class, Mix = mixed$mix),
                     zscore = T, margins = c(10, 10), expression_name = "value")
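The second heatmap standardizes expression values before plotting. Assuming zscore = T applies a per-feature z-score (an assumption about OmicSelector's internals; check the function documentation to confirm), the transformation is equivalent to base R's scale():

# Assumed equivalent of zscore = T: per-feature standardization (illustration only)
x = dplyr::select(mixed, dplyr::all_of(selected_miRNAs))
x_z = scale(as.matrix(x)) # each column rescaled to mean 0, sd 1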
The final model is chosen based on the benchmark results. You could achieve a better model by tuning it further. Metaindex - harmonic mean of accuracy on the training, testing and validation datasets. Metaindex2 - mean accuracy on the testing and validation datasets only.
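For clarity, a minimal sketch of both indices for a single hypothetical model; the harmonic mean matches the psych::harmonic.mean() call in the loop below, and the accuracy values are made up:

# Hypothetical accuracies (illustration only):
train_acc = 0.96; test_acc = 0.88; valid_acc = 0.84
metaindex = psych::harmonic.mean(c(train_acc, test_acc, valid_acc)) # ~0.891
metaindex2 = mean(c(test_acc, valid_acc)) # 0.860, ignores the training set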
methods = OmicSelector_get_benchmark_methods(benchmark_csv = "benchmark.csv")
benchmark_results = OmicSelector_get_benchmark(benchmark_csv = "benchmark.csv")
model_results = data.frame()
for (i in 1:length(methods)) {
  for (ii in 1:nrow(benchmark_results)) {
    current_method = methods[i]
    temp_trainAcc = as.numeric(benchmark_results[ii, paste0(current_method, "_train_Accuracy")])
    temp_testAcc = as.numeric(benchmark_results[ii, paste0(current_method, "_test_Accuracy")])
    temp_validAcc = as.numeric(benchmark_results[ii, paste0(current_method, "_valid_Accuracy")])
    # Build each row as a data.frame so the numeric columns stay numeric
    # (rbind-ing a plain vector would coerce everything to character and
    # break the ordering by Metaindex below).
    model_results = rbind(model_results, data.frame(
      Name = paste0(current_method, "+", benchmark_results[ii, "method"]),
      ID = benchmark_results[ii, paste0(current_method, "_modelname")],
      `Modelling Method` = current_method,
      `Selection Method` = benchmark_results[ii, "method"],
      `Train ROC AUC` = as.numeric(benchmark_results[ii, paste0(current_method, "_train_ROCAUC")]),
      `Train Acc` = temp_trainAcc,
      `Test Acc` = temp_testAcc,
      `Valid Acc` = temp_validAcc,
      Metaindex = psych::harmonic.mean(c(temp_trainAcc, temp_testAcc, temp_validAcc)),
      check.names = F))
  }
}
model_results = model_results[rev(order(model_results$Metaindex)), ] # sort by metaindex, descending
model_results = model_results[complete.cases(model_results), ]
data.table::fwrite(model_results, "model_results.csv")
# Top 20 models
OmicSelector_table(model_results[1:20, ])
Metrics on training set:
model_file = paste0("models/", model_results[which(model_results$Metaindex == max(model_results$Metaindex))[1], "ID"], ".RDS")
bestmodel = readRDS(model_file)
predtrain_y = predict(bestmodel, train, type = "prob")[, "Case"] # predicted probability of "Case"
predtrain = predict(bestmodel, train)
caret::confusionMatrix(predtrain, train$Class, positive = "Case")
trainroc = pROC::roc(train$Class ~ predtrain_y)
cat(paste0("Model file: ", model_file))
print(trainroc)
pROC::ci(trainroc)
plot(trainroc)
Metrics on testing set:
pred_y = predict(bestmodel, test, type = "prob")[, "Case"]
pred = predict(bestmodel, test)
caret::confusionMatrix(pred, test$Class, positive = "Case")
Metrics on validation set:
pred_y = predict(bestmodel, valid, type = "prob")[, "Case"]
pred = predict(bestmodel, valid)
caret::confusionMatrix(pred, valid$Class, positive = "Case")
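The training-set metrics above include a ROC curve; the same can be computed for the validation set. A minimal sketch mirroring the pROC calls used earlier (not part of the original pipeline); pred_y here is the validation-set probability vector from the preceding chunk:

validroc = pROC::roc(valid$Class ~ pred_y) # ROC on the validation set
print(validroc)
pROC::ci(validroc) # 95% CI for the AUC
plot(validroc)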
This is the end of the analysis. Timestamp:
OmicSelector_log("[OmicSelector: TASK COMPLETED]","task.log")