proximity_matrix_RF: Plot binary rule-based heatmaps
In multiclassPairs: Build MultiClass Pair-Based Classifiers using TSPs or RF

Description Usage Arguments Value Author(s) Examples

proximity_matrix_RF Plot clustering heatmaps showing which out-of-bag samples are predicted in the same class and in the same trees during the training process for the rule-based random forest classifier

proximity_matrix_RF(object,
            classifier,
            plot=TRUE,
            return_matrix=TRUE,
            title         = "",
            top_anno      = c("ref","platform")[1],
            classes       = NULL,
            sam_order     = NULL,
            ref_col       = NULL,
            platform_col  = NULL,
            platforms_ord = NULL,
            show_platform = TRUE,
            cluster_cols  = FALSE,
            legend        = TRUE,
            anno_height   = 0.03,
            margin        = c(0, 5, 0, 5))

`object`	data_object generated by ReadData function which was used in the training process.
`classifier`	classifier as a rule_based_RandomForest object, generated by train_RF function
`plot`	logical. To plot the proximity matrix or not. Default is TRUE.
`return_matrix`	logical. To return the proximity matrix or not. Default is TRUE.
`title`	Character input as a title for the whole heatmap. Default is "".
`top_anno`	Determine the top annotation level. Samples will be grouped based on the top_anno. Input can be one of two options: "ref", "platform". Default is "ref".
`classes`	Optional vector with the class names. Classes will determine which classes will be plotted and in which order. It is not recommended to use both "classes" and "platforms_ord" arguments together.
`sam_order`	Optional vector with the samples order in the heatmap.
`ref_col`	optional named vector determines the colors of classes for the reference labels. Default is NULL. Vector names should match with the ref labels.
`platform_col`	optional named vector determines the colors of platforms/study labels. Default is NULL. Vector names should match with the platforms/study labels.
`platforms_ord`	Optional vector with the platform/study names. This will determine which platform/study will be plotted and in which order. This will be used when top_anno="platform". It is not recommended to use both "classes" and "platforms_ord" arguments together.
`show_platform`	logical. Determines if the platform/study labels will be plotted or not. If the top_anno argument is "platform" then show_platform will be ignored.
`cluster_cols`	logical. samples will be grouped based on the class then will be Clustered in each class (i.e. not all samples in the cohort). If top_anno is "platform" then the rules from all classes are used to cluster the samples in each platform.
`legend`	logical. Determines if a legend will be plotted under the heatmap.
`anno_height`	Determines the height of the annotations. It is recommended not to go out of this range 0.01<height<0.1. Default is 0.03.
`margin`	Determines the margins of the heatmap. Default is c(0, 5, 0, 5).

returns the proximity matrix and/or a heatmap plot for the proximity matrix.

Nour-al-dain Marzouka <nour-al-dain.marzouka at med.lu.se>

# generate random data
Data <- matrix(runif(8000), nrow=100, ncol=80,
               dimnames = list(paste0("G",1:100), paste0("S",1:80)))

# generate random labels
L <- sample(x = c("A","B","C","D"), size = 80, replace = TRUE)

# generate random platform labels
P <- sample(c("P1","P2","P3"), size = 80, replace = TRUE)

# create data object
object <- ReadData(Data = Data,
                   Labels = L,
                   Platform = P,
                   verbose = FALSE)

# sort genes
genes_RF <- sort_genes_RF(data_object = object,
                          seed=123456, verbose = FALSE)

# to get an idea of how many genes we will use
# and how many rules will be generated
# summary_genes_RF(sorted_genes_RF = genes_RF,
#                  genes_altogether = c(10,20,50,100,150,200),
#                  genes_one_vs_rest = c(10,20,50,100,150,200))

# creat and sort rules
# rules_RF <- sort_rules_RF(data_object = object,
#                           sorted_genes_RF = genes_RF,
#                           genes_altogether = 100,
#                           genes_one_vs_rest = 100,
#                           seed=123456,
#                           verbose = FALSE)

# parameters <- data.frame(
#   gene_repetition=c(3,2,1),
#   rules_one_vs_rest=0,
#   rules_altogether=c(2,3,10),
#   run_boruta=c(FALSE,"produce_error",FALSE),
#   plot_boruta = FALSE,
#   num.trees=c(100,200,300),
#   stringsAsFactors = FALSE)
# parameters

# Or you can use expand.grid to generate dataframe with all parameter combinations
# parameters <- expand.grid(
#   gene_repetition=c(3,2,1),
#   rules_one_vs_rest=0,
#   rules_altogether=c(2,3,10),
#   num.trees=c(100,500,1000),
#   stringsAsFactors = FALSE)
# parameters


# test <- optimize_RF(data_object = object,
#                     sorted_rules_RF = rules_RF,
#                     test_object = NULL,
#                     overall = c("Accuracy"),
#                     byclass = NULL, verbose = FALSE,
#                     parameters = parameters)
# test
# test$summary[which.max(test$summary$Accuracy),]
#
# # train the final model
# # it is preferred to increase the number of trees and rules in case you have
# # large number of samples and features
# # for quick example, we have small number of trees and rules here
# # based on the optimize_RF results we will select the parameters
# RF_classifier <- train_RF(data_object = object,
#                           gene_repetition = 1,
#                           rules_altogether = 0,
#                           rules_one_vs_rest = 10,
#                           run_boruta = FALSE,
#                           plot_boruta = FALSE,
#                           probability = TRUE,
#                           num.trees = 300,
#                           sorted_rules_RF = rules_RF,
#                           boruta_args = list(),
#                           verbose = TRUE)
#
# # training accuracy
# # get the prediction labels
# # if the classifier trained using probability	= FALSE
# training_pred <- RF_classifier$RF_scheme$RF_classifier$predictions
# if (is.factor(training_pred)) {
#   x <- as.character(training_pred)
# }
#
# # if the classifier trained using probability	= TRUE
# if (is.matrix(training_pred)) {
#   x <- colnames(training_pred)[max.col(training_pred)]
# }
#
# # training accuracy
# caret::confusionMatrix(data =factor(x),
#                 reference = factor(object$data$Labels),
#                 mode = "everything")

# not to run
# visualize the binary rules in training dataset
# plot_binary_RF(Data = object,
#                classifier = RF_classifier,
#                prediction = NULL, as_training = TRUE,
#                show_scores = TRUE,
#                top_anno = "ref",
#                show_predictions = TRUE,
#                title = "Training data")

# not to run
# Extract and plot the proximity matrix from the classifier for the training data
# it takes long time for large data
# proximity_mat <- proximity_matrix_RF(object = object,
#                       classifier = RF_classifier,
#                       plot=TRUE,
#                       return_matrix=TRUE,
#                       title = "Test",
#                       cluster_cols = TRUE)

# not to run
# predict
# test_object # any test data
# results <- predict_RF(classifier = RF_classifier, impute = TRUE,
#                       Data = test_object)
#
# # visualize the binary rules in training dataset
# plot_binary_RF(Data = test_object,
#                classifier = RF_classifier,
#                prediction = results, as_training = FALSE,
#                show_scores = TRUE,
#                top_anno = "ref",
#                show_predictions = TRUE,
#                title = "Test data")