#' Tuning of the mtry parameter for a Random Forest model
#'
#' \code{tuneMTRY} tries to identify the 'optimal' value of the mtry parameter, which indicates the
#' number of input variables randomly sampled as candidates at each split
#' @param data the n x p dataframe used to build the models and to tune the parameter mtry. The first two columns
#' must represent respectively the sample names and the class labels associated with each sample
#' @param iterations the number of different random forest models built for each value of mtry
#' @param maxntree the maximum number of trees of each random forest model
#' @param mtry_length an integer value representing the number of mtry values to test.
#' @param changeTreeNum a logical value indicating whether or not to change the number of trees
#' during the tuning of mtry
#' @param graph a logical value indicating whether to plot the OOB error as a function of the mtry parameter
#' @return a list of two elements:\itemize{
#' \item oob an n x p matrix of OOB errors, with one row per iteration and one column per mtry value tested
#' \item diagram a plot of the average OOB error as a function of mtry, with its 95\% confidence interval
#' (NULL if \code{graph = FALSE})
#' }
#' @author Piergiorgio Palla
#' @importFrom randomForest randomForest
#' @details The function searches for the optimal value of mtry by assigning it a set of values and building different random forests (possibly with a different
#' number of trees) for each value of mtry. The number of models built for each mtry is defined by the \code{iterations} parameter.
#' The OOB errors of the random forest models, computed for each mtry value, are then arranged in a matrix
#' @examples
#' ## data(cachexiaData)
#' ## res <- tuneMTRY(cachexiaData, iterations = 10, maxntree = 600, mtry_length = 10, graph = FALSE)
#' @export
tuneMTRY <- function(data, iterations, maxntree, mtry_length, changeTreeNum = FALSE, graph = TRUE) {
### Predictor matrix: drop the sample-name and class-label columns
input <- data[, 3:ncol(data)]
numOfVariables <- ncol(input)
DEFAULT_N_TREES <- 500
DEFAULT_ITERATIONS <- 40
DEFAULT_MTRY_VALUES_TO_TEST <- numOfVariables
if (hasArg("iterations") && !is.null(iterations)) {
iterat <- iterations
} else {
iterat <- DEFAULT_ITERATIONS
}
if (hasArg("maxntree") && !is.null(maxntree)) {
nt <- maxntree
} else {
nt <- DEFAULT_N_TREES
}
if (hasArg("mtry_length") && !is.null(mtry_length) && mtry_length <= numOfVariables) {
mtry_length <- mtry_length
} else {
mtry_length <- DEFAULT_MTRY_VALUES_TO_TEST
}
set.seed(1234)
mtry_values <- seq(from = 1, to = numOfVariables, length.out = mtry_length)
typical <- sqrt(numOfVariables)
### Add the conventional default value of mtry (sqrt of the number of variables) to the values to test
if (length(mtry_values) < numOfVariables) {
mtry_values <- c(mtry_values, typical)
}
mtry_values <- floor(sort(mtry_values))
mtry_values <- unique(mtry_values)
if (changeTreeNum) {
ntree_values <- floor(seq(from = 50, to = nt, length.out = iterat))
} else {
ntree_values <- rep(x = nt, times = iterat)
}
global_oob <- c()
START <- proc.time()
for (mtr in mtry_values) {
### The sequence of models built for each mtry value starts from the same seed ####
set.seed(1234)
oob <- c()
for (i in 1:iterat) {
ntree <- ntree_values[i]
### The formula interface looks up 'class' in the calling environment, since it is not a column of 'input'
class <- data[, 2]
rf <- randomForest::randomForest(class ~ ., data = input, mtry = mtr, ntree = ntree)
print(rf)
####### Keep only the OOB error rate of the fully grown forest (last row of err.rate)
errors <- rf$err.rate[ntree, 1]
oob <- append(oob, errors)
}
global_oob <- cbind(global_oob, oob)
}
colnames(global_oob) <- mtry_values
ELAPSED <- proc.time() - START
cat(paste("Elapsed time:", round(ELAPSED[3], 2), "sec", sep = " "), "\n")
cat("\nNumber of trees used to build each Random Forest model\n")
cat(ntree_values, "\n")
cat("\nMTRY values tested\n")
cat(mtry_values)
###
g <- global_oob
## Here we set up the data needed to plot the 95% confidence interval of the mean of the OOB errors for each mtry value
res <- optimizeMTRY(g)
### Then we plot with the package ggplot2
mean_matrix <- res$mean_matrix
ci_matrix <- res$ci_matrix
if (graph) {
diagram <- plotOOBvsMTRY(mean_matrix, ci_matrix)
### Inside a function a ggplot object must be printed explicitly to be displayed
print(diagram)
} else {
diagram <- NULL
}
return(list(oob = g, diagram = diagram))
}
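## A minimal usage sketch on hypothetical toy data (not a package dataset):
## the data frame mirrors the expected layout, with sample names in column 1
## and a factor of class labels in column 2.
# set.seed(42)
# toy <- data.frame(sample = paste0("s", 1:40),
#                   class = factor(rep(c("A", "B"), each = 20)),
#                   matrix(rnorm(40 * 10), nrow = 40))
# out <- tuneMTRY(toy, iterations = 5, maxntree = 100, mtry_length = 5, graph = FALSE)
# dim(out$oob)  # iterations x number of mtry values tested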
#' Mtry Optimization
#'
#' This function provides a 'population' estimate of the average OOB error computed for different mtry values,
#' starting from a sample of N models. These values are used to compute the mtry associated with the minimum average OOB error,
#' which is the optimal parameter we are looking for.
#'
#'
#' @param oob_matrix an n x p matrix of n OOB error values (one for each iteration) and p columns (one for each mtry value tested).
#' Each value of a column is the OOB error of a model grown with a particular mtry. Typically for each mtry,
#' we will have N different models (N > 30), a sample large enough to provide an estimate of the average OOB
#' error for the corresponding population of models.
#' @return a list of three elements: \itemize{
#' \item mean_matrix a 1 x p matrix containing the mean of each OOB error sample (resulting from the training of N different Random Forest models grown
#' for each mtry value tested)
#' \item ci_matrix a 2 x p matrix in which each column represents the 95\% confidence interval of the mean of the population of OOB errors for each
#' mtry value
#' \item sd_matrix a 1 x p matrix containing the standard deviation of each OOB error sample resulting from the training of N different models
#' built for each value of mtry}
#'
#' @examples
#' ## data(cachexiaData)
#' ## res <- tuneMTRY(cachexiaData, iterations = 50, maxntree = 600, mtry_length = 10, graph = F)
#' ## l <- optimizeMTRY(res$oob)
#' @author Piergiorgio Palla
#' @import UsingR
#' @export
optimizeMTRY <- function(oob_matrix) {
## Compute the mean and standard deviation of each OOB error sample (one column per mtry value),
## then build the interval estimate of the mean of the 'population' from which each sample has been drawn.
mean_matrix <- colMeans(oob_matrix)
sd_matrix <- apply(oob_matrix, 2, sd)
ci_matrix <- c()
for (i in 1:ncol(oob_matrix)) {
sample <- oob_matrix[, i]
### sample standard deviation (denominator: n-1)
stdev <- sd(sample)
### The use of the z test requires the knowledge of the variance / standard deviation of the population. In this case we do
### not know those values, but for 'large' sample sizes (n > 30) the sample standard deviation is a good unbiased estimator
### of the corresponding parameter. This way we can use the z-test to estimate our confidence intervals of the population
### mean oob errors. We can also use a one sample t-test.
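### For conf.level = 0.95 this interval is mean(sample) +/- qnorm(0.975) * stdev / sqrt(length(sample))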
ci <- UsingR::simple.z.test(sample, sigma = stdev, conf.level = 0.95)
ci_matrix <- cbind(ci_matrix, ci)
# Alternative: a one-sample t-test, e.g. ci <- t.test(sample)$conf.int
}
colnames(ci_matrix) <- names(mean_matrix)
res <- list(mean_matrix = mean_matrix, ci_matrix = ci_matrix, sd_matrix = sd_matrix)
return(res)
}
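## Sketch: recovering the 'optimal' mtry from the output of optimizeMTRY()
## (the same lookup plotOOBvsMTRY() performs internally); `res` is assumed
## to hold the result of a previous tuneMTRY() run.
# l <- optimizeMTRY(res$oob)
# best_mtry <- as.numeric(names(which.min(l$mean_matrix)))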
#' Tuning of the ntree parameter (i.e. the number of trees) for a Random Forest model
#'
This function tries to find the 'optimal' value of the ntree parameter, which indicates
#' the number of trees used to grow the ensemble. To do that, it builds several random forest models with a different
#' number of trees for the mtry value considered. The number of models built for each ntree value is equal to
#' the \code{iterations} parameter. The OOB errors of each random forest model, computed
#' for each ntree value, are arranged in a matrix.
#' @param data the n x p dataframe used to build the Random Forest models. The first two columns must represent respectively
#' the sample names and the class labels associated with each sample
#' @param mtry the chosen mtry value
#' @param iterations the number of Random Forest models to be built for each value of ntree
#' @param minNTREE the minimum number of trees of each random forest model.
#' @param pace the pace between each value of ntree to be tested
#' @param seq_length the number of ntree values to be tested
#' @return an n x p matrix in which n is the number of models considered and p is the number of ntree values tested.
#' Each column contains the OOB errors resulting from each model for the corresponding ntree value
#' @examples
#' ## data(cachexiaData)
#' ## res <- tuneNTREE(cachexiaData, 8, iterations = 50, minNTREE = 600, pace = 100, seq_length = 10)
#' @author Piergiorgio Palla
#' @export
tuneNTREE <- function(data, mtry, iterations, minNTREE = 500, pace = 100, seq_length = 5) {
### input dataset ###
input <- data[, 3:dim(data)[2]]
ntree_range <- seq(from = minNTREE, by = pace, length.out = seq_length)
START <- proc.time()
### Output matrix initialization ###
global_oob <- c()
for (t in ntree_range) {
### The sequence of models built for each ntree value starts from the same seed ####
set.seed(1234)
oob <- c()
for (i in 1:iterations) {
class <- data[, 2]
rf <- randomForest::randomForest(class ~ ., data = input, mtry = mtry, ntree = t)
print(rf)
####### Keep only the OOB error rate of the fully grown forest (last row of err.rate)
errors <- rf$err.rate[t, 1]
oob <- append(oob, errors)
}
global_oob <- cbind(global_oob, oob)
}
colnames(global_oob) <- ntree_range
ELAPSED <- proc.time() - START
cat(paste("Elapsed time:", round(ELAPSED[3], 2), "sec", sep = " "), "\n")
cat("\n\nNumber of trees used to build each Random Forest model\n")
cat(ntree_range, "\n")
### Matrix with the distribution of the OOB errors for each ntree value considered.
### Its number of rows equals the number of models built for each ntree value (i.e. iterations).
return(global_oob)
}
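## Sketch: comparing ntree values by averaging the OOB matrix returned by
## tuneNTREE(); this assumes the cachexiaData layout described above and is
## illustrative, not part of the package examples.
# oob <- tuneNTREE(cachexiaData, mtry = 8, iterations = 30)
# colMeans(oob)  # average OOB error for each ntree value tested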
#' Plotting the average OOB error and its 95\% confidence interval as a function of the mtry parameter
#' @param mean_matrix a 1 x p matrix where p is the number of mtry values tested. Each value represents the average
#' OOB error obtained training multiple Random Forest models with a defined value of mtry
#' @param ci_matrix a 2 x p matrix containing the extremes of the confidence interval of the average OOB error.
#' @return a graphical representation of the average OOB error as a function of the mtry parameter.
#' @author Piergiorgio Palla
#' @examples
#' ## data(cachexiaData)
#' ## res <- tuneMTRY(cachexiaData, iterations = 5, maxntree = 600, mtry_length = 10, graph = F)
#' ## l <- optimizeMTRY(res$oob)
#' ## plotOOBvsMTRY(l$mean_matrix, l$ci_matrix)
# @import ggplot2
plotOOBvsMTRY <- function(mean_matrix, ci_matrix) {
min_mtry <- as.numeric(names(which.min(mean_matrix)))
### R CMD check workaround for ggplot2 ###
mtry <- NULL
mtry_vector <- as.numeric(names(mean_matrix))
OOB.error <- NULL
L <- NULL
U <- NULL
### simple.z.test() returns c(lower, upper): row 1 of ci_matrix is the lower bound, row 2 the upper
df <- data.frame(mtry = mtry_vector, OOB.error = mean_matrix, L = ci_matrix[1, ], U = ci_matrix[2, ], min_mtry = min_mtry)
diagram <- ggplot2::ggplot(df, ggplot2::aes(mtry)) + ggplot2::geom_line(ggplot2::aes(y = OOB.error), colour = "blue") +
ggplot2::geom_ribbon(ggplot2::aes(ymin = L, ymax = U), alpha = 0.2)
diagram <- diagram + ggplot2::geom_vline(ggplot2::aes(xintercept = min_mtry), colour = "#BB0000", linetype = "dashed") +
ggplot2::ggtitle("Mtry Optimization")
diagram
}