R/fs.stability.v2.R

#' @title Classification & Feature Selection
#' @description Applies models to high-dimensional data to both classify 
#' samples and determine the features important for classification.  The 
#' function bootstraps a user-specified number of times to facilitate 
#' stability metrics of the selected features, thereby providing an 
#' important metric for biomarker investigations, namely whether the 
#' important variables can be identified if the models are refit on 
#' 'different' data.
#' @param X A scaled matrix or dataframe containing numeric values of 
#' each feature
#' @param Y A factor vector containing group membership of samples
#' @param method A vector listing models to be fit.
#' Available options are \code{"plsda"} (Partial Least Squares 
#' Discriminant Analysis), \code{"rf"} (Random Forest), \code{"gbm"} 
#' (Gradient Boosting Machine), \code{"svm"} (Support Vector Machines), 
#' \code{"glmnet"} (Elastic-net Generalized Linear Model),
#'  and \code{"pam"} (Prediction Analysis of Microarrays)
#' @param k Number of bootstrapped iterations
#' @param p Percent of data to be used for training
#' @param f Number of features desired. 
#' If rank correlation is desired, set \code{"f = NULL"}
#' @param stability.metric String indicating the type of stability metric.
#' Available options are \code{"jaccard"} (Jaccard Index/Tanimoto Distance),
#'  \code{"sorensen"} (Dice-Sorensen's Index), \code{"ochiai"} (Ochiai's Index),
#'  \code{"pof"} (Percent of Overlapping Features), \code{"kuncheva"} 
#'  (Kuncheva's Stability Measures), \code{"spearman"} (Spearman Rank 
#'  Correlation), and \code{"canberra"} (Canberra Distance)
#' @param optimize Logical argument determining if each model should 
#' be optimized. Default \code{"optimize = TRUE"}
#' @param optimize.resample Logical argument determining if each resample 
#' should be re-optimized. Default \code{"optimize.resample = FALSE"}: only 
#' one optimization run; subsequent models use initially determined parameters
#' @param tuning.grid Optional list of grids containing parameters to optimize 
#' for each algorithm.  Default \code{"tuning.grid = NULL"} lets function 
#' create grid determined by \code{"res"}
#' @param k.folds Number of folds generated during cross-validation.  
#' May optionally be set to \code{"LOO"} for leave-one-out cross-validation.  
#' Default \code{"k.folds = 10"}
#' @param repeats Number of times cross-validation repeated.  
#' Default \code{"repeats = 3"}
#' @param resolution Resolution of model optimization grid.  
#' Default \code{"resolution = 3"}
#' @param metric Criteria for model optimization.  Available options 
#' are \code{"Accuracy"} (Predication Accuracy), \code{"Kappa"} 
#' (Kappa Statistic), and \code{"AUC-ROC"} 
#' (Area Under the Curve - Receiver Operator Curve)
#' @param model.features Logical argument indicating whether the number of 
#' features selected should be determined by the individual model runs.  
#' Default \code{"model.features = FALSE"}
#' @param allowParallel Logical argument dictating if parallel processing 
#' is allowed via foreach package. Default \code{allowParallel = FALSE}
#' @param verbose Character argument specifying how much progress output 
#' to print. Options are 'none', 'minimal' or 'full'.
#' @param ... Extra arguments that the user would like to apply to the models
#'
#' @return \item{Methods}{Vector of models fit to data}
#' @return \item{performance}{Performance metrics of each model and 
#' bootstrap iteration}
#' @return \item{RPT}{Robustness-Performance Trade-Off as defined in 
#' Saeys 2008}
#' @return \item{features}{List of the features determined via each 
#' algorithm's feature selection criteria.}
#' @return \itemize{
#'  \item{metric: Stability metric applied}
#'  \item{features: Matrix of selected features}
#'  \item{stability: Matrix of pairwise comparisons and average stability}
#'  }
#' @return \item{stability.models}{Function perturbation metric, i.e. how 
#' similar the features selected by each model are.}
#' @return \item{original.best.tunes}{If \code{"optimize.resample = TRUE"} 
#' then returns list of optimized parameters for each bootstrap.}
#' @return \item{final.best.tunes}{If \code{"optimize.resample = TRUE"} 
#' then returns list of optimized parameters for each bootstrap of models 
#' refit to selected features.}
#' @return \item{specs}{List with the following elements:}
#' @return \itemize{
#'  \item{total.samples: Number of samples in original dataset}
#'  \item{number.features: Number of features in original dataset}
#'  \item{number.groups: Number of groups}
#'  \item{group.levels: The specific levels of the groups}
#'  \item{number.observations.group: Number of observations in each group}}
#' @author Charles Determan Jr
#' @references Saeys Y., Abeel T., et al. (2008) \emph{Machine Learning and 
#' Knowledge Discovery in Databases}. 313-325. 
#' http://link.springer.com/chapter/10.1007/978-3-540-87481-2_21
#' @example inst/examples/fs.stability.R
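#' @examples
#' # A minimal illustrative call (a sketch; the simulated data below are an
#' # assumption for demonstration only):
#' \dontrun{
#' set.seed(123)
#' X <- scale(matrix(rnorm(40 * 50), nrow = 40, ncol = 50))
#' Y <- factor(rep(c("A", "B"), each = 20))
#' fit <- fs.stability(X, Y,
#'                     method = c("rf", "glmnet"),
#'                     k = 5, p = 0.9, f = 10,
#'                     stability.metric = "jaccard")
#' fit$RPT        # robustness-performance trade-off
#' fit$features   # selected features with stability metrics
#' }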
#' @import DiscriMiner
#' @import randomForest
#' @import plyr
#' @importFrom caret createDataPartition
#' @import e1071
#' @import gbm
#' @import pamr
#' @import glmnet
#' @export

fs.stability <- 
    function(X,
             Y,
             method,
             k = 10,
             p = 0.9,
             f = NULL,
             stability.metric = "jaccard",
             optimize = TRUE,
             optimize.resample = FALSE,
             tuning.grid = NULL,
             k.folds = if(optimize) 10 else NULL,
             repeats = if(!optimize || identical(k.folds, "LOO")) NULL else 3,
             resolution = if(is.null(tuning.grid) && optimize) 3 else NULL,
             metric = "Accuracy",
             model.features = FALSE,
             allowParallel = FALSE,
             verbose = 'none',
             ...
    )
{    
        ## TODO: possible filter methods to consider:
        ##  - Bhattacharyya distance (e.g. fpc::bhattacharyya.dist)
        ##  - relative entropy
        
        assert_is_character(verbose)
        
        verify(x = X, y = Y, method = method, na.rm = FALSE)
        
        f <- verify_fs(f = f, 
                       stability.metric = stability.metric, 
                       model.features = model.features,
                       no.fs=FALSE)
        
        if (is.null(colnames(X))){
            colnames(X) <- paste0("X", seq_len(ncol(X)))
        }
        if (is.null(rownames(X))) rownames(X) <- seq_len(nrow(X))
        
        # convert to data.frame to use $ operator
        raw.data <- as.data.frame(X)
        raw.data$.classes <- Y
        
        nr <- nrow(X)
        nc <- ncol(X)
        # number of groups
        num.group <- nlevels(Y)
        # what the groups are
        grp.levs <- levels(Y)  
        # how many obs in each group
        num.obs.group <- as.vector(table(Y))
        theDots <- list(...)
        
        # Create empty list for features identified by each chosen algorithm
        final.features <- vector("list", k)
        names(final.features) <- paste("Resample", seq(k), sep = ".")
        
        # need to retain for SVM and PAM feature selection
        trainVars.list <- vector("list", k)
        trainGroup.list <- vector("list", k)  
        
        if(optimize && optimize.resample){
            resample.tunes <- vector("list", k)
            names(resample.tunes) <- paste("Resample", seq(k), sep=".")
        }else{
            resample.tunes <- NULL
        }
        
        inTrain <- rlply(k, caret::createDataPartition(Y, p = p, list = FALSE))
        outTrain <- lapply(inTrain, 
                           function(inTrain, total) total[-unique(inTrain)],
                           total = seq(nr))
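        # Sketch of the split logic (illustrative numbers, not computed here):
        # with nr = 100 samples and p = 0.9, createDataPartition() draws a
        # stratified index of ~90 training rows per resample, and outTrain
        # keeps the complementary ~10 held-out rows for later performance
        # estimation.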
        for(i in seq(k)){
            trainVars <- X[inTrain[[i]],, drop= FALSE]
            trainVars.list[[i]] <- trainVars
            trainGroup <- Y[inTrain[[i]], drop= FALSE]
            trainGroup.list[[i]] <- trainGroup
            
            trainData <- as.data.frame(trainVars)
            trainData$.classes <- trainGroup
            
            if(optimize == TRUE){
                if(optimize.resample == TRUE){
                    # tune the methods
                    tuned.methods <- 
                        optimize.model(
                            trainVars = trainVars,
                            trainGroup = trainGroup,
                            method = method,
                            k.folds = k.folds,
                            repeats = repeats,
                            res = resolution,
                            grid = tuning.grid,
                            metric = metric,
                            allowParallel = allowParallel,
                            verbose = verbose,
                            theDots = theDots
                            )
                    
                    # store the best tune parameters for each iteration
                    names(tuned.methods$bestTune) = method
                    resample.tunes[[i]] <- tuned.methods$bestTune
                    
                    if(i == 1){
                        finalModel <- tuned.methods$finalModel
                        finish.Model <- finalModel
                    }else{
                        finalModel <- tuned.methods$finalModel
                        finish.Model <- 
                            append(finish.Model, tuned.methods$finalModel)
                    }
                    
                    # Create empty list for features identified by each 
                    # chosen algorithm
                    features <- vector("list", length(method))
                    names(features) <- tolower(method)
                    
                    for(j in seq(along = method)){
                        ### Extract important features
                        # pam requires a special mydata argument
                        if(method[j] == "pam"){
                            mydata <- 
                                list(x=t(trainVars.list[[i]]), 
                                     y=factor(trainGroup.list[[i]]), 
                                     geneid = as.character(
                                         colnames(
                                             trainVars.list[[i]]
                                             )
                                         )
                                     )
                        }else{
                            # svm requires training data for RFE
                            mydata <- trainVars.list[[i]]
                        }
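                        # Note: pamr expects a list with a features-by-samples
                        # matrix 'x' (hence the transpose), a class factor 'y',
                        # and 'geneid' giving the feature names.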
                        
                        features[[j]] <- extract.features(
                            x = finalModel[j],
                            dat = mydata,
                            grp = trainGroup.list[[i]],
                            # add in gbm best tune trees???
                            bestTune = if(method[j] %in% 
                                          c("svm", "pam", "glmnet")){
                                tuned.methods$bestTune[[j]]
                            }else{
                                NULL
                            },
                            model.features = model.features, 
                            method = method[j], 
                            f = f, 
                            comp.catch = tuned.methods$comp.catch)
                        
                        if(stability.metric %in% c("spearman", "canberra")){
                            rownames(features[[j]]$features.selected) <- 
                                colnames(X)
                        }
                    }
                    
                    ### Re-fitting models to reduced features
                    if(verbose == 'full'){
                        cat(paste("Refitting model iteration", i, 
                                  "with selected features\n", sep = " "))
                    }
                    
                    # subset only those features which were selected
                    if(!is.null(f)){
                        trainData.new <- 
                            lapply(features, 
                                   FUN = function(x) {
                                       trainData[,colnames(trainData) %in% 
                                                     c(t(x$features.selected), 
                                                       ".classes")]
                                       })          
                    }else{
                        trainData.new <- 
                            lapply(
                                features, 
                                FUN = function(x) {
                                    trainData[, colnames(trainData) %in% 
                                                  c(
                                                      rownames(
                                                          x$features.selected
                                                      ), 
                                                      ".classes")]
                                })          
                    }
                    
                    tunedModel.new <- vector("list", length(method))
                    for(m in seq(along = method)){
                        tunedModel.new[[m]] <- 
                            optimize.model(
                                trainVars = trainData.new[[m]][,!colnames(
                                    trainData.new[[m]]
                                    ) %in% c(".classes")],
                                trainGroup = trainData.new[[m]]$.classes,
                                method = method[m],
                                k.folds = k.folds,
                                repeats = repeats,
                                res = resolution,
                                grid = tuning.grid,
                                metric = metric,
                                allowParallel = allowParallel,
                                verbose = verbose,
                                theDots = theDots)  
                    }
                    
                    
                    if(i == 1){                        
                        finalModel.new <- 
                            sapply(tunedModel.new, 
                                   FUN = function(x) x$finalModels)
                        new.best.tunes <- 
                            sapply(tunedModel.new, 
                                   FUN = function(x) x$bestTune)
                        names(new.best.tunes) <- method
                        final.features[[i]] <- 
                            lapply(features, 
                                   FUN = function(x) x$features.selected)
                        names(final.features[[i]]) <- method
                    }else{
                        tmp.model <- 
                            sapply(tunedModel.new, 
                                   FUN = function(x) x$finalModels)
                        tmp.tunes <- 
                            sapply(tunedModel.new, 
                                   FUN = function(x) x$bestTune)
                        names(tmp.tunes) <- method
                        finalModel.new <- c(finalModel.new, tmp.model)
                        new.best.tunes <- append(new.best.tunes, tmp.tunes)
                        final.features[[i]] <- 
                            lapply(features, 
                                   FUN = function(x) x$features.selected)
                        names(final.features[[i]]) <- method
                    }  
                    
                    # end of optimize.resample loop
                }else{
                    if(i == 1){
                        tuned.methods <- 
                            optimize.model(
                                trainVars = trainVars,
                                trainGroup = trainGroup,
                                method = method,
                                k.folds = k.folds,
                                repeats = repeats,
                                res = resolution,
                                grid = tuning.grid,
                                metric = metric,
                                verbose = verbose,
                                allowParallel = allowParallel,
                                theDots = theDots
                                )
                        finalModel <- tuned.methods$finalModel
                        finish.Model <- tuned.methods$finalModel
                    }else{
                        # Fit remainder of resamples with initial 
                        # best parameters
                        tmp <- vector("list", length(method))
                        names(tmp) <- method
                        
                        for(d in seq(along = method)){
                            tmp[[d]] <- 
                                training(
                                    data = trainData,
                                    method = method[d],
                                    tuneValue = tuned.methods$bestTune[[d]],
                                    obsLevels = grp.levs,
                                    theDots = theDots)$fit
                        }
                        finalModel <- tmp
                        finish.Model <- append(finish.Model, tmp)
                    }
                    
                    # Create empty list for features identified by each 
                    # chosen algorithm
                    features <- vector("list", length(method))
                    
                    for(j in seq(along = method)){
                        ### Extract important features
                        # pam requires a special mydata argument
                        if(method[j] == "pam"){
                            mydata <- 
                                list(x=t(trainVars.list[[i]]), 
                                     y=factor(trainGroup.list[[i]]), 
                                     geneid = as.character(
                                         colnames(
                                             trainVars.list[[i]]
                                             )
                                         )
                                     )
                        }else{
                            # svm requires training data for RFE
                            mydata <- trainVars.list[[i]]
                        }
                        
                        features[[j]] <- extract.features(
                            x = finalModel[j],
                            dat = mydata,
                            grp = trainGroup.list[[i]],
                            # add in gbm best tune trees???
                            bestTune = if(method[j] %in% 
                                          c("svm", "pam", "glmnet", "gbm")){
                                tuned.methods$bestTune[[j]]
                            }else{
                                NULL
                            },
                            model.features = model.features, 
                            method = method[j], 
                            f = f, 
                            comp.catch = tuned.methods$comp.catch)
                        
                        if(stability.metric %in% c("spearman", "canberra")){
                            features[[j]]$features.selected <- 
                                data.frame(features[[j]]$features.selected)
                            colnames(features[[j]]$features.selected) <-
                                method[j]
                            rownames(features[[j]]$features.selected) <- 
                                colnames(X)
                        }
                    }
                    
                    ### Re-fitting models to reduced features
                    if(verbose == 'full'){
                        cat(paste("Refitting model iteration", i, 
                                  "with selected features\n", sep = " "))
                    }
                    
                    # subset only those features which were selected
                    if(!is.null(f) || model.features){
                        trainData.new <- 
                            lapply(features, 
                                   FUN = function(x){
                                       trainData[,colnames(trainData) 
                                                 %in% c(t(x$features.selected),
                                                        ".classes")]
                                   })          
                    }else{
                        trainData.new <- 
                            lapply(features, 
                                   FUN = function(x){
                                       trainData[,colnames(trainData) 
                                                 %in% c(
                                                     rownames(
                                                         x$features.selected
                                                         ), 
                                                     ".classes")]
                                   })          
                    }
                    
                    if(i == 1){
                        tunedModel.new <- vector("list", length(method))
                        for(m in seq(along = method)){
                            if(is.null(tuning.grid)){
                                grid <- denovo.grid(data = trainData, 
                                                    method = method[m], 
                                                    res = resolution)
                            }
                            # if generated grid and running gbm, reduce 
                            # n.trees for gbm
                            # the large number of trees cause NaN for 
                            # most values
                            if(method[m] == "gbm" & is.null(tuning.grid)){
                                gbm.trees <- grid$gbm$.n.trees
                                grid$gbm$.n.trees <- gbm.trees/10
                            }
                            
                            # if generated grid and running rf, reduce
                            # mtry for rf otherwise may be higher
                            # than number of variables
                            if(method[m] == "rf" & is.null(tuning.grid)){
                                grid$rf <- data.frame(.mtry = 
                                    floor(
                                        seq(1, 
                                            to=ncol(trainData.new[[m]])-1, 
                                            length=resolution)
                                    )
                                )
                            }
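                            # Worked example: with resolution = 3 and 20
                            # retained features (19 predictors), this gives
                            # .mtry = floor(seq(1, 19, length = 3)) = 1, 10, 19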
                            
                            tunedModel.new[[m]] <- 
                                optimize.model(
                                    trainVars = trainData.new[[m]][,!colnames(
                                        trainData.new[[m]]) 
                                        %in% c(".classes")],
                                    trainGroup = trainData.new[[m]]$.classes,
                                    method = method[m],
                                    k.folds = k.folds,
                                    repeats = repeats,
                                    res = resolution,
                                    grid = grid,
                                    metric = metric,
                                    allowParallel = allowParallel,
                                    verbose = verbose,
                                    theDots = theDots)    
                        }
                    }else{
                        tmp <- vector("list", length(method))
                        names(tmp) <- method
                        
                        for(d in seq(along = method)){
                            tmp[[d]] <- 
                                training(
                                    data = trainData.new[[d]],
                                    method = method[d],
                                    tuneValue = as.data.frame(t(
                                        unlist(tunedModel.new[[d]]$bestTune))),
                                    obsLevels = grp.levs,
                                    theDots = theDots)$fit
                        }
                    }
                    
                    if(i == 1){
                        finalModel.new <- 
                            sapply(tunedModel.new, 
                                   FUN = function(x) x$finalModels)
                        new.best.tunes <- 
                            sapply(tunedModel.new, 
                                   FUN = function(x) x$bestTune)
                        final.features[[i]] <- 
                            lapply(features, function(x) x$features.selected)
                    }else{
                        tmp.model <- lapply(tmp, FUN = function(x) x)
                        finalModel.new <- c(finalModel.new, tmp.model)
                        final.features[[i]] <- 
                            lapply(features, function(x) x$features.selected)
                    }  
                    
                } # end of single optimized loop
                # end of optimize loop 
            }else{        
                names(theDots) <- paste(".", names(theDots), sep="")
                
                # sequester appropriate parameters to fit models
                args.seq <- sequester(theDots, method)
                
                # remove arguments used from theDots - also remove '.' 
                # from each
                names(theDots) <- sub("^\\.", "", names(theDots))
                moreDots <- theDots[!names(theDots) %in% args.seq$pnames]
                if(length(moreDots) == 0){
                    moreDots <- NULL
                }
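                # Illustrative intent (a sketch; exact matching happens inside
                # sequester()): a model parameter passed through '...', e.g.
                # mtry for rf, is routed to that model's tuneValue below,
                # while unmatched arguments stay in moreDots for training().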
                
                tmp <- vector("list", length(method))
                for(q in seq(along = method)){
                    tmp[[q]] <- training(data = trainData,
                                         method = method[q],
                                         tuneValue = args.seq$parameters[[q]],
                                         obsLevels = grp.levs,
                                         theDots = moreDots)$fit  
                }
                
                finalModel <- tmp
                if(i == 1){
                    finish.Model <- finalModel
                }else{
                    finish.Model <- append(finish.Model, tmp)
                }
                
                # Create empty list for features identified by each 
                # chosen algorithm
                features <- vector("list", length(method))
                names(features) <- tolower(method)
                
                for(j in seq(along = method)){
                    ### Extract important features
                    # pam requires a special mydata argument
                    if(method[j] == "pam"){
                        mydata <- 
                            list(x=t(trainVars.list[[i]]), 
                                 y=factor(trainGroup.list[[i]]), 
                                 geneid = as.character(
                                     colnames(trainVars.list[[i]])
                                     )
                                 )
                    }else{
                        # svm requires training data for RFE
                        mydata <- trainVars.list[[i]]
                    }
                    
                    features[[j]] <- extract.features(
                        x = finalModel[j],
                        dat = mydata,
                        grp = trainGroup.list[[i]],
                        # add in gbm best tune trees???
                        bestTune = if(method[j] %in% 
                                      c("svm", "pam", "glmnet")){
                            args.seq$parameters[[j]]
                        }else{
                            NULL
                        },
                        model.features = model.features, 
                        method = method[j], 
                        f = f, 
                        # tuned.methods does not exist in this branch (no
                        # optimization), so there is no component catch to pass
                        comp.catch = NULL)
                    
                    if(stability.metric %in% c("spearman", "canberra")){
                        rownames(features[[j]]$features.selected) <- 
                            colnames(X)
                    }
                }
                
                ### Re-fitting models to reduced features
                if(verbose == 'full'){
                    cat(paste("Iteration ", i, 
                              " Refitting models with selected features\n", 
                              sep = ""))
                }
                
                # subset only those features which were selected
                if(!is.null(f)){
                    trainData.new <- 
                        lapply(features,
                               FUN = function(x) {
                                   trainData[,colnames(trainData) 
                                             %in% c(t(x$features.selected), 
                                                    ".classes")]
                               })          
                }else{
                    trainData.new <- 
                        lapply(features, 
                               FUN = function(x) {
                                   trainData[,colnames(trainData) 
                                             %in% c(
                                                 rownames(x$features.selected),
                                                 ".classes")]
                               })          
                }
                
                tunedModel.new <- vector("list", length(method))
                for(q in seq(along = method)){
                    tunedModel.new[[q]] <- 
                        training(data = trainData.new[[q]],
                                 method = method[q],
                                 tuneValue = args.seq$parameters[[q]],
                                 obsLevels = grp.levs,
                                 theDots = moreDots)$fit  
                }
                
                if(i == 1){
                    finalModel.new <- 
                        lapply(tunedModel.new, FUN = function(x) x)
                    names(finalModel.new) <- method
                    final.features[[i]] <- 
                        sapply(features, FUN = function(x) x$features.selected)
                    names(final.features[[i]]) <- method
                }else{
                    tmp.model <- lapply(tunedModel.new, FUN = function(x) x)
                    names(tmp.model) <- method
                    finalModel.new <- c(finalModel.new, tmp.model)
                    final.features[[i]] <- 
                        sapply(features, FUN = function(x) x$features.selected)
                    names(final.features[[i]]) <- method
                }  
            } # end of non-optimized sequence
        } # end of stability loop
        
        
        ### Performance Metrics of Reduced Models
        if(verbose %in% c('minimal', 'full')){
            cat("Calculating Model Performance Statistics\n")  
        }  

        if(stability.metric %in% c("spearman", "canberra")){
            final.features <- lapply(unlist(final.features, recursive=FALSE), 
                                     function(x) as.numeric(as.vector(unlist(x))))
        }else{
            final.features <- lapply(unlist(final.features, recursive=FALSE), 
                                     function(x) as.vector(unlist(x)))
        }

        names(final.features) <- 
            rep(method, length(finalModel.new)/length(method))
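        # e.g. with method = c("rf", "svm") and k = 2 resamples, the names
        # become "rf", "svm", "rf", "svm", matching the unlisted
        # resample-major order above.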

        final.metrics <-
            prediction.metrics(finalModel = finalModel.new,
                               method = method,
                               raw.data = raw.data,
                               inTrain = inTrain,
                               outTrain = outTrain,
                               features = final.features,
                               bestTune = if(optimize){new.best.tunes}else{
                                   args.seq$parameters},
                               grp.levs = grp.levs,
                               stability.metric = stability.metric)
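        # final.metrics carries one row per model per resample: confusion-cell
        # counts (columns prefixed "cell", dropped below) alongside summary
        # statistics such as Accuracy and Kappa that feed 'performance'.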

        ### Extract Performance Metrics
        if(optimize == TRUE){
            if(optimize.resample == TRUE){        
                colnames(final.metrics) <- 
                    gsub("^\\.", "", colnames(final.metrics))  
                performance <- vector("list", length(method))
                names(performance) <- method
                final.metrics <- 
                    final.metrics[,!grepl("^cell", 
                                          colnames(final.metrics)),
                                  drop = FALSE]
                
                # sort the list elements so applied in the proper order
                method.names <- 
                    unlist(
                        lapply(method, 
                               FUN = function(x){
                                   c(rep(x, 
                                         length(new.best.tunes)/length(method)
                                         )
                                     )
                               })
                    )
                new.best.tunes <- 
                    new.best.tunes[order(names(new.best.tunes), 
                                         levels = method.names)]
                
                # dataframe retains colnames after split
                # zero-padded labels keep split() in row order below
                # (letters[] would fail beyond 26 rows)
                rownames(final.metrics) <- 
                    sprintf("r%03d", seq(nrow(final.metrics)))
                test.final.metrics <- 
                    split(as.data.frame(final.metrics), rownames(final.metrics))
                for(i in seq(along = test.final.metrics)){
                    rownames(test.final.metrics[[i]]) <- 1
                }
                rownames(final.metrics) <- method.names
                all.model.perfs <- 
                    mapply(new.best.tunes, 
                           FUN = function(x, y) {
                               list(x,y)}, y = test.final.metrics, 
                           SIMPLIFY = FALSE)
                
                # name resamples within all.model.perfs
                samps <- 
                    rep(rep(paste("Resample", seq(k), sep = "."), each = 2), 
                        length(method))
                for(i in seq(length(all.model.perfs))){
                    if(i == 1){
                        start = i
                        end = i+1
                    }
                    names(all.model.perfs[[i]]) <- samps[start:end]
                    start = start + 2
                    end = end + 2
                }
                
                x <- final.metrics[,!grepl("^cell", 
                                           colnames(final.metrics)),
                                   drop = FALSE]
                for(h in seq(along = method)){
                    tmp <- subset(x, rownames(x) == method[h])
                    performance[[h]] <- c(colMeans(tmp, na.rm = TRUE), 
                                          apply(tmp, 2, sd, na.rm = TRUE))
                    names(performance[[h]])[-(1:ncol(tmp))] <- 
                        paste(names(performance[[h]])[-(1:ncol(tmp))], 
                              "SD", sep = " ")
                    performance[[h]] <- t(data.frame(performance[[h]]))
                    rownames(performance[[h]]) <- 1
                }
            }else{
                colnames(final.metrics) <- 
                    gsub("^\\.", "", colnames(final.metrics))  
                performance <- vector("list", length(method))
                names(performance) <- method
                x <- final.metrics[,!grepl("^cell", 
                                           colnames(final.metrics)),
                                   drop = FALSE]
                for(h in seq(along = method)){
                    tmp <- subset(x, rownames(x) == method[h])
                    performance[[h]] <- c(colMeans(tmp, na.rm = TRUE), 
                                          apply(tmp, 2, sd, na.rm = TRUE))
                    names(performance[[h]])[-(1:ncol(tmp))] <- 
                        paste(names(performance[[h]])[-(1:ncol(tmp))], 
                              "SD", sep = " ")
                    performance[[h]] <- 
                        do.call(cbind, 
                                c(as.vector(new.best.tunes[[h]]),
                                  performance[[h]]))
                    colnames(performance[[h]]) <- 
                        gsub("^\\.", "", colnames(performance[[h]]))
                    rownames(performance[[h]]) <- 1
                }
            }
        }else{
            colnames(final.metrics) <- gsub("^\\.", "", colnames(final.metrics))  
            performance <- vector("list", length(method))
            names(performance) <- method
            for(h in seq(along = method)){
                x <- final.metrics[,!grepl("^cell", 
                                           colnames(final.metrics)),
                                   drop = FALSE]
                tmp <- subset(x, rownames(x) == method[h])
                performance[[h]] <- c(colMeans(tmp, na.rm = TRUE), 
                                      apply(tmp, 2, sd, na.rm = TRUE))
                names(performance[[h]])[-(1:ncol(tmp))] <- 
                    paste(names(performance[[h]])[-(1:ncol(tmp))], 
                          "SD", sep = " ")
                performance[[h]] <- 
                    do.call(cbind, c(as.vector(args.seq$parameters[[h]]), 
                                     performance[[h]]))
                colnames(performance[[h]]) <- 
                    gsub("^\\.", "", colnames(performance[[h]]))
                rownames(performance[[h]]) <- 1
            }
        }
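        # At this point each performance[[h]] is a one-row table of resample
        # means with matching "<name> SD" columns holding the resample
        # standard deviations.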

        # need to split features into length(method) dataframes 
        # for pairwise.stability  
        results.stability <- vector("list", length(method))
        names(results.stability) <- method
        if(!model.features){
            if(length(method) == 1){
                if(stability.metric %in% c("spearman", "canberra")){
                    results.stability[[1]] <- 
                        final.features[which(
                            names(final.features) == method[1]
                        )]

                    results.stability[[1]] <- 
                        sapply(results.stability[[1]], 
                               setNames, 
                               colnames(X))
                    
                }else{
                    results.stability[[1]] <- 
                        do.call("cbind", final.features)
                }
            }else{
                if(stability.metric %in% c("spearman", "canberra")){
                    for(c in seq(along = method)){
                        results.stability[[c]] <- 
                            final.features[which(
                                        names(final.features) == method[c]
                                    )]
                        
                        results.stability[[c]] <- 
                            sapply(results.stability[[c]], 
                                   setNames, 
                                   colnames(X))
                    }
                }else{
                    for(c in seq(along = method)){
                        results.stability[[c]] <- 
                            do.call("cbind", 
                                    final.features[which(
                                        names(final.features) == method[c]
                                    )]
                            )
                    }
                }
            }
        }else{
            for(c in seq(along = method)){
                tmp <- do.call(cbind.fill, 
                               final.features[which(
                                   names(final.features) == method[c]
                               )]
                )
                results.stability[[c]] <- tmp
            }      
        }

        # Calculate All Pairwise Stability Measurements for 
        # each algorithm's set of features
        if(any(unlist(lapply(results.stability, ncol)) == 1)){
            warning("No replicates were run, stability will be set to NULL.
                    If stability not desired, see 'fit.only.model'")
            stability <- NULL
        }else{
            stability <- lapply(results.stability, 
                                pairwise.stability, 
                                stability.metric = stability.metric, 
                                nc= nc) 
        }
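        # For k resamples this yields choose(k, 2) pairwise comparisons per
        # algorithm; e.g. the default Jaccard index scores two feature sets
        # A and B as length(intersect(A, B)) / length(union(A, B)), so k = 10
        # gives 45 pairwise values that are then averaged.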
        
        # stability across algorithms 
        # (i.e. 'function perturbation' ensemble analysis)
        if(length(method) > 1){
            stability.models <- 
                pairwise.model.stability(features = results.stability,
                                         stability.metric = stability.metric,
                                         nc = nc)
        }else{
            stability.models <- NULL
        }
        
        # harmonic mean of stability and performance
        if(!is.null(stability)){
            rpt.stab <- lapply(stability, FUN = function(x) x$overall)
            rpt.perf <- lapply(performance, 
                               FUN = function(x) as.data.frame(x)$Accuracy)
            rpt <- 
                mapply(rpt.stab, 
                       FUN = function(x,y){
                           RPT(stability = x, performance = y)
                       }, y = rpt.perf)
        }else{
            rpt <- NULL
        }
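        # RPT is the harmonic-mean form of Saeys 2008:
        #     RPT = (2 * stability * performance) / (stability + performance)
        # e.g. stability 0.8 with accuracy 0.9 gives 1.44 / 1.7 ~ 0.847.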
       
        
        # add stability metrics to features selected
        for(i in seq(along = method)){
            results.stability[[i]] <- 
                list(metric = stability.metric, 
                     features = results.stability[[i]], 
                     stability = stability[[i]])
        }
        
        ### Desired Output 
        ## specifications
        specs <- list(total.samples=nr, 
                     number.features=nc, 
                     number.groups=num.group, 
                     group.levels=grp.levs, 
                     number.observations.group=num.obs.group)
        
        ## add remainder of data
        overall <- list(
            methods = method,                     
            performance = performance,            
            RPT = rpt,                            
            features = results.stability, 
            stability.models = stability.models,  
            original.best.tunes = resample.tunes, 
            final.best.tunes = if(optimize.resample) all.model.perfs else NULL,   
            specs = specs,
            perf.table = x
        )
        return(overall)
    }