R/MUVR2.R

Defines functions MUVR2

Documented in MUVR2

#' MUVR2 with PLS and RF
#'
#' "Multivariate modelling with Unbiased Variable selection" using PLS and RF.
#' Repeated double cross validation with tuning of variables in the inner loop.
#' @param X Predictor variables. NB: Variables (columns) must have names/unique identifiers. NAs not allowed in data. For multilevel, only the positive half of the difference matrix is specified.
#' @param Y Response vector (Dependent variable). For classification, a factor (or character) variable should be used. For multilevel, Y is calculated automatically.
#' @param ID Subject identifier (for sampling by subject; Assumption of independence if not specified)
#' @param scale If TRUE, the predictor variable matrix is scaled to unit variance for PLS modeling.
#' @param nRep Number of repetitions of double CV. (Defaults to 5)
#' @param nOuter Number of outer CV loop segments. (Defaults to 6)
#' @param nInner Number of inner CV loop segments. (Defaults to nOuter - 1)
#' @param varRatio Ratio of variables to include in subsequent inner loop iteration. (Defaults to 0.75)
#' @param DA Boolean for Classification (discriminant analysis) (By default, if Y is numeric -> DA = FALSE. If Y is factor (or character) -> DA = TRUE)
#' @param fitness Fitness function for model tuning (choose either 'AUROC' or 'MISS' (default) for classification; or 'RMSEP' (default) for regression.)
#' @param method Multivariate method. Supports 'PLS' and 'RF' (default)
#' @param methParam List with parameter settings for specified MV method (see function code for details)
#' @param ML Boolean for multilevel analysis (defaults to FALSE)
#' @param modReturn Boolean for returning outer segment models (defaults to FALSE). Setting modReturn = TRUE is required for making MUVR predictions using predMV().
#' @param logg Boolean for whether to sink model progressions to `log.txt`
#' @param parallel Boolean for whether to perform `foreach` parallel processing (Requires a registered parallel backend; Defaults to `TRUE`)
#' @param keep Confounder variables can be added. NB: Variables (columns) must match column names.
#' @param weigh_added To add a weighing matrix when it is classfication
#' @param weighing_matrix The matrix used for get a miss classfication score
#' @param ... additional argument
#' @return A 'MUVR' object
#' @importFrom randomForest randomForest
#' @importFrom ranger ranger
#' @export
#' @examples
#' \donttest{
#' data(freelive2)
#' nRep <- 2 # Number of MUVR2 repetitions
#' nOuter <- 3 # Number of outer cross-validation segments
#' varRatio <- 0.6 # Proportion of variables kept per iteration
#' method <- 'PLS' # Selected core modeling algorithm
#' regrModel <- MUVR2(X = XRVIP2,
#'                    Y = YR2,
#'                    nRep = nRep,
#'                    nOuter = nOuter,
#'                    varRatio = varRatio,
#'                    method = method,
#'                    modReturn = TRUE)
#'}

MUVR2 <- function(X,
                 Y,
                 ID,
                 scale = TRUE,
                 nRep = 5,
                 nOuter = 6,
                 nInner,
                 varRatio = 0.75,
                 DA = FALSE,
                 fitness = c('AUROC', 'MISS', 'BER', 'RMSEP', 'wBER', 'wMISS'),
                 method = c('PLS', ' RF', "ANN", "SVM"),
                 methParam,
                 ML = FALSE,
                 modReturn = FALSE,
                 logg = FALSE,
                 parallel = TRUE,
                 weigh_added = FALSE,
                 weighing_matrix = NULL,
                 keep,
                 ...) {


  if (is.null(weighing_matrix) & DA == TRUE) {
    weighing_matrix <- diag(1, length(levels(Y)), length(levels(Y)))
  }
  ##library(kernlab)
  ##library(e1071)
  # Start timer
  start.time <- proc.time()[3]

  # Initialise modelReturn with function call
  modelReturn <- list(call = match.call())

  # Default core modelling method
  if (missing(method)) {
    method <- 'RF'
  }
  if (!missing(method)) {
    if (method != "PLS" &
        method != "RF" & method != "ANN" & method != "SVM") {
      stop("This method could not be implemented.")
    }
  }

  # Call in relevant package(s)
  ##library(pROC)
  ##library(foreach)

  # Parallel processing
  if (parallel) {
    "%doVersion%" <- get("%dopar%")
  } else{
    "%doVersion%" <- get("%do%")
  }

  # Rough check indata
  if (length(dim(X)) != 2) {
    stop('\nWrong format of X matrix.\n')
  }
  if (is.null(colnames(X))) {
    stop('\nNo column names in X matrix.\n')
  }

  # methParams
  if (any(names(list(...)) == 'nCompMax'))
    ###???
  {
    stop(
      'nCompMax` is deprecated. Use customParams() and the methParam argument in MUVR instead.'
    )
  }

  if (missing(methParam)) {
    methParam <- customParams(method = method)
  }

  ###This line is moved from below,to make sure missing(methPara) is prior than other command that involves methParam
  ###########################
  #See the one hot encoding package I added in my branch yan 20210719
  #This requires that X is a data frame of all variables correctly categorized as numeric, factor, character and logical
  ###The X data put in MUVR should be numeric, if not, onehotencoding is used

  ###When the data is data frame, oneHotencoding transformthe dataframe to matrix
  ##when the data is a matrix, it does not need onehotencoding anyway becase the output is the same amtrix
  if (methParam$oneHot == TRUE) {
    if (any(class(X) %in% c("data.frame"))) {
      X <- onehotencoding(X)
      message("X is transformed to a matrix by onehotencoding.", "\n")
    }
  }
  ##now the X here is the new X after onehot encoding

  #by doing so.We do not need methParam$oneHot

  ###############
  # Remove nearZeroVariance variables for PLS
  if (methParam$NZV) {
    nzv <- MUVR2::nearZeroVar(X) # Function borrowed from mixOmics
    if (length(nzv$Position) > 0) {
      modelReturn$nzv <- colnames(X)[nzv$Position]
      X <- X[, -nzv$Position]
      message(
        '\n',
        length(nzv$Position),
        'variables with near zero variance detected -> removed from X and stored under $nzv',
        "\n"
      )
      message("They are", rownames(nzv$Metrics), "\n")
    }
  }



  #--------------------------
  #TESSA ADDITIONS START
  #--------------------------

  # Sort out which are the "keep" variables (after one-hot and NZV)
  #with the current solution it works if one hot encoded original variables are not in dataset column names anymore but there is a pattern

  if (!missing(keep))  {
    keeps <- vector()
    for (k in 1:length(keep)) {
      if (keep[k] %in% colnames(X))
        ###this X is the X after onehot encoding or not
      {
        len <-
          length(keeps)                            ##If there is a 3 level factor variable at 2nd place of the data then keeps[5] needs to be keep[3]
        keeps[len + 1] <- keep[k]
      } else if (any(grep(pattern = paste(keep[k], "_level", sep = ""), colnames(X))))
      {
        nlevel <-
          length(grep(pattern = paste(keep[k], "_level", sep = ""), colnames(X)))
        len <- length(keeps)
        for (nl in 1:nlevel)
        {
          keeps[len + nl] <-
            colnames(X)[grep(pattern = paste(keep[k], "_", sep = ""), colnames(X))][nl]
        }
      }
      else {
        if (exists("nzv"))
        {
          if (!(keep[k] %in% rownames(nzv$Metrics))) {
            stop("\n",
                 keep[k],
                 'variable in the NearZeroVariance Variables',
                 "\n",
                 "\n")
          }
        }
      }
    }
    nkeep <- length(keeps)
    message("\n", nkeep, "variables are kept manually.", "\n")
    message("They are", keeps, "\n")
  }

  #--------------------------
  #TESSA ADDITIONS END
  #--------------------------




  ####
  # Sort out which are the "keep" variables (after one-hot and NZV)
  #####
  # Number of samples and variables
  nSamp <- nrow(X)
  nVar <- nVar0 <- ncol(X)

  # Sample identifiers; Assume independence if ID is not specified
  if (missing(ID)) {
    warning('\nMissing ID -> Assume all unique (i.e. sample independence)')
    ID <- 1:nSamp
  }

  # Figure out number of iterations in inner CV loop
  # Gets tweaked for "keeps"
  # lower limit is max(c(2, keeps))
  #
  var <- numeric()   ##var is the vector of variable number in keeps
  cnt <- 0

  if (exists("nkeep")) {
    if (nkeep > 0) {
      while (nVar > nkeep & nVar > 1) {
        cnt <- cnt + 1
        var <- c(var, nVar)
        nVar <-
          max(floor(varRatio * nVar), nkeep)   ##it is possible that in the end nVar<nkeep
      }


      #####Could it be remove? actually I think it could be simplified to
      ###if (nVar==nkeep){var <- c(var, nVar)}

      if (!any(var == nkeep) & nVar > 1) {
        cnt <- cnt + 1
        nVar2 <- ifelse(nVar >= nkeep, nVar, nkeep)
        nVar <- nVar2
        var <- c(var, nVar)    ##add next nVar or nkeep into it
      }
    }
  } else {
    while (nVar > 1) {
      ##limited it to 2
      cnt <- cnt + 1
      var <- c(var, nVar)
      nVar <- floor(varRatio * nVar)
    }
  }



  # Number of inner segments
  if (missing(nInner)) {
    nInner <- nOuter - 1
  } # Default value for inner segments

  ##Set up ANN packages
  if (method == 'ANN') {
    max_x <- apply(X, 2, max)
    min_x <- apply(X, 2, min)
    X <- scale(X,
               center = min_x,
               scale = max_x - min_x)

    if (methParam$annMethod == 'neuralnet') {
      ##library("neuralnet")
      ##library("NeuralNetTools")
    } else if (methParam$annMethod == 'nnet') {
      ##library("NeuralNetTools")
      ##library("nnet")
    } else{
      stop('Aritificial network method incorrectly specified in methParam')
    }
  }


  if (method == 'SVM') {
    X <- scale(X,
               center = TRUE,
               scale = TRUE)
    if (methParam$svmMethod == 'svm') {
      ##library(e1071)
      ##library(rminer)
    } else if (methParam$svmMethod == 'ksvm') {
      ##library(kernlab)
      ##library(rminer)
    } else if (methParam$svmMethod == 'svmlight') {
      ####library()
    }
    else {
      stop('Support vector machine method incorrectly specified in methParam')
    }
  }

  # Set up randomForest package
  if (method == 'RF') {
    if (methParam$rfMethod == 'randomForest') {
      ##library (randomForest)
    } else if (methParam$rfMethod == 'ranger') {
      ##library (ranger)
      # } else if (methParam$rfMethod == 'Rborist') {
      # ##library(Rborist)
    } else{
      stop('Random Forest method incorrectly specified in methParam')
    }
  }

  # Set up for multilevel analysis
  # When ML is true,
  if (ML) {
    X <- rbind(X, -X)
    if (missing(Y)) {
      Y <- rep(-1, nSamp)
    }
    Y <- c(Y, -Y)
    nSamp <- 2 * nSamp
    ID <- c(ID, ID)
    DA <- FALSE
    fitness <- 'MISS'
    message('\nMultilevel -> Regression on (-1, 1) & fitness = "MISS"')
  }
  if(dim(X)[1]!=length(Y)){
    stop("The X and Y should have same number of observations")
  }

  # No missingness allowed - Please perform imputations before running MUVR
  if (any(is.na(X)) |
      any(is.na(Y))) {
    stop('No missing values allowed in X or Y data.')
  }
  if (!is.null(dim(Y))) {
    stop('Y is not a vector.')
  }

  # DA / Classification
  if (is.character(Y)) {
    Y <- factor(Y)
  }
  if (is.factor(Y)) {
    message('\nY is factor -> Classification (',
        length(unique(Y)),
        ' classes)',
        sep = '')
    DA <- TRUE
  }
  if (is.numeric(Y) & DA) {
    Y <- as.factor(Y)
    message('\nDA = TRUE -> Y as factor -> Classification (',
        length(unique(Y)),
        ' classes)',
        sep = '')
  }

  # Check fitness criterion
  if (missing(fitness)) {
    if (DA) {
      fitness <- 'BER'
      message('\nMissing fitness -> BER')
    } else {
      # I.e. for regression
      fitness <- 'RMSEP'
      message('\nMissing fitness -> RMSEP')
    }
  }



  # Additional sanity check
  if (nrow(X) != length(Y)) {
    stop('Must have same nSamp in X and Y.')
  }







  ###########################################################
  # Store indata in list for later model return
  InData <- list(
    X = X,
    Y = Y,
    ID = ID,
    scale = scale,
    nRep = nRep,
    nOuter = nOuter,
    nInner = nInner,
    varRatio = varRatio,
    DA = DA,
    fitness = fitness,
    method = method,
    methParam = methParam,
    ML = ML,
    parallel = parallel
  )

  # Sort sampling based on ID and not index & Allocate prediction
  unik <- !duplicated(ID)  # boolean of unique IDs
  unikID <- ID[unik]       # Actual unique IDs
  if (DA) {
    if (nOuter > min(table(Y))) {
      warning(
        '\nnOuter is larger than your smallest group size. Consider lowering your nOuter to min(table(Y)).',
        call. = TRUE
      )
    }
    ###not all nOuter groups contains each Y level in this case
    unikY <-
      Y[unik]  # Counterintuitive, but needed for groupings by Ynames
    Ynames <- sort(unique(Y))  # Find groups
    groups <- length(Ynames) # Number of groups
    groupID <- list()  # Allocate list for indices of groups
    for (g in 1:groups) {
      groupID[[g]] <- unikID[unikY == Ynames[g]]  # Find indices per group
    }
    # Allocate final predictions for min mid and max models
    yPredMin <-
      yPredMid <-
      yPredMax <-
      array(
        dim = c(length(Y), # Rows = number of observations
                length(levels(Y)), # Columns = number of classes in Y
                nRep),
        # 3rd = number of repetitions
        dimnames = list(ID, # Observation ID names
                        levels(Y), # Names of levels in Y
                        paste('Rep', 1:nRep, sep = ''))
      ) # Repetitions
    # Allocate predictions per repetition
    yPredMinR <-
      yPredMidR <-
      yPredMaxR <-
      matrix(
        nrow = length(Y),
        # Like above but lacking 3rd dimension (repetitions)
        ncol = length(levels(Y)),
        dimnames = list(ID, levels(Y))
      )
  } else {
    # I.e. for regression and ML
    # Allocate final predictions for min mid and max models
    if (DA == FALSE | method == "SVM") {
      yPredMin <-
        yPredMid <-
        yPredMax <-
        matrix(
          nrow = length(Y),
          # Like above but matrix instead of array, since there are not multiple classes
          ncol = nRep,
          dimnames = list(ID, paste('Rep', 1:nRep, sep = ''))
        )
      # Allocate predictions per repetition
      yPredMinR <-
        yPredMidR <-
        yPredMaxR <-
        numeric(length(Y)) # Like above but lacking columns (repetitions) -> numeric vector
    }
  }

  # Allocate response vectors and matrices for var's, nComp and VIP ranks over repetitions
  missRep <- numeric(nRep)
  names(missRep) <- paste(rep('rep', nRep), 1:nRep, sep = '')
  varRepMin <-
    varRepMid <-
    varRepMax <-
    nCompRepMin <-
    nCompRepMid <-
    nCompRepMax <- missRep
  nCompSegMin <-
    nCompSegMid <-
    nCompSegMax <- matrix(nrow = nRep,
                          ncol = nOuter,
                          dimnames = list(
                            paste('repetition', 1:nRep, sep = ''),
                            paste('segment', 1:nOuter, sep =
                                    '')
                          ))
  VIRankRepMin <-
    VIRankRepMid <-
    VIRankRepMax <- matrix(
      data = nVar0,
      nrow = nVar0,
      ncol = nRep,
      dimnames = list(colnames(X),
                      paste(rep('rep', nRep), 1:nRep, sep = ''))
    )

  # Allocate array for validation results
  VAL <- array(dim = c(nOuter, cnt, nRep),
               dimnames = list(
                 paste('outSeg', 1:nOuter, paste = ''),
                 var,
                 paste(rep('rep', nRep), 1:nRep, sep = '')
               ))

  # Choose package/core algorithm according to chosen method
  # And prepare for exporting them in 'foreach' (below)
  packs <- c('pROC')
  if (method == 'RF') {
    packs <- c(packs, methParam$rfMethod)
  }
  exports <- 'vectSamp'


  ####################################################
  ## Start repetitions
  ####################################################

  # For manual debugging purposes
  #reps <- list()

  #for (r in 1:nRep) {

  reps <- foreach(r = 1:nRep,
                  .packages = packs,
                  .export = exports) %doVersion% {
                    # For manual debugging purposes
                    #r <- 1
                    #r <- r + 1

                    # Send intermediate outputs to log file: Not a pretty output. For debugging purposes only.
                    if (logg) {
                      sink('log.txt', append = TRUE)
                    }

                    # Intermediate info output
                    message('\n', '   Repetition ', r, ' of ', nRep, ':', sep = '', appendLF = FALSE)

                    # Allocate output for models (i.e. for later prediction of external samples)
                    if (modReturn) {
                      outMod <- list()
                    }

                    # PERFORM SEGMENTATION INTO OUTER SEGMENTS
                    if (DA & identical(unikID, ID)) {
                      groupTest <- list()  ## Allocate list for samples within group
                      for (gT in 1:groups) {
                        groupTest[[gT]] <-
                          vectSamp(groupID[[gT]], n = nOuter)  # Draw random samples within group
                      }
                      allTest <-
                        groupTest[[1]] # Add 1st groups to 'Master' sample of all groups
                      for (gT in 2:groups) {
                        # Add subsequent groups
                        allTest <-
                          allTest[order(sapply(allTest, length))]
                        for (aT in 1:nOuter) {
                          allTest[[aT]] <- sort(c(allTest[[aT]], groupTest[[gT]][[aT]]))
                        }
                      }
                    } else {
                      allTest <- vectSamp(unikID, n = nOuter)
                    }

                    # Allocate intermediate output objects
                    nCompOutMax <- numeric(nOuter)
                    names(nCompOutMax) <-
                      paste(rep('outSeg', nOuter), 1:nOuter, sep = '')
                    varOutMin <-
                      varOutMid <-
                      varOutMax <-
                      nCompOutMin <-
                      nCompOutMid <- nCompOutMax
                    VIRankOutMin <-
                      VIRankOutMid <-
                      VIRankOutMax <- matrix(
                        data = nVar0,
                        nrow = nVar0,
                        ncol = nOuter,
                        dimnames = list(colnames(X),
                                        paste(rep(
                                          'outSeg', nOuter
                                        ), 1:nOuter, sep = ''))
                      )
                    VALRep <- matrix(nrow = nOuter,
                                     ncol = cnt)

                    # Perform outer loop segments -> one "majority vote" MV model per segment
                    for (i in 1:nOuter) {
                      # For manual debugging purposes
                      # i <- 1
                      # i <- i + 1

                      # Intermediate info output
                      message('\n Segment ', i, ' (variables):', sep = '', appendLF = FALSE) # Counter

                      # Draw out test set
                      testID <-
                        allTest[[i]] # Draw out segment = holdout set BASED ON UNIQUE ID
                      testIndex <-
                        ID %in% testID # Boolean for samples corresponding to unique test IDs
                      xTest <- X[testIndex, ]
                      yTest <- Y[testIndex]

                      # Inner data (not in test)
                      inID <-
                        unikID[!unikID %in% testID]  # IDs not in test set


                      if (DA &
                          identical(unikID, ID))
                      {
                        inY <- unikY[!unikID %in% testID]
                      }  # Counterintuitive, but needed for grouping by Ynames

                      # Allocate fitness variables for the inner data
                      missIn <-
                        berIn <-
                        aucIn <-
                        rmsepIn <-
                        PRESSIn <-
                        nCompIn <- matrix(
                          nrow = nInner,
                          ncol = cnt,
                          dimnames = list(paste(
                            rep('inSeg', nInner), 1:nInner, sep = ''
                          ),
                          var)
                        )
                      # Allocate VIRank variable for the inner data
                      VIRankInner <- array(
                        data = nVar0,
                        dim = c(nVar0, cnt, nInner),
                        dimnames = list(colnames(X),
                                        var,
                                        paste(
                                          rep('inSeg', nInner), 1:nInner, sep = ''
                                        ))
                      )
                      # Set initial inclusion list of variables to all variables
                      incVar <- colnames(X)

                      # Perform steps with successively fewer variables
                      for (count in 1:cnt) {
                        # Build models with successively fewer variables.

                        # For manual debugging purposes
                        # count <- 1 # for testing
                        # count <- count + 1

                        # Extract the number of variables at the present count (according to the count loop before the foreach loop)
                        nVar <- var[count]

                        # Intermediate info output
                        message(nVar, appendLF = FALSE)
                        ###############################################################################################################################################################################
                        # Tweak method parameters for low number of variables
                        if (method == 'PLS')
                        {
                          comp <- min(c(nVar, methParam$compMax))
                        } # nComp cannot be > nVar
                        if (method == 'RF') {
                          mtryIn <- ifelse(DA,
                                           min(c(
                                             methParam$mtryMaxIn, floor(sqrt(nVar))
                                           )), # Standard but with upper limit
                                           min(c(
                                             methParam$mtryMaxIn, floor(nVar / 3)
                                           ))) # Standard but with upper limit
                          mtryIn <- max(c(2, mtryIn)) # Lower limit
                        }
                        #if(method=="ANN"){
                        #  neuralIn<-min(c(methParam$neuralmaxIn,floor(nVar/2)))
                        #  neuralIn <- max(c(2, neuralIn))
                        #}
                        ###############################################################################################################################################################################
                        # PERFORM SEGMENTATION INTO INNER SEGMENTS
                        if (DA & identical(unikID, ID)) {
                          groupIDVal <- list()
                          for (g in 1:groups) {
                            groupIDVal[[g]] <- inID[inY == Ynames[g]]  # Find indices per group
                          }
                          groupVal <-
                            list()  ## Allocate list for samples within group
                          for (gV in 1:groups) {
                            groupVal[[gV]] <- vectSamp(groupIDVal[[gV]],
                                                       n = nInner)  # Draw random samples within group
                          }
                          allVal <-
                            groupVal[[1]] # Add 1st groups to 'Master' sample of all groups

                          for (gV in 2:groups) {
                            # Add subsequent groups
                            allVal <-
                              allVal[order(sapply(allVal, length))]
                            for (aV in 1:nInner) {
                              allVal[[aV]] <- sort(c(allVal[[aV]], groupVal[[gV]][[aV]]))
                            }
                          }
                        } else {
                          allVal <- vectSamp(inID, n = nInner)
                        }

                        # Perform inner CV loop
                        for (j in 1:nInner) {
                          # For manual debugging purposes
                          # j <- 1
                          # j <- j + 1

                          # Intermediate info output
                          message('.', appendLF = FALSE) # Counter

                          # Extract validation segment
                          valID <- allVal[[j]] # Extract IDs
                          valIndex <-
                            ID %in% valID # Corresponding observations
                          xVal <- X[valIndex, ]
                          xVal <-
                            subset(xVal, select = incVar) # limit to the current selection of variables (selection below)
                          yVal <- Y[valIndex]

                          # Extract training data
                          trainID <-
                            inID[!inID %in% valID] # The inner but not validation IDs
                          trainIndex <-
                            ID %in% trainID # Corresponding observations
                          xTrain <- X[trainIndex, ]
                          xTrain <-
                            subset(xTrain, select = incVar) # limit to the current selection of variables (selection below)
                          yTrain <- Y[trainIndex]

                          # For debugging
                          # sum(trainIndex,valIndex,testIndex)
                          # trainIndex|valIndex|testIndex

                          # Make inner model
                          if (method == 'PLS') {
                            inMod <- plsInner(
                              xTrain,
                              yTrain,
                              xVal,
                              yVal,
                              DA,
                              fitness,
                              comp,
                              scale = scale,
                              weigh_added = weigh_added,
                              weighing_matrix = weighing_matrix
                            )
                            nCompIn[j, count] <- inMod$nComp
                          } else if (method == 'RF') {
                            inMod <- rfInner(
                              xTrain,
                              yTrain,
                              xVal,
                              yVal,
                              DA,
                              fitness,
                              mtry = mtryIn,
                              ntree = methParam$ntreeIn,
                              method = methParam$rfMethod,
                              weigh_added = weigh_added,
                              weighing_matrix = weighing_matrix
                            )
                          } else if (method == "ANN") {
                            # inMod <- annInner(
                            #   xTrain,
                            #   yTrain,
                            #   xVal,
                            #   yVal,
                            #   DA = DA,
                            #   fitness = fitness,
                            #   method = methParam$annMethod,
                            #   #nodes=neuralIn,
                            #   nodes = methParam$nodes,
                            #   threshold = methParam$threshold,
                            #   stepmax = methParam$stepmax
                            # )
                          } else if (method == "SVM") {
                            # inMod <- svmInner(
                            #   xTrain,
                            #   yTrain,
                            #   xVal,
                            #   yVal,
                            #   DA = DA,
                            #   fitness = fitness,
                            #   method = methParam$svmMethod,
                            #   kernel = methParam$kernel
                            #
                            # )
                          } else {
                            stop('No other core modelling techniques available at present.')
                          }

                          # Store fitness metric
                          if (fitness == 'MISS') {
                            missIn[j, count] <- inMod$miss
                          } else if (fitness == 'wMISS') {
                            missIn[j, count] <- inMod$wmiss
                          } else if (fitness == 'BER') {
                            berIn[j, count] <- inMod$ber
                          } else if (fitness == 'wBER') {
                            berIn[j, count] <- inMod$wber
                          } else if (fitness == 'AUROC') {
                            aucIn[j, count] <- inMod$auc
                          } else {
                            ###for Q2
                            rmsepIn[j, count] <- inMod$rmsep
                            PRESSIn[j, count] <-
                              (inMod$rmsep ^ 2) * length(yVal) # Is this really correct???
                          }

                          # Store VIRanks
                          VIRankInner[match(names(inMod$virank),  ###tell it where to put in
                                            rownames(VIRankInner)),
                                      count,
                                      j] <-
                            inMod$virank  ###put it in

                        }   ###where each Inner ends

                        if (exists("nkeep")) {
                          if (nkeep > 0) {
                            VIRankInner[rownames(VIRankInner) %in% keeps, count, ] <-
                              0 ##0 is the highest number
                          }
                        }
                        # Average inner VIP ranks before variable elimination - Tweak for `keeps`
                        VIRankInAve <- apply(VIRankInner[, count,],
                                             1,
                                             mean, na.rm = TRUE)
                        ### This is the average of VIRank of Inner segmnet for each x variable and each cnt

                        # VIRankInAve[keeps] <- 0
                        if (count < cnt) {
                          incVar <-
                            names(VIRankInAve[order(VIRankInAve)])[1:var[count + 1]] # Extract the names of the variables kept for the next iteration
                        }
                      }   ###where each cnt ends

                      # PER OUTER SEGMENT:

                      # Store fitness curves and construct fitness rank curves for the different "counts"
                      if (fitness == 'AUROC') {
                        fitRank <- colMeans(-aucIn)
                        VALRep[i, ] <- colMeans(aucIn)
                      } else if (fitness == 'MISS' |
                                 fitness == 'wMISS') {
                        fitRank <- VALRep[i, ] <- colSums(missIn)
                      } else if (fitness == 'BER' |
                                 fitness == 'wBER') {
                        fitRank <- VALRep[i, ] <- colMeans(berIn)
                      } else {
                        fitRank <- colMeans(rmsepIn)
                        VALRep[i, ] <-
                          sqrt(colSums(PRESSIn) / sum(!testIndex))
                      }

                      # Rescale fitRank to range 0 (best) - 1 (worst)
                      fitRank <-
                        (fitRank - min(fitRank)) / abs(diff(range(fitRank)))

                      # BugCatch: If all VAL have NaN value -> reset all fitRank to 0
                      if (all(is.nan(fitRank)))
                      {
                        fitRank <- rep(0, cnt)
                      }

                      # Extract index of min and max models
                      #(remember that counts and nVar go in opposite direction!)
                      minIndex <-
                        max(which(fitRank <= methParam$robust))
                      maxIndex <-
                        min(which(fitRank <= methParam$robust))

                      # nVar at min, mid and max
                      varOutMin[i] <- var[minIndex]
                      varOutMax[i] <- var[maxIndex]
                      varOutMid[i] <-
                        round(exp(mean(log(c(
                          var[minIndex], var[maxIndex]
                        ))))) # Geometric mean of min and max. This one has decimals

                      # midIndex is closest to nVarMid, this makes it integrals
                      midIndex <- which.min(abs(var - varOutMid[i]))

                      # For PLS, extract number of components
                      if (method == 'PLS') {
                        nCompOutMin[i] <-
                          round(mean(nCompIn[, minIndex]))  ### under a cnt, the average number of component for inner
                        nCompOutMid[i] <-
                          round(mean(nCompIn[, midIndex]))
                        nCompOutMax[i] <-
                          round(mean(nCompIn[, maxIndex]))
                      }

                      # Average VIP ranks
                      VIRankOutMin[, i] <-
                        apply(VIRankInner[, minIndex, ], 1, mean, na.rm =
                                TRUE)
                      VIRankOutMid[, i] <-
                        apply(VIRankInner[, midIndex, ], 1, mean, na.rm =
                                TRUE)
                      VIRankOutMax[, i] <-
                        apply(VIRankInner[, maxIndex, ], 1, mean, na.rm =
                                TRUE)

                      # Build outer model for min, mid and max and predict YTEST
                      # Extract all inner data
                      xIn <-
                        X[!testIndex,] # Perform Validation on all samples except holdout set
                      yIn <- Y[!testIndex]

                      #######################################################################################
                      #####incVarMin is different for each i
                      # Determine consensus choice of included variables for min mid and max
                      incVarMin <-
                        rownames(VIRankOutMin)[rank(VIRankOutMin[, i]) <= varOutMin[i]]
                      incVarMid <-
                        rownames(VIRankOutMid)[rank(VIRankOutMid[, i]) <= varOutMid[i]]
                      incVarMax <-
                        rownames(VIRankOutMax)[rank(VIRankOutMax[, i]) <= varOutMax[i]]
                      ##################################################pls predict
                      # Consensus models for PLS
                      if (method == 'PLS') {
                        # Build min model
                        if (DA) {
                          plsOutMin <- plsda(
                            subset(xIn, select = incVarMin),
                            yIn,
                            ncomp = nCompOutMin[i],
                            near.zero.var = TRUE,
                            scale = scale
                          )
                        } else {
                          plsOutMin <- pls(
                            subset(xIn, select = incVarMin),
                            yIn,
                            ncomp = nCompOutMin[i],
                            near.zero.var = TRUE,
                            scale = scale
                          )
                        }
                        # Extract test data with correct variable selection
                        xTestMin <-
                          subset(xTest, select = incVarMin)
                        # Extract predictions
                        yPredMinR[testIndex] <-
                          predict(plsOutMin,
                                  newdata = xTestMin,
                                  scale = scale)$predict[, , nCompOutMin[i]]  #

                        # Build mid model
                        if (DA) {
                          plsOutMid <- plsda(
                            subset(xIn, select = incVarMid),
                            yIn,
                            ncomp = nCompOutMid[i],
                            near.zero.var = TRUE,
                            scale = scale
                          )
                        } else {
                          plsOutMid <- pls(
                            subset(xIn, select = incVarMid),
                            yIn,
                            ncomp = nCompOutMid[i],
                            near.zero.var = TRUE,
                            scale = scale
                          )
                        }
                        # Extract test data with correct variable selection
                        xTestMid <- subset(xTest, select = incVarMid)
                        # Extract predictions
                        yPredMidR[testIndex] <-
                          predict(plsOutMid,
                                  newdata = xTestMid,
                                  scale = scale)$predict[, , nCompOutMid[i]]  #

                        # Build max model
                        if (DA) {
                          plsOutMax <- plsda(
                            subset(xIn, select = incVarMax),
                            yIn,
                            ncomp = nCompOutMax[i],
                            near.zero.var = TRUE,
                            scale = scale
                          )
                        } else {
                          plsOutMax <- pls(
                            subset(xIn, select = incVarMax),
                            yIn,
                            ncomp = nCompOutMax[i],
                            near.zero.var = TRUE,
                            scale = scale
                          )
                        }

                        # Extract test data with correct variable selection
                        xTestMax <-
                          subset(xTest, select = incVarMax)
                        # Extract predictions
                        yPredMaxR[testIndex] <-
                          predict(plsOutMax,
                                  newdata = xTestMax,
                                  scale = scale)$predict[, , nCompOutMax[i]]  #

                        # Extract models for external predictions
                        if (modReturn) {
                          outMod[[i]] = list(plsOutMin,
                                             plsOutMid,
                                             plsOutMax)
                        }
                        #######################################################################rf predict
                      } else if (method == 'RF') {
                        # min model
                        rfOutMin <- rfPred(
                          xTrain = subset(xIn, select = incVarMin),
                          yTrain = yIn,
                          xTest = subset(xTest, select = incVarMin),
                          yTest = yTest,
                          DA = DA,
                          ntree = methParam$ntreeOut,
                          keep.forest = TRUE,
                          method = methParam$rfMethod
                        )
                        # min predictions
                        if (DA)
                        {
                          yPredMinR[testIndex,] <- rfOutMin$predicted
                        } else {
                          yPredMinR[testIndex] <- rfOutMin$predicted
                        }

                        # mid model
                        rfOutMid <- rfPred(
                          xTrain = subset(xIn, select = incVarMid),
                          yTrain = yIn,
                          xTest = subset(xTest, select = incVarMid),
                          yTest = yTest,
                          DA = DA,
                          ntree = methParam$ntreeOut,
                          keep.forest = TRUE,
                          method = methParam$rfMethod
                        )
                        # mid predictions
                        if (DA)
                        {
                          yPredMidR[testIndex,] <- rfOutMid$predicted
                        } else {
                          yPredMidR[testIndex] <- rfOutMid$predicted
                        }

                        # max model
                        rfOutMax <- rfPred(
                          xTrain = subset(xIn, select = incVarMax),
                          yTrain = yIn,
                          xTest = subset(xTest, select = incVarMax),
                          yTest = yTest,
                          DA = DA,
                          ntree = methParam$ntreeOut,
                          keep.forest = TRUE,
                          method = methParam$rfMethod
                        )
                        # max predictions
                        if (DA)
                        {
                          yPredMaxR[testIndex,] <- rfOutMax$predicted
                        } else {
                          yPredMaxR[testIndex] <- rfOutMax$predicted
                        }

                        # Extract models for external predictions
                        if (modReturn) {
                          outMod[[i]] <- list(
                            rfOutMin = rfOutMin$model,
                            rfOutMid = rfOutMid$model,
                            rfOutMax = rfOutMax$model
                          )
                        }
                        ###################################################################################Ann predict
                      } else if (method == "ANN") {
                        #if(DA==FALSE){
                        # annOutMin <- annpredict(
                        #   xTrain = subset(xIn, select = incVarMin),
                        #   yTrain = yIn,
                        #   xTest = subset(xTest, select = incVarMin),
                        #   yTest = yTest,
                        #   DA = DA,
                        #   nodes = methParam$nodes,
                        #   #nodes=min(c(methParam$neuralmaxIn,floor(sqrt(length(incVarMin))))),
                        #   threshold = methParam$threshold,
                        #   stepmax = methParam$stepmax,
                        #   method = methParam$annMethod
                        #
                        # )
                        # if (DA)
                        # {
                        #   yPredMinR[testIndex,] <- annOutMin$predicted
                        # } else {
                        #   yPredMinR[testIndex] <- annOutMin$predicted
                        # }

                        # annOutMid <- annpredict(
                        #   xTrain = subset(xIn, select = incVarMid),
                        #   yTrain = yIn,
                        #   xTest = subset(xTest, select = incVarMid),
                        #   yTest = yTest,
                        #   DA = DA,
                        #   nodes = methParam$nodes,
                        #   #nodes=min(c(methParam$neuralmaxIn,floor(sqrt(length(incVarMid))))),
                        #   threshold = methParam$threshold,
                        #   stepmax = methParam$stepmax,
                        #   method = methParam$annMethod
                        #
                        # )

                        # if (DA)
                        # {
                        #   yPredMidR[testIndex,] <- annOutMid$predicted
                        # } else {
                        #   yPredMidR[testIndex] <- annOutMid$predicted
                        # }

                        # annOutMax <- annpredict(
                        #   xTrain = subset(xIn, select = incVarMax),
                        #   yTrain = yIn,
                        #   xTest = subset(xTest, select = incVarMax),
                        #   yTest = yTest,
                        #   DA = DA,
                        #   nodes = methParam$nodes,
                        #   #nodes=min(c(methParam$nodes,floor(sqrt(length(incVarMax))))),
                        #   threshold = methParam$threshold,
                        #   stepmax = methParam$stepmax,
                        #   method = methParam$annMethod
                        #
                        # )
                        # if (DA)
                        # {
                        #   yPredMaxR[testIndex,] <- annOutMax$predicted
                        # } else {
                        #   yPredMaxR[testIndex] <- annOutMax$predicted
                        # }
                        # if (modReturn) {
                        #   outMod[[i]] <- list(
                        #     annOutMin = annOutMin$model,
                        #     annOutMid = annOutMid$model,
                        #     annOutMax = annOutMax$model
                        #   )
                        # }
                        # #  }





                      } else if (method == "SVM") {
                        #if(DA==FALSE){
                        # svmOutMin <- svmpredict(
                        #   xTrain = subset(xIn, select = incVarMin),
                        #   yTrain = yIn,
                        #   xTest = subset(xTest, select = incVarMin),
                        #   yTest = yTest,
                        #   DA = DA,
                        #   method = methParam$svmMethod,
                        #   kernel = methParam$kernel
                        #   #nu=methParam$nu,
                        #   #degree=methParam$degree,
                        #   #gamma=methParam$gamma
                        #
                        # )
                        # if (DA)
                        # {
                        #   yPredMinR[testIndex] <- svmOutMin$predicted
                        #   #yPredMinR[testIndex, ] <- svmOutMin$predicted
                        # } else {
                        #   yPredMinR[testIndex] <- svmOutMin$predicted
                        # }

                        # svmOutMid <- svmpredict(
                        #   xTrain = subset(xIn, select = incVarMid),
                        #   yTrain = yIn,
                        #   xTest = subset(xTest, select = incVarMid),
                        #   yTest = yTest,
                        #   DA = DA,
                        #   method = methParam$svmMethod,
                        #   kernel = methParam$kernel
                        #   #nu=methParam$nu,
                        #   #degree=methParam$degree,
                        #   #gamma=methParam$gamma
                        #
                        # )

                        # if (DA)
                        # {
                        #   yPredMidR[testIndex] <- svmOutMid$predicted
                        #   #yPredMidR[testIndex, ] <- svmOutMid$predicted
                        # } else {
                        #   yPredMidR[testIndex] <- svmOutMid$predicted
                        # }

                        # svmOutMax <- svmpredict(
                        #   xTrain = subset(xIn, select = incVarMax),
                        #   yTrain = yIn,
                        #   xTest = subset(xTest, select = incVarMax),
                        #   yTest = yTest,
                        #   DA = DA,
                        #   method = methParam$svmMethod,
                        #   kernel = methParam$kernel
                        #   #nu=methParam$nu,
                        #   #degree=methParam$degree,
                        #   #gamma=methParam$gamma
                        #
                        # )
                        # if (DA)
                        # {
                        #   yPredMaxR[testIndex] <- svmOutMax$predicted
                        #   #yPredMaxR[testIndex, ] <- svmOutMax$predicted
                        # } else {
                        #   yPredMaxR[testIndex] <- svmOutMax$predicted
                        # }
                        # if (modReturn) {
                        #   outMod[[i]] <- list(
                        #     svmOutMin = svmOutMin$model,
                        #     svmOutMid = svmOutMid$model,
                        #     svmOutMax = svmOutMax$model
                        #   )
                        # }


                      } else{
                        stop('Other core models not implemented')
                      }
                      ###################################################################################################################
                      ######################################################################################################
                      # Needs to be expanded for other cores (SVM; ANN)
                    }


                    ###where the outer loop ends

                    # PER REPETITION:
                    # Organize output

                    # Individual predictions
                    ##################################################################################################
                    #############################################################################################
                    ##It  seems that yPredMinR should be different for each i(nOuter). However here it use the last yPredMinR in the iteration
                    ##Maybe this is because you used foreach instead of each?
                    parReturn <- list(yPredMin = yPredMinR,
                                      yPredMid = yPredMidR,
                                      yPredMax = yPredMaxR)

                    # VI ranks
                    parReturn$VIRankRepMin <- rowMeans(VIRankOutMin)
                    parReturn$VIRankRepMid <- rowMeans(VIRankOutMid)
                    parReturn$VIRankRepMax <- rowMeans(VIRankOutMax)

                    # PLS components
                    if (method == 'PLS') {
                      parReturn$nCompRepMin <-
                        round(mean(nCompOutMin)) # Averaged over the nOuter segments
                      parReturn$nCompRepMid <-
                        round(mean(nCompOutMid))
                      parReturn$nCompRepMax <-
                        round(mean(nCompOutMax))
                      parReturn$nCompSegMin <-
                        nCompOutMin # For the individual segments
                      parReturn$nCompSegMid <- nCompOutMid
                      parReturn$nCompSegMax <- nCompOutMax
                    }

                    # Validation curves
                    parReturn$VAL <- VALRep

                    # Recalculate fitness per repetition
                    fitRankRep <- colSums(VALRep)
                    if (fitness == 'AUROC')
                    {
                      fitRankRep <- -fitRankRep
                    } # AUROC fitness (higher is better) goes in opposite direction of all other metrics (lower is better)
                    fitRankRep <-
                      (fitRankRep - min(fitRankRep)) / abs(diff(range(fitRankRep)))
                    # Rescale fitness curve to scale from 0 (best) to 1 (worst)

                    if (all(is.nan(fitRankRep)))
                    {
                      fitRankRep <- rep(0, cnt)
                    } # If all VAL have same value -> reset all fitRankRep to 0

                    # Recalculate min/mid/max nVar per repetition from fitness
                    minIndex <-
                      max(which(fitRankRep <= methParam$robust))
                    maxIndex <-
                      min(which(fitRankRep <= methParam$robust))
                    parReturn$varRepMin <- var[minIndex]
                    parReturn$varRepMid <-
                      round(exp(mean(log(c(
                        var[minIndex], var[maxIndex]
                      ))))) # Geometric mean of min and max
                    parReturn$varRepMax <- var[maxIndex]

                    # Return underlying models
                    if (modReturn)
                    {
                      parReturn$outModel <- outMod
                    }

                    # Stop log
                    if (logg) {
                      sink()
                    }

                    # Return repetition results
                    return(parReturn)

                    # For manual debugging purposes
                    ## reps[[r]] <- parReturn
                  }
  ###Where the repetition ends

  # Unpack the results from the repetitions
  if (modReturn)
  {
    outMods <- list()
  }
  for (r in 1:nRep) {
    # Individual predictions
    if (DA)
    {
      yPredMin[, , r] <- reps[[r]]$yPredMin
    }
    else
    {
      yPredMin[, r] <- reps[[r]]$yPredMin
    }
    if (DA)
    {
      yPredMid[, , r] <- reps[[r]]$yPredMid
    }
    else
    {
      yPredMid[, r] <- reps[[r]]$yPredMid
    }
    if (DA)
    {
      yPredMax[, , r] <- reps[[r]]$yPredMax
    }
    else
    {
      yPredMax[, r] <- reps[[r]]$yPredMax
    }
    # Number of variables
    varRepMin[r] <- reps[[r]]$varRepMin
    varRepMid[r] <- reps[[r]]$varRepMid
    varRepMax[r] <- reps[[r]]$varRepMax
    # VI Ranks
    VIRankRepMin[, r] <- reps[[r]]$VIRankRepMin
    VIRankRepMid[, r] <- reps[[r]]$VIRankRepMid
    VIRankRepMax[, r] <- reps[[r]]$VIRankRepMax
    # PLS components
    if (method == 'PLS') {
      nCompRepMin[r] <- reps[[r]]$nCompRepMin # Average per repetition
      nCompRepMid[r] <- reps[[r]]$nCompRepMid
      nCompRepMax[r] <- reps[[r]]$nCompRepMax
      nCompSegMin[r, ] <-
        reps[[r]]$nCompSegMin # For the individual nOuter segments
      nCompSegMid[r, ] <- reps[[r]]$nCompSegMid
      nCompSegMax[r, ] <- reps[[r]]$nCompSegMax
    }
    # Validation curves
    VAL[, , r] <- reps[[r]]$VAL
    # Models
    if (modReturn)
    {
      outMods <- c(outMods, reps[[r]]$outModel)
    }
  }

  # Average predictions
  if (DA) {
    yPred <- list()
    yPred[['min']] <- apply(yPredMin, c(1, 2), mean, na.rm = TRUE)
    yPred[['mid']] <- apply(yPredMid, c(1, 2), mean, na.rm = TRUE)
    yPred[['max']] <- apply(yPredMax, c(1, 2), mean, na.rm = TRUE)
  } else {
    yPred <-
      cbind(
        apply(yPredMin, 1, mean, na.rm = TRUE)[1:nrow(X)],
        # Is the [1:nrow(X)] really necessary?
        apply(yPredMid, 1, mean, na.rm = TRUE)[1:nrow(X)],
        apply(yPredMax, 1, mean, na.rm = TRUE)[1:nrow(X)]
      )
    colnames(yPred) <- c('min', 'mid', 'max')
    rownames(yPred) <- paste(1:nSamp, ID, sep = '_ID')
  }
  modelReturn$yPred <- yPred # Store averaged predictions
  modelReturn$yPredPerRep <-
    list(minModel = yPredMin,
         # And predictions per repetition
         midModel = yPredMid,
         maxModel = yPredMax)

  # Calculate classification-specific characteristics
  if (DA) {
    # Calculate AUCs per class
    auc <- matrix(
      nrow = 3,
      ncol = length(levels(Y)),
      dimnames = list(c('min', 'mid', 'max'), levels(Y))
    )
    for (cl in 1:length(levels(Y))) {
      auc[1, cl] <- roc(Y == (levels(Y)[cl]),
                        yPred[['min']][, cl],
                        quiet = TRUE)$auc
      auc[2, cl] <- roc(Y == (levels(Y)[cl]),
                        yPred[['mid']][, cl],
                        quiet = TRUE)$auc
      auc[3, cl] <- roc(Y == (levels(Y)[cl]),
                        yPred[['max']][, cl],
                        quiet = TRUE)$auc
    }
    # Classify predictions
    miss <- numeric(3)
    yClass <- data.frame(Y)
    for (mo in 1:3) {
      # mo for model
      classPred <-
        factor(apply(yPred[[mo]], 1, function(x)
          levels(Y)[which.max(x)]), levels = levels(Y))
      miss[mo] <- sum(classPred != Y)
      yClass[, mo] <- classPred
    }
    names(miss) <- colnames(yClass) <- c('min', 'mid', 'max')
    rownames(yClass) <- paste(1:nSamp, ID, sep = '_ID')
    ##########################################################################################################################################################################
    ####For ber predicted
    ber <- numeric(3)
    yClass <- data.frame(Y)
    for (mo in 1:3) {
      # mo for model
      berPred <-
        factor(apply(yPred[[mo]], 1, function(x)
          levels(Y)[which.max(x)]), levels = levels(Y))
      ber[mo] <- getBER(actual = Y,
                        predicted = berPred)
      yClass[, mo] <- berPred
    }
    names(ber) <- colnames(yClass) <- c('min', 'mid', 'max')
    rownames(yClass) <- paste(1:nSamp, ID, sep = '_ID')

    #############################################################################################################################################################################################333
    # Report
    modelReturn$yClass <- yClass
    modelReturn$miss <- miss
    modelReturn$auc <- auc
    modelReturn$ber <- ber


    if (weigh_added == TRUE) {
      wmiss <- numeric(3)
      yClass <- data.frame(Y)
      for (mo in 1:3) {
        # mo for model
        classPred <-
          factor(apply(yPred[[mo]], 1, function(x)
            levels(Y)[which.max(x)]), levels = levels(Y))
        wmiss[mo] <- getMISS(
          actual = Y,
          predicted = classPred,
          weigh_added = weigh_added,
          weighing_matrix = weighing_matrix
        )
        yClass[, mo] <- classPred
      }
      names(wmiss) <- colnames(yClass) <- c('min', 'mid', 'max')
      rownames(yClass) <- paste(1:nSamp, ID, sep = '_ID')

      modelReturn$wmiss <- wmiss

      wber <- numeric(3)
      yClass <- data.frame(Y)
      for (mo in 1:3) {
        # mo for model
        wberPred <-
          factor(apply(yPred[[mo]], 1, function(x)
            levels(Y)[which.max(x)]), levels = levels(Y))
        wber[mo] <- getBER(
          actual = Y,
          predicted = wberPred,
          weigh_added = weigh_added,
          weighing_matrix = weighing_matrix
        )
        yClass[, mo] <- wberPred
      }
      names(wber) <- colnames(yClass) <- c('min', 'mid', 'max')
      rownames(yClass) <- paste(1:nSamp, ID, sep = '_ID')
      modelReturn$wber <- wber
    }
  }


  # Calculate multilevel-specific characteristics
  if (ML) {
    modelReturn$yClass <-
      apply(yPred, 2, function(x)
        ifelse(x > 0, 1, -1))
    modelReturn$miss <-
      apply(modelReturn$yClass, 2, function(x)
        sum(x != Y))
    modelReturn$auc <- apply(yPred, 2, function(x)
      roc(Y, x)$auc)
    #####################################################################################################################
    modelReturn$ber <-
      apply(modelReturn$yClass, 2, function(x) {
        getBER(actual = Y, predicted = x)
      })                           ####newly added
    if (weigh_added == TRUE) {
      modelReturn$wmiss <-
        apply(modelReturn$yClass, 2, function(x) {
          getMISS(
            actual = Y,
            predicted = x,
            weigh_added = weigh_added,
            weighing_matrix = weighing_matrix
          )
        })

      modelReturn$wber <-
        apply(modelReturn$yClass, 2, function(x) {
          getBER(
            actual = Y,
            predicted = x,
            weigh_added = weigh_added,
            weighing_matrix = weighing_matrix
          )
        })

    }
    #####################################################################################################################
    colnames(modelReturn$yClass) <-
      names(modelReturn$miss) <-
      names(modelReturn$auc) <-
      names(modelReturn$ber) <- c('min', 'mid', 'max')
    if (weigh_added == TRUE) {
      names(modelReturn$wber) <- c('min', 'mid', 'max')
      names(modelReturn$wmiss) <- c('min', 'mid', 'max')
    }
    rownames(modelReturn$yClass) <- paste(1:nSamp, ID, sep = '_ID')
  }

  # Average VIP ranks over repetitions
  VIRank <- cbind(rowMeans(VIRankRepMin),
                  rowMeans(VIRankRepMid),
                  rowMeans(VIRankRepMax))
  ####################################################################
  ### adjust the sequence of rank
  if (exists("nkeep")) {
    if (nkeep > 0) {
      VIRank <- VIRank - length(keep)
      for (i in 1:length(keep)) {
        VIRank[keep[i], ] <- 0
      }
    }
    modelReturn$keep <- keep
  }
  ##################################################################3
  colnames(VIRank) <- c('min', 'mid', 'max')
  modelReturn$VIRank <- VIRank
  modelReturn$VIRankPerRep <- list(minModel = VIRankRepMin,
                                   midModel = VIRankRepMid,
                                   maxModel = VIRankRepMax)

  # Recalculate fitness averaged over all the repetitions
  fitRankAll <- apply(VAL, 2, mean, na.rm = TRUE)
  if (fitness == 'AUROC')
  {
    fitRankAll <-
      -fitRankAll
  } # AUROC fitness (higher is better) goes in opposite direction of all other metrics (lower is better)
  fitRankAll <-
    (fitRankAll - min(fitRankAll)) / abs(diff(range(fitRankAll))) # rescale from 0 (best) to 1 (worst)
  if (all(is.nan(fitRankAll)))
  {
    fitRankAll <-
      rep(0, cnt)
  } # If all VAL have same value -> reset all fitRankAll to 0

  # Calculate min/mid/max indices and nVar
  minIndex <- max(which(fitRankAll <= methParam$robust))
  maxIndex <- min(which(fitRankAll <= methParam$robust))
  nVar <- c(var[minIndex],
            round(exp(mean(log(
              c(var[minIndex], var[maxIndex])
            )))),
            var[maxIndex])
  names(nVar) <- c('min', 'mid', 'max')

  modelReturn$nVar <- nVar
  modelReturn$nVarPerRep <- list(minModel = varRepMin,
                                 midModel = varRepMid,
                                 maxModel = varRepMax)

  # PLS components
  if (method == 'PLS') {
    # Average nComp over repetitions
    nComp <- c(round(mean(nCompRepMin)),
               round(mean(nCompRepMid)),
               round(mean(nCompRepMax)))
    names(nComp) <- c('min', 'mid', 'max')
    modelReturn$nComp <- nComp
    modelReturn$nCompPerRep <- list(minModel = nCompRepMin,
                                    midModel = nCompRepMid,
                                    maxModel = nCompRepMax)
    modelReturn$nCompPerSeg <- list(minModel = nCompSegMin,
                                    midModel = nCompSegMid,
                                    maxModel = nCompSegMax)
  }

  # Store validation metric and curves
  modelReturn$VAL$metric <- fitness
  modelReturn$VAL$VAL <- VAL

  # Store segment models
  if (modReturn)
  {
    modelReturn$outModels <- outMods
  }

  # Store inData
  modelReturn$inData <- InData
  #############################################################################################################3
  ## Build overall "Fit-Predict" models for calculating R2 and visualizations
  incVarMin <- names(VIRank[rank(VIRank[, 1]) <= round(nVar[1]), 1])
  incVarMid <- names(VIRank[rank(VIRank[, 2]) <= round(nVar[2]), 2])
  incVarMax <- names(VIRank[rank(VIRank[, 3]) <= round(nVar[3]), 3])


  if (method == 'PLS') {
    ######################
    # PLS Min fit-predict
    ######################
    if (DA) {
      plsFitMin <- plsda(
        subset(X, select = incVarMin),
        Y,
        ncomp = round(nComp[1]),
        near.zero.var = TRUE,
        scale = scale
      )
    } else {
      plsFitMin <- pls(
        subset(X, select = incVarMin),
        Y,
        ncomp = round(nComp[1]),
        near.zero.var = TRUE,
        scale = scale
      )
    }
    # Exclude potential near zero variance variables
    if (length(plsFitMin$nzv$Position) > 0)
    {
      incVarMin <-
        incVarMin[!incVarMin %in% rownames(plsFitMin$nzv$Metrics)]
    }

    # Make Min predictions
    yFitMin <- predict(plsFitMin,
                       newdata = subset(X, select = incVarMin),
                       scale = scale)$predict[, , nComp[1]]  #
    ######################
    # PLS Mid fit-predict
    ######################
    if (DA) {
      plsFitMid <- plsda(
        subset(X, select = incVarMid),
        Y,
        ncomp = round(nComp[2]),
        near.zero.var = TRUE,
        scale = scale
      )
    } else {
      plsFitMid <- pls(
        subset(X, select = incVarMid),
        Y,
        ncomp = round(nComp[2]),
        near.zero.var = TRUE,
        scale = scale
      )
    }
    # Exclude potential near zero variance variables
    if (length(plsFitMid$nzv$Position) > 0)
    {
      incVarMid <-
        incVarMid[!incVarMid %in% rownames(plsFitMid$nzv$Metrics)]
    }
    # Make Mid predictions
    yFitMid <- predict(plsFitMid,
                       newdata = subset(X, select = incVarMid),
                       scale = scale)$predict[, , nComp[2]]  #
    ######################
    # PLS Max fit-predict
    ######################
    if (DA) {
      plsFitMax <- plsda(
        subset(X, select = incVarMax),
        Y,
        ncomp = round(nComp[3]),
        near.zero.var = TRUE,
        scale = scale
      )
    } else {
      plsFitMax <- pls(
        subset(X, select = incVarMax),
        Y,
        ncomp = round(nComp[3]),
        near.zero.var = TRUE,
        scale = scale
      )
    }
    # Exclude potential near zero variance variables
    if (length(plsFitMax$nzv$Position) > 0)
    {
      incVarMax <-
        incVarMax[!incVarMax %in% rownames(plsFitMax$nzv$Metrics)]
    }
    # Make Max predictions
    yFitMax <- predict(plsFitMax,
                       newdata = subset(X, select = incVarMax),
                       scale = scale)$predict[, , nComp[3]]  #

    # Combine fit-predictions
    yFit <- cbind(yFitMin, yFitMid, yFitMax)
    yRep <- ncol(yFit) / 3
    colnames(yFit) <- rep(c('min', 'mid', 'max'), each = yRep)
    rownames(yFit) <- ID
    # Report fit-predict
    modelReturn$Fit <- list(
      yFit = yFit,
      plsFitMin = plsFitMin,
      plsFitMid = plsFitMid,
      plsFitMax = plsFitMax
    )
  } else if (method == 'RF') {
    ######################
    # RF Min fit-predict
    ######################
    rfFitMin <-
      suppressWarnings(rfPred(
        xTrain = subset(X, select = incVarMin),
        yTrain = Y,
        DA = DA,
        method = methParam$rfMethod
      ))
    yFitMin <- rfFitMin$fit
    ######################
    # RF Mid fit-predict
    ######################
    rfFitMid <-
      suppressWarnings(rfPred(
        xTrain = subset(X, select = incVarMid),
        yTrain = Y,
        DA = DA,
        method = methParam$rfMethod
      ))
    yFitMid <- rfFitMid$fit
    ######################
    # RF Max fit-predict
    ######################
    rfFitMax <-
      suppressWarnings(rfPred(
        xTrain = subset(X, select = incVarMax),
        yTrain = Y,
        DA = DA,
        method = methParam$rfMethod
      ))
    yFitMax <- rfFitMax$fit

    # Combine fit-predictions
    yFit <- cbind(yFitMin, yFitMid, yFitMax)
    yRep <- ncol(yFit) / 3
    colnames(yFit) <- rep(c('min', 'mid', 'max'), each = yRep)
    rownames(yFit) <- ID
    modelReturn$Fit <- list(
      yFit = yFit,
      rfFitMin = rfFitMin,
      rfFitMid = rfFitMid,
      rfFitMax = rfFitMax
    )
  } else if (method == "ANN") {

    # annFitMin <-
    #   suppressWarnings(
    #     annpredict(
    #       xTrain = subset(X, select = incVarMin),
    #       yTrain = Y,
    #       DA = DA,
    #       nodes = methParam$nodes,
    #       threshold = methParam$threshold,
    #       stepmax = methParam$stepmax,
    #       method = methParam$annMethod
    #     )
    #   )
    # yFitMin <- annFitMin$fit
    #
    # annFitMid <-
    #   suppressWarnings(
    #     annpredict(
    #       xTrain = subset(X, select = incVarMid),
    #       yTrain = Y,
    #       DA = DA,
    #       nodes = methParam$nodes,
    #       threshold = methParam$threshold,
    #       stepmax = methParam$stepmax,
    #       method = methParam$annMethod
    #     )
    #   )
    # yFitMid <- annFitMid$fit
    #
    # annFitMax <-
    #   suppressWarnings(
    #     annpredict(
    #       xTrain = subset(X, select = incVarMax),
    #       yTrain = Y,
    #       DA = DA,
    #       nodes = methParam$nodes,
    #       threshold = methParam$threshold,
    #       stepmax = methParam$stepmax,
    #       method = methParam$annMethod
    #     )
    #   )
    # yFitMax <- annFitMax$fit
    #
    # yFit <- cbind(yFitMin, yFitMid, yFitMax)
    # yRep <- ncol(yFit) / 3
    # colnames(yFit) <- rep(c('min', 'mid', 'max'), each = yRep)
    # rownames(yFit) <- ID
    # modelReturn$Fit <- list(
    #   yFit = yFit,
    #   annFitMin = annFitMin,
    #   annFitMid = annFitMid,
    #   annFitMax = annFitMax
    # )

  } else if (method == "SVM") {
    # svmFitMin <-
    #   suppressWarnings(
    #     svmpredict(
    #       xTrain = subset(X, select = incVarMin),
    #       yTrain = Y,
    #       DA = DA,
    #       method = methParam$svmMethod,
    #       kernel = methParam$kernel,
    #       #nu=methParam$nu,
    #       #degree=methParam$degree,
    #       #gamma=methParam$gamma
    #     )
    #   )
    # yFitMin <- svmFitMin$fit
    #
    # svmFitMid <-
    #   suppressWarnings(
    #     svmpredict(
    #       xTrain = subset(X, select = incVarMid),
    #       yTrain = Y,
    #       DA = DA,
    #       method = methParam$svmMethod,
    #       kernel = methParam$kernel,
    #       #nu=methParam$nu,
    #       #degree=methParam$degree,
    #       #gamma=methParam$gamma
    #     )
    #   )
    # yFitMid <- svmFitMid$fit
    #
    # svmFitMax <-
    #   suppressWarnings(
    #     svmpredict(
    #       xTrain = subset(X, select = incVarMax),
    #       yTrain = Y,
    #       DA = DA,
    #       method = methParam$svmMethod,
    #       kernel = methParam$kernel,
    #       #nu=methParam$nu,
    #       #degree=methParam$degree,
    #       #gamma=methParam$gamma
    #     )
    #   )
    # yFitMax <- svmFitMax$fit
    #
    # yFit <- cbind(yFitMin, yFitMid, yFitMax)
    # yRep <- ncol(yFit) / 3
    # colnames(yFit) <- rep(c('min', 'mid', 'max'), each = yRep)
    # rownames(yFit) <- ID
    # modelReturn$Fit <- list(
    #   yFit = yFit,
    #   svmFitMin = svmFitMin,
    #   svmFitMid = svmFitMid,
    #   svmFitMax = svmFitMax
    # )
  }
  else{
    stop ('Other ML methods not implemented')
  }

  # Calculate fit statistics
  if (!DA) {
    if (!is.null(weighing_matrix)) {
      warning("This is not classification. Weighing matrix is ignored")
    }
    TSS <- sum((Y - mean(Y)) ^ 2)
    RSSMin <- sum((Y - yFitMin) ^ 2)
    RSSMid <- sum((Y - yFitMid) ^ 2)
    RSSMax <- sum((Y - yFitMax) ^ 2)
    PRESSMin <- sum((Y - yPred[, 1]) ^ 2)
    PRESSMid <- sum((Y - yPred[, 2]) ^ 2)
    PRESSMax <- sum((Y - yPred[, 3]) ^ 2)
    R2 <- c(1 - (RSSMin / TSS),
            1 - (RSSMid / TSS),
            1 - (RSSMax / TSS))
    Q2 <- c(1 - (PRESSMin / TSS),
            1 - (PRESSMid / TSS),
            1 - (PRESSMax / TSS))
    names(R2) <- names(Q2) <- c('min', 'mid', 'max')
    # Report
    modelReturn$fitMetric <- list(R2 = R2,
                                  Q2 = Q2)
  } else {
    ##This is where the weighting matrix that comes in
    ###########################################################################################################
    #  if(!missing(weighing_matrix))
    #  {warning("Weighing matrix is added (overwrite weigh_added command)")
    #    weigh_added<-TRUE
    #    }

    if (weigh_added == FALSE) {
      modelReturn$fitMetric <- list(CR = 1 - (miss / length(Y)))
    } else{
      if (is.null(weighing_matrix)) {
        warning("Missing weighing_matrix,weighing_matrix will be diagnoal")
        weighing_matrix <- diag(1, length(levels(Y)), length(levels(Y)))
      }
      if (dim(weighing_matrix)[1] != length(levels(Y))) {
        stop("The dimension of weighing_matrix is not correct")
      }
      if (dim(weighing_matrix)[2] != length(levels(Y))) {
        stop("The dimension of weighing_matrix is not correct")
      }
      for (i in 1:nrow(weighing_matrix)) {
        if (weighing_matrix[i, i] != 1) {
          stop("diagonal values must be 1")
        }
        for (j in 1:ncol(weighing_matrix)) {
          if (weighing_matrix[i, j] < 0 | weighing_matrix[i, j] > 1) {
            stop("Values in the weighing matrix must between 0 and 1")
          }
        }
      }

      confusion_matrix_list <- list()
      scoring_matrix_list <- list()
      confusion_matrix_list$min <-
        table(actual = Y,
              predicted = t(modelReturn$yClass[, "min"]))
      confusion_matrix_list$mid <-
        table(actual = Y,
              predicted = t(modelReturn$yClass[, "mid"]))
      confusion_matrix_list$max <-
        table(actual = Y,
              predicted = t(modelReturn$yClass[, "max"]))
      scoring_matrix_list$min <-
        confusion_matrix_list$min * weighing_matrix
      scoring_matrix_list$mid <-
        confusion_matrix_list$mid * weighing_matrix
      scoring_matrix_list$max <-
        confusion_matrix_list$max * weighing_matrix
      correct_min <- (sum(scoring_matrix_list$min))
      correct_mid <- (sum(scoring_matrix_list$mid))
      correct_max <- (sum(scoring_matrix_list$max))
      correct_score <- c(correct_min, correct_mid, correct_max)
      names(correct_score) <- c('min', 'mid', 'max')
      modelReturn$fitMetric <-
        list(wCR =  (correct_score / length(Y)))
      modelReturn$fitMetric$CR <-  1 - (miss / length(Y))
    }
    #################################################################################################################

  }

  # Set class
  class(modelReturn) <-
    c('MUVR',                ##change name of MUObject to MUVR
      ifelse(DA,
             'Classification',
             ifelse(ML,
                    'Multilevel',
                    'Regression')),
      method)

  # Stop timer
  end.time <- proc.time()[3]
  modelReturn$calcMins <- (end.time - start.time) / 60

  # Output
  message('\n Elapsed time ', modelReturn$calcMins, ' mins \n', appendLF = FALSE)


  ################################# add Var
  Var <-
    list(
      min = rownames(modelReturn$VIRank)[order(modelReturn$VIRank[, 1])][1:modelReturn$nVar[1]],
      mid = rownames(modelReturn$VIRank)[order(modelReturn$VIRank[, 2])][1:modelReturn$nVar[2]],
      max = rownames(modelReturn$VIRank)[order(modelReturn$VIRank[, 3])][1:modelReturn$nVar[3]]
    )
  modelReturn$Var <- Var

  #  beep(3)
  # Return final object
  return(modelReturn)
}

Try the MUVR2 package in your browser

Any scripts or data that you put into this service are public.

MUVR2 documentation built on Sept. 16, 2024, 9:06 a.m.