## R/varSelRF.R


#### Watch out: use of "..." inside the Boot function.

#####   In bootSimplifyRF, maybe allow allBootRuns = NULL so that the returned object is small.



varSelRF <- function(xdata, Class,
                     c.sd = 1,
                     mtryFactor = 1,
                     ntree = 5000,
                     ntreeIterat = 2000,                     
                     vars.drop.num = NULL,
                     vars.drop.frac = 0.2,
                     whole.range = TRUE,
                     recompute.var.imp = FALSE,
                     verbose = FALSE,
                     returnFirstForest = TRUE,
                     fitted.rf = NULL,
                     keep.forest = FALSE) {

    if(!is.factor(Class))
        stop("Class should be a factor")
    if( (is.null(vars.drop.num) & is.null(vars.drop.frac)) |
       (!is.null(vars.drop.num) & !is.null(vars.drop.frac)))
        stop("One (and only one) of vars.drop.frac and vars.drop.num must be NULL and the other set")
  
    max.num.steps <- dim(xdata)[2]
    num.subjects <- dim(xdata)[1]

    if(is.null(colnames(xdata)))
        colnames(xdata) <- paste("v", 1:dim(xdata)[2], sep ="")
    
    ## Oversize the vectors; will prune later.
    n.vars <- vars <- OOB.rf <- OOB.sd <- rep(NA, max.num.steps)

    oobError <- function(rf) { ## should not be exported in the namespace.
        ## The out of bag error
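        ## rf$confusion has one row per class: the per-class counts plus a
        ## final "class.error" column. Drop that last column; the OOB error
        ## is then the sum of off-diagonal counts over the total count.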
        ooo <- rf$confusion[, -dim(rf$confusion)[2]]
        s.ooo <- sum(ooo)
        diag(ooo) <- 0
        sum(ooo)/s.ooo
    }

    
    if(!is.null(fitted.rf)) {
        if(ncol(fitted.rf$importance) < 2)
            stop("The fitted rf was not fitted with importance = TRUE")
        n.ntree <- fitted.rf$ntree
        mtry <- fitted.rf$mtry
        n.mtryFactor <- mtry/sqrt(ncol(xdata))
        if((n.ntree != ntree) | (n.mtryFactor != mtryFactor))
            warning("Using as ntree and mtry the parameters obtained from fitted.rf",
                    immediate.= TRUE)
        ntree <- n.ntree
        mtryFactor <- n.mtryFactor
        rm(n.ntree, n.mtryFactor)
        rf <- fitted.rf
    } else {
        mtry <- floor(sqrt(ncol(xdata)) * mtryFactor)
        rf <- randomForest(x = xdata, y = Class,
                           ntree = ntree, mtry = mtry,
                           importance = TRUE,
                           keep.forest = keep.forest)
    }
    
    if(returnFirstForest)
        FirstForest <- rf
    else
        FirstForest <- NULL
    m.iterated.ob.error <- m.initial.ob.error <- oobError(rf)
    sd.iterated.ob.error <- sd.initial.ob.error <-
        sqrt(m.iterated.ob.error * (1 - m.iterated.ob.error) *
             (1/num.subjects))

    if(verbose) {
        print(paste("Initial OOB error: mean = ",
                    round(m.initial.ob.error, 4),
                    "; sd = ", round(sd.initial.ob.error, 4), sep = ""))
    }

    importances <- importance(rf, type = 1, scale = FALSE)
    selected.vars <- order(importances, decreasing = TRUE)
    ordered.importances <- importances[selected.vars]
    
    initialImportances <- importances
    initialOrderedImportances <- ordered.importances
    
    j <- 1
    n.vars[j] <- dim(xdata)[2] 
    vars[j] <- paste(colnames(xdata), collapse = " + ")
    OOB.rf[j] <- m.iterated.ob.error
    OOB.sd[j] <- sd.iterated.ob.error

    var.simplify <- TRUE
    
    while(var.simplify) {
        if (verbose){
          print("gc inside loop of varSelRF")
          print(gc())
        } else {
          gc()
        }
        
        last.rf <- rf 
        last.vars <- selected.vars
        previous.m.error <- m.iterated.ob.error
        previous.sd.error <- sd.iterated.ob.error

        ## If this were left as only
        ## "if(length(selected.vars) <= 2) var.simplify <- FALSE"
        ## under the if((length(selected.vars) < 2) | (any(selected.vars < 1))),
        ## as it used to be, then we would fit a 2-variable model, which might be
        ## better than, or as good as, the others; but since we would never
        ## re-enter the loop, we could not return it, even though we see it in
        ## the history.

        ## Alternatively, we cannot just change
        ## "if((length(selected.vars) < 2) | (any(selected.vars < 1))"
        ## to "<= 2", because then we would re-enter many times, given the way
        ## selected.vars <- selected.vars[1:2] is set when
        ## num.vars < (vars.drop + 2).

        ## This way, we enter just to set last.rf and last.vars,
        ## and then we bail out.
        
        if(length(selected.vars) <= 2) {
          var.simplify <- FALSE
          break
        }

        
        if(recompute.var.imp & (j > 1)) {
            importances <- importance(rf, type = 1, scale = FALSE)
            tmp.order <- order(importances, decreasing = TRUE)
            selected.vars <- selected.vars[tmp.order]
            ordered.importances <- importances[tmp.order]
        }
        
        num.vars <- length(selected.vars)
  
        if(is.null(vars.drop.num))
            vars.drop <- round(num.vars * vars.drop.frac)
        else vars.drop <- vars.drop.num
        
        if(num.vars >= (vars.drop + 2)) {
            ## Prevent infinite looping when num.vars is, say, 3 and
            ## vars.drop.frac < 0.17: we must drop a variable for sure,
            ## since num.vars > 2 and we are still simplifying.
            ## (Alternatively, use ceiling instead of round above.)
            if(vars.drop == 0) {
                vars.drop <- 1
                if( (num.vars - vars.drop) < 1)
                    stop("vars.drop = 0 and num.vars - vars.drop < 1!")
            }
            selected.vars <- selected.vars[1: (num.vars - vars.drop)]
            ordered.importances <- ordered.importances[1: (num.vars - vars.drop)]
        }
        else {
            selected.vars <- selected.vars[1:2]
            ordered.importances <- ordered.importances[1:2]
        }
        
        ## couldn't we eliminate the following?
        if((length(selected.vars) < 2) | (any(selected.vars < 1))) {
            var.simplify <- FALSE
            break
        }
        
        
        
        mtry <- floor(sqrt(length(selected.vars)) * mtryFactor)
        if(mtry > length(selected.vars)) mtry <- length(selected.vars)
        
        if(recompute.var.imp) 
            rf <- randomForest(x = xdata[, selected.vars],
                               y = Class, importance= TRUE,
                               ntree = ntree, mtry = mtry,
                               keep.forest = keep.forest)
        else
            rf <- randomForest(x = xdata[, selected.vars],
                               y = Class, importance= FALSE,
                               ntree = ntreeIterat, mtry = mtry,
                               keep.forest = keep.forest)
        
        m.iterated.ob.error <- oobError(rf)
        sd.iterated.ob.error <-
            sqrt(m.iterated.ob.error * (1 - m.iterated.ob.error) *
                 (1/num.subjects))
        
        if(verbose) {
            print(paste("..... iteration ", j, "; OOB error: mean = ",
                        round(m.iterated.ob.error, 4),
                        "; sd = ", round(sd.iterated.ob.error, 4),
                        "; num. vars = ", length(selected.vars), 
                        sep = ""))
        }
        j <- j + 1
        
        
        n.vars[j] <- length(selected.vars)
        vars[j] <- paste(colnames(xdata)[selected.vars],
                         collapse = " + ")
        OOB.rf[j] <- m.iterated.ob.error
        OOB.sd[j] <- sd.iterated.ob.error


        if(!whole.range &
           (
            (m.iterated.ob.error >
             (m.initial.ob.error + c.sd*sd.initial.ob.error))
            |
            (m.iterated.ob.error >
             (previous.m.error + c.sd*previous.sd.error)))
           )
            var.simplify <- FALSE
    }

    if (!whole.range) {
        if(!is.null(colnames(xdata)))
            selected.vars <- sort(colnames(xdata)[last.vars])
        else
            selected.vars <- last.vars

        out <- list(selec.history = data.frame(
                    Number.Variables = n.vars,
                    Vars.in.Forest = vars,
                    OOB = OOB.rf,
                    sd.OOB = OOB.sd)[1:j,],
                    rf.model = last.rf,
                    selected.vars = selected.vars,
                    selected.model =  paste(selected.vars, collapse = " + "),
                    best.model.nvars = length(selected.vars),
                    initialImportances = initialImportances,
                    initialOrderedImportances = initialOrderedImportances,
                    ntree = ntree,
                    ntreeIterat = ntreeIterat,
                    mtryFactor = mtryFactor,
#                    mtry = mtry,
                    firstForest = FirstForest)
        class(out) <- "varSelRF"
        return(out)
    }
    else {
      ## Prune the oversized vectors created at the beginning.
      ## This is not needed above, because there we select the 1:j rows
      ## of the returned selec.history data frame.
        n.vars <- n.vars[1:j]
        vars <- vars[1:j]
        OOB.rf<- OOB.rf[1:j]
        OOB.sd <- OOB.sd[1:j]
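        ## Among all steps whose OOB error is within c.sd standard errors of
        ## the minimum OOB error, pick the one with the fewest variables
        ## (c.sd = 0 gives the strict minimum).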
        min.oob.ci <- min(OOB.rf) + c.sd * OOB.sd[which.min(OOB.rf)]
        best.pos <-
            which(OOB.rf <= min.oob.ci)[which.min(n.vars[which(OOB.rf <= min.oob.ci)])]

        selected.vars <- sort(unlist(strsplit(vars[best.pos],
                                              " + ", fixed = TRUE)))
        out <- list(selec.history = data.frame(
                    Number.Variables = n.vars,
                    Vars.in.Forest = vars,
                    OOB = OOB.rf,
                    sd.OOB = OOB.sd),
                    rf.model = NA,
                    selected.vars = selected.vars,
                    selected.model = paste(selected.vars, collapse = " + "),
                    best.model.nvars = n.vars[best.pos],
                    initialImportances = initialImportances,
                    initialOrderedImportances = initialOrderedImportances,
                    ntree = ntree,
                    ntreeIterat = ntreeIterat,
                    mtryFactor = mtryFactor,
                    ##mtry = mtry,
                    firstForest = FirstForest)
        class(out) <- "varSelRF"
        return(out)
    }
}
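
## A minimal usage sketch for varSelRF, kept inside if (FALSE) so it never
## runs when this file is sourced. The simulated data, object names, and the
## small ntree values below are illustrative assumptions, not package defaults.
if (FALSE) {
    library(randomForest)
    set.seed(1)
    x <- matrix(rnorm(25 * 30), ncol = 30)   ## 25 samples, 30 variables
    colnames(x) <- paste("v", 1:30, sep = "")
    x[1:10, 1:2] <- x[1:10, 1:2] + 2         ## make v1 and v2 informative
    cl <- factor(c(rep("A", 10), rep("B", 15)))
    rf.vs <- varSelRF(x, cl, ntree = 500, ntreeIterat = 300,
                      vars.drop.frac = 0.2)
    rf.vs                 ## print method: selected variables and their number
    rf.vs$selec.history   ## OOB error at each backward-elimination step
}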


print.varSelRF <- function(x, ...) {
    cat("\nBackwards elimination on random forest; ")
    cat(paste("ntree = ", x$ntree, ";  mtryFactor = ",
              x$mtryFactor, "\n"), sep = "")
    cat("\n Selected variables:\n")
    print(x$selected.vars)
    cat("\n Number of selected variables:", x$best.model.nvars, "\n\n")
}

#summary.varSelRF <- print.varSelRF

plot.varSelRF <- function(x, nvar = NULL, which = c(1, 2), ...) {
    if (length(which) == 2 && dev.interactive()) {
        op <- par(ask = TRUE, las = 1)
    } else {
        op <- par(las = 1)
    }
    
    on.exit(par(op))
    
    if(is.null(nvar))
        nvar <- min(30,
                    length(x$initialOrderedImportances))
    
    show <- c(FALSE, FALSE)
    show[which] <- TRUE

    if (show[1]){
      dotchart(rev(x$initialOrderedImportances[1:nvar]),
          main = "Initial importances",
          xlab = "Importances (unscaled)")
    }
    
    if (show[2]){
      ylim <- c(0, max(0.50, x$selec.history$OOB))
      plot(x$selec.history$Number.Variables,
          x$selec.history$OOB, type = "b",
          xlab = "Number of variables used",
          ylab = "OOB error", log = "x",
          ylim = ylim,
          ...)
      
      ##  if(max(x$selec.history$Number.Variables) > 300)
      ##      axis(1, at = c(1, 2, 3, 5, 8, 15, 25, 50, 75, 150, 200, 300),
      ##           labels = c(1, 2, 3, 5, 8, 15, 25, 50, 75, 150, 200, 300))
      
      lines(x$selec.history$Number.Variables,
          x$selec.history$OOB +
              2 * x$selec.history$sd.OOB, lty = 2)
      lines(x$selec.history$Number.Variables,
          x$selec.history$OOB -
              2 * x$selec.history$sd.OOB, lty = 2)
    }
}
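
## Plotting sketch for a varSelRF object (assumes the hypothetical rf.vs from
## the usage sketch after varSelRF above); not run when this file is sourced.
if (FALSE) {
    plot(rf.vs, which = 1)   ## dotchart of initial variable importances
    plot(rf.vs, which = 2)   ## OOB error vs. number of variables
}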
  
## We could also write a varSelRFCV. zz: move to the TODO list.

varSelRFBoot <- function(xdata, Class,
                         c.sd = 1,
                         mtryFactor = 1,
                         ntree = 5000,
                         ntreeIterat = 2000,
                         vars.drop.frac = 0.2,
                         bootnumber = 200,
                         whole.range = TRUE,
                         recompute.var.imp = FALSE,
                         usingCluster = TRUE,
                         TheCluster = NULL,
                         srf = NULL,
                         verbose = TRUE,
                         ...) {
    
    ## Beware: there is a lot of data copying here... pass by reference,
    ## or minimize copying, or something along those lines.
    

    if(is.null(colnames(xdata)))
        colnames(xdata) <- paste("v", 1:dim(xdata)[2], sep ="")
    
    if(!is.null(srf)) { ## we are passing a simplified rf object.
        if(!inherits(srf, "varSelRF"))
            stop("srf must be the results of a run of varSelRF")
        n.ntree <- srf$ntree
        n.ntreeIterat <- srf$ntreeIterat
        n.mtryFactor <- srf$mtryFactor
        if((n.ntree != ntree) | (n.mtryFactor != mtryFactor) |
           (n.ntreeIterat != ntreeIterat))
            warning("Using as ntree and mtryFactor the parameters obtained from srf",
                    immediate. = TRUE)
        ntree <- n.ntree
        mtryFactor <- n.mtryFactor
        rm(n.ntree, n.mtryFactor)
        all.data.run <- srf
    } else { ## we are simplifying the random forest
        all.data.run <- varSelRF(Class = Class,
                             xdata = xdata,
                             c.sd = c.sd,
                             mtryFactor = mtryFactor,
                             ntree = ntree,
                             ntreeIterat = ntreeIterat,
                             vars.drop.frac = vars.drop.frac,
                             whole.range = whole.range,
                             recompute.var.imp = recompute.var.imp)
###                             ...) 
    }
    columns.data <- which(colnames(xdata) %in%
                          all.data.run$selected.vars)
    
    all.data.rf.mtry <- floor(mtryFactor * sqrt(length(columns.data)))
    if(all.data.rf.mtry > length(columns.data))
        all.data.rf.mtry <- length(columns.data)
    all.data.rf.predict <- randomForest(y = Class,
                                        x = xdata[, columns.data],
                                        ntree = all.data.run$ntree,
                                        mtry = all.data.rf.mtry,
                                        xtest = xdata[, columns.data],
                                        ytest = Class,
                                        keep.forest = FALSE)
    
    full.pred <- all.data.rf.predict$test$predicted
    
    all.data.selected.vars <- all.data.run$selected.vars
    ##    all.data.selected.model <- all.data.run$selected.model
    all.data.best.model.nvars <- all.data.run$best.model.nvars
    
    N <- length(Class)
    solution.sizes <- rep(NA, bootnumber)
    overlap.with.full <- rep(NA, bootnumber)
    vars.in.solutions <- vector()
##    solutions <- rep(NA, bootnumber)
    
    bootTrainTest <- function(dummy,
        xdataTheCluster,
        ClassTheCluster,
        c.sd,
        mtryFactor,
        ntree, ntreeIterat,
        whole.range, recompute.var.imp,
        ...) {
        N <- length(ClassTheCluster)
        sample.again <- TRUE
        while(sample.again) {
            bootsample <- unlist(tapply(1:N,  ClassTheCluster,
                                        function(x)
                                        sample(x, size = length(x),
                                               replace = TRUE)))
            ## sure, this isn't the fastest, but will do for now.
            nobootsample <- setdiff(1:N, bootsample)
            if(!length(nobootsample))
                sample.again <- TRUE
            else sample.again <- FALSE
        }
        ## The resampling loop above is an ugly hack to prevent
        ## nobootsample sets of size 0.
        
        train.data <- xdataTheCluster[bootsample, , drop = FALSE]
        test.data <- xdataTheCluster[nobootsample, , drop = FALSE]
        train.class <- ClassTheCluster[bootsample]
        test.class <- ClassTheCluster[nobootsample]

        boot.run <- varSelRF(Class = train.class,
                             xdata = train.data,
                             c.sd = c.sd,
                             mtryFactor = mtryFactor,
                             ntree = ntree,
                             ntreeIterat = ntreeIterat,
                             whole.range = whole.range,
                             recompute.var.imp = recompute.var.imp,
                             vars.drop.frac = vars.drop.frac)
###                             ...)
        output.cl <- list()
        output.cl$best.model.nvars <- boot.run$best.model.nvars
        output.cl$selected.model <- boot.run$selected.model
        output.cl$selected.vars <- boot.run$selected.vars
        output.cl$nobootsample <- nobootsample
        output.cl$bootsample <- bootsample
        output.cl$initialImportances <- boot.run$initialImportances
        output.cl$initialOrderedImportances <- boot.run$initialOrderedImportances
        output.cl$selec.history <- boot.run$selec.history

        boot.col.data <- which(colnames(xdataTheCluster) %in%
                               boot.run$selected.vars)
        run.test.mtry <- floor(mtryFactor * sqrt(length(boot.col.data)))
        if(run.test.mtry > length(boot.col.data))
            run.test.mtry <- length(boot.col.data)

        boot.run.test <- randomForest(y  = train.class,
                                      x = train.data[, boot.col.data,
                                      drop = FALSE],
                                       ntree = boot.run$ntree,
                                      mtry = run.test.mtry,
                                      keep.forest = FALSE,
                                      xtest = test.data[, boot.col.data,
                                      drop = FALSE])$test
##                                      ytest = test.class)$test
        output.cl$class.pred.array <- boot.run.test$predicted
        output.cl$prob.pred.array <- boot.run.test$votes
        rm(boot.run.test)
        gc()
        return(output.cl)
    } ## bootTrainTest; this function is defined to be used on the cluster.
    ## We use it too when not running on a cluster. There is a bit too much
    ## copying, because we define a pair of new objects,
    ## xdataTheCluster and ClassTheCluster, that would not be needed
    ## if we just inlined the code. But that would be ugly and hard to follow,
    ## and the non-cluster path is slow for other reasons.
    
    if(usingCluster) {
        if (verbose){
          print("gc inside varSelRFBoot papply")
          print(gc())
        } else {
          gc()
        }
        
        cat("\n      Running bootstrap iterations using cluster (can take a while)\n")
        boot.runs <- clusterApplyLB(TheCluster,
                                    1:bootnumber,
                                    bootTrainTest,
                                    xdataTheCluster = xdata,
                                    ClassTheCluster = Class,
                                    c.sd = c.sd,
                                    mtryFactor = mtryFactor,
                                    ntree = ntree,
                                    ntreeIterat = ntreeIterat,
                                    whole.range = whole.range,
                                    recompute.var.imp = recompute.var.imp)
        ## clean up before leaving
        clusterEvalQ(TheCluster,
                     rm(list = c("xdataTheCluster", "ClassTheCluster")))

    } else { ## Not using Cluster
        boot.runs <- list()
        cat("\n      Running bootstrap iterations")
        for(nboot in 1:bootnumber) {
            cat(".")
            boot.runs[[nboot]] <-
                bootTrainTest(nboot,
                              xdataTheCluster = xdata,
                              ClassTheCluster = Class,
                              c.sd = c.sd,
                              mtryFactor = mtryFactor,
                              ntree = ntree,
                              ntreeIterat = ntreeIterat,
                              whole.range = whole.range,
                              recompute.var.imp = recompute.var.imp)
        }
        cat("\n")
    }
        

    solutions <- unlist(lapply(boot.runs, function(z) {
        paste(sort(z$selected.vars), collapse = " + ")}))
    vars.in.solutions <- unlist(lapply(boot.runs,
                                       function(z) z$selected.vars))
    solution.sizes <- unlist(lapply(boot.runs,
                                    function(z) z$best.model.nvars))
    overlap.with.full <-
        unlist(lapply(boot.runs,
                      function(x) {
                          length(intersect(x$selected.vars,
                                           all.data.selected.vars))/
                                               sqrt(all.data.best.model.nvars *
                                                    x$best.model.nvars)})) 

    prob.pred.array <-
        array(NA, dim = c(N,  nlevels(Class), bootnumber),
              dimnames = list(1:N, levels(Class),
              paste("BootstrapReplication.", 1:bootnumber, sep = "")))

    ## to store class predictions as data frame
    class.pred.array <- data.frame(Class)
    
    for(nb in 1:bootnumber) {
        nobootsample <- boot.runs[[nb]]$nobootsample
        class.pred.array[[nb]] <- factor(NA, levels = levels(Class))
        class.pred.array[nobootsample, nb] <-
            boot.runs[[nb]]$class.pred.array
        prob.pred.array[nobootsample, , nb] <-
            boot.runs[[nb]]$prob.pred.array
    }
    names(class.pred.array) <-
        paste("BootstrapReplication.", 1:bootnumber, sep = "")

            
    ############## The .632+ estimate of prediction error ##################
    ##      one:         \hat{Err}^{(1)} in Efron & Tibshirani, 1997, p. 550,
    ##                   or leave-one-out bootstrap error.  
    ##      resubst:     \bar{err}, the apparent error rate, resubstitution rate,
    ##                   or "in sample" error rate.
    ##      full.pred:   the "in sample" prediction from full model; only used
    ##                   to obtain gamma.
    ##      gamma:       gamma in p. 552
    ##      r:           R in p. 552 (bounded to [0, 1]).
    ##      err632:      the .632 error
    ##      errprime:    the one prime; \hat{Err}^{(1)}' 
    ##      err:         the .632+ error
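    ##      Note: 0.632 is approximately 1 - exp(-1), the probability that a
    ##      given case appears in a given bootstrap sample, and
    ##      0.368 = exp(-1) is the probability that it is left out.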
    
    ## This is how I compute "one", the leave-one-out bootstrap error:
    
    one <- mean(apply(cbind(class.pred.array, Class), 1,
                      function(x) {mean(x[-(bootnumber + 1)] != x[bootnumber + 1],
                                        na.rm = TRUE)}), na.rm = TRUE)
    ## this is equivalent to what Torsten Hothorn does in ipred
            
    resubst <- mean(full.pred != Class)
    ## The following code I take directly from the function
    ## bootest.factor, by Torsten Hothorn, in package ipred.
    err632 <- 0.368 * resubst + 0.632 * one
    gamma <-
        sum(outer(as.numeric(Class),
                  as.numeric(full.pred),
                  function(x, y) ifelse(x == y, 0, 1)))/(length(Class)^2)
    
    r <- (one - resubst)/(gamma - resubst)
    r <- ifelse(one > resubst & gamma > resubst, r, 0)
    if((r > 1) | (r < 0)) { ## just debugging; remove later?
        print(paste("r outside of 0, 1 bounds: one", one,
                    "resubst", resubst, "gamma", gamma))
        if(r > 1) {
            r <- 1
            print("setting r to 1")
        }
        else if(r < 0) {
            r <- 0
            print("setting r to 0")
        }
    }
    
    errprime <- min(one, gamma)
    err <- err632 + (errprime - resubst) *
        (0.368 * 0.632 * r)/(1 - 0.368 * r)
    cat("\n     .632+ prediction error ", round(err, 4), "\n")

    out <- list(number.of.bootsamples = bootnumber, 
                bootstrap.pred.error = err,
                resubstitution.error = resubst,
                leave.one.out.bootstrap.error = one,
                all.data.randomForest = all.data.rf.predict,
                all.data.vars = all.data.selected.vars,
                all.data.run = all.data.run,
                class.predictions = class.pred.array,
                prob.predictions = prob.pred.array,
                number.of.vars = solution.sizes,
                overlap = overlap.with.full,
                all.vars.in.solutions = vars.in.solutions,
                all.solutions = solutions,
                class = Class,
                allBootRuns = boot.runs)
    class(out) <- "varSelRFBoot"
    return(out)
}
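
## A minimal usage sketch for varSelRFBoot without a cluster, kept inside
## if (FALSE) so it never runs when this file is sourced. Object names and
## the tiny ntree/bootnumber values are illustrative assumptions; real
## analyses should use many more trees and bootstrap samples.
if (FALSE) {
    library(randomForest)
    set.seed(2)
    x <- matrix(rnorm(25 * 30), ncol = 30)
    colnames(x) <- paste("v", 1:30, sep = "")
    x[1:10, 1:2] <- x[1:10, 1:2] + 2
    cl <- factor(c(rep("A", 10), rep("B", 15)))
    rf.vsb <- varSelRFBoot(x, cl,
                           bootnumber = 10,   ## >= 100 is recommended
                           ntree = 500, ntreeIterat = 300,
                           usingCluster = FALSE)
    rf.vsb            ## print method: .632+ error and variables used
    summary(rf.vsb)   ## fuller summary; returns mean OOB class probabilities
    plot(rf.vsb)      ## OOB membership probabilities and error-vs-size curves
}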


print.varSelRFBoot <- function(x, ...) {
    cat("\n\n Variable selection with random forest \n")
    cat(" ------------------------------\n")
##    cat("\n\n randomForest summary \n")
##    print(object$all.data.randomForest)
    cat("\n Variables used \n")
    print(x$all.data.vars)
    cat("\n \n Number of variables used: ", length(x$all.data.vars),
        "\n")
    
    cat("\n\n Bootstrap results\n")
    cat(" ------------------\n")
    cat("\n Bootstrap (.632+) estimate of prediction error: \n",
        " (using", x$number.of.bootsamples, "bootstrap iterations): \n",
        x$bootstrap.pred.error)
    cat("\n\n Number of vars in bootstrapped forests:        \n")
    print(summary(x$number.of.vars))
}


## zz: pass gene names and subject names??
## Look at examples from ~/Proyectos/Signatures/Symposum/boot.pamr.knn.dlda.R
## or similar.
## Or not, because I think I am using column names,
## but look at that code anyway for improvements.


#### In summaryBoot: a plot of error rate vs.
#### number of variables
#### requires saving the "all.data.run" in the Boot object, which should ALWAYS be done!!


summary.varSelRFBoot <- function(object,
                                 return.model.freqs = FALSE,
                                 return.class.probs = TRUE,
                                 return.var.freqs.b.models = TRUE,
                                 ...) {
    cat("\n\n Variable selection using all data \n")
    cat(" ------------------------------\n")
###    cat("\n\n randomForest summary \n")
###    print(object$all.data.randomForest)
    cat("\n \n variables used \n")
    print(object$all.data.vars)
    cat("\n \n Number of variables used: ", length(object$all.data.vars),
        "\n")
    cat("\n\n Bootstrap results\n")
    cat(" ------------------\n")
    cat("\n\n Bootstrap (.632+) estimate of prediction error:",
        object$bootstrap.pred.error, " (using",
        object$number.of.bootsamples, "bootstrap iterations).\n")
    cat("\n\n Resubstitution error:                          ",
        object$resubstitution.error, "\n")
    cat("\n\n Leave-one-out bootstrap error:                 ",
        object$leave.one.out.bootstrap.error, "\n")
    nk <- as.vector(table(object$class))
    cat("\n\n Error rate at random:                          ",
        1 - (max(nk)/sum(nk)), "\n")
    cat("\n\n Number of vars in bootstrapped forests:        \n")
    print(summary(object$number.of.vars))
    cat("\n")
    cat("\n Overlap of bootstrapped forests with forest from all data\n")
    print(summary(object$overlap))
    if(return.var.freqs.b.models) {
        cat("\n\n Variable freqs. in bootstrapped models \n")
        print(sort(table(object$all.vars.in.solutions),
                   decreasing = TRUE)/object$number.of.bootsamples)
    }
    in.all.data <-
        which(names(table(object$all.vars.in.solutions)) %in% object$all.data.vars)
    cat("\n\n Variable freqs. of variables in forest from all data, and summary \n")
    print(table(object$all.vars.in.solutions)[in.all.data]/object$number.of.bootsamples)
    cat("\n")
    print(summary(table(object$all.vars.in.solutions)[in.all.data]/object$number.of.bootsamples))


    if(return.model.freqs) {
        tmp.table <- sort(table(object$all.solutions),
                          decreasing = TRUE)/object$number.of.bootsamples
        n.tmp.table <- names(tmp.table)
        dim(tmp.table) <- c(dim(tmp.table), 1)
        rownames(tmp.table) <- n.tmp.table
        colnames(tmp.table) <- "Freq."
        cat("\n\n Solutions frequencies in bootstrapped models \n")
        print(tmp.table)
    }
    
    if(return.class.probs) {
        cat("\n\n Mean class membership probabilities from out of bag samples\n")
        mean.class.probs <- apply(object$prob.predictions, c(1, 2),
                                  function(x) mean(x, na.rm = TRUE))
        colnames(mean.class.probs) <- levels(object$class)
        return(mean.class.probs)
    }
}



plot.varSelRFBoot <- function(x,
                              oobProb = TRUE,
                              oobProbBoxPlot = FALSE,
                              ErrorNum = TRUE,
                              subject.names = NULL,
                              class.to.plot = NULL,
                              ...) {
    
    if(oobProb | oobProbBoxPlot) {
        mean.class.probs <- apply(x$prob.predictions, c(1, 2),
                                  function(x) mean(x, na.rm = TRUE))
        colnames(mean.class.probs) <- levels(x$class)
        rainbow.col <- rainbow(nlevels(x$class))
        if(dev.interactive()) {
            op <- par(ask = TRUE, las = 1)
        } else {
            op <- par(las = 1)
        }
        on.exit(par(op))
####        for(i in 1:ncol(mean.class.probs)) {
        if(is.null(class.to.plot))
            class.to.plot <- 1:ncol(mean.class.probs)
        for(i in class.to.plot) {
            if(oobProbBoxPlot)  {
                boxplot(data.frame(t(x$prob.predictions[ , i, ])),
                        xlab = "Samples",
                        ylab = "Out of bag probability of membership",
                        main = paste("Class", colnames(mean.class.probs)[i]),
                        type = "p",
                        col = rainbow.col[as.numeric(x$class)],
                        pch = 19, ylim = c(0, 1.3),
                        axes = FALSE)
            } else {
###                dotchart(mean.class.probs[, i],
###                         xlab = "Samples",
###                         ylab = "(Average) Out of bag probability of membership",
###                         main = paste("Class", colnames(mean.class.probs)[i]),
###                         type = "p",
###                         col = rainbow.col[as.numeric(x$class)],
###                         pch = 19, ylim = c(0, 1.2),
###                         axes = FALSE)
                plot(mean.class.probs[, i],
                     xlab = "Samples",
                     ylab = "(Average) Out of bag probability of membership",
                     main = paste("Class", colnames(mean.class.probs)[i]),
                     type = "p",
                     col = rainbow.col[as.numeric(x$class)],
                     pch = 19, ylim = c(0, 1.3),
                     axes = FALSE)
            }
            if(!is.null(subject.names))
                text(mean.class.probs[ , i],
                     labels = subject.names, pos = 2, cex = 0.8)
            
            box()
            ##axis(1)
            par(las = 2)
            axis(1, at = seq(1:dim(mean.class.probs)[1]),
                 labels = seq(1:dim(mean.class.probs)[1]))
            segments(x0 = seq(1:dim(mean.class.probs)[1]),
                     y0 = 0,
                     x1 = seq(1:dim(mean.class.probs)[1]),
                     y1 = 1,
                     lty = 2,
                     col = "grey")
            axis(2, at = seq(from = 0, to = 1, by = 0.2))
            legend(y = c(1.29, 1.15), x = c(1, dim(mean.class.probs)[1]),
                   legend = paste("Class", levels(x$class)),
                   col = rainbow.col,
                   pch = 19, bty = "n" )
            abline( h = 1.05, lty = 1)
            abline(h = seq(from = 0, to = 1,
                   length = 1 + nlevels(x$class))[-c(1, nlevels(x$class) + 1)], 
                   lty = 2)
        }
    }
    if(ErrorNum) {
        par(las = 2)
        all.data.errors <- x$all.data.run$selec.history$OOB
        ngenes <- x$all.data.run$selec.history$Number.Variables
        maxplot <-
            max(
                c(unlist(lapply(x$allBootRuns,
                                function(x) max(x$selec.history$OOB))),
                  all.data.errors))
        minplot <-
            min(
                c(unlist(lapply(x$allBootRuns,
                                function(x) min(x$selec.history$OOB))),
                  all.data.errors))
        minplot <- minplot * (1 - 0.1)
        maxplot <- maxplot * (1 + 0.2)
        plot(ngenes, all.data.errors, 
             type = "l", axes = TRUE, xlab = "Number of variables",
             ylab = "OOB Error rate", ylim = c(minplot, maxplot),
             lty = 1,
             log = "x",
             col = "red", lwd = 2,
             main = "OOB Error rate vs. Number of variables in predictor",
             xlim = c(2, max(ngenes)*1.1))
###             plot(num.points.plot:1,
###                  x$allBootRuns[[1]]$other$trained.pam.cv$error,
###                  type = "l", axes = FALSE, xlab = "Number of genes",
###                  ylab = "CV Error rate", ylim = c(minplot, maxplot), lty = 2,
###                  main = "CV Error rate vs. Number of genes in predictor.")
        legend(x = 10, y = maxplot,
               legend = c("Bootstrap samples", "Original sample"),
               lty = c(2, 1), lwd = c(1, 3), col = c("Black", "Red"),
               bty = "n")
##        box()
##        axis(2)
##        axis(1)
        
        if(max(ngenes) > 300)
            axis(1, at = c(2, 3, 5, 8, 15, 20, 25, 35,
                    50, 75, 150, 200, 300),
                 labels = c(2, 3, 5, 8, 15, 20, 25, 35,
                 50, 75, 150, 200, 300))

        for(nb in 1:x$number.of.bootsamples)
            lines(x$allBootRuns[[nb]]$selec.history$Number.Variables,
                  x$allBootRuns[[nb]]$selec.history$OOB, lty = 2)
        lines(ngenes, all.data.errors, col = "red", lwd = 4)
    }
}



randomVarImpsRF <- function(xdata, Class, forest, numrandom = 100,
                            whichImp = "impsUnscaled",
                            usingCluster = TRUE,
                            TheCluster = NULL,
                            ...) {

  if(!all(whichImp %in% c("impsScaled", "impsUnscaled", "impsGini")))
      stop("whichImp contains a non-valid option; should be one or more \n",
           "of impsScaled, impsUnscaled, impsGini")

  
  ontree <- forest$ntree
  omtry <- forest$mtry
  
  nodesize <- 1
  
  if(usingCluster) {
      iRF2.cluster <- function(dummy, xdataTheCluster, ClassTheCluster,
                               ontree, omtry, nodesize, ...) {
        rf <- randomForest(x = xdataTheCluster,
                           y = sample(ClassTheCluster),
                           ntree = ontree,
                           mtry = omtry,
                           nodesize = nodesize,
                           importance = TRUE, keep.forest = FALSE,
                           ...)
        ## If we specify the importance type, only that measure is extracted;
        ## if we ask for all, we get all three.
        impsScaled <- NULL
        impsUnscaled <- NULL
        impsGini <- NULL
        if("impsUnscaled" %in% whichImp) 
          impsUnscaled <- importance(rf, type = 1, scale = FALSE)
        if("impsScaled" %in% whichImp)
          impsScaled <- importance(rf, type = 1, scale = TRUE)
        if("impsGini" %in% whichImp)
            impsGini <- importance(rf, type = 2, scale = TRUE)
          ## impsGini <- rf$importance[, ncol(rf$importance)]
        
        return(list(impsScaled,
                    impsUnscaled,
                    impsGini))
    }
      
      outCl <- clusterApplyLB(TheCluster,
                              1:numrandom,
                              iRF2.cluster,
                              xdataTheCluster = xdata,
                              ClassTheCluster = Class,
                              ontree = ontree,
                              omtry = omtry,
                              nodesize = nodesize)


      
  } else {
      outCl <- list()
      cat("\n Obtaining random importances ")
      for(nriter in 1:numrandom) {
          cat(".")
        rf <- randomForest(x = xdata,
                           y = sample(Class),
                           ntree = ontree,
                           mtry = omtry,
                           nodesize = nodesize,
                           importance = TRUE,
                           keep.forest = FALSE,
                           ...)
        impsScaled <- NULL
        impsUnscaled <- NULL
        impsGini <- NULL
          if("impsUnscaled" %in% whichImp) 
              impsUnscaled <- importance(rf, type = 1, scale = FALSE)
          if("impsScaled" %in% whichImp)
              impsScaled <- importance(rf, type = 1, scale = TRUE)
          if("impsGini" %in% whichImp)
              impsGini <- importance(rf, type = 2, scale = TRUE)
          ## impsGini <- rf$importance[, ncol(rf$importance)]
        outCl[[nriter]] <- list(impsScaled,
                                impsUnscaled,
                                impsGini)
      }
      cat("\n")
    } ##</else>
    
  randomVarImps <- list()
  
  if("impsScaled" %in% whichImp) {
    randomVarImps$impsScaled <-
        matrix(unlist(lapply(outCl, function(x) x[[1]])),
               ncol = length(outCl))
    colnames(randomVarImps$impsScaled) <- 1:numrandom
    rownames(randomVarImps$impsScaled) <- rownames(forest$importance)
  }
  if("impsUnscaled" %in% whichImp){
    randomVarImps$impsUnscaled <-
        matrix(unlist(lapply(outCl, function(x) x[[2]])),
               ncol = length(outCl))
    colnames(randomVarImps$impsUnscaled) <- 1:numrandom
    rownames(randomVarImps$impsUnscaled) <- rownames(forest$importance)
  }
  if("impsGini" %in% whichImp) {
    randomVarImps$impsGini <-
        matrix(unlist(lapply(outCl, function(x) x[[3]])),
               ncol = length(outCl))
    colnames(randomVarImps$impsGini) <- 1:numrandom
    rownames(randomVarImps$impsGini) <- rownames(forest$importance)
  }
  
  class(randomVarImps) <- c(class(randomVarImps),
                            "randomVarImpsRF")
  return(randomVarImps)
  ## Hopefully, we are returning a list with only the selected components. zz
}
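
## Sketch of estimating a null distribution of importances by refitting the
## forest on permuted class labels (not run when this file is sourced;
## rf.small, the simulated data, and the small numrandom are illustrative
## assumptions).
if (FALSE) {
    library(randomForest)
    set.seed(3)
    x <- matrix(rnorm(25 * 30), ncol = 30)
    colnames(x) <- paste("v", 1:30, sep = "")
    cl <- factor(c(rep("A", 10), rep("B", 15)))
    rf.small <- randomForest(x = x, y = cl, ntree = 500, importance = TRUE)
    rvi <- randomVarImpsRF(x, cl, rf.small,
                           numrandom = 20,    ## use more in practice
                           usingCluster = FALSE)
    dim(rvi$impsUnscaled)   ## variables x numrandom matrix of null importances
}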

randomVarImpsRFplot <- function(randomImportances,
                                forest,
                                whichImp = "impsUnscaled",
                                nvars = NULL,
                                show.var.names = FALSE,
                                vars.highlight = NULL,
                                main = NULL,
                                screeRandom = TRUE,
                                lwdBlack = 1.5,
                                lwdRed = 2,
                                lwdLightblue = 1,
                                cexPoint = 1,
                                overlayTrue = FALSE,
                                xlab = NULL,
                                ylab = NULL,
                                ...) {
    
    if(ncol(forest$importance) < 2)
        stop("The fitted rf ", deparse(substitute(forest)),
             " was not fitted with importance = TRUE")
    
    randomImportances <-
        switch(whichImp,
               "impsUnscaled" = randomImportances$impsUnscaled,
               "impsScaled" = randomImportances$impsScaled,
               "impsGini" = randomImportances$impsGini,
               )

    originalForestImportance <-
        switch(whichImp,
               "impsUnscaled" =
                   importance(forest, type = 1, scale = FALSE),
               "impsScaled" =
                   importance(forest, type = 1, scale = TRUE),
               "impsGini" =
                   importance(forest, type = 2, scale = TRUE)
               ## forest$importance[, ncol(forest$importance)]
               )
    if(is.null(originalForestImportance))
        stop("\n Not valid 'whichImp' \n")
    
    if(is.null(xlab)) xlab <- "(Ordered) Variable"
    if(is.null(ylab)) ylab <-
        switch(whichImp,
               "impsUnscaled" = "Importance (unscaled)",
               "impsScaled"   = "Importance (scaled)",
               "impsGini"     = "Importance (Gini)",
               )
               
    nvars <- min(nvars, dim(randomImportances)[1])
    ylim <- range(originalForestImportance, randomImportances)    
    plottingOrder <- order(originalForestImportance,
                           decreasing = TRUE)[1:nvars]

    ## This is really a matrix (something changed in randomForest?).
    ## Explicitly make it a vector; check ncol first.
    if(ncol(originalForestImportance) != 1)
        stop("originalForestImportance ncol != 1")
    orderedOriginalImps <- originalForestImportance[plottingOrder, 1,
                                                    drop = TRUE]
    
    plot(orderedOriginalImps, type = "n", axes = FALSE, 
         xlab = xlab, ylab = ylab,
         main = main, ylim = ylim,
         ...)
    abline(h = 0, lty = 2, col = "blue")
    axis(2)
    box()
    if(show.var.names) {
        axis(1, labels = names(orderedOriginalImps),
             at = 1:length(orderedOriginalImps))
    } else {
        axis(1)
    }
    
    if(!overlayTrue)
         ###points(x = 1:nvars, orderedOriginalImps, lwd = lwdBlack,
           ###    col = "black", type = "b", cex = cexPoint)
        lines(x = 1:nvars, orderedOriginalImps, lwd = lwdBlack,
              col = "black", type = "b", cex = cexPoint)

    
    if(!is.null(vars.highlight)) {
        if(length(vars.highlight) > nvars) {
            warning("Not all vars. to highlight will be shown; increase nvars\n")
            cat("\n Not all vars. to highlight will be shown; increase nvars\n")
        }
        pos.selected <- which(names(orderedOriginalImps) %in% vars.highlight)
        if(!length(pos.selected)){
            warning("No selected vars. among those to show\n")
            cat("\nNo selected vars. among those to show\n")
        }
        else segments(pos.selected, 0,
                      pos.selected, orderedOriginalImps[pos.selected],
                      col = "blue", lwd = 2)
        if(length(pos.selected) < length(vars.highlight)) {
            warning("Not shown ",
                    length(vars.highlight) - length(pos.selected),
                    " of the 'to-highlight' variables\n")
            cat("Not shown ", length(vars.highlight) - length(pos.selected),
                    " of the 'to-highlight' variables\n")

        }
    }

    column.of.mean <- dim(randomImportances)[2] + 1
    if(screeRandom) {
        randomImportances <- apply(randomImportances, 2,
                                   sort, decreasing = TRUE)
        randomImportances <- randomImportances[1:nvars, ]
        randomImportances <- cbind(randomImportances,
                                   apply(randomImportances, 1, mean))
    } else {
        randomImportances <- randomImportances[plottingOrder, ]
        randomImportances <- cbind(randomImportances,
                                   apply(randomImportances, 1, mean))
    }

    matlines(randomImportances[, -column.of.mean],
             col = "lightblue", lwd = lwdLightblue, lty = 1)
    lines(x = 1:nvars,
          y = randomImportances[, column.of.mean],
          lwd = lwdRed, col = "red")

    if(overlayTrue) {
        points(x = 1:nvars, orderedOriginalImps, lwd = lwdBlack,
               col = "black", type = "b", cex = cexPoint)
        lines(x = 1:nvars, orderedOriginalImps, lwd = lwdBlack,
              col = "black", type = "l")
    }

}
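
## Plotting sketch for the null importances (assumes the hypothetical rf.small
## and rvi objects from the sketch after randomVarImpsRF above); not run when
## this file is sourced.
if (FALSE) {
    randomVarImpsRFplot(rvi, rf.small, nvars = 30,
                        main = "Observed vs. permuted-class importances")
}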



varSelImpSpecRF <- function(forest,
                            xdata = NULL,
                            Class = NULL,
                            randomImps = NULL,
                            threshold = 0.10,
                            numrandom = 20,
                            whichImp = "impsUnscaled",
                            usingCluster = TRUE,
                            TheCluster = NULL,
                            ...) {

    if((is.null(xdata) | is.null(Class)) & is.null(randomImps))
        stop("You must specify a randomVarImpsRF object OR ",
             "valid covariates and class objects.\n")

    if((!is.null(xdata) & !is.null(Class)) & !is.null(randomImps))
        warning("Using only the randomVarImpsRF object. ",
                "Covariates and class objects discarded.\n", immediate. = TRUE)

    if(length(whichImp) > 1) stop("You can only use one importance measure")

    originalImps <- switch(whichImp,
                           "impsUnscaled" =
                               importance(forest, type = 1, scale = FALSE),
                           "impsScaled" =
                               importance(forest, type = 1, scale = TRUE),
                           "impsGini" =
                               importance(forest, type = 2, scale = TRUE)
                   ## forest$importance[, ncol(forest$importance)]
                   )
    if(is.null(originalImps))
        stop("\n Not valid 'whichImp' \n")

    originalImpsOrder <- order(originalImps,
                               decreasing = TRUE)

    if(is.null(randomImps)) {
        randomImps <-
            switch(whichImp,
                   "impsUnscaled" = randomVarImpsRF(xdata, Class, forest,
                   numrandom = numrandom,
                   whichImp = "impsUnscaled",
                   usingCluster = usingCluster,
                   TheCluster = TheCluster,
                   ...)$impsUnscaled,
                   "impsScaled" = randomVarImpsRF(xdata, Class, forest,
                   numrandom = numrandom,
                   whichImp = "impsScaled",
                   usingCluster = usingCluster,
                   TheCluster = TheCluster,
                   ...)$impsScaled,
                   "impsGini" = randomVarImpsRF(xdata, Class, forest,
                   numrandom = numrandom,
                   whichImp = "impsGini",
                   usingCluster = usingCluster,
                   TheCluster = TheCluster,
                   ...)$impsGini)
    } else {
        elemento <- match(whichImp, names(randomImps))
        if(is.na(elemento))
            stop("The requested importance was not calculated\n",
                 "for the randomImps object ",
                 deparse(substitute(randomImps)), ".\n")
        cat("\n Using the randomVarImpsRF", deparse(substitute(randomImps)),
            "object. xdata, Class, numrandom ignored.\n")
        randomImps <- randomImps[[elemento]]

    }
     
    randomImps <- apply(randomImps, 2,
                        sort, decreasing = TRUE)
    thresholds <- apply(randomImps, 1,
                        function(x) quantile(x, probs = 1 - threshold, type = 8))

    largest.value <- which(originalImps[originalImpsOrder] <= thresholds)[1]
    if(is.na(largest.value)) {
        selected.vars <- originalImpsOrder
        warning("All variables selected; could signal a problem")
    } else if(largest.value == 1) {
        selected.vars <- NA
        warning("No variables selected; could signal a problem")
    } else selected.vars <- originalImpsOrder[1:(largest.value - 1)]

    return(selected.vars)
}
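
## Sketch: keep the top-ranked variables whose importances stay above the
## rank-matched (1 - threshold) quantile of the permutation-based null
## importances (reuses the hypothetical x, rf.small, and rvi objects from the
## sketches above); not run when this file is sourced.
if (FALSE) {
    keep <- varSelImpSpecRF(rf.small, randomImps = rvi, threshold = 0.10)
    colnames(x)[keep]   ## names of the retained variables (keep holds column indices)
}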


selProbPlot <- function(object,
                        k = c(20, 100),
                        color = TRUE,
                        legend = FALSE,
                        xlegend = 68,
                        ylegend = 0.93,
                        cexlegend = 1.4,
                        main = NULL,
                        xlab = "Rank of gene",
                        ylab = "Selection probability",
                        pch = 19,
                        ...) {
    ## selection probability plots, such as in Pepe
    ## et al. 2003 (ROC paper).
    if(!inherits(object, "varSelRFBoot"))
        stop("This function only works with objects created\n",
             "with the varSelRFBoot function.\n")
    nboot <- object$number.of.bootsamples
    if(nboot < 100)
        warning("You only used ", nboot,
                " bootstrap samples. Might be too few.",
                immediate. = TRUE)
    
    original.imps <- object$all.data.run$initialImportances
    original.ranks <- rank(-original.imps)
    boot.ranks <- lapply(object$allBootRuns,
                         function(x) {rank(-x$initialImportances)})
    boot.ranks <- matrix(unlist(boot.ranks), ncol = nboot)
    k1 <- apply(boot.ranks, 1, function(z) {sum(z <= k[1])/nboot})
    k2 <- apply(boot.ranks, 1, function(z) {sum(z <= k[2])/nboot})

    if(color) {
        plot(original.ranks[original.ranks <= k[2]],
             k2[original.ranks <= k[2]], xlim = c(1, k[2]),
             col = "red", pch = pch,
             xlab = xlab,
             ylab = ylab,
             main = main,
             ylim = c(0, 1),
             ...)
        points(original.ranks[original.ranks <= k[1]],
               k1[original.ranks <= k[1]],
               col = "blue", pch = pch)
        
        if(legend) legend(x = xlegend, y = ylegend,
                          legend = c(paste("Top", k[2]),
                          paste("Top", k[1])),
                          col = c("red", "blue"),
                          pch = pch, cex = cexlegend)
    } else {
        plot(original.ranks[original.ranks <= k[2]],
             k2[original.ranks <= k[2]], xlim = c(1, k[2]),
             pch = pch,
             xlab = "Rank of gene",
             ylab = "Selection probability",
             main = main, ylim = c(0, 1),
             ...)
        points(original.ranks[original.ranks <= k[1]],
               k1[original.ranks <= k[1]],
               pch = 21)
        if(legend) legend(x = xlegend, y = ylegend,
                          legend = c(paste("Top", k[2]),
                          paste("Top", k[1])),
                          pch = c(19, 21), cex = (cexlegend + 0.2))
    }
}
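
## Sketch of a selection-probability plot in the spirit of Pepe et al. (2003)
## (assumes the hypothetical rf.vsb object from the varSelRFBoot sketch above,
## which uses fewer bootstrap samples than advisable); not run when this file
## is sourced.
if (FALSE) {
    selProbPlot(rf.vsb, k = c(5, 10),
                legend = TRUE, xlegend = 8, ylegend = 0.9)
}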





###################################################################
###################################################################
##########                                           ##############
##########       Custom code used in paper           ##############
##########       (unlikely to be of interest         ##############
##########       for anybody else)                   ##############
##########                                           ##############
###################################################################
###################################################################



    

figureSummary.varSelRFBoot <- function(object) {
## To create data for the figures of the paper with random data.
    errorrate <- object$bootstrap.pred.error
    nvused <- length(object$all.data.vars)
    return(c(errorrate, nvused))
}


figureSummary2.varSelRFBoot <- function(object) {
## To create data for the figures of the paper with random data.
    errorrate <- object$bootstrap.pred.error
    nvused <- length(object$all.data.vars)
    median.nvars <- median(object$number.of.vars)
    iq1.nvars <- quantile(object$number.of.vars, p = 0.25)
    iq3.nvars <- quantile(object$number.of.vars, p = 0.75)
    string1 <- paste(formatC(round(nvused, 0), width = 6)," (",
                     round(iq1.nvars, 0),", ",
                     round(median.nvars, 0), ", ",
                     round(iq3.nvars, 0),"),  ", 
                     formatC(round(errorrate, 3), width = 4)," \\\\ ",
                     sep = "")
    string1
}




tableSummary.varSelRFBoot <- function(object, name){
### to create data ready for the LaTeX tables of the paper

    neatname <- switch(name,
                        "gl.boot" = "Leukemia    ",
                        "vv.boot" = "Breast 2 cl.",
                        "vv3.boot" ="Breast 3 cl.",
                        "nci.boot" ="NCI 60      ",
                        "ra.boot" = "Adenocar.   ",
                     "brain.boot" = "Brain       ",
                     "colon.boot" = "Colon       ",
                  "lymphoma.boot" = "Lymphoma    ",
                  "prostate.boot" = "Prostate    ",
                     "srbct.boot" = "Srbct       ")
                        
    errorrate <- object$bootstrap.pred.error
    nvused <- length(object$all.data.vars)
    median.nvars <- median(object$number.of.vars)
    iq1.nvars <- quantile(object$number.of.vars, p = 0.25)
    iq3.nvars <- quantile(object$number.of.vars, p = 0.75)
    in.all.data <- which(names(table(object$all.vars.in.solutions)) %in% object$all.data.vars)
    tmp1 <- table(object$all.vars.in.solutions)[in.all.data]/object$number.of.bootsamples
    median.freq <- median(tmp1)
    if(nvused == 2) {
        iq1.freq <- min(tmp1)
        iq3.freq <- max(tmp1)
    } else {
        iq1.freq <- quantile(tmp1, p = 0.25)
        iq3.freq <- quantile(tmp1, p = 0.75)
    }

    if(nvused > 2) 
        string1 <-
            paste(neatname, "&     ",
                  formatC(round(errorrate, 3), width = 4)," &       ",
                  formatC(round(nvused, 0), width = 4)," &       ",
                  formatC(round(median.nvars, 0), width = 4)," (",
                  round(iq1.nvars, 0),", ",
                  round(iq3.nvars, 0),")   &   ",
                  formatC(round(median.freq, 2), width = 4)," (",
                  round(iq1.freq, 2),", ",
                  round(iq3.freq, 2),")\\\\",
                  sep = "") else   string1 <-
                      paste(neatname, "&     ",
                            formatC(round(errorrate, 3), width = 4)," &       ",
                            formatC(round(nvused, 0), width = 4)," &       ",
                            formatC(round(median.nvars, 0), width = 4)," (",
                            round(iq1.nvars, 0),", ",
                            round(iq3.nvars, 0),")   &   ",
                            formatC(round(median.freq, 2), width = 4)," (",
                            round(iq1.freq, 2),", ",
                            round(iq3.freq, 2),")\\footnotemark[1]\\\\",
                            sep = "")
    cat(string1, "\n")
}




###################################################################
###################################################################
##########                                           ##############
##########       Miscellaneous code                  ##############
##########                                           ##############
###################################################################
###################################################################



## rVI <- function(xdata, ydata, forest, numrandom = 40,
##                 whichImp = "impsUnscaled",
##                 ...) {
##     ## from randomVarImpsRF, but simplified to use only
##     ## one tytpe of importance
    
##     ontree <- forest$ntree
##     omtry <- forest$mtry

##     if(usingCluster) {
##         iRF.cluster <- function(dummy, ontree, omtry, ...) {
##             rf <- randomForest(x = xdataTheCluster,
##                                y = sample(ClassTheCluster),
##                                ntree = ontree,
##                                mtry = omtry,
##                                importance = TRUE, keep.forest = FALSE,
##                                ...)
##             imps <- switch(whichImp,
##                            "impsUnscaled" =
##                            importance(rf, type = 1, scale = FALSE),
##                            "impsScaled" =
##                            importance(rf, type = 1, scale = TRUE),
##                            "impsGini" =
##                            rf$importance[, ncol(rf$importance)]
##                            )
##             if(is.null(imps))
##                 stop("\n Not valid 'whichImp' \n")
##             return(imps)
##         }
##         clusterEvalQ(TheCluster,
##                      rm(list = c("xdataTheCluster", "ClassTheCluster")))
##         xdataTheCluster <<- xdata
##         ClassTheCluster <<- ydata
##         clusterExport(TheCluster,
##                       c("xdataTheCluster", "ClassTheCluster"))
##         outCl <- clusterApplyLB(TheCluster,
##                                 1:numrandom,
##                                 iRF.cluster,
##                                 ontree = ontree,
##                                 omtry = omtry)

##     } else {
##         outCl <- list()
##         for(nriter in 1:numrandom) {
##             rf <- randomForest(x = xdata,
##                                y = sample(ydata),
##                                ntree = ontree,
##                                mtry = omtry,
##                                importance = TRUE,
##                                keep.forest = FALSE,
##                                ...)
##             imps <- switch(whichImp,
##                            "impsUnscaled" =
##                            importance(rf, type = 1, scale = FALSE),
##                            "impsScaled" =
##                            importance(rf, type = 1, scale = TRUE),
##                            "impsGini" =
##                            rf$importance[, ncol(rf$importance)]
##                            )
##             if(is.null(imps))
##                 stop("\n Not valid 'whichImp' \n")
##             outCl[[nriter]] <- imps
##         }
##     }
    
##     randomVarImps <- matrix(unlist(outCl), ncol = numrandom)
##     rownames(randomVarImps) <- rownames(forest$importance)
##     colnames(randomVarImps) <- 1:numrandom
##     class(randomVarImps) <- c(class(randomVarImps),
##                               "rVI")
##     return(randomVarImps)
## }


##### old stuff

####nr <- 10; nc <- 20; x <- matrix(rnorm(2* nr*nc), ncol = nc)
####colnames(x) <- paste("v", 1:nc, sep ="")
####Class <- factor(c(rep("A", nr), rep("B", nr)))
####xdata <- x

####library(randomForest)
####rf1 <- randomForest(x = x, y = Class, importance = TRUE,
####                    keep.forest = FALSE, ntree = 2000)

####usingCluster <- TRUE
####if(usingCluster) {
####    library(snow)
####    library(Rmpi)
####    clusterNumberNodes <- 4
####    typeCluster <- "MPI"
####    TheCluster <- makeCluster(clusterNumberNodes,
####                              type = typeCluster)
####    clusterSetupSPRNG(TheCluster)
####    clusterEvalQ(TheCluster, library(randomForest))
####}



####screePlotRF <- function(forest, randomImportances,
####                        nvars = 50,
####                        show.var.names = FALSE,
####                        vars.highlight = NULL,
####                        main = NULL, scale = TRUE) {
####    nvars <- min(nvars, dim(randomImportances)[1])

    
####    if(scale) {
####        original.imps <- importance(forest, type = 1, scale = TRUE)
####    } else {
####        original.imps <- importance(forest, type = 1, scale = FALSE)
####    }
####    ordered.imps <- sort(original.imps, decreasing = TRUE)[1:nvars]

####    plot(ordered.imps, type = "b", axes = FALSE, lwd = 1.5,
####         xlab = "Variable", ylab = "Importance",
####         main = main)
####    abline(h = 0, lty = 2, col = "blue")
####    axis(2)
####    box()
####    if(show.var.names) {
####        axis(1, labels = names(ordered.imps),
####             at = 1:length(ordered.imps))
####    } else {
####        axis(1)
####    }

####    if(!is.null(vars.highlight)) {
####        if(length(vars.highlight) > nvars) {
####            warning("Not all vars. to highlight will be shown; increase nvars\n")
####            cat("\n Not all vars. to highlight will be shown; increase nvars\n")
####        }
####        pos.selected <- which(names(ordered.imps) %in% vars.highlight)
####        if(!length(pos.selected)){
####            warning("No selected vars. among those to show\n")
####            cat("\nNo selected vars. among those to show\n")
####        }
####        else segments(pos.selected, 0,
####                      pos.selected, ordered.imps[pos.selected],
####                      col = "blue", lwd = 2)
####        if(length(pos.selected) < length(vars.highlight)) {
####            warning("Not shown ", length(vars.highlight) - length(pos.selected),
####                    " of the 'to-highlight' variables\n")
####            cat("Not shown ", length(vars.highlight) - length(pos.selected),
####                    " of the 'to-highlight' variables\n")

####        }
####    }

####    randomImportances <- randomImportances[1:nvars, ]
####    column.mean <- dim(randomImportances)[2]
####    matlines(randomImportances[, -column.mean],
####             col = "green", lty = 1)
####    lines(x = 1:nvars,
####          y = randomImportances[, column.mean],
####          lwd = 1.3, col = "red")
####}


####screePlotRF(rf1, ii1, nvars = 20, vars.highlight = paste("v", c(8, 4), sep =""), show.var.names = TRUE)

####i1 <- randomVarImpsRF(xdata, Class, rf1, numrandom = 3)
####screePlotRF(rf1, i1)






### This version allows optimizing mtry and ntree; leave code here,
### but don't use.

## FIXME: uncomment, but fix global function uses. And add examples.

## ExperimentalvarSelRF <- function(xdata, Class, vars.drop.num = NULL,
##                      vars.drop.frac = 0.5,
##                      c.sd = 1,
##                      whole.range = TRUE,
##                      recompute.var.imp = FALSE,
##                      verbose = FALSE,
##                      returnFirstForest = TRUE,
##                      ## next are all for tune2RF
##                      ## but this is a mess; should pass them as
##                      ## control.
##                      tuneMtry = FALSE, tuneNtree = FALSE,
##                      startNtree = 1000, startMtryFactor = 1,
##                      stepFactorMtry = 1.25,
##                      stepFactorNtree = 1.75,
##                      minCorNtree = 0.975,
##                      quantNtree = 0.025,
##                      returnForest = TRUE,
##                      ntreeTry = 2000) {

##     if( (is.null(vars.drop.num) & is.null(vars.drop.frac)) |
##        (!is.null(vars.drop.num) & !is.null(vars.drop.frac)))
##         stop("One (and only one) of vars.drop.frac and vars.drop.num must be NULL and the other set")
    

##     max.num.steps <- dim(xdata)[2]
##     num.subjects <- dim(xdata)[1]

##     if(is.null(colnames(xdata)))
##         colnames(xdata) <- paste("v", 1:dim(xdata)[2], sep ="")
    
##     ##oversize the vectors; will prune later.
##     n.vars <- vars <- OOB.rf <- OOB.sd <- rep(NA, max.num.steps)
    
##     ## First get optimal values for mtry and ntree, and get first run:
    
##     rf <- tune2RF(x = xdata, y = Class,
##                   tuneMtry = tuneMtry, tuneNtree = tuneNtree,
##                   startNtree = startNtree, startMtryFactor = startMtryFactor,
##                   stepFactorMtry = stepFactorMtry,
##                   stepFactorNtree = stepFactorNtree,
##                   minCorNtree = minCorNtree,
##                   quantNtree = quantNtree,
##                   returnForest = TRUE,
##                   ntreeTry = ntreeTry,
##                   verbose = verbose)

##     ## We'll need it for future runs
##     ntree <- rf$ntree
##     mtry <- rf$mtry

##     if(returnFirstForest)
##         FirstForest <- rf
##     else
##         FirstForest <- NULL
    
## #    rf <- randomForest(x = xdata, y = Class, importance= TRUE,
## #                       ntree = ntree, keep.forest = FALSE)
##     m.iterated.ob.error <- m.initial.ob.error <- oobError(rf)
##     sd.iterated.ob.error <- sd.initial.ob.error <-
##         sqrt(m.iterated.ob.error * (1 - m.iterated.ob.error) * (1/num.subjects))

##     if(verbose) {
##         print(paste("Initial OOB error: mean = ", round(m.initial.ob.error, 4),
##                     "; sd = ", round(sd.initial.ob.error, 4), sep = ""))
##     }

## ### previous code (before rF < 4.3.1)
##     #selected.vars <- order(rf$importance[, (ncol(rf$importance) - 1)], decreasing = TRUE)
## #    ordered.importance <- rf$importance[selected.vars,(ncol(rf$importance) -1)]

##     importances <- importance(rf, type = 1, scale = FALSE)
##     selected.vars <- order(importances, decreasing = TRUE)
##     ordered.importances <- importances[selected.vars]
    
##     initialImportances <- importances
##     initialOrderedImportances <- ordered.importances
    
##     j <- 1
##     n.vars[j] <- dim(xdata)[2] 
##     vars[j] <- paste(colnames(xdata), collapse = " + ")
##     OOB.rf[j] <- m.iterated.ob.error
##     OOB.sd[j] <- sd.iterated.ob.error

##     var.simplify <- TRUE
    
##     while(var.simplify) {
##         last.rf <- rf
##         last.vars <- selected.vars
## #        print(paste(".........Number of variables before selection",
## #                    dim(xdata)[2])) ## debug
##         previous.m.error <- m.iterated.ob.error
##         previous.sd.error <- sd.iterated.ob.error

##         if(recompute.var.imp & (j > 1)) {
##             ## need to set indexes as absolute w.r.t. original data
## ####            tmp.order <- order(rf$importance[, (ncol(rf$importance) - 1)],
## ####                                    decreasing = TRUE)
## ####            selected.vars <- selected.vars[tmp.order]
## ####            ordered.importance <- rf$importance[tmp.order, (ncol(rf$importance) -1)]

##             importances <- importance(rf, type = 1, scale = FALSE)
##             tmp.order <- order(importances, decreasing = TRUE)
##             selected.vars <- selected.vars[tmp.order]
##             ordered.importances <- importances[tmp.order]

##         }
        
##         num.vars <- length(selected.vars)
  
## ####        if(any(is.na(ordered.importances))) {
## ####            print("**********  Nas in ordered.importances ******")
## ####            browser()
## ####        }
           
##         if(any(ordered.importances < 0)) {
##             selected.vars <- selected.vars[-which(ordered.importances < 0)]
##             ordered.importances <- ordered.importances[-which(ordered.importances < 0)]
##         } else {
##             if(is.null(vars.drop.num))
##                 vars.drop <- round(num.vars * vars.drop.frac)
##             else vars.drop <- vars.drop.num
                
##             if(num.vars >= (vars.drop + 2)) {
##                 selected.vars <- selected.vars[1: (num.vars - vars.drop)]
##                 ordered.importances <- ordered.importances[1: (num.vars - vars.drop)]
##             }
##             else {
##                 selected.vars <- selected.vars[1:2]
##                 ordered.importances <- ordered.importances[1:2]
##             }
##         }
##         ## couldn't we eliminate the following?
##         if((length(selected.vars) < 2) | (any(selected.vars < 1))) {
##             var.simplify <- FALSE
##             break
##         }

##         if(length(selected.vars) <= 2) var.simplify <- FALSE
      
##         if(recompute.var.imp) 
##             rf <- randomForest(x = xdata[, selected.vars], y = Class, importance= TRUE,
##                                ntree = ntree, mtry = mtry, keep.forest = FALSE)
##         else
##             rf <- randomForest(x = xdata[, selected.vars], y = Class, importance= FALSE,
##                                ntree = ntree, mtry = mtry, keep.forest = FALSE)
        
##         m.iterated.ob.error <- oobError(rf)
##         sd.iterated.ob.error <-
##             sqrt(m.iterated.ob.error * (1 - m.iterated.ob.error) * (1/num.subjects))
        
##         if(verbose) {
##             print(paste("..... iteration ", j, "; OOB error: mean = ",
##                         round(m.iterated.ob.error, 4),
##                         "; sd = ", round(sd.iterated.ob.error, 4), sep = ""))
##         }
##         j <- j + 1

        
##         n.vars[j] <- length(selected.vars)
##         vars[j] <- paste(colnames(xdata)[selected.vars],
##                          collapse = " + ")
##         OOB.rf[j] <- m.iterated.ob.error
##         OOB.sd[j] <- sd.iterated.ob.error


##         if(!whole.range &
##            (
##             (m.iterated.ob.error >
##              (m.initial.ob.error + c.sd*sd.initial.ob.error))
##             |
##             (m.iterated.ob.error >
##              (previous.m.error + c.sd*previous.sd.error)))
##            )
##             var.simplify <- FALSE
##     }

##     if (!whole.range) {
##         if(!is.null(colnames(xdata)))
##             selected.vars <- sort(colnames(xdata)[last.vars])
##         else
##             selected.vars <- last.vars

##         out <- list(selec.history = data.frame(
##                     Number.Variables = n.vars,
##                     Vars.in.Forest = vars,
##                     OOB = OOB.rf,
##                     sd.OOB = OOB.sd)[1:j,],
##                     rf.model = last.rf,
##                     selected.vars = selected.vars,
##                     selected.model =  paste(selected.vars, collapse = " + "),
##                     best.model.nvars = length(selected.vars),
##                     initialImportances = initialImportances,
##                     initialOrderedImportances = initialOrderedImportances,
##                     ntree = ntree,
##                     mtry = mtry,
##                     firstForest = FirstForest)
##         class(out) <- "varSelRF"
##         return(out)
##     }
##     else {
##         n.vars <- n.vars[1:j]
##         vars <- vars[1:j]
##         OOB.rf<- OOB.rf[1:j]
##         OOB.sd <- OOB.sd[1:j]
##         ##browser()
##         min.oob.ci <- min(OOB.rf) + c.sd * OOB.sd[which.min(OOB.rf)]
##         best.pos <-
##             which(OOB.rf <= min.oob.ci)[which.min(n.vars[which(OOB.rf <= min.oob.ci)])]

##         selected.vars <- sort(unlist(strsplit(vars[best.pos],
##                                               " + ", fixed = TRUE)))
##         out <- list(selec.history = data.frame(
##                     Number.Variables = n.vars,
##                     Vars.in.Forest = vars,
##                     OOB = OOB.rf,
##                     sd.OOB = OOB.sd),
##                     rf.model = NA,
##                     selected.vars = selected.vars,
##                     selected.model = paste(selected.vars, collapse = " + "),
##                     best.model.nvars = n.vars[best.pos],
##                     initialImportances = initialImportances,
##                     initialOrderedImportances = initialOrderedImportances,
##                     ntree = ntree,
##                     mtry = mtry,
##                     firstForest = FirstForest)
##         class(out) <- "varSelRF"
##         return(out)
##     }
## }
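
## A minimal usage sketch, per the FIXME above, for when ExperimentalvarSelRF
## (and tune2RF below) are uncommented; the data are simulated, and the call
## also assumes the non-exported oobError helper is in scope:
#### library(randomForest)
#### x <- matrix(rnorm(40 * 25), ncol = 25); colnames(x) <- paste("v", 1:25, sep = "")
#### cl <- factor(rep(c("A", "B"), each = 20))
#### evs <- ExperimentalvarSelRF(x, cl, vars.drop.frac = 0.5,
####                             tuneMtry = TRUE, tuneNtree = FALSE,
####                             startNtree = 500, ntreeTry = 500)
#### evs$selected.vars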



## FIXME: uncomment, but fix global function uses. And add examples.

## tune2RF <- function(x, y, tuneMtry = TRUE, tuneNtree = TRUE,
##                     startNtree = 1000, startMtryFactor = 1,
##                     stepFactorMtry = 1.25,
##                     stepFactorNtree = 1.75,
##                     minCorNtree = 0.9,
##                     quantNtree = 0.025, ## with 4000, look at first 100
##                     returnForest = TRUE,
##                     ntreeTry = 2000,
##                     verbose = TRUE,
##                     plot = FALSE,
##                     ...) {
##     ## if tuneMtry = TRUE and tuneNtree = TRUE,
##     ##        ntree is used for tuneRF, mtryFactor is ignored
##     ## if tuneMtry = TRUE and tuneNtree = FALSE
##     ##        ntree used for tuneRF and final ntree, and mtry factor is ignored
##     ## if tuneMtry = FALSE and tuneRF = TRUE,
##     ##        ntree is used only as starting value for search of ntree and
##     ##        mtryFactor is used for mtry
##     ## if tuneMtry = FALSE and tuneRF = FALSE,
##     ##        mtryFactor and ntree are the ntree and mtry used.

##     if(plot) {
##         op <- par(mfrow = c(1,2))
##         on.exit(par(op))
##     }
##     if(tuneMtry) {
##         tunedMtry <- tuneRF(x, y, stepFactor = stepFactorMtry,
##                             ntreeTry = ntreeTry,
##                             mtryStart = floor(sqrt(ncol(x)) * startMtryFactor),
##                             doBest = FALSE,
##                             plot = plot,
##                             trace = verbose)
##         tunedMtry <-
##             tunedMtry[which.min(tunedMtry[, 2]), 1]
##     } else {
##         tunedMtry <- floor(sqrt(ncol(x)) * startMtryFactor)
##     }

##     tunedNtree <- startNtree
##     if(tuneNtree) {
##         if(verbose)
##             cat("\n Tuning ntree: initial forest construction \n")
##         f1 <- randomForest(x, y, mtry = tunedMtry,
##                            importance = TRUE,
##                            keep.forest = FALSE,
##                            ntree = tunedNtree)
##         repeat {
##             f2 <- randomForest(x, y, mtry = tunedMtry,
##                                importance = TRUE,
##                                keep.forest = FALSE,
##                                ntree = round(stepFactorNtree * tunedNtree))
##             m1 <- cbind(
##                         importance(f1, type = 1, scale = FALSE),
##                         importance(f2, type = 1, scale = FALSE))
##             m1[m1 < quantile(m1, 1-quantNtree)] <- NA
##             m1 <- na.omit(m1)
##             m2 <- m1
##             m2[m2 <= 0] <- NA
##             m2 <- na.omit(m2)
##             mc1 <- cor(m1)
##             mc.rob <- cov.rob(m2, method = "mcd", cor = TRUE)$cor
##             if(verbose) {
##                 cat("\n ... tuning ntree; correlation of importances for successive ntree values\n")
##                 colnames(mc1) <- c(tunedNtree, round(stepFactorNtree * tunedNtree))
##                 rownames(mc1) <- c(tunedNtree, round(stepFactorNtree * tunedNtree))
##                 colnames(mc.rob) <- c(tunedNtree, round(stepFactorNtree * tunedNtree))
##                 rownames(mc.rob) <- c(tunedNtree, round(stepFactorNtree * tunedNtree))
##                 cat("\n         correlation matrix\n")
##                 print(round(mc1, 4))
##                 cat("\n         robust correlation matrix\n")
##                 print(round(mc.rob, 4))
##                 cat(" \n using ", dim(m2)[1], "observations\n")
##                 cat("\n")
##             }
##             if((min(mc1) > minCorNtree) &
##                (min(mc.rob) > minCorNtree)) {
##                 plot(m1[, 1], m1[, 2], xlab = paste("Ntree", tunedNtree),
##                      ylab = paste("Ntree", round(stepFactorNtree * tunedNtree)),
##                      main =
##                      paste("Upper ", quantNtree,
##                            "th quantile of importances", sep = ""))
                     
##                 break
##             }
##             tunedNtree <- round(stepFactorNtree * tunedNtree)
##             f1 <- f2
##         }
##     }
##     if(returnForest) {
##         if(tuneNtree)
##             return(f1)
##         else
##             return(randomForest(x, y, mtry = tunedMtry,
##                                 importance = TRUE,
##                                 keep.forest = FALSE,
##                                 ntree = tunedNtree)
##                    )
##     }
##     else {
##         return(c(tunedMtry = tunedMtry, tunedNtree = tunedNtree))
##     }
## }
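
## A minimal usage sketch, per the FIXME above, for when tune2RF is
## uncommented (simulated data; with tuneNtree = TRUE the robust-correlation
## step additionally needs cov.rob from MASS):
#### library(randomForest)
#### set.seed(1)
#### x <- matrix(rnorm(40 * 25), ncol = 25); colnames(x) <- paste("v", 1:25, sep = "")
#### cl <- factor(rep(c("A", "B"), each = 20))
#### tuned <- tune2RF(x, cl, tuneMtry = TRUE, tuneNtree = FALSE,
####                  startNtree = 500, ntreeTry = 500, returnForest = FALSE)
#### tuned   ## named vector: tunedMtry, tunedNtree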




boot.imp <- function(data, class, ntree = 20000, B = 200) {
## Just a sanity check that the output from the pg. plots is correct.
    N <- length(class)
    mat.out <- matrix(NA, nrow = dim(data)[2], ncol = B)
    for(i in 1:B) {
        sample.again <- TRUE
        while(sample.again) {
            bootsample <- unlist(tapply(1:N,  class,
                                        function(x)
                                        sample(x, size = length(x),
                                               replace = TRUE)))
            ## sure, this isn't the fastest, but will do for now.
            nobootsample <- setdiff(1:N, bootsample)
            if(!length(nobootsample))
                sample.again <- TRUE
            else sample.again <- FALSE
        }
        ## The resample-until-nonempty loop above is an ugly hack to prevent
        ## 'nobootsample' sets of size 0.
        train.data <- data[bootsample, , drop = FALSE]
        train.class <- class[bootsample]
        rftmp <- randomForest(x = train.data, y = train.class,
                              ntree = ntree,
                              keep.forest = FALSE,
                              importance = TRUE)
        mat.out[, i] <- importance(rftmp, type = 1, scale = FALSE)
    }
    return(mat.out)
}
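
## A small, runnable sketch of boot.imp on simulated data (tiny ntree and B
## only to keep it quick; variable and object names here are illustrative):
#### library(randomForest)
#### set.seed(2)
#### x <- matrix(rnorm(40 * 15), ncol = 15); colnames(x) <- paste("v", 1:15, sep = "")
#### cl <- factor(rep(c("A", "B"), each = 20))
#### b.i <- boot.imp(x, cl, ntree = 500, B = 10)
#### dim(b.i)   ## one row per variable, one column per bootstrap sample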


####gl.b.i <- boot.imp(gl.data, gl.class, B = 20)
        
####pairs(cbind(gl.b.i, importance(gl.20000.rf, type = 1, scale = FALSE)),
####      pch = ".")

####pairs(cbind(gl.b.i[, 1:10], importance(gl.20000.rf, type = 1, scale = FALSE)),
####      pch = ".")

####summary(cbind(gl.b.i, importance(gl.20000.rf, type = 1, scale = FALSE)))
####boxplot(data.frame(cbind(gl.b.i, importance(gl.20000.rf, type = 1, scale = FALSE))))




#####f.mtr <- function(x, mf = 3) {
#####  x2 <- floor(sqrt(x) * mf)
#####  x2[x2 > x] <- x[x2 > x]
#####  return(x2)
#####}
#####plot(x = sqrt(2:200), f.mtr(2:200, mf = 5), type = "l")