R/utilities.subsample.bootstrap.R

Defines functions print.bootsample.rfsrc extract.bootsample bootsample

Documented in extract.bootsample print.bootsample.rfsrc

###################################################################
##
## core bootstrap function
##
###################################################################
bootsample <- function(obj, rf.prms, B, block.size, joint, xvar.names,
                       importance, performance, performance.only, verbose) {
  ##--------------------------------------------------------------
  ##
  ## extract data from the previously grown forest
  ## set the dimensions, pull the feature names
  ##
  ##--------------------------------------------------------------
  dta <- data.frame(obj$yvar, obj$xvar)
  colnames(dta)[1:length(obj$yvar.names)] <- obj$yvar.names
  n <- nrow(obj$xvar)
  ##--------------------------------------------------------------
  ##
  ## call vimp to extract importance if not available in original object
  ##
  ##--------------------------------------------------------------
  vmp <- get.mv.vimp(obj, FALSE, FALSE)
  if (!performance.only) {
    if (is.null(vmp)) {
      if (verbose) cat("no importance found: calculating it now ...\n")
      obj <- vimp(obj, perf.type = rf.prms$perf.type, block.size = block.size, importance = importance)
      vmp <- get.mv.vimp(obj, FALSE, FALSE)
      rf.prms$block.size <- block.size
      if (verbose) cat("done\n")
    }
    else {
      importance <- obj$forest$importance
    }
  }
  ##--------------------------------------------------------------
  ##
  ## call joint vimp - reference vimp value for pure noise settings
  ##
  ##--------------------------------------------------------------
  if (joint && !performance.only) {
    vmp.joint <- get.mv.vimp(get.subsample.joint.vimp(obj, rf.prms, xvar.names), FALSE, FALSE)
    vmp <- vimp.subsample.combine(vmp, vmp.joint)
  }
  ##--------------------------------------------------------------
  ##
  ## obtain performance value - used for error rate confidence intervals
  ##
  ##--------------------------------------------------------------
  if (performance) {
    err <- get.mv.error(obj, FALSE, FALSE)
    vmp <- vimp.subsample.combine(vmp, err, "err")
  }
  ##----------------------------------------------------------
  ##
  ## bootstrap loop for calculating VIMP confidence regions
  ##
  ##----------------------------------------------------------
  vmpB <- lapply(1:B, function(b) {
    ## progress bar
    if (verbose && B > 1) {
      custom.progress.bar(b, B)
    }
    ##---------------------------------------------------
    ##
    ## double bootstrap 
    ##
    ##---------------------------------------------------
    ## double bootstrap sample
    smp <- sample(1:n, size = n, replace = TRUE)
    smp.unq <- sort(unique(smp))
    dbl.smp <- make.double.boot.sample(rf.prms$ntree, n, smp,
                             size = rf.prms$sampsize, replace = rf.prms$samptype == "swr")
    ## calculate VIMP from the double boostrap forest
    rf.b <- do.call("rfsrc",
                    c(list(data = dta[smp.unq,, drop = FALSE], importance = importance,
                           bootstrap = "by.user", samp = dbl.smp), rf.prms))
    vmp.b <- get.mv.vimp(rf.b, FALSE, FALSE)
    if (joint && !performance.only) {
      vmp.b.joint <- get.mv.vimp(get.subsample.joint.vimp(rf.b, rf.prms, xvar.names), FALSE, FALSE)
      vmp.b <- vimp.subsample.combine(vmp.b, vmp.b.joint)
    }
    if (performance) {
      err.b <- get.mv.error(rf.b, FALSE, FALSE)
      vmp.b <- vimp.subsample.combine(vmp.b, err.b, "err")
    }
    ## combine
    rO.b <- lapply(1:length(vmp), function(j) {
      vmp.j <- vmp[[j]]
      vmp.b.j <- vmp.b[[j]]
      vmp.mtx <- data.frame(matrix(NA, nrow(vmp.j), ncol(vmp.j)))
      rownames(vmp.mtx) <- rownames(vmp.j)
      colnames(vmp.mtx) <- colnames(vmp.j)
      if (ncol(vmp.b.j) == ncol(vmp.j)) {
        vmp.mtx[rownames(vmp.b.j), ] <- vmp.b.j
        vmp.b.j <- vmp.mtx
      }
      vmp.b.j
    })
    names(rO.b) <- names(vmp)
    rO.b
  })
  ## return the potentially updated forest object
  ## return the vimp 
  list(rf = obj, vmp = vmp, vmpB = vmpB)
}
###################################################################
##
## extract confidence interval + other goodies from bootstrap object
##
###################################################################
extract.bootsample <- function(obj, alpha = .05, target = 0, m.target = NULL, standardize = TRUE, raw = FALSE)
{
  ## confirm this is the right kind of object
  if (!sum(c(grepl("rfsrc", class(obj))), grepl("bootsample", class(obj))) == 2) {
    stop("this must be a double-bootstrap object obtained by calling the 'subsample' function")
  }
  ## coerce the (potentially) multivariate rf object
  m.target <- get.univariate.target(obj$rf, m.target)
  obj$rf <- coerce.multivariate(obj$rf, m.target)
  ## coerce the (potentially) multivariate vimp object
  if (is.null(m.target)) {
    obj$vmpB <- lapply(obj$vmpB, data.frame)
  }
  else {
    obj$vmpB <- lapply(obj$vmpB, function(oo) {
      oo[[m.target]]
    })
  }  
  ## extract vimp
  theta.boot <- get.subsample.standardize.vimp(obj$rf, rbind(sapply(obj$vmpB, "[[", 1 + target)), standardize)
  rownames(theta.boot) <- rownames(obj$vmp[[1]][, 1 + target, drop = FALSE])
  ## bootstrap VIMP
  vmp.boot <- rowMeans(theta.boot, na.rm = TRUE)
  ## bootstrap standard error
  se.boot <- apply(theta.boot, 1, sd, na.rm = TRUE)
  ## nonparametric bootstrap confidence interval
  ci.boot <- do.call(cbind, lapply(1:length(vmp.boot), function(rowj) {
    c(quantile(theta.boot[rowj,], alpha/2, na.rm = TRUE),
      quantile(theta.boot[rowj,], .25, na.rm = TRUE),
      vmp.boot[rowj],
      quantile(theta.boot[rowj,], .75, na.rm = TRUE),
      quantile(theta.boot[rowj,], 1 - alpha/2, na.rm = TRUE))
  }))
  rownames(ci.boot) <- paste(round(100 * c(alpha/2,.25,.50,.75,1-alpha/2), 1), "%", sep = "")
  colnames(ci.boot) <- names(vmp.boot)
  ## parametric bootstrap confidence interval
  ci.boot.Z <- do.call(cbind, lapply(1:length(vmp.boot), function(rowj) {
    c(vmp.boot[rowj] - qnorm(1 - alpha/2) * se.boot[rowj],
      vmp.boot[rowj] - qnorm(.75) * se.boot[rowj],
      vmp.boot[rowj],
      vmp.boot[rowj] + qnorm(.75) * se.boot[rowj],
      vmp.boot[rowj] + qnorm(1 - alpha/2) * se.boot[rowj])
  }))
  rownames(ci.boot.Z) <- paste(round(100 * c(alpha/2,.25,.50,.75,1-alpha/2), 1), "%", sep = "")
  colnames(ci.boot.Z) <- names(vmp.boot)
  ## bootstrap variable selection object
  var.sel.boot <- data.frame(lower  =  ci.boot[1, ],
                             mean   =  ci.boot[3, ],
                             upper  =  ci.boot[5, ],
                             signif =  ci.boot[1, ] > 0)
  rownames(var.sel.boot) <- names(vmp.boot)
  var.sel.boot.Z <- data.frame(lower  =  ci.boot.Z[1, ],
                               mean   =  ci.boot.Z[3, ],
                               upper  =  ci.boot.Z[5, ],
                               pvalue = pnorm(vmp.boot, 0, se.boot, FALSE),
                               signif =  ci.boot.Z[1, ] > 0)
  rownames(var.sel.boot.Z) <- names(vmp.boot)
  if (raw) {
    list(ci = ci.boot,
         ci.Z = ci.boot.Z,
         vmp = vmp.boot,
         vmpS = theta.boot,
         se = se.boot,
         var.sel = var.sel.boot,
         var.sel.Z = var.sel.boot.Z)
  }
  else {
    list(se = se.boot,
         var.sel = var.sel.boot,
         var.sel.Z = var.sel.boot.Z)
  }
}
###################################################################
##
## print function from bootstrap ci
##
###################################################################
print.bootsample.rfsrc <- function(x, alpha = .05, standardize = TRUE, ...) {
  vmp <- x$vmp
  nullO <- lapply(1:length(vmp), function(j) {
    m.target <- names(vmp)[j]
    p.vmp <- ncol(vmp[[j]])
    vmp.col.names <- colnames(vmp[[j]])
    if (length(vmp) > 1) {
      cat("processing a multivariate family, outcome:", m.target, "\n")
    }
    lapply(1:p.vmp, function(k) {
      oo <- extract.bootsample(x, alpha = alpha, target = k - 1,
                   m.target = m.target, standardize = standardize, raw = TRUE)
      cat("===== VIMP confidence regions for", vmp.col.names[k], " =====\n")
      cat("nonparametric:\n")
      print(round(oo$ci, 3))
      cat("parametric:\n")
      print(round(oo$ci.Z, 3))
      NULL
    })
  })
}
print.bootsample <- print.bootsample.rfsrc

Try the randomForestSRC package in your browser

Any scripts or data that you put into this service are public.

randomForestSRC documentation built on June 25, 2024, 5:08 p.m.