#' @title Random Forest Class Balance (Zero Inflation Correction) Model
#' @description Implements Evans & Cushman (2008) Random Forests class-balance
#' (zero inflation) modeling approach.
#'
#' @param ydata Response variable using index (i.e., [,2] or [,"SPP"] )
#' @param xdata Independent variables using index (i.e., [,3:14] or
#' [3:ncol(data)] )
#' @param p p-value of covariance convergence (do not recommend changing)
#' @param cbf Scaling factor to test if problem is imbalanced, default is
#' size of majority class * 3
#' @param sf Majority subsampling factor. If sf=1 then random sample would
#' be perfectly balanced with smallest class [s|0=n|1] whereas;
#' sf=2 provides [s|0=(n|1*2)]
#' @param seed Sets random seed in R global environment
#' @param ... Additional arguments passed to randomForest
#'
#' @return A rf.balanced object with the following components:
#' \itemize{
#' \item model = [Final Combined Random Forests ensemble (randomForest object)]
#' \item OOB.error = [averaged out-of-bag error for each model]
#' \item confusion = [averaged confusion matrix for each model]
#' }
#'
#' @details
#' This approach runs independent Random Forest models using random subsets of
#' the majority class until covariance convergences on full data. The final model
#' is obtained by combining independent ensembles.
#'
#' @author Jeffrey S. Evans <jeffrey_evans<at>tnc.org>
#'
#' @references
#' Evans, J.S. and S.A. Cushman (2009) Gradient Modeling of Conifer Species
#' Using Random Forest. Landscape Ecology 5:673-683.
#' @references
#' Evans J.S., M.A. Murphy, Z.A. Holden, S.A. Cushman (2011). Modeling species
#' distribution and change using Random Forests CH.8 in Predictive Modeling in
#' Landscape Ecology eds Drew, CA, Huettmann F, Wiersma Y. Springer
#'
#' @examples
#' require(randomForest)
#' data(iris)
#' iris$Species <- as.character(iris$Species)
#' iris$Species <- ifelse(iris$Species == "setosa", "virginica", iris$Species)
#' iris$Species <- as.factor(iris$Species)
#'
#' # Percent of "virginica" observations
#' length( iris$Species[iris$Species == "virginica"] ) / dim(iris)[1]*100
#'
#' # Balanced model
#' ( cb <- rf.classBalance( ydata=iris[,"Species"], xdata=iris[,1:4], cbf=1 ) )
#'
#' # Calculate Kappa for each balanced model in ensemble
#' for(i in 1:length(cb$confusion) ) {
#' print( accuracy(cb$confusion[[i]][,1:2])[5] )
#' }
#'
#' # Evaluate cumulative and mean confusion matrix
#' accuracy( round((cb$confusion[[1]] + cb$confusion[[2]] + cb$confusion[[3]]))[,1:2] )
#' accuracy( round((cb$confusion[[1]] + cb$confusion[[2]] + cb$confusion[[3]])/3)[,1:2])
#'
#' @seealso \code{\link[randomForest]{randomForest}} for randomForest ... model options
#'
#' @export
rf.classBalance <- function (ydata, xdata, p=0.005, cbf=3, sf=2, seed=NULL, ...)
{
if ( class(ydata) != "factor" ) { ydata <- as.factor(ydata) }
if(!is.null(seed)) { set.seed(seed) }
CompCov <- function(m1, m2, pVal=p) {
k = 2
p = 2
n1 = dim(m1)[1]
n2 = dim(m2)[1]
n = n1 + n2
s1 <- crossprod(m1[1:dim(m1)[1]])
s2 <- crossprod(m2[1:dim(m2)[1]])
c1 = (1/(n1-1)) * s1
c2 = (1/(n2-1)) * s2
c3 = (s1+s2)/(n-k)
d = det(c3)
d1 = det(c1)
d2 = det(c2)
m = ( (n - k) * log(d) ) - ( (n1 - 1) * log(d1) + (n2 - 1) * log(d2) )
h = 1 - ((2 * p * p + 3 * p - 1) / (6 * (p + 1) * (k - 1)) *
(1 / (n1 - 1) + 1 / (n2 - 1) + 1 / (n - k)))
chi = round(abs(m * h),digits=6)
dfree = p * (p + 1) * (k - 1) / 2
print( paste("EQUIVALENCE p", chi, sep=": ") )
if ( (chi <= pVal ) == TRUE & (i > 2) | (i > 20) == TRUE ) {
( "TRUE" )
} else {
( "FALSE" )
}
}
y <- ydata
x <- xdata
class.ct <- table(y)
maj.class <- names(class.ct)[which.max(class.ct)]; maj.idx <- which.max(class.ct)
min.class <- names(class.ct)[which.min(class.ct)]; min.idx <- which.min(class.ct)
if ( ( class.ct[maj.idx] <= class.ct[min.idx] * cbf ) == TRUE)
stop("CLASSES ARE BALANCED!")
tmp.data <- data.frame(y, x)
majority <- tmp.data[tmp.data[,"y"] == maj.class ,]
minority <- tmp.data[tmp.data[,"y"] == min.class ,]
all.cov <- stats::cov(majority[,names(x)])
test <- as.data.frame(array(0, dim=c( 0, dim(tmp.data)[2] )))
names(test) <- names(majority)
if ( !is.na(match("rf.model",ls()))) rm(rf.model)
n <- dim(minority)[1] * sf
i=0; converge = c("FALSE")
confusion <- list()
validation <- list()
OOB <- vector()
while (converge != "TRUE" )
{
i=i+1
ns <- sample(1:nrow(majority), n)
class.sample <- majority[ns, ]
mdata <- rbind(minority, class.sample)
if ( class(mdata[,1]) != "factor" ) { mdata[,1] <- as.factor(mdata[,1]) }
if ( !is.na(match("rf.model",ls()))) {
rf.fit <- randomForest::randomForest(x=mdata[,2:ncol(mdata)], y=mdata[,1], ...)
rf.model <- randomForest::combine(rf.fit, rf.model)
OOB[i] <- stats::median(rf.fit$err.rate[,1])
confusion[[i]] <- rf.fit$confusion
} else {
rf.model <- randomForest::randomForest(x=mdata[,2:ncol(mdata)], y=mdata[,1], ...)
OOB[i] <- stats::median(rf.model$err.rate[,1])
confusion[[i]] <- rf.model$confusion
}
test <- rbind(test, class.sample)
test.cov <- stats::cov( test[,names(x)] )
converge <- CompCov(all.cov, test.cov)
}
rf.model$y <- ydata
bal.mdl <- list( model=rf.model, OOB.error=OOB, confusion=confusion )
class( bal.mdl ) <- c("rf.balanced")
return( bal.mdl )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.