R/rf.classBalance.R

#' @title Random Forest Class Balance (Zero Inflation Correction) Model
#' @description Implements Evans & Cushman (2008) Random Forests class-balance 
#'              (zero inflation) modeling approach. 
#'                                                                                                                                                                                                    
#' @param ydata              Response variable using index (i.e., [,2] or [,"SPP"] )                        
#' @param xdata              Independent variables using index (i.e., [,3:14] or 
#'                           [3:ncol(data)] )
#' @param p                  p-value of covariance convergence (do not recommend changing)
#' @param cbf                Scaling factor to test if problem is imbalanced, default is 
#'                           size of majority class * 3
#' @param sf                 Majority subsampling factor. If sf=1 then random sample would 
#'                           be perfectly balanced with smallest class [s|0=n|1] whereas; 
#'                           sf=2 provides [s|0=(n|1*2)]
#' @param seed               Sets random seed in R global environment
#' @param ...                Additional arguments passed to randomForest
#'  
#' @return A rf.balanced object with the following components:
#'   \itemize{ 
#'   \item model = [Final Combined Random Forests ensemble (randomForest object)]
#'   \item OOB.error = [averaged out-of-bag error for each model]
#'   \item confusion = [averaged confusion matrix for each model]
#'   }
#'
#' @details
#' This approach runs independent Random Forest models using random subsets of 
#' the majority class until covariance convergences on full data. The final model 
#' is obtained by combining independent ensembles.  
#'
#' @author Jeffrey S. Evans   <jeffrey_evans<at>tnc.org>
#'
#' @references 
#' Evans, J.S. and S.A. Cushman (2009) Gradient Modeling of Conifer Species 
#'   Using Random Forest. Landscape Ecology 5:673-683.
#' @references 
#' Evans J.S., M.A. Murphy, Z.A. Holden, S.A. Cushman (2011). Modeling species 
#'   distribution and change using Random Forests CH.8 in Predictive Modeling in 
#'   Landscape Ecology eds Drew, CA, Huettmann F, Wiersma Y. Springer 
#'                                                                       
#' @examples 
#' require(randomForest)
#' data(iris)
#'   iris$Species <- as.character(iris$Species)
#'     iris$Species <- ifelse(iris$Species == "setosa", "virginica", iris$Species)
#'       iris$Species <- as.factor(iris$Species)	
#' 	
#' # Percent of "virginica" observations
#' length( iris$Species[iris$Species == "virginica"] ) / dim(iris)[1]*100
#' 
#' # Balanced model	
#' ( cb <- rf.classBalance( ydata=iris[,"Species"], xdata=iris[,1:4], cbf=1 ) )
#'
#' # Calculate Kappa for each balanced model in ensemble 
#' for(i in 1:length(cb$confusion) ) { 
#'   print( accuracy(cb$confusion[[i]][,1:2])[5] ) 
#' }
#'
#' # Evaluate cumulative and mean confusion matrix
#' accuracy( round((cb$confusion[[1]] + cb$confusion[[2]] + cb$confusion[[3]]))[,1:2] )
#' accuracy( round((cb$confusion[[1]] + cb$confusion[[2]] + cb$confusion[[3]])/3)[,1:2])
#'
#' @seealso \code{\link[randomForest]{randomForest}} for randomForest ... model options
#'
#' @export
rf.classBalance <- function (ydata, xdata, p=0.005, cbf=3, sf=2, seed=NULL, ...) 
 {
  if (  class(ydata) != "factor" ) { ydata <- as.factor(ydata) }
    if(!is.null(seed)) { set.seed(seed) }
  CompCov <- function(m1, m2, pVal=p) {
       k = 2
        p = 2
         n1 = dim(m1)[1]
          n2 = dim(m2)[1] 
           n = n1 + n2
            s1 <- crossprod(m1[1:dim(m1)[1]])
             s2 <- crossprod(m2[1:dim(m2)[1]])
              c1 = (1/(n1-1)) * s1
              c2 = (1/(n2-1)) * s2
             c3 = (s1+s2)/(n-k)
            d = det(c3)
            d1 = det(c1)
           d2 = det(c2) 
          m = ( (n - k) * log(d) ) - ( (n1 - 1) * log(d1) + (n2 - 1) * log(d2) )
         h = 1 - ((2 * p * p + 3 * p - 1) / (6 * (p + 1) * (k - 1)) * 
		         (1 / (n1 - 1) + 1 / (n2 - 1) + 1 / (n - k)))
        chi = round(abs(m * h),digits=6)
        dfree = p * (p + 1) * (k - 1) / 2
        print( paste("EQUIVALENCE p", chi, sep=": ") )
      if ( (chi <= pVal ) == TRUE & (i > 2) |  (i > 20)  == TRUE ) { 
        ( "TRUE" )
      } else {
        ( "FALSE" ) 
      }
    }  		 
    y <- ydata
    x <- xdata  		 		 
    class.ct <- table(y)
    maj.class <- names(class.ct)[which.max(class.ct)]; maj.idx <- which.max(class.ct) 
    min.class <- names(class.ct)[which.min(class.ct)]; min.idx <- which.min(class.ct)  
      if ( ( class.ct[maj.idx] <= class.ct[min.idx] * cbf ) == TRUE) 
        stop("CLASSES ARE BALANCED!")  	 
    tmp.data <- data.frame(y, x)
	  majority <- tmp.data[tmp.data[,"y"] == maj.class ,]       
      minority <- tmp.data[tmp.data[,"y"] == min.class ,]    
	    all.cov <- stats::cov(majority[,names(x)])     
	      test <- as.data.frame(array(0, dim=c( 0, dim(tmp.data)[2] )))
    names(test) <- names(majority) 
      if ( !is.na(match("rf.model",ls()))) rm(rf.model)
        n <- dim(minority)[1] * sf                 
    i=0; converge = c("FALSE")
    confusion <- list()
    validation <- list()
	OOB <- vector()
      while (converge != "TRUE" )
       {
       i=i+1
        ns <- sample(1:nrow(majority), n) 
        class.sample <- majority[ns, ]
        mdata <- rbind(minority, class.sample)   
          if (  class(mdata[,1]) != "factor" ) { mdata[,1] <- as.factor(mdata[,1]) }
          if ( !is.na(match("rf.model",ls()))) {               
            rf.fit <- randomForest::randomForest(x=mdata[,2:ncol(mdata)], y=mdata[,1], ...)                           
            rf.model <- randomForest::combine(rf.fit, rf.model)           
            OOB[i] <- stats::median(rf.fit$err.rate[,1])  
            confusion[[i]] <- rf.fit$confusion	
          } else {
            rf.model <- randomForest::randomForest(x=mdata[,2:ncol(mdata)], y=mdata[,1], ...)  
            OOB[i] <- stats::median(rf.model$err.rate[,1])  
            confusion[[i]] <- rf.model$confusion	                               
          }
        test <- rbind(test, class.sample)    
        test.cov <- stats::cov( test[,names(x)] )
        converge <- CompCov(all.cov, test.cov)  
    }
	rf.model$y <- ydata
      bal.mdl <- list( model=rf.model, OOB.error=OOB, confusion=confusion )
    class( bal.mdl ) <- c("rf.balanced")	
  return( bal.mdl )  
}
jeffreyevans/rfUtilities documentation built on Nov. 12, 2023, 6:52 p.m.