R/rf.classBalance.R
In rfUtilities: Random Forests Model Selection and Performance Evaluation

#' @title Random Forest Class Balance (Zero Inflation Correction) Model
#' @description Implements Evans & Cushman (2008) Random Forests class-balance (zero inflation) modeling approach. 
#'                                                                                                                                                                                                    
#' @param ydata              Response variable using index (i.e., [,2] or [,"SPP"] )                        
#' @param xdata              Independent variables using index (i.e., [,3:14] or [3:ncol(data)] )
#' @param p                  p-value of covariance convergence (do not recommend changing)
#' @param cbf                Scaling factor to test if problem is imbalanced, default is size of majority class * 3
#' @param sf                 Majority subsampling factor. If sf=1 then random sample would be perfectly balanced with smallest class [s|0=n|1] whereas; sf=2 provides [s|0=(n|1*2)]
#' @param seed               Sets random seed in R global environment
#' @param ...                Additional arguments passed to randomForest
#'  
#' @return A rf.balanced object with the following components:
#'  @return   model          Final Combined Random Forests ensemble (randomForest object)
#'  @return   OOB.error      Out-of-bag error for each model (vector)
#'  @return   confusion      Confusion matrix for each model (list)
#'
#' @note
#' This approach runs independent Random Forest models using random subsets of the majority class until covariance convergences on full data. The final model is obtained by combining independent ensembles.  
#'
#' @author Jeffrey S. Evans   <jeffrey_evans<at>tnc.org>
#'
#' @references Evans, J.S. and S.A. Cushman (2009) Gradient Modeling of Conifer Species Using Random Forest. Landscape Ecology 5:673-683.
#' @references Evans J.S., M.A. Murphy, Z.A. Holden, S.A. Cushman (2011). Modeling species distribution and change using Random Forests CH.8 in Predictive Modeling in Landscape Ecology eds Drew, CA, Huettmann F, Wiersma Y. Springer 
#'                                                                       
#' @examples 
#' require(randomForest)
#' data(iris)
#'   iris$Species <- as.character(iris$Species)
#'     iris$Species <- ifelse(iris$Species == "setosa", "virginica", iris$Species)
#'       iris$Species <- as.factor(iris$Species)	
#' 	
#' # Percent of "virginica" observations
#' length( iris$Species[iris$Species == "virginica"] ) / dim(iris)[1]*100
#' 
#' # Balanced model	
#' ( cb <- rf.classBalance( ydata=iris[,"Species"], xdata=iris[,1:4], cbf=1 ) )
#'
#' # Calculate Kappa for each balanced model in ensemble 
#' for(i in 1:length(cb$confusion) ) { 
#'   print( accuracy(cb$confusion[[i]][,1:2])[5] ) 
#' }
#'
#' # Evaluate cumulative and mean confusion matrix
#' accuracy( round((cb$confusion[[1]] + cb$confusion[[2]] + cb$confusion[[3]]))[,1:2] )
#' accuracy( round((cb$confusion[[1]] + cb$confusion[[2]] + cb$confusion[[3]])/3)[,1:2])
#'
#' @seealso \code{\link[randomForest]{randomForest}} for randomForest ... model options
#'
#' @export
rf.classBalance <- function (ydata, xdata, p=0.005, cbf=3, sf=2, seed=NULL, ...) 
 {
  if (  class(ydata) != "factor" ) { ydata <- as.factor(ydata) }
    if(!is.null(seed)) { set.seed(seed) }
  CompCov <- function(m1, m2, pVal=p) {
       k = 2
        p = 2
         n1 = dim(m1)[1]
          n2 = dim(m2)[1] 
           n = n1 + n2
            s1 <- crossprod(m1[1:dim(m1)[1]])
             s2 <- crossprod(m2[1:dim(m2)[1]])
              c1 = (1/(n1-1)) * s1
              c2 = (1/(n2-1)) * s2
             c3 = (s1+s2)/(n-k)
            d = det(c3)
            d1 = det(c1)
           d2 = det(c2) 
          m = ( (n - k) * log(d) ) - ( (n1 - 1) * log(d1) + (n2 - 1) * log(d2) )
         h = 1 - ((2 * p * p + 3 * p - 1) / (6 * (p + 1) * (k - 1)) * 
		         (1 / (n1 - 1) + 1 / (n2 - 1) + 1 / (n - k)))
        chi = round(abs(m * h),digits=6)
        dfree = p * (p + 1) * (k - 1) / 2
        print( paste("EQUIVALENCE p", chi, sep=": ") )
      if ( (chi <= pVal ) == TRUE & (i > 2) |  (i > 20)  == TRUE ) { 
        ( "TRUE" )
      } else {
        ( "FALSE" ) 
      }
    }  		 
    y <- ydata
    x <- xdata  		 		 
    class.ct <- table(y)
    maj.class <- names(class.ct)[which.max(class.ct)]; maj.idx <- which.max(class.ct) 
    min.class <- names(class.ct)[which.min(class.ct)]; min.idx <- which.min(class.ct)  
      if ( ( class.ct[maj.idx] <= class.ct[min.idx] * cbf ) == TRUE) 
        stop("CLASSES ARE BALANCED!")  	 
    tmp.data <- data.frame(y, x)
	  majority <- tmp.data[tmp.data[,"y"] == maj.class ,]       
      minority <- tmp.data[tmp.data[,"y"] == min.class ,]    
	    all.cov <- stats::cov(majority[,names(x)])     
	      test <- as.data.frame(array(0, dim=c( 0, dim(tmp.data)[2] )))
    names(test) <- names(majority) 
      if ( !is.na(match("rf.model",ls()))) rm(rf.model)
        n <- dim(minority)[1] * sf                 
    i=0; converge = c("FALSE")
    confusion <- list()
    validation <- list()
	OOB <- vector()
      while (converge != "TRUE" )
       {
       i=i+1
        ns <- sample(1:nrow(majority), n) 
        class.sample <- majority[ns, ]
        mdata <- rbind(minority, class.sample)   
          if (  class(mdata[,1]) != "factor" ) { mdata[,1] <- as.factor(mdata[,1]) }
          if ( !is.na(match("rf.model",ls()))) {               
            rf.fit <- randomForest::randomForest(x=mdata[,2:ncol(mdata)], y=mdata[,1], ...)                           
            rf.model <- randomForest::combine(rf.fit, rf.model)           
            OOB[i] <- stats::median(rf.fit$err.rate[,1])  
            confusion[[i]] <- rf.fit$confusion	
          } else {
            rf.model <- randomForest::randomForest(x=mdata[,2:ncol(mdata)], y=mdata[,1], ...)  
            OOB[i] <- stats::median(rf.model$err.rate[,1])  
            confusion[[i]] <- rf.model$confusion	                               
          }
        test <- rbind(test, class.sample)    
        test.cov <- stats::cov( test[,names(x)] )
        converge <- CompCov(all.cov, test.cov)  
    }
	rf.model$y <- ydata
      bal.mdl <- list( model=rf.model, OOB.error=OOB, confusion=confusion )
    class( bal.mdl ) <- c("rf.balanced", "list")	
  return( bal.mdl )  
}

Any scripts or data that you put into this service are public.

rfUtilities documentation built on Oct. 3, 2019, 9:04 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

rfUtilities
Random Forests Model Selection and Performance Evaluation

R/rf.classBalance.R
In rfUtilities: Random Forests Model Selection and Performance Evaluation

Try the rfUtilities package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rfUtilities Random Forests Model Selection and Performance Evaluation

R/rf.classBalance.R In rfUtilities: Random Forests Model Selection and Performance Evaluation

Try the rfUtilities package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rfUtilities
Random Forests Model Selection and Performance Evaluation

R/rf.classBalance.R
In rfUtilities: Random Forests Model Selection and Performance Evaluation