R/RFdissimilarity.R

#' @title  Unsupervised Random Forest 
# 
#' @description  Compute dissimilarity matrix between observations by training a 
#' randomForest (RF) classifier to 
#' descriminate between the 'original' data and a synthetic version. 
#' The original data is labeled as "True.Data" 
#' while the synthetic data is labeled ``Synthetic.Data". The random forest p
#' roximity matrix between observations in the 
#' original data are then extracted, converted to distance, and returned.  
#' The synthetic data is generated by taking a random sample from each 
#' dimension of the true data, with 
#' or without replacement. 
#'
#' @name RFdist
#
#' @param data data.frame or matrix 
#' @param  mtry mtry in \code{\link[randomForest]{randomForest}} 
#' @param  ntree, number of trees 
#' @param  no.rep number of repetitions or forests
#' @param  syn.type  type of synthetic data generator: "emperical" generate samples from the 
#' emperical distribution of the original data while "permute" takes a permutation 
#' of each dimension. emerical is just sample with 
#' replacement while permute is without replacement   
#' @param  importance (logical) compute variable importance  ? 
#' @param parallel  character vector specifying the type of parallel run: 'forests' - run a total of 
#' \code{no.rep} RF in parallel, 
#' 'trees' - run RF in parallel over ntrees and combine the results, 'no' - serial computation. 
#' @param nodesize  node size in \code{randomForest}
#' @param x object of class \code{\link{RFdist}}
#' @param \dots further arguments passed to \code{\link[randomForest]{randomForest}}.
#' @details
#' Methods 
#' \enumerate{
#' \item \code{print} : print OOB error and convergence summary  
#' \item \code{plot} : plots the convergence of the RF proximities given by the MSE 
#' over number of forest \code{no.rep}
#' } 

#' @return A list with elements:  
#' \enumerate{
#' \item RFdist: RF proximity converted to a distance object
#' \item err: error rate
#' \item UnsupRFvarimp: Unsupervised RF variable importance 
#' \item proxConver: a matrix containing three convergence meausres 
#'                   \enumerate{                        
#'                   \item  Max.prox = max( abs( aveprox(N)- aveprox(N-1)))  
#'                   \item MSE.prox = mean( (aveprox(N)- aveprox(N-1))^2)   
#' 	             \item Mean =  mean(aveprox(N))
#'                   where N is number of forests (no.rep).
#'                    }         
#' }
#
#' @references
#' Tao Shi and Steve Horvath (2006) Unsupervised Learning with Random Forest Predictors. 
#' Journal of Computational and Graphical Statistics. Volume 15, Number 1, March 2006, pp. 118-138(21)
#
#' @import foreach 
#' @importFrom randomForest randomForest combine 
#' @importFrom iterators idiv
NULL
#' @exportPattern "."
#' @rdname RFdist
#' @export
RFdist <- function(data, ...) UseMethod("RFdist")
#' @rdname RFdist
#' @export
#' @examples
#' \dontrun{
#' set.seed(12345)
#' data(iris)
#' dat <- iris[, -5]
#' RF.dist <- RFdist(data=dat, ntree = 10, no.rep=20, syn.type = "permute", 
#'                importance=TRUE, parallel = "no")
#' # 
#' print(RF.dist, digits = 3)
#' #  
#' plot(RF.dist)
#  #             
#' # plot variable importance 
#' UnsupRFvarImpPlot(RF.dist, sort=TRUE)
#' }
#
# calcuate RF dissimilarity matrix: serial and parallel  
RFdist.default <- function(data, mtry = floor(sqrt(ncol(data))), ntree, no.rep, 
                      syn.type = "emperical", importance=FALSE, nodesize = 1, 
                      parallel = c("forests", "trees", "no"), ...) {

if (parallel == "forests") '%cols%' <- get('%dopar%')  ## parallel over no.rep or forest
else if(parallel =="no")   '%cols%' <- get('%do%')  ## sequential 
xntree <- 0  

    synthetic.emperical <- function(dat) {
        samp <- function(X)   { sample(X, replace=T) } 
        g1      <- function(dat) { apply(dat,2,samp) }
        nrow1 <- dim(dat)[[1]];
        yy <- rep(c(1,2),c(nrow1,nrow1) );
        data.frame(cbind(yy,rbind(dat,data.frame(g1(dat)))))
    }
    
    synthetic.permute <- function(dat) {
        samp <- function(X)   { sample(X, size = length(X), replace = FALSE) } 
        g1      <- function(dat) { apply(dat,2, samp) }
        nrow1 <- dim(dat)[[1]];
        yy <- rep(c("True.Data","Synthetic.Data"),c(nrow1,nrow1) );
        data.frame(cbind(yy,rbind(dat,data.frame(g1(dat)))))
    }

    nrow1 <- dim(data)[[1]]
    ncol1 <- dim(data)[[2]]
    RFproxAddcl <- matrix(0,nrow=nrow1,ncol=nrow1)
    RFproxConver <- cbind(1:no.rep,matrix(0,(no.rep),3))
#    colnames(RFproxConver) <- c("Max.prox", "MSE.prox", "Mean.prox")
    RFproxConver1 <- NULL 
    RFimportance <- vimp <- matrix(0, nrow=ncol1, ncol=4)    
    RFerrrate <- err <-  0    
    rep1 <- rep(666,2*nrow1) 
    
        if (parallel =="forests" | parallel =="no"){
        
        Res <- foreach(i= c(0:no.rep), .packages='randomForest') %cols% {
        
            index1 <- sample(c(1:(2*nrow1))) 
            rep1[index1] <-  c(1:(2*nrow1))
            if(syn.type =="emperical") 
            datRFsyn <- synthetic.emperical(data)[index1,] 
            else if(syn.type =="permute")
            datRFsyn <- synthetic.permute(data)[index1,] 
            else stop("synthetic type must be \"emperical \"' or \"permute \"'" ) 
            yy <- datRFsyn[,1] 
 
             RF <- randomForest(factor(yy)~., data=datRFsyn[,-1], ntree=ntree, mtry = mtry, 
                     oob.prox=FALSE, proximity=TRUE, importance=importance, nodesize = nodesize, ...) 
            
            CollectGarbage()
            RFprox <- RF$proximity[rep1,rep1]
            
            if (i > 0) { 
                if (i > 1){
                    xx <- ((RFproxAddcl + (RFprox[c(1:nrow1),c(1:nrow1)]))/i) - (RFproxAddcl/(i-1))
                    yy <- mean( c(as.dist((RFproxAddcl + (RFprox[c(1:nrow1),c(1:nrow1)]))/i))) 
#                    RFproxConver[i,2] <- max(abs(c(as.dist(xx))))
#                    RFproxConver[i,3] <- mean((c(as.dist(xx)))^2)
#                    RFproxConver[i,4] <- yy
                 RFproxConver1 <- cbind(Max.prox = max(abs(c(as.dist(xx)))), MSE.prox = mean((c(as.dist(xx)))^2), Mean.prox = yy)
                }
#                RFproxAddcl <- RFproxAddcl + (RFprox[c(1:nrow1),c(1:nrow1)]) 
#                if(importance)  RFimportance <- RFimportance + 1/no.rep*(RF$importance) 
#                RFerrrate <- RFerrrate+ 1/no.rep*(RF$err.rate[ntree])
                 err <- RF$err.rate[ntree]
                 
                if(importance) vimp <-RF$importance 
             }            
          list(RFprox=RFprox, vimp = vimp, err = err, RFproxConver=RFproxConver1)
          
        }
        
       RFprox <- lapply(Res, function(xx) xx$RFprox);  
      if(importance) vimp <- lapply(Res, function(xx) xx$vimp);
       err <-   lapply(Res, function(xx) xx$err);
       RFproxConver <- do.call(rbind, lapply(Res, function(xx) xx$RFproxConver));

      for(ii in c(1:no.rep) ){
               RFproxAddcl <- RFproxAddcl + (RFprox[[ii]][c(1:nrow1),c(1:nrow1)]) 
               if(importance)  RFimportance <- RFimportance + 1/no.rep*(vimp[[ii]]) 
                RFerrrate <- RFerrrate+ 1/no.rep*(err[[ii]])
                }
                
                      
        } else if(parallel == "trees") {
        
         for (i in c(0:no.rep)) { 

            index1 <- sample(c(1:(2*nrow1))) 
            rep1[index1] <-  c(1:(2*nrow1))
            if(syn.type =="emperical") 
            datRFsyn <- synthetic.emperical(data)[index1,] 
            else if(syn.type =="permute")
            datRFsyn <- synthetic.permute(data)[index1,] 
            else stop("synthetic type must be \"emperical \"' or \"permute \"'" ) 
            yy <- datRFsyn[,1] 

            if (getDoParWorkers() == 1) 
              stop("Please register a 'foreach' parallel backend to run 'UnsupRF' in parallel or set 'parallel' to FALSE to run in serial mode.")
                          RF <- foreach(xntree=idiv(ntree, chunks=getDoParWorkers()), .combine='combine', 
                          .multicombine=TRUE, .packages='randomForest') %dopar% {
                        randomForest(factor(yy)~., data=datRFsyn[,-1],  ntree = xntree, mtry = mtry, proximity = TRUE, 
                           importance = importance, nodesize = nodesize, ...)}
           
             CollectGarbage()           
	     RFprox <- RF$proximity[rep1,rep1]
            if (i > 0) { 
                if (i > 1){
                    xx <- ((RFproxAddcl + (RFprox[c(1:nrow1),c(1:nrow1)]))/i) - (RFproxAddcl/(i-1))
                    yy <- mean( c(as.dist((RFproxAddcl + (RFprox[c(1:nrow1),c(1:nrow1)]))/i))) 
                    RFproxConver[i,2] <- max(abs(c(as.dist(xx))))
                    RFproxConver[i,3] <- mean((c(as.dist(xx)))^2)
                    RFproxConver[i,4] <- yy
                }
                RFproxAddcl <- RFproxAddcl + (RFprox[c(1:nrow1),c(1:nrow1)]) 
                if(importance) { RFimportance <- RFimportance+ 1/no.rep*(RF$importance) }
                RFerrrate <- RFerrrate + 1/no.rep*(RF$err.rate[ntree])
            }
        }
        } else stop("Please set parellel to 'forests', 'trees' , or 'no' ")
#        
#         distRFAddcl   = RFproxAddcl
    RFproxAddcl[RFproxAddcl <= 0] <- 1e-12  
    distRFAddcl <- sqrt(1-RFproxAddcl/no.rep) 
    distRFAddcl <- as.dist(distRFAddcl)
    distRF <- list(RFdist=distRFAddcl, err=RFerrrate, UnsupRFvarimp=RFimportance, proxConver=RFproxConver, no.rep=no.rep)
    class(distRF) <- "RFdist"
    distRF
}
#
#' @rdname  RFdist
#' @method print RFdist
#' @export
print.RFdist <- function(x,...){
if (!inherits(x, "RFdist")) stop("Object must be a \"RFdist \"'")
    print("*** UnsupRF OOB error estimate ***")
    print(round(x$err, ...))
    cat("UnsupRF convergence estimates over a total of ", x$no.rep, "forests \n")
    print(summary(x$proxConver, ...))
 }


#' @rdname  RFdist
#' @method plot RFdist
#' @export
plot.RFdist <- function(x,...){
  if (!inherits(x, "RFdist")) stop("Object must be a \"RFdist \"'")
 tab <- x$proxConver 
plot(1:nrow(tab), tab[, "MSE.prox"], xlab = "iteration", ylab = "Proximity MSE", type = "o", 
       col = "blue", main = "Convergence of random forest proximities")
  
}
    
    
    
    
    
    
    
    
    
    
    
    
    
nguforche/UnsupRF documentation built on May 5, 2019, 4:51 p.m.