#' @title Unsupervised Random Forest
#
#' @description Compute dissimilarity matrix between observations by training a
#' randomForest (RF) classifier to
#' descriminate between the 'original' data and a synthetic version.
#' The original data is labeled as "True.Data"
#' while the synthetic data is labeled ``Synthetic.Data". The random forest p
#' roximity matrix between observations in the
#' original data are then extracted, converted to distance, and returned.
#' The synthetic data is generated by taking a random sample from each
#' dimension of the true data, with
#' or without replacement.
#'
#' @name RFdist
#
#' @param data data.frame or matrix
#' @param mtry mtry in \code{\link[randomForest]{randomForest}}
#' @param ntree, number of trees
#' @param no.rep number of repetitions or forests
#' @param syn.type type of synthetic data generator: "emperical" generate samples from the
#' emperical distribution of the original data while "permute" takes a permutation
#' of each dimension. emerical is just sample with
#' replacement while permute is without replacement
#' @param importance (logical) compute variable importance ?
#' @param parallel character vector specifying the type of parallel run: 'forests' - run a total of
#' \code{no.rep} RF in parallel,
#' 'trees' - run RF in parallel over ntrees and combine the results, 'no' - serial computation.
#' @param nodesize node size in \code{randomForest}
#' @param x object of class \code{\link{RFdist}}
#' @param \dots further arguments passed to \code{\link[randomForest]{randomForest}}.
#' @details
#' Methods
#' \enumerate{
#' \item \code{print} : print OOB error and convergence summary
#' \item \code{plot} : plots the convergence of the RF proximities given by the MSE
#' over number of forest \code{no.rep}
#' }
#' @return A list with elements:
#' \enumerate{
#' \item RFdist: RF proximity converted to a distance object
#' \item err: error rate
#' \item UnsupRFvarimp: Unsupervised RF variable importance
#' \item proxConver: a matrix containing three convergence meausres
#' \enumerate{
#' \item Max.prox = max( abs( aveprox(N)- aveprox(N-1)))
#' \item MSE.prox = mean( (aveprox(N)- aveprox(N-1))^2)
#' \item Mean = mean(aveprox(N))
#' where N is number of forests (no.rep).
#' }
#' }
#
#' @references
#' Tao Shi and Steve Horvath (2006) Unsupervised Learning with Random Forest Predictors.
#' Journal of Computational and Graphical Statistics. Volume 15, Number 1, March 2006, pp. 118-138(21)
#
#' @import foreach
#' @importFrom randomForest randomForest combine
#' @importFrom iterators idiv
NULL
#' @exportPattern "."
#' @rdname RFdist
#' @export
RFdist <- function(data, ...) UseMethod("RFdist")
#' @rdname RFdist
#' @export
#' @examples
#' \dontrun{
#' set.seed(12345)
#' data(iris)
#' dat <- iris[, -5]
#' RF.dist <- RFdist(data=dat, ntree = 10, no.rep=20, syn.type = "permute",
#' importance=TRUE, parallel = "no")
#' #
#' print(RF.dist, digits = 3)
#' #
#' plot(RF.dist)
# #
#' # plot variable importance
#' UnsupRFvarImpPlot(RF.dist, sort=TRUE)
#' }
#
# calcuate RF dissimilarity matrix: serial and parallel
RFdist.default <- function(data, mtry = floor(sqrt(ncol(data))), ntree, no.rep,
syn.type = "emperical", importance=FALSE, nodesize = 1,
parallel = c("forests", "trees", "no"), ...) {
if (parallel == "forests") '%cols%' <- get('%dopar%') ## parallel over no.rep or forest
else if(parallel =="no") '%cols%' <- get('%do%') ## sequential
xntree <- 0
synthetic.emperical <- function(dat) {
samp <- function(X) { sample(X, replace=T) }
g1 <- function(dat) { apply(dat,2,samp) }
nrow1 <- dim(dat)[[1]];
yy <- rep(c(1,2),c(nrow1,nrow1) );
data.frame(cbind(yy,rbind(dat,data.frame(g1(dat)))))
}
synthetic.permute <- function(dat) {
samp <- function(X) { sample(X, size = length(X), replace = FALSE) }
g1 <- function(dat) { apply(dat,2, samp) }
nrow1 <- dim(dat)[[1]];
yy <- rep(c("True.Data","Synthetic.Data"),c(nrow1,nrow1) );
data.frame(cbind(yy,rbind(dat,data.frame(g1(dat)))))
}
nrow1 <- dim(data)[[1]]
ncol1 <- dim(data)[[2]]
RFproxAddcl <- matrix(0,nrow=nrow1,ncol=nrow1)
RFproxConver <- cbind(1:no.rep,matrix(0,(no.rep),3))
# colnames(RFproxConver) <- c("Max.prox", "MSE.prox", "Mean.prox")
RFproxConver1 <- NULL
RFimportance <- vimp <- matrix(0, nrow=ncol1, ncol=4)
RFerrrate <- err <- 0
rep1 <- rep(666,2*nrow1)
if (parallel =="forests" | parallel =="no"){
Res <- foreach(i= c(0:no.rep), .packages='randomForest') %cols% {
index1 <- sample(c(1:(2*nrow1)))
rep1[index1] <- c(1:(2*nrow1))
if(syn.type =="emperical")
datRFsyn <- synthetic.emperical(data)[index1,]
else if(syn.type =="permute")
datRFsyn <- synthetic.permute(data)[index1,]
else stop("synthetic type must be \"emperical \"' or \"permute \"'" )
yy <- datRFsyn[,1]
RF <- randomForest(factor(yy)~., data=datRFsyn[,-1], ntree=ntree, mtry = mtry,
oob.prox=FALSE, proximity=TRUE, importance=importance, nodesize = nodesize, ...)
CollectGarbage()
RFprox <- RF$proximity[rep1,rep1]
if (i > 0) {
if (i > 1){
xx <- ((RFproxAddcl + (RFprox[c(1:nrow1),c(1:nrow1)]))/i) - (RFproxAddcl/(i-1))
yy <- mean( c(as.dist((RFproxAddcl + (RFprox[c(1:nrow1),c(1:nrow1)]))/i)))
# RFproxConver[i,2] <- max(abs(c(as.dist(xx))))
# RFproxConver[i,3] <- mean((c(as.dist(xx)))^2)
# RFproxConver[i,4] <- yy
RFproxConver1 <- cbind(Max.prox = max(abs(c(as.dist(xx)))), MSE.prox = mean((c(as.dist(xx)))^2), Mean.prox = yy)
}
# RFproxAddcl <- RFproxAddcl + (RFprox[c(1:nrow1),c(1:nrow1)])
# if(importance) RFimportance <- RFimportance + 1/no.rep*(RF$importance)
# RFerrrate <- RFerrrate+ 1/no.rep*(RF$err.rate[ntree])
err <- RF$err.rate[ntree]
if(importance) vimp <-RF$importance
}
list(RFprox=RFprox, vimp = vimp, err = err, RFproxConver=RFproxConver1)
}
RFprox <- lapply(Res, function(xx) xx$RFprox);
if(importance) vimp <- lapply(Res, function(xx) xx$vimp);
err <- lapply(Res, function(xx) xx$err);
RFproxConver <- do.call(rbind, lapply(Res, function(xx) xx$RFproxConver));
for(ii in c(1:no.rep) ){
RFproxAddcl <- RFproxAddcl + (RFprox[[ii]][c(1:nrow1),c(1:nrow1)])
if(importance) RFimportance <- RFimportance + 1/no.rep*(vimp[[ii]])
RFerrrate <- RFerrrate+ 1/no.rep*(err[[ii]])
}
} else if(parallel == "trees") {
for (i in c(0:no.rep)) {
index1 <- sample(c(1:(2*nrow1)))
rep1[index1] <- c(1:(2*nrow1))
if(syn.type =="emperical")
datRFsyn <- synthetic.emperical(data)[index1,]
else if(syn.type =="permute")
datRFsyn <- synthetic.permute(data)[index1,]
else stop("synthetic type must be \"emperical \"' or \"permute \"'" )
yy <- datRFsyn[,1]
if (getDoParWorkers() == 1)
stop("Please register a 'foreach' parallel backend to run 'UnsupRF' in parallel or set 'parallel' to FALSE to run in serial mode.")
RF <- foreach(xntree=idiv(ntree, chunks=getDoParWorkers()), .combine='combine',
.multicombine=TRUE, .packages='randomForest') %dopar% {
randomForest(factor(yy)~., data=datRFsyn[,-1], ntree = xntree, mtry = mtry, proximity = TRUE,
importance = importance, nodesize = nodesize, ...)}
CollectGarbage()
RFprox <- RF$proximity[rep1,rep1]
if (i > 0) {
if (i > 1){
xx <- ((RFproxAddcl + (RFprox[c(1:nrow1),c(1:nrow1)]))/i) - (RFproxAddcl/(i-1))
yy <- mean( c(as.dist((RFproxAddcl + (RFprox[c(1:nrow1),c(1:nrow1)]))/i)))
RFproxConver[i,2] <- max(abs(c(as.dist(xx))))
RFproxConver[i,3] <- mean((c(as.dist(xx)))^2)
RFproxConver[i,4] <- yy
}
RFproxAddcl <- RFproxAddcl + (RFprox[c(1:nrow1),c(1:nrow1)])
if(importance) { RFimportance <- RFimportance+ 1/no.rep*(RF$importance) }
RFerrrate <- RFerrrate + 1/no.rep*(RF$err.rate[ntree])
}
}
} else stop("Please set parellel to 'forests', 'trees' , or 'no' ")
#
# distRFAddcl = RFproxAddcl
RFproxAddcl[RFproxAddcl <= 0] <- 1e-12
distRFAddcl <- sqrt(1-RFproxAddcl/no.rep)
distRFAddcl <- as.dist(distRFAddcl)
distRF <- list(RFdist=distRFAddcl, err=RFerrrate, UnsupRFvarimp=RFimportance, proxConver=RFproxConver, no.rep=no.rep)
class(distRF) <- "RFdist"
distRF
}
#
#' @rdname RFdist
#' @method print RFdist
#' @export
print.RFdist <- function(x,...){
if (!inherits(x, "RFdist")) stop("Object must be a \"RFdist \"'")
print("*** UnsupRF OOB error estimate ***")
print(round(x$err, ...))
cat("UnsupRF convergence estimates over a total of ", x$no.rep, "forests \n")
print(summary(x$proxConver, ...))
}
#' @rdname RFdist
#' @method plot RFdist
#' @export
plot.RFdist <- function(x,...){
if (!inherits(x, "RFdist")) stop("Object must be a \"RFdist \"'")
tab <- x$proxConver
plot(1:nrow(tab), tab[, "MSE.prox"], xlab = "iteration", ylab = "Proximity MSE", type = "o",
col = "blue", main = "Convergence of random forest proximities")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.