# R/UnsupRFfpc.R

#' @title Unsupervised random forest clustering with fpc.
#
#' @description Unsupervised random forest clustering. A Random forest (RF) classifier is trained to predict 
#' the data labeled as class ``True.Data" and a synthetic data labeled as class ``Synthetic.Data". 
#' The synthetic data is generated by taking a random sample from each dimension of the true data, with 
#' or without replacement (see \code{\link{RFdist}}). The dissimilarity matrix from \code{\link{RFdist}} is 
#' then passed to the algorithms in the "flexible point clustering" 
#' \code{fpc} package for clustering and selection of optimal number of clusters through the bootstrap 
#' cluster-wise stability method. 
#
#' @name UnsupRF 
# 
#' @param data data.frame or matrix 
#' @param  RFdist  RF distance matrix computed from \code{\link{RFdist}}.
#' @param  B number of bootstraps 
#' @param  clustermethod clustering method, options are \code{pamkCBI}, or \code{claraCBI}, or \code{hclustCBI}. 
#'   Not too sure about \code{hclustCBI}; see the \code{fpc} package. \code{pamkCBI} is 
#'   recommended for RF dissimilarity matrix, but we have found standard 
#' \code{hclust} in base R works well with Ward's minimum variance criterion  
#' @param classification type of prediction for finding optimal number of clusters 
#' see \code{\link[fpc]{nselectboot}}.
#' @param krange integer vector; numbers of clusters to be tried
#' @param kopt user provided optimal number of clusters 
#' @param run.boot (logical) run bootstrap cluster-wise stability ? 
#' @param fun function to determine mediods, should be \code{mean}, \code{median}, 
#' or \code{sum}. See \code{\link{mediod}}
#' @param x object of class \code{\link{UnsupRF}} 
#' @param \dots further arguments passed to or from other methods.
#' @return A list with elements:  
#' \enumerate{
#' \item cluster.model: The cluster model  
#' \item clusters: cluster memberships 
#' \item kopt: optimal number of clusters  
#' \item mediods: a mediod object            
#' }
#' @import fpc 
NULL
#' @rdname UnsupRF 
#' @export
UnsupRF <- function(data, ...) {
  # S3 generic: the method is selected from the class of `data`.
  UseMethod("UnsupRF")
}
#
#' @rdname UnsupRF 
#' @export
#' @examples
#' \dontrun{
#' set.seed(12345)
#' data(iris)
#' dat <- iris[, -5]
#' RF.dist <- RFdist(data=dat, ntree = 10, no.rep=20, syn.type = "permute", 
#'                importance=TRUE, oob.prox=TRUE, proxConver=FALSE)
#' #            
#' Clus.res <- UnsupRF(data = dat, RFdist=RF.dist$RFdist, 
#'              B =  5, clustermethod=pamkCBI, classification="centroid", 
#'              krange= 2:4, kopt=2, run.boot = TRUE)
#'  print(Clus.res)            
#' clusters <- Clus.res$clusters 
#' kopt <- Clus.res$kopt # optimal number of clusters 
#' }
#
# performs clustering using data or similarity matrix and 
# optionally select optimal number of clusters through 
# bootstrap
UnsupRF.default <- function(data, RFdist, B = 10, clustermethod = pamkCBI,
                            classification = "centroid", krange = 2:5,
                            kopt = 2, run.boot = FALSE, fun = "sum", ...) {
  # When requested, overwrite the supplied `kopt` with the `kopt` element
  # returned by nselectboot() for the candidate sizes in `krange`.
  if (run.boot) {
    kopt <- nselectboot(
      RFdist,
      B = B, clustermethod = clustermethod,
      classification = classification, krange = krange, ...
    )$kopt
  }

  # Fit the final clustering with `kopt` clusters; `...` is forwarded here too.
  fit <- clustermethod(RFdist, k = kopt, ...)
  membership <- fit$partition

  # Bundle the fit, the memberships, the number of clusters and the result of
  # mediod() (called on the distance matrix and memberships) into the object.
  structure(
    list(
      cluster.model = fit,
      clusters = membership,
      kopt = kopt,
      mediods = mediod(x = RFdist, clusters = membership, fun = fun)
    ),
    class = "UnsupRF"
  )
}

#' @rdname  UnsupRF 
#' @method print UnsupRF
#' @export
print.UnsupRF <- function(x, ...) {
  # Print a short summary of an "UnsupRF" object: the number of clusters, the
  # size of each cluster, and the `mediods` element. The fitted cluster model
  # is deliberately not printed (see the commented-out lines below).
  #
  # Returns `x` invisibly, as print methods conventionally do, so the object
  # can be reused after printing.

  # Guard against direct calls with an object of the wrong class.
  if (!inherits(x, "UnsupRF")) stop("Object must be a \"UnsupRF \"'")
#  print("*** Cluster Model ***")
#  print(x$cluster.model)
  print("*** Optimal number of clusters  ***")
  print(x$kopt)
  print("*** Distribution of clusters ***")
  print(table(x$clusters))
  print(x$mediods)
  invisible(x)
}
# nguforche/UnsupRF documentation built on May 5, 2019, 4:51 p.m.