#' unsupervisedForest
#'
#' Wrapper for creating unsupervised randomForest objects with prediction capabilities
#'
#' @export
#'
#' @author David Navega
#'
#' @param x a data.frame.
#' @param mtry an integer, defines number of variables to try at each split.
#' @param ntree number of trees composing the forest.
#' @param type the type of envelope used to generate random data. Default is
#' "data". type can be "data" or "uniform".
#' @param ... additional parameters to the randomForest
#'
#' @return an unsupervisedForest object
#'
#' @seealso \link{randomForest}
#'
unsupervisedForest <- function(x, mtry,
ntree = 500,
type = "data",
...
) {
# initialize and assess class of each variable
x_names <- colnames(x)
x_class <- lapply(x, class)
x_class <- named_apply(x_class, numeric_factor_class, simplify = TRUE)
x_dimensions <- dim(x)
n <- x_dimensions[1]
m <- x_dimensions[2]
condition <- missing(mtry)
if(condition) {
mtry <- sqrt(ncol(x))
}
# mode imputation
condition <- any(is.na(x))
if(condition) {
x <- randomForest::na.roughfix(x)
warning("\n[-] NA values detected - randomForest::na.roughfix(x) applied.")
}
# sythethic data ----
distribution_envelope <- create_envelope(x, type = type)
synthetic <- sample_envelope(distribution_envelope, n = n)
# data for unsupervised tree
forest_output <- factor(rep(c(1, 2), each = n))
forest_data <- dplyr::bind_rows(x, synthetic)
unsupervised_forest <- randomForest::randomForest(
x = forest_data,
y = forest_output,
mtry = mtry,
ntree = ntree,
proximity = T,
oob.prox = T,
...
)
full_proximity <- unsupervised_forest$proximity
proximity <- full_proximity[seq_len(n), seq_len(n)]
unsupervised_forest$proximity <- proximity
object <- list(
forest = unsupervised_forest,
data = x,
proximity = proximity,
full_proximity = full_proximity
)
class(object) <- "unsupervisedForest"
# return
rout <- object
return(rout)
}
#' Predict method for unsupervisedForest class
#'
#' @export
#'
#' @author David Navega
#'
#' @param object an unsupervisedForest object,
#' @param newdata data to predict proximity and probability based from an
#' unsupervisedForest
#' @param ... ...
#'
#' @return a list with the following components
#' \item{proximity}{proximity vector for newdata}
#' \item{probability}{probability of belonging to normal or synthethic data}
#'
predict.unsupervisedForest <- function(object, newdata, ...) {
data <- object$data
forest <- object$forest
forest_data <- dplyr::bind_rows(newdata, object$data)
n <- NROW(newdata)
forest_output <- predict(
object = forest,
newdata = forest_data,
type = "prob",
proximity = T
)
probability <- rbind(forest_output$pred[seq_len(n), ])
proximity <- forest_output$proximity[seq_len(n),-seq_len(n)]
# return
rout <- list(
proximity = (rbind(proximity)),
probability = probability
)
return(rout)
}
#' Print method for unsurpersivedForest class
#'
#' @noRd
#'
#' @export
#'
#' @author David Navega
#'
#' @param x an unsupervisedForest object
#' @param ... ...
#'
#'
print.unsupervisedForest <- function(x, ...) {
error <- x$forest$err.rate[x$forest$ntree,1:2]
cat("unsupervisedForest\n\n")
cat("OOB Accuracy (%): ", round(1 - error[1], 4) * 100, sep = "")
cat("\nOOB Detection Rate (%): ", round(1 - error[2], 4) * 100, sep = "")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.