#'@title RF Post-Hoc Test
#'
#'@description Test for false positives in RF prediction.
#'
#'@param rf Object of class randomForest.
#'
#'@param newdata Dataframe to predict using RF model.
#'
#'@return An object of class RF.ph List with results of Post-Hoc test .
#'\itemize{
#' \item{ecdf}{ The empirical cumulative distribution fuction of the beta distributed
#' RF probability of assignment to correct class. }
#' \item{post.hoc}{ Dataframe with results of post-hoc test. }
#' \itemize{
#' \item{winner}{ The predicted class. }
#' \item{POA}{ Probability of assigment to the winner class. }
#' \item{p.post.hoc}{ Probability that POA belongs to POA distribution of this class in the trained model. }
#' \item{sig}{ Significance code: 0 ***, 0.001 **, 0.01 *, 0.05 ., > ns }
#' }
#' }
#'
#'
#'@details
#' RF Post-Hoc test:
#' Take an object created with randomForest and a new dataset to predict.
#' From the votes matrix of RF object take only correct assigments and calculate for each predicted class
#' the beta distribution of the probability of assignment (POA) to the correct class using fitdistr() from MASS.
#' In case fitdistr fails in estimating the parameters of beta,
#' the non-parametric empirical cumulative distribution function (ecdf) is used instead.
#' The for each observation in newdata, the probability that the POA to winner class (p.assign) belongs to
#' the beta (or ecdf) of that class in the training data set is calculated (p.post.hoc).
#' Use p.post.hoc to reject the hypothesis that the predicted POA belongs to the trained POA.
#' A low p.post.hoc probability (e.g. <= 0.05) indicates a possible missclassification of that observation.
#'
#' Methods plot, print and summary are available
#'
#'@author Pedro Martinez Arbizu & Sven Rossel
#'
#'@references
#' Rossel, S. & P. Martinez Arbizu (2018) Automatic specimen identification of Harpacticoids (Crustacea:Copepoda) using Random Forest
#' and MALDIāTOF mass spectra, including a post hoc test for false positive discovery. Methods in Ecology and Evolution,
#' 9(6):1421-1434.
#'
#'\url{https://doi.org/10.1111/2041-210X.13000}
#'
#'@examples
#'#example with maldi data
#' data(maldi)
#' library(randomForest)
#' unique(maldi$species)
#' maldi_train <- maldi[maldi$species != 'Cletodes limicola',]
#' maldi_test <- maldi[maldi$species == 'Cletodes limicola',]
#'#exclude Cletodes limicola from factors
#' maldi_train$species <- factor(maldi_train$species)
#' rf <- randomForest(species ~ ., data = maldi_train[-1])
#' ph <- rf.post.hoc(rf,maldi_test)
#' plot(ph)
#' plot(ph,'Tachidius discipes')
#'@export rf.post.hoc summary.RFPH
#'@import randomForest MASS stats
#'@seealso \code{\link{add.null.class}} \code{\link{smooth.data}} \code{\link{robust.test}} \code{\link{plot.RFPH}}
######################################################### Post hoc test of RF prediction using probability of class assignment
rf.post.hoc <- function(rf, newdata) {
# initial checks
if(!(inherits(rf,'randomForest'))) {
stop("rf should be a randomForest Object\n ")
}
# predict with rf model
res.pred <- data.frame(class = predict(rf, newdata), predict(rf, newdata, type = "p"))
# winner probability of assignment
prob.assign <- apply(res.pred[-1], max, MARGIN = 1)
# consider only predicted classes
pred.class.votes <- data.frame(
rf.pred = rf$y[which(rf$y %in% unique(res.pred$class))],
votes.pred = rf$votes[which(rf$y %in% unique(res.pred$class)),]
)
colnames(pred.class.votes) <- c('class',colnames(rf$votes))
# winner class consider correctly predicted
winner <- colnames(pred.class.votes[,-1])[apply(pred.class.votes[,-1],1,which.max)]
#consider only corrently predicted
pred.class <- pred.class.votes[pred.class.votes$class == winner,]
# for each species calculate the empirical beta distribution
res.ecdf <- vector("list", length(unique(pred.class$class)))
for (i2 in unique(pred.class.votes$class)) {
# list of probabilities of correct assigment for class i
prob.ca <- pred.class[pred.class$class == as.character(i2), as.character(i2)]
# laplace correction
prob.ca[prob.ca == 0] <- 1e-3
prob.ca[prob.ca == 1] <- 1-(1e-9)
# first calculate lower 5% quantile from ecdf
ed <- ecdf(prob.ca)
# delete beta values
beta1 = NULL
beta2 = NULL
# try optimization of empirical beta distribution of the probability of correct
# assigment
try(beta1 <- MASS::fitdistr(prob.ca, "beta", start = list(shape1 = 1, shape2 = 1),lower=c(0,0)),
silent = TRUE)
# now try to generate 5000 random numbers with the parameters of fitdistr
try(beta2 <- rbeta(5000, beta1$estimate[1], beta1$estimate[2]), silent = TRUE)
if(!is.null(beta2)){ed <- ecdf(beta2)}
# write to list
res.ecdf[[i2]] <- list(rf$classes[i2], ed)
}
# probability that probability of assigment belongs to OOB POA for winner class
p.post.hoc <- c()
for (i in 1:nrow(res.pred)) {
df <- res.ecdf[[as.character(res.pred$class[i])]][[2]]
p.post.hoc <- c(p.post.hoc, df(prob.assign[i]))
}
# significance symbols
sig = c(rep("ns", length(p.post.hoc)))
sig[p.post.hoc <= 0.05] <- "."
sig[p.post.hoc <= 0.01] <- "*"
sig[p.post.hoc <= 0.001] <- "**"
sig[p.post.hoc <= 1e-04] <- "***"
res <- list(ecdf = res.ecdf, post.hoc = data.frame(winner = res.pred$class, POA = prob.assign,
p.post.hoc = round(p.post.hoc, 4), sig = sig))
class(res) <- c("RFPH", "list")
return(res)
}
## end of function
### Method summary
summary.RFPH = function(object, ...) {
cat("Results of RF post-hoc test:\n")
cat("\n")
print(object$post.hoc, ...)
cat("Signif. codes: ns > 0.05, . <= 0.05, * <=0.01, ** <=0.001, *** <=0.0001\n")
cat("\n")
cat("\n")
cat("Table of counts by class:\n")
print(table(object$post.hoc[, c(1, 4)]), ...)
}
## end of method summary
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.