Nothing
#' HypoRF; a Random Forest based Two Sample Test
#'
#' @description Performs a permutation two sample test based on the out-of-bag-error of random forest
#'
#' @param data1 An object of type "data.frame". The first sample.
#' @param data2 An object of type "data.frame". The second sample.
#' @param K A numeric value specifying the number of times the created label is permuted. For K = 1 a binomial test is carried out. The Default is K = 100.
#' @param statistic A character value specifying the statistic for permutation testing. Two options available
#' \itemize{
#' \item\code{PerClassOOB} Sum of OOB per class errors.
#' \item\code{OverallOOB} OOB-error.
#' }. Default is statistic = "PerClassOOB".
#' @param normalapprox A logical value asking for the use of a normal approximation. Default is normalapprox = FALSE.
#' @param seed A numeric value for reproducibility.
#' @param alpha The level of the test. Default is alpha = 0.05.
#' @param ... Arguments to be passed to ranger
#' @return A list with elements
#' \itemize{
#' \item\code{pvalue:} The p-value of the test.
#' \item\code{obs:} The OOB-statistic in case of K>1 or the out-of-sample error in case of K=1 (binomial test).
#' \item\code{val:} The OOB-statistic of the permuted random forests in case of K>1 (otherwise NULL).
#' \item\code{varest:} The estimated variance of the permuted random forest OOB-statistic in case of K>1 (otherwise NULL).
#' \item\code{statistic:} The used OOB-statistic
#' \item\code{importance_ranking:} The variable importance measure, when importance == "impurity".
#' \item\code{cutoff:} The quantile of the importance distribution at level alpha.
#' \item\code{call:} Call to the function.}
#' @seealso \code{\link{ranger}}
#' @examples
#' # Using the default testing procedure (permutation test)
#' x1 <- data.frame(x=stats::rt(100, df=1.5))
#' x2 <- data.frame(x=stats::rnorm(100))
#' hypoRF(x1, x2, K=2)
#' # Using the exact binomial test
#' hypoRF(x1, x2, K=1)
#' @export
hypoRF <- function(data1, data2, K = 100,
statistic = "PerClassOOB",
normalapprox = F, seed = NULL, alpha=0.05, ...) {
if (!is.null(seed) && (!is.numeric(seed) || !is.finite(seed) || seed < 0)) {
stop("invalid seed provided")
}
if(!is.null(seed)) {
set.seed(seed)
}
add.param <- list(...)
if (c("probability") %in% names(add.param)) {
stop("probability parameter not meaningfull for testing")
}
if (!is.data.frame(data1) || !is.data.frame(data2)){
stop("data1 or data2 are not of class data.frame")
}
if (ncol(data1)!=ncol(data2)) {
stop("amount of columns in data1 and data2 not identical")
}
if (!identical(colnames(data1),colnames(data2))) {
stop("colnames of the datasets do not match")
}
if (c("y")%in%colnames(data1)) {
stop("y is a reserved character for internal use")
}
if (!is.numeric(K) || !is.finite(K) || K<1) {
stop("input K not valid")
}
if (is.na(statistic) || !(statistic %in% c("PerClassOOB", "OverallOOB"))){
stop("invalid statistic")
}
if (!is.logical(normalapprox)) {
stop("input normalapprox not logical")
}
if ((alpha > 1) || (alpha < 0)) {
stop("invalid alpha")
}
n1 <- nrow(data1) # number of class 0
n2 <- nrow(data2) # number of class 1
p <- ncol(data1)
if(K > 1){
y <- factor(c(rep(0, n1) ,rep(1, n2)))
d <- cbind(rbind(data1, data2), y)
rf <- ranger::ranger(y~., data=d, probability=T, ...)
if(statistic == "OverallOOB"){
obs <- rf$prediction.error
} else if(statistic == "PerClassOOB") {
pred_oob <- as.numeric(rf$predictions[,2] > (n2/(n1+n2)))
tmp <- cbind(d,pred_oob)[,c("y","pred_oob")]
L0<-sum(tmp[y==0,"pred_oob"])/length(tmp[y==0,"pred_oob"])
L1<-(length(tmp[y==1,"pred_oob"])-sum(tmp[y==1,"pred_oob"]))/length(tmp[y==1,"pred_oob"])
obs <- L0 + L1
}
val <- importancedistribution <- numeric()
for(i in 1:K){
y <- sample(factor(c(rep(0,n1),rep(1,n2))));
ds <- cbind(rbind(data1, data2), y);
rf_i <- ranger::ranger(y~., data=ds, probability = T, ...)
if(statistic == "OverallOOB"){
val[i] <- rf_i$prediction.error
} else if(statistic == "PerClassOOB"){
pred_oob <- as.numeric(rf_i$predictions[,2] > (n2/(n1+n2)))
tmpi <- cbind(d,pred_oob)[,c("y","pred_oob")]
L0i<-sum(tmpi[y==0,"pred_oob"])/length(tmpi[y==0,"pred_oob"])
L1i<-(length(tmpi[y==1,"pred_oob"])-sum(tmpi[y==1,"pred_oob"]))/length(tmpi[y==1,"pred_oob"])
val[i] <- L0i + L1i
}
if(!is.null(rf_i$variable.importance)){
importancedistribution[i] <- max(rf_i$variable.importance)
}else{
importancedistribution <- NULL
}
}
if (!normalapprox){
pvalue = (sum(val < obs)+1) / (K+1)
varest <- NULL
}else{
varest <- stats::var(val)
mean <- mean(val)
pvalue <- stats::pnorm(obs, mean=mean, sd=sqrt(varest))
}
if(c("importance") %in% names(add.param)){
if(add.param$importance == "impurity"){
importance <- as.matrix(ranger::importance(rf))
importance_ranking <- as.matrix(importance)
cutoff<-stats::quantile(importancedistribution , 1-alpha)
}
} else {
importance_ranking <- NULL
cutoff <- NULL
}
} else if(K==1) {
data1 <- as.matrix(data1)
data2 <- as.matrix(data2)
n <- n1 + n2
m1 <- round(0.5*n1) # number of class 0
m2 <- round(0.5*n2) # number of class 1
y_train <- as.factor(c(rep(1, n1-m1), rep(0, n2-m2)))
y_test <- as.factor(c(rep(1, m1), rep(0, m2)))
idtest1 <- sample(1:n1, m1)
idtest2 <- sample(1:n2, m2)
data1train <- data1[-idtest1, , drop=F]
data2train <- data2[-idtest2, , drop=F]
data1test <- data1[idtest1, , drop=F]
data2test <- data2[idtest2, , drop=F]
rf <- ranger::ranger(Y~., data=data.frame(
Y=y_train, X=rbind(data1train, data2train)), probability = T,...)
if(c("importance") %in% names(add.param)){
if(add.param$importance == "impurity"){
importance <- as.matrix(ranger::importance(rf))
importance_ranking <- as.matrix(importance[order(importance[,1], decreasing = T),])
}
} else {
importance_ranking <- NULL
}
if(statistic == "OverallOOB"){
pred_prob<-stats::predict(rf, data=data.frame(Y=y_test, X=rbind(data1test, data2test)))$prediction
pred_class<-as.numeric(pred_prob[,2] > (n2-m2)/((n1-m1) + (n2-m2)))
obs <- sum(as.numeric(as.character(y_test) != as.character(pred_class)))
pvalue <- stats::pbinom(obs, size = (m1+m2), prob=0.5)
} else if(statistic == "PerClassOOB"){
pred_prob<-stats::predict(rf, data=data.frame(Y=y_test, X=rbind(data1test, data2test)))$prediction
pred_class<-as.numeric(pred_prob[,2] > (n2-m2)/((n1-m1) + (n2-m2)))
tmp<-data.frame(y_test=y_test, pred_class=pred_class)
obs1<-sum(tmp[y_test==0,"pred_class"])/length(tmp[y_test==0,"pred_class"])
obs2<-(length(tmp[y_test==1,"pred_class"])-sum(tmp[y_test==1,"pred_class"]))/length(tmp[y_test==1,"pred_class"])
if(sqrt(obs1*(1-obs1)+obs2*(1-obs2))==0){
if(obs1+obs2==1){
pvalue <- 1
} else {
pvalue <- 0
}
} else {
obs <- (obs1 + obs2 -1)/sqrt(obs1*(1-obs1)/m1+obs2*(1-obs2)/m2)
pvalue <- stats::pnorm(obs,mean=0,sd=1)
}
}
val <- NULL
varest <- NULL
importancedistribution <- NULL
cutoff <- NULL
}
return(list(pvalue = pvalue,
obs = obs,
val = val,
varest = varest,
statistic = statistic,
importance_ranking = importance_ranking,
importancedistribution=importancedistribution,
cutoff=cutoff,
call = match.call()))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.