#' varImpRanger
#'
#' Computes the variable importance for ranger models and for arbitrary measures from the 'measures' package.
#'
#' @param object An object as returned by cforest. \code{\link[ranger]{ranger}} with option \code{keep.inbag = TRUE}.
#' @param data Original data that was used for training the random forest.
#' @param target Target variable as used in the trained model.
#' @param nperm The number of permutations performed.
#' @param measure The name of the measure of the 'measures' package that should be used for the variable importance calculation.
#'
#'
#' @importFrom stats predict
#' @return Vector with computed permutation importance for each variable.
#' @export
#'
#' @examples
#' library(ranger)
#' iris.rg = ranger(Species ~ ., data = iris, keep.inbag = TRUE, probability = TRUE)
#' vimp.ranger = varImpRanger(object = iris.rg, data = iris, target = "Species")
#' vimp.ranger
varImpRanger_test = function(object, data, target, nperm = 1, measure = "multiclass.Brier") {
# Some tests
if(!("ranger" %in% class(object)))
stop("Object is not a 'ranger' model")
if(!(object$treetype %in% c("Regression", "Probability estimation")))
stop("Object is not a regression or probability forest")
measureList = listAllMeasures()
if (!(measure %in% measureList[, 1]))
stop("measure should be a measure of the measures package")
measure.minimize = measureList$minimize[measureList[,1] == measure]
pred_cols = which(colnames(data) != target)
num.trees = object$num.trees
inbag = do.call(cbind, object$inbag.counts)
pred_levels = levels(data[, target])
truth = data[, target]
# Calculate original performance
old_predis = predict(object, data = data, predict.all = TRUE)$predictions
colnames(old_predis) = pred_levels
res_old = numeric(num.trees)
for(i in 1:num.trees){
if(object$treetype == "Regression")
res_old[i] = do.call(measure, list(old_predis[inbag[,i] == 0, i], truth[inbag[,i] == 0]))
if(object$treetype == "Probability estimation")
res_old[i] = do.call(measure, list(old_predis[inbag[,i] == 0, , i], truth[inbag[,i] == 0]))
}
# Calculate permuted performance
res_new = matrix(NA, num.trees, length(pred_cols))
for(j in pred_cols) {
for(i in 1:num.trees) {
print(paste("column", j, "tree", i))
data_new = data[inbag[,i] == 0,]
data_new[,j] = sample(data_new[,j], replace = FALSE)
object2 = swap_trees(object, 1, i)
predis = predict(object2, data = data_new, num.trees = 1)$predictions
colnames(predis) = pred_levels
truth = data_new[, target]
perf_new = do.call(measure, list(predis, truth))
res_new[i, j] = perf_new
}
}
minSign = ifelse(measure.minimize, 1, -1)
res = minSign * (res_new - res_old)
return(list(res = res, vimp = colMeans(res)))
}
swap_trees = function(rf, i, j) {
res = rf
res$forest$child.nodeIDs[[i]] = rf$forest$child.nodeIDs[[j]]
res$forest$split.varIDs[[i]] = rf$forest$split.varIDs[[j]]
res$forest$split.values[[i]] = rf$forest$split.values[[j]]
res$forest$child.nodeIDs[[j]] = rf$forest$child.nodeIDs[[i]]
res$forest$split.varIDs[[j]] = rf$forest$split.varIDs[[i]]
res$forest$split.values[[j]] = rf$forest$split.values[[i]]
if (!is.null(rf$forest$terminal.class.counts)) {
res$forest$terminal.class.counts[[i]] = rf$forest$terminal.class.counts[[j]]
res$forest$terminal.class.counts[[j]] = rf$forest$terminal.class.counts[[i]]
}
res
}
devtools::install_github("mlr-org/measures")
devtools::install_github("PhilippPro/varImp")
library(ranger)
library(varImp)
iris.rg = ranger(Species ~ ., data = iris, keep.inbag = TRUE, probability = TRUE)
vimp.ranger = varImpRanger_test(object = iris.rg, data = iris, target = "Species", measure = "multiclass.Brier") # MMCE geht nicht!
# binomial test
bin.test_greater = function(y, zeros = FALSE) {
#if(!(zeros))
# y = y[y!=0] # Diese Loesung ist nicht wirklich sauber
x = sum(y > 0)
n = length(y)
binom.test(x, n, p = 0.5, alternative = "greater")$p.value
}
bin.test_two.sided = function(y, zeros = FALSE) {
#if(!(zeros))
# y = y[y!=0] # Diese Loesung ist nicht wirklich sauber
x = sum(y > 0)
n = length(y)
binom.test(x, n, p = 0.5, alternative = "two.sided")$p.value
}
# wilcoxon test
wilcoxon.test = function(y, zeros = FALSE) {
#if(!(zeros))
# y = y[y!=0] # Diese Loesung ist nicht wirklich sauber
wilcox.test(y, mu = 10^-10, alternative = "greater")$p.value
}
# z-Scores
p_value = function(y, zeros = FALSE) {
#if(!(zeros))
# y = y[y!=0] # Diese Loesung ist nicht wirklich sauber
t.test(y, alternative = "greater", mu = 0)$p.value # 1 - pnorm(mean(x) / (sd(x)/sqrt(length(x))))
}
vimp.ranger$vimp
apply(vimp.ranger$res < 0, 2, mean)
round(apply(vimp.ranger$res, 2, bin.test_greater), 3)
round(apply(vimp.ranger$res, 2, bin.test_two.sided), 3)
round(apply(vimp.ranger$res, 2, wilcoxon.test), 3)
round(apply(vimp.ranger$res, 2, p_value), 3)
# Simulation
sim_data = function(n = 20) {
x1 = runif(n,5,95)
x2 = runif(n,5,95)
x3 = rbinom(n,1,.5)
b0 = 17
b1 = 0
b2 = 0.037
b3 = -5.2
sigma = 1.4
eps = rnorm(n, 0, sigma)
y = b0 + b1*x1 + b2*x2 + b3*x3 + eps
X = cbind(x1, x2, x3)
return(data.frame(X, y))
}
sim = sim_data(n = 1000)
mod = ranger(y ~ ., data = sim, num.trees = 10000, keep.inbag = TRUE)
vimp.ranger = varImpRanger_test(object = mod, data = sim, target = "y", measure = "MSE")
apply(vimp.ranger$res[1:10000,] < 0, 2, mean)
# n = 100, 1000 Baeume: 0.429 0.243 0.000
# n = 1000, 1000 Baeume: 0.411 0.006 0.000
# n = 10000, 1000 Baeume: 0.496 0.003 0.000
# n = 100000, 1000 Baeume: 0.469 0.002 0.000
round(apply(vimp.ranger$res, 2, bin.test_greater), 3)
# n = 100, 1000 Baeume: 0.98 0.00 0.00
# n = 1000, 1000 Baeume: 0.747 0.000 0.000
# n = 10000, 1000 Baeume: 0.803 0.000 0.000
# Scheint dann öfter > 0 zu sein, wenn nicht relevant? Ist dem überhaupt so? -> öfters simulieren...
round(apply(vimp.ranger$res, 2, bin.test_two.sided), 3)
# n = 100, 1000 Baeume: 0.046 0.000 0.000
# n = 1000, 1000 Baeume: 0.548 0.000 0.000
# n = 10000, 1000 Baeume: 0.018 0.000 0.000
round(apply(vimp.ranger$res, 2, wilcoxon.test), 3)
round(apply(vimp.ranger$res, 2, p_value), 3)
# n = 100, 1000 Baeume: 1.895139e-05 0.000000e+00 0.000000e+00
# n = 1000, 1000 Baeume: 0.09871624 0.00000000 0.00000000
# n = 10000, 1000 Baeume: 0.695337 0.000000 0.000000
a = summary(lm(y~., data = sim))
# Distribution of the p_values under the Null-Hypothesis
n_exp = 1000
p_matrix = array(NA, dim = c(n_exp, 7, 3))
resis = list()
for(i in 1:100) {
print(i)
set.seed(i)
sim = sim_data(n = 100)
mod = ranger(y ~ ., data = sim, num.trees = 10000, keep.inbag = TRUE)
invisible(capture.output(vimp.ranger <- varImpRanger_test(object = mod, data = sim, target = "y", measure = "MSE")))
resis[[i]] = vimp.ranger$res
p_matrix[i, 1, ] = apply(resis[[i]] < 0, 2, mean)
p_matrix[i, 2, ] = apply(resis[[i]] > 0, 2, mean)
p_matrix[i, 3, ] = round(apply(resis[[i]], 2, bin.test_greater), 10)
p_matrix[i, 4, ] = round(apply(resis[[i]], 2, bin.test_two.sided), 10)
p_matrix[i, 5, ] = round(apply(resis[[i]], 2, wilcoxon.test), 10)
p_matrix[i, 6, ] = round(apply(resis[[i]], 2, p_value), 10)
p_matrix[i, 7, ] = summary(lm(y~., data = sim))$coefficients[2:4,4]
save(resis, p_matrix, file = "./varImpRanger/simulation.RData")
}
load("./varImpRanger/simulation.RData")
par(mfrow = c(4,8))
mittel = numeric(100)
for(i in 1:100) {
hist(resis[[i]][,1], main = i)
print(mittel[i] <- mean(resis[[i]][,1]))
}
mean(mittel)
mean(mittel >0)
which(mittel > 0)
which(p_matrix[, 3, 1] <0.05)
sum(p_matrix[, 3, 1] >0.05, na.rm = T)
sum(p_matrix[, 4, 1] >0.05, na.rm = T)
sum(p_matrix[, 5, 1] >0.05, na.rm = T)
sum(p_matrix[, 6, 1] >0.05, na.rm = T)
sum(p_matrix[, 7, 1] >0.05, na.rm = T)
sum(resis[[i]][,1] > 0)
sum(resis[[i]][,1] <= 0)
load("./varImpRanger/simulation.RData")
# some Graphical analysis
hist(vimp.ranger$res[,1]) # zu viele exakt Null
a = vimp.ranger$res[,1]
sum(a==0)
mean(a)
mean(a[a!=0])
qqnorm(a)
qqline(a)
hist(a)
mean(a>0)
round(bin.test_greater(a), 3)
round(bin.test_greater(a, zeros = TRUE), 3)
round(p_value(a), 3)
round(p_value(a, zeros = TRUE), 3)
p_matrix[1:7, , ]
# meistens nur klare Trennungen bei den anderen Tests...
for(i in 1:ncol(p_matrix))
print(hist(p_matrix[,i, 1], main = i))
# Second simulation
n_exp = 1000
p_matrix = array(NA, dim = c(n_exp, 7, 3))
resis = list()
for(i in 1:100) {
print(i)
set.seed(i)
sim = sim_data(n = 500)
mod = ranger(y ~ ., data = sim, num.trees = 10000, keep.inbag = TRUE)
invisible(capture.output(vimp.ranger <- varImpRanger_test(object = mod, data = sim, target = "y", measure = "MSE")))
resis[[i]] = vimp.ranger$res
p_matrix[i, 1, ] = apply(resis[[i]] < 0, 2, mean)
p_matrix[i, 2, ] = apply(resis[[i]] > 0, 2, mean)
p_matrix[i, 3, ] = round(apply(resis[[i]], 2, bin.test_greater), 10)
p_matrix[i, 4, ] = round(apply(resis[[i]], 2, bin.test_two.sided), 10)
p_matrix[i, 5, ] = round(apply(resis[[i]], 2, wilcoxon.test), 10)
p_matrix[i, 6, ] = round(apply(resis[[i]], 2, p_value), 10)
p_matrix[i, 7, ] = summary(lm(y~., data = sim))$coefficients[2:4,4]
save(resis, p_matrix, file = "./varImpRanger/simulation2.RData")
}
load("./varImpRanger/simulation2.RData")
par(mfrow = c(4,8))
mittel = numeric(100)
for(i in 1:100) {
hist(resis[[i]][,1], main = i)
print(mittel[i] <- mean(resis[[i]][,1]))
}
mean(mittel)
mean(mittel >0)
which(mittel > 0)
which(p_matrix[, 3, 1] <0.05)
sum(p_matrix[, 3, 1] >0.05, na.rm = T)
sum(p_matrix[, 4, 1] >0.05, na.rm = T)
sum(p_matrix[, 5, 1] >0.05, na.rm = T)
sum(p_matrix[, 6, 1] >0.05, na.rm = T)
sum(p_matrix[, 7, 1] >0.05, na.rm = T)
sum(resis[[i]][,1] > 0)
sum(resis[[i]][,1] <= 0)
load("./varImpRanger/simulation.RData")
# some Graphical analysis
hist(vimp.ranger$res[,1]) # zu viele exakt Null
a = vimp.ranger$res[,1]
sum(a==0)
mean(a)
mean(a[a!=0])
qqnorm(a)
qqline(a)
hist(a)
mean(a>0)
round(bin.test_greater(a), 3)
round(bin.test_greater(a, zeros = TRUE), 3)
round(p_value(a), 3)
round(p_value(a, zeros = TRUE), 3)
p_matrix[1:7, , ]
# meistens nur klare Trennungen bei den anderen Tests...
for(i in 1:ncol(p_matrix))
print(hist(p_matrix[,i, ]))
# 4 kann man wegschmeissen
# 3,5,6 könne man skalieren, so dass sie Varianz 1 haben?
# wilcoxon, binomial und silkes test
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.