In zachmayer/pirouette: Fast Rotation Forests

Benchmark on a sparse matrix

Pretend we have a dataset of 99,339 documents with a vocabulary of 14,052 n-grams.

# Attach pirouette and load the example matrix used throughout this benchmark
# (presumably shipped as a dataset with the package -- confirm).
library(pirouette)
data("sparse_text_matrix")
X <- sparse_text_matrix

# Summarise the values held in the matrix's `x` slot. For Matrix sparse classes
# that slot holds the explicitly stored entries -- assumes X is such a matrix.
summary(slot(X, "x"))

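Pretend our target comes from a simple logistic-regression-style model: the label is 1 when a random linear combination of the features is non-negative, and 0 otherwise. (TODO: change back to the XOR problem.)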

# Simulate a binary target from a random linear model over the columns of X.
# The seed is fixed so the random coefficients -- and therefore the class
# balance and every result reported further down -- are reproducible.
set.seed(42)

# One linear score per row of X: X times a vector of standard-normal weights.
linear_score <- (X %*% rnorm(ncol(X)))[, 1]

# Label is 1 when the score is non-negative, 0 otherwise.
Y <- as.integer(linear_score >= 0)

# Show the class balance of the simulated target.
table(Y)

Benchmark runtime vs glmnet:

suppressMessages(library(microbenchmark))
suppressMessages(library(glmnet))
suppressMessages(library(pirouette))

# Random train/test mask: each row is independently assigned TRUE (train) or
# FALSE (test). Seeded so the split is the same on every run.
set.seed(1)
train_rows <- sample(c(TRUE, FALSE), nrow(X), replace = TRUE)

# Time a single fit of each model (times = 1). The assignments inside the
# benchmarked expressions keep the fitted models, which the code below reuses.
microbenchmark(
  pirouette_model = pirouette_model <- pirouette(
    X[train_rows, ], Y[train_rows], prob = 10 / sqrt(ncol(X)),
    ctrl = pirouetteControl(
      ntrees = 100, newdim = 3
    ),
    gbm_control = list(
      n.trees = 1,
      interaction.depth = 1,
      shrinkage = .75,
      verbose = FALSE
    ),
    distribution = 'bernoulli'
  ),
  glmnet_model = glmnet_model <- glmnet(
    X[train_rows, ], factor(Y)[train_rows], family = 'binomial'
  ),
  times = 1
)

# Relative in-memory size of the two fitted models. object.size() returns an
# "object_size" object and dividing two of them keeps that class, so the ratio
# would print with a misleading "bytes" unit; convert to plain numbers first.
as.numeric(object.size(pirouette_model)) /
  as.numeric(object.size(glmnet_model))

Benchmark AUC vs glmnet: (TRY REGRESSION)

library(caTools)

# Held-out data: every row that was not used for training.
test_rows <- !train_rows
X_test <- X[test_rows, ]
Y_test <- Y[test_rows]

# Score the held-out rows with both fitted models.
p_glmnet <- predict(glmnet_model, X_test)
p_pirouette <- predict(pirouette_model, X_test)

# glmnet: keep the largest AUC across the columns of its prediction matrix
# (presumably one column per lambda on the path -- confirm).
# NOTE(review): the maximum is picked on the held-out rows themselves, so this
# is a best-case figure for glmnet.
auc_glmnet <- max(colAUC(p_glmnet, Y_test)[1, ])

# pirouette: AUC of the first (assumed only) prediction column.
auc_pirouette <- colAUC(p_pirouette, Y_test)[1, 1]

# Report the AUC ratio, then each AUC, then the distribution of predictions.
auc_pirouette / auc_glmnet
auc_pirouette
auc_glmnet
hist(p_pirouette)