Pretend we have a dataset of 99,339 documents with a vocabulary of 14,052 n-grams:
# Attach pirouette and load the `sparse_text_matrix` data set.
library(pirouette)
data(sparse_text_matrix)

# Refer to the matrix by a shorter name for the rest of the walkthrough.
X <- sparse_text_matrix

# Summarise the values stored in the matrix's `x` slot
# (presumably the non-zero entries, if X is a Matrix-package sparse matrix).
summary(X@x)
Pretend our target comes from a simple logistic-regression-style model, i.e. the sign of a random linear combination of the features (TODO: change back to the XOR problem):
# Fix the RNG seed so the simulated target below is reproducible across runs.
set.seed(42)

# Draw one random coefficient per column of X and compute the linear score of
# every row; `[, 1]` reduces the one-column product to a plain vector.
linear_score <- (X %*% rnorm(ncol(X)))[, 1]

# Binary target: 1 where the score is non-negative, 0 otherwise.
Y <- as.integer(linear_score >= 0)

# Class balance of the simulated target.
table(Y)
Benchmark runtime vs glmnet:
suppressMessages(library(microbenchmark))
suppressMessages(library(glmnet))
suppressMessages(library(pirouette))

# Random train/test split: every row is independently flagged TRUE (train)
# or FALSE (held out) with equal probability.
train_rows <- sample(c(TRUE, FALSE), nrow(X), replace = TRUE)

# Time one fit of each model on the training rows. The assignments inside the
# timed expressions keep the fitted models available after the benchmark.
# `times = 1` means each expression is evaluated once, so this is a rough
# comparison rather than a timing distribution.
microbenchmark(
  pirouette_model = pirouette_model <- pirouette(
    X[train_rows, ],
    Y[train_rows],
    prob = 10 / sqrt(ncol(X)),
    ctrl = pirouetteControl(
      ntrees = 100,
      newdim = 3
    ),
    gbm_control = list(
      n.trees = 1,
      interaction.depth = 1,
      shrinkage = .75,
      verbose = FALSE
    ),
    distribution = "bernoulli"
  ),
  glmnet_model = glmnet_model <- glmnet(
    X[train_rows, ],
    factor(Y)[train_rows],
    family = "binomial"
  ),
  times = 1
)

# Relative in-memory size of the two fitted models. Convert to plain numerics
# first: dividing two `object_size` values keeps the class, so the unitless
# ratio would otherwise be printed with a misleading "bytes" suffix.
as.numeric(object.size(pirouette_model)) / as.numeric(object.size(glmnet_model))
Benchmark AUC vs glmnet: (TRY REGRESSION)
library(caTools)

# Rows that were not used for training form the hold-out set.
holdout <- !train_rows

# Score the hold-out rows with both fitted models.
p_glmnet <- predict(glmnet_model, X[holdout, ])
p_pirouette <- predict(pirouette_model, X[holdout, ])

# colAUC returns one AUC per prediction column. glmnet's predictions have
# several columns (presumably one per lambda on the regularisation path), so
# keep the best of them; the pirouette predictions use the first column only.
auc_glmnet <- max(colAUC(p_glmnet, Y[holdout])[1, ])
auc_pirouette <- colAUC(p_pirouette, Y[holdout])[1, 1]

# Report the AUC ratio, then each AUC on its own.
auc_pirouette / auc_glmnet
auc_pirouette
auc_glmnet

# Distribution of the pirouette predictions on the hold-out rows.
hist(p_pirouette)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.