Prediction models - scoring and calibration
In targeted: Targeted Inference

knitr::opts_chunk$set(
 collapse = TRUE,
 #dev="png",
 comment = "#>"
)
library(targeted)

Introduction

Model Scoring

library("targeted")
data(iris)
set.seed(1)
dat <- csplit(iris, 2)

g0 <- NB(Species ~ ., data=dat[[1]])
g1 <- NB(Species ~ Sepal.Width + Petal.Length, data=dat[[1]])
g2 <- NB(Species ~ Sepal.Width, data=dat[[1]])
pr0 <- predict(g0, newdata=dat[[2]], wide=TRUE)
pr1 <- predict(g1, newdata=dat[[2]], wide=TRUE)
pr2 <- predict(g2, newdata=dat[[2]], wide=TRUE)
table(colnames(pr1)[apply(pr1,1,which.max)], dat[[2]]$Species)
table(colnames(pr2)[apply(pr2,1,which.max)], dat[[2]]$Species)

scoring(dat[[2]]$Species, NB1=pr1)
scoring(dat[[2]]$Species, pr0=pr0, pr1=pr1, pr2=pr2)

\(A\) ...

## library(ranger)
## m1 <- ranger(Species ~ ., data=dat[[1]], num.threads=1, probability=TRUE)
## pr3 <- predict(m1, data=dat[[2]], num.threads=1)$predictions
## scoring(dat[[2]]$Species, pr3)

rf_pred <- function(x, newdata, ...) predict(x, newdata)$prediction
ml_grf <- learner$new(~., info="grf::regression_forest",
                       fit=function(x,y) grf::regression_forest(X=x,Y=y,...),
                       pred=rf_pred)
x <- ml_grf$clone()
x$update(Sepal.Length ~ .)
x$estimate(iris)
x$predict(droplevels(head(iris)))


m <- learner$new(Species ~ ., info="grf::probability_forest",
                  fit=function(x,y) grf::probability_forest(X=x,Y=y,...),
                  pred=rf_pred)
m$estimate(dat[[1]], num.trees=1000)
pr3 <- m$predict(dat[[2]])
scoring(dat[[2]]$Species, pr3)

Cross validation - model comparison

rf_model <- learner$new(Sepal.Length ~ .,
                  fit=function(x,y) grf::regression_forest(X=x,Y=y,...),
                  pred=function(...) predict(...)$predictions)
lm_model <- learner$new(Sepal.Length ~ ., fit=lm)

a <- cv(list(lm=lm_model, rf=rf_model), data=iris, response="Sepal.Length")
a

binary

n <- 1e4
x <- sample(1:5, n, replace=TRUE)
y <- rbinom(n, 1, expit(x))
d <- data.frame(y=y, x=x)
summary(glm(y ~ x, family=binomial, data=d))

counts <- table(x, y)
dd <- data.frame(y=rep(0:1,each=5), x=rep(1:5, 2),
                 w=c(counts[,1], counts[,2]))
summary(g <- glm(y ~ x, family=binomial, data=dd, weights=w))

m <- learner_glm(y ~ x, family = binomial())
m$estimate(d)


m <- learner$new(fit=function(data) glm(y~x, data=data, weights=data$w, family=binomial))
m$estimate(dd)

m <- learner$new(y~x, function(formula,data) {
  glm(formula, data=data, weights=w, family=binomial)
})
m$estimate(dd)

 m <- learner$new(y~x,
                  fit=function(y,x,w,...) {
                    glm.fit(x=cbind(1,x), y=y, weights=w, family=binomial()) },
                  pred=function(fit, newdata,...) {
                    pr <- fit$family$linkinv(cbind(1, newdata)%*%fit$coefficients)
                    cbind(1-pr, pr)
                  },
                  specials=list(w=w))
fit <- m$estimate(dd)
m$predict(dd)

Categorical response

rf <- function(formula, ...)
  learner$new(formula, info="grf::probability_forest",
               fit=function(x,y, ...) grf::probability_forest(X=x, Y=y, ...),
               pred=function(fit, newdata) predict(fit, newdata)$predictions, ...)
models <- list(rf=rf(Species ~ .),
               nb=function(data, ...) NB(Species ~ ., data=data))
a <- cv(models, data=iris)
a

Model tuning

rf <- function(formula, ...)
  learner$new(formula, info="grf::probability_forest",
               fit=function(x,y, ...) grf::probability_forest(X=x, Y=y, ...),
               pred=function(fit, newdata) predict(fit, newdata)$predictions, ...)

args <- expand.list(num.trees=c(50,100,200,300,400,500), mtry=1:3,
                    formula=c(Species ~ ., Species ~ Sepal.Length + Sepal.Width))
models <- lapply(args, function(par) do.call(rf, par))


x <- models[[1]]$clone()
x$estimate(iris)

#x$.__enclos_env__$private$formula
#rtarget

x <- learner$new(Species ~ ., info="grf::probability_forest",
             fit=function(x,y, ...) grf::probability_forest(X=x, Y=y, ...),
             num.trees=100, mtry=1)
x$estimate(iris)


a <- targeted::cv(models, data=iris)
cbind(coef(a), attr(args, "table")[,1:2])

## library(GPareto)
## dom <- t(nondominated_points(t(coef(a))))
## plot(coef(a))
## plotParetoEmp(dom, col="red")
## mp <- rbind(coef(a)[cbind(coef(a, min=TRUE),1:2)])
## points(mp, pch=16)

a <- learner$new(fit=function(data,response="Species",...) {
  y <- data[,response]
  x <- data[,-which(names(data)%in%response),drop=FALSE]
  NB2(x=x,y=y)
}, pred=function(fit, newdata) predict(fit, newdata))
a$estimate(iris)

b <- learner$new(Species ~ ., fit=function(x,y,...) {
  grf::probability_forest(Y=y,X=x,...)
}, pred=function(fit,newdata) predict(fit,newdata)$predictions,
num.trees=50)

cv(list(NB=a, RF=b), data=iris, response="Species")