# Global knitr chunk options for this vignette: merge each chunk's source and
# output into a single block, and prefix printed output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library(CVRTSEncoder)

Re-encode a set of categorical columns as a set of columns that are mutually useful for classification.

The concept is a y-aware encoding of the trajectory of non-linear model residuals in terms of the target categorical variables.

The idea is an extension of the vtreat coding concepts and of the y-aware scaling concepts of Nina Zumel and John Mount.

# Example setup on iris: the outcome is whether Species equals "versicolor".
# avars stay numeric; evars are converted to categorical strings below and
# handed to the encoder as the columns to re-encode.
dep_var <- "Species"
dep_target <- "versicolor"
avars <- c("Sepal.Length", "Petal.Length")
evars <- c("Sepal.Width", "Petal.Width")
data <- iris

# Round every evars column to whole numbers and store the result as character
# strings, so these columns act as categorical variables.
data[evars] <- lapply(
  data[evars],
  function(column) as.character(round(column))
)

# Estimate the residual encoding from the prepared data.
cross_enc <- estimate_residual_encoding_c(
  data = data, avars = avars, evars = evars,
  dep_var = dep_var, dep_target = dep_target,
  n_comp = 4
)

# Apply the fitted coder to the data and append the resulting columns.
enc <- prepare(cross_enc$coder, data)
data <- cbind(data, enc)

# Baseline for comparison: a logistic regression (binomial glm) that uses only
# the avars columns as predictors, without any of the encoded columns.
# NOTE(review): mk_formula() with outcome_target presumably builds a formula
# whose left-hand side is `dep_var == dep_target` -- confirm against wrapr docs.
f0 <- wrapr::mk_formula(dep_var, avars, outcome_target = dep_target)
model0 <- glm(f0, data = data, family = binomial)
summary(model0)

# In-sample predictions on the response scale, then cross-tabulate the species
# labels against whether the prediction exceeds 0.5.
data$pred0 <- predict(model0, newdata = data, type = "response")
table(data$Species, data$pred0>0.5)

# Second model: cross-validated, regularized logistic regression on the
# original numeric variables plus the encoded columns.
newvars <- c(avars, colnames(enc))

# Build the design matrix once; it is used for both fitting and prediction.
# drop = FALSE keeps a matrix even if newvars has a single column.
x_mat <- as.matrix(data[, newvars, drop = FALSE])
y_target <- as.numeric(data[[dep_var]] == dep_target)

model <- glmnet::cv.glmnet(x_mat, y_target, family = "binomial")

# cv.glmnet methods select the penalty through `s`; a `lambda =` argument is
# not the selector, so the default (lambda.1se) coefficients would be reported
# instead of the lambda.min ones that the predictions below use.
coef(model, s = "lambda.min")

# type = "response" returns probabilities; the default is the link (log-odds)
# scale, for which a 0.5 cut-off is not a probability threshold. This also
# matches how pred0 is computed for the baseline model.
data$pred <- as.numeric(
  predict(model, newx = x_mat, s = "lambda.min", type = "response")
)
table(data$Species, data$pred > 0.5)


WinVector/CVRTSEncoder documentation built on June 7, 2019, 9:53 a.m.