`glmtree`: logistic regression trees for efficient segmentation

Segmentation

Three clusters, one predictive law

Simulation

library(glmtree)
data = generateData(n = 1000, scenario = "no tree", visualize = TRUE)
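
A quick structural check of the simulated frame (a sketch: in the "no tree" scenario the frame holds at least the two continuous predictors x1 and x2 and the binary response y used below):

str(data)
table(data$y)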

# Keep 20% of the observations for training; the remaining 80% form the test set
int_train = sample.int(n = 1000, size = 0.2*1000)

test = data[-int_train,]
data = data[int_train,]

PCA

library(FactoMineR)
mixed = PCA(data[,c("x1","x2")])

data$pca1 = predict(mixed, data)$coord[,1]
data$pca2 = predict(mixed, data)$coord[,2]
test$pca1 = predict(mixed, test)$coord[,1]
test$pca2 = predict(mixed, test)$coord[,2]

# Ad-hoc three-way segmentation on the first principal coordinate
data$cluster = ifelse(data$pca1 > 1, 1, ifelse(data$pca1 > 0, 2, 3))
test$cluster = ifelse(test$pca1 > 1, 1, ifelse(test$pca1 > 0, 2, 3))

pred = matrix(0, nrow = nrow(test), ncol = 1)

# One logistic regression per PCA-based cluster, fitted on the training set
# and used to score the matching test observations
for (j in 1:3) {
  modele = glm(y ~ x1 + x2, data = data[data$cluster==j,], family = binomial(link = "logit"))
  pred[test$cluster==j] = predict(modele, test[test$cluster==j,], type = "response")
}

normalizedGini(test$y,pred)
plot(mixed, choix = 'ind', label = "none")
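
The normalized Gini used throughout is a rank-based measure; for a binary response it is commonly related to the AUC by Gini = 2*AUC - 1 (up to ties). A quick cross-check (a sketch, assuming the pROC package is available):

if (require(pROC, quietly = TRUE)) {
  # should be close to the value returned by normalizedGini(test$y, pred)
  2 * as.numeric(pROC::auc(test$y, as.numeric(pred))) - 1
}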

MOB (model-based recursive partitioning)

if (require(partykit, quietly = TRUE)) {
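  # In partykit's MOB formula, the variables left of "|" enter the leaf-level
  # logistic regressions; the variables right of "|" are the candidate splitting variables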
  mob_data = partykit::glmtree(formula = y ~ x1 + x2 | x1 + x2, data = data, family = binomial)
  plot(mob_data)
  normalizedGini(test$y, predict(mob_data,test))
}

glmtree approach

tree = glmtree::glmtree(x = data[,c("x1", "x2")], y = data$y)

plot(unlist(tree@performance$criterionEvolution), type="l")

# Maximum a posteriori leaf assignment from the fitted partitioning tree
data$c_map <- factor(apply(predict(tree@best.tree$tree, data, type = "prob"), 1, function(p) names(which.max(p))))
test$c_map <- factor(apply(predict(tree@best.tree$tree, test, type = "prob"), 1, function(p) names(which.max(p))))

table(data$c_map)

plot(data[,1],data[,2],pch=2+data[,3],col=as.numeric(data$c_map),xlab="First coordinate",ylab="Second coordinate")

plot(tree@best.tree$tree)

pred = matrix(0, nrow = nrow(test), ncol = 1)

# Refit one logistic regression per glmtree leaf and score the test set
for (j in levels(data$c_map)) {
  modele = glm(y ~ x1 + x2, data = data[data$c_map==j,], family = binomial(link = "logit"))
  pred[test$c_map==j] = predict(modele, test[test$c_map==j,], type = "response")
}

normalizedGini(test$y,pred)
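
Equivalently, instead of refitting a logistic regression in each leaf, the per-leaf models stored in the fitted object can be reused directly (a sketch, mirroring the code used in the second scenario below):

pred_stored = matrix(0, nrow = nrow(test), ncol = 1)
for (j in 1:nlevels(data$c_map)) {
  leaf = levels(data$c_map)[j]
  pred_stored[test$c_map == leaf] = predict(tree@best.tree$glms[[j]], test[test$c_map == leaf,], type = "response")
}
normalizedGini(test$y, pred_stored)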

One "cluster", three predictive laws

Simulation

data = generateData(n = 1000, scenario = "tree", visualize = TRUE)
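
The "tree" scenario adds a third predictor x3 and the true segment label c, which is compared with the recovered segmentation further below; a quick structural check (sketch):

str(data)
table(data$c)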

int_train = sample.int(n = 1000, size = 0.2*1000)

test = data[-int_train,]
data = data[int_train,]

PCA

mixed = FAMD(data[,c("x1","x2","x3")])

# Two-group segmentation on the sign of the first FAMD dimension,
# computed separately on the training and test sets
dim_famd = predict(mixed, data)$coord[,"Dim 1"] < 0
dim_famd_test = predict(mixed, test)$coord[,"Dim 1"] < 0

pred = matrix(0, nrow = nrow(test), ncol = 1)

for (j in c(TRUE,FALSE)) {
  modele = glm(y ~ x1 + x2 + x3, data = data[dim_famd==j,], family = binomial(link = "logit"))
  pred[dim_famd_test==j] = predict(modele, test[dim_famd_test==j,], type = "response")
}

normalizedGini(test$y,pred)

MOB

if (require(partykit, quietly = TRUE)) {
  mob_data = partykit::glmtree(formula = y ~ x1 + x2 + x3 | x1 + x2 + x3, data = data, family = binomial)
  plot(mob_data)
  normalizedGini(test$y, predict(mob_data,test))
}

glmtree approach

tree = glmtree::glmtree(x = data[,c("x1", "x2", "x3")], y = data$y)

plot(unlist(tree@performance$criterionEvolution), type="l")

data$c_map <- factor(apply(predict(tree@best.tree$tree,data,type="prob"),1,function(p) names(which.max(p))))
test$c_map <- factor(apply(predict(tree@best.tree$tree,test,type="prob"),1,function(p) names(which.max(p))))

table(data$c,data$c_map)

plot(data[,1],data[,2],pch=2+data[,3],col=as.numeric(data$c_map),xlab="First coordinate",ylab="Second coordinate")

plot(tree@best.tree$tree)

pred = matrix(0, nrow = nrow(test), ncol = 1)

# Score the test set with the per-leaf logistic regressions stored in the fitted object
for (j in 1:nlevels(data$c_map)) {
  leaf = levels(data$c_map)[j]
  pred[test$c_map==leaf] = predict(tree@best.tree$glms[[j]], test[test$c_map==leaf,], type = "response")
}

normalizedGini(test$y,pred)
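
The same comparison between true and recovered segments can be made on the test set (assuming, as above, that the split retains the simulated label c):

table(test$c, test$c_map)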

