# TestCases.R

# Tests to perform on the ciu package. The "testthat" package doesn't quite
# offer the right functionality for this kind of package, so this is the
# approach adopted for testing until a better way is found.
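#
# Source this file and run test.all(), or any individual test.xxx() function
# interactively. The tests print plots and textual explanations rather than
# asserting expected values.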

require(ggplot2)
require(MASS)
require(caret)
require(ciu)
require(randomForest)
require(AmesHousing)
require(readr)
require(gbm)
require(data.table)

# # Install required packages if they are not installed
# if (!require("gbm")) install.packages("gbm")
# if (!require("data.table")) install.packages("data.table")
# if (!require("readr")) install.packages("readr")

# Axis-label formatter for ggplot scales; e.g. scaleFUN(1/3) returns "0.33".
scaleFUN <- function(x) as.character(round(x, digits = 2))

# Simple weighted sum
test.ws <- function() {
  # Grid over the input space (not used by the CIU calls below).
  x <- y <- seq(0, 1, 0.05)
  pm <- as.matrix(expand.grid(x, y))
  pm_df <- data.frame(x1=pm[,1], x2=pm[,2])
  c1 <- data.frame(x1=0.5, x2=0.5) # The instance to explain.
  linfunc <- function(m, inp) { 0.3*inp[,1] + 0.7*inp[,2] }
  z <- linfunc(NULL, pm)
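  # Note: ciu calls predict.function(model, inputs), so linfunc must accept a
  # model argument even though the "model" here is NULL.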
  ciu <- ciu.new(NULL, in.min.max.limits = matrix(c(0,0,1,1), ncol=2), abs.min.max=matrix(c(0,1), ncol=2),
                 input.names=c("x1","x2"), output.names = c("y"),
                 predict.function = linfunc)
  p <- ciu$ggplot.col.ciu(c1) + labs(title="", x ="", y="CI", fill="CU"); print(p)
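  # use.influence=TRUE plots contextual influence (the phi below) instead of
  # CI/CU; in the ciu package this is CI*(CU - neutral.CU), with neutral.CU
  # defaulting to 0.5.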
  p <- ciu$ggplot.col.ciu(c1, use.influence=TRUE, low.color = "firebrick", high.color = "steelblue")
  p <- p + labs(title="", x ="", y = expression(phi)) + scale_y_continuous(labels=scaleFUN)
  print(p)
  print(ciu$explain(c1,1))
  print(ciu$explain(c1,2))
}
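
# For the weighted sum above, the expected CIU values can be derived
# analytically (a sanity check for the ciu$explain printouts): with
# y = 0.3*x1 + 0.7*x2 on [0,1]^2 and absolute output range [0,1], varying x1
# alone at c1 = (0.5, 0.5) gives y in [0.35, 0.65], so CI(x1) = 0.3 and
# CU(x1) = (0.5 - 0.35)/0.3 = 0.5; correspondingly CI(x2) = 0.7, CU(x2) = 0.5.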

# Iris data set, lda model.
test.iris.lda <- function() {
  iris_train <- iris[, 1:4]
  iris_lab <- iris$Species
  iris.lda <- lda(iris_train, iris_lab)
  instance <- iris[100,1:4]
  ciu <- ciu.new(iris.lda, Species~., iris)
  for ( i in seq_along(levels(iris$Species)) )
    ciu$barplot.ciu(instance, ind.output = i)
  ciu$plot.ciu.3D(instance, ind.inputs = c(1,2), ind.output = 2)
  ciu$plot.ciu.3D(instance, ind.inputs = c(3,4), ind.output = 2)
  p <- ciu$ggplot.col.ciu(instance, use.influence=TRUE); print(p)
  return(p)
}

# Boston with GBM
test.boston.gbm <- function() {
  kfoldcv <- trainControl(method="cv", number=10)
  gbm <- caret::train(medv ~ ., Boston, method="gbm", trControl=kfoldcv)
  instance <- Boston[370,1:13]
  ciu <- ciu.new(gbm, medv~., Boston)
  p <- ciu$ggplot.col.ciu(instance); print(p)
  p <- ciu$ggplot.col.ciu(instance, plot.mode = "overlap"); print(p)
  p <- ciu$ggplot.col.ciu(instance, cu.colours=NULL, plot.mode = "overlap"); print(p)
  ciu$barplot.ciu(instance, sort="CI", use.influence=TRUE)
  # See how lstat (col 13), rm (col 6) and crim (col 1) affect the output.
  oldpar <- par(no.readonly = TRUE)
  on.exit(par(oldpar))
  par(mfrow=c(1,3))
  ciu$plot.ciu(instance,13,main="BH: #370")
  ciu$plot.ciu(instance,6,main="BH: #370")
  ciu$plot.ciu(instance,1,main="BH: #370")
  print(ciu$ggplot.ciu(instance,13,main="BH: #370"))
  print(ciu$ggplot.ciu(instance,6,main="BH: #370",ylim=c(0,60)))
  print(ciu$ggplot.ciu(instance,1,main="BH: #370"))
}

# Heart disease with RF
test.heart.disease.rf <- function() {
  orig.heart.data <- heart.data <- read.csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data",
    header=FALSE, sep=",", na.strings='?')
  names(heart.data) <- c( "age", "sex", "cp", "trestbps", "chol","fbs", "restecg",
                          "thalach","exang", "oldpeak","slope", "ca", "thal", "num")
  heart.data$num[heart.data$num > 0] <- 1
  heart.data$num <- as.factor(heart.data$num)
  heart.data <- na.omit(heart.data)
  # Random Forest with caret.
  kfoldcv <- trainControl(method="cv", number=10)
  performance_metric <- "Accuracy"
  rf.heartdisease <- caret::train(num~., data=heart.data, method="rf",
                                  metric=performance_metric, trControl=kfoldcv,
                                  preProcess=c("center", "scale"))
  n.in <- ncol(heart.data) - 1
  instance <- heart.data[32,1:n.in]
  ciu <- ciu.new(rf.heartdisease, num~., heart.data, output.names=c("No Heart Disease","Heart Disease Present"))
  p <- ciu$ggplot.col.ciu(instance, c(1:n.in)); print(p)
  p <- ciu$ggplot.col.ciu(instance, c(1:n.in), plot.mode = "overlap"); print(p)
  p <- ciu$ggplot.col.ciu(instance, c(1:n.in), cu.colours=NULL, plot.mode = "overlap"); print(p)
  ciu$barplot.ciu(instance, ind.output=1, sort="CI")
  ciu$barplot.ciu(instance, ind.output=2, sort="CI")
  ciu$pie.ciu(instance, ind.output=1, sort="CI")
  ciu$pie.ciu(instance, ind.output=2, sort="CI")
  for ( i in 1:n.in ) {
    ciu$plot.ciu(instance, ind.input=i, ind.output=2)
  }
}

# UCI Cars with RF
test.cars.UCI.rf <- function() {
  car.data <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data", header=FALSE)
  colnames(car.data) <- c("buying","maint","doors","persons","lug_boot","safety","result")
  price <- c(1,2)
  comfort <- c(3,4,5)
  tech <- c(comfort, 6)
  car <- c(price, tech)
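  # Each vocabulary entry maps a named concept to a vector of input column
  # indices; intermediate concepts can be combined (TECH = COMFORT plus
  # safety) and CAR covers all six inputs.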
  voc <- list("PRICE"=price, "COMFORT"=comfort, "TECH"=tech, "CAR"=car,
              "Buying Price"=c(1), "Maintenance Cost"=c(2), "#Doors"=c(3),
              "Person Capacity"=c(4), "#Luggage in Boot"=c(5), "Safety"=c(6))
  n.in <- ncol(car.data) - 1
  n.out <- 1
  # Random Forest with caret
  kfoldcv <- caret::trainControl(method="cv", number=10)
  performance_metric <- "Accuracy"
  rf.carsUCI <- caret::train(result~., data=car.data, method="rf", metric=performance_metric, trControl=kfoldcv)
  inst.ind <- 1098 # A very good one.
  instance <- car.data[inst.ind,1:6]
  ciu <- ciu.new(rf.carsUCI, formula=result~., data=car.data, vocabulary=voc)
  # Global plot, inputs
  p <- ciu$ggplot.col.ciu(instance, c(1:6)); print(p)
  # Plot according to PRICE, TECH
  for ( i in 1:4 )
    ciu$barplot.ciu(instance, ind.output=i, sort="CI", concepts.to.explain=c("PRICE", "TECH"))
  p <- ciu$ggplot.col.ciu(instance, concepts.to.explain=c("PRICE", "TECH")); print(p)
  # Why is PRICE good?
  p <- ciu$ggplot.col.ciu(instance, ind.inputs = voc$PRICE, target.concept = "PRICE"); print(p)
  # Why is TECH good?
  p <- ciu$ggplot.col.ciu(instance, ind.inputs = voc$TECH, target.concept = "TECH"); print(p)
  # What happens if "safety" is "med" instead?
  instance$safety <- "med"
  p <- ciu$ggplot.col.ciu(instance, ind.inputs = voc$TECH, target.concept = "TECH"); print(p)
  # What happens if "persons" is "more" instead?
  instance <- car.data[inst.ind,1:6]
  instance$persons <- "more"
  p <- ciu$ggplot.col.ciu(instance, ind.inputs = voc$TECH, target.concept = "TECH"); print(p)
}

# Diamonds with GBM. This is a mixed discrete/continuous input case. GBM takes
# a long time to train here!
test.diamonds.gbm <- function() {
  kfoldcv <- caret::trainControl(method="cv", number=10)
  diamonds.gbm <- caret::train(price~., diamonds, method="gbm", trControl=kfoldcv)
  inst.ind <- 27750 # An expensive one!
  instance <- diamonds[inst.ind,-7]
  ciu <- ciu.new(diamonds.gbm, price~., diamonds)
  oldpar <- par(no.readonly = TRUE)
  on.exit(par(oldpar))
  par(mfrow=c(1,1))
  ciu$barplot.ciu(instance, sort="CI")
  p <- ciu$ggplot.col.ciu(instance); print(p)
}

# Titanic data set. Mixed discrete/continuous input case.
# Also tests plot.ciu with discrete inputs.
test.titanic.rf <- function() {
  require("DALEX")
  titanic_train <- titanic[,c("survived", "class", "gender", "age", "sibsp", "parch", "fare", "embarked")]
  titanic_train$survived <- factor(titanic_train$survived)
  titanic_train$gender <- factor(titanic_train$gender)
  titanic_train$embarked <- factor(titanic_train$embarked)
  titanic_train <- na.omit(titanic_train)

  # Using Random Forest here. Results are quite poor, and training takes a
  # while (15 seconds or so).
  kfoldcv <- caret::trainControl(method="cv", number=10)
  model_rf <- caret::train(survived ~ ., titanic_train, method="rf", trControl=kfoldcv)

  # Example instance
  new_passenger <- data.frame(
    class = factor("1st", levels = c("1st", "2nd", "3rd", "deck crew", "engineering crew", "restaurant staff", "victualling crew")),
    gender = factor("male", levels = c("female", "male")),
    age = 8,
    sibsp = 0,
    parch = 0,
    fare = 72,
    embarked = factor("Cherbourg", levels = c("Belfast", "Cherbourg", "Queenstown", "Southampton"))
  )
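  # The factor levels must match those of titanic_train for predict() to work.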
  ciu <- ciu.new(model_rf, survived~., titanic_train)
  p <- ciu$ggplot.col.ciu(new_passenger); print(p)
  p <- ciu$ggplot.col.ciu(new_passenger, plot.mode = "overlap"); print(p)
  for ( i in 1:ncol(new_passenger) )
    ciu$plot.ciu(new_passenger,i,1)
  for ( i in 1:ncol(new_passenger) )
    ciu$plot.ciu(new_passenger,i,2)
  for ( i in 1:ncol(new_passenger) )
    print(ciu$ggplot.ciu(new_passenger,i,1))
  for ( i in 1:ncol(new_passenger) )
    print(ciu$ggplot.ciu(new_passenger,i,2))

  # Textual explanations (new_passenger contains no "survived" column, so it
  # is passed as-is).
  cat(ciu$textual(new_passenger, use.text.effects = TRUE, ind.output = 2))
  # Customized limits for CI.
  cat(ciu$textual(new_passenger, use.text.effects = TRUE, ind.output = 2,
                  CI.voc = data.frame(limits=c(0.05,0.15,0.25,0.5,1.0),
                                      texts=c("not important","little important", "important","very important",
                                              "extremely important"))))

  # Intermediate concepts
  # Indices refer to the columns of new_passenger:
  # 1=class, 2=gender, 3=age, 4=sibsp, 5=parch, 6=fare, 7=embarked.
  wealth <- c(1,6); family <- c(4,5); gender <- c(2); age <- c(3); embarked <- c(7)
  Titanic.voc <- list("WEALTH"=wealth, "FAMILY"=family, "Gender"=gender,
                      "Age"=age, "Embarkment port"=embarked)
  ciu <- ciu.new(model_rf, survived~., titanic_train, vocabulary = Titanic.voc)
  # Need to use meta.explain here in order to guarantee same CIU values for
  # intermediate concepts when moving from one level to the other.
  meta.top <- ciu$meta.explain(new_passenger, concepts.to.explain=names(Titanic.voc), n.samples = 1000)
  cat(ciu$textual(new_passenger, use.text.effects = TRUE, ind.output = 2, ciu.meta = meta.top))

  # Explain intermediate concept utility, using input features (could also
  # be using other intermediate concepts).
  cat(ciu$textual(new_passenger, use.text.effects = TRUE, ind.output = 2, ind.inputs = Titanic.voc$FAMILY,
                  target.concept = "FAMILY", target.ciu = meta.top$ciuvals[["FAMILY"]], n.samples = 100))
  cat(ciu$textual(new_passenger, use.text.effects = TRUE, ind.output = 2, ind.inputs = Titanic.voc$WEALTH,
                  target.concept = "WEALTH", target.ciu = meta.top$ciuvals[["WEALTH"]], n.samples = 100))
}

# Adult dataset with Random Forest (not caret).
# This is a mixed discrete/continuous input case. For some reason caret RF
# takes a very long time to train, so we use the randomForest library instead,
# which for the moment requires providing a predict.function.
# This data set also requires some pre-processing.
test.adult.rf <- function() {
  adult <- read.table('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
                      sep = ',', fill = FALSE, strip.white = TRUE, na.strings = c("?","NA"))
  colnames(adult) <- c('age', 'workclass', 'fnlwgt', 'education',
                       'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex',
                       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income')

  # For simplicity of this analysis, the following variables are removed:
  # capital_gain, capital_loss, fnlwgt, education_num and relationship.
  adult$capital_gain<-NULL
  adult$capital_loss<-NULL
  adult$fnlwgt<-NULL
  adult$education_num<-NULL
  adult$relationship<-NULL

  # Remove all rows with NAs
  adult1 <- stats::na.omit(adult)

  # Convert "chr" type columns into factor to avoid problems with RandomForest
  # classifier later.
  adult1$workclass <- factor(adult1$workclass)
  adult1$education <- factor(adult1$education)
  adult1$marital_status <- factor(adult1$marital_status)
  adult1$occupation <- factor(adult1$occupation)
  adult1$race <- factor(adult1$race)
  adult1$sex <- factor(adult1$sex)
  adult1$native_country <- factor(adult1$native_country)
  adult1$income <- factor(adult1$income)

  # Train with Random Forest, then do CIU.
  rf_adult_model<-randomForest(income~.,data = adult1)
  n.in <- ncol(adult1) - 1
  instance <- adult1[10,1:n.in]
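  # ciu needs the model's predictions as a numeric matrix; randomForest gives
  # class probabilities with type = "prob", which we wrap accordingly.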
  predict.function <- function(model, inputs) { as.matrix((predict(model, inputs, type = "prob"))) }
  ciu <- ciu.new(rf_adult_model, income~., adult1, predict.function=predict.function)
  p <- ciu$ggplot.col.ciu(instance, c(1:n.in)); print(p)

  # Textual explanation
  cat(ciu$textual(instance[,1:n.in], use.text.effects = TRUE, ind.output = 2))

  # Plot the effect of every input on output 2 ('>50K').
  oldpar <- par(cex.lab=1.5)
  for ( i in 1:n.in )
    ciu$plot.ciu(instance, i, 2, n.points = 200, main = "", ylab="")
  par(oldpar)
}

# Ames Housing dataset with Gradient Boosting.
# This is a regression task.
test.ames.housing <- function(caret.model="gbm") {
  data <- data.frame(make_ames())

  # Training
  set.seed(25) # Fixed seed so that the data split (and instance selections below) are reproducible.
  target <- 'Sale_Price'
  trainIdx <- createDataPartition(data[,target], p=0.8, list=FALSE)
  trainData <- data[trainIdx,]
  testData <- data[-trainIdx,]
  # Train and show some performance indicators.
  kfoldcv <- trainControl(method="cv", number=10)
  exec.time <- system.time(
    Ames.gbm.caret <<- train(Sale_Price~., trainData, method=caret.model, trControl=kfoldcv))
  # Training set performance
  res <- predict(Ames.gbm.caret, newdata=trainData)
  train.err <- RMSE(trainData$Sale_Price, res)
  # Test set performance
  res <- predict(Ames.gbm.caret, newdata=testData)
  test.err <- RMSE(testData$Sale_Price, res)
  # Display useful information
  cat("Execution times:", exec.time, "\n")
  cat("Training set RMSE:", train.err, "\n")
  cat("Test set RMSE:", test.err, "\n")

  # Most expensive instances
  expensive <- which(testData$Sale_Price>500000)
  # Cheapest instances
  cheap <- which(testData$Sale_Price<50000)

  # Explanations
  ciu.gbm <- ciu.new(Ames.gbm.caret, Sale_Price~., trainData)
  for ( inst.ind in c(expensive,cheap) ) {
    instance <- subset(testData[inst.ind,], select=-Sale_Price)
    print(ciu.gbm$ggplot.col.ciu(instance, sort="CI", plot.mode="overlap"))
    print(ciu.gbm$ggplot.ciu(instance,46))
    print(ciu.gbm$ggplot.ciu(instance,30))
  }

  # Vocabularies
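  # NOTE: the hard-coded column indices below assume the column order produced
  # by make_ames(); the "Above ground living area" entry shows the more robust
  # name-based lookup.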
  Ames.voc <- list(
    "Garage"=c(58,59,60,61,62,63),
    "Basement"=c(30,31,33,34,35,36,37,38,47,48),
    "Lot"=c(3,4,7,8,9,10,11),
    "Access"=c(13,14),
    "House type"=c(1,15,16,21),
    "House aesthetics"=c(22,23,24,25,26),
    "House condition"=c(17,18,19,20,27,28),
    "First floor surface"=c(43),
    "Above ground living area"=which(names(data)=="Gr_Liv_Area"))
  Ames.voc_ciu.gbm <- ciu.new(Ames.gbm.caret, Sale_Price~., trainData, vocabulary = Ames.voc)
  instance <- subset(testData[expensive[1],], select=-Sale_Price)
  Ames.voc_ciu.meta <- Ames.voc_ciu.gbm$meta.explain(instance)

  # Basic textual explanations.
  cat(Ames.voc_ciu.gbm$textual(instance, use.text.effects = TRUE,
                               ciu.meta=Ames.voc_ciu.meta,
                               CI.voc = data.frame(limits=c(0.015,0.04,0.09,0.1,1.0),
                                                   texts=c("not important","little important", "important","very important",
                                                           "extremely important")))
  )

  # Intermediate concepts
  meta.top <- Ames.voc_ciu.gbm$meta.explain(instance, concepts.to.explain=names(Ames.voc), n.samples = 1000)
  cat(Ames.voc_ciu.gbm$textual(instance, use.text.effects = TRUE, ciu.meta = meta.top,
                               CI.voc = data.frame(limits=c(0.02,0.05,0.1,0.2,1.0),
                                                   texts=c("not important","little important", "important","very important",
                                                           "extremely important"))))
  cat(Ames.voc_ciu.gbm$textual(instance, use.text.effects = TRUE, ind.inputs = Ames.voc$Basement,
                               target.concept = "Basement", target.ciu = meta.top$ciuvals[["Basement"]], n.samples = 100))
  cat(Ames.voc_ciu.gbm$textual(instance, use.text.effects = TRUE, ind.inputs = Ames.voc$`House type`,
                               target.concept = "House type", target.ciu = meta.top$ciuvals[["House type"]], n.samples = 100))
  cat(Ames.voc_ciu.gbm$textual(instance, use.text.effects = TRUE, ind.inputs = Ames.voc$Garage,
                               target.concept = "Garage", target.ciu = meta.top$ciuvals[["Garage"]], n.samples = 100))

  # Same with barplots
  p <- Ames.voc_ciu.gbm$ggplot.col.ciu(instance, concepts.to.explain=names(Ames.voc),
                                       plot.mode = "overlap"); print(p)
  p <- Ames.voc_ciu.gbm$ggplot.col.ciu(instance, ind.inputs = Ames.voc$Basement, target.concept = "Basement", plot.mode = "overlap"); print(p)
  p <- Ames.voc_ciu.gbm$ggplot.col.ciu(instance, ind.inputs = Ames.voc$`House type`, target.concept = "House type", plot.mode = "overlap"); print(p)
  p <- Ames.voc_ciu.gbm$ggplot.col.ciu(instance, ind.inputs = Ames.voc$Garage, target.concept = "Garage", plot.mode = "overlap"); print(p)

  # Contextual influence barplots
  p <- Ames.voc_ciu.gbm$ggplot.col.ciu(instance, concepts.to.explain=names(Ames.voc),
                                       use.influence = TRUE)
  print(p)
  p <- Ames.voc_ciu.gbm$ggplot.col.ciu(instance, ind.inputs = Ames.voc$Basement, target.concept = "Basement",
                                       use.influence = TRUE)
  print(p)
  p <- Ames.voc_ciu.gbm$ggplot.col.ciu(instance, ind.inputs = Ames.voc$Garage, target.concept = "Garage",
                                       use.influence = TRUE)
  print(p)
}

# Read Främling car data from CSV and do the needed transformations.
read.cars.framling <- function(file="https://raw.githubusercontent.com/KaryFramling/ciu/master/CarsFramling.csv") {
  CarsFramling <- read.csv(file)
  CarsFramling <- CarsFramling[,-1]
  CarsFramling[CarsFramling$Transmission == 4, 'Transmission'] <- "manual"
  CarsFramling[CarsFramling$Transmission == 2, 'Transmission'] <- "automatic"
  CarsFramling[CarsFramling$Wheels.drive == 1, 'Wheels.drive'] <- "front"
  CarsFramling[CarsFramling$Wheels.drive == 2, 'Wheels.drive'] <- "rear"
  CarsFramling[CarsFramling$Wheels.drive == 3, 'Wheels.drive'] <- "four-wheel"
  CarsFramling[,'Transmission'] <- factor(CarsFramling[,'Transmission'])
  CarsFramling[,'Wheels.drive'] <- factor(CarsFramling[,'Wheels.drive'])
  CarsFramling[,'Aesthetic.value'] <- factor(CarsFramling[,'Aesthetic.value'])
  CarsFramling[,'Brand.value'] <- factor(CarsFramling[,'Brand.value'])
  return(CarsFramling)
}
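# Example: CarsFramling <- read.cars.framling() # Downloads and recodes the CSV.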

# Cars Framling data set with Gradient Boosting (default).
# This is a regression task.
test.cars.framling <- function(caret.model="gbm") {
  CarsFramling <- read.cars.framling()
  data <- CarsFramling[,-1]
  rownames(data) <- CarsFramling[,1]
  # Training
  target <- 'Preference.value'
  trainIdx <- createDataPartition(data[,target], p=0.8, list=FALSE)
  trainData <- data[trainIdx,]
  testData <- data[-trainIdx,]
  # Train and show some performance indicators.
  kfoldcv <- trainControl(method="cv", number=10)
  exec.time <- system.time(
    cars.framling.model <<- train(Preference.value~., trainData, method=caret.model, trControl=kfoldcv))
  # Training set performance
  res <- predict(cars.framling.model, newdata=trainData)
  train.err <- RMSE(trainData$`Preference.value`, res)
  # Test set performance
  res <- predict(cars.framling.model, newdata=testData)
  test.err <- RMSE(testData$`Preference.value`, res)
  # Display useful information
  cat("Execution times:", exec.time, "\n")
  cat("Training set RMSE:", train.err, "\n")
  cat("Test set RMSE:", test.err, "\n")

  # Most preferred instances
  best <- which(testData$`Preference.value`>70)
  # Least preferred instances
  worst <- which(testData$`Preference.value`<55)

  # Vocabulary for Intermediate concepts
  performance <- c(2,7,8,9); driving.comfort <- c(3,13); price <- c(1); size <- c(5,6)
  fuel.consumption <- c(10); status <- c(11,12)
  Cars.Framling.voc <- list("Performance"=performance, "Driving comfort"=driving.comfort,
                            "Price"=price, "Size"=size,
                            "Fuel consumption"=fuel.consumption, "Status"=status)

  # Explanations
  ciu <- ciu.new(cars.framling.model, `Preference.value`~., trainData, vocabulary = Cars.Framling.voc)
  for ( inst.ind in c(best,worst) ) {
    instance <- subset(testData[inst.ind,], select=-`Preference.value`)
    print(ciu$ggplot.col.ciu(instance, sort="CI", plot.mode = "overlap"))
    print(ciu$ggplot.ciu(instance,1)) # which(names(trainData)=="Price")
    print(ciu$ggplot.ciu(instance,2))
    # Intermediate Concept explanations
    meta.top <- ciu$meta.explain(instance, concepts.to.explain=names(Cars.Framling.voc), n.samples = 1000)
    cat(ciu$textual(instance, use.text.effects = TRUE, ind.output = 1, ciu.meta = meta.top))
    cat(ciu$textual(instance, use.text.effects = TRUE, ind.inputs = Cars.Framling.voc$Performance,
                    target.concept = "Performance", target.ciu = meta.top$ciuvals[["Performance"]], n.samples = 100))
  }
}

# Cars Framling data set with Gradient Boosting (default).
# This is a regression task for predicting car price.
test.cars.framling.price <- function(caret.model="gbm") {
  CarsFramling <- read.cars.framling()
  data <- CarsFramling[,-c(1,15)]
  rownames(data) <- CarsFramling[,1]
  # Training
  target <- 'Price'
  trainIdx <- createDataPartition(data[,target], p=0.8, list=FALSE)
  trainData <- data[trainIdx,]
  testData <- data[-trainIdx,]
  # Train and show some performance indicators.
  kfoldcv <- trainControl(method="cv", number=10)
  exec.time <- system.time(
    cars.framling.model <<- train(Price~., trainData, method=caret.model, trControl=kfoldcv))
  # Training set performance
  res <- predict(cars.framling.model, newdata=trainData)
  train.err <- RMSE(trainData$Price, res)
  # Test set performance
  res <- predict(cars.framling.model, newdata=testData)
  test.err <- RMSE(testData$Price, res)
  # Display useful information
  cat("Execution times:", exec.time, "\n")
  cat("Training set RMSE:", train.err, "\n")
  cat("Test set RMSE:", test.err, "\n")

  # Most expensive instances
  expensive <- which(testData$Price>250000)
  # Cheapest instances
  cheap <- which(testData$Price<120000)

  # Explanations
  ciu <- ciu.new(cars.framling.model, Price~., trainData)
  for ( inst.ind in c(expensive,cheap) ) {
    instance <- subset(testData[inst.ind,], select=-Price)
    print(ciu$ggplot.col.ciu(instance, sort="CI", plot.mode = "overlap"))
    print(ciu$ggplot.ciu(instance,1)) # Power
  }
}

test.california.housing <- function(use.averages=TRUE) {

  # Download the California housing dataset
  url <- "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
  california_data <- fread(url)

  # Rename columns for compatibility (no spaces or special characters)
  colnames(california_data) <- make.names(colnames(california_data))

  # Drop the data.table class; ciu and caret expect a plain data.frame.
  california_data <- as.data.frame(california_data)

  # Convert 'ocean_proximity' to a factor
  california_data$ocean_proximity <- as.factor(california_data$ocean_proximity)

  # The sklearn version of this dataset (and consequently SHAP examples) uses
  # per-household averages rather than totals.
  if ( use.averages ) {
    california_data$ave_rooms <- california_data$total_rooms/california_data$households
    california_data$ave_occupation <- california_data$population/california_data$households
    california_data$ave_bedrooms <- california_data$total_bedrooms/california_data$households
    california_data <- subset(california_data, select= -c(total_rooms, population, total_bedrooms))
  }

  # Split the data into training and testing sets
  set.seed(25)
  target <- 'median_house_value'
  trainIdx <- createDataPartition(california_data[,target], p=0.8, list=FALSE)
  trainData <- california_data[trainIdx,]
  testData <- california_data[-trainIdx,]

  # Train a GBM model
  gbm_model <- gbm(
    median_house_value ~ .,  # formula: median house value as the response variable
    data = trainData,  # training data
    distribution = "gaussian",  # for regression
    n.trees = 1000,  # number of trees
    interaction.depth = 3,  # tree depth
    shrinkage = 0.01,  # learning rate
    cv.folds = 5,  # 5-fold cross-validation
    verbose = FALSE
  )

  # Check the best number of trees based on cross-validation
  best_iter <- gbm.perf(gbm_model, method = "cv")

  # Summarize the model
  #summary(gbm_model)

  # Predict on the test set
  predictions <- predict(gbm_model, newdata = testData, n.trees = best_iter)

  # Evaluate model performance using RMSE
  rmse <- sqrt(mean((testData$median_house_value - predictions)^2))
  cat("Test RMSE:", rmse, "\n")

  # CIU
  ciu.california.gbm <- ciu.new(gbm_model, median_house_value~., trainData)
  instance <- subset(testData[1,], select=-median_house_value)
  print(ciu.california.gbm$ggplot.col.ciu(instance, sort="CI", plot.mode="overlap"))
  print(ciu.california.gbm$ggplot.col.ciu(instance, use.influence=TRUE)) #, low.color = "firebrick", high.color = "steelblue"
  print(ciu.california.gbm$ggplot.ciu(instance,1))
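  # illustrate.CIU=TRUE annotates the input/output plot to illustrate how CI
  # and CU are obtained (the current output value and its min/max over the
  # varied input).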
  print(ciu.california.gbm$ggplot.ciu(instance,6,illustrate.CIU=TRUE))
}

# par(mai=c(0.8,1.2,0.4,0.2)) # Good margins for barplots so that labels fit.

test.all <- function() {
  test.ws()
  test.iris.lda()
  test.boston.gbm()
  test.heart.disease.rf()
  test.cars.UCI.rf() # Takes about 15 seconds for RF to train.
  test.diamonds.gbm() # Takes some 2-3 minutes to train, but GBM seems to be by far the best model here.
  test.titanic.rf() # Takes maybe half a minute.
  test.adult.rf() # Takes about half a minute.
  # Takes about 40 s with GBM. RF takes about 12 minutes and achieves lower
  # training-set error but similar test-set error. "lm" is very quick and has
  # an error similar to the more complex models, but its results don't really
  # seem to make much sense.
  test.ames.housing()
  test.cars.framling()
  test.cars.framling.price()
  test.california.housing()
}