DriveML: Machine Learning Projects

library(rmarkdown)
library(DriveML)
library(knitr)
library(scales)
library(ggplot2)

# Model object that every later section of this report reads from.
# NOTE(review): `params` is presumably the R Markdown parameter list supplied
# at render time, with the model passed as `mlobject` - confirm against the
# caller that renders this report.
modelobject <- params$mlobject

Machine Learning Classification Model

Automated Machine Learning (DriveML) mainly refers to automated methods for model selection and hyper-parameter optimization of various algorithms, such as random forests and gradient boosting.

Summary of trained data and model function

Dimensions of the dataset and other information

# Data-set summaries stored on the model object (train / test / score).
# `t3` may be NULL; the scoring section checks for that before using it.
t1 <- modelobject$datasummary$train
t2 <- modelobject$datasummary$test
t3 <- modelobject$datasummary$score

# Build a parameter/input table from the recorded call.
# Element 1 of the call is the function itself, so only elements 2..n are
# tabulated. `seq_along(t4)[-1]` is empty when there are no arguments, which
# avoids the `2:1` trap of `2:length(t4)`.
t4 <- modelobject$call
mdata <- do.call(
  rbind,
  lapply(seq_along(t4)[-1], function(j) {
    fnam <- as.character(names(t4[j]))
    ivalue <- as.character(t4[[j]])
    # Arguments that deparse to nothing (e.g. NULL) are shown as "NULL".
    if (length(ivalue) == 0) ivalue <- "NULL"
    data.frame(parameter = fnam, input = ivalue)
  })
)

# Lookup table of supported algorithms and their descriptions.
modename <- names(modelobject$trainedModels)
manme <- data.frame(
  model = c("glmnet", "logreg", "randomForest", "ranger", "xgboost", "rpart"),
  descriptions = c(
    "Regularised regression  from glmnet R package",
    "logistic regression from stats R package",
    "Random forests using the randomForest R package",
    "Random forests using the ranger R package",
    "Gradient boosting using xgboost R package",
    "decision tree classification using rpart R package"
  )
)

# Keep only the algorithms that were actually trained.
# `subset(manme, model = modename)` does not filter: `model =` is swallowed by
# `...`, so every row was returned regardless of `modename`.
drmodel <- manme[manme$model %in% modename, , drop = FALSE]
# Reset row names so kable() does not print the original row indices.
rownames(drmodel) <- NULL

## Section 2: model comparison table (row names dropped for display)
result <- modelobject$results
rownames(result) <- NULL

## ROC plot flags: one logical per supported model, FALSE unless trained.
## NOTE(review): presumably consumed by chunk options that are not visible
## in this view - confirm.
exe_modl <- names(modelobject$trainedModels)
pl_glmnet <- FALSE
pl_logreg <- FALSE
pl_randomForest <- FALSE
pl_ranger <- FALSE
pl_xgboost <- FALSE
pl_rpart <- FALSE

for (j in exe_modl) assign(paste0("pl_", j), TRUE)

## Variable importance flags: same scheme, restricted to the known models.
vi_randomForest <- FALSE
vi_ranger <- FALSE
vi_xgboost <- FALSE
vi_rpart <- FALSE
vi_logreg <- FALSE
vi_glmnet <- FALSE

for (j in exe_modl) {
  if (j %in% c("randomForest", "ranger", "xgboost",
               "glmnet", "logreg", "rpart")) {
    assign(paste0("vi_", j), TRUE)
  }
}

Training data set

# Training summary: drop rows whose Value is 0, renumber, and render.
t1 <- t1[t1$Value != 0, ]
rownames(t1) <- NULL
knitr::kable(t1)

Validation data set

# Validation summary: drop rows whose Value is 0, renumber, and render.
t2 <- t2[t2$Value != 0, ]
rownames(t2) <- NULL
knitr::kable(t2)

Scoring data set

# Scoring summary is optional: print a notice when no scoring data exists,
# otherwise drop rows whose Value is 0, renumber, and render the table.
if (is.null(t3)) {
  cat("No score data set")
} else {
  t3 <- t3[t3$Value != 0, ]
  rownames(t3) <- NULL
  knitr::kable(t3)
}

DriveML Model selected parameters

  # Render the parameter/input table built from the recorded model call.
  knitr::kable(mdata)

List of machine learning classification algorithms used

  # Render the model/description lookup table.
  knitr::kable(drmodel)

Model Performance comparison

Summary table

The table reports model fitting time and performance metrics such as AUC, accuracy, precision, recall, and F1 score.

  # Render the model comparison results table.
  knitr::kable(result)

ROC curve

# ROC curves per model. Each bare `...$TrainROC` / `...$TestROC` expression
# relies on top-level auto-printing to emit the stored plot object.
# NOTE(review): assuming `trainedModels` is a plain list, a model that was
# not trained yields NULL here - confirm whether these statements are meant
# to be gated by the `pl_*` flags.

# glmnet ROC output is disabled in the original template.
#masterModel <- modelobject$trainedModels[["glmnet"]]
#masterModel$modelPlots$TrainROC
#masterModel$modelPlots$TestROC

# Logistic regression
masterModel <- modelobject$trainedModels[["logreg"]]
masterModel$modelPlots$TrainROC
masterModel$modelPlots$TestROC
# Random forest (randomForest)
masterModel <- modelobject$trainedModels[["randomForest"]]
masterModel$modelPlots$TrainROC
masterModel$modelPlots$TestROC
# Random forest (ranger)
masterModel <- modelobject$trainedModels[["ranger"]]
masterModel$modelPlots$TrainROC
masterModel$modelPlots$TestROC
# Gradient boosting (xgboost)
masterModel <- modelobject$trainedModels[["xgboost"]]
masterModel$modelPlots$TrainROC
masterModel$modelPlots$TestROC
# Decision tree (rpart)
masterModel <- modelobject$trainedModels[["rpart"]]
masterModel$modelPlots$TrainROC
masterModel$modelPlots$TestROC

Variable importance or coefficients

# Variable importance / coefficient plots per model. For each model the first
# element of its stored `VarImp` entry is emitted via top-level auto-printing.
# `masterModel` is reassigned before each plot, so statement order matters.
# NOTE(review): a model absent from `trainedModels` presumably yields NULL
# here - confirm whether these are meant to be gated by the `vi_*` flags.

# Gradient boosting (xgboost)
masterModel <- modelobject$trainedModels[["xgboost"]]
masterModel$modelPlots$VarImp[[1]]
# Random forest (randomForest)
masterModel <- modelobject$trainedModels[["randomForest"]]
masterModel$modelPlots$VarImp[[1]]
# Random forest (ranger)
masterModel <- modelobject$trainedModels[["ranger"]]
masterModel$modelPlots$VarImp[[1]]
# Decision tree (rpart)
masterModel <- modelobject$trainedModels[["rpart"]]
masterModel$modelPlots$VarImp[[1]]
# Regularised regression (glmnet)
masterModel <- modelobject$trainedModels[["glmnet"]]
masterModel$modelPlots$VarImp[[1]]
# Logistic regression
masterModel <- modelobject$trainedModels[["logreg"]]
masterModel$modelPlots$VarImp[[1]]

Best Model Explainability

Used lift charts and PDP plots

Lift charts and table

Lift chart
# Emit the stored lift plot (relies on top-level auto-printing).
modelobject$modelexp$Lift_plot
Lift table

Lift captured by each model in the top 2% and the top decile (10%)

# Lift table: one row per model, with the lift taken from group 1 (reported
# as top_2) and group 5 (reported as top_10) of the stored lift data.
cc <- modelobject$modelexp$Lift_data

cc1 <- cc[cc$groups == 1, ]
cc2 <- cc[cc$groups == 5, ]
rownames(cc1) <- NULL
rownames(cc2) <- NULL

# Columns are paired by position, so both subsets must list the models in
# the same order.
ccd <- data.frame(
  model = cc1$model,
  top_2 = cc1$lift,
  top_10 = cc2$lift
)
knitr::kable(ccd)

Partial Dependence Plots (PDP)

Note: Plot available for top important variables

# Return every stored PDP plot, looked up by name, as an unnamed list; the
# list is auto-printed at top level.
lapply(
  names(modelobject$modelexp$pdp$plots),
  function(plot_name) modelobject$modelexp$pdp$plots[[plot_name]]
)

Sample view of predicted score - validation set

# Show the first rows (at most 10) of the predicted scores for the
# validation set. head() is used instead of `cc[1:10, ]` so that a set with
# fewer than 10 rows is not padded with all-NA rows.
cc <- modelobject$predicted_score$test
cc <- data.frame(head(cc, 10))
kable(cc)


Try the DriveML package in your browser

Any scripts or data that you put into this service are public.

DriveML documentation built on Dec. 2, 2022, 5:14 p.m.