library(rmarkdown)
library(DriveML)
library(knitr)
library(scales)
library(ggplot2)

modelobject <- params$mlobject
mldata <- params$mldata

ML Model Data Preparation

## MAR section

Summary of input dataframe

Overview of the data

typ1 <- mldata$datasummary$type1
kable(typ1)

Variable summary

typ2 <- mldata$datasummary$type2
paged_table(typ2)

Missing not completely at random variable check (MAR)

More than one missing value columns are considered for MAR computation

print(mldata$var_list$MAR_col)

Missing imputation

if(names(mldata$call) %in% "missimpute") {

if(mldata$call$missimpute=="default") {
  mdd <- data.frame(Variable_type = c("Factor", "Integer", "Numeric", "Character"),
  Impute_method = c("Mode value", "Mean value", "Median value", "Constant value"))
  kable(mdd)
    } else {
    cat("Missing replaced with", "{", print(mldata$call$missimpute), "\n") }
  } else 
  {
    mdd <- data.frame(Variable_type = c("Factor", "Integer", "Numeric", "Character"),
  Impute_method = c("Mode value", "Mean value", "Median value", "Constant value"))
  kable(mdd)
  }

Feature engineering

ccd = data.frame(Descriptions = c("Checked and removed useless variables:",
"No. of variables before fetature reductions:"), Value = c(length(mldata$var_list$Dropped_col), length(mldata$overall_variable)))
kable(ccd)

Variable computaion

Outlier variable
print(mldata$var_list$outlier_summary)
Interaction variables using multiplaction method

Calculated ineraction between two continuous variables using multiplication method

List of variables calculated are below

cc  <- mldata$var_list$overall_variable
print(cc[grep("Inter_", cc)])
Frequent transformer variables

The frequent transformer counts each categorical variable in the data set

List of variables calculated are below

cc  <- mldata$var_list$overall_variable
print(cc[grep("Freq_t", cc)])
Date transformer variables

Date transformer retrieves any date values like month, year, quarter and week

List of variables calculated are below

cc  <- mldata$var_list$overall_variable
print(cc[grep("datet_", cc)])
Dummy variables using one hot encoding

One hot coding for all categorical variables

cc  <- mldata$var_list$overall_variable
print(cc)

Variable reduction

Zero variance
cc  <- mldata$var_list$zerovariance
print(cc)
Correlation

Highly correlated variables

cc  <- mldata$var_list$cor_var
print(cc)

Correaltion table

cc  <- data.frame(mldata$raw_list$Num_Correlation)
if(nrow(cc)>1){
  paged_table(cc)
} else {
  print("correlation value not found")
}
Area Under the curve

Top predictors based on AUC value against dependent variables

cc  <- mldata$var_list$auc_var
print(cc)

AUC summary table

cc  <- mldata$raw_list$AUC_value

if(nrow(cc) > 0){
  rownames(cc) <- NULL
  cc <- cc[, c(2:3)]
  paged_table(cc)
} else {
  print("No AUC features generated")
}

Final table view - head

cc  <- mldata$master_data
if(nrow(cc)>0){
paged_table(head(cc, 5))  
} else 
{
  print("data is not prepared")
  }

Machine Learning Classification Model

Automated Machine Learning (DriveML) mainly refers to the automated methods for model selection and hyper-parameter optimization of various algorithms such as random forests, gradient boosting etc..

Summary of trained data and model function

Dimensions of the dataset and other informations

t1 <- modelobject$datasummary$train;
t2 <- modelobject$datasummary$test;
t3 <- modelobject$datasummary$score; 

t4 <- modelobject$call
mdata <- NULL
for(j in 2: length(t4)){
  fnam <- as.character(names(t4[j]))
  ivalue <- as.character(t4[[j]])
  if(length(ivalue) == 0) ivalue <- "NULL"
  md <- data.frame(parameter = fnam, input = ivalue)
  mdata <- rbind(mdata, md)
  }

 modename <- names(modelobject$trainedModels)
    manme <- data.frame(model = c("glmnet", "logreg", "randomForest", "ranger", "xgboost", "rpart"),descriptions = c("Regularised regression  from glmnet R package",
                                        "logistic regression from stats R package",
                                        "Random forests using the randomForest R package",
                                        "Random forests using the ranger R package",
                                        "Gradient boosting using xgboost R package",
                                        "decision tree classification using rpart R package"))
 drmodel <- subset(manme, model = modename)

## section2
result <- modelobject$results
rownames(result) <- NULL

## Section ROC plot
exe_modl <- names(modelobject$trainedModels)
pl_glmnet <- pl_logreg <- pl_randomForest <- pl_ranger <- pl_xgboost <- pl_rpart <- FALSE

for(j in exe_modl){
  assign(paste0("pl_",j), TRUE)
}

## variable importance
vi_randomForest <- vi_ranger <- vi_xgboost <- vi_rpart <- vi_logreg <- vi_glmnet <- FALSE
for(j in exe_modl){
  if(j == "randomForest") assign(paste0("vi_",j), TRUE)
  if(j == "ranger") assign(paste0("vi_",j), TRUE)
  if(j == "xgboost") assign(paste0("vi_",j), TRUE)
  if(j == "glmnet") assign(paste0("vi_",j), TRUE)
  if(j == "logreg") assign(paste0("vi_",j), TRUE)
  if(j == "rpart") assign(paste0("vi_",j), TRUE)
}

Training data set

t1 <- t1[t1$Value!=0,]; 
rownames(t1) <- NULL
kable(t1)

Validation data set

t2 <- t2[t2$Value!=0,]; 
rownames(t2) <- NULL
kable(t2)

Scoring data set

if(!is.null(t3)) {
  t3 <- t3[t3$Value!=0,]; rownames(t3) <- NULL
  kable(t3)
} else {
    cat("No score data set")
  }

DriveML Model selected parameters

  kable(mdata)

List of Machine learning classification algorithm used

  kable(drmodel)

Model Performance comparision

Summary table

Table has Model fitting time and performance metric like AUC, Accuaracy, Precision, Recall and F1 score

  kable(result)

ROC curve

masterModel <- modelobject$trainedModels[["glmnet"]]
masterModel$modelPlots$TrainROC
masterModel$modelPlots$TestROC
masterModel <- modelobject$trainedModels[["logreg"]]
masterModel$modelPlots$TrainROC
masterModel$modelPlots$TestROC
masterModel <- modelobject$trainedModels[["randomForest"]]
masterModel$modelPlots$TrainROC
masterModel$modelPlots$TestROC
masterModel <- modelobject$trainedModels[["ranger"]]
masterModel$modelPlots$TrainROC
masterModel$modelPlots$TestROC
masterModel <- modelobject$trainedModels[["xgboost"]]
masterModel$modelPlots$TrainROC
masterModel$modelPlots$TestROC
masterModel <- modelobject$trainedModels[["rpart"]]
masterModel$modelPlots$TrainROC
masterModel$modelPlots$TestROC

Variable importance

masterModel <- modelobject$trainedModels[["xgboost"]]
masterModel$modelPlots$VarImp[[1]]
masterModel <- modelobject$trainedModels[["randomForest"]]
masterModel$modelPlots$VarImp[[1]]
masterModel <- modelobject$trainedModels[["ranger"]]
masterModel$modelPlots$VarImp[[1]]
masterModel <- modelobject$trainedModels[["rpart"]]
masterModel$modelPlots$VarImp[[1]]
masterModel <- modelobject$trainedModels[["glmnet"]]
masterModel$modelPlots$VarImp[[1]]
masterModel <- modelobject$trainedModels[["logreg"]]
masterModel$modelPlots$VarImp[[1]]

Best Model Explainability

Used lift charts and PDP plots

Lift charts and table

Lift chart
modelobject$modelexp$Lift_plot
Lift table

Top decile (2%) lift catpured by model level

cc <- modelobject$modelexp$Lift_data
cc1 <- cc[cc$groups==1, ]; rownames(cc1) <- NULL
cc2 <- cc[cc$groups==5, ]; rownames(cc2) <- NULL
ccd <-  data.frame(model = cc1$model, top_2 = cc1$lift, top_10 = cc2$lift)
kable(ccd)

Partial Dependency Plots (PDP)

Note: Plot available for top important variables

lapply(names(modelobject$modelexp$pdp$plots), function(x) {cc = modelobject$modelexp$pdp$plots[[x]]; cc})

Sample view of predicted score - validation set

cc <- modelobject$predicted_score$test
cc <-  data.frame(cc[1:10, ])
kable(cc)


daya6489/DriveML documentation built on July 22, 2021, 4:21 a.m.