library(rmarkdown) library(DriveML) library(knitr) library(scales) library(ggplot2) modelobject <- params$mlobject mldata <- params$mldata
## MAR section
Overview of the data
typ1 <- mldata$datasummary$type1 kable(typ1)
Variable summary
typ2 <- mldata$datasummary$type2 paged_table(typ2)
More than one missing value columns are considered for MAR computation
print(mldata$var_list$MAR_col)
if(names(mldata$call) %in% "missimpute") { if(mldata$call$missimpute=="default") { mdd <- data.frame(Variable_type = c("Factor", "Integer", "Numeric", "Character"), Impute_method = c("Mode value", "Mean value", "Median value", "Constant value")) kable(mdd) } else { cat("Missing replaced with", "{", print(mldata$call$missimpute), "\n") } } else { mdd <- data.frame(Variable_type = c("Factor", "Integer", "Numeric", "Character"), Impute_method = c("Mode value", "Mean value", "Median value", "Constant value")) kable(mdd) }
ccd = data.frame(Descriptions = c("Checked and removed useless variables:", "No. of variables before fetature reductions:"), Value = c(length(mldata$var_list$Dropped_col), length(mldata$overall_variable))) kable(ccd)
print(mldata$var_list$outlier_summary)
Calculated ineraction between two continuous variables using multiplication method
List of variables calculated are below
cc <- mldata$var_list$overall_variable print(cc[grep("Inter_", cc)])
The frequent transformer counts each categorical variable in the data set
List of variables calculated are below
cc <- mldata$var_list$overall_variable print(cc[grep("Freq_t", cc)])
Date transformer retrieves any date values like month, year, quarter and week
List of variables calculated are below
cc <- mldata$var_list$overall_variable print(cc[grep("datet_", cc)])
One hot coding for all categorical variables
cc <- mldata$var_list$overall_variable print(cc)
cc <- mldata$var_list$zerovariance print(cc)
Highly correlated variables
cc <- mldata$var_list$cor_var print(cc)
Correaltion table
cc <- data.frame(mldata$raw_list$Num_Correlation) if(nrow(cc)>1){ paged_table(cc) } else { print("correlation value not found") }
Top predictors based on AUC value against dependent variables
cc <- mldata$var_list$auc_var print(cc)
AUC summary table
cc <- mldata$raw_list$AUC_value if(nrow(cc) > 0){ rownames(cc) <- NULL cc <- cc[, c(2:3)] paged_table(cc) } else { print("No AUC features generated") }
cc <- mldata$master_data if(nrow(cc)>0){ paged_table(head(cc, 5)) } else { print("data is not prepared") }
Automated Machine Learning (DriveML) mainly refers to the automated methods for model selection and hyper-parameter optimization of various algorithms such as random forests, gradient boosting etc..
Dimensions of the dataset and other informations
t1 <- modelobject$datasummary$train; t2 <- modelobject$datasummary$test; t3 <- modelobject$datasummary$score; t4 <- modelobject$call mdata <- NULL for(j in 2: length(t4)){ fnam <- as.character(names(t4[j])) ivalue <- as.character(t4[[j]]) if(length(ivalue) == 0) ivalue <- "NULL" md <- data.frame(parameter = fnam, input = ivalue) mdata <- rbind(mdata, md) } modename <- names(modelobject$trainedModels) manme <- data.frame(model = c("glmnet", "logreg", "randomForest", "ranger", "xgboost", "rpart"),descriptions = c("Regularised regression from glmnet R package", "logistic regression from stats R package", "Random forests using the randomForest R package", "Random forests using the ranger R package", "Gradient boosting using xgboost R package", "decision tree classification using rpart R package")) drmodel <- subset(manme, model = modename) ## section2 result <- modelobject$results rownames(result) <- NULL ## Section ROC plot exe_modl <- names(modelobject$trainedModels) pl_glmnet <- pl_logreg <- pl_randomForest <- pl_ranger <- pl_xgboost <- pl_rpart <- FALSE for(j in exe_modl){ assign(paste0("pl_",j), TRUE) } ## variable importance vi_randomForest <- vi_ranger <- vi_xgboost <- vi_rpart <- vi_logreg <- vi_glmnet <- FALSE for(j in exe_modl){ if(j == "randomForest") assign(paste0("vi_",j), TRUE) if(j == "ranger") assign(paste0("vi_",j), TRUE) if(j == "xgboost") assign(paste0("vi_",j), TRUE) if(j == "glmnet") assign(paste0("vi_",j), TRUE) if(j == "logreg") assign(paste0("vi_",j), TRUE) if(j == "rpart") assign(paste0("vi_",j), TRUE) }
Training data set
t1 <- t1[t1$Value!=0,]; rownames(t1) <- NULL kable(t1)
Validation data set
t2 <- t2[t2$Value!=0,]; rownames(t2) <- NULL kable(t2)
Scoring data set
if(!is.null(t3)) { t3 <- t3[t3$Value!=0,]; rownames(t3) <- NULL kable(t3) } else { cat("No score data set") }
kable(mdata)
kable(drmodel)
Table has Model fitting time and performance metric like AUC, Accuaracy, Precision, Recall and F1 score
kable(result)
masterModel <- modelobject$trainedModels[["glmnet"]] masterModel$modelPlots$TrainROC masterModel$modelPlots$TestROC
masterModel <- modelobject$trainedModels[["logreg"]] masterModel$modelPlots$TrainROC masterModel$modelPlots$TestROC
masterModel <- modelobject$trainedModels[["randomForest"]] masterModel$modelPlots$TrainROC masterModel$modelPlots$TestROC
masterModel <- modelobject$trainedModels[["ranger"]] masterModel$modelPlots$TrainROC masterModel$modelPlots$TestROC
masterModel <- modelobject$trainedModels[["xgboost"]] masterModel$modelPlots$TrainROC masterModel$modelPlots$TestROC
masterModel <- modelobject$trainedModels[["rpart"]] masterModel$modelPlots$TrainROC masterModel$modelPlots$TestROC
masterModel <- modelobject$trainedModels[["xgboost"]] masterModel$modelPlots$VarImp[[1]]
masterModel <- modelobject$trainedModels[["randomForest"]] masterModel$modelPlots$VarImp[[1]]
masterModel <- modelobject$trainedModels[["ranger"]] masterModel$modelPlots$VarImp[[1]]
masterModel <- modelobject$trainedModels[["rpart"]] masterModel$modelPlots$VarImp[[1]]
masterModel <- modelobject$trainedModels[["glmnet"]] masterModel$modelPlots$VarImp[[1]]
masterModel <- modelobject$trainedModels[["logreg"]] masterModel$modelPlots$VarImp[[1]]
Used lift charts and PDP plots
modelobject$modelexp$Lift_plot
Top decile (2%) lift catpured by model level
cc <- modelobject$modelexp$Lift_data cc1 <- cc[cc$groups==1, ]; rownames(cc1) <- NULL cc2 <- cc[cc$groups==5, ]; rownames(cc2) <- NULL ccd <- data.frame(model = cc1$model, top_2 = cc1$lift, top_10 = cc2$lift) kable(ccd)
Note: Plot available for top important variables
lapply(names(modelobject$modelexp$pdp$plots), function(x) {cc = modelobject$modelexp$pdp$plots[[x]]; cc})
cc <- modelobject$predicted_score$test cc <- data.frame(cc[1:10, ]) kable(cc)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.