knitr::opts_chunk$set(echo = TRUE) options(warn = -1)
library(dlookr) library(magrittr) library(dplyr) library(ggplot2) library(grid) library(gridExtra) library(knitr) library(kableExtra) if (!requireNamespace("mice", quietly = TRUE)) { stop("Package 'mice' needed for this function to work. Please install it.", call. = FALSE) } if (!requireNamespace("rpart", quietly = TRUE)) { stop("Package 'rpart' needed for this function to work. Please install it.", call. = FALSE) }
#library(smbinning) #library(xtable) edaData <- as.data.frame(get("edaData", .dlookrEnv)) targetVariable <- get("targetVariable", .dlookrEnv) if (length(targetVariable) == 0) targetVariable <- NULL names(edaData) <- gsub("[[:punct:][:space:]]", "_", names(edaData)) idx.numeric <- find_class(edaData, type = "numerical") varname <- names(edaData)[idx.numeric] varname <- setdiff(varname, targetVariable) setUnder <- function(x) { gsub("_", ".", x) }
The variables for imputate missing values are as follows.
nalist <- find_na(edaData, index = FALSE) if (length(nalist) > 0) { naclass <- get_class(edaData) %>% filter(variable %in% nalist) %>% select(class) %>% pull() %>% as.character() cat(paste(nalist, collapse = ", ")) } else { cat("There are no variables including missing values.") }
numerics <- c("mean", "median", "mode", "knn", "rpart", "mice") categories <- c("mode", "rpart", "mice") imputae_summary <- function(object) { type <- attr(object, "type") method <- attr(object, "method") var_type <- attr(object, "var_type") original <- object if (type == "missing values") { na_pos <- attr(object, "na_pos") seed <- attr(object, "seed") original[na_pos] <- NA } else if (type == "outliers") { outlier_pos <- attr(object, "outlier_pos") outliers <- attr(object, "outliers") original[outlier_pos] <- outliers } if (var_type == "numerical") { original <- as.numeric(original) object <- as.numeric(object) } else if (var_type == "categorical") { original <- factor(original) object <- factor(object) } dframe <- data.frame(original = original, imputation = object) %>% tidyr::gather() if (var_type == "numerical") { smmry <- dframe %>% group_by(key) %>% describe("value") %>% select(-variable, -key) %>% t smmry <- smmry[, 2:1] colnames(smmry) <- c("Original", "Imputation") } else if (var_type == "categorical") { tab_freq <- xtabs(~ value + key, dframe, addNA = TRUE) tab_relat <- round(prop.table(tab_freq, 2) * 100, 2) smmry <- cbind(tab_freq, tab_relat) smmry <- smmry[, c(2, 1, 4, 3)] colnames(smmry) <- c("original", "imputation", "original_percent", "imputation_percent") } if (method %in% c("knn", "rpart", "mice")) { if (method == "knn") { met <- "K-Nearest Neighbors" } else if (method == "rpart") { met <- "Recursive Partitioning and Regression Trees" } else if (method == "mice") { met <- "Multivariate Imputation by Chained Equations" met <- sprintf("%s\n - random seed : %s", met, seed) } cat(sprintf("* Imputate %s based on %s\n\n", type, met)) } cat("* Information of Imputation (before vs after)\n") invisible(smmry) }
if (length(nalist) > 0) { for (i in 1:length(nalist)) { cat("\n") cat("###", setUnder(nalist[i]), "\n") if (naclass[i] %in% c("integer", "numeric")) { method <- numerics } else if (naclass[i] %in% c("factor", "ordered")) { method <- categories } if (is.null(targetVariable)) { method <- setdiff(method, c("knn", "rpart", "mice")) } for (j in 1:length(method)) { cat("\n") cat("#### Imputate missing values with '", method[j], "'\n") impu <- imputate_na(edaData, nalist[i], targetVariable, method = method[j], print_flag = FALSE) if (!attr(impu, "success")) { cat(attr(impu, "message"), "<br><br><br>\n") next } tab <- imputae_summary(impu) kable(tab, digits = 2, format = "html", format.args = list(big.mark = ",")) %>% kable_styling(full_width = FALSE, font_size = 13, position = "left") %>% row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% print() cat("\n") cat("<br><br><br>\n") grid.draw(plot(impu)) cat("\n") } } }
The numerical variables for imputate outliers are as follows.
outlist <- find_outliers(edaData, index = FALSE) if (length(outlist) > 0) { cat(paste(outlist, collapse = ", ")) } else { cat("There are no variables including outliers.") }
if (length(outlist) > 0) { for (i in 1:length(outlist)) { cat("\n") cat("###", setUnder(outlist[i]), "\n") method <- c("mean", "median", "mode", "capping") for (j in 1:length(method)) { cat("\n") cat("#### Imputate outliers with '", method[j], "'\n") impu <- imputate_outlier(edaData, outlist[i], method = method[j]) tab <- imputae_summary(impu) kable(tab, digits = 2, format = "html", format.args = list(big.mark = ",")) %>% kable_styling(full_width = FALSE, font_size = 13, position = "left") %>% row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% print() cat("\n") cat("<br><br><br>\n") grid.draw(plot(impu)) cat("\n") } } }
The numerical variables for resolving skewness are as follows.
summary_transform <- function(object, ...) { method <- attr(object, "method") origin <- attr(object, "origin") suppressWarnings({dframe <- data.frame(original = origin, trans = object) %>% tidyr::gather()}) smmry <- dframe %>% group_by(key) %>% describe("value") %>% select(-variable, -key) %>% t colnames(smmry) <- c("Original", "Transformation") invisible(smmry) }
skewlist <- find_skewness(edaData, index = FALSE) skew <- find_skewness(edaData, value = TRUE) if (length(skewlist) > 0) { cat(paste(skewlist, collapse = ", ")) skew <- skew[skewlist] } else { cat("There are no variables including skewed.") }
if (length(skewlist) > 0) { for (i in 1:length(skewlist)) { cat("\n") cat("###", setUnder(skewlist[i]), "\n") if (skew[i] <= 0) { method <- c("1/x", "x^2", "x^3") } else { method <- c("log", "log+1", "sqrt") } for (j in 1:length(method)) { cat("\n") cat("#### Resolving skewness with '", method[j], "'\n") trans <- transform(pull(edaData, skewlist[i]), method = method[j]) tab <- summary_transform(trans) kable(tab, digits = 2, format = "html", format.args = list(big.mark = ",")) %>% kable_styling(full_width = FALSE, font_size = 13, position = "left") %>% row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% print() cat("\n") cat("<br><br><br>\n") grid.draw(plot(trans)) cat("\n") } } }
The numerical variables for binning are as follows.
idx.numeric <- find_class(edaData, type = "numerical") var_numeric <- names(edaData)[idx.numeric] var_numeric <- setdiff(var_numeric, targetVariable)
var_numeric
if (length(var_numeric) > 0) { for (i in 1:length(var_numeric)) { cat("\n") cat("###", setUnder(var_numeric[i]), "\n") method <- c("quantile", "equal", "pretty", "kmeans", "bclust") for (j in 1:length(method)) { cat("\n") cat("#### Binning with '", method[j], "'\n") error_flag <- FALSE tryCatch(binn <- binning(pull(edaData, var_numeric[i]), type = method[j]), error = function(e) { msg <<- e$message error_flag <<- TRUE }, finally = NULL) if (error_flag) { cat(msg, "\n\n") rm(msg, error_flag) next } tab <- summary(binn) kable(tab, digits = 4, format = "html", format.args = list(big.mark = ",")) %>% kable_styling(full_width = FALSE, font_size = 13, position = "left") %>% row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% print() cat("\n") cat("<br><br><br>\n") plot(binn) cat("\n") } } }
For the scoring modeling, optimal binning is performed considering the characteristics of numerical data. This work uses the recursive partitioning technique known as supervised discretization when categorizing numeric data as categorical data.
if (!is.null(targetVariable)) { n_levles <- length(table(pull(edaData, targetVariable))) if (n_levles == 2) { # Optimal Binning for Scoring Modeling results <- lapply(var_numeric, function(x) binning_by(.data = edaData, y = targetVariable, x = x, p = 0.05)) # could also use tapply here for (i in 1:length(var_numeric)) { cat("\n") cat("###", setUnder(var_numeric[i]), "\n") result <- results[[i]] if (is.character(result)) { cat(result) cat("\n") next } cat("\n") tab <- attr(result, "performance") %>% select(-CntCumPos, -CntCumNeg, -RateCumPos, -RateCumNeg) kable(tab, digits = 2, format = "html", format.args = list(big.mark = ",")) %>% kable_styling(full_width = FALSE, font_size = 10, position = "left") %>% row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% print() cat("\n") cat(sprintf("Information value = %s", attr(result, "iv"))) cat("<br><br><br>\n") plot(result) cat("\n") } } else { cat("The target variable is not a binary class.\n\n") } } else { cat("There is no target variable.\n\n") }
options(warn = 0)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.