knitr::opts_chunk$set(echo = TRUE)
options(warn = -1)
library(dlookr)
library(magrittr)
library(dplyr)
library(ggplot2)
library(grid)
library(gridExtra)
library(knitr)
library(kableExtra)

if (!requireNamespace("mice", quietly = TRUE)) {
  stop("Package 'mice' needed for this function to work. Please install it.", 
       call. = FALSE)
}

if (!requireNamespace("rpart", quietly = TRUE)) {
  stop("Package 'rpart' needed for this function to work. Please install it.", 
       call. = FALSE)
}
#library(smbinning)
#library(xtable)

edaData <- as.data.frame(get("edaData", .dlookrEnv))
targetVariable <- get("targetVariable", .dlookrEnv)

if (length(targetVariable) == 0) targetVariable <- NULL

names(edaData) <- gsub("[[:punct:][:space:]]", "_", names(edaData))

idx.numeric <- find_class(edaData, type = "numerical")
varname <- names(edaData)[idx.numeric]

varname <- setdiff(varname, targetVariable)

setUnder <- function(x) {
  gsub("_", ".", x)
}

Imputation

Missing Values

Missing values imputation information

The variables for imputate missing values are as follows.

nalist <- find_na(edaData, index = FALSE)

if (length(nalist) > 0) {
  naclass <- get_class(edaData) %>% 
    filter(variable %in% nalist) %>% 
    select(class) %>% 
    pull() %>% 
    as.character()

  cat(paste(nalist, collapse = ", "))
} else {
  cat("There are no variables including missing values.")
}
numerics <- c("mean", "median", "mode", "knn", "rpart", "mice")
categories <- c("mode", "rpart", "mice")

imputae_summary <- function(object) {
  type <- attr(object, "type")
  method <- attr(object, "method")
  var_type <- attr(object, "var_type")

  original <- object

  if (type == "missing values") {
    na_pos <- attr(object, "na_pos")
    seed <- attr(object, "seed")

    original[na_pos] <- NA
  } else if (type == "outliers") {
    outlier_pos <- attr(object, "outlier_pos")
    outliers <- attr(object, "outliers")

    original[outlier_pos] <- outliers
  }

  if (var_type == "numerical") {
    original <- as.numeric(original)
    object <- as.numeric(object)
  } else if (var_type == "categorical") {
    original <- factor(original)
    object <- factor(object)
  }

  dframe <- data.frame(original = original,
    imputation = object) %>%
    tidyr::gather()

  if (var_type == "numerical") {
    smmry <- dframe %>%
      group_by(key) %>%
      describe("value") %>%
      select(-variable, -key) %>%
      t

    smmry <- smmry[, 2:1]
    colnames(smmry) <- c("Original", "Imputation")
  } else if (var_type == "categorical") {
    tab_freq <- xtabs(~ value + key, dframe, addNA = TRUE)
    tab_relat <- round(prop.table(tab_freq, 2) * 100, 2)

    smmry <- cbind(tab_freq, tab_relat)
    smmry <- smmry[, c(2, 1, 4, 3)]
    colnames(smmry) <- c("original", "imputation",
      "original_percent", "imputation_percent")
  }

  if (method %in% c("knn", "rpart", "mice")) {
    if (method == "knn") {
      met <- "K-Nearest Neighbors"
    } else if (method == "rpart") {
      met <- "Recursive Partitioning and Regression Trees"
    } else if (method == "mice") {
      met <- "Multivariate Imputation by Chained Equations"
      met <- sprintf("%s\n - random seed : %s", met, seed)
    }
    cat(sprintf("* Imputate %s based on %s\n\n", type, met))
  }

  cat("* Information of Imputation (before vs after)\n")

  invisible(smmry)
}
if (length(nalist) > 0) {
  for (i in 1:length(nalist)) {
    cat("\n")
    cat("###", setUnder(nalist[i]), "\n")

    if (naclass[i] %in% c("integer", "numeric")) {
      method <- numerics
    } else if (naclass[i] %in% c("factor", "ordered")) {
      method <- categories
    }

    if (is.null(targetVariable)) {
      method <- setdiff(method, c("knn", "rpart", "mice"))
    }

    for (j in 1:length(method)) {
      cat("\n")
      cat("#### Imputate missing values with '", method[j], "'\n")

      impu <- imputate_na(edaData, nalist[i], targetVariable, method = method[j], print_flag = FALSE)

      if (!attr(impu, "success")) {
        cat(attr(impu, "message"), "<br><br><br>\n")
        next
      }

      tab <- imputae_summary(impu)

      kable(tab, digits = 2, format = "html",
            format.args = list(big.mark = ",")) %>% 
        kable_styling(full_width = FALSE, font_size = 13, position = "left") %>% 
        row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% 
        print()

      cat("\n")
      cat("<br><br><br>\n")

      grid.draw(plot(impu))
      cat("\n")
    } 
  }
}

Outliers

Outliers imputation information

The numerical variables for imputate outliers are as follows.

outlist <- find_outliers(edaData, index = FALSE)

if (length(outlist) > 0) {
  cat(paste(outlist, collapse = ", "))
} else {
  cat("There are no variables including outliers.")
}
if (length(outlist) > 0) {
  for (i in 1:length(outlist)) {
    cat("\n")
    cat("###", setUnder(outlist[i]), "\n")

    method <- c("mean", "median", "mode", "capping")

    for (j in 1:length(method)) {
      cat("\n")
      cat("#### Imputate outliers with '", method[j], "'\n")

      impu <- imputate_outlier(edaData, outlist[i], method = method[j])
      tab <- imputae_summary(impu)

      kable(tab, digits = 2, format = "html",
            format.args = list(big.mark = ",")) %>% 
        kable_styling(full_width = FALSE, font_size = 13, position = "left") %>% 
        row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% 
        print()

      cat("\n")
      cat("<br><br><br>\n")

      grid.draw(plot(impu))
      cat("\n")
    } 
  }
}

Resolving Skewness

Skewed variables information

The numerical variables for resolving skewness are as follows.

summary_transform <- function(object, ...) {
  method <- attr(object, "method")
  origin <- attr(object, "origin")

  suppressWarnings({dframe <- data.frame(original = origin,
    trans = object) %>%
    tidyr::gather()})

  smmry <- dframe %>%
    group_by(key) %>%
    describe("value") %>%
    select(-variable, -key) %>%
    t
  colnames(smmry) <- c("Original", "Transformation")
  invisible(smmry)
}
skewlist <- find_skewness(edaData, index = FALSE)
skew <- find_skewness(edaData, value = TRUE)

if (length(skewlist) > 0) {
  cat(paste(skewlist, collapse = ", "))
  skew <- skew[skewlist]
} else {
  cat("There are no variables including skewed.")
}
if (length(skewlist) > 0) {
  for (i in 1:length(skewlist)) {
    cat("\n")
    cat("###", setUnder(skewlist[i]), "\n")

    if (skew[i] <= 0) {
      method <- c("1/x", "x^2", "x^3")
    } else {
      method <- c("log", "log+1", "sqrt")
    }

    for (j in 1:length(method)) {
      cat("\n")
      cat("#### Resolving skewness with '", method[j], "'\n")

      trans <- transform(pull(edaData, skewlist[i]), method = method[j])
      tab <- summary_transform(trans)

      kable(tab, digits = 2, format = "html",
            format.args = list(big.mark = ",")) %>% 
        kable_styling(full_width = FALSE, font_size = 13, position = "left") %>% 
        row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% 
        print()

      cat("\n")
      cat("<br><br><br>\n")

      grid.draw(plot(trans))
      cat("\n")
    } 
  }
}

Binning

Numerical Variables for Binning

The numerical variables for binning are as follows.

idx.numeric <- find_class(edaData, type = "numerical")
var_numeric <- names(edaData)[idx.numeric]

var_numeric <- setdiff(var_numeric, targetVariable)
var_numeric

Binning

if (length(var_numeric) > 0) {
  for (i in 1:length(var_numeric)) {
    cat("\n")
    cat("###", setUnder(var_numeric[i]), "\n")

    method <- c("quantile", "equal", "pretty", "kmeans", "bclust")

    for (j in 1:length(method)) {
      cat("\n")
      cat("#### Binning with '", method[j], "'\n")

      error_flag <- FALSE
      tryCatch(binn <- binning(pull(edaData, var_numeric[i]), type = method[j]),
        error = function(e) {
          msg <<- e$message
          error_flag <<- TRUE
      }, finally = NULL)

      if (error_flag) {
        cat(msg, "\n\n")
        rm(msg, error_flag)
        next
      }

      tab <- summary(binn)

      kable(tab, digits = 4, format = "html",
            format.args = list(big.mark = ",")) %>% 
        kable_styling(full_width = FALSE, font_size = 13, position = "left") %>% 
        row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% 
        print()

      cat("\n")
      cat("<br><br><br>\n")

      plot(binn)
      cat("\n")
    } 
  }
}

Optimal Binning

For the scoring modeling, optimal binning is performed considering the characteristics of numerical data. This work uses the recursive partitioning technique known as supervised discretization when categorizing numeric data as categorical data.

if (!is.null(targetVariable)) {
  n_levles <- length(table(pull(edaData, targetVariable)))

  if (n_levles == 2) {
    # Optimal Binning for Scoring Modeling
    results <- lapply(var_numeric, function(x)
      binning_by(.data = edaData, y = targetVariable, x = x, p = 0.05))

    # could also use tapply here
    for (i in 1:length(var_numeric)) {
      cat("\n")
      cat("###", setUnder(var_numeric[i]), "\n")

      result <- results[[i]]

      if (is.character(result)) {
        cat(result)
        cat("\n")
        next
      }

      cat("\n")

      tab <- attr(result, "performance") %>% 
        select(-CntCumPos, -CntCumNeg, -RateCumPos, -RateCumNeg)

      kable(tab, digits = 2, format = "html",
        format.args = list(big.mark = ",")) %>% 
        kable_styling(full_width = FALSE, font_size = 10, position = "left") %>% 
        row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% 
        print()

      cat("\n")
      cat(sprintf("Information value = %s", attr(result, "iv")))
      cat("<br><br><br>\n")

      plot(result)
      cat("\n")
    } 
  } else {
    cat("The target variable is not a binary class.\n\n")
  }
} else {
  cat("There is no target variable.\n\n")
}
options(warn = 0)


choonghyunryu/dlookr documentation built on June 11, 2024, 9:12 a.m.