Transformation Information Report
In dlookr: Tools for Data Diagnosis, Exploration, Transformation

knitr::opts_chunk$set(echo = TRUE)
options(warn = -1)

library(dlookr)
library(magrittr)
library(dplyr)
library(ggplot2)
library(grid)
library(gridExtra)
library(knitr)
library(kableExtra)

if (!requireNamespace("mice", quietly = TRUE)) {
  stop("Package 'mice' needed for this function to work. Please install it.", 
       call. = FALSE)
}

if (!requireNamespace("rpart", quietly = TRUE)) {
  stop("Package 'rpart' needed for this function to work. Please install it.", 
       call. = FALSE)
}

#library(smbinning)
#library(xtable)

edaData <- as.data.frame(get("edaData", .dlookrEnv))
targetVariable <- get("targetVariable", .dlookrEnv)

if (length(targetVariable) == 0) targetVariable <- NULL

names(edaData) <- gsub("[[:punct:][:space:]]", "_", names(edaData))

idx.numeric <- find_class(edaData, type = "numerical")
varname <- names(edaData)[idx.numeric]

varname <- setdiff(varname, targetVariable)

setUnder <- function(x) {
  gsub("_", ".", x)
}

Imputation

Missing Values

Missing values imputation information

The variables for imputate missing values are as follows.

nalist <- find_na(edaData, index = FALSE)

if (length(nalist) > 0) {
  naclass <- get_class(edaData) %>% 
    filter(variable %in% nalist) %>% 
    select(class) %>% 
    pull() %>% 
    as.character()

  cat(paste(nalist, collapse = ", "))
} else {
  cat("There are no variables including missing values.")
}

numerics <- c("mean", "median", "mode", "knn", "rpart", "mice")
categories <- c("mode", "rpart", "mice")

imputae_summary <- function(object) {
  type <- attr(object, "type")
  method <- attr(object, "method")
  var_type <- attr(object, "var_type")

  original <- object

  if (type == "missing values") {
    na_pos <- attr(object, "na_pos")
    seed <- attr(object, "seed")

    original[na_pos] <- NA
  } else if (type == "outliers") {
    outlier_pos <- attr(object, "outlier_pos")
    outliers <- attr(object, "outliers")

    original[outlier_pos] <- outliers
  }

  if (var_type == "numerical") {
    original <- as.numeric(original)
    object <- as.numeric(object)
  } else if (var_type == "categorical") {
    original <- factor(original)
    object <- factor(object)
  }

  dframe <- data.frame(original = original,
    imputation = object) %>%
    tidyr::gather()

  if (var_type == "numerical") {
    smmry <- dframe %>%
      group_by(key) %>%
      describe("value") %>%
      select(-variable, -key) %>%
      t

    smmry <- smmry[, 2:1]
    colnames(smmry) <- c("Original", "Imputation")
  } else if (var_type == "categorical") {
    tab_freq <- xtabs(~ value + key, dframe, addNA = TRUE)
    tab_relat <- round(prop.table(tab_freq, 2) * 100, 2)

    smmry <- cbind(tab_freq, tab_relat)
    smmry <- smmry[, c(2, 1, 4, 3)]
    colnames(smmry) <- c("original", "imputation",
      "original_percent", "imputation_percent")
  }

  if (method %in% c("knn", "rpart", "mice")) {
    if (method == "knn") {
      met <- "K-Nearest Neighbors"
    } else if (method == "rpart") {
      met <- "Recursive Partitioning and Regression Trees"
    } else if (method == "mice") {
      met <- "Multivariate Imputation by Chained Equations"
      met <- sprintf("%s\n - random seed : %s", met, seed)
    }
    cat(sprintf("* Imputate %s based on %s\n\n", type, met))
  }

  cat("* Information of Imputation (before vs after)\n")

  invisible(smmry)
}

if (length(nalist) > 0) {
  for (i in 1:length(nalist)) {
    cat("\n")
    cat("###", setUnder(nalist[i]), "\n")

    if (naclass[i] %in% c("integer", "numeric")) {
      method <- numerics
    } else if (naclass[i] %in% c("factor", "ordered")) {
      method <- categories
    }

    if (is.null(targetVariable)) {
      method <- setdiff(method, c("knn", "rpart", "mice"))
    }

    for (j in 1:length(method)) {
      cat("\n")
      cat("#### Imputate missing values with '", method[j], "'\n")

      impu <- imputate_na(edaData, nalist[i], targetVariable, method = method[j], print_flag = FALSE)

      if (!attr(impu, "success")) {
        cat(attr(impu, "message"), "<br><br><br>\n")
        next
      }

      tab <- imputae_summary(impu)

      kable(tab, digits = 2, format = "html",
            format.args = list(big.mark = ",")) %>% 
        kable_styling(full_width = FALSE, font_size = 13, position = "left") %>% 
        row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% 
        print()

      cat("\n")
      cat("<br><br><br>\n")

      grid.draw(plot(impu))
      cat("\n")
    } 
  }
}

Outliers

Outliers imputation information

The numerical variables for imputate outliers are as follows.

outlist <- find_outliers(edaData, index = FALSE)

if (length(outlist) > 0) {
  cat(paste(outlist, collapse = ", "))
} else {
  cat("There are no variables including outliers.")
}

if (length(outlist) > 0) {
  for (i in 1:length(outlist)) {
    cat("\n")
    cat("###", setUnder(outlist[i]), "\n")

    method <- c("mean", "median", "mode", "capping")

    for (j in 1:length(method)) {
      cat("\n")
      cat("#### Imputate outliers with '", method[j], "'\n")

      impu <- imputate_outlier(edaData, outlist[i], method = method[j])
      tab <- imputae_summary(impu)

      kable(tab, digits = 2, format = "html",
            format.args = list(big.mark = ",")) %>% 
        kable_styling(full_width = FALSE, font_size = 13, position = "left") %>% 
        row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% 
        print()

      cat("\n")
      cat("<br><br><br>\n")

      grid.draw(plot(impu))
      cat("\n")
    } 
  }
}

Resolving Skewness

Skewed variables information

The numerical variables for resolving skewness are as follows.

summary_transform <- function(object, ...) {
  method <- attr(object, "method")
  origin <- attr(object, "origin")

  suppressWarnings({dframe <- data.frame(original = origin,
    trans = object) %>%
    tidyr::gather()})

  smmry <- dframe %>%
    group_by(key) %>%
    describe("value") %>%
    select(-variable, -key) %>%
    t
  colnames(smmry) <- c("Original", "Transformation")
  invisible(smmry)
}

skewlist <- find_skewness(edaData, index = FALSE)
skew <- find_skewness(edaData, value = TRUE)

if (length(skewlist) > 0) {
  cat(paste(skewlist, collapse = ", "))
  skew <- skew[skewlist]
} else {
  cat("There are no variables including skewed.")
}

if (length(skewlist) > 0) {
  for (i in 1:length(skewlist)) {
    cat("\n")
    cat("###", setUnder(skewlist[i]), "\n")

    if (skew[i] <= 0) {
      method <- c("1/x", "x^2", "x^3")
    } else {
      method <- c("log", "log+1", "sqrt")
    }

    for (j in 1:length(method)) {
      cat("\n")
      cat("#### Resolving skewness with '", method[j], "'\n")

      trans <- transform(pull(edaData, skewlist[i]), method = method[j])
      tab <- summary_transform(trans)

      kable(tab, digits = 2, format = "html",
            format.args = list(big.mark = ",")) %>% 
        kable_styling(full_width = FALSE, font_size = 13, position = "left") %>% 
        row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% 
        print()

      cat("\n")
      cat("<br><br><br>\n")

      grid.draw(plot(trans))
      cat("\n")
    } 
  }
}

Binning

Numerical Variables for Binning

The numerical variables for binning are as follows.

idx.numeric <- find_class(edaData, type = "numerical")
var_numeric <- names(edaData)[idx.numeric]

var_numeric <- setdiff(var_numeric, targetVariable)

var_numeric

Binning

if (length(var_numeric) > 0) {
  for (i in 1:length(var_numeric)) {
    cat("\n")
    cat("###", setUnder(var_numeric[i]), "\n")

    method <- c("quantile", "equal", "pretty", "kmeans", "bclust")

    for (j in 1:length(method)) {
      cat("\n")
      cat("#### Binning with '", method[j], "'\n")

      error_flag <- FALSE
      tryCatch(binn <- binning(pull(edaData, var_numeric[i]), type = method[j]),
        error = function(e) {
          msg <<- e$message
          error_flag <<- TRUE
      }, finally = NULL)

      if (error_flag) {
        cat(msg, "\n\n")
        rm(msg, error_flag)
        next
      }

      tab <- summary(binn)

      kable(tab, digits = 4, format = "html",
            format.args = list(big.mark = ",")) %>% 
        kable_styling(full_width = FALSE, font_size = 13, position = "left") %>% 
        row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% 
        print()

      cat("\n")
      cat("<br><br><br>\n")

      plot(binn)
      cat("\n")
    } 
  }
}

Optimal Binning

For the scoring modeling, optimal binning is performed considering the characteristics of numerical data. This work uses the recursive partitioning technique known as supervised discretization when categorizing numeric data as categorical data.

if (!is.null(targetVariable)) {
  n_levles <- length(table(pull(edaData, targetVariable)))

  if (n_levles == 2) {
    # Optimal Binning for Scoring Modeling
    results <- lapply(var_numeric, function(x)
      binning_by(.data = edaData, y = targetVariable, x = x, p = 0.05))

    # could also use tapply here
    for (i in 1:length(var_numeric)) {
      cat("\n")
      cat("###", setUnder(var_numeric[i]), "\n")

      result <- results[[i]]

      if (is.character(result)) {
        cat(result)
        cat("\n")
        next
      }

      cat("\n")

      tab <- attr(result, "performance") %>% 
        select(-CntCumPos, -CntCumNeg, -RateCumPos, -RateCumNeg)

      kable(tab, digits = 2, format = "html",
        format.args = list(big.mark = ",")) %>% 
        kable_styling(full_width = FALSE, font_size = 10, position = "left") %>% 
        row_spec(NROW(tab), bold = TRUE, color = "white", background = "#FFA349") %>% 
        print()

      cat("\n")
      cat(sprintf("Information value = %s", attr(result, "iv")))
      cat("<br><br><br>\n")

      plot(result)
      cat("\n")
    } 
  } else {
    cat("The target variable is not a binary class.\n\n")
  }
} else {
  cat("There is no target variable.\n\n")
}

options(warn = 0)

Any scripts or data that you put into this service are public.

dlookr documentation built on Aug. 24, 2025, 9:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

dlookr
Tools for Data Diagnosis, Exploration, Transformation

Transformation Information Report
In dlookr: Tools for Data Diagnosis, Exploration, Transformation

Imputation

Missing Values

Missing values imputation information

Outliers

Outliers imputation information

Resolving Skewness

Skewed variables information

Binning

Numerical Variables for Binning

Binning

Optimal Binning

Try the dlookr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dlookr Tools for Data Diagnosis, Exploration, Transformation

Transformation Information Report In dlookr: Tools for Data Diagnosis, Exploration, Transformation

Imputation

Missing Values

Missing values imputation information

Outliers

Outliers imputation information

Resolving Skewness

Skewed variables information

Binning

Numerical Variables for Binning

Binning

Optimal Binning

Try the dlookr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dlookr
Tools for Data Diagnosis, Exploration, Transformation

Transformation Information Report
In dlookr: Tools for Data Diagnosis, Exploration, Transformation