summary_imputation <- function(object, ...) {
success <- attr(object, "success")
if (!success) {
message("imputation object isn't success.")
return()
}
type <- attr(object, "type")
method <- attr(object, "method")
var_type <- attr(object, "var_type")
original <- object
if (type == "missing values") {
na_pos <- attr(object, "na_pos")
seed <- attr(object, "seed")
original[na_pos] <- NA
} else if (type == "outliers") {
outlier_pos <- attr(object, "outlier_pos")
outliers <- attr(object, "outliers")
original[outlier_pos] <- outliers
}
if (var_type == "numerical") {
original <- as.numeric(original)
object <- as.numeric(object)
} else if (var_type == "categorical") {
original <- factor(original)
object <- factor(object)
}
dframe <- data.frame(original = original,
imputation = object) %>%
tidyr::gather()
if (var_type == "numerical") {
smmry <- dframe %>%
group_by(key) %>%
describe("value") %>%
select(which(!names(.) %in% c("described_variables", "key"))) %>%
t
smmry <- smmry[, 2:1]
colnames(smmry) <- c("Original", "Imputation")
} else if (var_type == "categorical") {
tab_freq <- xtabs(~ value + key, dframe, addNA = TRUE)
tab_relat <- round(prop.table(tab_freq, 2) * 100, 2)
smmry <- cbind(tab_freq, tab_relat)
smmry <- smmry[, c(2, 1, 4, 3)]
colnames(smmry) <- c("original", "imputation",
"original_percent", "imputation_percent")
}
invisible(smmry)
}
summary_transform <- function(object, ...) {
method <- attr(object, "method")
origin <- attr(object, "origin")
suppressWarnings({dframe <- data.frame(original = origin,
trans = object) %>%
tidyr::gather()})
smmry <- dframe %>%
group_by(key) %>%
describe("value") %>%
select(-described_variables, -key) %>%
t
colnames(smmry) <- c("Original", "Transformation")
invisible(smmry)
}
#' @importFrom shiny tabsetPanel tabPanel
#' @importFrom htmltools plotTag
#' @import dplyr
#' @import reactable
html_impute_missing <- function(.data, target = NULL, base_family = NULL) {
if (is.null(base_family)) {
base_family <- "Roboto Condensed"
}
numerics <- c("mean", "median", "mode", "knn", "rpart", "mice")
categories <- c("mode", "rpart", "mice")
n_na <- .data %>%
purrr::map_dbl(function(x) sum(is.na(x))) %>%
.[. > 0]
if (length(n_na) > 0) {
data_type <- get_class(.data)
tab_missing <- n_na %>%
tibble::as_tibble() %>%
mutate(variable = names(n_na)) %>%
rename("missing" = value) %>%
bind_cols(n = NROW(.data)) %>%
mutate(rate_missing = missing / n) %>%
inner_join(data_type, by = "variable") %>%
select(variable, class, n, missing, rate_missing)
cap <- "Information about impute missing values"
html_cat(cap)
tab_missing %>%
reactable(
defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);"),
columns = list(
variable = colDef(
name = "Variables"
),
class = colDef(
name = "Data Type"
),
n = colDef(
name = "Observations"
),
missing = colDef(
name = "Missing"
),
rate_missing = colDef(
name = "Missing(%)",
format = colFormat(
percent = TRUE,
digits = 2
)
)
),
details = function(index) {
variable <- tab_missing$variable[index]
type <- tab_missing$class[index] %>%
as.character()
if (type %in% c("integer", "numeric")) {
method <- numerics
} else if (type %in% c("factor", "ordered")) {
method <- categories
}
if (is.null(target)) {
method <- setdiff(method, c("knn", "rpart", "mice"))
} else {
method <- method
}
imputes <- method %>%
lapply(function(x) {
if (is.null(target)) {
imputate_na(.data, all_of(variable), all_of(target), method = x,
print_flag = FALSE)
} else {
imputate_na(.data, all_of(variable), method = x,
print_flag = FALSE)
}
})
p_compare <- imputes %>%
lapply(function(x) {
htmltools::plotTag({
plot(x, base_family = base_family)
}, sprintf("A plot of imputation"),
width = 600, height = 400, device = grDevices::png)
})
suppressMessages({
mat <- imputes %>%
purrr::map_dfc(function(x) {
summary_imputation(x) %>%
as.data.frame()
})
})
if (type %in% c("integer", "numeric")) {
drops <- seq(ncol(mat))[seq(ncol(mat)) %% 2 == 1][-1]
mat <- mat[, -c(drops)]
names(mat) <- c("original", paste("inpute", method, sep = "_"))
tab_compare <- mat %>%
t() %>%
as.data.frame() %>%
select(mean, sd, IQR, p00, p25, p50, p75, p100, skewness, kurtosis) %>%
reactable(
defaultColDef = colDef(
style = "font-size: 14px;color: hsl(0, 0%, 40%);",
format = colFormat(
digits = 2
)
),
columns = list(
mean = colDef(
name = "Mean"
),
sd = colDef(
name = "Standard Devation"
),
p00 = colDef(
name = "Min"
),
p25 = colDef(
name = "Q1"
),
p50 = colDef(
name = "Median"
),
p75 = colDef(
name = "Q3"
),
p100 = colDef(
name = "Max"
),
skewness = colDef(
name = "Skewness"
),
kurtosis = colDef(
name = "Kurtosis"
)
)
)
tabs <- lapply(seq(length(method) + 1), function(x) {
if (x == 1) {
shiny::tabPanel("Distribution", tab_compare,
hr(style = "border-top: 1px solid black;"),
style = "padding-top:5px; padding-bottom:25px;")
} else {
shiny::tabPanel(title = paste0("Plot_", method[x-1]),
p_compare[[x-1]],
hr(style = "border-top: 1px solid black;"),
style = "padding-top:5px; padding-bottom:25px;")
}
})
do.call(shiny::tabsetPanel, c(tabs, id = "viz"))
} else if (type %in% c("factor", "ordered")) {
drops <- seq(ncol(mat))[seq(ncol(mat)) %% 4 != 2][-1]
mat <- mat[, -c(drops)]
names(mat) <- c("original", paste("inpute", method, sep = "_"))
tab_compare <- mat %>%
t() %>%
as.data.frame()
names(tab_compare)[ncol(tab_compare)] <- "<Missing>"
tab_compare_rate <- tab_compare / rowSums(tab_compare)
tab_compare <- tab_compare %>%
reactable(
defaultColDef = colDef(
style = "font-size: 14px;color: hsl(0, 0%, 40%);",
format = colFormat(
separators = TRUE
)
)
)
tab_compare_rate <- tab_compare_rate %>%
reactable(
defaultColDef = colDef(
style = "font-size: 14px;color: hsl(0, 0%, 40%);",
format = colFormat(
percent = TRUE,
digits = 2
)
)
)
tabs <- lapply(seq(length(method) + 2), function(x) {
if (x == 1) {
shiny::tabPanel("Contingency Table", tab_compare,
hr(style = "border-top: 1px solid black;"),
style = "padding-top:5px; padding-bottom:25px;")
} else if (x == 2) {
shiny::tabPanel("Relate Contingency Table", tab_compare_rate,
hr(style = "border-top: 1px solid black;"),
style = "padding-top:5px; padding-bottom:25px;")
} else {
shiny::tabPanel(title = paste0("Plot_", method[x-2]),
p_compare[[x-2]],
hr(style = "border-top: 1px solid black;"),
style = "padding-top:5px; padding-bottom:25px;")
}
})
do.call(shiny::tabsetPanel, c(tabs, id = "viz"))
}
}
)
} else {
html_cat("There are no variables in the dataset with missing values.")
}
}
#' @importFrom shiny tabsetPanel tabPanel
#' @importFrom htmltools plotTag
#' @import dplyr
#' @import reactable
html_impute_outlier <- function(.data, base_family = NULL) {
if (is.null(base_family)) {
base_family <- "Roboto Condensed"
}
method <- c("mean", "median", "mode", "capping")
n <- nrow(.data)
outlist <- find_outliers(.data, index = FALSE)
if (length(outlist) > 0) {
tab_outlier <- .data %>%
select_at(outlist) %>%
diagnose_numeric() %>%
mutate(n = n) %>%
mutate(rate_outlier = outlier / n) %>%
select(variables, n, min, max, outlier, rate_outlier)
cap <- "Information about impute outliers"
html_cat(cap)
tab_outlier %>%
reactable(
defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);"),
columns = list(
variables = colDef(
name = "Variables"
),
n = colDef(
name = "Observations"
),
min = colDef(
name = "Min"
),
max = colDef(
name = "Max"
),
outlier = colDef(
name = "Outlier"
),
rate_outlier = colDef(
name = "Outlier(%)",
format = colFormat(
percent = TRUE,
digits = 2
)
)
),
details = function(index) {
variable <- tab_outlier$variables[index]
imputes <- method %>%
lapply(function(x) {
imputate_outlier(.data, all_of(variable), method = x)
})
p_compare <- imputes %>%
lapply(function(x) {
htmltools::plotTag({
plot(x, base_family = base_family)
}, sprintf("A plot of imputation"),
width = 600, height = 400, device = grDevices::png)
})
suppressMessages({
mat <- imputes %>%
purrr::map_dfc(function(x) {
summary_imputation(x) %>%
as.data.frame()
})
})
drops <- seq(ncol(mat))[seq(ncol(mat)) %% 2 == 1][-1]
mat <- mat[, -c(drops)]
names(mat) <- c("original", paste("inpute", method, sep = "_"))
tab_compare <- mat %>%
t() %>%
as.data.frame() %>%
select(mean, sd, IQR, p00, p25, p50, p75, p100, skewness, kurtosis) %>%
reactable(
defaultColDef = colDef(
style = "font-size: 14px;color: hsl(0, 0%, 40%);",
format = colFormat(
digits = 2
)
),
columns = list(
mean = colDef(
name = "Mean"
),
sd = colDef(
name = "Standard Devation"
),
p00 = colDef(
name = "Min"
),
p25 = colDef(
name = "Q1"
),
p50 = colDef(
name = "Median"
),
p75 = colDef(
name = "Q3"
),
p100 = colDef(
name = "Max"
),
skewness = colDef(
name = "Skewness"
),
kurtosis = colDef(
name = "Kurtosis"
)
)
)
tabs <- lapply(seq(length(method) + 1), function(x) {
if (x == 1) {
shiny::tabPanel("Distribution", tab_compare,
hr(style = "border-top: 1px solid black;"),
style = "padding-top:5px; padding-bottom:25px;")
} else {
shiny::tabPanel(title = paste0("Plot_", method[x-1]),
p_compare[[x-1]],
hr(style = "border-top: 1px solid black;"),
style = "padding-top:5px; padding-bottom:25px;")
}
})
do.call(shiny::tabsetPanel, c(tabs, id = "viz"))
}
)
} else {
html_cat("There are no variables in the dataset with outliers.")
}
}
#' @importFrom shiny tabsetPanel tabPanel
#' @importFrom htmltools plotTag
#' @import dplyr
#' @import reactable
html_resolve_skewness <- function(.data, base_family = NULL) {
if (is.null(base_family)) {
base_family <- "Roboto Condensed"
}
skewlist <- find_skewness(.data, index = FALSE)
if (length(skewlist) > 0) {
tab_skewness <- .data %>%
select_at(skewlist) %>%
describe(statistics = c("quantiles", "skewness"),
quantiles = c(0, 0.25, 0.5, 0.75, 1)) %>%
select(described_variables, p00:p100, skewness)
cap <- "Information about resolve skewness"
html_cat(cap)
tab_skewness %>%
reactable(
defaultColDef = colDef(
style = "font-size: 14px;color: hsl(0, 0%, 40%);",
minWidth = 80,
maxWidth = 100
),
columns = list(
described_variables = colDef(
name = "Variables",
minWidth = 100,
maxWidth = 150
),
p00 = colDef(
name = "Min"
),
p25 = colDef(
name = "Q1"
),
p50 = colDef(
name = "Median"
),
p75 = colDef(
name = "Q3"
),
p100 = colDef(
name = "Max"
),
skewness = colDef(
name = "Skewness",
format = colFormat(
digits = 2
)
)
),
details = function(index) {
variable <- tab_skewness$described_variables[index]
skewness <- tab_skewness$skewness[index]
if (skewness <= 0) {
method <- c("1/x", "x^2", "x^3", "Box-Cox")
} else {
method <- c("log", "log+1", "sqrt", "Box-Cox")
}
resolve <- method %>%
lapply(function(x) {
transform(pull(.data, variable), method = x)
})
p_compare <- resolve %>%
lapply(function(x) {
htmltools::plotTag({
plot(x, base_family = base_family)
}, sprintf("A plot of transform"),
width = 600, height = 400, device = grDevices::png)
})
suppressMessages({
mat <- resolve %>%
purrr::map_dfc(function(x) {
summary_transform(x) %>%
as.data.frame()
})
})
drops <- seq(ncol(mat))[seq(ncol(mat)) %% 2 == 1][-1]
mat <- mat[, -c(drops)]
names(mat) <- c("original", paste("transform", method, sep = "_"))
tab_compare <- mat %>%
t() %>%
as.data.frame() %>%
select(mean, sd, IQR, p00, p25, p50, p75, p100, skewness, kurtosis) %>%
reactable(
defaultColDef = colDef(
style = "font-size: 14px;color: hsl(0, 0%, 40%);",
format = colFormat(
digits = 2
)
),
columns = list(
mean = colDef(
name = "Mean"
),
sd = colDef(
name = "Standard Devation"
),
p00 = colDef(
name = "Min"
),
p25 = colDef(
name = "Q1"
),
p50 = colDef(
name = "Median"
),
p75 = colDef(
name = "Q3"
),
p100 = colDef(
name = "Max"
),
skewness = colDef(
name = "Skewness"
),
kurtosis = colDef(
name = "Kurtosis"
)
)
)
tabs <- lapply(seq(length(method) + 1), function(x) {
if (x == 1) {
shiny::tabPanel("Distribution", tab_compare,
hr(style = "border-top: 1px solid black;"),
style = "padding-top:5px; padding-bottom:25px;")
} else {
shiny::tabPanel(title = paste0("Plot_", method[x-1]),
p_compare[[x-1]],
hr(style = "border-top: 1px solid black;"),
style = "padding-top:5px; padding-bottom:25px;")
}
})
do.call(shiny::tabsetPanel, c(tabs, id = "viz"))
}
)
} else {
html_cat("There are no variables including skewed.")
}
}
#' @importFrom shiny tabsetPanel tabPanel
#' @importFrom htmltools plotTag
#' @importFrom purrr map_int
#' @import dplyr
#' @import reactable
html_binning <- function(.data, base_family = NULL) {
if (is.null(base_family)) {
base_family <- "Roboto Condensed"
}
method <- c("quantile", "equal", "pretty", "kmeans", "bclust")
numlist <- find_class(.data, "numerical", index = FALSE)
if (length(numlist) > 0) {
cap <- "Binning information for numeric variables"
html_cat(cap)
tab_numerical <- .data %>%
select_at(numlist) %>%
diagnose() %>%
full_join(data.frame(method = method, stringsAsFactors = FALSE),
by = character()) %>%
select(variables, types, unique_count, unique_rate, method)
options(show.error.messages = FALSE)
bins <- seq(nrow(tab_numerical)) %>%
lapply(function(x) {
binn <- try(binning(pull(.data, tab_numerical$variables[x]),
type = tab_numerical$method[x]),
silent = TRUE)
if (inherits(binn, "try-error")) {
return(NULL)
} else {
return(binn)
}
})
# remove error cases
is_complete <- sapply(bins, function(x) is(x)[1] == "bins")
bins <- bins[is_complete]
tab_numerical <- tab_numerical %>%
filter(is_complete)
options(show.error.messages = TRUE)
tab_numerical$n_bins <- bins %>%
purrr::map_int(function(x) attr(x, "levels") %>% length)
tab_numerical %>%
reactable(
defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);"),
columns = list(
variables = colDef(
name = "Variables",
minWidth = 100,
maxWidth = 150
),
types = colDef(
name = "Data Types",
minWidth = 80,
maxWidth = 100
),
unique_count = colDef(
name = "Unique",
format = colFormat(
separators = TRUE
)
),
unique_rate = colDef(
name = "Unique Rate",
format = colFormat(
digits = 3
)
),
method = colDef(
name = "Binning Method",
minWidth = 100,
maxWidth = 150
),
n_bins = colDef(
name = "Bins",
minWidth = 50,
maxWidth = 80
)
),
details = function(index) {
p_hist <- htmltools::plotTag({
plot(bins[[index]], base_family = base_family)
}, sprintf("A plot of bins"),
width = 600, height = 400, device = grDevices::png)
tab_bins <- summary(bins[[index]]) %>%
reactable(
defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);"),
columns = list(
levels = colDef(
name = "Bins"
),
freq = colDef(
name = "Frequency",
format = colFormat(
separators = TRUE
)
),
rate = colDef(
name = "Frequency(%)",
format = colFormat(
percent = TRUE,
digits = 2
)
)
)
)
shiny::tabsetPanel(
shiny::tabPanel("Distribution", p_hist,
hr(style = "border-top: 1px solid black;"),
style = "padding-top:5px; padding-bottom:25px;"),
shiny::tabPanel("Frequency Table", tab_bins,
hr(style = "border-top: 1px solid black;"),
style = "padding-top:5px; padding-bottom:25px;")
)
}
)
} else {
html_cat("There are no numerical variables.")
}
}
#' @importFrom shiny tabsetPanel tabPanel
#' @importFrom htmltools plotTag
#' @importFrom purrr map_int
#' @import dplyr
#' @import reactable
html_optimal_binning <- function(.data, target, base_family = NULL) {
if (is.null(base_family)) {
base_family <- "Roboto Condensed"
}
numlist <- find_class(.data, "numerical", index = FALSE)
if (is.null(target)) {
html_cat("The target variable is not defied.")
} else if (!target %in% names(.data)) {
html_cat(paste0("The target variable ", target, " is not in the data."))
} else if (length(numlist) == 0) {
html_cat("There are no numerical variables.")
} else {
n_levles <- length(table(pull(.data, target)))
if (n_levles != 2) {
html_cat("The target variable is not a binary class.")
} else {
tab_numerical <- .data %>%
select_at(numlist) %>%
diagnose() %>%
select(-missing_count, -missing_percent)
# Optimal Binning for Scoring Modeling
bins <- lapply(numlist, function(x)
binning_by(.data, y = target, x = all_of(x), p = 0.05))
success <- ifelse(sapply(bins, is.character),
"No significant splits", "Success")
tab_numerical <- tab_numerical %>% bind_cols(
data.frame(success = success))
tab_numerical$n_bins <- bins %>%
purrr::map_int(function(x) {
attr(x, "levels") %>% length
})
cap <- "Optimal binning information for numeric variables with target variable"
html_cat(cap)
tab_numerical %>%
reactable(
defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);"),
columns = list(
variables = colDef(
name = "Variables",
minWidth = 100,
maxWidth = 150
),
types = colDef(
name = "Data Types",
minWidth = 80,
maxWidth = 100
),
unique_count = colDef(
name = "Unique",
format = colFormat(
separators = TRUE
)
),
unique_rate = colDef(
name = "Unique Rate",
format = colFormat(
digits = 3
)
),
success = colDef(
name = "Success"
),
n_bins = colDef(
name = "Bins",
minWidth = 50,
maxWidth = 80
)
),
details = function(index) {
if (tab_numerical$n_bins[index] > 0) {
p_dist <- htmltools::plotTag({
plot(bins[[index]], base_family = base_family)
}, sprintf("A plot of bins"),
width = 600, height = 400, device = grDevices::png)
sticky_style <- list(position = "sticky", left = 0,
background = "#fff", zIndex = 1,
borderRight = "1px solid #eee",
fontSize = "15px")
tab_bins <- attr(bins[[index]], "performance") %>%
select(-CntCumPos, -CntCumNeg, -RateCumPos, -RateCumNeg) %>%
reactable(
sortable = FALSE,
defaultColDef = colDef(
style = "font-size: 14px;color: hsl(0, 0%, 40%);",
format = colFormat(
digits = 3
)
),
columns = list(
Bin = colDef(
style = sticky_style,
headerStyle = sticky_style
)
)
)
shiny::tabsetPanel(
shiny::tabPanel("Distribution", p_dist,
hr(style = "border-top: 1px solid black;"),
style = "padding-top:5px; padding-bottom:25px;"),
shiny::tabPanel("Binning Table", tab_bins,
hr(style = "border-top: 1px solid black;"),
style = "padding-top:5px; padding-bottom:25px;")
)
}
}
)
}
}
}
#' @import dplyr
#' @importFrom knitr kable
#' @importFrom kableExtra kable_styling add_header_above
html_paged_impute_missing <- function(.data, target = NULL, full_width = TRUE,
font_size = 13, base_family = NULL) {
if (is.null(base_family)) {
base_family <- "Roboto Condensed"
}
numerics <- c("mean", "median", "mode", "knn", "rpart", "mice")
categories <- c("mode", "rpart", "mice")
n_na <- .data %>%
sapply(function(x) sum(is.na(x))) %>%
.[. > 0]
if (length(n_na) > 0) {
data_type <- get_class(.data)
tab_missing <- n_na %>%
tibble::as_tibble() %>%
mutate(variable = names(n_na)) %>%
rename("missing" = value) %>%
bind_cols(n = NROW(.data)) %>%
mutate(rate_missing = missing / n) %>%
inner_join(data_type, by = "variable") %>%
select(variable, class, n, missing, rate_missing)
nm_cols <- c("variables", "data types", "observations", "missing", "missing(%)")
caption <- "Information of missing values"
tab_missing %>%
knitr::kable(format = "html", digits = 2, caption = caption,
col.names = nm_cols) %>%
kableExtra::kable_styling(full_width = full_width, font_size = font_size,
position = "left") %>%
gsub("font-size: initial !important;",
"font-size: 12px !important;", .) %>%
cat()
break_page_asis()
for (i in seq(NROW(tab_missing))) {
variable <- tab_missing$variable[i]
type <- tab_missing$class[i] %>%
as.character()
el <- div(h3(variable))
cat(as.character(el))
if (type %in% c("integer", "numeric")) {
method <- numerics
} else if (type %in% c("factor", "ordered")) {
method <- categories
}
if (is.null(target)) {
method <- setdiff(method, c("knn", "rpart", "mice"))
} else {
method <- method
}
imputes <- method %>%
lapply(function(x) {
if (is.null(target)) {
imputate_na(.data, all_of(variable), all_of(target), method = x,
print_flag = FALSE)
} else {
imputate_na(.data, all_of(variable), method = x,
print_flag = FALSE)
}
})
p_compare <- imputes %>%
lapply(function(x) {
plot(x, base_family = base_family)
})
suppressMessages({
mat <- imputes %>%
purrr::map_dfc(function(x) {
summary_imputation(x) %>%
as.data.frame()
})
})
if (type %in% c("integer", "numeric")) {
drops <- seq(ncol(mat))[seq(ncol(mat)) %% 2 == 1][-1]
mat <- mat[, -c(drops)]
names(mat) <- c("original", paste("inpute", method, sep = "_"))
nm_cols <- c("mean", "standard devation", "IQR", "min", "Q1", "median", "Q3",
"max", "skewness", "kurtosis")
tab_compare <- mat %>%
t() %>%
as.data.frame() %>%
select(mean, sd, IQR, p00, p25, p50, p75, p100, skewness, kurtosis)
for (j in seq(length(method))) {
el <- div(h4(paste0(variable, " - ", method[j])))
cat(as.character(el))
print(p_compare[[j]])
break_line_asis(1)
caption <- paste0("Distribution with ", method[j], " method")
tab_compare[c(1, j + 1), ] %>%
knitr::kable(format = "html", digits = 2, caption = caption,
col.names = nm_cols) %>%
kableExtra::kable_styling(full_width = full_width, font_size = font_size,
position = "left") %>%
gsub("font-size: initial !important;",
"font-size: 12px !important;", .) %>%
cat()
break_page_asis()
}
} else if (type %in% c("factor", "ordered")) {
drops <- seq(ncol(mat))[seq(ncol(mat)) %% 4 != 2][-1]
mat <- mat[, -c(drops)]
names(mat) <- c("original", paste("inpute", method, sep = "_"))
tab_compare <- mat %>%
t() %>%
as.data.frame()
names(tab_compare)[ncol(tab_compare)] <- "<Missing>"
tab_compare_rate <- tab_compare / rowSums(tab_compare)
for (j in seq(length(method))) {
el <- div(h4(paste0(variable, " - ", method[j])))
cat(as.character(el))
print(p_compare[[j]])
break_line_asis(1)
caption <- paste0("Contingency table with ", method[j], " method")
header_above <- c(1, NCOL(tab_compare))
names(header_above) <- c("imputation method", target)
tab_compare[c(1, j + 1), ] %>%
knitr::kable(format = "html", caption = caption,
format.args = list(big.mark = ",")) %>%
kableExtra::add_header_above(header_above) %>%
kableExtra::kable_styling(full_width = full_width, font_size = font_size,
position = "left") %>%
gsub("font-size: initial !important;",
"font-size: 12px !important;", .) %>%
cat()
break_line_asis(1)
caption <- paste0("Relate Contingency Table with ", method[j], " method")
tab_compare_rate[c(1, j + 1), ] %>%
knitr::kable(format = "html", digits = 2, caption = caption) %>%
kableExtra::add_header_above(header_above) %>%
kableExtra::kable_styling(full_width = full_width, font_size = font_size,
position = "left") %>%
gsub("font-size: initial !important;",
"font-size: 12px !important;", .) %>%
cat()
break_page_asis()
}
}
}
} else {
html_cat("There are no variables in the dataset with missing values.")
break_page_asis()
}
}
#' @import dplyr
#' @import reactable
#' @importFrom knitr kable
#' @importFrom kableExtra kable_styling add_header_above
html_paged_impute_outlier <- function(.data, full_width = TRUE, font_size = 13,
base_family = NULL) {
if (is.null(base_family)) {
base_family <- "Roboto Condensed"
}
method <- c("mean", "median", "mode", "capping")
n <- nrow(.data)
outlist <- find_outliers(.data, index = FALSE)
if (length(outlist) > 0) {
tab_outlier <- .data %>%
select_at(outlist) %>%
diagnose_numeric() %>%
mutate(n = n) %>%
mutate(rate_outlier = outlier / n) %>%
select(variables, n, min, max, outlier, rate_outlier)
nm_cols <- c("variables", "observations", "min", "max", "outlier", "outlier(%)")
caption <- "Information of outliers"
tab_outlier %>%
knitr::kable(format = "html", digits = 2, caption = caption,
col.names = nm_cols) %>%
kableExtra::kable_styling(full_width = full_width, font_size = font_size,
position = "left") %>%
gsub("font-size: initial !important;",
"font-size: 12px !important;", .) %>%
cat()
break_page_asis()
for (i in seq(NROW(tab_outlier))) {
variable <- tab_outlier$variables[i]
el <- div(h3(variable))
cat(as.character(el))
imputes <- method %>%
lapply(function(x) {
imputate_outlier(.data, all_of(variable), method = x)
})
p_compare <- imputes %>%
lapply(function(x) {
plot(x, base_family = base_family)
})
suppressMessages({
mat <- imputes %>%
purrr::map_dfc(function(x) {
summary_imputation(x) %>%
as.data.frame()
})
})
drops <- seq(ncol(mat))[seq(ncol(mat)) %% 2 == 1][-1]
mat <- mat[, -c(drops)]
names(mat) <- c("original", paste("inpute", method, sep = "_"))
tab_compare <- mat %>%
t() %>%
as.data.frame() %>%
select(mean, sd, IQR, p00, p25, p50, p75, p100, skewness, kurtosis)
nm_cols <- c("mean", "standard devation", "IQR", "min", "Q1", "median", "Q3",
"max", "skewness", "kurtosis")
for (j in seq(length(method))) {
el <- div(h4(paste0(variable, " - ", method[j])))
cat(as.character(el))
print(p_compare[[j]])
break_line_asis(1)
caption <- paste0("Distribution with ", method[j], " method")
tab_compare[c(1, j + 1), ] %>%
knitr::kable(format = "html", digits = 2, caption = caption,
col.names = nm_cols) %>%
kableExtra::kable_styling(full_width = full_width, font_size = font_size,
position = "left") %>%
gsub("font-size: initial !important;",
"font-size: 12px !important;", .) %>%
cat()
break_page_asis()
}
}
} else {
html_cat("There are no variables in the dataset with outliers.")
break_page_asis()
}
}
#' @import dplyr
#' @importFrom knitr kable
#' @importFrom kableExtra kable_styling
html_paged_resolve_skewness <- function(.data, full_width = TRUE, font_size = 13,
base_family = NULL) {
if (is.null(base_family)) {
base_family <- "Roboto Condensed"
}
skewlist <- find_skewness(.data, index = FALSE)
if (length(skewlist) > 0) {
tab_skewness <- .data %>%
select_at(skewlist) %>%
dlookr::describe(statistics = c("quantiles", "skewness"),
quantiles = c(0, 0.25, 0.5, 0.75, 1)) %>%
select(described_variables, p00:p100, skewness)
nm_cols <- c("variables", "min", "Q1", "median", "Q3", "max", "skewness")
caption <- "Information of skewness"
tab_skewness %>%
knitr::kable(format = "html", digits = 2, caption = caption,
col.names = nm_cols) %>%
kableExtra::kable_styling(full_width = full_width, font_size = font_size,
position = "left") %>%
gsub("font-size: initial !important;",
"font-size: 12px !important;", .) %>%
cat()
break_page_asis()
for (i in seq(NROW(tab_skewness))) {
variable <- tab_skewness$described_variables[i]
el <- div(h3(variable))
cat(as.character(el))
skewness <- tab_skewness$skewness[i]
if (skewness <= 0) {
method <- c("1/x", "x^2", "x^3", "Box-Cox")
} else {
method <- c("log", "log+1", "sqrt", "Box-Cox")
}
resolve <- method %>%
lapply(function(x) {
transform(pull(.data, variable), method = x)
})
# p_compare <- resolve %>%
# lapply(function(x) {
# plot(x)
# })
suppressMessages({
mat <- resolve %>%
purrr::map_dfc(function(x) {
summary_transform(x) %>%
as.data.frame()
})
})
drops <- seq(ncol(mat))[seq(ncol(mat)) %% 2 == 1][-1]
mat <- mat[, -c(drops)]
names(mat) <- c("original", paste("transform", method, sep = "_"))
tab_compare <- mat %>%
t() %>%
as.data.frame() %>%
select(IQR, p00, p25, p50, p75, p100, skewness, kurtosis)
nm_cols <- c("IQR", "min", "Q1", "median", "Q3",
"max", "skewness", "kurtosis")
for (j in seq(length(method))) {
el <- div(h4(paste0(variable, " - ", method[j])))
cat(as.character(el))
plot(resolve[[j]], base_family = base_family)
break_line_asis(1)
caption <- paste0("Distribution with ", method[j], " method")
tab_compare[c(1, j + 1), ] %>%
knitr::kable(format = "html", digits = 2, caption = caption,
col.names = nm_cols) %>%
kableExtra::kable_styling(full_width = full_width, font_size = font_size,
position = "left") %>%
gsub("font-size: initial !important;",
"font-size: 12px !important;", .) %>%
cat()
break_page_asis()
}
}
} else {
html_cat("There are no variables including skewed.")
break_page_asis()
}
}
#' @import dplyr
#' @importFrom knitr kable
#' @importFrom kableExtra kable_styling
html_paged_binning <- function(.data, full_width = TRUE, font_size = 13,
base_family = NULL) {
if (is.null(base_family)) {
base_family <- "Roboto Condensed"
}
method <- c("quantile", "equal", "pretty", "kmeans", "bclust")
numlist <- find_class(.data, "numerical", index = FALSE)
if (length(numlist) > 0) {
tab_numerical <- .data %>%
select_at(numlist) %>%
diagnose() %>%
full_join(data.frame(method = method, stringsAsFactors = FALSE),
by = character()) %>%
select(variables, types, unique_count, unique_rate, method)
options(show.error.messages = FALSE)
bins <- seq(nrow(tab_numerical)) %>%
lapply(function(x) {
binn <- try(binning(pull(.data, tab_numerical$variables[x]),
type = tab_numerical$method[x]),
silent = TRUE)
if (inherits(binn, "try-error")) {
return(NULL)
} else {
return(binn)
}
})
options(show.error.messages = TRUE)
tab_numerical$n_bins <- bins %>%
sapply(function(x) attr(x, "levels") %>% length)
nm_cols <- c("variables", "data types", "unique", "unique rate",
"binning method", "bins")
caption <- "Information of binnings"
print_tab(tab_numerical, n_rows = 30, add_row = 3, caption = caption,
full_width = TRUE, font_size = 13, col.names = nm_cols,
digits = 3, big_mark = TRUE)
variable_before <- ""
for (i in seq(NROW(tab_numerical))) {
variable <- tab_numerical$variables[i]
if (variable_before != variable) {
el <- div(h3(variable))
cat(as.character(el))
}
variable_before <- variable
el <- div(h4(paste0(variable, " - ", tab_numerical[i, 5])))
cat(as.character(el))
plot(bins[[i]], base_family = base_family)
break_line_asis(1)
nm_cols <- c("bins", "frequency", "frequency(%)")
caption <- "Contingency table with bins"
summary(bins[[i]]) %>%
knitr::kable(format = "html", digits = 2, caption = caption,
col.names = nm_cols) %>%
kableExtra::kable_styling(full_width = full_width, font_size = font_size,
position = "left") %>%
gsub("font-size: initial !important;",
"font-size: 12px !important;", .) %>%
cat()
break_page_asis()
}
} else {
html_cat("There are no numerical variables.")
break_page_asis()
}
}
#' @importFrom purrr map_int
#' @import dplyr
#' @importFrom knitr kable
#' @importFrom kableExtra kable_styling
html_paged_optimal_binning <- function(.data, target, full_width = TRUE,
font_size = 13, base_family = NULL) {
if (is.null(base_family)) {
base_family <- "Roboto Condensed"
}
numlist <- find_class(.data, "numerical", index = FALSE)
if (is.null(target)) {
html_cat("The target variable is not defied.")
break_page_asis()
} else if (!target %in% names(.data)) {
html_cat(paste0("The target variable ", target, " is not in the data."))
break_page_asis()
} else if (length(numlist) == 0) {
html_cat("There are no numerical variables.")
break_page_asis()
} else {
n_levles <- length(table(pull(.data, target)))
if (n_levles != 2) {
html_cat("The target variable is not a binary class.")
break_page_asis()
} else {
tab_numerical <- .data %>%
select_at(numlist) %>%
diagnose() %>%
select(-missing_count, -missing_percent)
# Optimal Binning for Scoring Modeling
bins <- lapply(numlist, function(x)
binning_by(.data, y = target, x = all_of(x), p = 0.05))
success <- ifelse(sapply(bins, is.character),
"No significant splits", "Success")
tab_numerical <- tab_numerical %>% bind_cols(
data.frame(success = success))
tab_numerical$n_bins <- bins %>%
purrr::map_int(function(x) {
attr(x, "levels") %>% length
})
nm_cols <- c("variables", "data types", "unique", "unique rate",
"success", "bins")
caption <- "Information of optimal binnings"
print_tab(tab_numerical, n_rows = 30, add_row = 3, caption = caption,
full_width = TRUE, font_size = 13, col.names = nm_cols,
digits = 3, big_mark = TRUE)
for (i in seq(NROW(tab_numerical))) {
if (tab_numerical$n_bins[i] == 0) {
next
}
variable <- tab_numerical$variables[i]
el <- div(h3(variable))
cat(as.character(el))
plot(bins[[i]], base_family = base_family)
break_line_asis(1)
caption <- "Binning table with performance measures"
attr(bins[[i]], "performance") %>%
select(-CntCumPos, -CntCumNeg, -RateCumPos, -RateCumNeg) %>%
knitr::kable(format = "html", digits = 2, caption = caption) %>%
kableExtra::kable_styling(full_width = full_width, font_size = font_size,
position = "left") %>%
gsub("font-size: initial !important;",
"font-size: 12px !important;", .) %>%
cat()
break_page_asis()
}
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.