knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE, collapse = FALSE, comment = "#>", fig.align = "center") knitr::opts_chunk$set(fig.width = 12, fig.height = 9)
library(dlookr) library(dplyr) library(kableExtra) library(reactable) library(htmltools)
reportData <- get("reportData", .dlookrEnv) sample_percent <- get("sample_percent", .dlookrEnv) author <- get("author", .dlookrEnv) thres_uniq_cat <- get("thres_uniq_cat", .dlookrEnv) thres_uniq_num <- get("thres_uniq_num", .dlookrEnv) base_family <- get("base_family", .dlookrEnv)
```{css, echo=FALSE} :root { --custom-grey60: rgb(102, 102, 102); --custom-grey20: rgb(204, 204, 204); --custom-grey10: rgb(230, 230, 230); --custom-blue: rgb(0, 114, 188); --custom-lightblue: rgb(204, 227, 242); --custom-orange: rgb(255, 127, 42); --custom-lightorange: rgb(255, 204, 170); }
color: $title_color$; }
.navbar {
background-color: #f0f0f0;
border-bottom : 2px solid $customColor$ !important;
}
.nav>li>a { position: relative; display: block; padding: 10px 15px; }
a { color: #337ab7 !important; background-color: transparent !important; text-decoration: none; }
/ Warnings / .tag { display: inline-block; padding: 2px 12px; border-radius: 5px; font-weight: 600; font-size: 12px; }
.recommend-check { background: hsl(116, 60%, 90%); color: hsl(116, 30%, 25%); }
.recommend-judgement { background: hsl(43, 82%, 68%); color: hsl(230, 45%, 30%); }
.recommend-remove { background: hsl(25, 93%, 63%); color: hsl(176, 93%, 97%); }
/ Defined title with H1, H2, H3 / .title-h1 { font-size: 2.5em; color: $customColor$; }
.title-h2 { font-size: 2.0em; font-weight: 400; color: $customColor$; }
.title-h3 { font-size: 1.75em; font-weight: 400; color: rgb(102, 102, 102); }
```r # Number of observations N <- NROW(reportData) # sampling with sample_percent if (sample_percent < 100) { N_sample <- ceiling(N * sample_percent / 100) idx <- sample(seq(N), size = N_sample) reportData <- reportData[idx, ] } else { N_sample <- N } # solve the overview ov <- overview(reportData) # diagnose the missing & unique diagn_na_unique <- diagnose(reportData) # diagnose the numeric diagn_numeric <- diagnose_numeric(reportData) tab_warning <- data.frame( warnings = character(5000), status = character(5000), variables = character(5000), types = character(5000), indicator = numeric(5000), ratio = numeric(5000), recommend = character(5000), stringsAsFactors = FALSE) # duplicate -------------------------------------------------------------------- idx_last <- 0 n_duplicate <- length(attr(ov, "duplicate")) if (n_duplicate > 0) { idx <- 1 tab_warning$status[idx] <- "duplicate" tab_warning$variables[idx] <- NA tab_warning$types[idx] <- NA tab_warning$indicator[idx] <- n_duplicate tab_warning$ratio[idx] <- n_duplicate / N tab_warning$warnings[idx] <- sprintf( "dataset has %s (%s%%) duplicated observations", format(n_duplicate, big.mark = ","), round(n_duplicate / N * 100, 1)) tab_warning$recommend[idx] <- "check" } else { idx <- NULL } # missing ---------------------------------------------------------------------- idx_last <- idx_last + length(idx) warn_miss <- diagn_na_unique %>% filter(missing_count > 0) %>% select(variables, types, missing_count, missing_percent) %>% arrange(desc(missing_count)) if (nrow(warn_miss) > 0) { idx <- seq(nrow(warn_miss)) + idx_last tab_warning$status[idx] <- "missing" tab_warning$variables[idx] <- warn_miss$variables tab_warning$types[idx] <- warn_miss$types tab_warning$indicator[idx] <- warn_miss$missing_count tab_warning$ratio[idx] <- warn_miss$missing_percent / 100 tab_warning$warnings[idx] <- sprintf( "%s has %s (%s%%) missing values", warn_miss$variables, format(warn_miss$missing_count, big.mark = ","), round(warn_miss$missing_percent, 1)) tab_warning$recommend[idx] <- "judgement" } else { idx <- NULL } # cardinality: identifier ------------------------------------------------------ idx_last <- idx_last + length(idx) warn_identifier <- diagn_na_unique %>% filter(unique_rate == 1) %>% select(variables, types, unique_count, unique_rate) if (nrow(warn_identifier) > 0) { idx <- seq(nrow(warn_identifier)) + idx_last tab_warning$status[idx] <- "cardinality" tab_warning$variables[idx] <- warn_identifier$variables tab_warning$types[idx] <- warn_identifier$types tab_warning$indicator[idx] <- warn_identifier$unique_count tab_warning$ratio[idx] <- warn_identifier$unique_rate tab_warning$warnings[idx] <- sprintf( "%s has high(%.2f) cardinality, Maybe identifier", warn_identifier$variables, warn_identifier$unique_rate) tab_warning$recommend[idx] <- "check" } else { idx <- NULL } # cardinality: constant -------------------------------------------------------- idx_last <- idx_last + length(idx) warn_constant <- diagn_na_unique %>% filter(unique_count == 1) %>% select(variables, types, unique_count, unique_rate) if (nrow(warn_constant) > 0) { idx <- seq(nrow(warn_constant)) + idx_last tab_warning$status[idx] <- "cardinality" tab_warning$variables[idx] <- warn_constant$variables tab_warning$types[idx] <- warn_constant$types tab_warning$indicator[idx] <- warn_constant$unique_count tab_warning$ratio[idx] <- warn_constant$unique_rate tab_warning$warnings[idx] <- sprintf( "%s has constant value \"%s\"", warn_constant$variables, reportData[1, warn_constant$variables %>% as.character()] %>% t() %>% as.vector() ) tab_warning$recommend[idx] <- "remove" } else { idx <- NULL } # cardinally: high cardinality(category) --------------------------------------- idx_last <- idx_last + length(idx) warn_unique_cat <- diagn_na_unique %>% filter(types %in% c("character", "factor", "ordered", "Date", "POSIXct")) %>% filter(unique_rate >= thres_uniq_cat & unique_rate < 1) %>% select(variables, types, unique_count, unique_rate) if (nrow(warn_unique_cat) > 0) { idx <- seq(nrow(warn_unique_cat)) + idx_last tab_warning$status[idx] <- "cardinality" tab_warning$variables[idx] <- warn_unique_cat$variables tab_warning$types[idx] <- warn_unique_cat$types tab_warning$indicator[idx] <- warn_unique_cat$unique_count tab_warning$ratio[idx] <- warn_unique_cat$unique_rate tab_warning$warnings[idx] <- sprintf( "%s has a high cardinality. %s (%s%%) distinct values", warn_unique_cat$variables, format(warn_unique_cat$unique_count, big.mark = ","), round(warn_unique_cat$unique_rate * 100, 1)) tab_warning$recommend[idx] <- "judgement" } else { idx <- NULL } # cardinally: low cardinality(numerical) --------------------------------------- warn_unique_num <- diagn_na_unique %>% filter(types %in% c("numeric", "integer")) %>% filter(unique_count <= thres_uniq_num & unique_count > 1) %>% select(variables, types, unique_count, unique_rate) if (nrow(warn_unique_num) > 0) { idx <- seq(nrow(warn_unique_num)) + idx_last tab_warning$status[idx] <- "cardinality" tab_warning$variables[idx] <- warn_unique_num$variables tab_warning$types[idx] <- warn_unique_num$types tab_warning$indicator[idx] <- warn_unique_num$unique_count tab_warning$ratio[idx] <- warn_unique_num$unique_rate tab_warning$warnings[idx] <- sprintf( "%s has a low cardinality. %s (%s%%) distinct values", warn_unique_num$variables, format(warn_unique_num$unique_count, big.mark = ","), round(warn_unique_num$unique_rate * 100, 1)) tab_warning$recommend[idx] <- "judgement" } else { idx <- NULL } # zeros ------------------------------------------------------------------------ idx_last <- idx_last + length(idx) if (!is.null(diagn_numeric)) { warn_zero <- diagn_numeric %>% filter(zero > 0) %>% select(variables, zero) %>% arrange(desc(zero)) if (nrow(warn_zero) > 0) { idx <- seq(nrow(warn_zero)) + idx_last tab_warning$status[idx] <- "zero" tab_warning$variables[idx] <- warn_zero$variables tab_warning$types[idx] <- NA tab_warning$indicator[idx] <- warn_zero$zero tab_warning$ratio[idx] <- warn_zero$zero / N tab_warning$warnings[idx] <- sprintf( "%s has %s (%s%%) zeros", warn_zero$variables, format(warn_zero$zero, big.mark = ","), round(warn_zero$zero / N * 100, 2)) tab_warning$recommend[idx] <- "check" } else { idx <- NULL } } else { idx <- NULL } # cardinally: negative --------------------------------------------------------- idx_last <- idx_last + length(idx) if (!is.null(diagn_numeric)) { warn_minus <- diagn_numeric %>% filter(minus > 0) %>% select(variables, minus) %>% arrange(desc(minus)) if (nrow(warn_minus) > 0) { idx <- seq(nrow(warn_minus)) + idx_last tab_warning$status[idx] <- "negative" tab_warning$variables[idx] <- warn_minus$variables tab_warning$types[idx] <- NA tab_warning$indicator[idx] <- warn_minus$minus tab_warning$ratio[idx] <- warn_minus$minus / N tab_warning$warnings[idx] <- sprintf( "%s has %s (%s%%) negatives", warn_minus$variables, format(warn_minus$minus, big.mark = ","), round(warn_minus$minus / N * 100, 2)) tab_warning$recommend[idx] <- "check" } else { idx <- NULL } } else { idx <- NULL } # outlier ---------------------------------------------------------------------- idx_last <- idx_last + length(idx) if (!is.null(diagn_numeric)) { warn_outlier <- diagn_numeric %>% filter(outlier > 0) %>% select(variables, outlier) %>% arrange(desc(outlier)) if (nrow(warn_outlier) > 0) { idx <- seq(nrow(warn_outlier)) + idx_last tab_warning$status[idx] <- "outlier" tab_warning$variables[idx] <- warn_outlier$variables tab_warning$types[idx] <- NA tab_warning$indicator[idx] <- warn_outlier$outlier tab_warning$ratio[idx] <- warn_outlier$outlier / N tab_warning$warnings[idx] <- sprintf( "%s has %s (%s%%) outliers", warn_outlier$variables, format(warn_outlier$outlier, big.mark = ","), round(warn_outlier$outlier / N * 100, 2)) tab_warning$recommend[idx] <- "judgement" } else { idx <- NULL } } else { idx <- NULL } tab_warning <- tab_warning %>% filter(status != "")
division <- c("dataset" ,"dataset", "job", "job", "job") metrics <- c("dataset" ,"dataset type", "samples", "created", "created by") value <- c("$dataset$", "$datatype$", paste0(format(N_sample, big.mark = ","), " / ", format(N, big.mark = ","), " (", sample_percent, "%)"), "$date$", ifelse(author == "", "dlookr", author)) overview <- data.frame(Divisions = division, Metrics = metrics, Values = value)
h1("Overview", id = "ID-h1-overview", class = "title-h1")
h2("Data Structures", id = "ID-h2-data-struct", class = "title-h2")
tab_left <- ov[1:9, ] tab_right <- ov[10:nrow(ov), ] rownames(tab_right) <- seq(nrow(tab_right)) tab_left <- tab_left %>% mutate(value = ifelse (metrics %in% "observations", N, value)) %>% mutate(value = ifelse (metrics %in% "memory size", ifelse(value / 1024^2 > 0, round(value / 1024^2), round(value / 1024)), value)) %>% mutate(metrics = ifelse (metrics %in% "memory size", ifelse(value / 1024^2 > 0, "memory size (MB)", "memory size (KB)"), metrics)) cap <- "Data Structures" knitr::kable(tab_left, digits = 2, caption = cap, format = "html", format.args = list(big.mark = ","), col.names = c("Divisions", "Metrics", "Values"), table.attr = "style=\"color: hsl(0, 0%, 40%);margin-right:30px !important;\"") %>% kable_styling(full_width = FALSE, font_size = 14, position = "float_left") cap <- "Data Types" knitr::kable(tab_right, digits = 2, caption = cap, format = "html", col.names = c("Divisions", "Metrics", "Values"), format.args = list(big.mark = ","), table.attr = "style=\"color: hsl(0, 0%, 40%);margin-right:30px !important;\"") %>% kable_styling(full_width = FALSE, font_size = 14, position = "float_left") cap <- "Job Information" knitr::kable(overview, caption = cap, format = "html", table.attr = "style=\"color: hsl(0, 0%, 40%);\"") %>% kable_styling(full_width = FALSE, font_size = 14, position = "left") breaks <- 9 - nrow(overview) + 3 break_line_asis(breaks)
h2("Warnings", id = "ID-h2-waring", class = "title-h2")
if (NROW(tab_warning) < 1) { html_cat("No warnings") } else { tab_warning %>% count(recommend) %>% right_join(data.frame(recommend = c("check", "judgement", "remove")), by = "recommend") %>% tidyr::spread(recommend, n) %>% mutate_all(function(x) ifelse(is.na(x), 0, x)) %>% reactable( defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);", minWidth = 120), columns = list( check = colDef( name = "Checks", style = function(value) { list(color = "#007000", fontWeight = "bold") } ), judgement = colDef( name = "Judgements", style = function(value) { list(color = "#fdb368", fontWeight = "bold") } ), remove = colDef( name = "Removes", style = function(value) { list(color = "#e00000", fontWeight = "bold") } ) ), fullWidth = FALSE ) }
if (NROW(tab_warning) > 0) { tab_warning %>% select(warnings, status, recommend) %>% reactable( defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);"), columns = list( warnings = colDef( name = "Warnings", cell = function(value, index) { variable_name <- strsplit(value, " ") %>% unlist() %>% "["(1) msg <- strsplit(value, " ") %>% unlist() %>% "["(-1) %>% paste(collapse = " ") msg <- paste("", msg) if (tab_warning$status[index] %in% "duplicate") { variable <- a(class = "anchor", href = "#ID-h2-duplicate", variable_name) } else if (tab_warning$status[index] %in% "missing") { variable <- a(class = "anchor", href = "#ID-h1-missing", variable_name) } else if (tab_warning$status[index] %in% "cardinality") { if (tab_warning$types[index] %in% c("character", "factor", "ordered", "Date", "POSIXct")) variable <- a(class = "anchor", href = "#ID-h2-uniq-categorical", variable_name) else variable <- a(class = "anchor", href = "#ID-h2-uniq-numerical", variable_name) } else if (tab_warning$status[index] %in% "outlier") { variable <- a(class = "anchor", href = "#ID-h1-outlier", variable_name) } else if (tab_warning$status[index] %in% c("zero", "negative")) { variable <- a(class = "anchor", href = "#ID-h2-variables", variable_name) } else { variable <- div(style = list(color = "red"), variable_name) } tagList( div(style = list(display = "inline-block"), variable), msg ) } ), status = colDef(name = "Types", width = 100), recommend = colDef( name = "Recommendations", width = 130, cell = function(value) { class <- paste0("tag recommend-", tolower(value)) div(class = class, value) } ) ) ) } break_line_asis(2)
h2("Variables", id = "ID-h2-variables", class = "title-h2")
cap <- "Diagnostic overview of individual variables" html_cat(cap) html_variable(reportData, theme = "$theme$", base_family = base_family) break_line_asis(2)
h1("Missing Values", id = "ID-h1-missing", class = "title-h1")
h2("List of Missing Values", id = "ID-h2-missing-value", class = "title-h2")
html_missing(diagn_na_unique) break_line_asis(1)
h2("Visualization", id = "ID-h2-missing-viz", class = "title-h2")
diagn_missing <- diagn_na_unique %>% filter(missing_count > 0) if (NROW(diagn_missing) > 1) { cap <- "Interaction of variables including missing values" html_cat(cap) break_line_asis(1) plot_na_intersect(reportData, base_family = base_family) } else if (NROW(diagn_missing) == 1) { html_cat("upported only when the number of variables including missing values is 2 or more") break_line_asis(1) } else { html_cat("No variables including missing values") break_line_asis(1) } break_line_asis(1)
h1("Unique Values", id = "ID-h1-uniq-value", class = "title-h1")
h2("Categorical Variables", id = "ID-h2-uniq-categorical", class = "title-h2")
html_unique_cat(diagn_na_unique, thres_uniq_cat) break_line_asis(1)
h2("Numerical Variables", id = "ID-h2-uniq-numerical", class = "title-h2")
html_unique_num(diagn_na_unique, thres_uniq_num) break_line_asis(1)
h1("Outliers", id = "ID-h1-outlier", class = "title-h1")
if (NROW(diagn_numeric) > 0) { diagn_outlier <- diagn_numeric %>% filter(outlier > 0) if (NROW(diagn_outlier) > 0) { cap <- "Diagnosis of numerical variable outliers" html_cat(cap) html_outlier(reportData, theme = "$theme$", base_family = base_family) } else { html_cat("No numeric variables including outliers") break_line_asis(1) } } else { html_cat("No numerical variable") break_line_asis(1) } break_line_asis(1)
h1("Samples", id = "ID-h1-sample", class = "title-h1")
h2("Duplicated", id = "ID-h2-duplicate", class = "title-h2")
cap <- "Duplicated records" idx_dup <- attr(ov, "duplicated") if (length(idx_dup)) { tabs <- reportData[idx_dup, ] %>% reactable::reactable( defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);", minWidth = 150) ) htmlwidgets::prependContent( tabs, div(class = "caption", style = "padding-top: 8px; padding-bottom: 8px; color: #777777; text-align: left;", cap) ) } else { html_cat("No duplicated records") break_line_asis(1) } break_line_asis()
h2("Heads", id = "ID-h2-head", class = "title-h2")
cap <- "First few records" tabs <- reportData %>% head(10) %>% reactable::reactable( defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);", minWidth = 150) ) htmlwidgets::prependContent(tabs, div(class = "caption", style = "padding-top: 8px; padding-bottom: 8px; color: #777777; text-align: left;", cap)) break_line_asis()
h2("Tails", id = "ID-h2-tail", class = "title-h2")
cap <- "Last few records" tabs <- reportData %>% tail(10) %>% reactable::reactable( defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);", minWidth = 150) ) htmlwidgets::prependContent(tabs, div(class = "caption", style = "padding-top: 8px; padding-bottom: 8px; color: #777777; text-align: left;", cap))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.