knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE, collapse = FALSE, comment = "#>", fig.align = "center") knitr::opts_chunk$set(fig.width = 12, fig.height = 9)
# Load required packages for charts with UNHCR style library(dplyr) library(ggplot2) library(kableExtra) library(dlookr) library(htmltools) library(reactable) reportData <- get("reportData", .dlookrEnv) sample_percent <- get("sample_percent", .dlookrEnv) author <- get("author", .dlookrEnv) thres_uniq_cat <- get("thres_uniq_cat", .dlookrEnv) thres_uniq_num <- get("thres_uniq_num", .dlookrEnv) base_family <- get("base_family", .dlookrEnv)
```{css, echo=FALSE} .pagedjs_page.pagedjs_first_page .pagedjs_margin-top-right>.pagedjs_margin-content::after { content: url("$logo$"); }
.title { color: $title_color$; }
.subtitle {
color: $subtitle_color$;
font-size: 2.5em !important;
}
/ Warnings / .tag { display: inline-block; padding: 2px 12px; border-radius: 5px; font-weight: 600; font-size: 12px; }
.recommend-check { background: hsl(116, 60%, 90%); color: hsl(116, 30%, 25%); }
.recommend-judgement { background: hsl(230, 70%, 90%); color: hsl(230, 45%, 30%); }
.recommend-remove { background: hsl(350, 70%, 90%); color: hsl(350, 45%, 30%); }
```r # Number of observations N <- NROW(reportData) # sampling with sample_percent if (sample_percent < 100) { N_sample <- ceiling(N * sample_percent / 100) idx <- sample(seq(N), size = N_sample) reportData <- reportData[idx, ] } else { N_sample <- N } # solve the overview ov <- overview(reportData) # diagnose the missing & unique diagn_na_unique <- diagnose(reportData) # diagnose the numeric diagn_numeric <- diagnose_numeric(reportData) tab_warning <- data.frame( warnings = character(5000), status = character(5000), variables = character(5000), types = character(5000), indicator = numeric(5000), ratio = numeric(5000), recommend = character(5000), stringsAsFactors = FALSE) # duplicate -------------------------------------------------------------------- idx_last <- 0 n_duplicate <- length(attr(ov, "duplicate")) if (n_duplicate > 0) { idx <- 1 tab_warning$status[idx] <- "duplicate" tab_warning$variables[idx] <- NA tab_warning$types[idx] <- NA tab_warning$indicator[idx] <- n_duplicate tab_warning$ratio[idx] <- n_duplicate / N tab_warning$warnings[idx] <- sprintf( "dataset has %s (%s%%) duplicated observations", format(n_duplicate, big.mark = ","), round(n_duplicate / N * 100, 1)) tab_warning$recommend[idx] <- "check" } else { idx <- NULL } # missing ---------------------------------------------------------------------- idx_last <- idx_last + length(idx) warn_miss <- diagn_na_unique %>% filter(missing_count > 0) %>% select(variables, types, missing_count, missing_percent) %>% arrange(desc(missing_count)) if (nrow(warn_miss) > 0) { idx <- seq(nrow(warn_miss)) + idx_last tab_warning$status[idx] <- "missing" tab_warning$variables[idx] <- warn_miss$variables tab_warning$types[idx] <- warn_miss$types tab_warning$indicator[idx] <- warn_miss$missing_count tab_warning$ratio[idx] <- warn_miss$missing_percent / 100 tab_warning$warnings[idx] <- sprintf( "%s has %s (%s%%) missing values", warn_miss$variables, format(warn_miss$missing_count, big.mark = ","), round(warn_miss$missing_percent, 1)) tab_warning$recommend[idx] <- "judgement" } else { idx <- NULL } # cardinality: identifier ------------------------------------------------------ idx_last <- idx_last + length(idx) warn_identifier <- diagn_na_unique %>% filter(unique_rate == 1) %>% select(variables, types, unique_count, unique_rate) if (nrow(warn_identifier) > 0) { idx <- seq(nrow(warn_identifier)) + idx_last tab_warning$status[idx] <- "cardinality" tab_warning$variables[idx] <- warn_identifier$variables tab_warning$types[idx] <- warn_identifier$types tab_warning$indicator[idx] <- warn_identifier$unique_count tab_warning$ratio[idx] <- warn_identifier$unique_rate tab_warning$warnings[idx] <- sprintf( "%s has high(%.2f) cardinality, Maybe identifier", warn_identifier$variables, warn_identifier$unique_rate) tab_warning$recommend[idx] <- "check" } else { idx <- NULL } # cardinality: constant -------------------------------------------------------- idx_last <- idx_last + length(idx) warn_constant <- diagn_na_unique %>% filter(unique_count == 1) %>% select(variables, types, unique_count, unique_rate) if (nrow(warn_constant) > 0) { idx <- seq(nrow(warn_constant)) + idx_last tab_warning$status[idx] <- "cardinality" tab_warning$variables[idx] <- warn_constant$variables tab_warning$types[idx] <- warn_constant$types tab_warning$indicator[idx] <- warn_constant$unique_count tab_warning$ratio[idx] <- warn_constant$unique_rate tab_warning$warnings[idx] <- sprintf( "%s has constant value \"%s\"", warn_constant$variables, reportData[1, warn_constant$variables %>% as.character()] %>% t() %>% as.vector() ) tab_warning$recommend[idx] <- "remove" } else { idx <- NULL } # cardinally: high cardinality(category) --------------------------------------- idx_last <- idx_last + length(idx) warn_unique_cat <- diagn_na_unique %>% filter(types %in% c("character", "factor", "ordered", "Date", "POSIXct")) %>% filter(unique_rate >= thres_uniq_cat & unique_rate < 1) %>% select(variables, types, unique_count, unique_rate) if (nrow(warn_unique_cat) > 0) { idx <- seq(nrow(warn_unique_cat)) + idx_last tab_warning$status[idx] <- "cardinality" tab_warning$variables[idx] <- warn_unique_cat$variables tab_warning$types[idx] <- warn_unique_cat$types tab_warning$indicator[idx] <- warn_unique_cat$unique_count tab_warning$ratio[idx] <- warn_unique_cat$unique_rate tab_warning$warnings[idx] <- sprintf( "%s has a high cardinality. %s (%s%%) distinct values", warn_unique_cat$variables, format(warn_unique_cat$unique_count, big.mark = ","), round(warn_unique_cat$unique_rate * 100, 1)) tab_warning$recommend[idx] <- "judgement" } else { idx <- NULL } # cardinally: low cardinality(numerical) --------------------------------------- warn_unique_num <- diagn_na_unique %>% filter(types %in% c("numeric", "integer")) %>% filter(unique_count <= thres_uniq_num & unique_count > 1) %>% select(variables, types, unique_count, unique_rate) if (nrow(warn_unique_num) > 0) { idx <- seq(nrow(warn_unique_num)) + idx_last tab_warning$status[idx] <- "cardinality" tab_warning$variables[idx] <- warn_unique_num$variables tab_warning$types[idx] <- warn_unique_num$types tab_warning$indicator[idx] <- warn_unique_num$unique_count tab_warning$ratio[idx] <- warn_unique_num$unique_rate tab_warning$warnings[idx] <- sprintf( "%s has a low cardinality. %s (%s%%) distinct values", warn_unique_num$variables, format(warn_unique_num$unique_count, big.mark = ","), round(warn_unique_num$unique_rate * 100, 1)) tab_warning$recommend[idx] <- "judgement" } else { idx <- NULL } # zeros ------------------------------------------------------------------------ idx_last <- idx_last + length(idx) if (!is.null(diagn_numeric)) { warn_zero <- diagn_numeric %>% filter(zero > 0) %>% select(variables, zero) %>% arrange(desc(zero)) if (nrow(warn_zero) > 0) { idx <- seq(nrow(warn_zero)) + idx_last tab_warning$status[idx] <- "zero" tab_warning$variables[idx] <- warn_zero$variables tab_warning$types[idx] <- NA tab_warning$indicator[idx] <- warn_zero$zero tab_warning$ratio[idx] <- warn_zero$zero / N tab_warning$warnings[idx] <- sprintf( "%s has %s (%s%%) zeros", warn_zero$variables, format(warn_zero$zero, big.mark = ","), round(warn_zero$zero / N * 100, 2)) tab_warning$recommend[idx] <- "check" } else { idx <- NULL } } else { idx <- NULL } # cardinally: negative --------------------------------------------------------- idx_last <- idx_last + length(idx) if (!is.null(diagn_numeric)) { warn_minus <- diagn_numeric %>% filter(minus > 0) %>% select(variables, minus) %>% arrange(desc(minus)) if (nrow(warn_minus) > 0) { idx <- seq(nrow(warn_minus)) + idx_last tab_warning$status[idx] <- "negative" tab_warning$variables[idx] <- warn_minus$variables tab_warning$types[idx] <- NA tab_warning$indicator[idx] <- warn_minus$minus tab_warning$ratio[idx] <- warn_minus$minus / N tab_warning$warnings[idx] <- sprintf( "%s has %s (%s%%) negatives", warn_minus$variables, format(warn_minus$minus, big.mark = ","), round(warn_minus$minus / N * 100, 2)) tab_warning$recommend[idx] <- "check" } else { idx <- NULL } } else { idx <- NULL } # outlier ---------------------------------------------------------------------- idx_last <- idx_last + length(idx) if (!is.null(diagn_numeric)) { warn_outlier <- diagn_numeric %>% filter(outlier > 0) %>% select(variables, outlier) %>% arrange(desc(outlier)) if (nrow(warn_outlier) > 0) { idx <- seq(nrow(warn_outlier)) + idx_last tab_warning$status[idx] <- "outlier" tab_warning$variables[idx] <- warn_outlier$variables tab_warning$types[idx] <- NA tab_warning$indicator[idx] <- warn_outlier$outlier tab_warning$ratio[idx] <- warn_outlier$outlier / N tab_warning$warnings[idx] <- sprintf( "%s has %s (%s%%) outliers", warn_outlier$variables, format(warn_outlier$outlier, big.mark = ","), round(warn_outlier$outlier / N * 100, 2)) tab_warning$recommend[idx] <- "judgement" } else { idx <- NULL } } else { idx <- NULL } tab_warning <- tab_warning %>% filter(status != "")
tab_left <- ov[1:9, ] tab_right <- ov[10:nrow(ov), ] rownames(tab_right) <- seq(nrow(tab_right)) tab_left <- tab_left %>% mutate(value = ifelse (metrics %in% "observations", N, value)) %>% mutate(value = ifelse (metrics %in% "memory size", ifelse(value / 1024^2 > 0, round(value / 1024^2), round(value / 1024)), value)) %>% mutate(metrics = ifelse (metrics %in% "memory size", ifelse(value / 1024^2 > 0, "memory size (MB)", "memory size (KB)"), metrics)) knitr::kables(format = "html", list( knitr::kable(tab_left, digits = 2, format = "html", valign = "t", format.args = list(big.mark = ","), table.attr = "style=\"margin-right:40px !important;\"") %>% kable_styling(full_width = FALSE, font_size = 15), knitr::kable(tab_right, digits = 2, format = "html", valign = "t", format.args = list(big.mark = ",")) %>% kable_styling(full_width = FALSE, font_size = 15) ), caption = "Data structures and types") %>% gsub("font-size: initial !important;", "font-size: 12px !important;", .) %>% cat()
division <- c("dataset" ,"dataset", "job", "job", "job") metrics <- c("dataset" ,"dataset type", "samples", "created", "created by") value <- c("$dataset$", "$datatype$", paste0(format(N_sample, big.mark = ","), " / ", format(N, big.mark = ","), " (", sample_percent, "%)"), "$date$", ifelse(author == "", "dlookr", author)) info_job <- data.frame(division = division, metrics = metrics, value = value) cap <- "Job informations" print_tab(info_job, caption = cap)
if (NROW(tab_warning) < 1) { html_cat("No warnings") } else { tab <- tab_warning %>% count(recommend) %>% right_join(data.frame(recommend = c("check", "judgement", "remove")), by = "recommend") %>% tidyr::spread(recommend, n) %>% mutate_all(function(x) ifelse(is.na(x), 0, x)) knitr::kable(tab, format = "html", caption = "Summary of warnings", col.names = c("checks", "judgements", "removes"), format.args = list(big.mark = ",")) %>% kableExtra::kable_styling(full_width = TRUE, font_size = 14, position = "left") %>% gsub("font-size: initial !important;", "font-size: 12px !important;", .) %>% cat() }
tab_warning <- tab_warning %>% select(warnings, status, recommend) if (NROW(tab_warning) > 0) { cap <- "Warnings in dataset and variables" print_tab(tab_warning, caption = cap, n_rows = 22, add_row = 6) } else { html_cat("No warnings in dataset and variables") break_page_asis() }
cap <- "List of variables diagnosis" html_paged_variable(reportData, caption = cap, n_rows = 25)
$content_missing$
cap <- "List of variables including missing values" html_paged_missing(diagn_na_unique, caption = cap, n_rows = 23, add_row = 5)
diagn_missing <- diagn_na_unique %>% filter(missing_count > 0) if (NROW(diagn_missing) > 1) { break_line_asis(1) plot_na_intersect(reportData, base_family = base_family) } else if (NROW(diagn_missing) == 1) { html_cat("Supported only when the number of variables including missing values is 2 or more.") break_line_asis(1) } else { html_cat("No variables including missing values") break_line_asis(1) } break_page_asis()
$content_missing$
html_paged_unique_cat(diagn_na_unique, thres_uniq_cat, n_rows = 25, add_row = 4)
html_paged_unique_num(diagn_na_unique, thres_uniq_num, n_rows = 24, add_row = 5)
in_category <- find_class(reportData, type = "date_categorical2") %>% length() %>% as.logical() if (in_category) { diagn_category <- html_paged_toprank(reportData, top = 10, type = "n") cap <- "Top 10 levels of categorical variables" print_tab(diagn_category, caption = cap, n_rows = 25, add_row = 4) } else { html_cat("No categorical variable") break_page_asis() }
diagn_numeric <- diagnose_numeric(reportData) if (NROW(diagn_numeric) > 0) { cap <- "General list of numerical diagnosis" print_tab(diagn_numeric, caption = cap, font_size = 13, n_rows = 25, add_row = 4) } else { html_cat("No numerical variable") break_page_asis() }
$content_zero$
if (NROW(diagn_numeric) > 0) { diagn_zero <- diagn_numeric %>% filter(zero > 0) %>% select(variables, min, median, max, zero) %>% mutate(`zero (%)` = round(zero / N * 100, 1)) %>% arrange(desc(zero)) if (NROW(diagn_zero) > 0) { cap <- "List of numerical diagnosis (zero)" print_tab(diagn_zero, caption = cap, n_rows = 25, add_row = 4) } else { html_cat("No numeric variable with zero value") break_page_asis() } } else { html_cat("No numerical variable") break_page_asis() }
$content_zero$
$content_minus$
if (NROW(diagn_numeric) > 0) { diagn_minus <- diagn_numeric %>% filter(minus > 0) %>% select(variables, min, median, max, minus) %>% mutate(`minus (%)` = round(minus / N * 100, 1)) %>% arrange(desc(minus)) if (NROW(diagn_minus) > 0) { cap <- "List of numerical diagnosis (minus)" print_tab(diagn_minus, caption = cap, n_rows = 25, add_row = 4) } else { html_cat("No numeric variable with negative value") break_page_asis() } } else { html_cat("No numerical variable") break_page_asis() }
$content_minus$
if (NROW(diagn_numeric) > 0) { diagn_outlier <- diagn_numeric %>% filter(outlier > 0) %>% select(variables, min, median, max, outlier) %>% mutate(`outlier (%)` = round(outlier / N * 100, 1)) %>% arrange(desc(outlier)) if (NROW(diagn_outlier) > 0) { cap <- "Diagnosis of numerical variable outliers" print_tab(diagn_outlier, caption = cap, n_rows = 25, add_row = 4) } else { html_cat("No numeric variables including outliers") break_page_asis() } } else { html_cat("No numerical variable") break_page_asis() }
if (NROW(diagn_numeric) > 0) { diagn_outlier2 <- reportData %>% diagnose_outlier(diagn_outlier$variables) cols <- c("Outliers count", "Outliers ratio (%)", "Mean of outliers", "Mean with outliers", "Mean without outliers") if (NROW(diagn_outlier2) > 0) { variables <- diagn_outlier2 %>% select(variables) %>% unlist for (i in seq(variables)) { cap <- sprintf("variable: %s", variables[i]) cat(sprintf("<h3>%s</h3>", cap)) outlier_df <- data.frame(Measures = cols, Values = as.vector(t(diagn_outlier2[i, -1]))) values <- outlier_df$Values outlier_df$Values[1] <- round(values[1]) %>% format(big.mark = ",") outlier_df$Values[2] <- round(values[2], 2) %>% as.character() %>% paste0("%") outlier_df$Values[3] <- values[3] %>% format() %>% as.character() outlier_df$Values[4] <- values[4] %>% format() %>% as.character() outlier_df$Values[5] <- values[5] %>% format() %>% as.character() knitr::kable(outlier_df, digits = 2, caption = variables[i], format = "html", table.attr = "class=\"table table-width\"", align = "lr") %>% kable_styling(full_width = FALSE, font_size = 14, position = "center") %>% gsub("font-size: initial !important;", "font-size: 12px !important;", .) %>% cat() cat("<br><br>") plot_outlier(reportData, variables[i], base_family = base_family) break_page_asis() } } else { html_cat("No numeric variables including outliers") } } else { html_cat("No numerical variable") }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.