dlookr: Data Diagnosis

knitr::opts_chunk$set(echo = FALSE, 
                      message = FALSE, 
                      warning = FALSE, 
                      collapse = FALSE,
                      comment = "#>",
                      fig.align = "center")
knitr::opts_chunk$set(fig.width = 12, fig.height = 9)
library(dlookr)
library(dplyr)
library(kableExtra)
library(reactable)
library(htmltools)
reportData <- get("reportData", .dlookrEnv)
sample_percent <- get("sample_percent", .dlookrEnv)
author <- get("author", .dlookrEnv)
thres_uniq_cat <- get("thres_uniq_cat", .dlookrEnv)
thres_uniq_num <- get("thres_uniq_num", .dlookrEnv)
base_family <- get("base_family", .dlookrEnv)

```{css, echo=FALSE} :root { --custom-grey60: rgb(102, 102, 102); --custom-grey20: rgb(204, 204, 204); --custom-grey10: rgb(230, 230, 230); --custom-blue: rgb(0, 114, 188); --custom-lightblue: rgb(204, 227, 242); --custom-orange: rgb(255, 127, 42); --custom-lightorange: rgb(255, 204, 170); }

header .title{

color: $title_color$; }

.navbar { background-color: #f0f0f0;
border-bottom : 2px solid $customColor$ !important; }

.nav>li>a { position: relative; display: block; padding: 10px 15px; }

a { color: #337ab7 !important; background-color: transparent !important; text-decoration: none; }

/ Warnings / .tag { display: inline-block; padding: 2px 12px; border-radius: 5px; font-weight: 600; font-size: 12px; }

.recommand-check { background: hsl(116, 60%, 90%); color: hsl(116, 30%, 25%); }

.recommand-judgement { background: hsl(43, 82%, 68%); color: hsl(230, 45%, 30%); }

.recommand-remove { background: hsl(25, 93%, 63%); color: hsl(176, 93%, 97%); }

/ Defined title with H1, H2, H3 / .title-h1 { font-size: 2.5em; color: $customColor$; }

.title-h2 { font-size: 2.0em; font-weight: 400; color: $customColor$; }

.title-h3 { font-size: 1.75em; font-weight: 400; color: rgb(102, 102, 102); }

```r
# Number of observations
N <- NROW(reportData)

# sampling with sample_percent
if (sample_percent < 100) {
  N_sample <- ceiling(N * sample_percent / 100)
  idx <- sample(seq(N), size = N_sample)

  reportData <- reportData[idx, ]
} else {
  N_sample <- N
}

# solve the overview
ov <- overview(reportData)

# diagnose the missing & unique
diagn_na_unique <- diagnose(reportData)

# diagnose the numeric
diagn_numeric <- diagnose_numeric(reportData)

tab_warning <- data.frame(
  warnings = character(5000), status = character(5000), variables = character(5000),
  types = character(5000), indicator = numeric(5000), ratio = numeric(5000), 
  recommand = character(5000), stringsAsFactors = FALSE)

# duplicate --------------------------------------------------------------------
idx_last <- 0
n_duplicate <- length(attr(ov, "duplicate"))

if (n_duplicate > 0) {
  idx <- 1

  tab_warning$status[idx]    <- "duplicate"
  tab_warning$variables[idx] <- NA
  tab_warning$types[idx] <- NA  
  tab_warning$indicator[idx] <- n_duplicate
  tab_warning$ratio[idx]     <- n_duplicate / N
  tab_warning$warnings[idx]  <- sprintf(
    "dataset has %s (%s%%) duplicated observations", 
    format(n_duplicate, big.mark = ","),
    round(n_duplicate / N * 100, 1))
  tab_warning$recommand[idx]  <- "check"  
} else {
  idx <- NULL
}

# missing ----------------------------------------------------------------------
idx_last <- idx_last + length(idx)

warn_miss <- diagn_na_unique %>% 
  filter(missing_count > 0) %>% 
  select(variables, types, missing_count, missing_percent) %>% 
  arrange(desc(missing_count))

if (nrow(warn_miss) > 0) {
  idx <- seq(nrow(warn_miss)) + idx_last

  tab_warning$status[idx]    <- "missing"
  tab_warning$variables[idx] <- warn_miss$variables
  tab_warning$types[idx]     <- warn_miss$types  
  tab_warning$indicator[idx] <- warn_miss$missing_count
  tab_warning$ratio[idx]     <- warn_miss$missing_percent / 100
  tab_warning$warnings[idx]  <- sprintf(
    "%s has %s (%s%%) missing values", warn_miss$variables,
    format(warn_miss$missing_count, big.mark = ","),
    round(warn_miss$missing_percent, 1))
  tab_warning$recommand[idx]  <- "judgement"
} else {
  idx <- NULL
}

# cardinality: identifier ------------------------------------------------------
idx_last <- idx_last + length(idx)

warn_identifier <- diagn_na_unique %>% 
  filter(unique_rate == 1) %>% 
  select(variables, types, unique_count, unique_rate)

if (nrow(warn_identifier) > 0) {
  idx <- seq(nrow(warn_identifier)) + idx_last

  tab_warning$status[idx]     <- "cardinality"
  tab_warning$variables[idx]  <- warn_identifier$variables
  tab_warning$types[idx]      <- warn_identifier$types    
  tab_warning$indicator[idx]  <- warn_identifier$unique_count
  tab_warning$ratio[idx]      <- warn_identifier$unique_rate 
  tab_warning$warnings[idx]   <- sprintf(
    "%s has high(%.2f) cardinality, Maybe identifier",
    warn_identifier$variables, warn_identifier$unique_rate)
  tab_warning$recommand[idx]  <- "check"  
} else {
  idx <- NULL
}

# cardinality: constant --------------------------------------------------------
idx_last <- idx_last + length(idx)

warn_constant <- diagn_na_unique %>% 
  filter(unique_count == 1) %>% 
  select(variables, types, unique_count, unique_rate)

if (nrow(warn_constant) > 0) {
  idx <- seq(nrow(warn_constant)) + idx_last

  tab_warning$status[idx]     <- "cardinality"
  tab_warning$variables[idx]  <- warn_constant$variables
  tab_warning$types[idx]      <- warn_constant$types   
  tab_warning$indicator[idx]  <- warn_constant$unique_count
  tab_warning$ratio[idx]      <- warn_constant$unique_rate 
  tab_warning$warnings[idx]   <- sprintf(
    "%s has constant value \"%s\"",
    warn_constant$variables, 
    reportData[1, warn_constant$variables %>% as.character()] %>% 
      t() %>% 
      as.vector()
    )  
  tab_warning$recommand[idx]  <- "remove"
} else {
  idx <- NULL
}

# cardinally: high cardinality(category) ---------------------------------------
idx_last <- idx_last + length(idx)

warn_unique_cat <- diagn_na_unique %>% 
  filter(types %in% c("character", "factor", "ordered", "Date", "POSIXct")) %>%
  filter(unique_rate >= thres_uniq_cat & unique_rate < 1) %>%
  select(variables, types, unique_count, unique_rate)

if (nrow(warn_unique_cat) > 0) {
  idx <- seq(nrow(warn_unique_cat)) + idx_last

  tab_warning$status[idx]     <- "cardinality"
  tab_warning$variables[idx]  <- warn_unique_cat$variables
  tab_warning$types[idx]      <- warn_unique_cat$types   
  tab_warning$indicator[idx]  <- warn_unique_cat$unique_count
  tab_warning$ratio[idx]      <- warn_unique_cat$unique_rate 
  tab_warning$warnings[idx]   <- sprintf(
    "%s has a high cardinality. %s (%s%%) distinct values",
    warn_unique_cat$variables, 
    format(warn_unique_cat$unique_count, big.mark = ","),
    round(warn_unique_cat$unique_rate * 100, 1))  
  tab_warning$recommand[idx]  <- "judgement"
} else {
  idx <- NULL
}

# cardinally: low cardinality(numerical) ---------------------------------------
warn_unique_num <- diagn_na_unique %>% 
  filter(types %in% c("numeric", "integer")) %>%
  filter(unique_count <= thres_uniq_num & unique_count > 1) %>%
  select(variables, types, unique_count, unique_rate)

if (nrow(warn_unique_num) > 0) {
  idx <- seq(nrow(warn_unique_num)) + idx_last

  tab_warning$status[idx]     <- "cardinality"
  tab_warning$variables[idx]  <- warn_unique_num$variables
  tab_warning$types[idx]      <- warn_unique_num$types   
  tab_warning$indicator[idx]  <- warn_unique_num$unique_count
  tab_warning$ratio[idx]      <- warn_unique_num$unique_rate 
  tab_warning$warnings[idx]   <- sprintf(
    "%s has a low cardinality. %s (%s%%) distinct values",
    warn_unique_num$variables, 
    format(warn_unique_num$unique_count, big.mark = ","),
    round(warn_unique_num$unique_rate * 100, 1))  
  tab_warning$recommand[idx]  <- "judgement"
} else {
  idx <- NULL
}

# zeros ------------------------------------------------------------------------
idx_last <- idx_last + length(idx)

if (!is.null(diagn_numeric)) {
  warn_zero <- diagn_numeric %>% 
  filter(zero > 0) %>% 
  select(variables, zero) %>% 
  arrange(desc(zero))

  if (nrow(warn_zero) > 0) {
    idx <- seq(nrow(warn_zero)) + idx_last

    tab_warning$status[idx]     <- "zero"
    tab_warning$variables[idx]  <- warn_zero$variables
    tab_warning$types[idx]      <- NA  
    tab_warning$indicator[idx]  <- warn_zero$zero
    tab_warning$ratio[idx]      <- warn_zero$zero / N 
    tab_warning$warnings[idx]   <- sprintf(
      "%s has %s (%s%%) zeros", warn_zero$variables, 
      format(warn_zero$zero, big.mark = ","), round(warn_zero$zero / N * 100, 2))  
    tab_warning$recommand[idx]  <- "check"
  } else {
    idx <- NULL
  }
} else {
    idx <- NULL
}  


# cardinally: negative ---------------------------------------------------------
idx_last <- idx_last + length(idx)

if (!is.null(diagn_numeric)) {
  warn_minus <- diagn_numeric %>% 
    filter(minus > 0) %>% 
    select(variables, minus) %>% 
    arrange(desc(minus))

  if (nrow(warn_minus) > 0) {
    idx <- seq(nrow(warn_minus)) + idx_last

    tab_warning$status[idx]     <- "negative"
    tab_warning$variables[idx]  <- warn_minus$variables
    tab_warning$types[idx]      <- NA    
    tab_warning$indicator[idx]  <- warn_minus$minus
    tab_warning$ratio[idx]      <- warn_minus$minus / N 
    tab_warning$warnings[idx]   <- sprintf(
      "%s has %s (%s%%) negatives",
      warn_minus$variables, format(warn_minus$minus, big.mark = ","),
      round(warn_minus$minus / N * 100, 2))  
    tab_warning$recommand[idx]  <- "check"
  } else {
    idx <- NULL
  }
} else {
  idx <- NULL
}  

# outlier ----------------------------------------------------------------------
idx_last <- idx_last + length(idx)

if (!is.null(diagn_numeric)) {
  warn_outlier <- diagn_numeric %>% 
    filter(outlier > 0) %>% 
    select(variables, outlier) %>% 
    arrange(desc(outlier))

  if (nrow(warn_outlier) > 0) {
    idx <- seq(nrow(warn_outlier)) + idx_last

    tab_warning$status[idx]     <- "outlier"
    tab_warning$variables[idx]  <- warn_outlier$variables
    tab_warning$types[idx]      <- NA    
    tab_warning$indicator[idx]  <- warn_outlier$outlier
    tab_warning$ratio[idx]      <- warn_outlier$outlier / N 
    tab_warning$warnings[idx]   <- sprintf(
      "%s has %s (%s%%) outliers",
      warn_outlier$variables, format(warn_outlier$outlier, big.mark = ","),
      round(warn_outlier$outlier / N * 100, 2))  
    tab_warning$recommand[idx]  <- "judgement"
  } else {
    idx <- NULL
  }
} else {
  idx <- NULL
}  

tab_warning <- tab_warning %>% 
  filter(status != "")
division <- c("dataset" ,"dataset", "job", "job", "job")
metrics <- c("dataset" ,"dataset type", "samples", "created", "created by")

value <- c("$dataset$", 
           "$datatype$", 
           paste0(format(N_sample, big.mark = ","), " / ", 
                  format(N, big.mark = ","), " (", sample_percent, "%)"),
           "$date$",
           ifelse(author == "", "dlookr", author))

overview <- data.frame(Divisions = division, Metrics = metrics, Values = value)
h1("Overview", id = "ID-h1-overview", class = "title-h1")
h2("Data Structures", id = "ID-h2-data-struct", class = "title-h2")
tab_left <- ov[1:9, ]
tab_right <- ov[10:nrow(ov), ]
rownames(tab_right) <- seq(nrow(tab_right))

tab_left <- tab_left %>% 
  mutate(value = ifelse (metrics %in% "observations", N, value)) %>%
  mutate(value = ifelse (metrics %in% "memory size", 
                         ifelse(value / 1024^2 > 0, round(value / 1024^2),
                                round(value / 1024)), value)) %>% 
  mutate(metrics = ifelse (metrics %in% "memory size", 
                         ifelse(value / 1024^2 > 0, "memory size (MB)",
                                "memory size (KB)"), metrics))

cap <- "Data Structures"
knitr::kable(tab_left, digits = 2, caption = cap, format = "html",
             format.args = list(big.mark = ","),
             col.names = c("Divisions", "Metrics", "Values"),
             table.attr = "style=\"color: hsl(0, 0%, 40%);margin-right:30px !important;\"") %>% 
  kable_styling(full_width = FALSE, font_size = 14, position = "float_left") 

cap <- "Data Types"
knitr::kable(tab_right, digits = 2, caption = cap, format = "html",
             col.names = c("Divisions", "Metrics", "Values"),
             format.args = list(big.mark = ","),
             table.attr = "style=\"color: hsl(0, 0%, 40%);margin-right:30px !important;\"") %>% 
  kable_styling(full_width = FALSE, font_size = 14, position = "float_left") 

cap <- "Job Informations"
knitr::kable(overview, caption = cap, format = "html",
             table.attr = "style=\"color: hsl(0, 0%, 40%);\"") %>% 
  kable_styling(full_width = FALSE, font_size = 14, position = "left") 

breaks <- 9 - nrow(overview) + 3
break_line_asis(breaks)
h2("Warnings", id = "ID-h2-waring", class = "title-h2")
if (NROW(tab_warning) < 1) {
  html_cat("No warnings")
} else {
tab_warning %>% 
  count(recommand) %>% 
  right_join(data.frame(recommand = c("check", "judgement", "remove")),
             by = "recommand") %>% 
  tidyr::spread(recommand, n) %>% 
  mutate_all(function(x) ifelse(is.na(x), 0, x)) %>% 
  reactable(
    defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);",
                           minWidth = 120),
    columns = list(
      check = colDef(
        name = "Checks",
        style = function(value) {
          list(color = "#007000", fontWeight = "bold")
        }
      ),
      judgement = colDef(
        name = "Judgements",
        style = function(value) {
          list(color = "#fdb368", fontWeight = "bold")
        }
      ),      
      remove = colDef(
        name = "Removes",
        style = function(value) {
          list(color = "#e00000", fontWeight = "bold")
        }
      )
    ),  
    fullWidth = FALSE
  )
}
if (NROW(tab_warning) > 0) {
tab_warning %>% 
  select(warnings, status, recommand) %>% 
  reactable(
    defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);"),
    columns = list(
      warnings = colDef(
        name = "Warnings",
        cell = function(value, index) {
          variable_name <- strsplit(value, " ") %>% unlist() %>% "["(1)
          msg <- strsplit(value, " ") %>% unlist() %>% "["(-1) %>% paste(collapse = " ")
          msg <- paste("", msg)

          if (tab_warning$status[index] %in% "duplicate") {
            variable <- a(class = "anchor", href = "#ID-h2-duplicate", variable_name)
          } else if (tab_warning$status[index] %in% "missing") {
              variable <- a(class = "anchor", href = "#ID-h1-missing", variable_name)
          } else if (tab_warning$status[index] %in% "cardinality") {
            if (tab_warning$types[index] %in% c("character", "factor", "ordered", 
                                                "Date", "POSIXct"))
              variable <- a(class = "anchor", href = "#ID-h2-uniq-categorical", variable_name)
            else
              variable <- a(class = "anchor", href = "#ID-h2-uniq-numerical", variable_name)
          } else if (tab_warning$status[index] %in% "outlier") {
              variable <- a(class = "anchor", href = "#ID-h1-outlier", variable_name)            
          } else if (tab_warning$status[index] %in% c("zero", "negative")) {
              variable <- a(class = "anchor", href = "#ID-h2-variables", variable_name)
          } else {
            variable <- div(style = list(color = "red"), variable_name) 
          }

          tagList(
            div(style = list(display = "inline-block"), variable),
            msg
          )
        }
      ),
      status = colDef(name = "Types", width = 100),
      recommand = colDef(
        name = "Recommands", 
        width = 130,
        cell = function(value) {
          class <- paste0("tag recommand-", tolower(value))
          div(class = class, value)
        }
      )  
    )
  )
}

break_line_asis(2)
h2("Variables", id = "ID-h2-variables", class = "title-h2")
cap <- "Diagnostic overview of individual variables"
html_cat(cap)

html_variable(reportData, theme = "$theme$", base_family = base_family)

break_line_asis(2)
h1("Missing Values", id = "ID-h1-missing", class = "title-h1")
h2("List of Missing Values", id = "ID-h2-missing-value", class = "title-h2")
html_missing(diagn_na_unique)

break_line_asis(1)
h2("Visualization", id = "ID-h2-missing-viz", class = "title-h2")
diagn_missing <- diagn_na_unique %>% 
  filter(missing_count > 0)

if (NROW(diagn_missing) > 1) {
  cap <- "Interaction of variables including missing values"
  html_cat(cap)

  break_line_asis(1)
  plot_na_intersect(reportData, base_family = base_family)
} else if (NROW(diagn_missing) == 1) {
  html_cat("upported only when the number of variables including missing values is 2 or more")
  break_line_asis(1)
} else {
  html_cat("No variables including missing values")
  break_line_asis(1)
}

break_line_asis(1)
h1("Unique Values", id = "ID-h1-uniq-value", class = "title-h1")
h2("Categorical Variables", id = "ID-h2-uniq-categorical", class = "title-h2")
html_unique_cat(diagn_na_unique, thres_uniq_cat)  

break_line_asis(1)
h2("Numerical Variables", id = "ID-h2-uniq-numerical", class = "title-h2")
html_unique_num(diagn_na_unique, thres_uniq_num)  

break_line_asis(1)
h1("Outliers", id = "ID-h1-outlier", class = "title-h1")
if (NROW(diagn_numeric) > 0) {
  diagn_outlier <- diagn_numeric %>%
    filter(outlier > 0) 

  if (NROW(diagn_outlier) > 0) {
    cap <- "Diagnosis of numerical variable outliers"
    html_cat(cap)

    html_outlier(reportData, theme = "$theme$", base_family = base_family)

  } else {
    html_cat("No numeric variables including outliers")
    break_line_asis(1)
  }
} else {
  html_cat("No numerical variable")
  break_line_asis(1)
}

break_line_asis(1)
h1("Samples", id = "ID-h1-sample", class = "title-h1")
h2("Duplicated", id = "ID-h2-duplicate", class = "title-h2")
cap <- "Duplicated records"

idx_dup <- attr(ov, "duplicated")

if (length(idx_dup)) {
  tabs <- reportData[idx_dup, ] %>%
  reactable::reactable(
    defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);",
                           minWidth = 150)
  )

  htmlwidgets::prependContent(
    tabs, 
    div(class = "caption", style = "padding-top: 8px; padding-bottom: 8px;
        color: #777777; text-align: left;", cap)
  )
} else {
  html_cat("No duplicated records")
  break_line_asis(1)
}

break_line_asis()
h2("Heads", id = "ID-h2-head", class = "title-h2")
  cap <- "First few records"

tabs <- reportData %>% 
  head(10) %>% 
  reactable::reactable(
    defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);",
                           minWidth = 150)
  )

htmlwidgets::prependContent(tabs, 
    div(class = "caption", style = "padding-top: 8px; padding-bottom: 8px;
        color: #777777; text-align: left;", cap))

break_line_asis()
h2("Tails", id = "ID-h2-tail", class = "title-h2")
  cap <- "Last few records"

tabs <- reportData %>% 
  tail(10) %>% 
  reactable::reactable(
    defaultColDef = colDef(style = "font-size: 14px;color: hsl(0, 0%, 40%);",
                           minWidth = 150)
  )

htmlwidgets::prependContent(tabs, 
    div(class = "caption", style = "padding-top: 8px; padding-bottom: 8px;
        color: #777777; text-align: left;", cap))


Try the dlookr package in your browser

Any scripts or data that you put into this service are public.

dlookr documentation built on July 9, 2023, 6:31 p.m.