dlookr: EDA

knitr::opts_chunk$set(echo = FALSE, 
                      message = FALSE, 
                      warning = FALSE, 
                      collapse = FALSE,
                      comment = "#>",
                      fig.align = "center")
knitr::opts_chunk$set(fig.width = 12, fig.height = 9)
library(dlookr)
library(dplyr)

```{css, echo=FALSE} :root { --custom-grey60: rgb(102, 102, 102); --custom-grey20: rgb(204, 204, 204); --custom-grey10: rgb(230, 230, 230); --custom-blue: rgb(0, 114, 188); --custom-lightblue: rgb(204, 227, 242); --custom-orange: rgb(255, 127, 42); --custom-lightorange: rgb(255, 204, 170); }

header .title{

color: $title_color$; }

.navbar { background-color: #f0f0f0;
border-bottom : 2px solid $customColor$ !important; }

.nav>li>a { position: relative; display: block; padding: 10px 15px; }

a { color: #337ab7 !important; background-color: transparent !important; text-decoration: none; }

a.disable-links { pointer-events: none; color: var(--custom-grey60) !important; }

/ Warnings / .tag { display: inline-block; padding: 2px 12px; border-radius: 5px; font-weight: 600; font-size: 12px; }

.variables { font-size: 16px; color: hsl(358, 42%, 56%); }

.box-type { margin-right: 4px; padding: 0 4px; border: 1px solid hsl(0, 0%, 75%); border-radius: 2px; }

.variable-info-details { margin-top: 2px; font-size: 14px; font-weight: 400; color: hsl(0, 0%, 40%); overflow: hidden; text-overflow: ellipsis; }

.variable-info-text { margin-left: 12px; font-weight: 600; overflow: hidden; text-overflow: ellipsis; }

.variable-info { display: flex; align-items: center; }

.variables-tbl { margin-top: 16px; }

.value-info-text { margin-top: 2px; font-size: 14px; font-weight: 400; color: hsl(0, 0%, 40%); overflow: hidden; text-overflow: ellipsis; text-align: right; }

/ Defined title with H1, H2, H3 / .title-h1 { font-size: 2.5em; color: $customColor$; }

.title-h2 { font-size: 2.0em; font-weight: 400; color: $customColor$; }

.title-h3 { font-size: 1.75em; font-weight: 400; color: rgb(102, 102, 102); }

```r
reportData <- as.data.frame(get("reportData", .dlookrEnv))
targetVariable <- get("targetVariable", .dlookrEnv)
sample_percent <- get("sample_percent", .dlookrEnv)
author <- get("author", .dlookrEnv)
base_family <- get("base_family", .dlookrEnv)

# Number of observations
N <- NROW(reportData)

# solve the overview
ov <- overview(reportData)

# sampling with sample_percent
if (sample_percent < 100) {
  N_sample <- ceiling(N * sample_percent / 100)
  idx <- sample(seq(N), size = N_sample)

  reportData <- reportData[idx, ]
} else {
  N_sample <- N
}

if (length(targetVariable) == 0) targetVariable <- NULL
idx.numeric <- find_class(reportData, type = "numerical")
nm.numeric <- find_class(reportData, type = "numerical", index = FALSE)

if (!is.null(targetVariable)) {
  # remove target variable from variable index
  idx.numeric <- idx.numeric[nm.numeric != targetVariable]

  factor_flag <- class(pull(reportData, targetVariable))[1] %in% c("factor", "ordered")
  numeric_flag <- class(pull(reportData, targetVariable))[1] %in% c("integer", "numeric")

  target <- if (!factor_flag & numeric_flag) 
    factor(pull(reportData, targetVariable)) else
      pull(reportData, targetVariable)
} else { # fix error while performing eda_web_report() #83
  factor_flag <- FALSE
}

# if all elements of a numerical variable are NA,
# remove from correlation coefficient calculation
idx.numeric <- idx.numeric[apply(as.data.frame(reportData[, idx.numeric]), 2,
                                 function(x) !all(is.na(x)))]

# if all elements of the numerical variable are the same value,
# remove from the correlation coefficient calculation
idx.numeric <- idx.numeric[apply(as.data.frame(reportData[, idx.numeric]), 2,
                                 function(x) diff(range(x, na.rm = TRUE)) > 0)]
division <- c("dataset" ,"dataset" ,"dataset", "job", "job", "job")
metrics <- c("dataset" ,"dataset type", "target", "samples", 
             "created", "created by")

value <- c("$dataset$", 
           "$datatype$",
           ifelse(is.null(targetVariable), "not defied", targetVariable),
           paste0(format(N_sample, big.mark = ","), " / ", 
                  format(N, big.mark = ","), " (", sample_percent, "%)"),
           "$date$",
           ifelse(author == "", "dlookr", author))

overview <- data.frame(Divisions = division, Metrics = metrics, Values = value)
h1("Overview", id = "ID-h1-overview", class = "title-h1")
tab_left <- ov[1:9, ]
tab_right <- ov[10:nrow(ov), ]
rownames(tab_right) <- seq(nrow(tab_right))

tab_left <- tab_left %>% 
  mutate(value = ifelse (metrics %in% "observations", N, value)) %>%
  mutate(value = ifelse (metrics %in% "memory size", 
                         ifelse(value / 1024^2 > 0, round(value / 1024^2),
                                round(value / 1024)), value)) %>% 
  mutate(metrics = ifelse (metrics %in% "memory size", 
                         ifelse(value / 1024^2 > 0, "memory size (MB)",
                                "memory size (KB)"), metrics))

cap <- "Data Structures"
knitr::kable(tab_left, digits = 2, caption = cap, format = "html",
             format.args = list(big.mark = ","),
             col.names = c("Divisions", "Metrics", "Values"),
             table.attr = "style=\"color: hsl(0, 0%, 40%);margin-right:30px !important;\"") %>% 
  kable_styling(full_width = FALSE, font_size = 14, position = "float_left") 

cap <- "Data Types"
knitr::kable(tab_right, digits = 2, caption = cap, format = "html",
             col.names = c("Divisions", "Metrics", "Values"),
             format.args = list(big.mark = ","),
             table.attr = "style=\"color: hsl(0, 0%, 40%);margin-right:30px !important;\"") %>% 
  kable_styling(full_width = FALSE, font_size = 14, position = "float_left") 

cap <- "Job Informations"
knitr::kable(overview, caption = cap, format = "html",
             table.attr = "style=\"color: hsl(0, 0%, 40%);\"") %>% 
  kable_styling(full_width = FALSE, font_size = 14, position = "left") 

breaks <- 9 - nrow(overview) + 2
break_line_asis(breaks)
h1("Univariate Analysis", id = "ID-h1-univariate", class = "title-h1")
h2("Descriptive Statistics", id = "ID-h2-descriptive", class = "title-h2")
in_numerical <- find_class(reportData, type = "numerical") %>%
  length() %>% 
  as.logical()

if (in_numerical) {
  html_descriptive(reportData, base_family = base_family)
} else {
  html_cat("No numerical variable.")   
}

break_line_asis(1)
h2("Normality Test", id = "ID-h2-normality", class = "title-h2")
if (length(idx.numeric) > 0) {
  suppressWarnings({
    reportData %>% 
    html_normality(theme = "$theme$", base_family = base_family)
  })
} else {
  html_cat("No numerical variable.")
}

break_line_asis(1)
h1("Bivariate Analysis", id = "ID-h1-bivariate", class = "title-h1")
h2("Compare Numerical Variables", id = "ID-h2-compare-numerical", class = "title-h2")
html_compare_numerical(reportData, base_family = base_family)

break_line_asis(1)
h2("Compare Categorical Variables", id = "ID-h2-compare-categorical", class = "title-h2")
html_compare_category(reportData, base_family = base_family)

break_line_asis(1)
h1("Multivariate Analysis", id = "ID-h1-multivariate", class = "title-h1")
h2("Correlation Analysis", id = "ID-h2-correlation", class = "title-h2")
h3("Correlation Matrix", id = "ID-h3-correlation-matrix", class = "title-h3")
html_correlation(reportData) 

break_line_asis(1)
h3("Correlation Plot", id = "ID-h3-correlation-plot", class = "title-h3")
if (length(idx.numeric) < 2) {
  html_cat("The number of numerical variables is less than 2.")
} else {
  cap <- "Correlation coefficient matrix plot"
  html_cat(cap)

  reportData %>% 
    correlate() %>% 
    plot(base_family = base_family)  
}

if (!is.null(targetVariable)) {
  break_line_asis(1)
}  
if (!is.null(targetVariable))
  h1("Target based Analysis", id = "ID-h1-target-based", class = "title-h1")
if (!is.null(targetVariable))
  h2("Target Variable and Numeric Variables", id = "ID-h2-group-numerical", class = "title-h2")
if (!is.null(targetVariable)) {
  content <- html_target_numerical(reportData, target = targetVariable, base_family = base_family)
} else {
  content <- HTML("")
}  

if (!is.null(content))
  content

if (!is.null(targetVariable)) {
  break_line_asis(1)
}  
if (!is.null(targetVariable))
  h2("Target Variable and Categorical Variables", id = "ID-h2-group-categorical", class = "title-h2")
if (!is.null(targetVariable)) {
  content <- html_target_categorical(reportData, target = targetVariable, base_family = base_family)
} else {
  content <- HTML("")
}

if (!is.null(content))
  content

if (!is.null(targetVariable)) {
  break_line_asis(1)
}  
if (!is.null(targetVariable) & factor_flag)
  h2("Grouped Correlation", id = "ID-h2-group-correlation", class = "title-h2")
if (!is.null(targetVariable) & factor_flag) {
  cap <- "Correlation by level of target variable"
  html_cat(cap)

  content <- html_target_correlation(reportData, target = targetVariable, base_family = base_family)
} else {
  content <- HTML("")
}

if (!is.null(content))
  content

break_line_asis(1)


Try the dlookr package in your browser

Any scripts or data that you put into this service are public.

dlookr documentation built on July 9, 2023, 6:31 p.m.