# inst/doc/EDA.R

## ----environment, echo = FALSE, message = FALSE, warning=FALSE----------------
# Chunk defaults applied to every knitr chunk that follows.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "",
  out.width = "600px",
  dpi = 70
)
# Set both tibble print limits to four rows.
options(
  tibble.print_min = 4L,
  tibble.print_max = 4L
)

library(dlookr)
library(dplyr)
library(ggplot2)

## ----import_data, warning=FALSE-----------------------------------------------
# Show the structure of the Carseats example data.
str(Carseats)

## ----missing------------------------------------------------------------------
# Work on a copy named `carseats`; NA values are injected into the copy only.
carseats <- Carseats

# Select the random-number defaults of R 3.5.0 so the sampled row positions
# are reproducible. The setting is global and stays in effect for both draws
# below, so it is applied once. The warning it raises is deliberately muted.
suppressWarnings(RNGversion("3.5.0"))

# Set 20 randomly chosen 'Income' values to NA.
# seq_len() is used rather than seq(n) because seq(0) yields c(1, 0).
set.seed(123)
carseats[sample(seq_len(NROW(carseats)), 20), "Income"] <- NA

# Set 10 randomly chosen 'Urban' values to NA.
set.seed(456)
carseats[sample(seq_len(NROW(carseats)), 10), "Urban"] <- NA

## ----describe-----------------------------------------------------------------
# Describe every applicable column of the data frame.
carseats %>%
  describe()

## ----describes2---------------------------------------------------------------
# Pick columns by name
carseats %>%
  describe(Sales, CompPrice, Income)
# Pick every column from Sales through Income (inclusive)
carseats %>%
  describe(Sales:Income)
# Pick every column except those from Sales through Income
carseats %>%
  describe(-(Sales:Income))

## ----describe_pipe------------------------------------------------------------
# Drop rows without a skewness value, keep a handful of statistics, and list
# the variables from largest to smallest absolute skewness.
describe(carseats) %>%
  filter(!is.na(skewness)) %>%
  select(described_variables, skewness, mean, p25, p50, p75) %>%
  arrange(desc(abs(skewness)))

## ----describe_pipe2-----------------------------------------------------------
# Describe Sales and Income within each level of US.
group_by(carseats, US) %>%
  describe(Sales, Income)

## ----describe_pipe3-----------------------------------------------------------
# Describe Sales and Income within each combination of US and Urban.
group_by(carseats, US, Urban) %>%
  describe(Sales, Income)

## ----normality----------------------------------------------------------------
# Run the normality test over the whole data frame.
carseats %>%
  normality()

## ----normality2---------------------------------------------------------------
# Pick columns by name
carseats %>%
  normality(Sales, CompPrice, Income)

# Pick every column from Sales through Income (inclusive)
carseats %>%
  normality(Sales:Income)

# Pick every column except those from Sales through Income
carseats %>%
  normality(-(Sales:Income))

## ----normality_pipe-----------------------------------------------------------
library(dplyr)

# Keep the variables whose p-value is at most 0.01, smallest p-value first.
normality(carseats) %>%
  filter(p_value <= 0.01) %>%
  arrange(abs(p_value))

## ----normality_pipe2----------------------------------------------------------
# Test Income within each ShelveLoc/US group, largest p-value first.
group_by(carseats, ShelveLoc, US) %>%
  normality(Income) %>%
  arrange(desc(p_value))

## ----normality_pipe3----------------------------------------------------------
# Test log(Income) within each ShelveLoc/US group and keep the groups whose
# p-value exceeds 0.01.
mutate(carseats, log_income = log(Income)) %>%
  group_by(ShelveLoc, US) %>%
  normality(log_income) %>%
  filter(p_value > 0.01)

## ----plot_normality, fig.align='center', fig.width = 6, fig.height = 4--------
# Draw the normality plots for the Sales and CompPrice columns
carseats %>%
  plot_normality(Sales, CompPrice)

## ----plot_normality2, fig.align='center', fig.width = 6, fig.height = 4, eval=FALSE----
#  carseats %>%
#    filter(ShelveLoc == "Good") %>%
#    group_by(US) %>%
#    plot_normality(Income)

## ----correlate----------------------------------------------------------------
# Correlation coefficients over the whole data frame.
carseats %>%
  correlate()

## ----correlate2---------------------------------------------------------------
# Pick columns by name
carseats %>%
  correlate(Sales, CompPrice, Income)

# Pick every column from Sales through Income (inclusive)
carseats %>%
  correlate(Sales:Income)

# Pick every column except those from Sales through Income
carseats %>%
  correlate(-(Sales:Income))

## ----correlate3---------------------------------------------------------------
# Keep only the rows where var1's integer code exceeds var2's, which removes
# redundant combinations of variables.
correlate(carseats, Sales:Income) %>%
  filter(as.integer(var1) > as.integer(var2))

## ----correlate4---------------------------------------------------------------
# Among rows with ShelveLoc "Good", correlate Sales within each Urban/US group
# and keep the coefficients whose absolute value exceeds 0.5.
tab_corr <- filter(carseats, ShelveLoc == "Good") %>%
  group_by(Urban, US) %>%
  correlate(Sales) %>%
  filter(abs(coef_corr) > 0.5)

tab_corr

## ----plot_correlate, fig.align='center', fig.width = 6, fig.height = 4--------
# Plot the correlations computed over the whole data frame.
correlate(carseats) %>%
  plot()

## ----plot_correlate2, fig.align='center', fig.width = 6, fig.height = 4, eval=TRUE----
# Pick columns by name
carseats %>%
  correlate(Sales, Price) %>%
  plot()

## ----plot_correlate3, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE, eval=TRUE----
# Among rows with ShelveLoc "Good", plot the correlations per level of Urban.
filter(carseats, ShelveLoc == "Good") %>%
  group_by(Urban) %>%
  correlate() %>%
  plot()

## ----target_by----------------------------------------------------------------
# Declare US as the target variable.
categ <- carseats %>%
  target_by(US)

## ----target_by2---------------------------------------------------------------
# Relate the target to a numerical variable of interest (Sales)
cat_num <- categ %>%
  relate(Sales)
cat_num
summary(cat_num)

## ----target_by3, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE----
plot(cat_num)

## ----target_by4---------------------------------------------------------------
# Relate the target to a categorical variable of interest (ShelveLoc)
cat_cat <- categ %>%
  relate(ShelveLoc)
cat_cat
summary(cat_cat)

## ----target_by5, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE----
plot(cat_cat)

## ----target_by6---------------------------------------------------------------
# Declare Sales as the target variable; here the target is numerical.
num <- carseats %>%
  target_by(Sales)

## ----target_by7---------------------------------------------------------------
# Relate the target to a numerical variable of interest (Price)
num_num <- num %>%
  relate(Price)
num_num
summary(num_num)

## ----target_by8, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE----
plot(num_num)

## ----target_by8_2, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE----
# Same plot with the hex_thres argument set to 350.
plot(num_num, hex_thres = 350)

## ----target_by9---------------------------------------------------------------
# Relate the target to a categorical variable of interest (ShelveLoc)
num_cat <- num %>%
  relate(ShelveLoc)
num_cat
summary(num_cat)

## ----target_by10, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE----
plot(num_cat)

## ----eda_web_report, eval=FALSE-----------------------------------------------
#  heartfailure %>%
#    eda_web_report(target = "death_event", subtitle = "heartfailure",
#                   output_dir = "./", output_file = "EDA.html", theme = "blue")

## ----eda_web_title, echo=FALSE, out.width='80%', fig.align='center', fig.pos="!h", fig.cap="The part of the report"----
# Embed the image file for this figure chunk.
knitr::include_graphics(path = "img/eda_web_title.jpg")

## ----eda_paged_report, eval=FALSE---------------------------------------------
#  heartfailure %>%
#    eda_paged_report(target = "death_event", subtitle = "heartfailure",
#                     output_dir = "./", output_file = "EDA.pdf", theme = "blue")

## ----eda_paged_cover, echo=FALSE, out.width='80%', fig.align='center', fig.pos="!h", fig.cap="The part of the report"----
# Embed the image file for this figure chunk.
knitr::include_graphics(path = "img/eda_paged_cover.jpg")

## ----eda_paged_cntent, echo=FALSE, out.width='80%', fig.align='center', fig.pos="!h", fig.cap="The dynamic contents of the report"----
# Embed the image file for this figure chunk.
knitr::include_graphics(path = "img/eda_paged_content.jpg")

## ----dbi_table, warning=FALSE, message=FALSE, eval=FALSE----------------------
#  library(dplyr)
#  
#  carseats <- Carseats
#  carseats[sample(seq(NROW(carseats)), 20), "Income"] <- NA
#  carseats[sample(seq(NROW(carseats)), 5), "Urban"] <- NA
#  
#  # connect DBMS
#  con_sqlite <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
#  
#  # copy carseats to the DBMS with a table named TB_CARSEATS
#  copy_to(con_sqlite, carseats, name = "TB_CARSEATS", overwrite = TRUE)

## ----dbi_describe, eval=FALSE-------------------------------------------------
#  # Positive values select variables
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#    describe(Sales, CompPrice, Income)
#  
#  # Negative values drop variables; in-memory mode with a collect size of 200
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#    describe(-Sales, -CompPrice, -Income, collect_size = 200)
#  
#  # Find the statistic of all numerical variables by 'ShelveLoc' and 'US',
#  # and extract only those with the 'ShelveLoc' variable level as "Good".
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#    group_by(ShelveLoc, US) %>%
#    describe() %>%
#    filter(ShelveLoc == "Good")
#  
#  # extract only those with 'Urban' variable level is "Yes",
#  # and find 'Sales' statistics by 'ShelveLoc' and 'US'
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#    filter(Urban == "Yes") %>%
#    group_by(ShelveLoc, US) %>%
#    describe(Sales)

## ----dbi_normality, eval=FALSE------------------------------------------------
#  # Test all numerical variables by 'ShelveLoc' and 'US',
#  # and extract only those with the 'ShelveLoc' variable level is "Good".
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#   group_by(ShelveLoc, US) %>%
#   normality() %>%
#   filter(ShelveLoc == "Good")
#  
#  # extract only those with 'Urban' variable level is "Yes",
#  # and test 'Sales' by 'ShelveLoc' and 'US'
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#   filter(Urban == "Yes") %>%
#   group_by(ShelveLoc, US) %>%
#   normality(Sales)
#  
#  # Test log(Income) variables by 'ShelveLoc' and 'US',
#  # and extract only p.value greater than 0.01.
#  
#  # SQLite extension functions for log transformation
#  RSQLite::initExtension(con_sqlite)
#  
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#   mutate(log_income = log(Income)) %>%
#   group_by(ShelveLoc, US) %>%
#   normality(log_income) %>%
#   filter(p_value > 0.01)

## ----plot_normality_dbi, fig.align='center', fig.width = 6, fig.height = 4, eval=FALSE, eval=FALSE----
#  # Extract only those with the 'ShelveLoc' variable level is "Good",
#  # and plot 'Income' by 'US'
#  # The result is the same as the data.frame, but not displayed here. Reference above in document.
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#    filter(ShelveLoc == "Good") %>%
#    group_by(US) %>%
#    plot_normality(Income)

## ----dbi_correlation, eval=FALSE----------------------------------------------
#  # Correlation coefficient
#  # that eliminates redundant combination of variables
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#    correlate() %>%
#    filter(as.integer(var1) > as.integer(var2))
#  
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#    correlate(Sales, Price) %>%
#    filter(as.integer(var1) > as.integer(var2))
#  
#  # Compute the correlation coefficient of the 'Sales' variable by 'ShelveLoc'
#  # and 'US' variables, and extract only those whose absolute
#  # correlation coefficient is at least 0.5
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#    group_by(ShelveLoc, US) %>%
#    correlate(Sales) %>%
#    filter(abs(coef_corr) >= 0.5)
#  
#  # Extract only those with the 'ShelveLoc' variable level is "Good",
#  # and compute the correlation coefficient of the 'Sales' variable
#  # by 'Urban' and 'US' variables.
#  # Keep only negative coefficients whose absolute value is greater than 0.5
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#    filter(ShelveLoc == "Good") %>%
#    group_by(Urban, US) %>%
#    correlate(Sales) %>%
#    filter(coef_corr < 0) %>%
#    filter(abs(coef_corr) > 0.5)

## ----plot_correlation_dbi, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE, eval=FALSE----
#  # Extract only those with 'ShelveLoc' variable level is "Good",
#  # and visualize correlation plot of 'Sales' variable by 'Urban'
#  # and 'US' variables.
#  # The result is the same as the data.frame, but not displayed here. Reference above in document.
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#    filter(ShelveLoc == "Good") %>%
#    group_by(Urban) %>%
#    correlate() %>%
#    plot(Sales)

## ----dbi_ctarget_by, eval=FALSE-----------------------------------------------
#  # If the target variable is a categorical variable
#  categ <- target_by(con_sqlite %>% tbl("TB_CARSEATS") , US)
#  
#  # If the variable of interest is a numerical variable
#  cat_num <- relate(categ, Sales)
#  cat_num
#  summary(cat_num)

## ----plot_target_by_dbi, fig.align='center', fig.align='center', fig.width = 6, fig.height = 4, eval=FALSE----
#  # The result is the same as the data.frame, but not displayed here. Reference above in document.
#  plot(cat_num)

## ----dbi_eda_report, eval=FALSE-----------------------------------------------
#  # create a web report file.
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#    eda_web_report()
#  
#  # create a pdf file. the file name is EDA.pdf, and the collect size is 350
#  con_sqlite %>%
#    tbl("TB_CARSEATS") %>%
#    eda_paged_report(collect_size = 350, output_file = "EDA.pdf")
# choonghyunryu/dlookr documentation built on June 11, 2024, 9:12 a.m.