knitr::opts_chunk$set(echo = TRUE)
library(dplyr) library(ggplot2) library(gridExtra) library(knitr) library(kableExtra)
dist_numeric <- function(df, target, predictor) { library(dplyr) descr <- df %>% target_by(target) %>% relate(predictor) vars <- unlist(descr[, 2]) descr <- descr %>% select(-c(1:2)) %>% t colnames(descr) <- vars descr <- descr[, rev(seq(NCOL(descr)))] rnames <- c("n", "NA", "mean", "sd", "se(mean)", "IQR", "skewness", "kurtosis") rnames <- c(rnames, paste0(c(0, 1, 5, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99, 100), "%")) row.names(descr) <- rnames as.data.frame(descr) } setUnder <- function(x) { gsub("_", ".", x) } edaData <- as.data.frame(get("edaData", .dlookrEnv)) targetVariable <- get("targetVariable", .dlookrEnv) if (length(targetVariable) == 0) targetVariable <- NULL
idx.numeric <- find_class(edaData, type = "numerical") idx.factor <- find_class(edaData, type = "categorical") nm.numeric <- find_class(edaData, type = "numerical", index = FALSE) nm.factor <- find_class(edaData, type = "categorical", index = FALSE) if (!is.null(targetVariable)) { # remove target variable from variable index idx.numeric <- idx.numeric[nm.numeric != targetVariable] idx.factor <- idx.factor[nm.factor != targetVariable] factor_flag <- class(pull(edaData, targetVariable))[1] %in% c("factor", "ordered") numeric_flag <- class(pull(edaData, targetVariable))[1] %in% c("integer", "numeric") target <- if (!factor_flag & numeric_flag) factor(pull(edaData, targetVariable)) else pull(edaData, targetVariable) } else { # fix error while performing eda_web_report() #83 factor_flag <- FALSE numeric_flag <- FALSE } # if all elements of a numerical variable are NA, # remove from correlation coefficient calculation idx.numeric <- idx.numeric[apply(as.data.frame(edaData[, idx.numeric]), 2, function(x) !all(is.na(x)))] # if all elements of the numerical variable are the same value, # remove from the correlation coefficient calculation idx.numeric <- idx.numeric[apply(as.data.frame(edaData[, idx.numeric]), 2, function(x) diff(range(x, na.rm = TRUE)) > 0)]
The EDA Report provides exploratory data analysis information on objects that inherit data.frame and data.frame.
The dataset that generated the EDA Report is an 'r class(edaData)[1]
' object. It consists of r format(NROW(edaData), big.mark = ",")
observations and r NCOL(edaData)
variables.
The variable information of the data set that generated the EDA Report is shown in the following table.:
cap <- "Information of Variables" vars <- edaData %>% diagnose() %>% mutate(missing_percent = round(missing_percent, 2), unique_rate = round(unique_rate, 3)) knitr::kable(vars, caption = cap, format = "html") %>% kable_styling(full_width = FALSE, font_size = 15, position = "left") %>% cat() target_nm <- if (is.null(targetVariable)) "NULL" else targetVariable target_type <- if (is.null(targetVariable)) "NULL(You did not specify a target variable)" else class(pull(edaData, targetVariable))[1]
The target variable of the data is 'r target_nm
', and the data type of the variable is r target_type
.
EDA reports provide information and visualization results that support the EDA process. In particular, it provides a variety of information to understand the relationship between the target variable and the rest of the variables of interest.
x <- Hmisc:::describe.data.frame(edaData) Hmisc::html(x, file = "")
if (length(idx.numeric) == 0) { cat("\n\nThere is no numeric variable.\n\n") } else { for (i in idx.numeric) { cap <- sprintf("%s", names(edaData)[i]) cap <- paste(unlist(strsplit(cap, "_")), collapse = "\\_") cat("<b><font size='3' face='verdana'>[", names(edaData)[i], "]</font></b>", "\n") cat("\nnormality test : Shapiro-Wilk normality test\n\n") x <- pull(edaData, i) %>% .[!is.na(.)] if (length(x) == 0) { cat("all 'x' values are NA\n\n") next } if (length(x) > 5000) { x <- sample(x, size = 5000, replace = TRUE) } if(length(x) < 3L || length(x) > 5000L) { cat("sample size must be between 3 and 5000\n\n") next } if(diff(range(x, na.rm = TRUE)) == 0) { cat("all 'x' values are identical\n\n") next } y <- shapiro.test(x) cat(sprintf("\t statistic : %s, p-value : %G\n\n", round(y$statistic, 5), y$p.value)) cat("\n<br>\n") if (is.factor(x)) x <- as.numeric(x) if (sum(x < 0, na.rm = TRUE) > 0) { type <- c("original", "log+a transformation", "Box-Cox transformation") skew <- c(skewness(x, na.rm = TRUE), skewness(get_transform(x, "log+a"), na.rm = TRUE), skewness(get_transform(x, "Box-Cox"), na.rm = TRUE)) kurt <- c(kurtosis(x, na.rm = TRUE), kurtosis(get_transform(x, "log+a"), na.rm = TRUE), kurtosis(get_transform(x, "Box-Cox"), na.rm = TRUE)) } else { if (any(x == 0, na.rm = TRUE)) { type <- c("original", "log+1 transformation", "sqrt transformation") skew <- c(skewness(x, na.rm = TRUE), skewness(get_transform(x, "log+1"), na.rm = TRUE), skewness(sqrt(x), na.rm = TRUE)) kurt <- c(kurtosis(x, na.rm = TRUE), kurtosis(get_transform(x, "log+1"), na.rm = TRUE), kurtosis(sqrt(x), na.rm = TRUE)) } else { type <- c("original", "log transformation", "sqrt transformation") skew <- c(skewness(x, na.rm = TRUE), skewness(log(x), na.rm = TRUE), skewness(sqrt(x), na.rm = TRUE)) kurt <- c(kurtosis(x, na.rm = TRUE), kurtosis(log(x), na.rm = TRUE), kurtosis(sqrt(x), na.rm = TRUE)) } } stats <- data.frame(type = type, skewness = skew, kurtosis = kurt) cap <- "skewness and kurtosis" knitr::kable(stats, caption = cap, format = "html") %>% kable_styling(full_width = FALSE, font_size = 15, position = "left") %>% cat() cat("\n<br>\n") x <- data.frame(x) if (sum(x < 0, na.rm = TRUE) > 0) { plot_normality(x, x, left = "log+a", right = "Box-Cox") } else { if (any(x == 0, na.rm = TRUE)) plot_normality(x, x, left = "log+1", right = "sqrt") else plot_normality(x, x) } cat("\n<hr>\n<br>\n") } }
if (length(idx.numeric) > 2) { eps <- 0.5 cors <- edaData %>% correlate %>% filter(abs(coef_corr) >= eps) %>% filter(as.integer(var1) > as.integer(var2)) %>% arrange(desc(abs(coef_corr))) if (NROW(cors) > 0) { names(cors) <- c("Variable1", "Variable2", "Correlation Coefficient") cap = "Table of correlation coefficients (0.5 or more)" knitr::kable(cors, caption = cap, format = "html") %>% kable_styling(full_width = FALSE, font_size = 15, position = "left") } else { cat("No correlation coefficient is greater than 0.5.\n\n") } } else { cat("Number of numerical variables is less than 2.\n\n") }
if (length(idx.numeric) > 2) { edaData %>% correlate() %>% plot() } else { cat("Number of numerical variables is less than 2.\n\n") }
if (!is.null(targetVariable)) { target_by_obj <- target_by(edaData, all_of(targetVariable)) }
if (!is.null(targetVariable)) { if (factor_flag) { if (length(idx.numeric) == 0) { cat("There is no numeric variable.\n\n") } else { for (i in idx.numeric) { cat("<b><font size='3' face='verdana'>[", names(edaData)[i], "]</font></b>", "\n") z <- dist_numeric(edaData, targetVariable, names(edaData)[i]) ncols <- NCOL(z) rcnt <- round(ncols / 3) cap <- names(edaData)[i] cap <- sprintf("Distribution of %s", cap) cat(sprintf("<br><br>\n * %s", cap)) knitr::kable(z, digits = 2, caption = cap, format = "html", format.args = list(big.mark = ",")) %>% kable_styling(full_width = FALSE, font_size = 15, position = "left") %>% cat() cat("<br>") cat("<br>\n * Visualization of Distribution<br>\n") plot_outlier(target_by_obj, names(edaData)[i]) cat("<br><br>\n<hr>") } } } else if (numeric_flag) { if (length(idx.numeric) == 0) { cat("There is no numeric variable.\n\n") } else { targetby <- target_by(edaData, targetVariable) for (i in idx.numeric) { cat("<b><font size='3' face='verdana'>[", names(edaData)[i], "]</font></b>", "\n") num_num <- relate(targetby, names(edaData)[i]) tab <- summary(num_num)$coefficients rsq <- summary(num_num)$r.squared adj <- summary(num_num)$adj.r.squared fstat <- summary(num_num)$fstatistic df <- summary(num_num)$df rse <- summary(num_num)$sigma cat("\nSimple Linear Model Information\n\n") cat(sprintf("\t Residual standard error: %s on %s degrees of freedom\n", round(rse), df[2])) cat(sprintf("\t Multiple R-squared: %s, Adjusted R-squared: %s\n", round(rsq, 5), round(adj, 5))) cat(sprintf("\t F-statistic: %s on %s and %s DF, p-value: %s\n", round(fstat["value"]), fstat["numdf"], fstat["dendf"], tab[2, 4])) cat("\n<br>") cat("Coefficients:\n") cap <- sprintf("Simple Linear Model coefficients : %s", names(edaData)[i]) cap <- paste(unlist(strsplit(cap, "_")), collapse = "\\\\_") knitr::kable(tab, digits = 2, caption = cap, format = "html") %>% kable_styling(full_width = FALSE, font_size = 15, position = "left") %>% cat() cat("\n<br>Visualization:\n") cat("\n<br>\n") plot(num_num) cat("\n<hr>\n<br>\n") } } } } else { cat("There is no target variable.\n\n") }
if (!is.null(targetVariable)) { if (factor_flag) { if (length(idx.factor) == 0) { cat("There is no categorical variable.\n\n") } else { for (i in idx.factor) { if (sum(!is.na(edaData[, i])) == 0) next cat("\n") cat("<b><font size='3' face='verdana'>[", names(edaData)[i], "]</font></b>", "\n") cat("<br><br>\n * Frequency Table") tab <- edaData %>% target_by(targetVariable) %>% relate(names(edaData)[i]) %>% t() freq <- addmargins(tab) idx.na <- which(is.na(rownames(freq))) rownames(freq)[idx.na] <- "NA" ncols <- NCOL(freq) - 1 rcnt <- round(ncols / 3) ratio <- sweep(freq, 2, c(apply(tab, 2, sum), sum(tab)), "/") * 100 idx.na <- which(is.na(rownames(ratio))) rownames(ratio)[idx.na] <- "NA" cap <- sprintf("%s : frequency", names(edaData)[i]) cap <- paste(unlist(strsplit(cap, "_")), collapse = "\\\\_") knitr::kable(freq, digits = 2, caption = cap, format = "html", format.args = list(big.mark = ",")) %>% kable_styling(full_width = FALSE, font_size = 15, position = "left") %>% column_spec(NCOL(freq) + 1, bold = TRUE) %>% row_spec(NROW(freq), bold = TRUE, color = "white", background = "#FFA349") %>% cat() cat("<br>\n") if (ncols > 2) { cat("\n") } cat("<br>\n * Relative Frequency Table (%)") cap <- sprintf("%s : ratio", names(edaData)[i]) cap <- paste(unlist(strsplit(cap, "_")), collapse = "\\_") knitr::kable(ratio, digits = 2, caption = cap, format = "html") %>% kable_styling(full_width = FALSE, font_size = 15, position = "left") %>% column_spec(NCOL(ratio) + 1, bold = TRUE) %>% row_spec(NROW(ratio), bold = TRUE, color = "white", background = "#FFA349") %>% cat() cat("<br>\n") cat("<br>\n * Visualization of Frequency<br>\n") mosaics <- relate(target_by_obj, names(edaData)[i]) print(plot(mosaics)) cat("<br><br>\n<hr>") } } } else if (numeric_flag) { if (length(idx.factor) == 0) { cat("There is no categorical variable.\n\n") } else { targetby <- target_by(edaData, targetVariable) for (i in idx.factor) { cat("<b><font size='3' face='verdana'>[", names(edaData)[i], "]</font></b>", "\n<br>\n") num_cat <- relate(targetby, names(edaData)[i]) tab_aov <- data.frame(anova(num_cat)) cat("<br>\n1. Analysis of Variance\n\n") cap <- sprintf("Analysis of Variance Table : %s", names(edaData)[i]) cap <- paste(unlist(strsplit(cap, "_")), collapse = "\\\\_") knitr::kable(tab_aov, digits = 2, caption = cap, format = "html") %>% kable_styling(full_width = FALSE, font_size = 15, position = "left") %>% cat() cat("\n<br>") tab <- summary(num_cat)$coefficients rsq <- summary(num_cat)$r.squared adj <- summary(num_cat)$adj.r.squared fstat <- summary(num_cat)$fstatistic df <- summary(num_cat)$df rse <- summary(num_cat)$sigma cat("\n2. Simple Linear Model Information\n\n") cat(sprintf("\t Residual standard error: %s on %s degrees of freedom\n", round(rse), df[2])) cat(sprintf("\t Multiple R-squared: %s, Adjusted R-squared: %s\n", round(rsq, 5), round(adj, 5))) cat(sprintf("\t F-statistic: %s on %s and %s DF, p-value: %s\n", round(fstat["value"]), fstat["numdf"], fstat["dendf"], tab[2, 4])) cat("\n<br>") cat("Coefficients:\n") cap <- sprintf("Simple Linear Model coefficients : %s", names(edaData)[i]) cap <- paste(unlist(strsplit(cap, "_")), collapse = "\\\\_") knitr::kable(tab, digits = 2, caption = cap, format = "html") %>% kable_styling(full_width = FALSE, font_size = 15, position = "left") %>% cat() cat("\n<br>3. Visualization:\n") cat("\n<br>\n") print(plot(num_cat)) cat("\n<hr>\n<br>\n") } } } } else { cat("There is no target variable.\n\n") }
if (!is.null(targetVariable)) { if (factor_flag) { if (length(idx.numeric) > 2) { eps <- 0.5 idx <- which(names(edaData) %in% targetVariable) names(edaData)[idx] <- "targetVariable" cors <- edaData %>% group_by(targetVariable) %>% correlate %>% filter(abs(coef_corr) >= eps) %>% filter(as.integer(var1) > as.integer(var2)) %>% arrange(targetVariable, desc(abs(coef_corr))) if (NROW(cors) > 0) { names(cors) <- c(targetVariable, "Variable1", "Variable2", "Correlation Coefficient") cap <- "Table of correlation coefficients (0.5 or more)" knitr::kable(cors, caption = cap, format = "html") %>% kable_styling(full_width = FALSE, font_size = 15, position = "left") } else { cat("No correlation coefficient is greater than 0.5.\n\n") } } else { cat("Number of numerical variables is less than 2.\n\n") } } else if (numeric_flag) { cat("Numerical target variables are not supported.\n\n") } } else { cat("There is no target variable.\n\n") }
if (!is.null(targetVariable)) { if (factor_flag) { if (length(idx.numeric) > 2) { level_name <- levels(edaData$targetVariable) for (nm in level_name) { cat(sprintf("- Grouped Correlation Case of (%s == %s)\n", targetVariable, nm)) edaData %>% filter(targetVariable %in% nm) %>% correlate() %>% plot() } } else { cat("Number of numerical variables is less than 2.\n\n") } } else if (numeric_flag) { cat("Numerical target variables are not supported.\n\n") } } else { cat("There is no target variable.\n\n") }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.