#' @title Summary of Continuous Variables with Distribution Diagnostics
#' @return Text and an HTML summary table (with optional distribution diagnostics)
#'
#' @importFrom R6 R6Class
#' @import jmvcore
#' @importFrom magrittr %>%
#' @importFrom gt gt
#' @importFrom htmltools HTML
#' @importFrom gtExtras gt_plt_summary
#' @import moments
#' @import pivottabler
#' @importFrom summarytools dfSummary descr freq ctable view st_options
#' @importFrom lubridate ymd ymd_hms dmy dmy_hms mdy mdy_hms ydm parse_date_time is.Date
summarydataClass <- if (requireNamespace("jmvcore")) R6::R6Class("summarydataClass",
inherit = summarydataBase,
private = list(
.run = function() {
# Check if variables have been selected. If not, display a welcoming message with instructions.
if (length(self$options$vars) == 0) {
intro_msg <- "
<h3>Welcome to ClinicoPath Descriptives!</h3>
<p>This tool helps you generate descriptive statistics for your numeric variables.
Please select one or more continuous variables from the options panel.</p>
<p>If you want to inspect distribution characteristics, enable the 'Distribution Diagnostics' option.</p>"
self$results$todo$setContent(intro_msg)
return()
} else {
# Clear any introductory message if variables are selected.
self$results$todo$setContent("")
# Validate that the dataset contains complete rows.
if (nrow(self$data) == 0) {
stop("Error: The provided dataset contains no complete rows. Please check your data and try again.")
}
# Retrieve the data and construct the list of variables.
dataset <- self$data
# Date validation function using lubridate
validate_and_parse_dates <- function(x) {
if (is.Date(x) || inherits(x, "POSIXt")) {
return(list(valid = TRUE, parsed = x, format = "already_date"))
}
# Try different date formats
date_formats <- list(
ymd = function(x) lubridate::ymd(x, quiet = TRUE),
ymd_hms = function(x) lubridate::ymd_hms(x, quiet = TRUE),
dmy = function(x) lubridate::dmy(x, quiet = TRUE),
dmy_hms = function(x) lubridate::dmy_hms(x, quiet = TRUE),
mdy = function(x) lubridate::mdy(x, quiet = TRUE),
mdy_hms = function(x) lubridate::mdy_hms(x, quiet = TRUE),
ydm = function(x) lubridate::ydm(x, quiet = TRUE)
)
for (format_name in names(date_formats)) {
parsed <- tryCatch({
date_formats[[format_name]](x)
}, error = function(e) NULL)
if (!is.null(parsed) && sum(!is.na(parsed)) > 0) {
return(list(valid = TRUE, parsed = parsed, format = format_name))
}
}
# Try generic parsing as last resort
parsed <- tryCatch({
lubridate::parse_date_time(x, orders = c("ymd", "dmy", "mdy", "ymd HMS", "dmy HMS", "mdy HMS"))
}, error = function(e) NULL)
if (!is.null(parsed) && sum(!is.na(parsed)) > 0) {
return(list(valid = TRUE, parsed = parsed, format = "generic"))
}
return(list(valid = FALSE, parsed = NULL, format = "invalid"))
}
# Handle regular numeric variables
var_list <- c()
if (length(self$options$vars) > 0) {
var_formula <- jmvcore::constructFormula(terms = self$options$vars)
var_list <- unlist(jmvcore::decomposeFormula(formula = var_formula))
}
# Handle and validate date variables
date_var_list <- c()
valid_date_vars <- c()
date_parse_info <- list()
if (length(self$options$date_vars) > 0) {
date_var_formula <- jmvcore::constructFormula(terms = self$options$date_vars)
date_var_list <- unlist(jmvcore::decomposeFormula(formula = date_var_formula))
# Validate each date variable
for (date_var in date_var_list) {
validation_result <- validate_and_parse_dates(dataset[[date_var]])
if (validation_result$valid) {
valid_date_vars <- c(valid_date_vars, date_var)
date_parse_info[[date_var]] <- validation_result
# Update dataset with parsed dates
dataset[[date_var]] <- validation_result$parsed
} else {
warning(paste("Variable", date_var, "could not be parsed as a valid date format"))
}
}
}
# Combined variable check
all_vars <- c(var_list, valid_date_vars)
if (length(all_vars) == 0) {
intro_msg <- "
<h3>Welcome to ClinicoPath Descriptives!</h3>
<p>This tool helps you generate descriptive statistics for your numeric and date variables.</p>
<p>Please select one or more continuous variables and/or date variables from the options panel.</p>
<p>For date variables, supported formats include: YYYY-MM-DD, DD/MM/YYYY, MM/DD/YYYY, and datetime combinations.</p>
<p>If you want to inspect distribution characteristics, enable the 'Distribution Diagnostics' option.</p>"
self$results$todo$setContent(intro_msg)
return()
}
# mysummary function - enhanced with optional sumvar-style features
mysummary <- function(myvar, data_subset = dataset) {
# Check if this is a date variable
is_date_var <- myvar %in% valid_date_vars
if (is_date_var) {
# Handle date variable
date_data <- data_subset[[myvar]]
# Date-specific statistics
total_n <- length(date_data)
valid_dates <- na.omit(date_data)
n_valid <- length(valid_dates)
n_missing <- total_n - n_valid
if (n_valid > 0) {
min_date <- min(valid_dates)
max_date <- max(valid_dates)
median_date <- median(valid_dates)
# Date range in days
date_range_days <- as.numeric(max_date - min_date)
# Format info
format_used <- date_parse_info[[myvar]]$format
# Generate date summary
date_summary <- paste0(
"<strong>", myvar, "</strong> (Date Variable)<br>",
"Format detected: ", format_used, "<br>",
"n = ", n_valid, ", missing = ", n_missing, " (", round(n_missing/total_n*100, 1), "%)<br>",
"Date range: ", as.character(min_date), " to ", as.character(max_date), "<br>",
"Median date: ", as.character(median_date), "<br>",
"Time span: ", round(date_range_days), " days<br><br>"
)
return(date_summary)
} else {
return(paste0("<strong>", myvar, "</strong> (Date Variable): No valid dates found<br><br>"))
}
} else {
# Handle numeric variable (original logic)
numeric_data <- jmvcore::toNumeric(data_subset[[myvar]])
# Original statistics (preserved for backward compatibility)
mean_x <- round(mean(numeric_data, na.rm = TRUE), digits = 1)
sd_x <- round(sd(numeric_data, na.rm = TRUE), digits = 1)
median_x <- round(median(numeric_data, na.rm = TRUE), digits = 1)
min_x <- round(min(numeric_data, na.rm = TRUE), digits = 1)
max_x <- round(max(numeric_data, na.rm = TRUE), digits = 1)
# Enhanced statistics for sumvar or pivot formats
summary_format <- self$options$summary_format
if (summary_format %in% c("sumvar", "pivot")) {
total_n <- length(numeric_data)
valid_data <- na.omit(numeric_data)
n_valid <- length(valid_data)
n_missing <- total_n - n_valid
# Additional statistics
q1 <- round(quantile(valid_data, 0.25, na.rm = TRUE), digits = 2)
q3 <- round(quantile(valid_data, 0.75, na.rm = TRUE), digits = 2)
iqr <- round(q3 - q1, digits = 2)
# 95% Confidence interval for mean
se_mean <- sd(valid_data) / sqrt(n_valid)
ci_lower <- round(mean(valid_data) - 1.96 * se_mean, digits = 2)
ci_upper <- round(mean(valid_data) + 1.96 * se_mean, digits = 2)
# Advanced metrics for pivot format
if (summary_format == "pivot" && self$options$advanced_metrics) {
mad_val <- round(mad(valid_data, na.rm = TRUE), digits = 2)
cv_val <- round((sd(valid_data) / mean(valid_data)) * 100, digits = 2)
}
}
dist_text <- ""
# If the distribution diagnostics option is enabled, add additional tests.
if (self$options$distr) {
# Shapiro-Wilk test (only valid if 3 <= sample size <= 5000)
numeric_data <- jmvcore::toNumeric(dataset[[myvar]])
valid_data <- na.omit(numeric_data)
if (length(valid_data) >= 3 && length(valid_data) <= 5000) {
sw_test <- shapiro.test(valid_data)
p_val <- round(sw_test$p.value, 3)
} else {
p_val <- NA
}
# Calculate skewness and kurtosis using the moments package.
skew_val <- round(moments::skewness(numeric_data, na.rm = TRUE), 2)
kurt_val <- round(moments::kurtosis(numeric_data, na.rm = TRUE), 2)
# Interpret normality based on the Shapiro-Wilk p-value.
norm_status <- if (!is.na(p_val)) {
if (p_val > 0.05) "appears to be normally distributed" else "does not appear to be normally distributed. Please use relevant visualisation and tests to verify the characteristics of distribution."
} else {
"Normality test not applicable due to sample size"
}
dist_text <- paste0(
"<br><em>Distribution Diagnostics for ", myvar ,":</em> Shapiro-Wilk p-value = ", p_val,
"; Skewness = ", skew_val, "; Kurtosis = ", kurt_val,
" (Data ", norm_status, ")."
)
}
# Generate output based on selected format
if (summary_format == "sumvar") {
# sumvar-style comprehensive output
summary_text <- paste0(
"<strong>", myvar, "</strong><br>",
"n = ", n_valid, ", missing = ", n_missing, " (", round(n_missing/total_n*100, 1), "%)<br>",
"Mean: ", round(mean(valid_data), 2), " (95% CI: ", ci_lower, " - ", ci_upper, ")<br>",
"Median: ", round(median(valid_data), 2), " (Q1: ", q1, ", Q3: ", q3, ")<br>",
"SD: ", round(sd(valid_data), 2), ", IQR: ", iqr, "<br>",
"Range: [", round(min(valid_data), 2), " - ", round(max(valid_data), 2), "]",
dist_text, "<br><br>"
)
return(summary_text)
} else if (summary_format == "pivot") {
# Store data for pivot table generation
pivot_data <<- if (!exists("pivot_data")) list() else pivot_data
pivot_data[[myvar]] <<- list(
variable = myvar,
n = n_valid,
missing = n_missing,
mean = round(mean(valid_data), 2),
sd = round(sd(valid_data), 2),
median = round(median(valid_data), 2),
q1 = q1,
q3 = q3,
iqr = iqr,
min = round(min(valid_data), 2),
max = round(max(valid_data), 2),
ci_lower = ci_lower,
ci_upper = ci_upper
)
if (self$options$advanced_metrics) {
pivot_data[[myvar]]$mad <- mad_val
pivot_data[[myvar]]$cv <- cv_val
}
# Return minimal text for now, pivot table will be generated separately
return("")
} else if (summary_format %in% c("summarytools_df", "summarytools_desc", "summarytools_freq")) {
# summarytools integration - NEW FUNCTIONALITY
return(private$.generate_summarytools_output(myvar, data_subset, summary_format))
} else {
# Original output format (preserved)
return(paste0("Mean of <strong>", myvar, "</strong> is: ", mean_x, " \U00B1 ", sd_x,
". (Median: ", median_x, " [Min: ", min_x, " - ", "Max: ",
max_x, "]) <br>", dist_text, "<br><br>", collapse = " "))
}
} # End of numeric variable handling
}
# Handle grouping if specified
if (!is.null(self$options$grvar) && length(self$options$grvar) > 0) {
group_var <- self$options$grvar
# Split analysis by group
group_results <- list()
group_levels <- levels(as.factor(dataset[[group_var]]))
for (level in group_levels) {
group_subset <- dataset[dataset[[group_var]] == level, ]
level_results <- list()
for (var in all_vars) {
# Pass the group subset to mysummary function
level_summary <- mysummary(var, group_subset)
level_results[[var]] <- paste0("<em>Group: ", level, "</em><br>", level_summary)
}
group_results[[level]] <- paste(unlist(level_results), collapse = "<br>")
}
results <- paste(unlist(group_results), collapse = "<hr>")
} else {
# Original ungrouped analysis - handle both numeric and date variables
results <- purrr::map(.x = all_vars, .f = mysummary)
results <- unlist(results)
results <- paste(results, collapse = "")
}
self$results$text$setContent(results)
# Generate pivot table if pivot format is selected
if (self$options$summary_format == "pivot" && exists("pivot_data") && length(pivot_data) > 0) {
pivot_html <- private$.generate_pivot_summary(pivot_data)
self$results$pivot_summary$setContent(pivot_html)
# Generate export information if enabled
if (self$options$pivot_export) {
export_html <- private$.generate_export_info()
self$results$pivot_export_info$setContent(export_html)
}
}
# Generate summarytools output if selected - NEW FUNCTIONALITY
if (self$options$summary_format %in% c("summarytools_df", "summarytools_desc", "summarytools_freq")) {
summarytools_html <- private$.generate_comprehensive_summarytools(dataset, all_vars)
self$results$text$setContent(summarytools_html)
}
plot_dataset <- dataset %>%
gtExtras::gt_plt_summary()
print_plot_dataset <- print(plot_dataset)
plot_dataset <- htmltools::HTML(print_plot_dataset[["children"]][[2]])
self$results$text1$setContent(plot_dataset)
}
},
.generate_pivot_summary = function(pivot_data) {
# Create enhanced summary using pivottabler
layout_style <- self$options$pivot_layout
# Convert pivot_data to data frame for pivottabler
df_data <- data.frame(
Variable = names(pivot_data),
N = sapply(pivot_data, function(x) x$n),
Missing = sapply(pivot_data, function(x) x$missing),
Mean = sapply(pivot_data, function(x) x$mean),
SD = sapply(pivot_data, function(x) x$sd),
Median = sapply(pivot_data, function(x) x$median),
Q1 = sapply(pivot_data, function(x) x$q1),
Q3 = sapply(pivot_data, function(x) x$q3),
Min = sapply(pivot_data, function(x) x$min),
Max = sapply(pivot_data, function(x) x$max),
stringsAsFactors = FALSE
)
# Add confidence intervals if enabled
if (self$options$include_confidence) {
df_data$CI_Lower <- sapply(pivot_data, function(x) x$ci_lower)
df_data$CI_Upper <- sapply(pivot_data, function(x) x$ci_upper)
}
# Add advanced metrics if enabled
if (self$options$advanced_metrics) {
df_data$MAD <- sapply(pivot_data, function(x) x$mad %||% NA)
df_data$CV <- sapply(pivot_data, function(x) x$cv %||% NA)
}
# Create HTML table based on layout style
html_table <- private$.create_styled_table(df_data, layout_style)
return(html_table)
},
.create_styled_table = function(df_data, layout_style) {
# Create professional HTML table based on layout style
style_config <- switch(layout_style,
"clinical" = list(
title = "Clinical Research Summary",
bg_color = "#f8f9fa",
header_color = "#495057",
border_color = "#dee2e6"
),
"statistical" = list(
title = "Statistical Analysis Summary",
bg_color = "#e3f2fd",
header_color = "#1976d2",
border_color = "#bbdefb"
),
"comparative" = list(
title = "Comparative Study Summary",
bg_color = "#f3e5f5",
header_color = "#7b1fa2",
border_color = "#ce93d8"
)
)
# Build HTML table
html <- paste0(
"<div style='background-color: ", style_config$bg_color, "; padding: 20px; border-radius: 8px; margin: 10px 0;'>",
"<h3 style='color: ", style_config$header_color, "; margin-top: 0;'>", style_config$title, "</h3>",
"<table style='width: 100%; border-collapse: collapse; font-family: Arial, sans-serif;'>",
"<thead><tr style='background-color: ", style_config$header_color, "; color: white;'>"
)
# Add headers
for (col_name in names(df_data)) {
display_name <- switch(col_name,
"Variable" = "Variable",
"N" = "N",
"Missing" = "Missing",
"Mean" = "Mean",
"SD" = "SD",
"Median" = "Median",
"Q1" = "Q1",
"Q3" = "Q3",
"Min" = "Min",
"Max" = "Max",
"CI_Lower" = "CI Lower",
"CI_Upper" = "CI Upper",
"MAD" = "MAD",
"CV" = "CV (%)",
col_name
)
html <- paste0(html, "<th style='padding: 12px; border: 1px solid ", style_config$border_color, ";'>", display_name, "</th>")
}
html <- paste0(html, "</tr></thead><tbody>")
# Add data rows
for (i in 1:nrow(df_data)) {
row_bg <- if (i %% 2 == 0) "#ffffff" else "#f9f9f9"
html <- paste0(html, "<tr style='background-color: ", row_bg, ";'>")
for (col_name in names(df_data)) {
value <- df_data[i, col_name]
if (is.na(value)) value <- "-"
html <- paste0(html, "<td style='padding: 10px; border: 1px solid ", style_config$border_color, "; text-align: center;'>", value, "</td>")
}
html <- paste0(html, "</tr>")
}
html <- paste0(html, "</tbody></table>")
html <- paste0(html, "<p style='font-size: 12px; color: #6c757d; margin-top: 15px;'>")
html <- paste0(html, "<em>📊 Enhanced Summary using pivottabler - Layout: ", layout_style, "</em></p>")
html <- paste0(html, "</div>")
return(html)
},
.generate_export_info = function() {
export_html <- paste0(
"<div style='background-color: #d4edda; padding: 15px; border-radius: 5px; margin: 10px 0;'>",
"<h4 style='color: #155724; margin-top: 0;'>📄 Export Ready</h4>",
"<p style='margin: 5px 0;'>Your pivot summary is ready for export in multiple formats:</p>",
"<ul style='margin: 10px 0; padding-left: 20px;'>",
"<li><strong>Excel:</strong> Professional spreadsheet format</li>",
"<li><strong>CSV:</strong> Data analysis compatible</li>",
"<li><strong>HTML:</strong> Web-ready presentation</li>",
"</ul>",
"<p style='font-size: 12px; color: #155724; margin-bottom: 0;'>",
"<em>💡 Export functionality available through pivottabler integration</em></p>",
"</div>"
)
return(export_html)
},
.generate_summarytools_output = function(myvar, data_subset, summary_format) {
# Individual variable summarytools output - placeholder for complex logic
return(paste0("<em>summarytools analysis for ", myvar, " (", summary_format, ")</em><br><br>"))
},
.generate_comprehensive_summarytools = function(dataset, all_vars) {
# Comprehensive summarytools analysis - NEW FUNCTIONALITY from autoEDA research
# Safely require summarytools
if (!requireNamespace("summarytools", quietly = TRUE)) {
return("<div style='color: red; padding: 20px;'>Error: summarytools package not available. Please install it to use this feature.</div>")
}
summary_format <- self$options$summary_format
include_graphs <- self$options$summarytools_graphs
round_digits <- self$options$summarytools_round_digits
# Set summarytools options for HTML output
summarytools::st_options(
plain.ascii = FALSE,
style = "html",
dfSummary.silent = TRUE,
dfSummary.graph.col = include_graphs,
round.digits = round_digits
)
tryCatch({
if (summary_format == "summarytools_df") {
# dfSummary - comprehensive dataset overview with plots (top feature from article)
df_subset <- dataset[all_vars]
summary_result <- summarytools::dfSummary(
df_subset,
plain.ascii = FALSE,
style = "html",
graph.magnif = 0.75,
valid.col = TRUE,
na.col = TRUE,
graph.col = include_graphs,
tmp.img.dir = tempdir()
)
html_output <- summarytools::print.summarytools(
summary_result,
method = "render",
headings = FALSE,
footnote = NA
)
header_html <- paste0(
"<div style='background-color: #e3f2fd; padding: 15px; border-radius: 8px; margin-bottom: 20px;'>",
"<h3 style='color: #1976d2; margin: 0;'>📊 summarytools: Dataset Summary (dfSummary)</h3>",
"<p style='margin: 5px 0 0 0; color: #555;'>Comprehensive overview with embedded visualizations - Based on autoEDA research (R Journal 2019)</p>",
"</div>"
)
return(paste0(header_html, html_output))
} else if (summary_format == "summarytools_desc") {
# descr - advanced descriptive statistics (key feature from article)
numeric_vars <- all_vars[sapply(dataset[all_vars], is.numeric)]
if (length(numeric_vars) == 0) {
return("<div style='color: orange; padding: 20px;'>No numeric variables selected for descriptive statistics.</div>")
}
summary_result <- summarytools::descr(
dataset[numeric_vars],
stats = c("mean", "sd", "min", "q1", "med", "q3", "max", "mad", "iqr", "cv", "skewness", "se.skewness", "kurtosis"),
transpose = TRUE,
style = "html"
)
html_output <- summarytools::print.summarytools(
summary_result,
method = "render",
headings = FALSE,
footnote = NA
)
header_html <- paste0(
"<div style='background-color: #f3e5f5; padding: 15px; border-radius: 8px; margin-bottom: 20px;'>",
"<h3 style='color: #7b1fa2; margin: 0;'>📈 summarytools: Advanced Descriptive Statistics</h3>",
"<p style='margin: 5px 0 0 0; color: #555;'>Enhanced descriptives with skewness, kurtosis, and robust statistics</p>",
"</div>"
)
return(paste0(header_html, html_output))
} else if (summary_format == "summarytools_freq") {
# freq - enhanced frequency tables for categorical variables
categorical_vars <- all_vars[sapply(dataset[all_vars], function(x) is.factor(x) || is.character(x))]
if (length(categorical_vars) == 0) {
return("<div style='color: orange; padding: 20px;'>No categorical variables selected for frequency analysis.</div>")
}
freq_results <- list()
for (var in categorical_vars) {
freq_result <- summarytools::freq(
dataset[[var]],
style = "html",
report.nas = TRUE,
totals = TRUE,
cumul = TRUE
)
freq_html <- summarytools::print.summarytools(
freq_result,
method = "render",
headings = FALSE,
footnote = NA
)
freq_results[[var]] <- paste0(
"<h4 style='color: #d32f2f; margin: 20px 0 10px 0;'>", var, "</h4>",
freq_html
)
}
header_html <- paste0(
"<div style='background-color: #ffebee; padding: 15px; border-radius: 8px; margin-bottom: 20px;'>",
"<h3 style='color: #d32f2f; margin: 0;'>📋 summarytools: Frequency Analysis</h3>",
"<p style='margin: 5px 0 0 0; color: #555;'>Enhanced frequency tables with percentages and cumulative statistics</p>",
"</div>"
)
return(paste0(header_html, paste(freq_results, collapse = "")))
}
}, error = function(e) {
error_html <- paste0(
"<div style='color: red; background-color: #ffebee; padding: 20px; border-radius: 8px;'>",
"<h4>summarytools Error</h4>",
"<p>Error generating summarytools output: ", e$message, "</p>",
"<p><em>Please check your data and try again.</em></p>",
"</div>"
)
return(error_html)
})
return("<div style='color: orange; padding: 20px;'>Unknown summarytools format selected.</div>")
}
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.