#' @title GoodeR R6 Class for Exploratory Data Analysis
#' @description
#' An R6 class designed to facilitate common exploratory data analysis (EDA) tasks
#' on data.frame or data.table objects. It provides methods for metadata
#' generation, numeric, categorical, and date summaries, correlation analysis,
#' various plots, and data quality checks.
#'
#' @field data A data.table containing the dataset to be analyzed.
#' @field name A character string name for the dataset/GoodeR object.
#'
#' @import R6
#' @import data.table
#' @import ggplot2
#' @import moments
#' @import lubridate
#' @import logger
#' @export GoodeR
GoodeR <- R6::R6Class("GoodeR",
public = list(
data = NULL,
name = "GoodeRObject",
#' @description
#' Initialize a new GoodeR object.
#' @param data A data.frame or data.table.
#' @param name A character string to name the dataset (default: "Dataset").
#' @return A new `GoodeR` object.
#' @examples
#' if (requireNamespace("stringi", quietly = TRUE)) {
#' # Generate sample data (simplified for example)
#' dt <- data.table::data.table(
#' ID = 1:10,
#' Cat = sample(c("A","B"), 10, replace=TRUE),
#' Num = rnorm(10)
#' )
#' GoodeR_obj <- GoodeR$new(dt, name = "My Sample")
#' print(GoodeR_obj)
#' }
initialize = function(data, name = "Dataset") {
logger::log_info("Initializing GoodeR object...")
if (missing(data) || is.null(data)) {
logger::log_error("Input 'data' is missing.")
stop("Input 'data' is missing.")
}
if (!is.data.frame(data)) {
logger::log_error("Input 'data' must be a data.frame or data.table.")
stop("Input 'data' must be a data.frame or data.table.")
}
if (nrow(data) == 0) {
logger::log_warn("Input 'data' has 0 rows.")
}
self$data <- data.table::as.data.table(data.table::copy(data))
self$name <- name
logger::log_info(paste0("GoodeR object '", self$name, "' initialized with ", nrow(self$data), " rows and ", ncol(self$data), " columns."))
invisible(self)
},
#' @description
#' Get metadata for the dataset.
#' Provides information like data type, unique values, and missing values for each column.
#' @param by_var Optional. A character string specifying a column name to group metadata by.
#' The grouping variable should be categorical or easily coercible to factor.
#' @return A data.table with metadata. If `by_var` is used, the first column(s) will be the grouping variable(s).
#' @examples
#' if (requireNamespace("stringi", quietly = TRUE)) {
#' dt <- data.table::data.table(
#' Group = sample(c("G1", "G2"), 20, replace = TRUE),
#' Val = rnorm(20),
#' Cat = sample(letters[1:3], 20, replace = TRUE)
#' )
#' GoodeR_obj <- GoodeR$new(dt)
#' meta_overall <- GoodeR_obj$get_metadata()
#' # print(meta_overall)
#' meta_by_group <- GoodeR_obj$get_metadata(by_var = "Group")
#' # print(meta_by_group)
#' }
get_metadata = function(by_var = NULL) {
logger::log_info(paste0("Generating metadata for '", self$name, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "..."))
if (is.null(self$data)) {
logger::log_error("Data not loaded.")
stop("Data not loaded.")
}
.get_subset_metadata <- function(dt_subset) {
meta_sub <- data.table::data.table(
Column_Name = names(dt_subset),
Data_Type = sapply(dt_subset, function(x) paste(class(x), collapse = ", ")),
N_Rows_In_Scope = nrow(dt_subset),
Unique_Values = sapply(dt_subset, data.table::uniqueN),
Missing_Values = sapply(dt_subset, function(x) sum(is.na(x)))
)
meta_sub[, Missing_Percentage := data.table::fifelse(N_Rows_In_Scope > 0, round(Missing_Values / N_Rows_In_Scope * 100, 2), 0)]
return(meta_sub)
}
if (!is.null(by_var)) {
private$.check_data_and_vars(by_var)
private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
temp_data <- data.table::copy(self$data)
if (!is.factor(temp_data[[by_var]])) temp_data[, (by_var) := as.factor(get(by_var))]
results_list <- temp_data[, .get_subset_metadata(.SD), by = by_var]
if (by_var %in% names(results_list) && "Column_Name" %in% names(results_list)) data.table::setcolorder(results_list, c(by_var, "Column_Name"))
logger::log_info("Metadata by group generation complete.")
return(results_list)
} else {
meta_dt <- .get_subset_metadata(self$data)
data.table::setnames(meta_dt, "N_Rows_In_Scope", "N_Total_Rows")
logger::log_info("Overall metadata generation complete.")
return(meta_dt)
}
},
#' @description
#' Summarize numeric variables.
#' Calculates common descriptive statistics for numeric columns.
#' @param by_var Optional. A character string specifying a column name to group summaries by.
#' The grouping variable should be categorical or easily coercible to factor.
#' @return A data.table with numeric summaries.
#' @examples
#' if (requireNamespace("stringi", quietly = TRUE)) {
#' dt <- data.table::data.table(
#' Group = sample(c("G1", "G2"), 20, replace = TRUE),
#' Val1 = rnorm(20),
#' Val2 = rpois(20, 5)
#' )
#' GoodeR_obj <- GoodeR$new(dt)
#' num_sum_overall <- GoodeR_obj$summarize_numeric()
#' # print(num_sum_overall)
#' num_sum_by_group <- GoodeR_obj$summarize_numeric(by_var = "Group")
#' # print(num_sum_by_group)
#' }
summarize_numeric = function(by_var = NULL) {
logger::log_info(paste0("Summarizing numeric vars for '", self$name, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "..."))
if (is.null(self$data)) {
logger::log_error("Data not loaded.")
stop("Data not loaded.")
}
numeric_cols <- names(self$data)[sapply(self$data, is.numeric)]
if (length(numeric_cols) == 0) {
logger::log_warn("No numeric columns found.")
return(data.table::data.table())
}
data_for_summary <- self$data
if (!is.null(by_var)) {
private$.check_data_and_vars(by_var)
private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
if (!is.factor(data_for_summary[[by_var]])) {
if (identical(data_for_summary, self$data)) data_for_summary <- data.table::copy(self$data)
logger::log_info(paste0("Coercing by_var '", by_var, "' to factor for summarize_numeric."))
data_for_summary[, (by_var) := as.factor(get(by_var))]
}
}
summary_list <- list()
for (col_name in numeric_cols) {
sym_col_name <- as.name(col_name)
# Need to ensure moments::skewness and moments::kurtosis are used
expr <- bquote(list(
N = .N, N_Non_Missing = sum(!is.na(.(sym_col_name))), Mean = mean(as.double(.(sym_col_name)), na.rm = TRUE),
SD = stats::sd(as.double(.(sym_col_name)), na.rm = TRUE), Min = as.double(min(as.double(.(sym_col_name)), na.rm = TRUE)),
P25 = stats::quantile(as.double(.(sym_col_name)), 0.25, na.rm = TRUE, names = FALSE, type = 7),
Median = as.double(stats::median(as.double(.(sym_col_name)), na.rm = TRUE)),
P75 = stats::quantile(as.double(.(sym_col_name)), 0.75, na.rm = TRUE, names = FALSE, type = 7),
Max = as.double(max(as.double(.(sym_col_name)), na.rm = TRUE)),
Skewness = {
x_vec <- .(sym_col_name)
x_dbl_nona <- as.double(x_vec[!is.na(x_vec)])
if (length(x_dbl_nona) > 2 && data.table::uniqueN(x_dbl_nona) > 1) moments::skewness(x_dbl_nona) else NA_real_
},
Kurtosis = {
x_vec <- .(sym_col_name)
x_dbl_nona <- as.double(x_vec[!is.na(x_vec)])
if (length(x_dbl_nona) > 3 && data.table::uniqueN(x_dbl_nona) > 1) moments::kurtosis(x_dbl_nona) else NA_real_
}
))
current_summary <- if (!is.null(by_var)) data_for_summary[, eval(expr), by = by_var] else data_for_summary[, eval(expr)]
if (!data.table::is.data.table(current_summary)) {
logger::log_fatal(paste0("CRITICAL: summary for '", col_name, "' not DT."))
current_summary <- data.table::as.data.table(current_summary)
}
current_summary[, Variable := col_name]
if (!is.null(by_var)) data.table::setcolorder(current_summary, c("Variable", by_var)) else data.table::setcolorder(current_summary, "Variable")
summary_list[[col_name]] <- current_summary
}
result_dt <- data.table::rbindlist(summary_list, fill = TRUE)
cols_fix <- c("Min", "Max", "Mean", "SD", "Median", "P25", "P75", "Skewness", "Kurtosis")
for (col_f in intersect(cols_fix, names(result_dt))) {
if (col_f %in% names(result_dt)) {
data.table::set(result_dt, i = which(is.infinite(result_dt[[col_f]]) | is.nan(result_dt[[col_f]])), j = col_f, value = NA_real_)
}
}
logger::log_info("Numeric summarization complete.")
return(result_dt)
},
#' @description
#' Summarize categorical variables.
#' Provides frequency counts and percentages for levels of categorical columns.
#' @param top_n Integer. Number of top levels to show for each variable (default: 5).
#' @param include_plot Logical. If TRUE, generates and prints bar plots for top levels (default: FALSE).
#' @param by_var Optional. A character string specifying a column name to group summaries by.
#' @return If `by_var` is NULL, a list where each element is a list containing `summary_stats`
#' and `top_levels_frequency` for a categorical variable.
#' If `by_var` is specified, a single data.table with summarized results across all categorical variables,
#' grouped by `by_var`.
summarize_categorical = function(top_n = 5, include_plot = FALSE, by_var = NULL) {
logger::log_info(paste0("Summarizing categorical variables for '", self$name, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "..."))
if (is.null(self$data)) {
logger::log_error("Data not loaded.")
stop("Data not loaded.")
}
categorical_cols <- names(self$data)[sapply(self$data, function(x) is.factor(x) || is.character(x))]
if (length(categorical_cols) == 0) {
logger::log_warn("No categorical columns found.")
return(if (is.null(by_var)) list() else data.table::data.table())
}
data_to_use <- self$data
original_by_var_class_if_any <- NULL
if (!is.null(by_var)) {
private$.check_data_and_vars(by_var)
private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
original_by_var_class_if_any <- class(data_to_use[[by_var]])
if (!is.factor(data_to_use[[by_var]])) {
if (identical(data_to_use, self$data)) data_to_use <- data.table::copy(self$data)
logger::log_info(paste0("Coercing by_var '", by_var, "' to factor for summarize_categorical."))
data_to_use[, (by_var) := as.factor(get(by_var))]
}
}
all_results_accumulator <- list()
for (col_name in categorical_cols) {
if (!is.null(by_var) && col_name == by_var && !(any(c("factor", "character") %in% original_by_var_class_if_any))) {
logger::log_warn(paste0("Skipping '", col_name, "' as it is the 'by_var' (", paste(original_by_var_class_if_any, collapse = ","), ") and was not originally categorical."))
next
}
logger::log_debug(paste0("Summarizing cat var: ", col_name, if (!is.null(by_var)) paste0(" by ", by_var)))
grouping_vars_for_by_clause <- col_name
if (!is.null(by_var)) {
grouping_vars_for_by_clause <- if (col_name == by_var) by_var else c(by_var, col_name)
}
cols_for_na_filter <- unique(grouping_vars_for_by_clause)
# Ensure .SDcols is used correctly
data_subset_no_na <- data_to_use[stats::complete.cases(data_to_use[, .SD, .SDcols = cols_for_na_filter])]
current_counts_dt <- NULL
if (nrow(data_subset_no_na) > 0) {
current_counts_dt <- data_subset_no_na[, .N, by = grouping_vars_for_by_clause]
if (col_name %in% names(current_counts_dt)) {
if (is.null(by_var) || col_name != by_var || (col_name == by_var && length(grouping_vars_for_by_clause) == 1)) {
data.table::setnames(current_counts_dt, old = col_name, new = "Level")
}
}
if (!is.null(by_var) && col_name == by_var && "Level" %in% names(current_counts_dt) && !(by_var %in% names(current_counts_dt))) {
current_counts_dt[, (by_var) := get("Level")]
}
if (!is.null(by_var) && by_var %in% names(current_counts_dt)) {
data.table::setorderv(current_counts_dt, c(by_var, "N"), order = c(1, -1))
current_counts_dt <- current_counts_dt[, utils::head(.SD, top_n), by = by_var]
} else {
data.table::setorderv(current_counts_dt, "N", -1)
current_counts_dt <- utils::head(current_counts_dt, top_n)
}
} else {
current_counts_dt <- data.table::data.table(Level = character(0), N = integer(0))
if (!is.null(by_var) && !(by_var %in% names(current_counts_dt))) current_counts_dt[, (by_var) := factor()]
if (is.null(current_counts_dt[["Level"]]) && (!is.null(by_var) && col_name == by_var)) current_counts_dt[, Level := factor()] # Use [["Level"]] for safety
}
if (nrow(current_counts_dt) > 0) {
if (!is.null(by_var) && by_var %in% names(current_counts_dt)) {
total_col_name_for_perc <- ".Total_N_Group"
totals_in_by_group <- data_subset_no_na[, .N, by = by_var]
data.table::setnames(totals_in_by_group, "N", total_col_name_for_perc)
current_counts_dt <- merge(current_counts_dt, totals_in_by_group, by = by_var, all.x = TRUE)
current_counts_dt[!is.na(get(total_col_name_for_perc)) & get(total_col_name_for_perc) > 0,
Percentage := round(N / get(total_col_name_for_perc) * 100, 2)]
current_counts_dt[is.na(get(total_col_name_for_perc)) | get(total_col_name_for_perc) == 0, Percentage := NA_real_]
current_counts_dt[, (total_col_name_for_perc) := NULL]
} else {
total_for_overall_perc <- nrow(data_subset_no_na)
if (total_for_overall_perc > 0) current_counts_dt[, Percentage := round(N / total_for_overall_perc * 100, 2)]
else current_counts_dt[, Percentage := NA_real_]
}
} else {
current_counts_dt[, Percentage := NA_real_]
}
if (!is.null(by_var)) {
current_counts_dt[, Summarized_Variable := col_name]
all_results_accumulator[[col_name]] <- current_counts_dt
} else {
n_unique_overall <- data.table::uniqueN(self$data[[col_name]], na.rm = TRUE)
n_missing_overall <- sum(is.na(self$data[[col_name]]))
summary_stats_dt <- data.table::data.table(Var = col_name, UniqLvls = n_unique_overall, Missing = n_missing_overall, TopNShown = nrow(current_counts_dt))
all_results_accumulator[[col_name]] <- list(summary_stats = summary_stats_dt, top_levels_frequency = current_counts_dt)
}
if (include_plot && nrow(current_counts_dt) > 0 && any(current_counts_dt$N > 0)) {
plt_data <- data.table::copy(current_counts_dt)
if (!"Level" %in% names(plt_data)) {
logger::log_warn(paste0("Plot: 'Level' col NF for ", col_name, ". Skip plot."))
} else {
plt_data[, Level := factor(Level, levels = unique(Level))] # Use unique(Level) from plt_data itself for ordering
p_cat <- ggplot2::ggplot(plt_data, ggplot2::aes(x = .data$Level, y = .data$N, fill = .data$Level)) +
ggplot2::geom_bar(stat = "identity", show.legend = FALSE) +
ggplot2::geom_text(ggplot2::aes(label = .data$N), vjust = -0.3, size = 2.5) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
title_base <- paste0("Top levels for ", col_name)
if (!is.null(by_var) && by_var %in% names(plt_data)) {
p_cat <- p_cat + ggplot2::facet_wrap(stats::as.formula(paste("~", by_var)))
title_base <- paste(title_base, "by", by_var)
}
p_cat <- p_cat + ggplot2::labs(title = title_base, x = col_name, y = "Freq")
tryCatch(print(p_cat), error = function(e) logger::log_warn(paste("Plot err cat", col_name, ":", e$message)))
}
}
}
logger::log_info("Cat summary done.")
if (!is.null(by_var)) {
final_dt <- data.table::rbindlist(all_results_accumulator, use.names = TRUE, fill = TRUE)
if (nrow(final_dt) > 0) {
expected_cols <- c("Summarized_Variable", by_var, "Level", "N", "Percentage")
present_cols <- intersect(expected_cols, names(final_dt))
if (length(present_cols) > 0) data.table::setcolorder(final_dt, present_cols)
}
return(final_dt)
} else {
return(all_results_accumulator)
}
},
#' @description
#' Summarize date and datetime variables.
#' Provides min/max dates, range, and counts for date/datetime columns.
#' @param include_plot Logical. If TRUE, generates and prints histograms for date distributions (default: FALSE).
#' @param by_var Optional. A character string specifying a column name to group summaries by.
#' @return A data.table with date summaries.
summarize_dates = function(include_plot = FALSE, by_var = NULL) {
logger::log_info(paste0("Summarizing date vars for '", self$name, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "..."))
if (is.null(self$data)) {
logger::log_error("Data not loaded.")
stop("Data not loaded.")
}
date_cols <- names(self$data)[sapply(self$data, function(x) inherits(x, "Date") || inherits(x, "POSIXct"))]
if (length(date_cols) == 0) {
logger::log_warn("No date/datetime columns found.")
return(data.table::data.table())
}
data_to_use <- self$data
if (!is.null(by_var)) {
private$.check_data_and_vars(by_var)
private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
if (!is.factor(data_to_use[[by_var]])) {
if (identical(data_to_use, self$data)) data_to_use <- data.table::copy(self$data)
logger::log_info(paste0("Coercing by_var '", by_var, "' to factor for summarize_dates."))
data_to_use[, (by_var) := as.factor(get(by_var))]
}
}
all_date_summaries <- list()
for (col_name in date_cols) {
logger::log_debug(paste0("Summarizing date: ", col_name, if (!is.null(by_var)) paste0(" by ", by_var)))
sym_col <- as.name(col_name)
na_val_constructor <- if (inherits(self$data[[col_name]], "Date")) {
quote(as.Date(NA_character_))
} else {
quote(lubridate::as_datetime(NA_real_)) # Use as_datetime for POSIXct NA
}
expr <- bquote(list(
N_Total = .N, N_Non_Missing = sum(!is.na(.(sym_col))),
Min_Date_Raw = if (sum(!is.na(.(sym_col))) > 0) min(.(sym_col), na.rm = TRUE) else .(na_val_constructor),
Max_Date_Raw = if (sum(!is.na(.(sym_col))) > 0) max(.(sym_col), na.rm = TRUE) else .(na_val_constructor),
Unique_Count = data.table::uniqueN(.(sym_col), na.rm = TRUE)
))
current_summary <- if (!is.null(by_var)) data_to_use[, eval(expr), by = by_var] else data_to_use[, eval(expr)]
original_tz <- lubridate::tz(self$data[[col_name]][1]) # Get tz from original data
if (is.null(original_tz) || original_tz == "") original_tz <- "UTC"
# Ensure Min_Date_Raw and Max_Date_Raw are POSIXct before converting for consistency
# This is tricky because min/max on Date objects return Date, on POSIXct return POSIXct
# If original is Date, convert to POSIXct for Min_Date/Max_Date columns
if (inherits(self$data[[col_name]], "Date")) {
current_summary[, Min_Date := lubridate::as_datetime(Min_Date_Raw, tz = original_tz)]
current_summary[, Max_Date := lubridate::as_datetime(Max_Date_Raw, tz = original_tz)]
} else { # POSIXct
current_summary[, Min_Date := lubridate::as_datetime(as.numeric(Min_Date_Raw), origin="1970-01-01", tz=original_tz)]
current_summary[, Max_Date := lubridate::as_datetime(as.numeric(Max_Date_Raw), origin="1970-01-01", tz=original_tz)]
}
current_summary[, Range_Seconds := data.table::fifelse(N_Non_Missing > 1 & !is.na(Min_Date) & !is.na(Max_Date),
as.numeric(lubridate::difftime(Max_Date, Min_Date, units = "secs")), NA_real_)]
current_summary[, c("Min_Date_Raw", "Max_Date_Raw") := NULL]
current_summary[, Variable := col_name]
if (!is.null(by_var)) data.table::setcolorder(current_summary, c("Variable", by_var)) else data.table::setcolorder(current_summary, "Variable")
all_date_summaries[[col_name]] <- current_summary
if (include_plot && sum(!is.na(self$data[[col_name]])) > 0) {
plot_dt <- data.table::copy(data_to_use[!is.na(get(col_name))])
# Ensure column for plotting is POSIXct
if (inherits(plot_dt[[col_name]], "Date")) plot_dt[, (col_name) := lubridate::as_datetime(get(col_name), tz = original_tz)]
p <- ggplot2::ggplot(plot_dt, ggplot2::aes(x = .data[[col_name]])) +
ggplot2::geom_histogram(bins = 30, fill = "steelblue", color = "black") +
ggplot2::theme_minimal()
plot_title <- paste("Distribution of", col_name)
if (!is.null(by_var)) {
p <- p + ggplot2::facet_wrap(stats::as.formula(paste("~", by_var)))
plot_title <- paste(plot_title, "by", by_var)
}
p <- p + ggplot2::labs(title = plot_title, x = col_name, y = "Frequency")
tryCatch(print(p), error = function(e) logger::log_warn(paste("Plot err date", col_name, ":", e$message)))
}
}
result <- data.table::rbindlist(all_date_summaries, use.names = TRUE, fill = TRUE)
logger::log_info("Date summary complete.")
return(result)
},
#' @description
#' Calculate and optionally plot correlation matrices for numeric variables.
#' @param method Correlation method (e.g., "pearson", "kendall", "spearman"). Passed to `stats::cor`.
#' @param use Handling of missing data. Passed to `stats::cor`. (default: "pairwise.complete.obs").
#' @param plot_type Type of plot: "ggplot", "corrplot", or NULL for no plot (default: "ggplot").
#' "corrplot" requires the 'corrplot' package.
#' @param by_var Optional. A character string specifying a column name to group correlation analysis by.
#' @return If `by_var` is NULL, a single correlation matrix.
#' If `by_var` is specified, a list of correlation matrices, one for each group.
#' Plots are printed to the active graphics device if `plot_type` is not NULL.
get_correlations = function(method = "pearson", use = "pairwise.complete.obs", plot_type = "ggplot", by_var = NULL) {
logger::log_info(paste0("Correlations for '", self$name, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "..."))
if (is.null(self$data)) {
logger::log_error("Data not loaded.")
stop("Data not loaded.")
}
data_to_use <- self$data
if (!is.null(by_var)) {
private$.check_data_and_vars(by_var)
private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
if (!is.factor(data_to_use[[by_var]])) {
if (identical(data_to_use, self$data)) data_to_use <- data.table::copy(self$data)
data_to_use[, (by_var) := as.factor(get(by_var))]
}
}
calc_cor_mat <- function(dt, title_suffix = "") {
num_cols <- names(dt)[sapply(dt, is.numeric)]
if (length(num_cols) < 1) {
logger::log_warn(paste0("No numeric columns for correlation", title_suffix))
return(matrix(nrow = 0, ncol = 0))
}
num_dt <- dt[, .SD, .SDcols = num_cols, drop = FALSE]
sds <- sapply(num_dt, function(x) {
xn <- x[!is.na(x)]
if (length(xn) < 2) 0 else {
s_val <- stats::sd(xn)
if (is.na(s_val)) 0 else s_val
}
})
cols_v <- names(sds[!is.na(sds) & sds > sqrt(.Machine$double.eps)])
if (length(cols_v) < 2) {
logger::log_warn(paste0("<2 numeric columns with variance for correlation", title_suffix))
return(matrix(nrow = 0, ncol = 0))
}
num_dt_c <- num_dt[, .SD, .SDcols = cols_v, drop = FALSE]
cor_m <- tryCatch(stats::cor(num_dt_c, method = method, use = use),
error = function(e) {
logger::log_error(paste("Correlation calculation error", title_suffix, ":", e$message))
matrix(nrow = 0, ncol = 0)
}
)
if (!is.null(plot_type) && is.matrix(cor_m) && nrow(cor_m) > 0) {
title_f <- paste0("Correlation Matrix", title_suffix)
if (plot_type == "ggplot") {
# Use .data pronoun for ggplot2 aes
mcor <- data.table::melt(data.table::as.data.table(cor_m, keep.rownames = "V1"), id.vars = "V1", variable.name = "V2", value.name = "Corr")
p <- ggplot2::ggplot(mcor, ggplot2::aes(x = .data$V1, y = .data$V2, fill = .data$Corr)) +
ggplot2::geom_tile(color = "white") +
ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1), na.value = "grey80") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1), axis.title = ggplot2::element_blank()) +
ggplot2::coord_fixed() +
ggplot2::labs(title = title_f)
tryCatch(print(p), error = function(e) logger::log_warn(paste("Plot cor ggplot err", title_suffix, ":", e$message)))
} else if (plot_type == "corrplot") {
if (requireNamespace("corrplot", quietly = TRUE)) {
tryCatch(corrplot::corrplot(cor_m,
method = "color", type = "upper", order = "hclust", tl.col = "black", tl.srt = 45,
addCoef.col = "black", number.cex = 0.7, na.label = "NA",
title = paste0("\n\n", title_f), mar = c(0, 0, 2, 0)
), error = function(e) logger::log_warn(paste("Corrplot err", title_suffix, ":", e$message)))
} else {
logger::log_warn("Package 'corrplot' not installed. Skipping corrplot.")
}
}
}
return(cor_m)
}
if (!is.null(by_var)) {
grps <- unique(data_to_use[[by_var]])
list_mats <- lapply(grps, function(g) {
logger::log_info(paste0("Correlation for group: ", by_var, "=", g))
sub_dt <- data_to_use[get(by_var) == g]
sub_dt_no_by <- sub_dt[, !c(by_var), with = FALSE] # Exclude by_var column
calc_cor_mat(sub_dt_no_by, paste0(" (", by_var, "=", g, ")"))
})
names(list_mats) <- make.names(paste0(by_var, "_", grps)) # make.names to ensure valid names
logger::log_info("Correlation by group done.")
return(list_mats)
} else {
overall_m <- calc_cor_mat(data_to_use)
logger::log_info("Overall correlation done.")
return(overall_m)
}
},
#' @description
#' Plot a histogram for a numeric variable.
#' @param variable Character string. Name of the numeric variable to plot.
#' @param by_var Optional. Character string. Name of a categorical variable for faceting.
#' @param bins Integer. Number of bins for the histogram (default: 30).
#' @return A ggplot object. The plot is also printed.
plot_histogram = function(variable, by_var = NULL, bins = 30) {
logger::log_info(paste0("Histogram for '", variable, if (!is.null(by_var)) paste0("' by '", by_var, "' (facet)") else "", "..."))
vars_chk <- c(variable, by_var)
private$.check_data_and_vars(vars_chk)
private$.check_var_type(variable, "numeric")
if (!is.null(by_var)) private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
plt_dt <- data.table::copy(self$data)
if (all(is.na(plt_dt[[variable]]))) {
logger::log_warn(paste0("Variable '", variable, "' is all NA. Cannot plot histogram."))
p_e <- ggplot2::ggplot() + ggplot2::labs(title = paste("Histogram of", variable, "(All NA)"), x = variable, y = "Frequency") + ggplot2::theme_minimal()
tryCatch(print(p_e), error = function(e) logger::log_warn(e$message))
return(invisible(p_e))
}
aes_map <- ggplot2::aes(x = .data[[variable]])
title_p <- paste("Histogram of", variable)
p <- ggplot2::ggplot(plt_dt, aes_map) +
ggplot2::geom_histogram(bins = bins, fill = "steelblue", color = "black", alpha = 0.7, na.rm = TRUE) +
ggplot2::theme_minimal()
if (!is.null(by_var)) {
plt_dt[, (by_var) := as.factor(get(by_var))] # Ensure by_var is factor for faceting
p <- p + ggplot2::facet_wrap(stats::as.formula(paste("~", by_var)))
title_p <- paste(title_p, "(Faceted by", by_var, ")")
}
p <- p + ggplot2::labs(title = title_p, x = variable, y = "Frequency")
logger::log_info("Histogram generation complete.")
tryCatch(print(p), error = function(e) logger::log_warn(paste("Error plotting histogram for '", variable, "': ", e$message)))
return(invisible(p))
},
#' @description
#' Plot a boxplot of a numeric variable grouped by a categorical variable.
#' @param numeric_var Character string. Name of the numeric variable.
#' @param categorical_var Character string. Name of the categorical variable for grouping.
#' @param by_var Optional. Character string. Name of another categorical variable for faceting.
#' @return A ggplot object. The plot is also printed.
plot_boxplot = function(numeric_var, categorical_var, by_var = NULL) {
logger::log_info(paste0("Boxplot for '", numeric_var, "' by '", categorical_var, if (!is.null(by_var)) paste0("', faceted by '", by_var, "'") else "", "..."))
vars_chk <- c(numeric_var, categorical_var, by_var)
private$.check_data_and_vars(vars_chk)
private$.check_var_type(numeric_var, "numeric")
private$.check_var_type(categorical_var, c("character", "factor", "integer", "logical"))
if (!is.null(by_var)) private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
plt_dt <- data.table::copy(self$data)
plt_dt[, (categorical_var) := as.factor(get(categorical_var))] # Ensure categorical_var is factor
if (all(is.na(plt_dt[[numeric_var]]))) {
logger::log_warn(paste0("Numeric variable '", numeric_var, "' is all NA. Cannot plot boxplot."))
p_e <- ggplot2::ggplot() + ggplot2::labs(title = paste("Boxplot of", numeric_var, "by", categorical_var, "(Numeric All NA)"), x = categorical_var, y = numeric_var) + ggplot2::theme_minimal()
tryCatch(print(p_e), error = function(e) logger::log_warn(e$message))
return(invisible(p_e))
}
title_p <- paste("Boxplot of", numeric_var, "by", categorical_var)
aes_map <- ggplot2::aes(x = .data[[categorical_var]], y = .data[[numeric_var]], fill = .data[[categorical_var]])
p <- ggplot2::ggplot(plt_dt, aes_map) +
ggplot2::geom_boxplot(show.legend = FALSE, na.rm = TRUE) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
if (!is.null(by_var)) {
plt_dt[, (by_var) := as.factor(get(by_var))] # Ensure by_var is factor for faceting
p <- p + ggplot2::facet_wrap(stats::as.formula(paste("~", by_var)))
title_p <- paste(title_p, "(Faceted by", by_var, ")")
}
p <- p + ggplot2::labs(title = title_p, x = categorical_var, y = numeric_var) +
ggplot2::scale_fill_viridis_d(na.value = "grey80") # From ggplot2, handles discrete scales
logger::log_info("Boxplot generation complete.")
tryCatch(print(p), error = function(e) logger::log_warn(paste("Error plotting boxplot:", e$message)))
return(invisible(p))
},
#' @description
#' Plot a scatterplot between two numeric variables.
#' @param x_var Character string. Name of the numeric variable for the x-axis.
#' @param y_var Character string. Name of the numeric variable for the y-axis.
#' @param color_var Optional. Character string. Name of a variable for coloring points.
#' @param size_var Optional. Character string. Name of a numeric variable for sizing points.
#' @param add_smooth Logical. If TRUE, adds a loess smoothed line (default: FALSE).
#' @param by_var Optional. Character string. Name of a categorical variable for faceting.
#' @return A ggplot object. The plot is also printed.
plot_scatterplot = function(x_var, y_var, color_var = NULL, size_var = NULL, add_smooth = FALSE, by_var = NULL) {
logger::log_info(paste0("Scatterplot for '", y_var, "' vs '", x_var, if (!is.null(by_var)) paste0("', faceted by '", by_var, "'") else "", "..."))
vars_chk <- c(x_var, y_var, color_var, size_var, by_var)
private$.check_data_and_vars(vars_chk)
private$.check_var_type(x_var, "numeric")
private$.check_var_type(y_var, "numeric")
if (!is.null(size_var)) private$.check_var_type(size_var, "numeric")
if (!is.null(color_var)) private$.check_var_type(color_var, c("numeric", "character", "factor", "integer", "logical"))
if (!is.null(by_var)) private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
plt_dt <- data.table::copy(self$data)
if (all(is.na(plt_dt[[x_var]])) || all(is.na(plt_dt[[y_var]]))) {
logger::log_warn(paste0("Either '", x_var, "' or '", y_var, "' is all NA. Cannot plot scatterplot."))
p_e <- ggplot2::ggplot() + ggplot2::labs(title = paste("Scatterplot of", y_var, "vs", x_var, "(All NA)"), x = x_var, y = y_var) + ggplot2::theme_minimal()
tryCatch(print(p_e), error = function(e) logger::log_warn(e$message))
return(invisible(p_e))
}
aes_map <- ggplot2::aes(x = .data[[x_var]], y = .data[[y_var]])
title_p <- paste("Scatterplot of", y_var, "vs", x_var)
if (!is.null(color_var)) {
# Convert low-cardinality numerics, characters, logicals to factor for color mapping
if (is.character(plt_dt[[color_var]]) || is.logical(plt_dt[[color_var]])) {
plt_dt[, (color_var) := as.factor(get(color_var))]
} else if ((is.numeric(plt_dt[[color_var]]) || is.integer(plt_dt[[color_var]])) &&
data.table::uniqueN(plt_dt[[color_var]], na.rm = TRUE) < 10 &&
data.table::uniqueN(plt_dt[[color_var]], na.rm = TRUE) > 1) {
plt_dt[, (color_var) := as.factor(get(color_var))]
}
aes_map$colour <- as.name(color_var) # Use as.name for aes mapping
}
if (!is.null(size_var)) {
aes_map$size <- as.name(size_var)
}
p <- ggplot2::ggplot(plt_dt, aes_map) +
ggplot2::geom_point(alpha = 0.6, na.rm = TRUE) +
ggplot2::theme_minimal()
if (!is.null(color_var)) {
if (is.numeric(plt_dt[[color_var]]) && !is.factor(plt_dt[[color_var]])) { # Continuous color
p <- p + ggplot2::scale_color_viridis_c(name = color_var, na.value = "grey80")
} else { # Discrete color (factor)
p <- p + ggplot2::scale_color_viridis_d(name = color_var, na.value = "grey80")
}
}
if (add_smooth) {
p <- p + ggplot2::geom_smooth(se = FALSE, method = "loess", formula = 'y~x', na.rm = TRUE)
}
if (!is.null(by_var)) {
plt_dt[, (by_var) := as.factor(get(by_var))] # Ensure by_var is factor for faceting
p <- p + ggplot2::facet_wrap(stats::as.formula(paste("~", by_var)))
title_p <- paste(title_p, "(Faceted by", by_var, ")")
}
p <- p + ggplot2::labs(title = title_p, x = x_var, y = y_var)
logger::log_info("Scatterplot generation complete.")
tryCatch(print(p), error = function(e) logger::log_warn(paste("Error plotting scatterplot:", e$message)))
return(invisible(p))
},
#' @description
#' Plot a barplot of counts for a categorical variable.
#' @param categorical_var Character string. Name of the primary categorical variable.
#' @param fill_var Optional. Character string. Name of a categorical variable for fill color (stacked/dodged bars).
#' @param order_by_freq Logical. If TRUE, orders bars by frequency of `categorical_var` (default: TRUE).
#' @param by_var Optional. Character string. Name of another categorical variable for faceting.
#' @return A ggplot object. The plot is also printed.
plot_barplot_counts = function(categorical_var, fill_var = NULL, order_by_freq = TRUE, by_var = NULL) {
logger::log_info(paste0("Barplot counts for '", categorical_var,
if (!is.null(fill_var)) paste0("' by '", fill_var, "' (fill)") else "",
if (!is.null(by_var)) paste0(", faceted by '", by_var, "'") else "", "..."))
vars_chk <- c(categorical_var, fill_var, by_var)
private$.check_data_and_vars(vars_chk)
private$.check_var_type(categorical_var, c("character", "factor", "integer", "logical"))
if (!is.null(fill_var)) private$.check_var_type(fill_var, c("character", "factor", "integer", "logical"))
if (!is.null(by_var)) private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
plt_dt_orig <- data.table::copy(self$data)
# Filter NAs for essential plotting variables
plt_dt <- plt_dt_orig[!is.na(get(categorical_var))]
if (!is.null(fill_var)) plt_dt <- plt_dt[!is.na(get(fill_var))]
if (!is.null(by_var)) plt_dt <- plt_dt[!is.na(get(by_var))]
if (nrow(plt_dt) == 0) {
logger::log_warn(paste0("No non-NA data for barplot of '", categorical_var, "'. Cannot plot."))
p_e <- ggplot2::ggplot() + ggplot2::labs(title = paste("Barplot of", categorical_var, "(No Data)")) + ggplot2::theme_void()
tryCatch(print(p_e), error = function(e) logger::log_warn(e$message))
return(invisible(p_e))
}
plt_dt[, (categorical_var) := as.factor(get(categorical_var))]
if (order_by_freq) {
# Order factor levels by frequency
ord_lvls <- plt_dt[, .N, by = eval(as.name(categorical_var))][order(-N), get(categorical_var)]
plt_dt[, (categorical_var) := factor(get(categorical_var), levels = ord_lvls)]
}
if (!is.null(fill_var)) plt_dt[, (fill_var) := as.factor(get(fill_var))]
if (!is.null(by_var)) plt_dt[, (by_var) := as.factor(get(by_var))]
aes_map <- ggplot2::aes(x = .data[[categorical_var]])
title_p <- paste("Barplot Counts for", categorical_var)
pos_arg <- "identity" # Default for single var or when fill is the same as x
if (!is.null(fill_var)) {
aes_map$fill <- as.name(fill_var)
pos_arg <- ggplot2::position_dodge(preserve = "single") # Use dodge for fill_var
title_p <- paste(title_p, "by", fill_var)
} else {
aes_map$fill <- as.name(categorical_var) # Fill by x-axis var if no fill_var
}
p <- ggplot2::ggplot(plt_dt, aes_map) +
ggplot2::geom_bar(stat = "count", position = pos_arg) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
if (is.null(fill_var) || fill_var == categorical_var) {
p <- p + ggplot2::guides(fill = "none") # No legend if fill is redundant
} else {
p <- p + ggplot2::scale_fill_viridis_d(name = fill_var, na.value="grey80")
}
if (!is.null(by_var)) {
p <- p + ggplot2::facet_wrap(stats::as.formula(paste("~", by_var)))
title_p <- paste(title_p, "(Faceted by", by_var, ")")
}
p <- p + ggplot2::labs(title = title_p, x = categorical_var, y = "Count")
logger::log_info("Barplot generation complete.")
tryCatch(print(p), error = function(e) logger::log_warn(paste("Error plotting barplot:", e$message)))
return(invisible(p))
},
#' @description
#' Find categorical variables with high cardinality.
#' Identifies categorical columns that have many unique values relative to non-missing rows or an absolute high number of unique values.
#' @param threshold_ratio Numeric. Ratio of unique values to non-missing rows to be considered high cardinality (default: 0.5).
#' @param threshold_unique Integer. Absolute number of unique values to be considered high cardinality (default: 100).
#' @param by_var Optional. Character string. Name of a categorical variable to group the analysis by.
#' @return A data.table listing high cardinality categorical variables and their metrics.
find_high_cardinality_categoricals = function(threshold_ratio = 0.5, threshold_unique = 100, by_var = NULL) {
logger::log_info(paste0("Finding high cardinality categorical variables", if (!is.null(by_var)) paste0(" by '", by_var, "'") else "", "..."))
if (is.null(self$data)) {
logger::log_error("Data not loaded.")
stop("Data not loaded.")
}
.find_high_card_subset <- function(dt_subset, group_val_char = NULL) {
meta_s <- data.table::data.table(
Col = names(dt_subset),
Type = sapply(dt_subset, function(x) paste(class(x), collapse = ",")),
Uniq = sapply(dt_subset, data.table::uniqueN),
Miss = sapply(dt_subset, function(x) sum(is.na(x)))
)
# Use %chin% for efficient character matching
cat_meta_s <- meta_s[Type %chin% c("factor", "character", "character, factor", "factor, character")]
if (nrow(cat_meta_s) == 0) return(data.table::data.table())
N_r_s <- nrow(dt_subset)
hcl_s <- list()
for (i in 1:nrow(cat_meta_s)) {
cn_s <- cat_meta_s$Col[i]
# Check if column is actually categorical in the subset
if (!(is.factor(dt_subset[[cn_s]]) || is.character(dt_subset[[cn_s]]))) next
uv_s <- cat_meta_s$Uniq[i]
nnar_s <- N_r_s - cat_meta_s$Miss[i]
rval_s <- if (nnar_s > 0) uv_s / nnar_s else 0
ex_r_s <- if (nnar_s > 0) (rval_s > threshold_ratio) else FALSE
ex_u_s <- uv_s > threshold_unique
if (ex_r_s || ex_u_s) {
hcl_s[[cn_s]] <- data.table::data.table(Var = cn_s, Uniq = uv_s, NonNA_Sub = nnar_s, Ratio_Sub = round(rval_s, 3), ExceedsRatio = ex_r_s, ExceedsUnique = ex_u_s)
}
}
res_s <- data.table::rbindlist(hcl_s, fill = TRUE)
if (!is.null(group_val_char) && nrow(res_s) > 0) res_s[, Group_Level_Temp_Col := group_val_char]
return(res_s)
}
if (!is.null(by_var)) {
private$.check_data_and_vars(by_var)
private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
temp_data_hcc <- data.table::copy(self$data)
if (!is.factor(temp_data_hcc[[by_var]])) temp_data_hcc[, (by_var) := as.factor(get(by_var))]
cols_to_check <- setdiff(names(temp_data_hcc), by_var)
# Ensure .BY[[1]] is character if it's a factor
results_by_hcc <- temp_data_hcc[, .find_high_card_subset(.SD, group_val_char = as.character(.BY[[1]])), by = by_var, .SDcols = cols_to_check]
if ("Group_Level_Temp_Col" %in% names(results_by_hcc)) {
data.table::setnames(results_by_hcc, "Group_Level_Temp_Col", by_var)
if ("Var" %in% names(results_by_hcc)) data.table::setcolorder(results_by_hcc, c(by_var, "Var"))
}
logger::log_info(paste("High cardinality categorical variable search by group complete."))
return(results_by_hcc)
} else {
result_overall <- .find_high_card_subset(self$data)
logger::log_info("High cardinality categorical variable search complete.")
return(result_overall)
}
},
#' @description
#' Find numeric variables with high skewness.
#' Uses the `summarize_numeric` method internally.
#' @param threshold Numeric. Absolute skewness value to exceed to be considered highly skewed (default: 1.0).
#' @param by_var Optional. Character string. Name of a categorical variable to group the analysis by.
#' @return A data.table listing skewed numeric variables, their skewness, and kurtosis.
find_skewed_numeric = function(threshold = 1.0, by_var = NULL) {
logger::log_info(paste0("Finding skewed numeric variables", if (!is.null(by_var)) paste0(" by '", by_var, "'") else "", "..."))
if (is.null(self$data)) {
logger::log_error("Data not loaded.")
stop("Data not loaded.")
}
# Call internal method to get numeric summaries
num_sum_scoped <- self$summarize_numeric(by_var = by_var)
if (is.null(num_sum_scoped) || nrow(num_sum_scoped) == 0) {
logger::log_info("No numeric variables found for skewness check.")
return(data.table::data.table())
}
# Define required columns; Skewness and Variable are essential
req_c <- c("Variable", "Skewness", "Kurtosis")
if (!is.null(by_var)) req_c <- c(by_var, req_c) # Prepend by_var if it exists
# Ensure essential columns are present
if (!"Skewness" %in% names(num_sum_scoped) || !"Variable" %in% names(num_sum_scoped)) {
logger::log_warn("Expected columns 'Variable' or 'Skewness' not found in numeric summary for skewness check.")
return(data.table::data.table())
}
# Intersect with actual columns to prevent errors if some are missing (e.g. Kurtosis if all vars have <4 obs)
cols_to_select <- intersect(req_c, names(num_sum_scoped))
# data.table i-part selection for rows is fine with unquoted column names
skewed <- num_sum_scoped[!is.na(Skewness) & abs(Skewness) > threshold, .SD, .SDcols = cols_to_select]
logger::log_info(paste("Found", nrow(skewed), "skewed entries meeting threshold."))
return(skewed)
},
#' @description
#' Find numeric variables with near-zero variance (NZV).
#' Based on `caret::nearZeroVar` logic: high ratio of most frequent to second most frequent value,
#' and low percentage of unique values.
#' @param freq_cut Numeric. The cutoff for the ratio of frequencies (most common / second most common) (default: 95/5).
#' @param unique_cut Numeric. The cutoff for the percentage of unique values out of total non-missing samples (default: 10).
#' @param by_var Optional. Character string. Name of a categorical variable to group the analysis by.
#' @return A data.table listing NZV numeric variables and their metrics.
find_near_zero_variance_numeric = function(freq_cut = 95/5, unique_cut = 10, by_var = NULL) {
logger::log_info(paste0("Finding near-zero variance numeric variables", if (!is.null(by_var)) paste0(" by '", by_var, "'") else "", "..."))
if (is.null(self$data)) {
logger::log_error("Data not loaded.")
stop("Data not loaded.")
}
.find_nzv_subset <- function(dt_subset, group_val_char = NULL) {
num_cols_s <- names(dt_subset)[sapply(dt_subset, is.numeric)]
if (length(num_cols_s) == 0) return(data.table::data.table())
dl_s <- list()
for (cn_s in num_cols_s) {
cv_s <- dt_subset[[cn_s]]
cvnn_s <- cv_s[!is.na(cv_s)]
nn_s <- length(cvnn_s)
nt_s <- ""
is_zero_var_s <- FALSE
freq_ratio_s <- NA_real_
pct_unique_s <- NA_real_
meets_criteria_s <- FALSE
if (nn_s == 0) {
nt_s <- "All NA"
} else {
is_zero_var_s <- (data.table::uniqueN(cvnn_s) <= 1)
if (is_zero_var_s) {
nt_s <- "Zero/Single Unique Value"
freq_ratio_s <- Inf # Or NA_real_ if preferred for single value
pct_unique_s <- if (nn_s > 0) (1 / nn_s) * 100 else 0 # Should be (uniqueN / nn_s) * 100 = (1/nn_s)*100
} else {
# Calculate freqRatio and percentUnique only if not zero var
counts_s <- table(cvnn_s)
sorted_counts_s <- sort(counts_s, decreasing = TRUE)
if (length(sorted_counts_s) >= 2) {
freq_ratio_s <- sorted_counts_s[1] / sorted_counts_s[2]
} else { # Only one unique value was present, should have been caught by is_zero_var_s logic
freq_ratio_s <- Inf # Effectively, as second most common is 0
}
pct_unique_s <- (data.table::uniqueN(cvnn_s) / nn_s) * 100
meets_criteria_s <- (freq_ratio_s >= freq_cut && pct_unique_s <= unique_cut)
}
}
is_nzv_final_s <- is_zero_var_s || meets_criteria_s
dl_s[[cn_s]] <- data.table::data.table(
Var = cn_s, IsZeroVar = is_zero_var_s, FreqRatio = freq_ratio_s,
PctUniq = pct_unique_s, MeetsCriteria = meets_criteria_s, IsNZV = is_nzv_final_s, Note = nt_s
)
}
all_data_s <- data.table::rbindlist(dl_s, fill = TRUE)
# Filter for only those that are NZV
res_s <- all_data_s[get("IsNZV") == TRUE] # Use get() for clarity or ensure IsNZV is known to R CMD CHECK
# Define expected columns for structure consistency, even if res_s is empty
expected_cols <- c("Var", "IsZeroVar", "FreqRatio", "PctUniq", "MeetsCriteria", "IsNZV", "Note")
if (!is.null(group_val_char)) expected_cols <- c(expected_cols, "Group_Level_Temp_Col")
if (nrow(res_s) > 0) {
if (!is.null(group_val_char)) res_s[, Group_Level_Temp_Col := group_val_char]
# Select and reorder to expected_cols that are present
present_expected_cols <- intersect(expected_cols, names(res_s))
res_s <- res_s[, .SD, .SDcols = present_expected_cols]
} else { # Create empty table with correct structure if no NZV found
empty_dt_data <- lapply(stats::setNames(nm = expected_cols), function(x) vector()) # Create named list of empty vectors
# Ensure correct types if possible, or let data.table infer from empty vectors
# For simplicity, this creates logical(0) for all if not specified.
# A more robust empty table might define types more explicitly.
res_s <- data.table::as.data.table(empty_dt_data)
# Remove Group_Level_Temp_Col if group_val_char was NULL
if(is.null(group_val_char) && "Group_Level_Temp_Col" %in% names(res_s)) {
res_s[, Group_Level_Temp_Col := NULL]
}
}
return(res_s)
}
if (!is.null(by_var)) {
private$.check_data_and_vars(by_var)
private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
temp_data_nzv <- data.table::copy(self$data)
if (!is.factor(temp_data_nzv[[by_var]])) temp_data_nzv[, (by_var) := as.factor(get(by_var))]
cols_to_check_nzv <- setdiff(names(temp_data_nzv), by_var)
results_by_nzv <- temp_data_nzv[, .find_nzv_subset(.SD, group_val_char = as.character(.BY[[1]])), by = by_var, .SDcols = cols_to_check_nzv]
if ("Group_Level_Temp_Col" %in% names(results_by_nzv)) {
data.table::setnames(results_by_nzv, "Group_Level_Temp_Col", by_var)
# Reorder to put by_var first, then Var
new_order <- c(by_var, setdiff(names(results_by_nzv), by_var))
if ("Var" %in% new_order) { # Ensure Var exists before trying to move it
new_order <- c(by_var, "Var", setdiff(new_order, c(by_var, "Var")))
data.table::setcolorder(results_by_nzv, new_order)
} else {
data.table::setcolorder(results_by_nzv, new_order) # Var might not be present if no NZV found
}
}
logger::log_info("Near-zero variance numeric variable search by group complete.")
return(results_by_nzv)
} else {
result_overall <- .find_nzv_subset(self$data)
logger::log_info("Near-zero variance numeric variable search complete.")
return(result_overall)
}
},
#' @description
#' Find potential univariate predictors for a target variable.
#' Fits simple linear models (LM for numeric target, GLM for binary target)
#' for each predictor variable against the target.
#' @param target_var Character string. Name of the target variable.
#' @param p_value_threshold Numeric. Significance level to consider a predictor (default: 0.05).
#' @param by_var Optional. Character string. Name of a categorical variable to group the analysis by.
#' @return A data.table listing significant univariate predictors, their model type,
#' statistic (R-squared or Pseudo R-squared), and p-value.
get_univariate_predictors = function(target_var, p_value_threshold = 0.05, by_var = NULL) {
logger::log_info(paste0("Finding univariate predictors for '", target_var, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "..."))
private$.check_data_and_vars(c(target_var, by_var)) # by_var can be NULL, handled by .check_data_and_vars
.get_preds_subset <- function(dt_subset, current_target_var, group_val_char = NULL) {
preds_s <- setdiff(names(dt_subset), current_target_var)
if (length(preds_s) == 0) return(data.table::data.table())
work_dt_s <- data.table::copy(dt_subset)
tgt_v_s <- work_dt_s[[current_target_var]]
tgt_t_s <- private$.get_var_type_simple(tgt_v_s) # Use private helper
# Helper to create consistent logging prefix
log_prefix <- paste0("Subset ", if (is.null(group_val_char)) "Overall" else group_val_char, ": ")
if (!tgt_t_s %in% c("numeric", "binary_categorical")) {
logger::log_warn(paste0(log_prefix, "Target '", current_target_var, "' type '", tgt_t_s, "' is not numeric or binary_categorical. Skipping."))
return(data.table::data.table())
}
sig_preds_l_s <- list()
if (tgt_t_s == "binary_categorical") {
original_levels_s <- character(0)
if (is.factor(tgt_v_s)) original_levels_s <- levels(tgt_v_s)
else original_levels_s <- sort(as.character(unique(stats::na.omit(tgt_v_s))))
if (length(original_levels_s) == 2) {
logger::log_debug(paste0(log_prefix, "Binary target '", current_target_var, "': '", original_levels_s[1], "' maps to 0, '", original_levels_s[2], "' maps to 1."))
}
# Coerce target to 0/1 integer
work_dt_s[, (current_target_var) := as.integer(as.factor(get(current_target_var))) - 1]
unique_post_conversion_s <- data.table::uniqueN(work_dt_s[[current_target_var]], na.rm = TRUE)
if (unique_post_conversion_s > 0 && unique_post_conversion_s != 2 && !(unique_post_conversion_s == 1 && sum(!is.na(work_dt_s[[current_target_var]])) > 0) ) { # Allow 1 unique if all non-NA are same
logger::log_error(paste0(log_prefix, "Target '", current_target_var, "' failed 0/1 conversion or resulted in invalid binary. Unique values post-conversion: ", unique_post_conversion_s))
return(data.table::data.table())
}
}
for (pred_v_s in preds_s) {
# Create dataset with only target and current predictor, removing NAs for the pair
pair_dt_s <- work_dt_s[stats::complete.cases(work_dt_s[, .SD, .SDcols = c(current_target_var, pred_v_s)]), .SD, .SDcols = c(current_target_var, pred_v_s)]
if (nrow(pair_dt_s) < 10) { # Minimum observations for modeling
logger::log_debug(paste0(log_prefix, "Skipping '", pred_v_s, "' due to <10 complete obs with target."))
next
}
# Check target variance within this pair_dt_s
if (data.table::uniqueN(pair_dt_s[[current_target_var]]) < 2) {
logger::log_debug(paste0(log_prefix, "Target '", current_target_var, "' has <2 unique values for predictor '", pred_v_s, "'. Skipping model."))
next
}
pred_mdl_v_s <- pair_dt_s[[pred_v_s]]
pred_orig_t_s <- private$.get_var_type_simple(self$data[[pred_v_s]]) # Original type from self$data
pred_mdl_t_s <- private$.get_var_type_simple(pred_mdl_v_s) # Type within the current modeling subset
# Prepare predictor: convert categoricals to factor, skip if unsuitable
if (pred_mdl_t_s == "numeric") {
if (data.table::uniqueN(pred_mdl_v_s) <= 1) { # Skip if predictor has no variance in this subset
logger::log_debug(paste0(log_prefix, "Numeric predictor '", pred_v_s, "' has <=1 unique value. Skipping."))
next
}
} else if (pred_mdl_t_s %in% c("categorical", "binary_categorical")) {
if (data.table::uniqueN(pred_mdl_v_s) < 2) { # Skip if <2 levels for categorical predictor
logger::log_debug(paste0(log_prefix, "Categorical predictor '", pred_v_s, "' has <2 unique levels. Skipping."))
next
}
pair_dt_s[, (pred_v_s) := as.factor(get(pred_v_s))] # Convert to factor for modeling
} else if (pred_mdl_t_s %in% c("other_all_na", "empty_vector")) {
logger::log_debug(paste0(log_prefix, "Predictor '", pred_v_s, "' is all NA or empty. Skipping."))
next
} else { # Other types not suitable for direct modeling
logger::log_debug(paste0(log_prefix, "Predictor '", pred_v_s, "' type '", pred_mdl_t_s, "' not suitable for modeling. Skipping."))
next
}
fmla_s <- stats::as.formula(paste0("`", current_target_var, "`~`", pred_v_s, "`"))
tryCatch({
mdl_s <- NULL
p_value_s <- NA_real_
stat_value_s <- NA_real_
stat_name_s <- "N/A"
if (tgt_t_s == "numeric") { # LM for numeric target
mdl_s <- stats::lm(fmla_s, data = pair_dt_s)
mdl_summary_s <- summary(mdl_s)
if (is.numeric(pair_dt_s[[pred_v_s]])) { # Numeric predictor
# Handle potential backticks in coefficient names
coef_name_s <- pred_v_s
if (!coef_name_s %in% rownames(mdl_summary_s$coefficients)) coef_name_s <- paste0("`", pred_v_s, "`")
if (coef_name_s %in% rownames(mdl_summary_s$coefficients)) {
p_value_s <- mdl_summary_s$coefficients[coef_name_s, "Pr(>|t|)"]
stat_value_s <- mdl_summary_s$r.squared
stat_name_s <- "R_Sq"
}
} else { # Factor predictor (ANOVA from LM)
aov_s <- stats::anova(mdl_s)
# Predictor row name in anova table
pred_row_name_s <- pred_v_s
if (!pred_row_name_s %in% rownames(aov_s)) pred_row_name_s <- paste0("`",pred_v_s,"`")
# Check for common p-value column names in anova output
p_val_col_name_s <- intersect(c("Pr(>F)", "`Pr(>F)`"), names(aov_s))[1]
if (!is.na(p_val_col_name_s) && pred_row_name_s %in% rownames(aov_s)) {
p_value_s <- aov_s[pred_row_name_s, p_val_col_name_s]
stat_value_s <- mdl_summary_s$r.squared # Overall R-squared
stat_name_s <- "R_Sq (Factor)"
}
}
} else { # GLM for binary target (already 0/1 coded)
mdl_s <- stats::glm(fmla_s, data = pair_dt_s, family = "binomial")
null_mdl_s <- stats::glm(stats::as.formula(paste0("`", current_target_var, "`~1")), data = pair_dt_s, family = "binomial")
# Use likelihood ratio test
lrt_s <- suppressWarnings(stats::anova(null_mdl_s, mdl_s, test = "LRT"))
if (nrow(lrt_s) == 2 && "Pr(>Chi)" %in% names(lrt_s)) {
p_value_s <- lrt_s$"Pr(>Chi)"[2] # P-value from LRT
# McFadden's Pseudo R-squared
logLik_full_s <- tryCatch(stats::logLik(mdl_s), error = function(e) NA)
logLik_null_s <- tryCatch(stats::logLik(null_mdl_s), error = function(e) NA)
if (!is.na(logLik_full_s) && !is.na(logLik_null_s)) {
if (logLik_null_s != 0) stat_value_s <- 1 - (as.numeric(logLik_full_s) / as.numeric(logLik_null_s))
else if (logLik_full_s == logLik_null_s) stat_value_s <- 0 # Handles perfect null model case
else stat_value_s <- NA # Should not happen if logLiks are valid
} else {
stat_value_s <- NA
}
stat_name_s <- "Pseudo_R_Sq_McF"
}
}
if (!is.na(p_value_s) && p_value_s < p_value_threshold) {
sig_preds_l_s[[paste(current_target_var, pred_v_s, sep = "_vs_")]] <-
data.table::data.table(
Target = current_target_var, Predictor = pred_v_s, PredictorOriginalType = pred_orig_t_s,
ModelType = if (tgt_t_s == "numeric") "LM" else "GLM (Binomial)",
StatisticName = stat_name_s, StatisticValue = stat_value_s, PValue = p_value_s,
N_Obs_Model = nrow(pair_dt_s)
)
}
}, error = function(e) {
logger::log_warn(paste0(log_prefix, "Modeling error for '", current_target_var, "' ~ '", pred_v_s, "': ", e$message))
})
}
res_s <- data.table::rbindlist(sig_preds_l_s, fill = TRUE, idcol = FALSE)
if (nrow(res_s) > 0) data.table::setorderv(res_s, "PValue")
if (!is.null(group_val_char) && nrow(res_s) > 0) res_s[, Group_Level_Temp_Col := group_val_char]
return(res_s)
}
if (!is.null(by_var)) {
temp_data_preds <- data.table::copy(self$data)
if (!is.factor(temp_data_preds[[by_var]])) temp_data_preds[, (by_var) := as.factor(get(by_var))]
# Predictors are all columns except target and by_var
potential_predictors <- setdiff(names(self$data), c(target_var, by_var))
# Columns needed for .SD are the target and the potential predictors
cols_to_pass_to_sd <- c(target_var, potential_predictors)
# Run .get_preds_subset for each group defined by by_var
results_by_preds <- temp_data_preds[, .get_preds_subset(.SD, current_target_var = target_var, group_val_char = as.character(.BY[[1]])),
by = by_var, .SDcols = cols_to_pass_to_sd]
if ("Group_Level_Temp_Col" %in% names(results_by_preds)) {
data.table::setnames(results_by_preds, "Group_Level_Temp_Col", by_var)
# Set column order: by_var, then Predictor, then others
if ("Predictor" %in% names(results_by_preds)) {
new_order <- c(by_var, "Predictor", setdiff(names(results_by_preds), c(by_var, "Predictor")))
data.table::setcolorder(results_by_preds, new_order)
} else if (by_var %in% names(results_by_preds)) { # if no predictors found, just by_var might be there
data.table::setcolorder(results_by_preds, c(by_var, setdiff(names(results_by_preds), by_var)))
}
}
logger::log_info("Univariate predictor search by group complete.")
return(results_by_preds)
} else {
# For overall analysis, columns needed are target and all other columns as potential predictors
potential_predictors <- setdiff(names(self$data), target_var)
cols_for_overall_analysis <- c(target_var, potential_predictors)
result_overall <- .get_preds_subset(self$data[, .SD, .SDcols = cols_for_overall_analysis], current_target_var = target_var)
logger::log_info("Overall univariate predictor search complete.")
return(result_overall)
}
},
#' @description
#' Perform bivariate analysis between two variables.
#' Selects appropriate statistical tests and plots based on variable types.
#' (Numeric-Numeric: Correlation, Scatterplot; Numeric-Categorical: T-test/ANOVA, Boxplot;
#' Categorical-Categorical: Chi-squared, Barplot).
#' @param var1 Character string. Name of the first variable.
#' @param var2 Character string. Name of the second variable.
#' @param by_var Optional. Character string. Name of a categorical variable to group the analysis by.
#' If `by_var` is used, plots are not generated by this function directly;
#' instead, use specific plotting methods with `by_var` for faceted plots.
#' @return A list containing:
#' \itemize{
#' \item `summary`: A data.table with results of the statistical test. If `by_var` is used, this becomes `summary_table_by_group`.
#' \item `plot`: A ggplot object (if `by_var` is NULL and plot is applicable).
#' \item `contingency_table`: A table object for Cat-Cat analysis (if `by_var` is NULL).
#' }
get_bivariate_analysis = function(var1, var2, by_var = NULL) {
logger::log_info(paste0("Bivariate analysis for '", var1, "' vs '", var2, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "'..."))
private$.check_data_and_vars(c(var1, var2, by_var)) # by_var can be NULL
.get_bivar_subset <- function(dt_subset, current_var1, current_var2, group_val_char = NULL, outer_by_var_name_for_plot = NULL) {
# Ensure we are working with a copy and only relevant columns
adt_s <- data.table::copy(dt_subset[, .SD, .SDcols = c(current_var1, current_var2)])
t1i_s <- private$.get_var_type_simple(adt_s[[current_var1]])
t2i_s <- private$.get_var_type_simple(adt_s[[current_var2]])
v1c_s <- current_var1; v2c_s <- current_var2
t1c_s <- t1i_s; t2c_s <- t2i_s
# Convention: if one numeric and one categorical, numeric is v1c_s
if (((t1c_s %in% c("categorical", "binary_categorical")) && t2c_s == "numeric")) {
v1_temp_s <- v1c_s; t1_temp_s <- t1c_s
v1c_s <- v2c_s; t1c_s <- t2c_s # v1 is now numeric
v2c_s <- v1_temp_s; t2c_s <- t1_temp_s # v2 is now categorical
}
# Convert character/logical categoricals to factor for modeling/plotting consistency
if (t1c_s %in% c("categorical", "binary_categorical", "other_all_na") && (is.character(adt_s[[v1c_s]]) || is.logical(adt_s[[v1c_s]]))) {
adt_s[, (v1c_s) := as.factor(get(v1c_s))]
}
if (t2c_s %in% c("categorical", "binary_categorical", "other_all_na") && (is.character(adt_s[[v2c_s]]) || is.logical(adt_s[[v2c_s]]))) {
adt_s[, (v2c_s) := as.factor(get(v2c_s))]
}
# Final types after potential coercion
t1f_s <- private$.get_var_type_simple(adt_s[[v1c_s]])
t2f_s <- private$.get_var_type_simple(adt_s[[v2c_s]])
res_s <- list(plot = NULL, summary = data.table::data.table(), contingency_table = NULL)
adt_comp_s <- adt_s[stats::complete.cases(adt_s[, .SD, .SDcols = c(v1c_s, v2c_s)])]
min_obs_s <- 5 # General minimum
if (t1f_s == "numeric" && (t2f_s %in% c("categorical", "binary_categorical"))) {
min_obs_s <- max(5, 2 * data.table::uniqueN(adt_comp_s[[v2c_s]])) # Need enough per group for t-test/ANOVA
} else if ((t1f_s %in% c("categorical", "binary_categorical")) && (t2f_s %in% c("categorical", "binary_categorical"))) {
# For chi-squared, need expected counts > 5 in most cells, but raw N is a first check
min_obs_s <- 10
}
if (nrow(adt_comp_s) < min_obs_s) {
res_s$summary <- data.table::data.table(Var1 = v1c_s, Var2 = v2c_s, N_Obs_Complete = nrow(adt_comp_s), Note = "Insufficient data for reliable analysis.")
return(res_s)
}
# Check variance for target/response variables
if (t1f_s == "numeric" && data.table::uniqueN(adt_comp_s[[v1c_s]]) < 2) {
res_s$summary <- data.table::data.table(Var1 = v1c_s, Var2 = v2c_s, N_Obs_Complete = nrow(adt_comp_s), Note = paste0("Numeric variable '",v1c_s,"' has < 2 unique values."))
return(res_s)
}
if (t2f_s == "numeric" && data.table::uniqueN(adt_comp_s[[v2c_s]]) < 2 && t1f_s=="numeric") { # Only if both numeric
res_s$summary <- data.table::data.table(Var1 = v1c_s, Var2 = v2c_s, N_Obs_Complete = nrow(adt_comp_s), Note = paste0("Numeric variable '",v2c_s,"' has < 2 unique values."))
return(res_s)
}
base_info_s <- data.table::data.table(Var1_Analysis = v1c_s, Var2_Analysis = v2c_s, N_Obs_Complete = nrow(adt_comp_s))
test_summary_dt_s <- data.table::data.table()
# Generate plot only if not part of a `by_var` operation at the outer level
generate_plot_here <- is.null(outer_by_var_name_for_plot)
if (t1f_s == "numeric" && t2f_s == "numeric") { # Numeric vs Numeric
if (generate_plot_here) {
# Call the class's own plotting method for consistency
res_s$plot <- tryCatch(self$plot_scatterplot(x_var = v1c_s, y_var = v2c_s, add_smooth = TRUE), # Assuming self is accessible or pass GoodeR obj
error = function(e) { logger::log_warn(paste("Bivar scatterplot error:", e$message)); NULL })
}
cor_test_res_s <- tryCatch(stats::cor.test(adt_comp_s[[v1c_s]], adt_comp_s[[v2c_s]]), error = function(e) NULL)
if (!is.null(cor_test_res_s)) {
test_summary_dt_s <- data.table::data.table(Method = cor_test_res_s$method, StatisticName = "Correlation", StatisticValue = cor_test_res_s$estimate, PValue = cor_test_res_s$p.value)
} else {
test_summary_dt_s <- data.table::data.table(Note = "Correlation test failed.")
}
} else if (t1f_s == "numeric" && (t2f_s %in% c("categorical", "binary_categorical"))) { # Numeric vs Categorical
if (generate_plot_here) {
res_s$plot <- tryCatch(self$plot_boxplot(numeric_var = v1c_s, categorical_var = v2c_s),
error = function(e) { logger::log_warn(paste("Bivar boxplot error:", e$message)); NULL })
}
fmla_s <- stats::as.formula(paste0("`", v1c_s, "`~`", v2c_s, "`"))
num_levels_s <- data.table::uniqueN(adt_comp_s[[v2c_s]]) # Already factor
if (num_levels_s == 2) {
ttest_res_s <- tryCatch(stats::t.test(fmla_s, data = adt_comp_s), error = function(e) NULL)
if (!is.null(ttest_res_s)) {
test_summary_dt_s <- data.table::data.table(Method = ttest_res_s$method, StatisticName = "t-statistic", StatisticValue = ttest_res_s$statistic, PValue = ttest_res_s$p.value, DF = round(ttest_res_s$parameter, 2))
} else test_summary_dt_s <- data.table::data.table(Note = "T-test failed.")
} else if (num_levels_s > 2) {
# Check if all groups have at least 2 observations
group_counts <- adt_comp_s[, .N, by = v2c_s]
if(all(group_counts$N >=2) && data.table::uniqueN(adt_comp_s[[v1c_s]]) > 1) { # also ensure numeric var has variance
aov_res_s <- tryCatch(stats::aov(fmla_s, data = adt_comp_s), error = function(e) NULL)
if (!is.null(aov_res_s)) {
summary_aov_s <- summary(aov_res_s)
if (length(summary_aov_s) > 0 && is.data.frame(summary_aov_s[[1]]) && "F value" %in% colnames(summary_aov_s[[1]])) {
test_summary_dt_s <- data.table::data.table(Method = "ANOVA", StatisticName = "F-statistic", StatisticValue = summary_aov_s[[1]]$`F value`[1], PValue = summary_aov_s[[1]]$`Pr(>F)`[1], DF_Numerator = summary_aov_s[[1]]$Df[1], DF_Denominator = summary_aov_s[[1]]$Df[2])
} else test_summary_dt_s <- data.table::data.table(Note = "ANOVA summary uninterpretable.")
} else test_summary_dt_s <- data.table::data.table(Note = "ANOVA failed.")
} else {
test_summary_dt_s <- data.table::data.table(Method="ANOVA", Note="Not all groups have sufficient data (>=2 obs) or numeric var lacks variance for ANOVA.")
}
} else { # < 2 levels
test_summary_dt_s <- data.table::data.table(Method = "N/A", Note = "Categorical variable has < 2 levels for comparison.")
}
} else if ((t1f_s %in% c("categorical", "binary_categorical")) && (t2f_s %in% c("categorical", "binary_categorical"))) { # Categorical vs Categorical
if (generate_plot_here) {
res_s$plot <- tryCatch(self$plot_barplot_counts(categorical_var = v1c_s, fill_var = v2c_s),
error = function(e) { logger::log_warn(paste("Bivar barplot error:", e$message)); NULL })
}
contingency_tbl_s <- table(adt_comp_s[[v1c_s]], adt_comp_s[[v2c_s]])
res_s$contingency_table <- contingency_tbl_s # Store table regardless of test
if (nrow(contingency_tbl_s) < 2 || ncol(contingency_tbl_s) < 2) {
test_summary_dt_s <- data.table::data.table(Method = "Chi-squared Test", Note = "Contingency table is smaller than 2x2. Test not applicable.")
} else {
chisq_res_s <- NULL; chisq_warning_s <- ""
tryCatch(
{
# Suppress warnings specifically from chisq.test if desired, or capture them
withCallingHandlers(
chisq_res_s <- stats::chisq.test(contingency_tbl_s),
warning = function(w) {
chisq_warning_s <<- paste("Chi-sq warning:", w$message) # Capture warning
invokeRestart("muffleWarning") # Suppress it from console
}
)
},
error = function(e) { chisq_res_s <<- NULL } # Ensure result is NULL on error
)
if (!is.null(chisq_res_s)) {
test_summary_dt_s <- data.table::data.table(Method = "Pearson's Chi-squared Test", StatisticName = "X-squared", StatisticValue = chisq_res_s$statistic, DF = chisq_res_s$parameter, PValue = chisq_res_s$p.value, Note = chisq_warning_s)
} else {
test_summary_dt_s <- data.table::data.table(Method = "Chi-squared Test", Error = "Could not compute Chi-squared test.", Note = chisq_warning_s)
}
}
} else { # Unhandled type combination
msg_s <- paste0("Bivariate analysis for type '", t1f_s, "' vs '", t2f_s, "' is not currently handled.")
logger::log_warn(msg_s)
test_summary_dt_s <- data.table::data.table(Note = msg_s)
}
res_s$summary <- cbind(base_info_s, test_summary_dt_s)
if (!is.null(group_val_char) && nrow(res_s$summary) > 0) res_s$summary[, Group_Level_Temp_Col := group_val_char]
return(res_s)
}
if (!is.null(by_var)) {
temp_data_bivar <- data.table::copy(self$data)
if (!is.factor(temp_data_bivar[[by_var]])) temp_data_bivar[, (by_var) := as.factor(get(by_var))]
# Columns needed for the subset operation
cols_for_bivar_subset <- unique(c(var1, var2)) # Do not include by_var here, it's handled by `by`
# Process each group
# The result of the expression in j must be a list for data.table when by is used and j returns multiple rows / complex objects
results_list_of_lists <- temp_data_bivar[, list(bivar_result = list(.get_bivar_subset(.SD, current_var1 = var1, current_var2 = var2, group_val_char = as.character(.BY[[1]]), outer_by_var_name_for_plot = by_var))),
by = by_var, .SDcols = cols_for_bivar_subset]
# Combine all summary tables
all_summaries_dt <- data.table::rbindlist(lapply(results_list_of_lists$bivar_result, function(x) x$summary), fill = TRUE, use.names = TRUE)
if ("Group_Level_Temp_Col" %in% names(all_summaries_dt)) {
data.table::setnames(all_summaries_dt, "Group_Level_Temp_Col", by_var)
# Standard column order
ordered_cols <- c(by_var, "Var1_Analysis", "Var2_Analysis", "N_Obs_Complete", "Method", "StatisticName", "StatisticValue", "PValue", "DF", "DF_Numerator", "DF_Denominator", "Note", "Error")
present_ordered_cols <- intersect(ordered_cols, names(all_summaries_dt))
data.table::setcolorder(all_summaries_dt, present_ordered_cols)
}
logger::log_info("Bivariate analysis by group complete. For plots, use specific plot methods with by_var for faceting.")
# When by_var is used, individual plots per group are not generated by this method.
# Contingency tables per group could be extracted if needed but are not aggregated here.
return(list(summary_table_by_group = all_summaries_dt, plot = NULL, contingency_table = NULL))
} else {
# Overall analysis (no by_var)
overall_bivar_result <- .get_bivar_subset(self$data[, .SD, .SDcols = c(var1, var2)], current_var1 = var1, current_var2 = var2, outer_by_var_name_for_plot = NULL)
logger::log_info("Overall bivariate analysis complete.")
return(overall_bivar_result)
}
}
),
private = list(
.check_data_and_vars = function(vars) {
# Filter out NULL elements from vars, common if optional vars are passed directly
vars_to_check <- vars[!sapply(vars, is.null)]
if (length(vars_to_check) == 0) return(invisible(NULL))
if (is.null(self$data)) {
logger::log_error("Data not loaded.")
stop("Data not loaded.")
}
for (v_nm in vars_to_check) {
if (!v_nm %in% names(self$data)) {
logger::log_error(paste0("Variable '", v_nm, "' not found in the data."))
stop(paste0("Variable '", v_nm, "' not found."))
}
}
invisible(NULL)
},
.check_var_type = function(var_name, expected_types) {
if (is.null(var_name)) return(invisible(NULL)) # Allow optional vars to be NULL
actual_cls <- class(self$data[[var_name]])
is_expected <- any(sapply(expected_types, function(type_str) {
switch(type_str,
"numeric" = is.numeric(self$data[[var_name]]),
"integer" = is.integer(self$data[[var_name]]),
"logical" = is.logical(self$data[[var_name]]),
"character" = is.character(self$data[[var_name]]),
"factor" = is.factor(self$data[[var_name]]),
"date" = inherits(self$data[[var_name]], "Date"),
"datetime" = inherits(self$data[[var_name]], "POSIXct"),
FALSE # Default if type_str not matched
)
}))
if (!is_expected) {
msg <- paste0("Variable '", var_name, "' has type '", paste(actual_cls, collapse = ","),
"', which is not among the expected types: '", paste(expected_types, collapse = ", "), "'.")
logger::log_error(msg)
stop(msg)
}
invisible(NULL)
},
# Simplified type checker for internal modeling logic
.get_var_type_simple = function(variable_vector) {
if (!is.atomic(variable_vector) && !inherits(variable_vector, c("Date", "POSIXct", "difftime"))) return("other_non_atomic")
if (length(variable_vector) == 0) return("empty_vector")
var_no_na <- variable_vector[!is.na(variable_vector)]
if (length(var_no_na) == 0) { # All NA values
# Try to infer from the original vector's class
if (is.logical(variable_vector)) return("binary_categorical") # All NA logical still binary
if (is.numeric(variable_vector)) return("numeric") # All NA numeric still numeric
if (is.factor(variable_vector)) {
return(if (length(levels(variable_vector)) == 2) "binary_categorical" else "categorical")
}
if (is.character(variable_vector)) return("categorical") # All NA char still categorical
if (inherits(variable_vector, c("Date", "POSIXct"))) return("date")
return("other_all_na")
}
# Has non-NA values
if (is.logical(variable_vector)) return("binary_categorical")
if (is.numeric(variable_vector)) {
# Check for 0/1 numerics or two unique values that could be binary
if (all(var_no_na %in% c(0, 1)) && data.table::uniqueN(var_no_na) <= 2) return("binary_categorical")
if (data.table::uniqueN(var_no_na) == 2) return("binary_categorical") # e.g. (1, 5) only
return("numeric")
}
if (is.factor(variable_vector)) {
return(if (length(levels(variable_vector)) == 2) "binary_categorical" else "categorical")
}
if (is.character(variable_vector)) {
return(if (data.table::uniqueN(var_no_na) == 2) "binary_categorical" else "categorical")
}
if (inherits(variable_vector, c("Date", "POSIXct"))) return("date")
return("other") # Fallback
}
)
)
# Helper to replace rlang::`%||%`
`%||%` <- function(a, b) {
if (is.null(a)) b else a
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.