R/gooder_class.R

#' @title GoodeR R6 Class for Exploratory Data Analysis
#' @description
#' An R6 class designed to facilitate common exploratory data analysis (EDA) tasks
#' on data.frame or data.table objects. It provides methods for metadata
#' generation, numeric, categorical, and date summaries, correlation analysis,
#' various plots, and data quality checks.
#'
#' @field data A data.table containing the dataset to be analyzed.
#' @field name A character string name for the dataset/GoodeR object.
#'
#' @import R6
#' @import data.table
#' @import ggplot2
#' @import moments
#' @import lubridate
#' @import logger
#' @export GoodeR
GoodeR <- R6::R6Class("GoodeR",
                   public = list(
                     data = NULL,
                     name = "GoodeRObject",
                     
                     #' @description
                     #' Initialize a new GoodeR object.
                     #' @param data A data.frame or data.table.
                     #' @param name A character string to name the dataset (default: "Dataset").
                     #' @return A new `GoodeR` object.
                     #' @examples
                     #' if (requireNamespace("stringi", quietly = TRUE)) {
                     #'   # Generate sample data (simplified for example)
                     #'   dt <- data.table::data.table(
                     #'     ID = 1:10,
                     #'     Cat = sample(c("A","B"), 10, replace=TRUE),
                     #'     Num = rnorm(10)
                     #'   )
                     #'   GoodeR_obj <- GoodeR$new(dt, name = "My Sample")
                     #'   print(GoodeR_obj)
                     #' }
                     initialize = function(data, name = "Dataset") {
                       logger::log_info("Initializing GoodeR object...")
                       if (missing(data) || is.null(data)) {
                         logger::log_error("Input 'data' is missing.")
                         stop("Input 'data' is missing.")
                       }
                       if (!is.data.frame(data)) {
                         logger::log_error("Input 'data' must be a data.frame or data.table.")
                         stop("Input 'data' must be a data.frame or data.table.")
                       }
                       if (nrow(data) == 0) {
                         logger::log_warn("Input 'data' has 0 rows.")
                       }
                       self$data <- data.table::as.data.table(data.table::copy(data))
                       self$name <- name
                       logger::log_info(paste0("GoodeR object '", self$name, "' initialized with ", nrow(self$data), " rows and ", ncol(self$data), " columns."))
                       invisible(self)
                     },
                     
                     #' @description
                     #' Get metadata for the dataset.
                     #' Provides information like data type, unique values, and missing values for each column.
                     #' @param by_var Optional. A character string specifying a column name to group metadata by.
                     #' The grouping variable should be categorical or easily coercible to factor.
                     #' @return A data.table with metadata. If `by_var` is used, the first column(s) will be the grouping variable(s).
                     #' @examples
                     #' if (requireNamespace("stringi", quietly = TRUE)) {
                     #'   dt <- data.table::data.table(
                     #'     Group = sample(c("G1", "G2"), 20, replace = TRUE),
                     #'     Val = rnorm(20),
                     #'     Cat = sample(letters[1:3], 20, replace = TRUE)
                     #'   )
                     #'   GoodeR_obj <- GoodeR$new(dt)
                     #'   meta_overall <- GoodeR_obj$get_metadata()
                     #'   # print(meta_overall)
                     #'   meta_by_group <- GoodeR_obj$get_metadata(by_var = "Group")
                     #'   # print(meta_by_group)
                     #' }
                     get_metadata = function(by_var = NULL) {
                       logger::log_info(paste0("Generating metadata for '", self$name, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "..."))
                       if (is.null(self$data)) {
                         logger::log_error("Data not loaded.")
                         stop("Data not loaded.")
                       }
                       .get_subset_metadata <- function(dt_subset) {
                         meta_sub <- data.table::data.table(
                           Column_Name = names(dt_subset),
                           Data_Type = sapply(dt_subset, function(x) paste(class(x), collapse = ", ")),
                           N_Rows_In_Scope = nrow(dt_subset),
                           Unique_Values = sapply(dt_subset, data.table::uniqueN),
                           Missing_Values = sapply(dt_subset, function(x) sum(is.na(x)))
                         )
                         meta_sub[, Missing_Percentage := data.table::fifelse(N_Rows_In_Scope > 0, round(Missing_Values / N_Rows_In_Scope * 100, 2), 0)]
                         return(meta_sub)
                       }
                       if (!is.null(by_var)) {
                         private$.check_data_and_vars(by_var)
                         private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
                         temp_data <- data.table::copy(self$data)
                         if (!is.factor(temp_data[[by_var]])) temp_data[, (by_var) := as.factor(get(by_var))]
                         results_list <- temp_data[, .get_subset_metadata(.SD), by = by_var]
                         if (by_var %in% names(results_list) && "Column_Name" %in% names(results_list)) data.table::setcolorder(results_list, c(by_var, "Column_Name"))
                         logger::log_info("Metadata by group generation complete.")
                         return(results_list)
                       } else {
                         meta_dt <- .get_subset_metadata(self$data)
                         data.table::setnames(meta_dt, "N_Rows_In_Scope", "N_Total_Rows")
                         logger::log_info("Overall metadata generation complete.")
                         return(meta_dt)
                       }
                     },
                     
                     #' @description
                     #' Summarize numeric variables.
                     #' Calculates common descriptive statistics for numeric columns.
                     #' @param by_var Optional. A character string specifying a column name to group summaries by.
                     #' The grouping variable should be categorical or easily coercible to factor.
                     #' @return A data.table with numeric summaries.
                     #' @examples
                     #' if (requireNamespace("stringi", quietly = TRUE)) {
                     #'   dt <- data.table::data.table(
                     #'     Group = sample(c("G1", "G2"), 20, replace = TRUE),
                     #'     Val1 = rnorm(20),
                     #'     Val2 = rpois(20, 5)
                     #'   )
                     #'   GoodeR_obj <- GoodeR$new(dt)
                     #'   num_sum_overall <- GoodeR_obj$summarize_numeric()
                     #'   # print(num_sum_overall)
                     #'   num_sum_by_group <- GoodeR_obj$summarize_numeric(by_var = "Group")
                     #'   # print(num_sum_by_group)
                     #' }
                     summarize_numeric = function(by_var = NULL) {
                       logger::log_info(paste0("Summarizing numeric vars for '", self$name, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "..."))
                       if (is.null(self$data)) {
                         logger::log_error("Data not loaded.")
                         stop("Data not loaded.")
                       }
                       numeric_cols <- names(self$data)[sapply(self$data, is.numeric)]
                       if (length(numeric_cols) == 0) {
                         logger::log_warn("No numeric columns found.")
                         return(data.table::data.table())
                       }
                       data_for_summary <- self$data
                       if (!is.null(by_var)) {
                         private$.check_data_and_vars(by_var)
                         private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
                         if (!is.factor(data_for_summary[[by_var]])) {
                           if (identical(data_for_summary, self$data)) data_for_summary <- data.table::copy(self$data)
                           logger::log_info(paste0("Coercing by_var '", by_var, "' to factor for summarize_numeric."))
                           data_for_summary[, (by_var) := as.factor(get(by_var))]
                         }
                       }
                       summary_list <- list()
                       for (col_name in numeric_cols) {
                         sym_col_name <- as.name(col_name)
                         # Need to ensure moments::skewness and moments::kurtosis are used
                         expr <- bquote(list(
                           N = .N, N_Non_Missing = sum(!is.na(.(sym_col_name))), Mean = mean(as.double(.(sym_col_name)), na.rm = TRUE),
                           SD = stats::sd(as.double(.(sym_col_name)), na.rm = TRUE), Min = as.double(min(as.double(.(sym_col_name)), na.rm = TRUE)),
                           P25 = stats::quantile(as.double(.(sym_col_name)), 0.25, na.rm = TRUE, names = FALSE, type = 7),
                           Median = as.double(stats::median(as.double(.(sym_col_name)), na.rm = TRUE)),
                           P75 = stats::quantile(as.double(.(sym_col_name)), 0.75, na.rm = TRUE, names = FALSE, type = 7),
                           Max = as.double(max(as.double(.(sym_col_name)), na.rm = TRUE)),
                           Skewness = {
                             x_vec <- .(sym_col_name)
                             x_dbl_nona <- as.double(x_vec[!is.na(x_vec)])
                             if (length(x_dbl_nona) > 2 && data.table::uniqueN(x_dbl_nona) > 1) moments::skewness(x_dbl_nona) else NA_real_
                           },
                           Kurtosis = {
                             x_vec <- .(sym_col_name)
                             x_dbl_nona <- as.double(x_vec[!is.na(x_vec)])
                             if (length(x_dbl_nona) > 3 && data.table::uniqueN(x_dbl_nona) > 1) moments::kurtosis(x_dbl_nona) else NA_real_
                           }
                         ))
                         current_summary <- if (!is.null(by_var)) data_for_summary[, eval(expr), by = by_var] else data_for_summary[, eval(expr)]
                         if (!data.table::is.data.table(current_summary)) {
                           logger::log_fatal(paste0("CRITICAL: summary for '", col_name, "' not DT."))
                           current_summary <- data.table::as.data.table(current_summary)
                         }
                         current_summary[, Variable := col_name]
                         if (!is.null(by_var)) data.table::setcolorder(current_summary, c("Variable", by_var)) else data.table::setcolorder(current_summary, "Variable")
                         summary_list[[col_name]] <- current_summary
                       }
                       result_dt <- data.table::rbindlist(summary_list, fill = TRUE)
                       cols_fix <- c("Min", "Max", "Mean", "SD", "Median", "P25", "P75", "Skewness", "Kurtosis")
                       for (col_f in intersect(cols_fix, names(result_dt))) {
                         if (col_f %in% names(result_dt)) {
                           data.table::set(result_dt, i = which(is.infinite(result_dt[[col_f]]) | is.nan(result_dt[[col_f]])), j = col_f, value = NA_real_)
                         }
                       }
                       logger::log_info("Numeric summarization complete.")
                       return(result_dt)
                     },
                     
                     #' @description
                     #' Summarize categorical variables.
                     #' Provides frequency counts and percentages for levels of categorical columns.
                     #' @param top_n Integer. Number of top levels to show for each variable (default: 5).
                     #' @param include_plot Logical. If TRUE, generates and prints bar plots for top levels (default: FALSE).
                     #' @param by_var Optional. A character string specifying a column name to group summaries by.
                     #' @return If `by_var` is NULL, a list where each element is a list containing `summary_stats`
                     #' and `top_levels_frequency` for a categorical variable.
                     #' If `by_var` is specified, a single data.table with summarized results across all categorical variables,
                     #' grouped by `by_var`.
                     summarize_categorical = function(top_n = 5, include_plot = FALSE, by_var = NULL) {
                       logger::log_info(paste0("Summarizing categorical variables for '", self$name, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "..."))
                       if (is.null(self$data)) {
                         logger::log_error("Data not loaded.")
                         stop("Data not loaded.")
                       }
                       categorical_cols <- names(self$data)[sapply(self$data, function(x) is.factor(x) || is.character(x))]
                       if (length(categorical_cols) == 0) {
                         logger::log_warn("No categorical columns found.")
                         return(if (is.null(by_var)) list() else data.table::data.table())
                       }
                       
                       data_to_use <- self$data
                       original_by_var_class_if_any <- NULL
                       if (!is.null(by_var)) {
                         private$.check_data_and_vars(by_var)
                         private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
                         original_by_var_class_if_any <- class(data_to_use[[by_var]])
                         if (!is.factor(data_to_use[[by_var]])) {
                           if (identical(data_to_use, self$data)) data_to_use <- data.table::copy(self$data)
                           logger::log_info(paste0("Coercing by_var '", by_var, "' to factor for summarize_categorical."))
                           data_to_use[, (by_var) := as.factor(get(by_var))]
                         }
                       }
                       
                       all_results_accumulator <- list()
                       for (col_name in categorical_cols) {
                         if (!is.null(by_var) && col_name == by_var && !(any(c("factor", "character") %in% original_by_var_class_if_any))) {
                           logger::log_warn(paste0("Skipping '", col_name, "' as it is the 'by_var' (", paste(original_by_var_class_if_any, collapse = ","), ") and was not originally categorical."))
                           next
                         }
                         logger::log_debug(paste0("Summarizing cat var: ", col_name, if (!is.null(by_var)) paste0(" by ", by_var)))
                         
                         grouping_vars_for_by_clause <- col_name
                         if (!is.null(by_var)) {
                           grouping_vars_for_by_clause <- if (col_name == by_var) by_var else c(by_var, col_name)
                         }
                         
                         cols_for_na_filter <- unique(grouping_vars_for_by_clause)
                         # Ensure .SDcols is used correctly
                         data_subset_no_na <- data_to_use[stats::complete.cases(data_to_use[, .SD, .SDcols = cols_for_na_filter])]
                         
                         current_counts_dt <- NULL
                         if (nrow(data_subset_no_na) > 0) {
                           current_counts_dt <- data_subset_no_na[, .N, by = grouping_vars_for_by_clause]
                           
                           if (col_name %in% names(current_counts_dt)) {
                             if (is.null(by_var) || col_name != by_var || (col_name == by_var && length(grouping_vars_for_by_clause) == 1)) {
                               data.table::setnames(current_counts_dt, old = col_name, new = "Level")
                             }
                           }
                           if (!is.null(by_var) && col_name == by_var && "Level" %in% names(current_counts_dt) && !(by_var %in% names(current_counts_dt))) {
                             current_counts_dt[, (by_var) := get("Level")]
                           }
                           
                           if (!is.null(by_var) && by_var %in% names(current_counts_dt)) {
                             data.table::setorderv(current_counts_dt, c(by_var, "N"), order = c(1, -1))
                             current_counts_dt <- current_counts_dt[, utils::head(.SD, top_n), by = by_var]
                           } else {
                             data.table::setorderv(current_counts_dt, "N", -1)
                             current_counts_dt <- utils::head(current_counts_dt, top_n)
                           }
                         } else {
                           current_counts_dt <- data.table::data.table(Level = character(0), N = integer(0))
                           if (!is.null(by_var) && !(by_var %in% names(current_counts_dt))) current_counts_dt[, (by_var) := factor()]
                           if (is.null(current_counts_dt[["Level"]]) && (!is.null(by_var) && col_name == by_var)) current_counts_dt[, Level := factor()] # Use [["Level"]] for safety
                         }
                         
                         if (nrow(current_counts_dt) > 0) {
                           if (!is.null(by_var) && by_var %in% names(current_counts_dt)) {
                             total_col_name_for_perc <- ".Total_N_Group"
                             totals_in_by_group <- data_subset_no_na[, .N, by = by_var]
                             data.table::setnames(totals_in_by_group, "N", total_col_name_for_perc)
                             current_counts_dt <- merge(current_counts_dt, totals_in_by_group, by = by_var, all.x = TRUE)
                             current_counts_dt[!is.na(get(total_col_name_for_perc)) & get(total_col_name_for_perc) > 0,
                                               Percentage := round(N / get(total_col_name_for_perc) * 100, 2)]
                             current_counts_dt[is.na(get(total_col_name_for_perc)) | get(total_col_name_for_perc) == 0, Percentage := NA_real_]
                             current_counts_dt[, (total_col_name_for_perc) := NULL]
                           } else {
                             total_for_overall_perc <- nrow(data_subset_no_na)
                             if (total_for_overall_perc > 0) current_counts_dt[, Percentage := round(N / total_for_overall_perc * 100, 2)]
                             else current_counts_dt[, Percentage := NA_real_]
                           }
                         } else {
                           current_counts_dt[, Percentage := NA_real_]
                         }
                         
                         if (!is.null(by_var)) {
                           current_counts_dt[, Summarized_Variable := col_name]
                           all_results_accumulator[[col_name]] <- current_counts_dt
                         } else {
                           n_unique_overall <- data.table::uniqueN(self$data[[col_name]], na.rm = TRUE)
                           n_missing_overall <- sum(is.na(self$data[[col_name]]))
                           summary_stats_dt <- data.table::data.table(Var = col_name, UniqLvls = n_unique_overall, Missing = n_missing_overall, TopNShown = nrow(current_counts_dt))
                           all_results_accumulator[[col_name]] <- list(summary_stats = summary_stats_dt, top_levels_frequency = current_counts_dt)
                         }
                         
                         if (include_plot && nrow(current_counts_dt) > 0 && any(current_counts_dt$N > 0)) {
                           plt_data <- data.table::copy(current_counts_dt)
                           if (!"Level" %in% names(plt_data)) {
                             logger::log_warn(paste0("Plot: 'Level' col NF for ", col_name, ". Skip plot."))
                           } else {
                             plt_data[, Level := factor(Level, levels = unique(Level))] # Use unique(Level) from plt_data itself for ordering
                             p_cat <- ggplot2::ggplot(plt_data, ggplot2::aes(x = .data$Level, y = .data$N, fill = .data$Level)) +
                               ggplot2::geom_bar(stat = "identity", show.legend = FALSE) +
                               ggplot2::geom_text(ggplot2::aes(label = .data$N), vjust = -0.3, size = 2.5) +
                               ggplot2::theme_minimal() +
                               ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
                             title_base <- paste0("Top levels for ", col_name)
                             if (!is.null(by_var) && by_var %in% names(plt_data)) {
                               p_cat <- p_cat + ggplot2::facet_wrap(stats::as.formula(paste("~", by_var)))
                               title_base <- paste(title_base, "by", by_var)
                             }
                             p_cat <- p_cat + ggplot2::labs(title = title_base, x = col_name, y = "Freq")
                             tryCatch(print(p_cat), error = function(e) logger::log_warn(paste("Plot err cat", col_name, ":", e$message)))
                           }
                         }
                       }
                       logger::log_info("Cat summary done.")
                       if (!is.null(by_var)) {
                         final_dt <- data.table::rbindlist(all_results_accumulator, use.names = TRUE, fill = TRUE)
                         if (nrow(final_dt) > 0) {
                           expected_cols <- c("Summarized_Variable", by_var, "Level", "N", "Percentage")
                           present_cols <- intersect(expected_cols, names(final_dt))
                           if (length(present_cols) > 0) data.table::setcolorder(final_dt, present_cols)
                         }
                         return(final_dt)
                       } else {
                         return(all_results_accumulator)
                       }
                     },
                     
                     #' @description
                     #' Summarize date and datetime variables.
                     #' Provides min/max dates, range, and counts for date/datetime columns.
                     #' @param include_plot Logical. If TRUE, generates and prints histograms for date distributions (default: FALSE).
                     #' @param by_var Optional. A character string specifying a column name to group summaries by.
                     #' @return A data.table with date summaries.
                     summarize_dates = function(include_plot = FALSE, by_var = NULL) {
                       logger::log_info(paste0("Summarizing date vars for '", self$name, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "..."))
                       if (is.null(self$data)) {
                         logger::log_error("Data not loaded.")
                         stop("Data not loaded.")
                       }
                       date_cols <- names(self$data)[sapply(self$data, function(x) inherits(x, "Date") || inherits(x, "POSIXct"))]
                       if (length(date_cols) == 0) {
                         logger::log_warn("No date/datetime columns found.")
                         return(data.table::data.table())
                       }
                       data_to_use <- self$data
                       if (!is.null(by_var)) {
                         private$.check_data_and_vars(by_var)
                         private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
                         if (!is.factor(data_to_use[[by_var]])) {
                           if (identical(data_to_use, self$data)) data_to_use <- data.table::copy(self$data)
                           logger::log_info(paste0("Coercing by_var '", by_var, "' to factor for summarize_dates."))
                           data_to_use[, (by_var) := as.factor(get(by_var))]
                         }
                       }
                       all_date_summaries <- list()
                       for (col_name in date_cols) {
                         logger::log_debug(paste0("Summarizing date: ", col_name, if (!is.null(by_var)) paste0(" by ", by_var)))
                         sym_col <- as.name(col_name)
                         na_val_constructor <- if (inherits(self$data[[col_name]], "Date")) {
                           quote(as.Date(NA_character_))
                         } else {
                           quote(lubridate::as_datetime(NA_real_)) # Use as_datetime for POSIXct NA
                         }
                         expr <- bquote(list(
                           N_Total = .N, N_Non_Missing = sum(!is.na(.(sym_col))),
                           Min_Date_Raw = if (sum(!is.na(.(sym_col))) > 0) min(.(sym_col), na.rm = TRUE) else .(na_val_constructor),
                           Max_Date_Raw = if (sum(!is.na(.(sym_col))) > 0) max(.(sym_col), na.rm = TRUE) else .(na_val_constructor),
                           Unique_Count = data.table::uniqueN(.(sym_col), na.rm = TRUE)
                         ))
                         current_summary <- if (!is.null(by_var)) data_to_use[, eval(expr), by = by_var] else data_to_use[, eval(expr)]
                         
                         original_tz <- lubridate::tz(self$data[[col_name]][1]) # Get tz from original data
                         if (is.null(original_tz) || original_tz == "") original_tz <- "UTC"
                         
                         # Ensure Min_Date_Raw and Max_Date_Raw are POSIXct before converting for consistency
                         # This is tricky because min/max on Date objects return Date, on POSIXct return POSIXct
                         # If original is Date, convert to POSIXct for Min_Date/Max_Date columns
                         if (inherits(self$data[[col_name]], "Date")) {
                           current_summary[, Min_Date := lubridate::as_datetime(Min_Date_Raw, tz = original_tz)]
                           current_summary[, Max_Date := lubridate::as_datetime(Max_Date_Raw, tz = original_tz)]
                         } else { # POSIXct
                           current_summary[, Min_Date := lubridate::as_datetime(as.numeric(Min_Date_Raw), origin="1970-01-01", tz=original_tz)]
                           current_summary[, Max_Date := lubridate::as_datetime(as.numeric(Max_Date_Raw), origin="1970-01-01", tz=original_tz)]
                         }
                         
                         current_summary[, Range_Seconds := data.table::fifelse(N_Non_Missing > 1 & !is.na(Min_Date) & !is.na(Max_Date),
                                                                                as.numeric(lubridate::difftime(Max_Date, Min_Date, units = "secs")), NA_real_)]
                         current_summary[, c("Min_Date_Raw", "Max_Date_Raw") := NULL]
                         current_summary[, Variable := col_name]
                         if (!is.null(by_var)) data.table::setcolorder(current_summary, c("Variable", by_var)) else data.table::setcolorder(current_summary, "Variable")
                         all_date_summaries[[col_name]] <- current_summary
                         
                         if (include_plot && sum(!is.na(self$data[[col_name]])) > 0) {
                           plot_dt <- data.table::copy(data_to_use[!is.na(get(col_name))])
                           # Ensure column for plotting is POSIXct
                           if (inherits(plot_dt[[col_name]], "Date")) plot_dt[, (col_name) := lubridate::as_datetime(get(col_name), tz = original_tz)]
                           
                           p <- ggplot2::ggplot(plot_dt, ggplot2::aes(x = .data[[col_name]])) +
                             ggplot2::geom_histogram(bins = 30, fill = "steelblue", color = "black") +
                             ggplot2::theme_minimal()
                           plot_title <- paste("Distribution of", col_name)
                           if (!is.null(by_var)) {
                             p <- p + ggplot2::facet_wrap(stats::as.formula(paste("~", by_var)))
                             plot_title <- paste(plot_title, "by", by_var)
                           }
                           p <- p + ggplot2::labs(title = plot_title, x = col_name, y = "Frequency")
                           tryCatch(print(p), error = function(e) logger::log_warn(paste("Plot err date", col_name, ":", e$message)))
                         }
                       }
                       result <- data.table::rbindlist(all_date_summaries, use.names = TRUE, fill = TRUE)
                       logger::log_info("Date summary complete.")
                       return(result)
                     },
                     
                     #' @description
                     #' Calculate and optionally plot correlation matrices for numeric variables.
                     #' @param method Correlation method (e.g., "pearson", "kendall", "spearman"). Passed to `stats::cor`.
                     #' @param use Handling of missing data. Passed to `stats::cor`. (default: "pairwise.complete.obs").
                     #' @param plot_type Type of plot: "ggplot", "corrplot", or NULL for no plot (default: "ggplot").
                     #'                  "corrplot" requires the 'corrplot' package.
                     #' @param by_var Optional. A character string specifying a column name to group correlation analysis by.
                     #' @return If `by_var` is NULL, a single correlation matrix.
                     #' If `by_var` is specified, a list of correlation matrices, one for each group.
                     #' Plots are printed to the active graphics device if `plot_type` is not NULL.
                     get_correlations = function(method = "pearson", use = "pairwise.complete.obs", plot_type = "ggplot", by_var = NULL) {
                       logger::log_info(paste0("Correlations for '", self$name, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "..."))
                       if (is.null(self$data)) {
                         logger::log_error("Data not loaded.")
                         stop("Data not loaded.")
                       }
                       data_to_use <- self$data
                       if (!is.null(by_var)) {
                         private$.check_data_and_vars(by_var)
                         private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
                         if (!is.factor(data_to_use[[by_var]])) {
                           if (identical(data_to_use, self$data)) data_to_use <- data.table::copy(self$data)
                           data_to_use[, (by_var) := as.factor(get(by_var))]
                         }
                       }
                       calc_cor_mat <- function(dt, title_suffix = "") {
                         num_cols <- names(dt)[sapply(dt, is.numeric)]
                         if (length(num_cols) < 1) {
                           logger::log_warn(paste0("No numeric columns for correlation", title_suffix))
                           return(matrix(nrow = 0, ncol = 0))
                         }
                         num_dt <- dt[, .SD, .SDcols = num_cols, drop = FALSE]
                         sds <- sapply(num_dt, function(x) {
                           xn <- x[!is.na(x)]
                           if (length(xn) < 2) 0 else {
                             s_val <- stats::sd(xn)
                             if (is.na(s_val)) 0 else s_val
                           }
                         })
                         cols_v <- names(sds[!is.na(sds) & sds > sqrt(.Machine$double.eps)])
                         if (length(cols_v) < 2) {
                           logger::log_warn(paste0("<2 numeric columns with variance for correlation", title_suffix))
                           return(matrix(nrow = 0, ncol = 0))
                         }
                         num_dt_c <- num_dt[, .SD, .SDcols = cols_v, drop = FALSE]
                         cor_m <- tryCatch(stats::cor(num_dt_c, method = method, use = use),
                                           error = function(e) {
                                             logger::log_error(paste("Correlation calculation error", title_suffix, ":", e$message))
                                             matrix(nrow = 0, ncol = 0)
                                           }
                         )
                         if (!is.null(plot_type) && is.matrix(cor_m) && nrow(cor_m) > 0) {
                           title_f <- paste0("Correlation Matrix", title_suffix)
                           if (plot_type == "ggplot") {
                             # Use .data pronoun for ggplot2 aes
                             mcor <- data.table::melt(data.table::as.data.table(cor_m, keep.rownames = "V1"), id.vars = "V1", variable.name = "V2", value.name = "Corr")
                             p <- ggplot2::ggplot(mcor, ggplot2::aes(x = .data$V1, y = .data$V2, fill = .data$Corr)) +
                               ggplot2::geom_tile(color = "white") +
                               ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1), na.value = "grey80") +
                               ggplot2::theme_minimal() +
                               ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1), axis.title = ggplot2::element_blank()) +
                               ggplot2::coord_fixed() +
                               ggplot2::labs(title = title_f)
                             tryCatch(print(p), error = function(e) logger::log_warn(paste("Plot cor ggplot err", title_suffix, ":", e$message)))
                           } else if (plot_type == "corrplot") {
                             if (requireNamespace("corrplot", quietly = TRUE)) {
                               tryCatch(corrplot::corrplot(cor_m,
                                                           method = "color", type = "upper", order = "hclust", tl.col = "black", tl.srt = 45,
                                                           addCoef.col = "black", number.cex = 0.7, na.label = "NA",
                                                           title = paste0("\n\n", title_f), mar = c(0, 0, 2, 0)
                               ), error = function(e) logger::log_warn(paste("Corrplot err", title_suffix, ":", e$message)))
                             } else {
                               logger::log_warn("Package 'corrplot' not installed. Skipping corrplot.")
                             }
                           }
                         }
                         return(cor_m)
                       }
                       if (!is.null(by_var)) {
                         grps <- unique(data_to_use[[by_var]])
                         list_mats <- lapply(grps, function(g) {
                           logger::log_info(paste0("Correlation for group: ", by_var, "=", g))
                           sub_dt <- data_to_use[get(by_var) == g]
                           sub_dt_no_by <- sub_dt[, !c(by_var), with = FALSE] # Exclude by_var column
                           calc_cor_mat(sub_dt_no_by, paste0(" (", by_var, "=", g, ")"))
                         })
                         names(list_mats) <- make.names(paste0(by_var, "_", grps)) # make.names to ensure valid names
                         logger::log_info("Correlation by group done.")
                         return(list_mats)
                       } else {
                         overall_m <- calc_cor_mat(data_to_use)
                         logger::log_info("Overall correlation done.")
                         return(overall_m)
                       }
                     },
                     
                     #' @description
                     #' Plot a histogram for a numeric variable.
                     #' @param variable Character string. Name of the numeric variable to plot.
                     #' @param by_var Optional. Character string. Name of a categorical variable for faceting.
                     #' @param bins Integer. Number of bins for the histogram (default: 30).
                     #' @return A ggplot object. The plot is also printed.
                     plot_histogram = function(variable, by_var = NULL, bins = 30) {
                       logger::log_info(paste0("Histogram for '", variable, if (!is.null(by_var)) paste0("' by '", by_var, "' (facet)") else "", "..."))
                       vars_chk <- c(variable, by_var)
                       private$.check_data_and_vars(vars_chk)
                       private$.check_var_type(variable, "numeric")
                       if (!is.null(by_var)) private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
                       
                       plt_dt <- data.table::copy(self$data)
                       if (all(is.na(plt_dt[[variable]]))) {
                         logger::log_warn(paste0("Variable '", variable, "' is all NA. Cannot plot histogram."))
                         p_e <- ggplot2::ggplot() + ggplot2::labs(title = paste("Histogram of", variable, "(All NA)"), x = variable, y = "Frequency") + ggplot2::theme_minimal()
                         tryCatch(print(p_e), error = function(e) logger::log_warn(e$message))
                         return(invisible(p_e))
                       }
                       
                       aes_map <- ggplot2::aes(x = .data[[variable]])
                       title_p <- paste("Histogram of", variable)
                       p <- ggplot2::ggplot(plt_dt, aes_map) +
                         ggplot2::geom_histogram(bins = bins, fill = "steelblue", color = "black", alpha = 0.7, na.rm = TRUE) +
                         ggplot2::theme_minimal()
                       
                       if (!is.null(by_var)) {
                         plt_dt[, (by_var) := as.factor(get(by_var))] # Ensure by_var is factor for faceting
                         p <- p + ggplot2::facet_wrap(stats::as.formula(paste("~", by_var)))
                         title_p <- paste(title_p, "(Faceted by", by_var, ")")
                       }
                       p <- p + ggplot2::labs(title = title_p, x = variable, y = "Frequency")
                       logger::log_info("Histogram generation complete.")
                       tryCatch(print(p), error = function(e) logger::log_warn(paste("Error plotting histogram for '", variable, "': ", e$message)))
                       return(invisible(p))
                     },
                     
                     #' @description
                     #' Plot a boxplot of a numeric variable grouped by a categorical variable.
                     #' @param numeric_var Character string. Name of the numeric variable.
                     #' @param categorical_var Character string. Name of the categorical variable for grouping.
                     #' @param by_var Optional. Character string. Name of another categorical variable for faceting.
                     #' @return A ggplot object. The plot is also printed.
                     plot_boxplot = function(numeric_var, categorical_var, by_var = NULL) {
                       logger::log_info(paste0("Boxplot for '", numeric_var, "' by '", categorical_var, if (!is.null(by_var)) paste0("', faceted by '", by_var, "'") else "", "..."))
                       vars_chk <- c(numeric_var, categorical_var, by_var)
                       private$.check_data_and_vars(vars_chk)
                       private$.check_var_type(numeric_var, "numeric")
                       private$.check_var_type(categorical_var, c("character", "factor", "integer", "logical"))
                       if (!is.null(by_var)) private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
                       
                       plt_dt <- data.table::copy(self$data)
                       plt_dt[, (categorical_var) := as.factor(get(categorical_var))] # Ensure categorical_var is factor
                       
                       if (all(is.na(plt_dt[[numeric_var]]))) {
                         logger::log_warn(paste0("Numeric variable '", numeric_var, "' is all NA. Cannot plot boxplot."))
                         p_e <- ggplot2::ggplot() + ggplot2::labs(title = paste("Boxplot of", numeric_var, "by", categorical_var, "(Numeric All NA)"), x = categorical_var, y = numeric_var) + ggplot2::theme_minimal()
                         tryCatch(print(p_e), error = function(e) logger::log_warn(e$message))
                         return(invisible(p_e))
                       }
                       
                       title_p <- paste("Boxplot of", numeric_var, "by", categorical_var)
                       aes_map <- ggplot2::aes(x = .data[[categorical_var]], y = .data[[numeric_var]], fill = .data[[categorical_var]])
                       p <- ggplot2::ggplot(plt_dt, aes_map) +
                         ggplot2::geom_boxplot(show.legend = FALSE, na.rm = TRUE) +
                         ggplot2::theme_minimal() +
                         ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
                       
                       if (!is.null(by_var)) {
                         plt_dt[, (by_var) := as.factor(get(by_var))] # Ensure by_var is factor for faceting
                         p <- p + ggplot2::facet_wrap(stats::as.formula(paste("~", by_var)))
                         title_p <- paste(title_p, "(Faceted by", by_var, ")")
                       }
                       p <- p + ggplot2::labs(title = title_p, x = categorical_var, y = numeric_var) +
                         ggplot2::scale_fill_viridis_d(na.value = "grey80") # From ggplot2, handles discrete scales
                       
                       logger::log_info("Boxplot generation complete.")
                       tryCatch(print(p), error = function(e) logger::log_warn(paste("Error plotting boxplot:", e$message)))
                       return(invisible(p))
                     },
                     
                     #' @description
                     #' Plot a scatterplot between two numeric variables.
                     #' @param x_var Character string. Name of the numeric variable for the x-axis.
                     #' @param y_var Character string. Name of the numeric variable for the y-axis.
                     #' @param color_var Optional. Character string. Name of a variable for coloring points.
                     #' @param size_var Optional. Character string. Name of a numeric variable for sizing points.
                     #' @param add_smooth Logical. If TRUE, adds a loess smoothed line (default: FALSE).
                     #' @param by_var Optional. Character string. Name of a categorical variable for faceting.
                     #' @return A ggplot object. The plot is also printed.
                     plot_scatterplot = function(x_var, y_var, color_var = NULL, size_var = NULL, add_smooth = FALSE, by_var = NULL) {
                       logger::log_info(paste0("Scatterplot for '", y_var, "' vs '", x_var, if (!is.null(by_var)) paste0("', faceted by '", by_var, "'") else "", "..."))
                       vars_chk <- c(x_var, y_var, color_var, size_var, by_var)
                       private$.check_data_and_vars(vars_chk)
                       private$.check_var_type(x_var, "numeric")
                       private$.check_var_type(y_var, "numeric")
                       if (!is.null(size_var)) private$.check_var_type(size_var, "numeric")
                       if (!is.null(color_var)) private$.check_var_type(color_var, c("numeric", "character", "factor", "integer", "logical"))
                       if (!is.null(by_var)) private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
                       
                       plt_dt <- data.table::copy(self$data)
                       
                       if (all(is.na(plt_dt[[x_var]])) || all(is.na(plt_dt[[y_var]]))) {
                         logger::log_warn(paste0("Either '", x_var, "' or '", y_var, "' is all NA. Cannot plot scatterplot."))
                         p_e <- ggplot2::ggplot() + ggplot2::labs(title = paste("Scatterplot of", y_var, "vs", x_var, "(All NA)"), x = x_var, y = y_var) + ggplot2::theme_minimal()
                         tryCatch(print(p_e), error = function(e) logger::log_warn(e$message))
                         return(invisible(p_e))
                       }
                       
                       aes_map <- ggplot2::aes(x = .data[[x_var]], y = .data[[y_var]])
                       title_p <- paste("Scatterplot of", y_var, "vs", x_var)
                       
                       if (!is.null(color_var)) {
                         # Convert low-cardinality numerics, characters, logicals to factor for color mapping
                         if (is.character(plt_dt[[color_var]]) || is.logical(plt_dt[[color_var]])) {
                           plt_dt[, (color_var) := as.factor(get(color_var))]
                         } else if ((is.numeric(plt_dt[[color_var]]) || is.integer(plt_dt[[color_var]])) &&
                                    data.table::uniqueN(plt_dt[[color_var]], na.rm = TRUE) < 10 &&
                                    data.table::uniqueN(plt_dt[[color_var]], na.rm = TRUE) > 1) {
                           plt_dt[, (color_var) := as.factor(get(color_var))]
                         }
                         aes_map$colour <- as.name(color_var) # Use as.name for aes mapping
                       }
                       if (!is.null(size_var)) {
                         aes_map$size <- as.name(size_var)
                       }
                       
                       p <- ggplot2::ggplot(plt_dt, aes_map) +
                         ggplot2::geom_point(alpha = 0.6, na.rm = TRUE) +
                         ggplot2::theme_minimal()
                       
                       if (!is.null(color_var)) {
                         if (is.numeric(plt_dt[[color_var]]) && !is.factor(plt_dt[[color_var]])) { # Continuous color
                           p <- p + ggplot2::scale_color_viridis_c(name = color_var, na.value = "grey80")
                         } else { # Discrete color (factor)
                           p <- p + ggplot2::scale_color_viridis_d(name = color_var, na.value = "grey80")
                         }
                       }
                       if (add_smooth) {
                         p <- p + ggplot2::geom_smooth(se = FALSE, method = "loess", formula = 'y~x', na.rm = TRUE)
                       }
                       if (!is.null(by_var)) {
                         plt_dt[, (by_var) := as.factor(get(by_var))] # Ensure by_var is factor for faceting
                         p <- p + ggplot2::facet_wrap(stats::as.formula(paste("~", by_var)))
                         title_p <- paste(title_p, "(Faceted by", by_var, ")")
                       }
                       p <- p + ggplot2::labs(title = title_p, x = x_var, y = y_var)
                       logger::log_info("Scatterplot generation complete.")
                       tryCatch(print(p), error = function(e) logger::log_warn(paste("Error plotting scatterplot:", e$message)))
                       return(invisible(p))
                     },
                     
                     #' @description
                     #' Plot a barplot of counts for a categorical variable.
                     #' @param categorical_var Character string. Name of the primary categorical variable.
                     #' @param fill_var Optional. Character string. Name of a categorical variable for fill color (stacked/dodged bars).
                     #' @param order_by_freq Logical. If TRUE, orders bars by frequency of `categorical_var` (default: TRUE).
                     #' @param by_var Optional. Character string. Name of another categorical variable for faceting.
                     #' @return A ggplot object. The plot is also printed.
                     plot_barplot_counts = function(categorical_var, fill_var = NULL, order_by_freq = TRUE, by_var = NULL) {
                       logger::log_info(paste0("Barplot counts for '", categorical_var,
                                               if (!is.null(fill_var)) paste0("' by '", fill_var, "' (fill)") else "",
                                               if (!is.null(by_var)) paste0(", faceted by '", by_var, "'") else "", "..."))
                       vars_chk <- c(categorical_var, fill_var, by_var)
                       private$.check_data_and_vars(vars_chk)
                       private$.check_var_type(categorical_var, c("character", "factor", "integer", "logical"))
                       if (!is.null(fill_var)) private$.check_var_type(fill_var, c("character", "factor", "integer", "logical"))
                       if (!is.null(by_var)) private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
                       
                       plt_dt_orig <- data.table::copy(self$data)
                       # Filter NAs for essential plotting variables
                       plt_dt <- plt_dt_orig[!is.na(get(categorical_var))]
                       if (!is.null(fill_var)) plt_dt <- plt_dt[!is.na(get(fill_var))]
                       if (!is.null(by_var)) plt_dt <- plt_dt[!is.na(get(by_var))]
                       
                       if (nrow(plt_dt) == 0) {
                         logger::log_warn(paste0("No non-NA data for barplot of '", categorical_var, "'. Cannot plot."))
                         p_e <- ggplot2::ggplot() + ggplot2::labs(title = paste("Barplot of", categorical_var, "(No Data)")) + ggplot2::theme_void()
                         tryCatch(print(p_e), error = function(e) logger::log_warn(e$message))
                         return(invisible(p_e))
                       }
                       
                       plt_dt[, (categorical_var) := as.factor(get(categorical_var))]
                       if (order_by_freq) {
                         # Order factor levels by frequency
                         ord_lvls <- plt_dt[, .N, by = eval(as.name(categorical_var))][order(-N), get(categorical_var)]
                         plt_dt[, (categorical_var) := factor(get(categorical_var), levels = ord_lvls)]
                       }
                       if (!is.null(fill_var)) plt_dt[, (fill_var) := as.factor(get(fill_var))]
                       if (!is.null(by_var)) plt_dt[, (by_var) := as.factor(get(by_var))]
                       
                       aes_map <- ggplot2::aes(x = .data[[categorical_var]])
                       title_p <- paste("Barplot Counts for", categorical_var)
                       pos_arg <- "identity" # Default for single var or when fill is the same as x
                       
                       if (!is.null(fill_var)) {
                         aes_map$fill <- as.name(fill_var)
                         pos_arg <- ggplot2::position_dodge(preserve = "single") # Use dodge for fill_var
                         title_p <- paste(title_p, "by", fill_var)
                       } else {
                         aes_map$fill <- as.name(categorical_var) # Fill by x-axis var if no fill_var
                       }
                       
                       p <- ggplot2::ggplot(plt_dt, aes_map) +
                         ggplot2::geom_bar(stat = "count", position = pos_arg) +
                         ggplot2::theme_minimal() +
                         ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
                       
                       if (is.null(fill_var) || fill_var == categorical_var) {
                         p <- p + ggplot2::guides(fill = "none") # No legend if fill is redundant
                       } else {
                         p <- p + ggplot2::scale_fill_viridis_d(name = fill_var, na.value="grey80")
                       }
                       
                       if (!is.null(by_var)) {
                         p <- p + ggplot2::facet_wrap(stats::as.formula(paste("~", by_var)))
                         title_p <- paste(title_p, "(Faceted by", by_var, ")")
                       }
                       p <- p + ggplot2::labs(title = title_p, x = categorical_var, y = "Count")
                       logger::log_info("Barplot generation complete.")
                       tryCatch(print(p), error = function(e) logger::log_warn(paste("Error plotting barplot:", e$message)))
                       return(invisible(p))
                     },
                     
                     #' @description
                     #' Find categorical variables with high cardinality.
                     #' Identifies categorical columns that have many unique values relative to non-missing rows or an absolute high number of unique values.
                     #' @param threshold_ratio Numeric. Ratio of unique values to non-missing rows to be considered high cardinality (default: 0.5).
                     #' @param threshold_unique Integer. Absolute number of unique values to be considered high cardinality (default: 100).
                     #' @param by_var Optional. Character string. Name of a categorical variable to group the analysis by.
                     #' @return A data.table listing high cardinality categorical variables and their metrics.
                     find_high_cardinality_categoricals = function(threshold_ratio = 0.5, threshold_unique = 100, by_var = NULL) {
                       logger::log_info(paste0("Finding high cardinality categorical variables", if (!is.null(by_var)) paste0(" by '", by_var, "'") else "", "..."))
                       if (is.null(self$data)) {
                         logger::log_error("Data not loaded.")
                         stop("Data not loaded.")
                       }
                       
                       .find_high_card_subset <- function(dt_subset, group_val_char = NULL) {
                         meta_s <- data.table::data.table(
                           Col = names(dt_subset),
                           Type = sapply(dt_subset, function(x) paste(class(x), collapse = ",")),
                           Uniq = sapply(dt_subset, data.table::uniqueN),
                           Miss = sapply(dt_subset, function(x) sum(is.na(x)))
                         )
                         # Use %chin% for efficient character matching
                         cat_meta_s <- meta_s[Type %chin% c("factor", "character", "character, factor", "factor, character")]
                         if (nrow(cat_meta_s) == 0) return(data.table::data.table())
                         
                         N_r_s <- nrow(dt_subset)
                         hcl_s <- list()
                         for (i in 1:nrow(cat_meta_s)) {
                           cn_s <- cat_meta_s$Col[i]
                           # Check if column is actually categorical in the subset
                           if (!(is.factor(dt_subset[[cn_s]]) || is.character(dt_subset[[cn_s]]))) next
                           
                           uv_s <- cat_meta_s$Uniq[i]
                           nnar_s <- N_r_s - cat_meta_s$Miss[i]
                           rval_s <- if (nnar_s > 0) uv_s / nnar_s else 0
                           ex_r_s <- if (nnar_s > 0) (rval_s > threshold_ratio) else FALSE
                           ex_u_s <- uv_s > threshold_unique
                           if (ex_r_s || ex_u_s) {
                             hcl_s[[cn_s]] <- data.table::data.table(Var = cn_s, Uniq = uv_s, NonNA_Sub = nnar_s, Ratio_Sub = round(rval_s, 3), ExceedsRatio = ex_r_s, ExceedsUnique = ex_u_s)
                           }
                         }
                         res_s <- data.table::rbindlist(hcl_s, fill = TRUE)
                         if (!is.null(group_val_char) && nrow(res_s) > 0) res_s[, Group_Level_Temp_Col := group_val_char]
                         return(res_s)
                       }
                       
                       if (!is.null(by_var)) {
                         private$.check_data_and_vars(by_var)
                         private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
                         temp_data_hcc <- data.table::copy(self$data)
                         if (!is.factor(temp_data_hcc[[by_var]])) temp_data_hcc[, (by_var) := as.factor(get(by_var))]
                         cols_to_check <- setdiff(names(temp_data_hcc), by_var)
                         
                         # Ensure .BY[[1]] is character if it's a factor
                         results_by_hcc <- temp_data_hcc[, .find_high_card_subset(.SD, group_val_char = as.character(.BY[[1]])), by = by_var, .SDcols = cols_to_check]
                         
                         if ("Group_Level_Temp_Col" %in% names(results_by_hcc)) {
                           data.table::setnames(results_by_hcc, "Group_Level_Temp_Col", by_var)
                           if ("Var" %in% names(results_by_hcc)) data.table::setcolorder(results_by_hcc, c(by_var, "Var"))
                         }
                         logger::log_info(paste("High cardinality categorical variable search by group complete."))
                         return(results_by_hcc)
                       } else {
                         result_overall <- .find_high_card_subset(self$data)
                         logger::log_info("High cardinality categorical variable search complete.")
                         return(result_overall)
                       }
                     },
                     
                     #' @description
                     #' Find numeric variables with high skewness.
                     #' Uses the `summarize_numeric` method internally.
                     #' @param threshold Numeric. Absolute skewness value to exceed to be considered highly skewed (default: 1.0).
                     #' @param by_var Optional. Character string. Name of a categorical variable to group the analysis by.
                     #' @return A data.table listing skewed numeric variables, their skewness, and kurtosis.
                     find_skewed_numeric = function(threshold = 1.0, by_var = NULL) {
                       logger::log_info(paste0("Finding skewed numeric variables", if (!is.null(by_var)) paste0(" by '", by_var, "'") else "", "..."))
                       if (is.null(self$data)) {
                         logger::log_error("Data not loaded.")
                         stop("Data not loaded.")
                       }
                       # Call internal method to get numeric summaries
                       num_sum_scoped <- self$summarize_numeric(by_var = by_var)
                       
                       if (is.null(num_sum_scoped) || nrow(num_sum_scoped) == 0) {
                         logger::log_info("No numeric variables found for skewness check.")
                         return(data.table::data.table())
                       }
                       
                       # Define required columns; Skewness and Variable are essential
                       req_c <- c("Variable", "Skewness", "Kurtosis")
                       if (!is.null(by_var)) req_c <- c(by_var, req_c) # Prepend by_var if it exists
                       
                       # Ensure essential columns are present
                       if (!"Skewness" %in% names(num_sum_scoped) || !"Variable" %in% names(num_sum_scoped)) {
                         logger::log_warn("Expected columns 'Variable' or 'Skewness' not found in numeric summary for skewness check.")
                         return(data.table::data.table())
                       }
                       
                       # Intersect with actual columns to prevent errors if some are missing (e.g. Kurtosis if all vars have <4 obs)
                       cols_to_select <- intersect(req_c, names(num_sum_scoped))
                       
                       # data.table i-part selection for rows is fine with unquoted column names
                       skewed <- num_sum_scoped[!is.na(Skewness) & abs(Skewness) > threshold, .SD, .SDcols = cols_to_select]
                       
                       logger::log_info(paste("Found", nrow(skewed), "skewed entries meeting threshold."))
                       return(skewed)
                     },
                     
                     #' @description
                     #' Find numeric variables with near-zero variance (NZV).
                     #' Based on `caret::nearZeroVar` logic: high ratio of most frequent to second most frequent value,
                     #' and low percentage of unique values.
                     #' @param freq_cut Numeric. The cutoff for the ratio of frequencies (most common / second most common) (default: 95/5).
                     #' @param unique_cut Numeric. The cutoff for the percentage of unique values out of total non-missing samples (default: 10).
                     #' @param by_var Optional. Character string. Name of a categorical variable to group the analysis by.
                     #' @return A data.table listing NZV numeric variables and their metrics.
                     find_near_zero_variance_numeric = function(freq_cut = 95/5, unique_cut = 10, by_var = NULL) {
                       logger::log_info(paste0("Finding near-zero variance numeric variables", if (!is.null(by_var)) paste0(" by '", by_var, "'") else "", "..."))
                       if (is.null(self$data)) {
                         logger::log_error("Data not loaded.")
                         stop("Data not loaded.")
                       }
                       
                       .find_nzv_subset <- function(dt_subset, group_val_char = NULL) {
                         num_cols_s <- names(dt_subset)[sapply(dt_subset, is.numeric)]
                         if (length(num_cols_s) == 0) return(data.table::data.table())
                         
                         dl_s <- list()
                         for (cn_s in num_cols_s) {
                           cv_s <- dt_subset[[cn_s]]
                           cvnn_s <- cv_s[!is.na(cv_s)]
                           nn_s <- length(cvnn_s)
                           nt_s <- ""
                           is_zero_var_s <- FALSE
                           freq_ratio_s <- NA_real_
                           pct_unique_s <- NA_real_
                           meets_criteria_s <- FALSE
                           
                           if (nn_s == 0) {
                             nt_s <- "All NA"
                           } else {
                             is_zero_var_s <- (data.table::uniqueN(cvnn_s) <= 1)
                             if (is_zero_var_s) {
                               nt_s <- "Zero/Single Unique Value"
                               freq_ratio_s <- Inf # Or NA_real_ if preferred for single value
                               pct_unique_s <- if (nn_s > 0) (1 / nn_s) * 100 else 0 # Should be (uniqueN / nn_s) * 100 = (1/nn_s)*100
                             } else {
                               # Calculate freqRatio and percentUnique only if not zero var
                               counts_s <- table(cvnn_s)
                               sorted_counts_s <- sort(counts_s, decreasing = TRUE)
                               
                               if (length(sorted_counts_s) >= 2) {
                                 freq_ratio_s <- sorted_counts_s[1] / sorted_counts_s[2]
                               } else { # Only one unique value was present, should have been caught by is_zero_var_s logic
                                 freq_ratio_s <- Inf # Effectively, as second most common is 0
                               }
                               pct_unique_s <- (data.table::uniqueN(cvnn_s) / nn_s) * 100
                               meets_criteria_s <- (freq_ratio_s >= freq_cut && pct_unique_s <= unique_cut)
                             }
                           }
                           is_nzv_final_s <- is_zero_var_s || meets_criteria_s
                           dl_s[[cn_s]] <- data.table::data.table(
                             Var = cn_s, IsZeroVar = is_zero_var_s, FreqRatio = freq_ratio_s,
                             PctUniq = pct_unique_s, MeetsCriteria = meets_criteria_s, IsNZV = is_nzv_final_s, Note = nt_s
                           )
                         }
                         all_data_s <- data.table::rbindlist(dl_s, fill = TRUE)
                         # Filter for only those that are NZV
                         res_s <- all_data_s[get("IsNZV") == TRUE] # Use get() for clarity or ensure IsNZV is known to R CMD CHECK
                         
                         # Define expected columns for structure consistency, even if res_s is empty
                         expected_cols <- c("Var", "IsZeroVar", "FreqRatio", "PctUniq", "MeetsCriteria", "IsNZV", "Note")
                         if (!is.null(group_val_char)) expected_cols <- c(expected_cols, "Group_Level_Temp_Col")
                         
                         if (nrow(res_s) > 0) {
                           if (!is.null(group_val_char)) res_s[, Group_Level_Temp_Col := group_val_char]
                           # Select and reorder to expected_cols that are present
                           present_expected_cols <- intersect(expected_cols, names(res_s))
                           res_s <- res_s[, .SD, .SDcols = present_expected_cols]
                         } else { # Create empty table with correct structure if no NZV found
                           empty_dt_data <- lapply(stats::setNames(nm = expected_cols), function(x) vector()) # Create named list of empty vectors
                           # Ensure correct types if possible, or let data.table infer from empty vectors
                           # For simplicity, this creates logical(0) for all if not specified.
                           # A more robust empty table might define types more explicitly.
                           res_s <- data.table::as.data.table(empty_dt_data)
                           # Remove Group_Level_Temp_Col if group_val_char was NULL
                           if(is.null(group_val_char) && "Group_Level_Temp_Col" %in% names(res_s)) {
                             res_s[, Group_Level_Temp_Col := NULL]
                           }
                         }
                         return(res_s)
                       }
                       
                       if (!is.null(by_var)) {
                         private$.check_data_and_vars(by_var)
                         private$.check_var_type(by_var, c("character", "factor", "integer", "logical"))
                         temp_data_nzv <- data.table::copy(self$data)
                         if (!is.factor(temp_data_nzv[[by_var]])) temp_data_nzv[, (by_var) := as.factor(get(by_var))]
                         cols_to_check_nzv <- setdiff(names(temp_data_nzv), by_var)
                         
                         results_by_nzv <- temp_data_nzv[, .find_nzv_subset(.SD, group_val_char = as.character(.BY[[1]])), by = by_var, .SDcols = cols_to_check_nzv]
                         
                         if ("Group_Level_Temp_Col" %in% names(results_by_nzv)) {
                           data.table::setnames(results_by_nzv, "Group_Level_Temp_Col", by_var)
                           # Reorder to put by_var first, then Var
                           new_order <- c(by_var, setdiff(names(results_by_nzv), by_var))
                           if ("Var" %in% new_order) { # Ensure Var exists before trying to move it
                             new_order <- c(by_var, "Var", setdiff(new_order, c(by_var, "Var")))
                             data.table::setcolorder(results_by_nzv, new_order)
                           } else {
                             data.table::setcolorder(results_by_nzv, new_order) # Var might not be present if no NZV found
                           }
                         }
                         logger::log_info("Near-zero variance numeric variable search by group complete.")
                         return(results_by_nzv)
                       } else {
                         result_overall <- .find_nzv_subset(self$data)
                         logger::log_info("Near-zero variance numeric variable search complete.")
                         return(result_overall)
                       }
                     },
                     
                     #' @description
                     #' Find potential univariate predictors for a target variable.
                     #' Fits simple linear models (LM for numeric target, GLM for binary target)
                     #' for each predictor variable against the target.
                     #' @param target_var Character string. Name of the target variable.
                     #' @param p_value_threshold Numeric. Significance level to consider a predictor (default: 0.05).
                     #' @param by_var Optional. Character string. Name of a categorical variable to group the analysis by.
                     #' @return A data.table listing significant univariate predictors, their model type,
                     #'         statistic (R-squared or Pseudo R-squared), and p-value.
                     get_univariate_predictors = function(target_var, p_value_threshold = 0.05, by_var = NULL) {
                       logger::log_info(paste0("Finding univariate predictors for '", target_var, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "..."))
                       private$.check_data_and_vars(c(target_var, by_var)) # by_var can be NULL, handled by .check_data_and_vars
                       
                       .get_preds_subset <- function(dt_subset, current_target_var, group_val_char = NULL) {
                         preds_s <- setdiff(names(dt_subset), current_target_var)
                         if (length(preds_s) == 0) return(data.table::data.table())
                         
                         work_dt_s <- data.table::copy(dt_subset)
                         tgt_v_s <- work_dt_s[[current_target_var]]
                         tgt_t_s <- private$.get_var_type_simple(tgt_v_s) # Use private helper
                         
                         # Helper to create consistent logging prefix
                         log_prefix <- paste0("Subset ", if (is.null(group_val_char)) "Overall" else group_val_char, ": ")
                         
                         if (!tgt_t_s %in% c("numeric", "binary_categorical")) {
                           logger::log_warn(paste0(log_prefix, "Target '", current_target_var, "' type '", tgt_t_s, "' is not numeric or binary_categorical. Skipping."))
                           return(data.table::data.table())
                         }
                         
                         sig_preds_l_s <- list()
                         if (tgt_t_s == "binary_categorical") {
                           original_levels_s <- character(0)
                           if (is.factor(tgt_v_s)) original_levels_s <- levels(tgt_v_s)
                           else original_levels_s <- sort(as.character(unique(stats::na.omit(tgt_v_s))))
                           
                           if (length(original_levels_s) == 2) {
                             logger::log_debug(paste0(log_prefix, "Binary target '", current_target_var, "': '", original_levels_s[1], "' maps to 0, '", original_levels_s[2], "' maps to 1."))
                           }
                           # Coerce target to 0/1 integer
                           work_dt_s[, (current_target_var) := as.integer(as.factor(get(current_target_var))) - 1]
                           unique_post_conversion_s <- data.table::uniqueN(work_dt_s[[current_target_var]], na.rm = TRUE)
                           if (unique_post_conversion_s > 0 && unique_post_conversion_s != 2 && !(unique_post_conversion_s == 1 && sum(!is.na(work_dt_s[[current_target_var]])) > 0) ) { # Allow 1 unique if all non-NA are same
                             logger::log_error(paste0(log_prefix, "Target '", current_target_var, "' failed 0/1 conversion or resulted in invalid binary. Unique values post-conversion: ", unique_post_conversion_s))
                             return(data.table::data.table())
                           }
                         }
                         
                         for (pred_v_s in preds_s) {
                           # Create dataset with only target and current predictor, removing NAs for the pair
                           pair_dt_s <- work_dt_s[stats::complete.cases(work_dt_s[, .SD, .SDcols = c(current_target_var, pred_v_s)]), .SD, .SDcols = c(current_target_var, pred_v_s)]
                           
                           if (nrow(pair_dt_s) < 10) { # Minimum observations for modeling
                             logger::log_debug(paste0(log_prefix, "Skipping '", pred_v_s, "' due to <10 complete obs with target."))
                             next
                           }
                           
                           # Check target variance within this pair_dt_s
                           if (data.table::uniqueN(pair_dt_s[[current_target_var]]) < 2) {
                             logger::log_debug(paste0(log_prefix, "Target '", current_target_var, "' has <2 unique values for predictor '", pred_v_s, "'. Skipping model."))
                             next
                           }
                           
                           pred_mdl_v_s <- pair_dt_s[[pred_v_s]]
                           pred_orig_t_s <- private$.get_var_type_simple(self$data[[pred_v_s]]) # Original type from self$data
                           pred_mdl_t_s <- private$.get_var_type_simple(pred_mdl_v_s) # Type within the current modeling subset
                           
                           # Prepare predictor: convert categoricals to factor, skip if unsuitable
                           if (pred_mdl_t_s == "numeric") {
                             if (data.table::uniqueN(pred_mdl_v_s) <= 1) { # Skip if predictor has no variance in this subset
                               logger::log_debug(paste0(log_prefix, "Numeric predictor '", pred_v_s, "' has <=1 unique value. Skipping."))
                               next
                             }
                           } else if (pred_mdl_t_s %in% c("categorical", "binary_categorical")) {
                             if (data.table::uniqueN(pred_mdl_v_s) < 2) { # Skip if <2 levels for categorical predictor
                               logger::log_debug(paste0(log_prefix, "Categorical predictor '", pred_v_s, "' has <2 unique levels. Skipping."))
                               next
                             }
                             pair_dt_s[, (pred_v_s) := as.factor(get(pred_v_s))] # Convert to factor for modeling
                           } else if (pred_mdl_t_s %in% c("other_all_na", "empty_vector")) {
                             logger::log_debug(paste0(log_prefix, "Predictor '", pred_v_s, "' is all NA or empty. Skipping."))
                             next
                           } else { # Other types not suitable for direct modeling
                             logger::log_debug(paste0(log_prefix, "Predictor '", pred_v_s, "' type '", pred_mdl_t_s, "' not suitable for modeling. Skipping."))
                             next
                           }
                           
                           fmla_s <- stats::as.formula(paste0("`", current_target_var, "`~`", pred_v_s, "`"))
                           tryCatch({
                             mdl_s <- NULL
                             p_value_s <- NA_real_
                             stat_value_s <- NA_real_
                             stat_name_s <- "N/A"
                             
                             if (tgt_t_s == "numeric") { # LM for numeric target
                               mdl_s <- stats::lm(fmla_s, data = pair_dt_s)
                               mdl_summary_s <- summary(mdl_s)
                               if (is.numeric(pair_dt_s[[pred_v_s]])) { # Numeric predictor
                                 # Handle potential backticks in coefficient names
                                 coef_name_s <- pred_v_s 
                                 if (!coef_name_s %in% rownames(mdl_summary_s$coefficients)) coef_name_s <- paste0("`", pred_v_s, "`")
                                 
                                 if (coef_name_s %in% rownames(mdl_summary_s$coefficients)) {
                                   p_value_s <- mdl_summary_s$coefficients[coef_name_s, "Pr(>|t|)"]
                                   stat_value_s <- mdl_summary_s$r.squared
                                   stat_name_s <- "R_Sq"
                                 }
                               } else { # Factor predictor (ANOVA from LM)
                                 aov_s <- stats::anova(mdl_s)
                                 # Predictor row name in anova table
                                 pred_row_name_s <- pred_v_s
                                 if (!pred_row_name_s %in% rownames(aov_s)) pred_row_name_s <- paste0("`",pred_v_s,"`")
                                 
                                 # Check for common p-value column names in anova output
                                 p_val_col_name_s <- intersect(c("Pr(>F)", "`Pr(>F)`"), names(aov_s))[1]
                                 
                                 if (!is.na(p_val_col_name_s) && pred_row_name_s %in% rownames(aov_s)) {
                                   p_value_s <- aov_s[pred_row_name_s, p_val_col_name_s]
                                   stat_value_s <- mdl_summary_s$r.squared # Overall R-squared
                                   stat_name_s <- "R_Sq (Factor)"
                                 }
                               }
                             } else { # GLM for binary target (already 0/1 coded)
                               mdl_s <- stats::glm(fmla_s, data = pair_dt_s, family = "binomial")
                               null_mdl_s <- stats::glm(stats::as.formula(paste0("`", current_target_var, "`~1")), data = pair_dt_s, family = "binomial")
                               # Use likelihood ratio test
                               lrt_s <- suppressWarnings(stats::anova(null_mdl_s, mdl_s, test = "LRT"))
                               if (nrow(lrt_s) == 2 && "Pr(>Chi)" %in% names(lrt_s)) {
                                 p_value_s <- lrt_s$"Pr(>Chi)"[2] # P-value from LRT
                                 # McFadden's Pseudo R-squared
                                 logLik_full_s <- tryCatch(stats::logLik(mdl_s), error = function(e) NA)
                                 logLik_null_s <- tryCatch(stats::logLik(null_mdl_s), error = function(e) NA)
                                 if (!is.na(logLik_full_s) && !is.na(logLik_null_s)) {
                                   if (logLik_null_s != 0) stat_value_s <- 1 - (as.numeric(logLik_full_s) / as.numeric(logLik_null_s))
                                   else if (logLik_full_s == logLik_null_s) stat_value_s <- 0 # Handles perfect null model case
                                   else stat_value_s <- NA # Should not happen if logLiks are valid
                                 } else {
                                   stat_value_s <- NA
                                 }
                                 stat_name_s <- "Pseudo_R_Sq_McF"
                               }
                             }
                             
                             if (!is.na(p_value_s) && p_value_s < p_value_threshold) {
                               sig_preds_l_s[[paste(current_target_var, pred_v_s, sep = "_vs_")]] <-
                                 data.table::data.table(
                                   Target = current_target_var, Predictor = pred_v_s, PredictorOriginalType = pred_orig_t_s,
                                   ModelType = if (tgt_t_s == "numeric") "LM" else "GLM (Binomial)",
                                   StatisticName = stat_name_s, StatisticValue = stat_value_s, PValue = p_value_s,
                                   N_Obs_Model = nrow(pair_dt_s)
                                 )
                             }
                           }, error = function(e) {
                             logger::log_warn(paste0(log_prefix, "Modeling error for '", current_target_var, "' ~ '", pred_v_s, "': ", e$message))
                           })
                         }
                         res_s <- data.table::rbindlist(sig_preds_l_s, fill = TRUE, idcol = FALSE)
                         if (nrow(res_s) > 0) data.table::setorderv(res_s, "PValue")
                         if (!is.null(group_val_char) && nrow(res_s) > 0) res_s[, Group_Level_Temp_Col := group_val_char]
                         return(res_s)
                       }
                       
                       if (!is.null(by_var)) {
                         temp_data_preds <- data.table::copy(self$data)
                         if (!is.factor(temp_data_preds[[by_var]])) temp_data_preds[, (by_var) := as.factor(get(by_var))]
                         
                         # Predictors are all columns except target and by_var
                         potential_predictors <- setdiff(names(self$data), c(target_var, by_var))
                         # Columns needed for .SD are the target and the potential predictors
                         cols_to_pass_to_sd <- c(target_var, potential_predictors)
                         
                         # Run .get_preds_subset for each group defined by by_var
                         results_by_preds <- temp_data_preds[, .get_preds_subset(.SD, current_target_var = target_var, group_val_char = as.character(.BY[[1]])), 
                                                             by = by_var, .SDcols = cols_to_pass_to_sd]
                         
                         if ("Group_Level_Temp_Col" %in% names(results_by_preds)) {
                           data.table::setnames(results_by_preds, "Group_Level_Temp_Col", by_var)
                           # Set column order: by_var, then Predictor, then others
                           if ("Predictor" %in% names(results_by_preds)) {
                             new_order <- c(by_var, "Predictor", setdiff(names(results_by_preds), c(by_var, "Predictor")))
                             data.table::setcolorder(results_by_preds, new_order)
                           } else if (by_var %in% names(results_by_preds)) { # if no predictors found, just by_var might be there
                             data.table::setcolorder(results_by_preds, c(by_var, setdiff(names(results_by_preds), by_var)))
                           }
                         }
                         logger::log_info("Univariate predictor search by group complete.")
                         return(results_by_preds)
                       } else {
                         # For overall analysis, columns needed are target and all other columns as potential predictors
                         potential_predictors <- setdiff(names(self$data), target_var)
                         cols_for_overall_analysis <- c(target_var, potential_predictors)
                         result_overall <- .get_preds_subset(self$data[, .SD, .SDcols = cols_for_overall_analysis], current_target_var = target_var)
                         logger::log_info("Overall univariate predictor search complete.")
                         return(result_overall)
                       }
                     },
                     
                     #' @description
                     #' Perform bivariate analysis between two variables.
                     #' Selects appropriate statistical tests and plots based on variable types.
                     #' (Numeric-Numeric: Correlation, Scatterplot; Numeric-Categorical: T-test/ANOVA, Boxplot;
                     #' Categorical-Categorical: Chi-squared, Barplot).
                     #' @param var1 Character string. Name of the first variable.
                     #' @param var2 Character string. Name of the second variable.
                     #' @param by_var Optional. Character string. Name of a categorical variable to group the analysis by.
                     #'        If `by_var` is used, plots are not generated by this function directly;
                     #'        instead, use specific plotting methods with `by_var` for faceted plots.
                     #' @return A list containing:
                     #' \itemize{
                     #'   \item `summary`: A data.table with results of the statistical test. If `by_var` is used, this becomes `summary_table_by_group`.
                     #'   \item `plot`: A ggplot object (if `by_var` is NULL and plot is applicable).
                     #'   \item `contingency_table`: A table object for Cat-Cat analysis (if `by_var` is NULL).
                     #' }
                     get_bivariate_analysis = function(var1, var2, by_var = NULL) {
                       logger::log_info(paste0("Bivariate analysis for '", var1, "' vs '", var2, if (!is.null(by_var)) paste0("' by '", by_var, "'") else "", "'..."))
                       private$.check_data_and_vars(c(var1, var2, by_var)) # by_var can be NULL
                       
                       .get_bivar_subset <- function(dt_subset, current_var1, current_var2, group_val_char = NULL, outer_by_var_name_for_plot = NULL) {
                         # Ensure we are working with a copy and only relevant columns
                         adt_s <- data.table::copy(dt_subset[, .SD, .SDcols = c(current_var1, current_var2)])
                         
                         t1i_s <- private$.get_var_type_simple(adt_s[[current_var1]])
                         t2i_s <- private$.get_var_type_simple(adt_s[[current_var2]])
                         
                         v1c_s <- current_var1; v2c_s <- current_var2
                         t1c_s <- t1i_s; t2c_s <- t2i_s
                         
                         # Convention: if one numeric and one categorical, numeric is v1c_s
                         if (((t1c_s %in% c("categorical", "binary_categorical")) && t2c_s == "numeric")) {
                           v1_temp_s <- v1c_s; t1_temp_s <- t1c_s
                           v1c_s <- v2c_s; t1c_s <- t2c_s # v1 is now numeric
                           v2c_s <- v1_temp_s; t2c_s <- t1_temp_s # v2 is now categorical
                         }
                         
                         # Convert character/logical categoricals to factor for modeling/plotting consistency
                         if (t1c_s %in% c("categorical", "binary_categorical", "other_all_na") && (is.character(adt_s[[v1c_s]]) || is.logical(adt_s[[v1c_s]]))) {
                           adt_s[, (v1c_s) := as.factor(get(v1c_s))]
                         }
                         if (t2c_s %in% c("categorical", "binary_categorical", "other_all_na") && (is.character(adt_s[[v2c_s]]) || is.logical(adt_s[[v2c_s]]))) {
                           adt_s[, (v2c_s) := as.factor(get(v2c_s))]
                         }
                         
                         # Final types after potential coercion
                         t1f_s <- private$.get_var_type_simple(adt_s[[v1c_s]])
                         t2f_s <- private$.get_var_type_simple(adt_s[[v2c_s]])
                         
                         res_s <- list(plot = NULL, summary = data.table::data.table(), contingency_table = NULL)
                         adt_comp_s <- adt_s[stats::complete.cases(adt_s[, .SD, .SDcols = c(v1c_s, v2c_s)])]
                         
                         min_obs_s <- 5 # General minimum
                         if (t1f_s == "numeric" && (t2f_s %in% c("categorical", "binary_categorical"))) {
                           min_obs_s <- max(5, 2 * data.table::uniqueN(adt_comp_s[[v2c_s]])) # Need enough per group for t-test/ANOVA
                         } else if ((t1f_s %in% c("categorical", "binary_categorical")) && (t2f_s %in% c("categorical", "binary_categorical"))) {
                           # For chi-squared, need expected counts > 5 in most cells, but raw N is a first check
                           min_obs_s <- 10 
                         }
                         
                         
                         if (nrow(adt_comp_s) < min_obs_s) {
                           res_s$summary <- data.table::data.table(Var1 = v1c_s, Var2 = v2c_s, N_Obs_Complete = nrow(adt_comp_s), Note = "Insufficient data for reliable analysis.")
                           return(res_s)
                         }
                         
                         # Check variance for target/response variables
                         if (t1f_s == "numeric" && data.table::uniqueN(adt_comp_s[[v1c_s]]) < 2) {
                           res_s$summary <- data.table::data.table(Var1 = v1c_s, Var2 = v2c_s, N_Obs_Complete = nrow(adt_comp_s), Note = paste0("Numeric variable '",v1c_s,"' has < 2 unique values."))
                           return(res_s)
                         }
                         if (t2f_s == "numeric" && data.table::uniqueN(adt_comp_s[[v2c_s]]) < 2 && t1f_s=="numeric") { # Only if both numeric
                           res_s$summary <- data.table::data.table(Var1 = v1c_s, Var2 = v2c_s, N_Obs_Complete = nrow(adt_comp_s), Note = paste0("Numeric variable '",v2c_s,"' has < 2 unique values."))
                           return(res_s)
                         }
                         
                         
                         base_info_s <- data.table::data.table(Var1_Analysis = v1c_s, Var2_Analysis = v2c_s, N_Obs_Complete = nrow(adt_comp_s))
                         test_summary_dt_s <- data.table::data.table()
                         
                         # Generate plot only if not part of a `by_var` operation at the outer level
                         generate_plot_here <- is.null(outer_by_var_name_for_plot)
                         
                         if (t1f_s == "numeric" && t2f_s == "numeric") { # Numeric vs Numeric
                           if (generate_plot_here) {
                             # Call the class's own plotting method for consistency
                             res_s$plot <- tryCatch(self$plot_scatterplot(x_var = v1c_s, y_var = v2c_s, add_smooth = TRUE), # Assuming self is accessible or pass GoodeR obj
                                                    error = function(e) { logger::log_warn(paste("Bivar scatterplot error:", e$message)); NULL })
                           }
                           cor_test_res_s <- tryCatch(stats::cor.test(adt_comp_s[[v1c_s]], adt_comp_s[[v2c_s]]), error = function(e) NULL)
                           if (!is.null(cor_test_res_s)) {
                             test_summary_dt_s <- data.table::data.table(Method = cor_test_res_s$method, StatisticName = "Correlation", StatisticValue = cor_test_res_s$estimate, PValue = cor_test_res_s$p.value)
                           } else {
                             test_summary_dt_s <- data.table::data.table(Note = "Correlation test failed.")
                           }
                         } else if (t1f_s == "numeric" && (t2f_s %in% c("categorical", "binary_categorical"))) { # Numeric vs Categorical
                           if (generate_plot_here) {
                             res_s$plot <- tryCatch(self$plot_boxplot(numeric_var = v1c_s, categorical_var = v2c_s),
                                                    error = function(e) { logger::log_warn(paste("Bivar boxplot error:", e$message)); NULL })
                           }
                           fmla_s <- stats::as.formula(paste0("`", v1c_s, "`~`", v2c_s, "`"))
                           num_levels_s <- data.table::uniqueN(adt_comp_s[[v2c_s]]) # Already factor
                           
                           if (num_levels_s == 2) {
                             ttest_res_s <- tryCatch(stats::t.test(fmla_s, data = adt_comp_s), error = function(e) NULL)
                             if (!is.null(ttest_res_s)) {
                               test_summary_dt_s <- data.table::data.table(Method = ttest_res_s$method, StatisticName = "t-statistic", StatisticValue = ttest_res_s$statistic, PValue = ttest_res_s$p.value, DF = round(ttest_res_s$parameter, 2))
                             } else test_summary_dt_s <- data.table::data.table(Note = "T-test failed.")
                           } else if (num_levels_s > 2) {
                             # Check if all groups have at least 2 observations
                             group_counts <- adt_comp_s[, .N, by = v2c_s]
                             if(all(group_counts$N >=2) && data.table::uniqueN(adt_comp_s[[v1c_s]]) > 1) { # also ensure numeric var has variance
                               aov_res_s <- tryCatch(stats::aov(fmla_s, data = adt_comp_s), error = function(e) NULL)
                               if (!is.null(aov_res_s)) {
                                 summary_aov_s <- summary(aov_res_s)
                                 if (length(summary_aov_s) > 0 && is.data.frame(summary_aov_s[[1]]) && "F value" %in% colnames(summary_aov_s[[1]])) {
                                   test_summary_dt_s <- data.table::data.table(Method = "ANOVA", StatisticName = "F-statistic", StatisticValue = summary_aov_s[[1]]$`F value`[1], PValue = summary_aov_s[[1]]$`Pr(>F)`[1], DF_Numerator = summary_aov_s[[1]]$Df[1], DF_Denominator = summary_aov_s[[1]]$Df[2])
                                 } else test_summary_dt_s <- data.table::data.table(Note = "ANOVA summary uninterpretable.")
                               } else test_summary_dt_s <- data.table::data.table(Note = "ANOVA failed.")
                             } else {
                               test_summary_dt_s <- data.table::data.table(Method="ANOVA", Note="Not all groups have sufficient data (>=2 obs) or numeric var lacks variance for ANOVA.")
                             }
                           } else { # < 2 levels
                             test_summary_dt_s <- data.table::data.table(Method = "N/A", Note = "Categorical variable has < 2 levels for comparison.")
                           }
                         } else if ((t1f_s %in% c("categorical", "binary_categorical")) && (t2f_s %in% c("categorical", "binary_categorical"))) { # Categorical vs Categorical
                           if (generate_plot_here) {
                             res_s$plot <- tryCatch(self$plot_barplot_counts(categorical_var = v1c_s, fill_var = v2c_s),
                                                    error = function(e) { logger::log_warn(paste("Bivar barplot error:", e$message)); NULL })
                           }
                           contingency_tbl_s <- table(adt_comp_s[[v1c_s]], adt_comp_s[[v2c_s]])
                           res_s$contingency_table <- contingency_tbl_s # Store table regardless of test
                           
                           if (nrow(contingency_tbl_s) < 2 || ncol(contingency_tbl_s) < 2) {
                             test_summary_dt_s <- data.table::data.table(Method = "Chi-squared Test", Note = "Contingency table is smaller than 2x2. Test not applicable.")
                           } else {
                             chisq_res_s <- NULL; chisq_warning_s <- ""
                             tryCatch(
                               {
                                 # Suppress warnings specifically from chisq.test if desired, or capture them
                                 withCallingHandlers(
                                   chisq_res_s <- stats::chisq.test(contingency_tbl_s),
                                   warning = function(w) {
                                     chisq_warning_s <<- paste("Chi-sq warning:", w$message) # Capture warning
                                     invokeRestart("muffleWarning") # Suppress it from console
                                   }
                                 )
                               },
                               error = function(e) { chisq_res_s <<- NULL } # Ensure result is NULL on error
                             )
                             if (!is.null(chisq_res_s)) {
                               test_summary_dt_s <- data.table::data.table(Method = "Pearson's Chi-squared Test", StatisticName = "X-squared", StatisticValue = chisq_res_s$statistic, DF = chisq_res_s$parameter, PValue = chisq_res_s$p.value, Note = chisq_warning_s)
                             } else {
                               test_summary_dt_s <- data.table::data.table(Method = "Chi-squared Test", Error = "Could not compute Chi-squared test.", Note = chisq_warning_s)
                             }
                           }
                         } else { # Unhandled type combination
                           msg_s <- paste0("Bivariate analysis for type '", t1f_s, "' vs '", t2f_s, "' is not currently handled.")
                           logger::log_warn(msg_s)
                           test_summary_dt_s <- data.table::data.table(Note = msg_s)
                         }
                         
                         res_s$summary <- cbind(base_info_s, test_summary_dt_s)
                         if (!is.null(group_val_char) && nrow(res_s$summary) > 0) res_s$summary[, Group_Level_Temp_Col := group_val_char]
                         
                         return(res_s)
                       }
                       
                       if (!is.null(by_var)) {
                         temp_data_bivar <- data.table::copy(self$data)
                         if (!is.factor(temp_data_bivar[[by_var]])) temp_data_bivar[, (by_var) := as.factor(get(by_var))]
                         
                         # Columns needed for the subset operation
                         cols_for_bivar_subset <- unique(c(var1, var2)) # Do not include by_var here, it's handled by `by`
                         
                         # Process each group
                         # The result of the expression in j must be a list for data.table when by is used and j returns multiple rows / complex objects
                         results_list_of_lists <- temp_data_bivar[, list(bivar_result = list(.get_bivar_subset(.SD, current_var1 = var1, current_var2 = var2, group_val_char = as.character(.BY[[1]]), outer_by_var_name_for_plot = by_var))), 
                                                                  by = by_var, .SDcols = cols_for_bivar_subset]
                         
                         # Combine all summary tables
                         all_summaries_dt <- data.table::rbindlist(lapply(results_list_of_lists$bivar_result, function(x) x$summary), fill = TRUE, use.names = TRUE)
                         
                         if ("Group_Level_Temp_Col" %in% names(all_summaries_dt)) {
                           data.table::setnames(all_summaries_dt, "Group_Level_Temp_Col", by_var)
                           # Standard column order
                           ordered_cols <- c(by_var, "Var1_Analysis", "Var2_Analysis", "N_Obs_Complete", "Method", "StatisticName", "StatisticValue", "PValue", "DF", "DF_Numerator", "DF_Denominator", "Note", "Error")
                           present_ordered_cols <- intersect(ordered_cols, names(all_summaries_dt))
                           data.table::setcolorder(all_summaries_dt, present_ordered_cols)
                         }
                         
                         logger::log_info("Bivariate analysis by group complete. For plots, use specific plot methods with by_var for faceting.")
                         # When by_var is used, individual plots per group are not generated by this method.
                         # Contingency tables per group could be extracted if needed but are not aggregated here.
                         return(list(summary_table_by_group = all_summaries_dt, plot = NULL, contingency_table = NULL))
                       } else {
                         # Overall analysis (no by_var)
                         overall_bivar_result <- .get_bivar_subset(self$data[, .SD, .SDcols = c(var1, var2)], current_var1 = var1, current_var2 = var2, outer_by_var_name_for_plot = NULL)
                         logger::log_info("Overall bivariate analysis complete.")
                         return(overall_bivar_result)
                       }
                     }
                   ),
                   
                   private = list(
                     .check_data_and_vars = function(vars) {
                       # Filter out NULL elements from vars, common if optional vars are passed directly
                       vars_to_check <- vars[!sapply(vars, is.null)]
                       if (length(vars_to_check) == 0) return(invisible(NULL))
                       
                       if (is.null(self$data)) {
                         logger::log_error("Data not loaded.")
                         stop("Data not loaded.")
                       }
                       for (v_nm in vars_to_check) {
                         if (!v_nm %in% names(self$data)) {
                           logger::log_error(paste0("Variable '", v_nm, "' not found in the data."))
                           stop(paste0("Variable '", v_nm, "' not found."))
                         }
                       }
                       invisible(NULL)
                     },
                     .check_var_type = function(var_name, expected_types) {
                       if (is.null(var_name)) return(invisible(NULL)) # Allow optional vars to be NULL
                       
                       actual_cls <- class(self$data[[var_name]])
                       is_expected <- any(sapply(expected_types, function(type_str) {
                         switch(type_str,
                                "numeric" = is.numeric(self$data[[var_name]]),
                                "integer" = is.integer(self$data[[var_name]]),
                                "logical" = is.logical(self$data[[var_name]]),
                                "character" = is.character(self$data[[var_name]]),
                                "factor" = is.factor(self$data[[var_name]]),
                                "date" = inherits(self$data[[var_name]], "Date"),
                                "datetime" = inherits(self$data[[var_name]], "POSIXct"),
                                FALSE # Default if type_str not matched
                         )
                       }))
                       if (!is_expected) {
                         msg <- paste0("Variable '", var_name, "' has type '", paste(actual_cls, collapse = ","),
                                       "', which is not among the expected types: '", paste(expected_types, collapse = ", "), "'.")
                         logger::log_error(msg)
                         stop(msg)
                       }
                       invisible(NULL)
                     },
                     # Simplified type checker for internal modeling logic
                     .get_var_type_simple = function(variable_vector) {
                       if (!is.atomic(variable_vector) && !inherits(variable_vector, c("Date", "POSIXct", "difftime"))) return("other_non_atomic")
                       if (length(variable_vector) == 0) return("empty_vector")
                       
                       var_no_na <- variable_vector[!is.na(variable_vector)]
                       if (length(var_no_na) == 0) { # All NA values
                         # Try to infer from the original vector's class
                         if (is.logical(variable_vector)) return("binary_categorical") # All NA logical still binary
                         if (is.numeric(variable_vector)) return("numeric") # All NA numeric still numeric
                         if (is.factor(variable_vector)) {
                           return(if (length(levels(variable_vector)) == 2) "binary_categorical" else "categorical")
                         }
                         if (is.character(variable_vector)) return("categorical") # All NA char still categorical
                         if (inherits(variable_vector, c("Date", "POSIXct"))) return("date")
                         return("other_all_na")
                       }
                       
                       # Has non-NA values
                       if (is.logical(variable_vector)) return("binary_categorical")
                       if (is.numeric(variable_vector)) {
                         # Check for 0/1 numerics or two unique values that could be binary
                         if (all(var_no_na %in% c(0, 1)) && data.table::uniqueN(var_no_na) <= 2) return("binary_categorical")
                         if (data.table::uniqueN(var_no_na) == 2) return("binary_categorical") # e.g. (1, 5) only
                         return("numeric")
                       }
                       if (is.factor(variable_vector)) {
                         return(if (length(levels(variable_vector)) == 2) "binary_categorical" else "categorical")
                       }
                       if (is.character(variable_vector)) {
                         return(if (data.table::uniqueN(var_no_na) == 2) "binary_categorical" else "categorical")
                       }
                       if (inherits(variable_vector, c("Date", "POSIXct"))) return("date")
                       
                       return("other") # Fallback
                     }
                   )
)

# Helper to replace rlang::`%||%`
`%||%` <- function(a, b) {
  if (is.null(a)) b else a
}
abmathewks/GoodeR documentation built on June 12, 2025, 1:48 a.m.