R/desc_utils.R

Defines functions reorder_total_column format_pvalues_desctable process_survival perform_categorical_test process_categorical get_stat_label format_continuous_stat perform_continuous_test add_raw_stats process_continuous process_variable

Documented in add_raw_stats format_continuous_stat format_pvalues_desctable get_stat_label perform_categorical_test perform_continuous_test process_categorical process_continuous process_survival process_variable reorder_total_column

#' Process variable wrapper
#' 
#' Routes variable processing to appropriate handler based on variable type
#' (continuous, categorical, or survival). Returns both formatted display
#' strings and raw numeric values.
#' 
#' @param data Data.table containing the variable.
#' @param var Character string naming the variable to process.
#' @param group_var Optional character string naming the grouping variable.
#' @param stats_continuous Character vector of statistics for continuous variables.
#' @param stats_categorical Character vector of statistics for categorical variables.
#' @param digits Integer number of decimal places for continuous statistics.
#' @param conf_level Numeric confidence level for survival confidence intervals.
#' @param na_include Logical whether to include missing values as a category.
#' @param na_label Character string label for missing values.
#' @param test Logical whether to perform statistical tests.
#' @param test_continuous Character string specifying test type for continuous variables.
#' @param test_categorical Character string specifying test type for categorical variables.
#' @param total Logical or character controlling total column display.
#' @param total_label Character string label for total column.
#' @param labels Named character vector of variable labels.
#' @param na_percent Logical whether to include NA in percentage denominators.
#' @param p_per_stat Logical whether to show separate \emph{p}-values per statistic for
#'   continuous variables. Default \code{FALSE} for better performance.
#' @param marks List with \code{big.mark} and \code{decimal.mark} as returned
#'   by \code{\link{resolve_number_marks}}.
#' @param ... Additional arguments passed to test functions.
#' @return List with 'formatted' and 'raw' data.table components.
#' @keywords internal
process_variable <- function(data, var, group_var = NULL, 
                             stats_continuous, stats_categorical,
                             digits, conf_level = 0.95, na_include, na_label,
                             test, test_continuous, test_categorical,
                             total, total_label, labels, na_percent, 
                             p_per_stat = FALSE, marks = NULL, ...) {
    
    ## Get variable label
    var_label <- if (!is.null(labels) && var %chin% names(labels)) {
                     labels[var]
                 } else {
                     var
                 }
    
    ## Determine variable type and process accordingly
    if (grepl("^Surv\\(", var)) {
        return(process_survival(
            data = data,
            var = var,
            var_label = var_label,
            group_var = group_var,
            digits = digits,
            conf_level = conf_level,
            na_include = na_include,
            na_label = na_label,
            test = test,
            total = total,
            total_label = total_label,
            marks = marks,
            ...
        ))
    } else if (is.numeric(data[[var]])) {
        return(process_continuous(
            data = data,
            var = var,
            var_label = var_label,
            group_var = group_var,
            stats = stats_continuous,
            digits = digits,
            na_include = na_include,
            na_label = na_label,
            test = test,
            test_type = test_continuous,
            total = total,
            total_label = total_label,
            p_per_stat = p_per_stat,
            marks = marks,
            ...
        ))
    } else {
        return(process_categorical(
            data = data,
            var = var,
            var_label = var_label,
            group_var = group_var,
            stats = stats_categorical,
            na_include = na_include,
            na_label = na_label,
            test = test,
            test_type = test_categorical,
            total = total,
            total_label = total_label,
            na_percent = na_percent,
            marks = marks,
            ...
        ))
    }
}

#' Process continuous variable
#' 
#' Calculates descriptive statistics for continuous numeric variables, with
#' optional grouping and statistical testing. Supports multiple summary
#' statistics (mean \eqn{\pm} SD, median [IQR], range) and various hypothesis tests.
#' 
#' @param data Data.table containing the variable.
#' @param var Character string naming the variable to process.
#' @param var_label Character string label for display.
#' @param group_var Optional character string naming the grouping variable.
#' @param stats Character vector of statistics to calculate.
#' @param digits Integer number of decimal places.
#' @param na_include Logical whether to include missing values.
#' @param na_label Character string label for missing values.
#' @param test Logical whether to perform statistical tests.
#' @param test_type Character string specifying test type.
#' @param total Logical or character controlling total column display.
#' @param total_label Character string label for total column.
#' @param p_per_stat Logical. If TRUE, calculate separate \emph{p}-values for each 
#'   statistic type (\emph{e.g.,} t-test for means, Wilcoxon for medians). If \code{FALSE}
#'   (default), calculate a single \emph{p}-value based on the first statistic type.
#' @param ... Additional arguments passed to test functions.
#' @return List with 'formatted' and 'raw' data.table components.
#' @keywords internal
process_continuous <- function(data, var, var_label, group_var, stats, digits,
                               na_include, na_label, test, test_type,
                               total, total_label, p_per_stat = FALSE,
                               marks = NULL, ...) {
    
    ## Pre-extract the variable column
    var_vec <- data[[var]]
    not_na <- !is.na(var_vec)
    total_vals <- var_vec[not_na]
    total_n <- length(total_vals)
    
    ## Calculate total statistics
    if (total_n > 0) {
        total_fivenum <- fivenum(total_vals)
        total_stats <- list(
            mean = mean(total_vals),
            sd = sd(total_vals),
            median = total_fivenum[3],
            q1 = total_fivenum[2],
            q3 = total_fivenum[4],
            min = total_fivenum[1],
            max = total_fivenum[5],
            n = total_n
        )
    } else {
        total_stats <- list(mean = NA_real_, sd = NA_real_, median = NA_real_,
                            q1 = NA_real_, q3 = NA_real_, min = NA_real_, 
                            max = NA_real_, n = 0L)
    }
    
    ## Calculate group statistics (if grouped)
    group_stats_list <- NULL
    groups <- NULL
    
    if (!is.null(group_var)) {
        grp_vec <- data[[group_var]]
        if (is.factor(grp_vec)) {
            groups <- levels(grp_vec)
        } else {
            groups <- unique(grp_vec)
            groups <- groups[!is.na(groups)]
        }
        
        ## Compute statistics for each group
        group_stats_list <- lapply(groups, function(g) {
            idx <- which(grp_vec == g & not_na)
            if (length(idx) == 0) {
                return(list(mean = NA_real_, sd = NA_real_, median = NA_real_,
                           q1 = NA_real_, q3 = NA_real_, min = NA_real_, 
                           max = NA_real_, n = 0L))
            }
            vals <- var_vec[idx]
            fn <- fivenum(vals)
            list(
                mean = mean(vals),
                sd = sd(vals),
                median = fn[3],
                q1 = fn[2],
                q3 = fn[4],
                min = fn[1],
                max = fn[5],
                n = length(vals)
            )
        })
        names(group_stats_list) <- as.character(groups)
    }
    
    ## Calculate p-values
    p_values <- list()
    p_value_single <- NULL
    if (test && !is.null(group_var) && length(groups) >= 2) {
        grp_vec <- data[[group_var]]
        
        if (p_per_stat) {
            ## Calculate separate p-value for each statistic type
            for (stat_type in stats) {
                if (stat_type == "range") {
                    p_values[[stat_type]] <- NULL
                } else {
                    p_values[[stat_type]] <- perform_continuous_test(
                        var_vec, grp_vec, test_type, stat_type)
                }
            }
        } else {
            ## Calculate single p-value based on first non-range statistic
            first_stat <- stats[stats != "range"][1]
            if (is.na(first_stat)) first_stat <- "median_iqr"
            p_value_single <- perform_continuous_test(
                var_vec, grp_vec, test_type, first_stat)
        }
    }
    
    ## Build output
    n_stats <- length(stats)
    has_missing <- na_include && any(!not_na)
    n_rows <- n_stats + if (has_missing) 1L else 0L
    
    ## Compute format string
    fmt_str <- paste0("%.", digits, "f")
    
    formatted_list <- vector("list", n_rows)
    raw_list <- vector("list", n_rows)
    
    for (i in seq_along(stats)) {
        stat_type <- stats[i]
        
        formatted_row <- list(
            variable = if (i == 1L) var_label else "",
            level = get_stat_label(stat_type)
        )
        
        raw_row <- list(
            variable = if (i == 1L) var_label else "",
            level = stat_type,
            stat_type = stat_type
        )
        
        ## Add total column
        if (!isFALSE(total)) {
            formatted_row[[total_label]] <- format_continuous_stat(
                total_stats, stat_type, fmt_str, marks)
            raw_row <- add_raw_stats(raw_row, total_label, total_stats, stat_type)
        }
        
        ## Add group columns
        if (!is.null(group_var)) {
            for (g in groups) {
                grp_col <- as.character(g)
                grp_stats <- group_stats_list[[grp_col]]
                formatted_row[[grp_col]] <- format_continuous_stat(
                    grp_stats, stat_type, fmt_str, marks)
                raw_row <- add_raw_stats(raw_row, grp_col, grp_stats, stat_type)
            }
            
            ## Add p-value
            if (test && stat_type != "range") {
                if (p_per_stat && !is.null(p_values[[stat_type]])) {
                    ## Per-statistic p-values
                    formatted_row[["p_value"]] <- p_values[[stat_type]]
                    raw_row[["p_value"]] <- p_values[[stat_type]]
                } else if (!p_per_stat && i == 1L && !is.null(p_value_single)) {
                    ## Single p-value on first row only
                    formatted_row[["p_value"]] <- p_value_single
                    raw_row[["p_value"]] <- p_value_single
                }
            }
        }
        
        formatted_list[[i]] <- data.table::as.data.table(formatted_row)
        raw_list[[i]] <- data.table::as.data.table(raw_row)
    }
    
    ## Missing row
    if (has_missing) {
        n_miss_total <- sum(!not_na)
        miss_formatted <- list(variable = "", level = na_label)
        miss_raw <- list(variable = "", level = na_label, stat_type = "missing")
        
        if (!isFALSE(total)) {
            miss_formatted[[total_label]] <- format_count(n_miss_total, marks)
            miss_raw[[total_label]] <- n_miss_total
        }
        
        if (!is.null(group_var)) {
            grp_vec <- data[[group_var]]
            for (g in groups) {
                grp_col <- as.character(g)
                n_miss <- sum(!not_na & grp_vec == g, na.rm = TRUE)
                miss_formatted[[grp_col]] <- format_count(n_miss, marks)
                miss_raw[[grp_col]] <- n_miss
            }
        }
        
        formatted_list[[n_rows]] <- data.table::as.data.table(miss_formatted)
        raw_list[[n_rows]] <- data.table::as.data.table(miss_raw)
    }
    
    list(
        formatted = data.table::rbindlist(formatted_list, fill = TRUE),
        raw = data.table::rbindlist(raw_list, fill = TRUE)
    )
}


#' Add raw statistics to row
#' 
#' Appends raw numeric statistics to a data.table row for downstream processing.
#' Used to preserve underlying values alongside formatted display strings.
#' 
#' @param row Data.table row to modify.
#' @param col Character string column name for the statistics.
#' @param stats Named list of numeric statistics (mean, sd, median, \emph{etc.}).
#' @param stat_type Character string indicating which statistic is displayed.
#' @return Modified row (by reference).
#' @keywords internal
add_raw_stats <- function(row, col, stats, stat_type) {
    if (stat_type == "mean_sd") {
        row[[col]] <- stats$mean
        row[[paste0(col, "_sd")]] <- stats$sd
        row[[paste0(col, "_n")]] <- stats$n
    } else if (stat_type == "median_iqr") {
        row[[col]] <- stats$median
        row[[paste0(col, "_q1")]] <- stats$q1
        row[[paste0(col, "_q3")]] <- stats$q3
        row[[paste0(col, "_n")]] <- stats$n
    } else if (stat_type == "median_range") {
        row[[col]] <- stats$median
        row[[paste0(col, "_min")]] <- stats$min
        row[[paste0(col, "_max")]] <- stats$max
        row[[paste0(col, "_n")]] <- stats$n
    } else if (stat_type == "range") {
        row[[col]] <- stats$min
        row[[paste0(col, "_max")]] <- stats$max
        row[[paste0(col, "_n")]] <- stats$n
    }
    row  # Return the modified row
}

#' Perform statistical tests for continuous variables
#' 
#' Conducts hypothesis tests comparing continuous variables across groups.
#' Supports t-tests, Wilcoxon tests, ANOVA, and Kruskal-Wallis tests with
#' automatic selection based on number of groups.
#' 
#' @param var_vec Numeric vector of the continuous variable.
#' @param grp_vec Factor or character vector defining groups.
#' @param test_type Character string: "parametric", "nonparametric", or "auto".
#' @param stat_type Character string indicating primary statistic being tested.
#' @return Numeric \emph{p}-value from the hypothesis test.
#' @keywords internal
perform_continuous_test <- function(var_vec, grp_vec, test_type, stat_type) {
    ## Remove NAs
    valid <- !is.na(var_vec) & !is.na(grp_vec)
    if (sum(valid) < 3) return(NA_real_)
    
    x <- var_vec[valid]
    g <- grp_vec[valid]
    
    groups <- unique(g)
    n_groups <- length(groups)
    if (n_groups < 2) return(NA_real_)
    
    ## Auto-select test
    if (test_type == "auto") {
        if (grepl("mean", stat_type)) {
            test_type <- if (n_groups == 2) "t" else "aov"
        } else {
            test_type <- if (n_groups == 2) "wrs" else "kwt"
        }
    }
    
    tryCatch({
        switch(test_type,
            "t" = t.test(x ~ g)$p.value,
            "wrs" = wilcox.test(x ~ g)$p.value,
            "aov" = {
                fit <- aov(x ~ g)
                summary(fit)[[1]][["Pr(>F)"]][1]
            },
            "kwt" = kruskal.test(x ~ g)$p.value,
            NA_real_
        )
    }, error = function(e) NA_real_)
}


#' Format continuous statistic for display
#' 
#' Converts numeric summary statistics into formatted display strings following
#' standard conventions (mean \eqn{\pm} SD, median [IQR], range, \emph{etc.}).
#' 
#' @param stats Named list of numeric statistics (mean, sd, median, q1, q3, min, max).
#' @param stat_type Character string: "mean_sd", "median_iqr", "median_range", or "range".
#' @param fmt_str Character string format specification for sprintf.
#' @param marks List with \code{big.mark} and \code{decimal.mark} as returned
#'   by \code{\link{resolve_number_marks}}.
#' @return Character string with formatted statistic.
#' @keywords internal
format_continuous_stat <- function(stats, stat_type, fmt_str, marks) {
    if (stats$n == 0) return("")
    
    switch(stat_type,
        "mean_sd" = paste0(
            format_num(stats$mean, fmt_str, marks), " \u00B1 ",
            format_num(stats$sd, fmt_str, marks)
        ),
        "median_iqr" = {
            sep <- resolve_separator(stats$q1, stats$q3, marks)
            paste0(
                format_num(stats$median, fmt_str, marks), " [",
                format_num(stats$q1, fmt_str, marks), sep,
                format_num(stats$q3, fmt_str, marks), "]"
            )
        },
        "median_range" = {
            sep <- resolve_separator(stats$min, stats$max, marks)
            paste0(
                format_num(stats$median, fmt_str, marks), " (",
                format_num(stats$min, fmt_str, marks), sep,
                format_num(stats$max, fmt_str, marks), ")"
            )
        },
        "range" = {
            sep <- resolve_separator(stats$min, stats$max, marks)
            paste0(format_num(stats$min, fmt_str, marks), sep, 
                   format_num(stats$max, fmt_str, marks))
        },
        ""
    )
}


# NOTE: format_num() and format_count() have been moved to number_format.R
# as locale-aware versions that accept a 'marks' parameter.
# See resolve_number_marks() for the locale resolution logic.


#' Get display label for statistic type
#' 
#' Converts internal statistic type codes to formatted display labels for
#' table column headers.
#' 
#' @param stat_type Character string: "mean_sd", "median_iqr", "median_range",
#'   "range", "n_miss", or custom type.
#' @return Character string with formatted label (\emph{e.g.,} "Mean \eqn{\pm} SD").
#' @keywords internal
get_stat_label <- function(stat_type) {
    switch(stat_type,
           "mean_sd" = "Mean \u00B1 SD",
           "median_iqr" = "Median [IQR]",
           "median_range" = "Median (Range)",
           "range" = "Range",
           "n_miss" = "Missing",
           stat_type
    )
}


#' Process categorical variable
#' 
#' Calculates frequency and percentage statistics for categorical variables,
#' with optional grouping and chi-square/Fisher's exact testing. Handles
#' factor levels, missing values, and custom labeling.
#' 
#' @param data Data.table containing the variable.
#' @param var Character string naming the variable to process.
#' @param var_label Character string label for display.
#' @param group_var Optional character string naming the grouping variable.
#' @param stats Character vector of statistics to calculate.
#' @param na_include Logical whether to include missing values as a category.
#' @param na_label Character string label for missing values.
#' @param test Logical whether to perform statistical tests.
#' @param test_type Character string specifying test type.
#' @param total Logical or character controlling total column display.
#' @param total_label Character string label for total column.
#' @param na_percent Logical whether to include NA in percentage denominators.
#' @param ... Additional arguments passed to test functions.
#' @return List with 'formatted' and 'raw' data.table components.
#' @keywords internal
process_categorical <- function(data, var, var_label, group_var, stats,
                                na_include, na_label, test, test_type,
                                total, total_label, na_percent,
                                marks = NULL, ...) {
    
    ## Pre-extract vectors
    var_vec <- data[[var]]
    
    ## Get levels
    if (is.factor(var_vec)) {
        levels_to_show <- levels(var_vec)
    } else {
        levels_to_show <- unique(var_vec)
        levels_to_show <- levels_to_show[!is.na(levels_to_show)]
    }
    
    ## Add NA level if requested
    has_na <- anyNA(var_vec)
    if (na_include && has_na) {
        levels_to_show <- c(levels_to_show, NA)
    }
    
    n_levels <- length(levels_to_show)
    
    ## Pre-compute all counts with single table() call
    use_na <- if (na_include) "ifany" else "no"
    
    if (!is.null(group_var)) {
        grp_vec <- data[[group_var]]
        
        if (is.factor(grp_vec)) {
            groups <- levels(grp_vec)
        } else {
            groups <- unique(grp_vec)
            groups <- groups[!is.na(groups)]
        }
        
        ## Single table call for cross-tabulation
        tab <- table(var_vec, grp_vec, useNA = use_na)
        total_tab <- rowSums(tab)
        grp_totals <- colSums(tab)
        
        ## Calculate denominators
        if (na_percent) {
            total_denom <- sum(tab)
            grp_denoms <- grp_totals
        } else {
            ## Exclude NA row from denominators
            non_na_rows <- !is.na(rownames(tab))
            total_denom <- sum(tab[non_na_rows, , drop = FALSE])
            grp_denoms <- colSums(tab[non_na_rows, , drop = FALSE])
        }
        
        ## Calculate p-value once
        p_value <- if (test) {
            perform_categorical_test(tab, test_type)
                   } else {
            NULL
        }
        
        ## Build output
        formatted_list <- vector("list", n_levels)
        raw_list <- vector("list", n_levels)
        
        for (i in seq_along(levels_to_show)) {
            lvl <- levels_to_show[i]
            lvl_label <- if (is.na(lvl)) na_label else as.character(lvl)
            lvl_char <- if (is.na(lvl)) NA_character_ else as.character(lvl)
            
            level_formatted <- list(
                variable = if (i == 1L) var_label else "",
                level = lvl_label
            )
            
            level_raw <- list(
                variable = if (i == 1L) var_label else "",
                level = lvl_label,
                stat_type = "category"
            )
            
            ## Get row from table
            if (is.na(lvl)) {
                tab_row <- tab[is.na(rownames(tab)), , drop = FALSE]
                n_total <- sum(tab_row)
            } else {
                n_total <- total_tab[lvl_char]
                if (is.na(n_total)) n_total <- 0L
            }
            
            ## Add total column
            if (!isFALSE(total)) {
                ## NA row: use total denom if na_percent, otherwise count only
                if (is.na(lvl)) {
                    denom <- if (na_percent) total_denom else sum(tab)
                    na_stat <- if (na_percent) stats else "n"
                } else {
                    denom <- total_denom
                    na_stat <- stats
                }
                level_formatted[[total_label]] <- format_categorical_stat(
                    as.integer(n_total), as.integer(denom), na_stat, marks)
                level_raw[[total_label]] <- as.integer(n_total)
                level_raw[[paste0(total_label, "_total")]] <- as.integer(denom)
            }
            
            ## Add group columns
            for (g in groups) {
                grp_col <- as.character(g)
                
                if (is.na(lvl)) {
                    n <- sum(tab[is.na(rownames(tab)), grp_col])
                    denom <- if (na_percent) grp_denoms[grp_col] else grp_totals[grp_col]
                    na_stat <- if (na_percent) stats else "n"
                } else {
                    n <- tab[lvl_char, grp_col]
                    if (is.na(n)) n <- 0L
                    denom <- grp_denoms[grp_col]
                    na_stat <- stats
                }
                
                level_formatted[[grp_col]] <- format_categorical_stat(
                    as.integer(n), as.integer(denom), na_stat, marks)
                level_raw[[grp_col]] <- as.integer(n)
                level_raw[[paste0(grp_col, "_total")]] <- as.integer(denom)
            }
            
            ## Add p-value to first level only
            if (i == 1L && !is.null(p_value)) {
                level_formatted[["p_value"]] <- p_value
                level_raw[["p_value"]] <- p_value
            }
            
            formatted_list[[i]] <- data.table::as.data.table(level_formatted)
            raw_list[[i]] <- data.table::as.data.table(level_raw)
        }
        
    } else {
        ## Ungrouped version
        total_tab <- table(var_vec, useNA = use_na)
        total_denom <- if (na_percent) sum(total_tab) else sum(total_tab[!is.na(names(total_tab))])
        
        formatted_list <- vector("list", n_levels)
        raw_list <- vector("list", n_levels)
        
        for (i in seq_along(levels_to_show)) {
            lvl <- levels_to_show[i]
            lvl_label <- if (is.na(lvl)) na_label else as.character(lvl)
            
            level_formatted <- list(
                variable = if (i == 1L) var_label else "",
                level = lvl_label
            )
            
            level_raw <- list(
                variable = if (i == 1L) var_label else "",
                level = lvl_label,
                stat_type = "category"
            )
            
            if (!isFALSE(total)) {
                if (is.na(lvl)) {
                    n <- sum(is.na(var_vec))
                    na_stat <- if (na_percent) stats else "n"
                } else {
                    n <- total_tab[as.character(lvl)]
                    if (is.na(n)) n <- 0L
                    na_stat <- stats
                }
                
                level_formatted[[total_label]] <- format_categorical_stat(
                    as.integer(n), as.integer(total_denom), na_stat, marks)
                level_raw[[total_label]] <- as.integer(n)
                level_raw[[paste0(total_label, "_total")]] <- as.integer(total_denom)
            }
            
            formatted_list[[i]] <- data.table::as.data.table(level_formatted)
            raw_list[[i]] <- data.table::as.data.table(level_raw)
        }
    }
    
    list(
        formatted = data.table::rbindlist(formatted_list, fill = TRUE),
        raw = data.table::rbindlist(raw_list, fill = TRUE)
    )
}


#' Perform statistical tests for categorical variables
#' 
#' Conducts chi-square or Fisher's exact tests for categorical variables
#' across groups. Automatically selects Fisher's exact test for small
#' expected frequencies.
#' 
#' @param tab Contingency table (matrix or table object).
#' @param test_type Character string: "chisq" for chi-square, "fisher" for
#'   Fisher's exact, or "auto" for automatic selection.
#' @return Numeric \emph{p}-value from the hypothesis test.
#' @keywords internal
perform_categorical_test <- function(tab, test_type) {
    ## Remove NA rows for testing
    non_na_rows <- !is.na(rownames(tab))
    tab_test <- tab[non_na_rows, , drop = FALSE]
    
    if (any(dim(tab_test) < 2)) return(NA_real_)
    
    ## Auto-select test
    if (test_type == "auto") {
        expected <- suppressWarnings(chisq.test(tab_test)$expected)
        test_type <- if (any(expected < 5)) "fisher" else "chisq"
    }
    
    tryCatch({
        switch(test_type,
            "fisher" = ,
            "fisher.test" = fisher.test(tab_test, workspace = 2e5)$p.value,
            "chisq" = ,
            "chisq.test" = chisq.test(tab_test)$p.value,
            NA_real_
        )
    }, error = function(e) NA_real_)
}


# NOTE: format_categorical_stat() has been moved to number_format.R
# as a locale-aware version that accepts a 'marks' parameter.


#' Process survival variable
#' 
#' Calculates survival statistics including median survival times with
#' confidence intervals, with optional grouping and log-rank testing. Parses
#' Surv() expressions and uses \pkg{survival} package functions.
#' 
#' @param data Data.table containing the survival variables.
#' @param var Character string with Surv() expression (\emph{e.g.,} "Surv(time, status)").
#' @param var_label Character string label for display.
#' @param group_var Optional character string naming the grouping variable.
#' @param digits Integer number of decimal places.
#' @param conf_level Numeric confidence level for confidence intervals.
#' @param na_include Logical whether to include missing values.
#' @param na_label Character string label for missing values.
#' @param test Logical whether to perform log-rank test.
#' @param total Logical or character controlling total column display.
#' @param total_label Character string label for total column.
#' @param ... Additional arguments (currently unused).
#' @return List with 'formatted' and 'raw' data.table components.
#' @keywords internal
process_survival <- function(data, var, var_label, group_var, digits,
                             conf_level = 0.95, na_include, na_label, 
                             test, total, total_label, marks = NULL, ...) {
    
    ## Parse Surv() expression
    surv_match <- regexec("Surv\\(([^,]+),\\s*([^)]+)\\)", var)
    surv_parts <- regmatches(var, surv_match)[[1]]
    
    if (length(surv_parts) < 3) {
        stop("Invalid Surv() syntax: ", var)
    }
    
    time_var <- trimws(surv_parts[2])
    status_var <- trimws(surv_parts[3])
    
    if (!requireNamespace("survival", quietly = TRUE)) {
        stop("Package 'survival' required for survival analysis")
    }
    
    ## Build CI label based on conf_level
    ci_pct <- round(conf_level * 100)
    ci_label <- paste0("Median (", ci_pct, "% CI)")
    
    ## Build CI column name suffixes for extracting from survfit summary
    ## survfit names CI columns as e.g. "0.95LCL" and "0.95UCL" using conf.int value
    ci_lower_name <- paste0(format(conf_level, nsmall = 2), "LCL")
    ci_upper_name <- paste0(format(conf_level, nsmall = 2), "UCL")
    
    fmt_str <- paste0("%.", digits, "f")
    
    if (!is.null(group_var)) {
        grp_vec <- data[[group_var]]
        if (is.factor(grp_vec)) {
            groups <- levels(grp_vec)
        } else {
            groups <- unique(grp_vec)
            groups <- groups[!is.na(groups)]
        }
        
        ## Calculate p-value (log-rank test)
        p_value <- if (test) {
            tryCatch({
                surv_obj <- survival::Surv(data[[time_var]], data[[status_var]])
                survival::survdiff(surv_obj ~ grp_vec)$pvalue
            }, error = function(e) NA_real_)
        } else {
            NULL
        }
        
        formatted_row <- list(
            variable = var_label,
            level = ci_label
        )
        
        raw_row <- list(
            variable = var_label,
            level = "median",
            stat_type = "survival"
        )
        
        ## Add total column
        if (!isFALSE(total)) {
            surv_obj <- survival::Surv(data[[time_var]], data[[status_var]])
            fit <- survival::survfit(surv_obj ~ 1, conf.int = conf_level)
            table <- summary(fit)$table
            
            formatted_row[[total_label]] <- format_survival_ci(
                table["median"], table[ci_lower_name], table[ci_upper_name],
                fmt_str, marks
            )
            
            raw_row[[total_label]] <- table["median"]
            raw_row[[paste0(total_label, "_ci_lower")]] <- table[ci_lower_name]
            raw_row[[paste0(total_label, "_ci_upper")]] <- table[ci_upper_name]
        }
        
        ## Add group columns
        for (g in groups) {
            grp_col <- as.character(g)
            grp_idx <- which(grp_vec == g)
            grp_time <- data[[time_var]][grp_idx]
            grp_status <- data[[status_var]][grp_idx]
            
            surv_obj <- survival::Surv(grp_time, grp_status)
            fit <- survival::survfit(surv_obj ~ 1, conf.int = conf_level)
            table <- summary(fit)$table
            
            formatted_row[[grp_col]] <- format_survival_ci(
                table["median"], table[ci_lower_name], table[ci_upper_name],
                fmt_str, marks
            )
            
            raw_row[[grp_col]] <- table["median"]
            raw_row[[paste0(grp_col, "_ci_lower")]] <- table[ci_lower_name]
            raw_row[[paste0(grp_col, "_ci_upper")]] <- table[ci_upper_name]
        }
        
        if (test && !is.null(p_value)) {
            formatted_row[["p_value"]] <- p_value
            raw_row[["p_value"]] <- p_value
        }
        
        formatted_result <- data.table::as.data.table(formatted_row)
        raw_result <- data.table::as.data.table(raw_row)
        
    } else {
        formatted_row <- list(
            variable = var_label,
            level = ci_label
        )
        
        raw_row <- list(
            variable = var_label,
            level = "median",
            stat_type = "survival"
        )
        
        if (!isFALSE(total)) {
            surv_obj <- survival::Surv(data[[time_var]], data[[status_var]])
            fit <- survival::survfit(surv_obj ~ 1, conf.int = conf_level)
            table <- summary(fit)$table
            
            formatted_row[[total_label]] <- format_survival_ci(
                table["median"], table[ci_lower_name], table[ci_upper_name],
                fmt_str, marks
            )
            
            raw_row[[total_label]] <- table["median"]
            raw_row[[paste0(total_label, "_ci_lower")]] <- table[ci_lower_name]
            raw_row[[paste0(total_label, "_ci_upper")]] <- table[ci_upper_name]
        }
        
        formatted_result <- data.table::as.data.table(formatted_row)
        raw_result <- data.table::as.data.table(raw_row)
    }
    
    list(formatted = formatted_result, raw = raw_result)
}


#' Format \emph{p}-values for descriptive tables
#' 
#' Converts numeric p-values to formatted strings with appropriate precision.
#' Handles very small p-values with threshold notation (\emph{e.g.,} "< 0.001").
#' 
#' @param result Data.table with 'p_value' column to format.
#' @param p_digits Integer number of decimal places for p-values.
#' @param marks List with \code{big.mark} and \code{decimal.mark} as returned
#'   by \code{\link{resolve_number_marks}}.
#' @return Modified data.table with 'p_value' column (formatted strings).
#' @keywords internal
format_pvalues_desctable <- function(result, p_digits, marks) {
    if ("p_value" %chin% names(result)) {
        threshold <- 10^(-p_digits)
        threshold_str <- paste0("< 0", marks$decimal.mark,
                                strrep("0", p_digits - 1), "1")
        fmt_str <- paste0("%.", p_digits, "f")
        
        ## Format p-values vectorized
        p_str <- sprintf(fmt_str, result$p_value)
        if (marks$decimal.mark != ".") {
            p_str <- sub(".", marks$decimal.mark, p_str, fixed = TRUE)
        }
        
        result[, p_formatted := data.table::fifelse(
            is.na(p_value) | Variable == "N", 
            "",
            data.table::fifelse(
                p_value < threshold,
                threshold_str,
                p_str
            )
        )]
        
        ## Remove old column, rename new
        result[, p_value := NULL]
        data.table::setnames(result, "p_formatted", "p-value")
    }
    result
}


#' Reorder columns to position total column
#' 
#' Rearranges data.table columns to place the total column in the specified
#' position (first, last, or default). Ensures proper ordering of Variable,
#' Group, total, group columns, and \emph{p}-value.
#' 
#' @param result Data.table with columns to reorder.
#' @param total Logical or character: \code{TRUE}/"first" (total first), "last" (total last).
#' @param total_label Character string name of the total column.
#' @return Modified data.table with reordered columns.
#' @keywords internal
reorder_total_column <- function(result, total, total_label) {
    if (total_label %chin% names(result)) {
        cols <- names(result)
        
        base_cols <- c("Variable", "Group")
        group_cols <- cols[!cols %chin% c(base_cols, total_label, "p-value")]
        p_col <- if ("p-value" %chin% cols) "p-value" else NULL
        
        if (isTRUE(total) || identical(total, "first")) {
            new_order <- c(base_cols, total_label, group_cols, p_col)
        } else if (identical(total, "last")) {
            new_order <- c(base_cols, group_cols, total_label, p_col)
        } else {
            new_order <- c(base_cols, total_label, group_cols, p_col)
        }
        
        new_order <- new_order[new_order %chin% cols]
        data.table::setcolorder(result, new_order)
    }
    result
}

Try the summata package in your browser

Any scripts or data that you put into this service are public.

summata documentation built on May 7, 2026, 5:07 p.m.