R/table2_utils.R

Defines functions bold_pvalues_ft format_headers_ft calculate_table_width apply_zebra_stripes_ft replace_empty_cells identify_variable_groups process_table_for_flextable condense_quantitative_rows condense_table_rows format_column_headers_with_n_html format_column_headers_with_n_tex sanitize_for_latex determine_alignment

Documented in apply_zebra_stripes_ft bold_pvalues_ft calculate_table_width condense_quantitative_rows condense_table_rows determine_alignment format_column_headers_with_n_html format_column_headers_with_n_tex format_headers_ft identify_variable_groups process_table_for_flextable replace_empty_cells sanitize_for_latex

#' Add padding to exported table headers
#' 
#' Adds LaTeX vertical spacing rules to column headers for proper vertical
#' alignment in PDF/LaTeX exports.
#' 
#' @param col_names Character vector of column names.
#' @return Character vector with LaTeX padding rules added.
#' @keywords internal
add_header_padding <- function (col_names) {
    padded_names <- paste0("\\rule{0pt}{3ex}", col_names, "\\rule[-1.5ex]{0pt}{0pt}")
    return(as.character(padded_names))
}

#' Add padding to exported table variables
#' 
#' Inserts blank padding rows between variable groups in exported tables
#' for improved visual separation.
#' 
#' @param df Data.table with Variable column.
#' @return Data.table with padding rows inserted between variable groups.
#' @keywords internal
add_variable_padding <- function (df) {
    var_col <- if ("Variable" %chin% names(df)) 
                   "Variable"
               else if ("variable" %chin% names(df)) 
                   "variable"
               else NULL
    if (is.null(var_col)) 
        return(df)
    
    ## Find variable start rows - rows where Variable is not empty/NA and not indented
    ## Check for various indentation patterns: spaces, &nbsp;, \hspace (LaTeX)
    var_starts <- which(
        df[[var_col]] != "" & 
        !is.na(df[[var_col]]) &
        !grepl("^\\\\hspace", df[[var_col]]) &      # LaTeX indent
        !grepl("^\\s+", df[[var_col]]) &            # Space indent
        !grepl("^&nbsp;", df[[var_col]]) &          # HTML indent
        !grepl("^\\\\quad", df[[var_col]])          # LaTeX quad indent
    )
    
    if (length(var_starts) == 0) 
        return(df)
    
    new_df <- data.table()
    padding_row <- df[1, ]
    padding_row[1, ] <- ""
    new_df <- rbind(new_df, padding_row)
    for (i in seq_along(var_starts)) {
        if (i < length(var_starts)) {
            var_rows <- var_starts[i]:(var_starts[i + 1] - 1)
        }
        else {
            var_rows <- var_starts[i]:nrow(df)
        }
        new_df <- rbind(new_df, df[var_rows, ])
        padding_row <- df[1, ]
        padding_row[1, ] <- ""
        new_df <- rbind(new_df, padding_row)
    }
    return(new_df)
}

#' Check LaTeX installation
#' 
#' Verifies that a LaTeX distribution (pdflatex or xelatex) is available
#' on the system for PDF compilation.
#' 
#' @return Logical \code{TRUE} if LaTeX is available, \code{FALSE} otherwise.
#' @keywords internal
check_latex <- function () {
    pdflatex_check <- Sys.which("pdflatex")
    if (pdflatex_check != "") 
        return(TRUE)
    xelatex_check <- Sys.which("xelatex")
    if (xelatex_check != "") 
        return(TRUE)
    return(FALSE)
}

#' Determine alignment for exported tables
#' 
#' Creates column alignment string for LaTeX tables. Variable and Group
#' columns are left-aligned; all others are centered.
#' 
#' @param df Data.frame or data.table to determine alignment for.
#' @return Character string with alignment codes (\emph{e.g.,} "rlcc").
#' @keywords internal
determine_alignment <- function(df) {
    align <- "r"  # Start with row numbers column (if any)
    for (col in names(df)) {
        if (col %chin% c("Variable", "Group")) {
            align <- paste0(align, "l")
        } else {
            align <- paste0(align, "c")
        }
    }
    return(align)
}

#' Apply formatting to column headers in exported tables (PDF/LaTeX)
#' 
#' Formats column headers for LaTeX output by escaping special characters,
#' italicizing 'n' and 'p', and optionally adding vertical spacing.
#' 
#' @param col_names Character vector of column names.
#' @param add_header_space Logical whether to add vertical padding.
#' @return Character vector with LaTeX-formatted column names.
#' @keywords internal
format_column_headers <- function (col_names, add_header_space = TRUE) {
    if (!is.character(col_names)) {
        col_names <- as.character(col_names)
    }
    col_names <- gsub("%", "\\\\%", col_names)
    col_names <- gsub("^n$", "\\\\textit{n}", col_names, ignore.case = TRUE)
    col_names <- gsub("p-value", "\\\\textit{p}-value", col_names, 
                      ignore.case = TRUE)
    col_names <- gsub("Uni p", "Uni \\\\textit{p}", col_names)
    col_names <- gsub("Multi p", "Multi \\\\textit{p}", col_names)
    if (add_header_space && !is.null(col_names)) {
        col_names <- add_header_padding(col_names)
    }
    return(as.character(col_names))
}

#' Apply formatting to column headers in exported tables (HTML)
#' 
#' Formats column headers for HTML output by italicizing 'n' and 'p'
#' using HTML tags.
#' 
#' @param col_names Character vector of column names.
#' @return Character vector with HTML-formatted column names.
#' @keywords internal
format_column_headers_html <- function (col_names) {
    col_names <- gsub("^n$", "<i>n</i>", col_names, ignore.case = TRUE)
    col_names <- gsub("p-value", "<i>p</i>-value", col_names, 
                      ignore.case = TRUE)
    return(col_names)
}

#' Apply formatting to indented groups
#' 
#' Transforms tables with Variable/Group columns into indented format where
#' group levels appear as indented rows under variable names. Handles both
#' regression and descriptive tables with appropriate \emph{p}-value placement.
#' 
#' @param df Data.table with Variable and Group columns.
#' @param indent_string Character string to use for indentation.
#' @return Data.table with Group column removed and levels indented under Variables.
#' @keywords internal
format_indented_groups <- function (df, indent_string = "    ") {
    if (!("Variable" %chin% names(df) && "Group" %chin% names(df))) {
        return(df)
    }
    
    var_rows <- which(df$Variable != "")
    p_cols <- grep("p-value|Uni p|Multi p", names(df), value = TRUE)
    effect_cols <- c("OR (95% CI)", "HR (95% CI)", "RR (95% CI)", "Estimate (95% CI)",
                     "Coefficient (95% CI)",  # ADD THIS
                     "Univariable OR (95% CI)", "Multivariable aOR (95% CI)", 
                     "Univariable HR (95% CI)", "Multivariable aHR (95% CI)",
                     "Univariable RR (95% CI)", "Multivariable aRR (95% CI)",
                     "Univariable Estimate (95% CI)", "Multivariable Estimate (95% CI)",
                     "Univariable Coefficient (95% CI)", "Multivariable Coefficient (95% CI)")
    is_regression_table <- any(effect_cols %chin% names(df))
    is_fullfit_table <- any(c("Uni p", "Multi p") %chin% names(df))
    new_df <- data.table()
    
    for (i in seq_along(var_rows)) {
        current <- var_rows[i]
        next_var <- if (i < length(var_rows)) 
                        var_rows[i + 1]
                    else nrow(df) + 1
        var_name <- df$Variable[current]
        has_group <- df$Group[current] != "" && !is.na(df$Group[current]) && 
            df$Group[current] != "-"
        
        if (has_group) {
            if (is_regression_table) {
                var_row <- df[current, ]
                var_row$Variable <- var_name
                var_row$Group <- ""
                data_cols <- setdiff(names(df), c("Variable", "Group", p_cols))
                for (col in data_cols) {
                    var_row[[col]] <- ""
                }
                new_df <- rbind(new_df, var_row)
                
                for (j in current:(next_var - 1)) {
                    group_row <- df[j, ]
                    group_row$Variable <- paste0(indent_string, 
                                                 df$Group[j])
                    if (is_fullfit_table) {
                        for (ec in effect_cols) {
                            if (ec %chin% names(group_row)) {
                                val <- group_row[[ec]]
                                if (!is.na(val) && (val == "-" || grepl("Reference", 
                                                                        val))) {
                                    if (grepl("Univariable", ec) && "Uni p" %chin% 
                                        names(group_row)) {
                                        group_row[["Uni p"]] <- "-"
                                    }
                                    else if (grepl("Multivariable", ec) && 
                                             "Multi p" %chin% names(group_row)) {
                                        group_row[["Multi p"]] <- "-"
                                    }
                                }
                            }
                        }
                    }
                    else {
                        is_reference <- FALSE
                        for (ec in effect_cols) {
                            if (ec %chin% names(group_row)) {
                                val <- group_row[[ec]]
                                if (!is.na(val) && (val == "-" || grepl("Reference", 
                                                                        val))) {
                                    is_reference <- TRUE
                                    break
                                }
                            }
                        }
                        if (is_reference) {
                            for (p_col in p_cols) {
                                if (p_col %chin% names(group_row)) {
                                    group_row[[p_col]] <- ""
                                }
                            }
                        }
                    }
                    new_df <- rbind(new_df, group_row)
                }
            }
            else {
                var_row <- df[current, ]
                var_row$Variable <- var_name
                data_cols <- setdiff(names(df), c("Variable", 
                                                  "Group", p_cols))
                for (col in data_cols) {
                    var_row[[col]] <- ""
                }
                new_df <- rbind(new_df, var_row)
                
                for (j in current:(next_var - 1)) {
                    group_row <- df[j, ]
                    group_row$Variable <- paste0(indent_string, 
                                                 df$Group[j])
                    for (p_col in p_cols) {
                        if (p_col %chin% names(group_row)) {
                            group_row[[p_col]] <- ""
                        }
                    }
                    new_df <- rbind(new_df, group_row)
                }
            }
        }
        else {
            new_df <- rbind(new_df, df[current, ])
        }
    }
    new_df$Group <- NULL
    rownames(new_df) <- NULL
    return(new_df)
}

#' Format \emph{p}-values for exported tables
#' 
#' Applies bold formatting to significant \emph{p}-values in LaTeX tables using
#' the textbf command.
#' 
#' @param df Data.table containing \emph{p}-value columns.
#' @param p_threshold Numeric threshold for significance (default 0.05).
#' @return Data.table with significant \emph{p}-values wrapped in LaTeX bold commands.
#' @keywords internal
format_pvalues_export_tex <- function (df, p_threshold = 0.05) {
    for (col in names(df)) {
        if (col == "p-value" || col == "Uni p" || col == "Multi p" || 
            grepl("p.value|pvalue", col, ignore.case = TRUE)) {
            for (i in seq_len(nrow(df))) {
                cell_value <- as.character(df[[col]][i])
                if (is.na(cell_value) || cell_value == "" || 
                    cell_value == "NA" || grepl("\\\\textbf", cell_value)) {
                    next
                }
                is_significant <- FALSE
                if (grepl("^<\\s*0\\.001", cell_value)) {
                    is_significant <- TRUE
                }
                else if (grepl("^[0-9]\\.[0-9]", cell_value) || 
                         grepl("^0\\.[0-9]", cell_value)) {
                    p_numeric <- suppressWarnings(as.numeric(gsub("[^0-9.]", 
                                                                  "", cell_value)))
                    if (!is.na(p_numeric) && p_numeric < p_threshold) {
                        is_significant <- TRUE
                    }
                }
                if (is_significant) {
                    df[[col]][i] <- paste0("\\textbf{", cell_value, 
                                           "}")
                }
            }
        }
    }
    return(df)
}

#' Format \emph{p}-values for exported tables (HTML)
#' 
#' Applies bold formatting to significant \emph{p}-values in HTML tables using
#' the b tag.
#' 
#' @param df Data.table containing \emph{p}-value columns.
#' @param p_threshold Numeric threshold for significance (default 0.05).
#' @return Data.table with significant \emph{p}-values wrapped in HTML bold tags.
#' @keywords internal
format_pvalues_export_html <- function (df, p_threshold = 0.05) {
    for (col in names(df)) {
        if (col == "p-value" || col == "Uni p" || col == "Multi p" || 
            grepl("p.value|pvalue", col, ignore.case = TRUE)) {
            for (i in seq_len(nrow(df))) {
                cell_value <- as.character(df[[col]][i])
                if (is.na(cell_value) || cell_value == "" || 
                    cell_value == "NA") {
                    next
                }
                is_significant <- FALSE
                if (grepl("^<\\s*0\\.001", cell_value)) {
                    is_significant <- TRUE
                }
                else if (grepl("^[0-9]\\.[0-9]", cell_value) || 
                         grepl("^0\\.[0-9]", cell_value)) {
                    p_numeric <- suppressWarnings(as.numeric(gsub("[^0-9.]", 
                                                                  "", cell_value)))
                    if (!is.na(p_numeric) && p_numeric < p_threshold) {
                        is_significant <- TRUE
                    }
                }
                if (is_significant) {
                    df[[col]][i] <- paste0("<b>", cell_value, "</b>")
                }
            }
        }
    }
    return(df)
}

#' Get paper size for PDF/LaTeX export
#' 
#' Returns paper dimensions and margin settings for the specified paper size.
#' 
#' @param paper Character string: "letter", "a4", or "auto".
#' @param margins Optional numeric vector of margins (length 1 or 4).
#' @return List with latex_paper, width, height, and margins components.
#' @keywords internal
get_paper_settings <- function (paper, margins = NULL) {
    paper <- match.arg(paper, c("letter", "a4", "auto"))
    settings <- switch(paper,
                       letter = list(latex_paper = "letterpaper", 
                                     width = 8.5,
                                     height = 11,
                                     default_margins = c(1, 1, 1, 1)),
                       a4 = list(latex_paper = "a4paper",
                                 width = 8.27, 
                                 height = 11.69,
                                 default_margins = c(1, 1, 1, 1)),
                       auto = list(latex_paper = "letterpaper", 
                                   width = NULL, height = NULL, default_margins = c(0.5, 0.5, 0.5, 0.5)))
    if (!is.null(margins)) {
        if (length(margins) == 1) {
            margins <- rep(margins, 4)
        }
        else if (length(margins) != 4) {
            stop("margins must be length 1 or 4")
        }
        settings$margins <- margins
    }
    else {
        settings$margins <- settings$default_margins
    }
    return(settings)
}

#' Sanitize certain symbols for LaTeX
#' 
#' Escapes special LaTeX characters (%, &, #, _, $, ^, ~, {, }) while
#' preserving existing LaTeX commands. Uses negative lookbehind to avoid
#' double-escaping already escaped characters.
#' 
#' @param x Character vector to sanitize.
#' @return Character vector with special characters escaped for LaTeX.
#' @keywords internal
sanitize_for_latex <- function(x) {
    if (is.null(x) || length(x) == 0) 
        return(x)
    
    ## Process each element
    result <- x
    
    for (i in seq_along(x)) {
        if (is.na(x[i])) next
        
        val <- x[i]
        
        ## Check for already-formatted LaTeX commands
        has_latex <- grepl("\\\\(text(bf|it|tt|sc|sl|rm)|hspace|vspace|rule|begin|end|[a-zA-Z]+\\{)", val)
        
        if (has_latex) {
            ## String has LaTeX commands - only escape unescaped special chars
            ## Escape % that is not already escaped (not preceded by \)
            val <- gsub("(?<!\\\\)%", "\\\\%", val, perl = TRUE)
            ## Escape & that is not already escaped
            val <- gsub("(?<!\\\\)&", "\\\\&", val, perl = TRUE)
            ## Escape # that is not already escaped
            val <- gsub("(?<!\\\\)#", "\\\\#", val, perl = TRUE)
            ## Escape _ that is not already escaped
            val <- gsub("(?<!\\\\)_", "\\\\_", val, perl = TRUE)
            ## Escape $ that is not already escaped
            val <- gsub("(?<!\\\\)\\$", "\\\\$", val, perl = TRUE)
        } else {
            ## No LaTeX commands - check if already escaped
            already_escaped <- grepl("\\\\[%&#_$]", val)
            
            if (!already_escaped) {
                ## Full sanitization needed
                val <- gsub("\\\\", "\\\\textbackslash{}", val)
                val <- gsub("%", "\\\\%", val)
                val <- gsub("&", "\\\\&", val)
                val <- gsub("#", "\\\\#", val)
                val <- gsub("_", "\\\\_", val)
                val <- gsub("\\$", "\\\\$", val)
                val <- gsub("\\^", "\\\\textasciicircum{}", val)
                val <- gsub("~", "\\\\textasciitilde{}", val)
                val <- gsub("\\{", "\\\\{", val)
                val <- gsub("\\}", "\\\\}", val)
            }
        }
        
        result[i] <- val
    }
    
    return(result)
}

#' Format column headers with n counts (TeX)
#' 
#' Creates LaTeX-formatted column headers with sample size counts displayed
#' below the column name in a stacked format.
#' 
#' @param col_names Character vector of column names.
#' @param n_row_data Named list or data.table row with n values for each column.
#' @return Character vector with LaTeX-formatted headers including n counts.
#' @keywords internal
format_column_headers_with_n_tex <- function(col_names, n_row_data) {
    new_names <- character(length(col_names))
    
    for (i in seq_along(col_names)) {
        col <- col_names[i]
        
        ## Sanitize the column name first
        col_sanitized <- sanitize_for_latex(col)
        
        has_n <- col %chin% names(n_row_data) && 
            !is.na(n_row_data[[col]]) && 
            n_row_data[[col]] != "" && 
            n_row_data[[col]] != "0"
        
        if (col %chin% c("Variable", "Group")) {
            new_names[i] <- format_column_headers(col_sanitized)
        } else if (col %chin% c("p-value", "p value")) {
            new_names[i] <- format_column_headers(col_sanitized)
        } else if (has_n) {
            n_value <- n_row_data[[col]]
            ## Build the complex structure with already-sanitized name
            new_names[i] <- paste0("\\begin{tabular}{@{}c@{}}\\rule{0pt}{2.5ex}", 
                                   col_sanitized,  # Use sanitized version
                                   "\\\\[-0ex] (\\textit{N} = ", 
                                   n_value, 
                                   ")\\rule[-1ex]{0pt}{0pt}\\end{tabular}")
        } else {
            new_names[i] <- format_column_headers(col_sanitized)
        }
    }
    
    return(new_names)
}

#' Format column headers with n counts (HTML)
#' 
#' Creates HTML-formatted column headers with sample size counts displayed
#' below the column name using line breaks.
#' 
#' @param col_names Character vector of column names.
#' @param n_row_data Named list or data.table row with n values for each column.
#' @return Character vector with HTML-formatted headers including n counts.
#' @keywords internal
format_column_headers_with_n_html <- function(col_names, n_row_data) {
    new_names <- col_names
    for (i in seq_along(col_names)) {
        col <- col_names[i]
        
        ## Skip Variable, Group, and p-value columns
        if (col %chin% c("Variable", "Group", "p-value", "p value")) {
            new_names[i] <- format_column_headers_html(col)
            next
        }
        
        ## Get n value from the N row
        if (col %chin% names(n_row_data)) {
            n_value <- n_row_data[[col]]
            if (!is.na(n_value) && n_value != "" && n_value != "0") {
                ## Format header with only N italicized
                clean_col <- format_column_headers_html(col)
                new_names[i] <- paste0(clean_col, "<br>(<i>N</i> = ", n_value, ")")
            } else {
                new_names[i] <- format_column_headers_html(col)
            }
        } else {
            new_names[i] <- format_column_headers_html(col)
        }
    }
    return(new_names)
}

#' Condense table rows for more compact display
#' 
#' Collapses multi-row variables into single rows for compact tables.
#' Continuous variables show only the first statistic row, binary categorical
#' variables show only the non-reference category, and survival variables
#' show only the median row.
#' 
#' @param df Data.table with Variable and Group columns.
#' @param indent_groups Logical whether indentation will be applied (affects processing).
#' @return Data.table with condensed rows.
#' @keywords internal
condense_table_rows <- function(df, indent_groups = TRUE) {
    
    ## Work with a copy
    result <- data.table::copy(as.data.table(df))
    
    ## Detect table type
    is_descriptive <- !any(grepl("(OR|HR|RR|Coefficient|Estimate).*\\(95% CI\\)", names(result)))
    
    ## Track rows to delete
    rows_to_delete <- integer()
    
    ## Process each variable
    vars_to_process <- which(result$Variable != "" & !is.na(result$Variable))
    
    for (i in seq_along(vars_to_process)) {
        var_start <- vars_to_process[i]
        var_end <- if (i < length(vars_to_process)) {
                       vars_to_process[i + 1] - 1
                   } else {
                       nrow(result)
                   }
        
        if (var_start %in% rows_to_delete) next
        
        var_name <- result$Variable[var_start]
        var_rows <- result[var_start:var_end]
        n_rows <- nrow(var_rows)
        
        if ("Group" %chin% names(var_rows)) {
            groups <- var_rows$Group
            
            ## Check for continuous variable
            is_continuous <- any(grepl("^(Mean|Median|Range|SD|IQR)", groups[1], ignore.case = TRUE))
            
            ## Check for binary categorical
            non_empty_groups <- groups[groups != "" & !is.na(groups)]
            is_binary <- length(non_empty_groups) == 2 && !is_continuous
            
            ## Check for survival
            is_survival <- any(grepl("Median.*\\(.*CI.*\\)", groups, ignore.case = TRUE))
            
            if (is_continuous || is_survival) {
                ## For continuous/survival, keep only first row
                if (is_descriptive) {
                    stat_type <- gsub("\\s*\\(.*\\)", "", groups[1])
                    stat_type <- gsub("\\s*\\+/-.*", "", stat_type)
                    if (stat_type != "" && !is.na(stat_type) && stat_type != "-") {
                        if (grepl("Mean \\+/- SD", groups[1])) {
                            result[var_start, Variable := paste0(var_name, ", mean +/- SD")]
                        } else if (grepl("Median", groups[1]) && grepl("IQR", groups[1])) {
                            result[var_start, Variable := paste0(var_name, ", median [IQR]")]
                        } else if (grepl("Median", groups[1]) && is_survival) {
                            result[var_start, Variable := paste0(var_name, ", median (95% CI)")]
                        } else {
                            result[var_start, Variable := paste0(var_name, ", ", tolower(stat_type))]
                        }
                    }
                    ## Clear Group column
                    result[var_start, Group := ""]
                }
                
                ## Mark extra rows for deletion
                if (n_rows > 1) {
                    rows_to_delete <- c(rows_to_delete, (var_start + 1):var_end)
                }
                
            } else if (is_binary) {
                
                ## For binary categorical
                data_cols <- setdiff(names(result), c("Variable", "Group", "p-value", "p.value"))
                
                if (length(data_cols) > 0) {
                    ## For regression tables, find the coefficient/effect column
                    ## This is more reliable than using the first data column (often "n")
                    effect_col <- NULL
                    effect_patterns <- c("Coefficient", "OR", "HR", "RR", "Estimate", "aOR", "aHR", "aRR")
                    for (col in data_cols) {
                        if (any(vapply(effect_patterns, function(p) grepl(p, col, ignore.case = TRUE), logical(1)))) {
                            effect_col <- col
                            break
                        }
                    }
                    
                    ## Use effect column if found, otherwise fall back to first data column
                    check_col <- if (!is.null(effect_col)) effect_col else data_cols[1]
                    
                    ## Find non-reference row by checking for actual data (not "-", "reference", etc.)
                    non_ref_idx <- which(!var_rows[[check_col]] %chin% c("-", "reference", "Reference", ""))
                    
                    if (length(non_ref_idx) > 1) {
                        ## Multiple non-ref rows - take the first one
                        non_ref_idx <- non_ref_idx[1]
                    } else if (length(non_ref_idx) == 0) {
                        ## All rows are reference-like - skip condensing
                        next
                    }
                    
                    if (non_ref_idx <= n_rows) {
                        non_ref_row <- var_start + non_ref_idx - 1
                        ref_idx <- setdiff(1:n_rows, non_ref_idx)[1]
                        ref_row <- var_start + ref_idx - 1
                        
                        non_ref_category <- result$Group[non_ref_row]
                        ref_category <- result$Group[ref_row]
                        
                        if (!is.na(non_ref_category) && non_ref_category != "") {
                            ## Use greedy helper function for condensing detection
                            ## Pass var_name as label for case-insensitive matching
                            if (should_condense_binary(ref_category, non_ref_category, var_name)) {
                                result[var_start, Variable := paste0(var_name)]
                            } else {
                                result[var_start, Variable := paste0(var_name, " (", non_ref_category, ")")]                                
                            }
                        }
                        
                        ## Copy statistics
                        for (col in data_cols) {
                            if (col %chin% names(result)) {
                                result[var_start, (col) := result[non_ref_row, get(col)]]
                            }
                        }
                        
                        ## Copy p-value
                        if ("p-value" %chin% names(result)) {
                            pval <- result[var_start:var_end, `p-value`]
                            pval <- pval[!is.na(pval) & pval != ""]
                            if (length(pval) > 0) {
                                result[var_start, `p-value` := pval[1]]
                            }
                        }
                        
                        ## Clear Group column
                        result[var_start, Group := ""]
                        
                        ## Mark other rows for deletion
                        rows_to_delete <- c(rows_to_delete, (var_start + 1):var_end)
                    }
                }
            } 
        }
    }
    
    ## Remove marked rows
    if (length(rows_to_delete) > 0) {
        rows_to_delete <- sort(unique(rows_to_delete))
        rows_to_delete <- rows_to_delete[rows_to_delete <= nrow(result)]
        if (length(rows_to_delete) > 0) {
            result <- result[-rows_to_delete]
        }
    }
    
    return(result)
}


#' Condense quantitative variable rows only
#' 
#' Collapses multi-row continuous and survival variables into single rows
#' while preserving all categorical variable rows (including binary).
#' Only applies to descriptive tables from desctable().
#' 
#' @param df Data.table or data frame
#' @param indent_groups Logical. Whether to apply indentation formatting.
#' @return A data.table with condensed continuous/survival rows
#' @keywords internal
condense_quantitative_rows <- function(df, indent_groups = TRUE) {
    
    ## Work with a copy
    result <- data.table::copy(as.data.table(df))
    
    ## Detect table type - only apply to descriptive tables
    is_descriptive <- !any(grepl("(OR|HR|RR|Coefficient|Estimate).*\\(95% CI\\)", names(result)))
    
    if (!is_descriptive) {
        warning("condense_quantitative only applies to descriptive tables. Returning unchanged.")
        return(result)
    }
    
    ## Track rows to delete
    rows_to_delete <- integer()
    
    ## Process each variable
    vars_to_process <- which(result$Variable != "" & !is.na(result$Variable))
    
    for (i in seq_along(vars_to_process)) {
        var_start <- vars_to_process[i]
        var_end <- if (i < length(vars_to_process)) {
                       vars_to_process[i + 1] - 1
                   } else {
                       nrow(result)
                   }
        
        if (var_start %in% rows_to_delete) next
        
        var_name <- result$Variable[var_start]
        var_rows <- result[var_start:var_end]
        n_rows <- nrow(var_rows)
        
        if ("Group" %chin% names(var_rows)) {
            groups <- var_rows$Group
            
            ## Check for continuous variable
            is_continuous <- any(grepl("^(Mean|Median|Range|SD|IQR)", groups[1], ignore.case = TRUE))
            
            ## Check for survival
            is_survival <- any(grepl("Median.*\\(.*CI.*\\)", groups, ignore.case = TRUE))
            
            ## Only condense continuous and survival - NOT categorical (including binary)
            if (is_continuous || is_survival) {
                ## For continuous/survival, keep only first row
                stat_type <- gsub("\\s*\\(.*\\)", "", groups[1])
                stat_type <- gsub("\\s*\\+/-.*", "", stat_type)
                if (stat_type != "" && !is.na(stat_type) && stat_type != "-") {
                    if (grepl("Mean \\+/- SD", groups[1])) {
                        result[var_start, Variable := paste0(var_name, ", mean +/- SD")]
                    } else if (grepl("Median", groups[1]) && grepl("IQR", groups[1])) {
                        result[var_start, Variable := paste0(var_name, ", median [IQR]")]
                    } else if (grepl("Median", groups[1]) && is_survival) {
                        result[var_start, Variable := paste0(var_name, ", median (95% CI)")]
                    } else {
                        result[var_start, Variable := paste0(var_name, ", ", tolower(stat_type))]
                    }
                }
                ## Clear Group column
                result[var_start, Group := ""]
                
                ## Mark extra rows for deletion
                if (n_rows > 1) {
                    rows_to_delete <- c(rows_to_delete, (var_start + 1):var_end)
                }
            }
            ## Note: Categorical variables (including binary) are not processed here
        }
    }
    
    ## Remove marked rows
    if (length(rows_to_delete) > 0) {
        rows_to_delete <- sort(unique(rows_to_delete))
        rows_to_delete <- rows_to_delete[rows_to_delete <= nrow(result)]
        if (length(rows_to_delete) > 0) {
            result <- result[-rows_to_delete]
        }
    }
    
    return(result)
}


#' Core flextable processing function
#' 
#' Central processing function for creating \pkg{flextable} objects from data tables.
#' Handles N row extraction, condensing, indentation, zebra stripes, formatting,
#' and styling. Used by table2docx, table2pptx, and table2rtf.
#' 
#' @param table Data.frame or data.table to process.
#' @param caption Optional character string for table caption.
#' @param font_size Numeric font size in points.
#' @param font_family Character string font family name.
#' @param format_headers Logical whether to format headers.
#' @param bold_significant Logical whether to bold significant \emph{p}-values.
#' @param p_threshold Numeric \emph{p}-value threshold for significance.
#' @param indent_groups Logical whether to indent group levels.
#' @param condense_table Logical whether to condense all variable types.
#' @param condense_quantitative Logical whether to condense only continuous/survival.
#' @param zebra_stripes Logical whether to apply alternating row shading.
#' @param dark_header Logical whether to use dark header style.
#' @param bold_variables Logical whether to bold variable names (non-indented rows).
#' @param paper Character string paper size.
#' @param orientation Character string page orientation.
#' @param width Optional numeric table width in inches.
#' @param align Optional alignment specification.
#' @return List with ft (\pkg{flextable} object) and caption components.
#' @keywords internal
process_table_for_flextable <- function(table,
                                        caption = NULL,
                                        font_size = 10,
                                        font_family = "Arial",
                                        format_headers = TRUE,
                                        bold_significant = TRUE,
                                        p_threshold = 0.05,
                                        indent_groups = FALSE,
                                        condense_table = FALSE,
                                        condense_quantitative = FALSE,
                                        zebra_stripes = FALSE,
                                        dark_header = FALSE,
                                        bold_variables = TRUE,
                                        paper = "letter",
                                        orientation = "portrait",
                                        width = NULL,
                                        align = NULL) {
    
    ## Convert to data.table
    df <- data.table::as.data.table(table)
    
    ## Handle N row if present
    has_n_row <- FALSE
    n_row_data <- NULL
    if (nrow(df) > 0 && "Variable" %chin% names(df) && 
        !is.na(df$Variable[1]) && df$Variable[1] == "N") {
        has_n_row <- TRUE
        n_row_data <- df[1, ]
        df <- df[-1, ]
    }
    
    ## Track variable groups BEFORE any transformation
    var_groups <- NULL
    if (zebra_stripes && "Variable" %chin% names(df)) {
        var_groups <- identify_variable_groups(df)
    }
    
    ## Apply condensing if requested
    if (condense_table) {
        indent_groups <- TRUE
        df <- condense_table_rows(df, indent_groups = indent_groups)
        
        ## Update variable groups after condensing
        if (zebra_stripes) {
            var_groups <- identify_variable_groups(df)
        }
        
        df <- format_indented_groups(df, indent_string = "    ")
    } else if (condense_quantitative) {
        ## Only condense continuous/survival variables (not categorical)
        ## Also set indent_groups = TRUE to avoid awkward empty Group column
        indent_groups <- TRUE
        df <- condense_quantitative_rows(df, indent_groups = indent_groups)
        
        ## Update variable groups after condensing
        if (zebra_stripes) {
            var_groups <- identify_variable_groups(df)
        }
        
        df <- format_indented_groups(df, indent_string = "    ")
    } else if (indent_groups) {
        df <- format_indented_groups(df, indent_string = "    ")
    }
    
    ## Replace empty cells with "-" for consistency
    df <- replace_empty_cells(df)
    
    ## Create flextable
    ft <- flextable::flextable(df)
    
    ## Set font
    ft <- flextable::font(ft, fontname = font_family, part = "all")
    ft <- flextable::fontsize(ft, size = font_size, part = "all")
    
    ## Format headers
    if (format_headers) {
        ft <- format_headers_ft(ft, has_n_row, n_row_data)
    }

    ## Apply dark header if requested
    if (dark_header) {
        ft <- flextable::bg(ft, bg = "#000000", part = "header")
        ft <- flextable::color(ft, color = "#FFFFFF", part = "header")
        ft <- flextable::bold(ft, bold = TRUE, part = "header")
    }
    
    ## Bold significant p-values
    if (bold_significant) {
        ft <- bold_pvalues_ft(ft, df, p_threshold)
    }
    
    ## Bold variable names (non-indented rows in Variable column)
    if (bold_variables && "Variable" %chin% names(df)) {
        ## Find rows where Variable is not empty and not indented (doesn't start with spaces)
        var_rows <- which(!is.na(df$Variable) & 
                          df$Variable != "" & 
                          df$Variable != "-" &
                          !grepl("^\\s+", df$Variable))
        if (length(var_rows) > 0) {
            ft <- flextable::bold(ft, i = var_rows, j = "Variable", part = "body")
        }
    }
    
    ## Set alignment
    if (is.null(align)) {
        for (col in names(df)) {
            if (col %chin% c("Variable", "Group")) {
                ft <- flextable::align(ft, j = col, align = "left", part = "all")
            } else {
                ft <- flextable::align(ft, j = col, align = "center", part = "all")
            }
        }
    } else {
        if (length(align) == 1) {
            ft <- flextable::align(ft, align = align, part = "all")
        } else if (length(align) == ncol(df)) {
            for (i in seq_along(align)) {
                ft <- flextable::align(ft, j = i, align = align[i], part = "all")
            }
        }
    }
    
    ## Add borders
    ft <- flextable::border_remove(ft)
    ft <- flextable::hline_top(ft, border = officer::fp_border(width = 2), part = "header")
    ft <- flextable::hline_bottom(ft, border = officer::fp_border(width = 1), part = "header")
    ft <- flextable::hline_bottom(ft, border = officer::fp_border(width = 2), part = "body")
    
    ## Reduce line spacing and padding
    ft <- flextable::line_spacing(ft, space = 1)
    ft <- flextable::padding(ft, i = NULL, j = NULL,
                             padding.top = 1, padding.bottom = 1,
                             padding.left = 1, padding.right = 1)
    
    ## Add zebra stripes by variable group if requested
    if (zebra_stripes && !is.null(var_groups)) {
        ft <- apply_zebra_stripes_ft(ft, df, var_groups)
    }
    
    ## Calculate width based on paper and orientation if not specified
    if (is.null(width)) {
        width <- calculate_table_width(paper, orientation)
    }
    
    ## Set width
    ft <- flextable::width(ft, width = width / ncol(df))
    
    return(list(ft = ft, caption = caption))
}

#' Identify variable groups before indentation
#' 
#' Detects variable group boundaries by finding rows where Variable column
#' is non-empty. Returns row indices for each group for zebra stripe application.
#' 
#' @param df Data.table with Variable column.
#' @return List of integer vectors, each containing row indices for one variable group.
#' @keywords internal
identify_variable_groups <- function(df) {
    if (!"Variable" %in% names(df)) return(NULL)
    
    var_starts <- which(df$Variable != "" & !is.na(df$Variable))
    if (length(var_starts) == 0) return(NULL)
    
    ## Vectorized group creation
    var_ends <- c(var_starts[-1] - 1, nrow(df))
    groups <- mapply(seq, var_starts, var_ends, SIMPLIFY = FALSE)
    
    return(groups)
}

#' Replace empty cells with "-"
#' 
#' Converts empty strings and NA values to "-" for consistent display
#' in exported tables. Preserves Variable column values.
#' 
#' @param df Data.frame or data.table to process.
#' @return Data.table with empty cells replaced by "-".
#' @keywords internal
replace_empty_cells <- function(df) {
    ## Convert to data.table for efficient in-place modification
    dt <- data.table::as.data.table(df)
    
    ## Get columns to process (excluding Variable)
    cols_to_process <- setdiff(names(dt), "Variable")
    
    ## Vectorized replacement using data.table
    for (col in cols_to_process) {
        dt[is.na(get(col)) | get(col) == "", (col) := "-"]
    }
    
    return(data.table::as.data.table(dt))
}

#' Apply zebra stripes with proper variable group detection for indented tables
#' 
#' Applies alternating background colors to variable groups in \pkg{flextable} objects.
#' Handles both indented tables (detects groups by leading whitespace) and
#' non-indented tables (uses pre-identified groups).
#' 
#' @param ft \pkg{flextable} object.
#' @param df The source data.table used to create the flextable.
#' @param var_groups List of row index vectors for variable groups.
#' @return Flextable object with zebra stripe formatting applied.
#' @keywords internal
apply_zebra_stripes_ft <- function(ft, df, var_groups) {
    ## Check if table has been indented (look for leading spaces in Variable column)
    is_indented <- any(grepl("^\\s{2,}", df$Variable))
    
    if (is_indented) {
        ## For indented tables, identify variable groups by non-indented rows
        var_starts <- which(!grepl("^\\s", df$Variable) & df$Variable != "")
        
        for (i in seq_along(var_starts)) {
            start_row <- var_starts[i]
            end_row <- if (i < length(var_starts)) {
                           var_starts[i + 1] - 1
                       } else {
                           nrow(df)
                       }
            
            if (i %% 2 == 1) {  ## Odd variable groups get gray shading
                ft <- flextable::bg(ft, i = start_row:end_row, 
                                    bg = "#EEEEEE", part = "body")
            } else {  ## Even variable groups get white background
                ft <- flextable::bg(ft, i = start_row:end_row, 
                                    bg = "#FFFFFF", part = "body")
            }
        }
    } else if (!is.null(var_groups)) {
        ## Use pre-identified groups for non-indented tables
        for (i in seq_along(var_groups)) {
            rows <- var_groups[[i]]
            rows <- rows[rows <= nrow(df)]
            if (length(rows) > 0) {
                if (i %% 2 == 1) {  ## Odd variable groups get gray shading
                    ft <- flextable::bg(ft, i = rows, bg = "#EEEEEE", part = "body")
                } else {  ## Even variable groups get white background
                    ft <- flextable::bg(ft, i = rows, bg = "#FFFFFF", part = "body")
                }
            }
        }
    } else {
        ## Fallback to row-based striping
        odd_rows <- seq(1, nrow(df), 2)
        even_rows <- seq(2, nrow(df), 2)
        ft <- flextable::bg(ft, i = odd_rows, bg = "#EEEEEE", part = "body")
        ft <- flextable::bg(ft, i = even_rows, bg = "#FFFFFF", part = "body")
    }
    
    return(ft)
}

#' Calculate table width based on paper size and orientation
#' 
#' Computes usable table width in inches based on paper dimensions and
#' orientation, accounting for standard 1-inch margins.
#' 
#' @param paper Character string paper size ("letter", "a4", "legal").
#' @param orientation Character string page orientation ("portrait", "landscape").
#' @return Numeric usable width in inches.
#' @keywords internal
calculate_table_width <- function(paper, orientation) {
    ## Define paper sizes (in inches) with margins
    paper_sizes <- list(
        letter = c(width = 8.5, height = 11),
        a4 = c(width = 8.27, height = 11.69),
        legal = c(width = 8.5, height = 14)
    )
    
    if (!paper %chin% names(paper_sizes)) {
        paper <- "letter"
    }
    
    dims <- paper_sizes[[paper]]
    
    ## Swap for landscape
    if (orientation == "landscape") {
        dims <- c(width = dims["height"], height = dims["width"])
    }
    
    ## Subtract margins (1 inch on each side)
    usable_width <- dims["width"] - 2
    
    return(as.numeric(usable_width))
}

#' Format headers for flextable
#' 
#' Applies formatting to flextable headers including italicizing 'n',
#' adding sample size counts from N row data, and bolding all headers.
#' 
#' @param ft \pkg{flextable} object.
#' @param has_n_row Logical whether source data had an N row.
#' @param n_row_data Data from the N row for adding counts to headers.
#' @return Formatted \pkg{flextable} object.
#' @keywords internal
format_headers_ft <- function(ft, has_n_row, n_row_data) {
    col_names <- names(ft$body$dataset)
    
    for (i in seq_along(col_names)) {
        col <- col_names[i]
        
        ## Skip Variable column for N count addition
        if (col == "Variable") {
            ## Just keep the original label without adding (N = X)
            next
        }
        
        ## Italicize 'n' column header if present
        if (col == "n") {
            ft <- flextable::italic(ft, j = i, part = "header")
        }
        
        ## Add N counts for data columns only (not Variable)
        if (has_n_row && col %chin% names(n_row_data) && col != "Variable") {
            n_val <- n_row_data[[col]]
            if (!is.na(n_val) && n_val != "" && n_val != "0") {
                ## Use compose to italicize just the N
                ft <- flextable::compose(
                    ft, 
                    j = i, 
                    part = "header",
                    value = flextable::as_paragraph(
                        col, "\n(", 
                        flextable::as_i("N"), 
                        " = ", n_val, ")"
                    )
                )
            }
        }
    }
    
    ## Bold all headers
    ft <- flextable::bold(ft, part = "header")
    
    return(ft)
}

#' Bold significant \emph{p}-values in DOCX
#' 
#' Applies bold formatting to significant \emph{p}-values in \pkg{flextable} objects
#' by detecting values below threshold or "< 0.001" patterns.
#' 
#' @param ft \pkg{flextable} object.
#' @param df The source data.table.
#' @param p_threshold Numeric \emph{p}-value threshold for significance.
#' @return Flextable object with significant \emph{p}-values bolded.
#' @keywords internal
bold_pvalues_ft <- function(ft, df, p_threshold = 0.05) {
    p_cols <- grep("p-value|p value|Uni p|Multi p|pvalue", names(df), 
                   ignore.case = TRUE, value = TRUE)
    
    if (length(p_cols) == 0) return(ft)
    
    for (p_col in p_cols) {
        if (p_col %in% names(df)) {
            vals <- df[[p_col]]
            
            ## Significance check
            is_very_small <- grepl("^<\\s*0\\.001", vals)
            p_numeric <- suppressWarnings(as.numeric(gsub("[^0-9.]", "", vals)))
            is_small_numeric <- !is.na(p_numeric) & p_numeric < p_threshold
            
            sig_rows <- which((is_very_small | is_small_numeric) & 
                              vals != "" & !is.na(vals))
            
            ## Bulk bold operation
            if (length(sig_rows) > 0) {
                ft <- flextable::bold(ft, i = sig_rows, j = p_col, part = "body")
            }
        }
    }
    
    return(ft)
}

Try the summata package in your browser

Any scripts or data that you put into this service are public.

summata documentation built on May 7, 2026, 5:07 p.m.