R/utils.R

Defines functions create_categorical_rows create_numeric_row format_effect_size format_statistic get_stars format_p_value summarize_categorical test_categorical_variable summarize_numeric test_numeric_variable prepare_table1_data validate_table1_inputs build_partial_caption handle_missing_data prepare_variables validate_partial_inputs residualize_variables build_caption validate_inputs apply_triangle classify_variables assemble_matrix format_chisq format_f format_t format_r add_stars compute_chisq compute_anova compute_ttests compute_correlations

## INTERNAL FUNCTIONS ----

## correltable ----

#' @keywords internal
#' @noRd
compute_correlations <- function(data, method = "pearson", use = "pairwise") {
  r <- stats::cor(data, use = use, method = method)
  n <- crossprod(!is.na(data))
  t <- (r * sqrt(n - 2)) / sqrt(1 - r^2)
  p <- 2 * pt(abs(t), n - 2, lower.tail = FALSE)
  p[p > 1] <- 1
  list(r = r, p = p, n = n)
}

#' Compute t-tests between numeric and binary factor
#' @keywords internal
#' @noRd
compute_ttests <- function(data, num_vars, factor_vars) {
  results <- expand.grid(num = num_vars, fac = factor_vars,
                         stringsAsFactors = FALSE)

  results$statistic <- mapply(function(n, f) {
    tryCatch(
      t.test(data[[n]] ~ data[[f]])$statistic,
      error = function(e) NA_real_
    )
  }, results$num, results$fac)

  results$p_value <- mapply(function(n, f) {
    tryCatch(
      t.test(data[[n]] ~ data[[f]])$p.value,
      error = function(e) NA_real_
    )
  }, results$num, results$fac)
  results
}

#' Compute ANOVA F-tests between numeric and multi-level factor
#' @keywords internal
#' @noRd
compute_anova <- function(data, num_vars, factor_vars) {
  results <- expand.grid(num = num_vars, fac = factor_vars,
                         stringsAsFactors = FALSE)

  results$statistic <- mapply(function(n, f) {
    tryCatch({
      aov_result <- aov(data[[n]] ~ data[[f]])
      summary(aov_result)[[1]]$"F value"[1]
    }, error = function(e) NA_real_)
  }, results$num, results$fac)

  results$p_value <- mapply(function(n, f) {
    tryCatch({
      aov_result <- aov(data[[n]] ~ data[[f]])
      summary(aov_result)[[1]]$"Pr(>F)"[1]
    }, error = function(e) NA_real_)
  }, results$num, results$fac)

  results
}

#' Compute chi-squared tests between factors
#' @keywords internal
#' @noRd
compute_chisq <- function(data, factor_vars) {
  # Only compute upper triangle (no self-comparisons)
  pairs <- utils::combn(factor_vars, 2, simplify = FALSE)

  results <- data.frame(
    var1 = sapply(pairs, `[`, 1),
    var2 = sapply(pairs, `[`, 2),
    stringsAsFactors = FALSE
  )

  results$statistic <- mapply(function(v1, v2) {
    tryCatch(
      chisq.test(data[[v1]], data[[v2]])$statistic,
      error = function(e) NA_real_
    )
  }, results$var1, results$var2)

  results$p_value <- mapply(function(v1, v2) {
    tryCatch(
      chisq.test(data[[v1]], data[[v2]])$p.value,
      error = function(e) NA_real_
    )
  }, results$var1, results$var2)

  results
}

#' Add significance stars to p-values
#' @keywords internal
#' @noRd
add_stars <- function(p, alpha = c(0.001, 0.01, 0.05)) {
  ifelse(p < alpha[1], "***",
         ifelse(p < alpha[2], "**",
                ifelse(p < alpha[3], "*", "")))
}

#' Format correlation coefficient
#' @keywords internal
#' @noRd
format_r <- function(r, p, round_n = 2) {
  r_fmt <- sub("0\\.", ".", format(round(r, round_n), nsmall = round_n))
  paste0(r_fmt, add_stars(p))
}

#' Format t-statistic
#' @keywords internal
#' @noRd
format_t <- function(t_stat, p, round_n = 2) {
  t_fmt <- format(round(t_stat, round_n), nsmall = round_n)
  paste0("t=", t_fmt, add_stars(p))
}

#' Format F-statistic
#' @keywords internal
#' @noRd
format_f <- function(f_stat, p, round_n = 2) {
  f_fmt <- format(round(f_stat, round_n), nsmall = round_n)
  paste0("F=", f_fmt, add_stars(p))
}

#' Format chi-squared statistic
#' @keywords internal
#' @noRd
format_chisq <- function(chi_stat, p, round_n = 2) {
  chi_fmt <- format(round(chi_stat, round_n), nsmall = round_n)
  paste0("X2", chi_fmt, add_stars(p))
}

#' Assemble complete statistics matrix
#' @keywords internal
#' @noRd
assemble_matrix <- function(data, vars, method = "pearson",
                            use = "pairwise", round_n = 2) {
  # Classify variables
  var_types <- classify_variables(data, vars)

  # Initialize matrix
  n_vars <- length(vars)
  stat_matrix <- matrix("", nrow = n_vars, ncol = n_vars)
  rownames(stat_matrix) <- vars
  colnames(stat_matrix) <- vars
  diag(stat_matrix) <- "-"

  # Correlations for numeric variables
  if (length(var_types$numeric) > 0) {
    cor_data <- data[, var_types$numeric, drop = FALSE]
    cor_results <- compute_correlations(cor_data, method, use)

    for (i in seq_along(var_types$numeric)) {
      for (j in seq_along(var_types$numeric)) {
        if (i != j) {
          stat_matrix[var_types$numeric[i], var_types$numeric[j]] <-
            format_r(cor_results$r[i, j], cor_results$p[i, j], round_n)
        }
      }
    }
  }

  # T-tests for numeric vs binary factors
  if (length(var_types$numeric) > 0 && length(var_types$binary_factor) > 0) {
    t_results <- compute_ttests(data, var_types$numeric, var_types$binary_factor)

    for (i in seq_len(nrow(t_results))) {
      stat_matrix[t_results$num[i], t_results$fac[i]] <-
        format_t(t_results$statistic[i], t_results$p_value[i], round_n)
      stat_matrix[t_results$fac[i], t_results$num[i]] <-
        stat_matrix[t_results$num[i], t_results$fac[i]]
    }
  }

  # ANOVA for numeric vs multi-level factors
  if (length(var_types$numeric) > 0 && length(var_types$multi_factor) > 0) {
    f_results <- compute_anova(data, var_types$numeric, var_types$multi_factor)

    for (i in seq_len(nrow(f_results))) {
      stat_matrix[f_results$num[i], f_results$fac[i]] <-
        format_f(f_results$statistic[i], f_results$p_value[i], round_n)
      stat_matrix[f_results$fac[i], f_results$num[i]] <-
        stat_matrix[f_results$num[i], f_results$fac[i]]
    }
  }

  # Chi-squared for factor vs factor
  all_factors <- c(var_types$binary_factor, var_types$multi_factor)
  if (length(all_factors) > 1) {
    chi_results <- compute_chisq(data, all_factors)

    for (i in seq_len(nrow(chi_results))) {
      val <- format_chisq(chi_results$statistic[i], chi_results$p_value[i], round_n)
      stat_matrix[chi_results$var1[i], chi_results$var2[i]] <- val
      stat_matrix[chi_results$var2[i], chi_results$var1[i]] <- val
    }
  }

  stat_matrix
}

#' Classify variables by type
#' @keywords internal
#' @noRd
classify_variables <- function(data, vars) {
  numeric_vars <- character(0)
  binary_factor_vars <- character(0)
  multi_factor_vars <- character(0)

  for (v in vars) {
    if (is.numeric(data[[v]])) {
      numeric_vars <- c(numeric_vars, v)
    } else {
      # Convert to factor if needed
      if (!is.factor(data[[v]])) {
        data[[v]] <- factor(data[[v]])
      }

      n_levels <- nlevels(data[[v]])
      if (n_levels == 2) {
        binary_factor_vars <- c(binary_factor_vars, v)
      } else if (n_levels > 2) {
        multi_factor_vars <- c(multi_factor_vars, v)
      }
    }
  }

  list(
    numeric = numeric_vars,
    binary_factor = binary_factor_vars,
    multi_factor = multi_factor_vars
  )
}

#' Apply triangle selection to matrix
#' @keywords internal
#' @noRd
apply_triangle <- function(mat, tri = "upper", cutempty = FALSE) {
  if (tri == "upper") {
    mat[lower.tri(mat, diag = TRUE)] <- ""
    if (cutempty) {
      mat <- mat[-nrow(mat), -1, drop = FALSE]
    }
  } else if (tri == "lower") {
    mat[upper.tri(mat, diag = TRUE)] <- ""
    if (cutempty) {
      mat <- mat[-1, -ncol(mat), drop = FALSE]
    }
  }
  # tri == "all" returns unchanged

  mat
}

#' Validate inputs
#' @keywords internal
#' @noRd
validate_inputs <- function(data, vars, vars2, strata) {
  # Check data is data.frame
  if (!is.data.frame(data)) {
    stop("data must be a data.frame", call. = FALSE)
  }

  # Check vars exist
  if (!is.null(vars) && !all(vars %in% names(data))) {
    missing <- setdiff(vars, names(data))
    stop("Variables not found in data: ", paste(missing, collapse = ", "),
         call. = FALSE)
  }

  if (!is.null(vars2) && !all(vars2 %in% names(data))) {
    missing <- setdiff(vars2, names(data))
    stop("vars2 not found in data: ", paste(missing, collapse = ", "),
         call. = FALSE)
  }

  # Check strata
  if (!is.null(strata)) {
    if (length(strata) != 1 || !strata %in% names(data)) {
      stop("strata must be a single variable name in data", call. = FALSE)
    }

    strata_levels <- unique(na.omit(data[[strata]]))
    if (length(strata_levels) != 2) {
      stop("strata variable must have exactly 2 levels", call. = FALSE)
    }

    if (min(table(data[[strata]])) < 3) {
      stop("each strata level must have at least 3 observations", call. = FALSE)
    }

    if (!is.null(vars2)) {
      stop("cannot use strata with vars2", call. = FALSE)
    }
  }

  invisible(TRUE)
}


#' Build table caption
#' @keywords internal
#' @noRd
build_caption <- function(method, use, n_obs, n_missing, strata,
                          data, all_vars) {

  # Missing data info
  if (use == "complete") {
    missing_text <- sprintf("list-wise deletion (N=%d%s)",
                            n_obs,
                            if (n_missing > 0) sprintf(", %d cases deleted", n_missing) else "")
  } else {
    missing_text <- "pairwise deletion"
  }

  # Strata info
  strata_text <- if (!is.null(strata)) {
    strata_levels <- sort(unique(na.omit(data[[strata]])))
    sprintf(" Data stratified by %s (upper = %s; lower = %s).",
            strata, strata_levels[1], strata_levels[2])
  } else {
    ""
  }

  sprintf(
    "Note. %s correlations with %s.%s * p<.05, ** p<.01, *** p<.001",
    tools::toTitleCase(method),
    missing_text,
    strata_text
  )
}





### partial_correltable ----
#' @keywords internal
#' @noRd
residualize_variables <- function(data, vars, partialvars) {
  # Create output data frame
  resid_data <- data.frame(matrix(NA, nrow = nrow(data), ncol = length(vars)))
  names(resid_data) <- vars

  # Residualize each variable
  for (v in vars) {
    formula_str <- paste0("scale(", v, ") ~ ", paste(partialvars, collapse = " + "))
    model_formula <- as.formula(formula_str)

    tryCatch({
      fit <- lm(model_formula, data = data, na.action = stats::na.exclude)
      resid_data[[v]] <- resid(fit)
    }, error = function(e) {
      warning("Failed to residualize ", v, ": ", e$message, call. = FALSE)
      resid_data[[v]] <- NA
    })
  }

  resid_data
}

#' Validate partial correlation inputs
#' @keywords internal
#' @noRd
validate_partial_inputs <- function(data, vars, var_names,
                                    partialvars, partialvar_names) {
  # Check data
  if (!is.data.frame(data)) {
    stop("data must be a data.frame", call. = FALSE)
  }

  # Check minimum requirements
  if (is.null(vars) || length(vars) < 2) {
    stop("must specify at least 2 variables in vars", call. = FALSE)
  }

  if (is.null(partialvars) || length(partialvars) < 1) {
    stop("must specify at least 1 variable in partialvars", call. = FALSE)
  }

  # Check variables exist
  if (!all(vars %in% names(data))) {
    missing <- setdiff(vars, names(data))
    stop("Variables not found in data: ", paste(missing, collapse = ", "),
         call. = FALSE)
  }

  if (!all(partialvars %in% names(data))) {
    missing <- setdiff(partialvars, names(data))
    stop("Partial variables not found in data: ", paste(missing, collapse = ", "),
         call. = FALSE)
  }

  # Check name lengths
  if (length(var_names) != length(vars)) {
    stop("length of var_names must match length of vars", call. = FALSE)
  }

  if (length(partialvar_names) != length(partialvars)) {
    stop("length of partialvar_names must match length of partialvars",
         call. = FALSE)
  }

  invisible(TRUE)
}

#' Clean and prepare variables
#' @keywords internal
#' @noRd
prepare_variables <- function(data, vars, var_names,
                              partialvars, partialvar_names) {

  # Remove duplicates from vars
  if (anyDuplicated(vars)) {
    keep <- !duplicated(vars)
    vars <- vars[keep]
    var_names <- var_names[keep]
    warning("Removed duplicate entries from vars", call. = FALSE)
  }

  # Remove duplicates from partialvars
  if (anyDuplicated(partialvars)) {
    keep <- !duplicated(partialvars)
    partialvars <- partialvars[keep]
    partialvar_names <- partialvar_names[keep]
    warning("Removed duplicate entries from partialvars", call. = FALSE)
  }

  # Check for numeric vars only
  non_numeric_vars <- vars[!sapply(data[vars], is.numeric)]

  if (length(non_numeric_vars) > 0) {
    warning("Dropping non-numeric vars: ",
            paste(non_numeric_vars, collapse = ", "),
            call. = FALSE)

    # Remove non-numeric vars
    keep <- vars %in% setdiff(vars, non_numeric_vars)
    vars <- vars[keep]
    var_names <- var_names[keep]
  }

  # Check if vars still has at least 2 variables
  if (length(vars) < 2) {
    stop("At least 2 numeric variables required in vars after filtering",
         call. = FALSE)
  }

  # Check for overlap between vars and partialvars
  overlap <- intersect(vars, partialvars)

  if (length(overlap) > 0) {
    warning("Variables in both vars and partialvars removed from partialvars: ",
            paste(overlap, collapse = ", "),
            call. = FALSE)

    # Remove overlap from partialvars
    keep <- !(partialvars %in% overlap)
    partialvars <- partialvars[keep]
    partialvar_names <- partialvar_names[keep]
  }

  # Check if partialvars still has at least 1 variable
  if (length(partialvars) < 1) {
    stop("At least 1 variable required in partialvars after filtering",
         call. = FALSE)
  }

  list(
    vars = vars,
    var_names = var_names,
    partialvars = partialvars,
    partialvar_names = partialvar_names
  )
}

#' Handle missing data for partial correlations
#' @keywords internal
#' @noRd
handle_missing_data <- function(data, vars, partialvars, use = "pairwise") {
  all_vars <- c(vars, partialvars)
  data_subset <- data[, all_vars, drop = FALSE]

  if (use == "complete") {
    # Complete case analysis: remove any row with ANY missing value
    complete_rows <- complete.cases(data_subset)
    n_excluded <- sum(!complete_rows)

    data_clean <- data_subset[complete_rows, , drop = FALSE]

    info <- list(
      data = data_clean,
      n_excluded = n_excluded,
      exclusion_reason = if (n_excluded > 0) {
        sprintf("N=%d cases excluded due to missing data", n_excluded)
      } else {
        NULL
      }
    )

  } else {
    # Pairwise: require complete partial variables
    # (correlations can still be pairwise on vars)
    complete_partial <- complete.cases(data_subset[, partialvars, drop = FALSE])
    n_excluded <- sum(!complete_partial)

    data_clean <- data_subset[complete_partial, , drop = FALSE]

    info <- list(
      data = data_clean,
      n_excluded = n_excluded,
      exclusion_reason = if (n_excluded > 0) {
        sprintf("N=%d cases excluded for missing covariates", n_excluded)
      } else {
        NULL
      }
    )
  }

  info
}


#' Build caption for partial correlation table
#' @keywords internal
#' @noRd
build_partial_caption <- function(base_caption, partialvar_names,
                                  exclusion_reason = NULL) {

  # Replace "correlation coefficients" with "partial correlation coefficients"
  caption <- sub(
    "correlation coefficients",
    paste0("partial correlation coefficients controlling for ",
           paste(partialvar_names, collapse = ", ")),
    base_caption
  )

  # Add exclusion information if present
  if (!is.null(exclusion_reason)) {
    caption <- sub(
      "\\* p<\\.05",
      paste0(exclusion_reason, ". * p<.05"),
      caption
    )
  }

  caption
}



### FullTable1 ----
#' @keywords internal
#' @noRd
validate_table1_inputs <- function(data, vars, var_names, strata) {
  if (!is.data.frame(data)) {
    stop("data must be a data.frame", call. = FALSE)
  }

  if (!is.null(vars) && !all(vars %in% names(data))) {
    missing <- setdiff(vars, names(data))
    stop("Variables not found in data: ", paste(missing, collapse = ", "),
         call. = FALSE)
  }

  if (!is.null(strata) && !strata %in% names(data)) {
    stop("Strata variable '", strata, "' not found in data", call. = FALSE)
  }

  # Check strata has reasonable number of levels
  if (!is.null(strata)) {
    strata_unique <- length(unique(na.omit(data[[strata]])))

    # Check if truly continuous (many unique values)
    if (is.numeric(data[[strata]]) && strata_unique > 10) {
      stop("Strata variable '", strata, "' appears to be continuous (",
           strata_unique, " unique values). ",
           "Table 1 requires a categorical grouping variable. ",
           "Consider: (1) creating groups with cut() or (2) using a different grouping variable.",
           call. = FALSE)
    }

    # Warn if many levels (likely still problematic)
    if (strata_unique > 10) {
      warning("Strata variable '", strata, "' has ", strata_unique,
              " levels. Tables with >10 groups may be difficult to interpret.",
              call. = FALSE)
    }

    # Check minimum observations per group
    strata_counts <- table(data[[strata]], useNA = "no")
    if (any(strata_counts < 2)) {
      small_groups <- names(strata_counts[strata_counts < 2])
      warning("Strata groups with <2 observations: ",
              paste(small_groups, collapse = ", "),
              ". Statistical tests may fail.",
              call. = FALSE)
    }
  }

  if (length(var_names) != length(vars)) {
    stop("length of var_names must match length of vars", call. = FALSE)
  }

  invisible(TRUE)
}

#' Prepare data for Table 1
#' @keywords internal
#' @noRd
prepare_table1_data <- function(data, vars, strata, factor_vars) {
  # Convert to data frame (not tibble for consistency)
  data <- as.data.frame(data)

  # Handle NULL strata - create dummy column
  if (is.null(strata)) {
    strata <- ".dummy_strata"
    data[[strata]] <- factor("Sample")
  }

  # Handle NULL vars - use all except strata
  if (is.null(vars)) {
    vars <- setdiff(names(data), strata)
  }

  # Subset to relevant columns
  all_cols <- unique(c(vars, strata))
  data_subset <- data[, all_cols, drop = FALSE]

  # Convert specified factors
  if (!is.null(factor_vars)) {
    for (fv in factor_vars) {
      if (fv %in% names(data_subset)) {
        data_subset[[fv]] <- factor(data_subset[[fv]])
      }
    }
  }

  # Convert strata to factor
  data_subset[[strata]] <- factor(data_subset[[strata]])

  # Auto-convert other types to factor with warnings
  conversions <- list()

  for (v in vars) {
    original_class <- class(data_subset[[v]])[1]

    if (is.character(data_subset[[v]])) {
      data_subset[[v]] <- factor(data_subset[[v]])
      conversions$character <- c(conversions$character, v)
    } else if (is.logical(data_subset[[v]])) {
      data_subset[[v]] <- factor(data_subset[[v]])
      conversions$logical <- c(conversions$logical, v)
    } else if (is.ordered(data_subset[[v]])) {
      data_subset[[v]] <- factor(data_subset[[v]], ordered = FALSE)
      conversions$ordered <- c(conversions$ordered, v)
    } else if (!is.factor(data_subset[[v]]) &&
               !is.numeric(data_subset[[v]]) &&
               length(unique(na.omit(data_subset[[v]]))) == 2) {
      data_subset[[v]] <- factor(data_subset[[v]])
      conversions$binary <- c(conversions$binary, v)
    }
  }

  # Issue warnings for conversions
  if (length(conversions$character) > 0) {
    warning("Character variables converted to factor: ",
            paste(conversions$character, collapse = ", "), call. = FALSE)
  }
  if (length(conversions$logical) > 0) {
    warning("Logical variables converted to factor: ",
            paste(conversions$logical, collapse = ", "), call. = FALSE)
  }
  if (length(conversions$ordered) > 0) {
    warning("Ordered variables converted to unordered factor: ",
            paste(conversions$ordered, collapse = ", "), call. = FALSE)
  }
  if (length(conversions$binary) > 0) {
    warning("Binary variables converted to factor: ",
            paste(conversions$binary, collapse = ", "), call. = FALSE)
  }

  # Handle missing strata
  n_missing_strata <- sum(is.na(data_subset[[strata]]))
  if (n_missing_strata > 0 && strata != ".dummy_strata") {
    warning("N=", n_missing_strata, " excluded for missing group variable: ",
            strata, call. = FALSE)
    data_subset <- data_subset[!is.na(data_subset[[strata]]), , drop = FALSE]
  }

  # Classify variables
  factor_vars_final <- names(data_subset)[sapply(data_subset, is.factor)]
  factor_vars_final <- setdiff(factor_vars_final, strata)

  list(
    data = data_subset,
    strata = strata,
    vars = vars,
    factor_vars = factor_vars_final,
    n_missing_strata = n_missing_strata
  )
}



#' Test numeric variable across groups
#' @keywords internal
#' @noRd
test_numeric_variable <- function(y, x, round_n = 2) {
  # Remove NA values
  complete <- !is.na(y) & !is.na(x)
  y <- y[complete]
  x <- droplevels(x[complete])

  n_groups <- length(unique(x))

  if (n_groups < 2) {
    return(list(statistic = NA, p_value = NA, effect_size = NA))
  }

  if (n_groups == 2) {
    # T-test
    test_result <- tryCatch(
      t.test(y ~ x),
      error = function(e) NULL
    )

    if (is.null(test_result)) {
      return(list(statistic = NA, p_value = NA, effect_size = NA))
    }

    # Cohen's d with pooled standard deviation
    means <- tapply(y, x, mean, na.rm = TRUE)
    sds <- tapply(y, x, sd, na.rm = TRUE)
    # Pooled SD = sqrt((sd1^2 + sd2^2) / 2)
    pooled_sd <- sqrt((sds[1]^2 + sds[2]^2) / 2)
    cohens_d <- (means[2] - means[1]) / pooled_sd

    list(
      statistic = -1 * test_result$statistic,  # Flip for g2 > g1
      p_value = test_result$p.value,
      effect_size = cohens_d,
      test_type = "t",
      es_type = "d"
    )

  } else {
    # ANOVA
    test_result <- tryCatch(
      aov(y ~ x),
      error = function(e) NULL
    )

    if (is.null(test_result)) {
      return(list(statistic = NA, p_value = NA, effect_size = NA))
    }

    aov_summary <- summary(test_result)[[1]]
    f_stat <- aov_summary[["F value"]][1]
    p_val <- aov_summary[["Pr(>F)"]][1]

    # Partial eta-squared
    ss_effect <- aov_summary[["Sum Sq"]][1]
    ss_total <- sum(aov_summary[["Sum Sq"]])
    eta_sq <- ss_effect / ss_total

    list(
      statistic = f_stat,
      p_value = p_val,
      effect_size = eta_sq,
      test_type = "F",
      es_type = "n2"
    )
  }
}

#' Summarize numeric variable by group
#' @keywords internal
#' @noRd
summarize_numeric <- function(y, x, round_n = 2) {
  result <- tapply(y, x, function(vals) {
    m <- mean(vals, na.rm = TRUE)
    s <- sd(vals, na.rm = TRUE)
    sprintf("%s (%s)",
            format(round(m, round_n), nsmall = round_n),
            format(round(s, round_n), nsmall = round_n))
  })

  as.character(result)
}



#' Test categorical variable across groups
#' @keywords internal
#' @noRd
test_categorical_variable <- function(y, x, round_n = 2) {
  # Remove NA values
  complete <- !is.na(y) & !is.na(x)
  y <- droplevels(y[complete])
  x <- droplevels(x[complete])

  n_groups <- length(unique(x))

  if (n_groups < 2) {
    return(list(statistic = NA, p_value = NA, effect_size = NA))
  }

  # Chi-squared test
  test_result <- tryCatch(
    chisq.test(y, x),
    error = function(e) NULL
  )

  if (is.null(test_result)) {
    return(list(statistic = NA, p_value = NA, effect_size = NA))
  }

  # Effect size depends on variable structure
  if (nlevels(y) == 2 && n_groups == 2) {
    # Odds ratio for 2x2 table
    fisher_result <- tryCatch(
      fisher.test(x, y),
      error = function(e) NULL
    )

    effect_size <- if (!is.null(fisher_result)) fisher_result$estimate else NA
    es_type <- "OR"

  } else {
    # Cramer's V for larger tables
    n <- sum(complete.cases(cbind(y, x)))
    cramers_v <- sqrt(test_result$statistic / (n * test_result$parameter))
    effect_size <- cramers_v
    es_type <- "V"
  }

  list(
    statistic = test_result$statistic,
    p_value = test_result$p.value,
    effect_size = effect_size,
    test_type = "X2",
    es_type = es_type
  )
}

#' Summarize categorical variable by group
#' @keywords internal
#' @noRd
summarize_categorical <- function(y, x, round_n = 2) {
  # Create frequency table
  tab <- table(x, y, useNA = "no")

  # Calculate percentages within each group
  result <- apply(tab, 1, function(row) {
    total <- sum(row)
    if (total == 0) return(rep("-", length(row)))

    pcts <- 100 * row / total
    sprintf("%d (%s%%)", row, format(round(pcts, round_n), nsmall = round_n))
  })

  # Transpose to get groups as columns
  if (is.matrix(result)) {
    result <- t(result)
  }

  result
}



#' Format p-value
#' @keywords internal
#' @noRd
format_p_value <- function(p) {
  if (is.na(p)) return("-")

  if (p < 0.001) {
    "<.001"
  } else if (p < 0.01) {
    sub("0\\.", ".", format(round(p, 3), nsmall = 3))
  } else {
    sub("0\\.", ".", format(round(p, 2), nsmall = 2))
  }
}

#' Add significance stars
#' @keywords internal
#' @noRd
get_stars <- function(p) {
  if (is.na(p)) return("")

  if (p < 0.001) "***"
  else if (p < 0.01) "**"
  else if (p < 0.05) "*"
  else ""
}

#' Format test statistic
#' @keywords internal
#' @noRd
format_statistic <- function(stat, test_type, round_n = 2, include_label = FALSE) {
  if (is.na(stat)) return("-")

  label <- if (include_label) paste0(test_type, "=") else ""
  paste0(label, format(round(stat, round_n), nsmall = round_n))
}

#' Format effect size
#' @keywords internal
#' @noRd
format_effect_size <- function(es, es_type, round_n = 2, include_label = FALSE) {
  if (is.na(es)) return("-")

  label <- if (include_label) paste0(es_type, "=") else ""
  paste0(label, format(round(es, round_n), nsmall = round_n))
}


#' Create row for numeric variable
#' @keywords internal
#' @noRd
create_numeric_row <- function(var_name, y, x, round_n, include_tests) {
  # Summary statistics
  summaries <- summarize_numeric(y, x, round_n)

  # Statistical test (if multiple groups)
  if (include_tests) {
    test_results <- test_numeric_variable(y, x, round_n)
  } else {
    test_results <- list(statistic = NA, p_value = NA, effect_size = NA,
                         test_type = "", es_type = "")
  }

  # Build row
  row <- c(
    Variable = var_name,
    summaries,
    Stat = as.character(test_results$statistic),
    p = as.character(test_results$p_value),
    sig = get_stars(test_results$p_value),
    es = as.character(test_results$effect_size)
  )

  # Store metadata as list element instead of attributes
  list(
    row = row,
    test_type = test_results$test_type,
    es_type = test_results$es_type,
    is_numeric = TRUE
  )
}

#' Create rows for categorical variable
#' @keywords internal
#' @noRd
create_categorical_rows <- function(var_name, y, x, round_n, include_tests) {
  y_levels <- levels(y)

  # Summary statistics
  summaries <- summarize_categorical(y, x, round_n)

  # Statistical test (if multiple groups)
  if (include_tests) {
    test_results <- test_categorical_variable(y, x, round_n)
  } else {
    test_results <- list(statistic = NA, p_value = NA, effect_size = NA,
                         test_type = "", es_type = "")
  }

  # Handle binary vs multi-level factors differently
  if (length(y_levels) == 2) {
    # Binary: single row showing second level
    row <- c(
      Variable = paste0(var_name, " (", y_levels[2], ")"),
      summaries[, 2],  # Second level counts
      Stat = as.character(test_results$statistic),
      p = as.character(test_results$p_value),
      sig = get_stars(test_results$p_value),
      es = as.character(test_results$effect_size)
    )

    list(list(
      row = row,
      test_type = test_results$test_type,
      es_type = test_results$es_type,
      is_numeric = FALSE
    ))

  } else {
    # Multi-level: header row + level rows
    rows <- vector("list", length(y_levels) + 1)

    # Header row
    rows[[1]] <- list(
      row = c(
        Variable = var_name,
        rep("-", ncol(summaries)),
        Stat = as.character(test_results$statistic),
        p = as.character(test_results$p_value),
        sig = get_stars(test_results$p_value),
        es = as.character(test_results$effect_size)
      ),
      test_type = test_results$test_type,
      es_type = test_results$es_type,
      is_numeric = FALSE
    )

    # Level rows
    for (i in seq_along(y_levels)) {
      rows[[i + 1]] <- list(
        row = c(
          Variable = paste0("  ", y_levels[i]),
          summaries[, i],
          Stat = "-",
          p = "-",
          sig = "",
          es = "-"
        ),
        test_type = "",
        es_type = "",
        is_numeric = FALSE
      )
    }

    rows
  }
}

Try the scipub package in your browser

Any scripts or data that you put into this service are public.

scipub documentation built on Jan. 10, 2026, 5:07 p.m.