R/table2.R
In statuser: Statistical Tools Designed for End Users

Documented in print.table2 table2

#' Enhanced alternative to table() 
#'
#'  The function \code{\link[base]{table}} does not show variable 
#'  names when tabulating from a dataframe, requires running another
#'  function, \code{\link[base]{prop.table}},  to tabulate proportions 
#'  and yet another function, \code{\link[stats]{chisq.test}} to test difference of 
#'  proportions.  \code{table2} does what those three functions do, producing easier to 
#'  read output, and always shows variable names. 
#' @param ... same arguments as \code{\link[base]{table}}, plus the arguments shown below
#' @param prop report a table with:
#'   \itemize{
#'     \item \code{prop="all"}: Proportions for full table (each cell / total)
#'     \item \code{prop="row"}: Proportions by row  ('rows' also accepted)
#'     \item \code{prop="col"}: Proportions by columns ('cols', 'column', 'columns' also accepted)
#'   }
#' @param digits Number of decimal values to show for proportions 
#' @param chi Logical. If \code{TRUE}, performs a chi-square test on frequency table,
#' reports results in APA format
#' @param correct Logical. If \code{TRUE}, applies Yates' continuity correction 
#' for 2x2 tables in the chi-square test. Default is \code{FALSE} (no correction).
#' 
#'
#' @return A list (object of class "table2") with the following components:
#'   \itemize{
#'     \item \code{freq}: frequency table 
#'     \item \code{prop}: proportions table 
#'     \item \code{chisq}: chi-square test  
#'   }
#'
#' @examples
#' # Create example data
#' df <- data.frame(
#'   group = c("A", "A", "B", "B", "A"),
#'   status = c("X", "Y", "X", "Y", "X")
#' )
#'
#' # Enhanced table with variable names (2 variables)
#' table2(df$group, df$status)
#'
#' # Enhanced table with variable names (3 variables)
#' df3 <- data.frame(
#'   x = c("A", "A", "B", "B"),
#'   y = c("X", "Y", "X", "Y"),
#'   z = c("high", "low", "high", "low")
#' )
#' table2(df3$x, df3$y, df3$z)
#'
#' # Table with proportions
#' table2(df$group, df$status, prop = 'all')  # Overall proportions
#' table2(df$group, df$status, prop = 'row')  # Row proportions
#' table2(df$group, df$status, prop = 'col')  # Column proportions
#'
#' # Table with chi-square test
#' table2(df$group, df$status, chi = TRUE,prop='all')
#'
#' @usage NULL
#' @export
table2 <- function(..., data = NULL, exclude = if (useNA == "no") c(NA, NaN), 
                  useNA = c("no", "ifany", "always"), 
                  dnn = NULL, deparse.level = 1, prop = NULL, digits = 3, 
                  chi = FALSE, correct = FALSE) {
  
  # FUNCTION OUTLINE:
  # 1. Validate and process useNA and exclude arguments
  # 2. Capture function call expressions to extract variable names
  # 3. Call base::table() to create the contingency table
  # 4. Extract variable names from dataframe column references (df$var, df[["var"]], etc.)
  # 5. FORMAT HEADERS: Set variable names as dimension names (appear in margins when printed)
  # 6. FORMAT HEADERS: Keep row/column labels as values only (not "var=value")
  # 7. Handle proportion calculations if prop argument is provided
  # 8. Add marginal totals for proportion tables (Total rows/columns)
  # 9. Return the enhanced table object
  
  # TASK 1: Validate and process useNA and exclude arguments
  # Match useNA argument to handle default properly
  useNA <- match.arg(useNA)
  
  # Set exclude default based on useNA
  if (missing(exclude)) {
    exclude <- if (useNA == "no") c(NA, NaN) else NULL
  }
  
  # TASK 2: Validate inputs and handle data argument
  validated <- validate_table2(..., data = data, func_name = "table2")
  dots <- validated$dots
  dot_expressions <- validated$dot_expressions
  data_name <- validated$data_name
  
  # Track missing values for each variable
  var_lengths <- integer(length(dots))
  var_na_counts <- integer(length(dots))
  for (i in seq_along(dots)) {
    var_lengths[i] <- length(dots[[i]])
    var_na_counts[i] <- sum(is.na(dots[[i]]))
  }
  
  # Initialize return list elements
  freq <- NULL
  prop_out <- NULL
  chisq <- NULL
  
  # TASK 3: Call base table function
  # Let base::table handle dnn default (it uses list.names internally)
  if (is.null(dnn)) {
    result <- do.call(base::table, c(dots, list(exclude = exclude, useNA = useNA, 
                          deparse.level = deparse.level)))
  } else {
    result <- do.call(base::table, c(dots, list(exclude = exclude, useNA = useNA, 
                         dnn = dnn, deparse.level = deparse.level)))
  }
  
  # Store frequency table
  freq <- result
  
  # Helper function to extract variable name from an expression
  extract_var_name <- function(expr) {
    var_name <- ""
    # Check if it's a symbol (variable name) - works with or without data argument
    if (is.symbol(expr) || is.name(expr)) {
      var_name <- as.character(expr)
      return(var_name)
    }
    
    # Check if it's a dataframe column reference: df$var
    if (is.call(expr) && length(expr) >= 3) {
      op <- expr[[1]]
      
      # Handle comparison operators: show full expression like "var>10"
      # For expressions like df$var > 10, the operator is >, not $
      if (identical(op, quote(`>`)) || identical(op, quote(`<`)) || 
          identical(op, quote(`>=`)) || identical(op, quote(`<=`)) || 
          identical(op, quote(`==`)) || identical(op, quote(`!=`))) {
        # Extract variable name from left-hand side
        left_var <- extract_var_name(expr[[2]])
        # Get the operator as string
        op_str <- as.character(op)
        # Get the right-hand side value
        right_val <- deparse(expr[[3]])
        # Construct full expression like "var>10"
        full_expr <- paste0(left_var, op_str, right_val)
        # Truncate if longer than 30 characters
        if (nchar(full_expr) > 30) {
          full_expr <- paste0(substr(full_expr, 1, 27), "...")
        }
        return(full_expr)
      }
      
      # Handle df$var
      if (identical(op, quote(`$`)) || identical(op, as.name("$"))) {
        var_name <- as.character(expr[[3]])
      }
      # Handle df[["var"]] or df[, "var"] or df[, i]
      else if (identical(op, quote(`[`)) || identical(op, as.name("["))) {
        if (length(expr) >= 3) {
          col_expr <- expr[[3]]
          # Handle df[["var"]] - double bracket with character
          if (is.character(col_expr) && length(col_expr) == 1) {
            var_name <- col_expr
          }
          # Handle df[, "var"] - single bracket with character column name
          else if (is.call(col_expr) && identical(col_expr[[1]], quote(`[`)) && 
                   length(col_expr) >= 2 && is.character(col_expr[[2]])) {
            var_name <- col_expr[[2]]
          }
          # Handle df[, i] where i is a name or number
          else if (is.name(col_expr)) {
            # Try to evaluate to see if it's a character
            tryCatch({
              val <- eval(col_expr, parent.frame())
              if (is.character(val) && length(val) == 1) {
                var_name <- val
              }
            }, error = function(e) {})
          }
        }
      }
    }
    return(var_name)
  }
  
  # TASK 4: Extract variable names from dataframe column references
  # Initialize var_names
  var_names <- character(length(dots))
  
  # Enhance if we have 1, 2 or 3 dimensions matching the number of arguments
  n_dims <- length(dim(result))
  if ((n_dims == 1 && length(dots) == 1) || 
      (n_dims == 2 && length(dots) == 2) || 
      (n_dims == 3 && length(dots) == 3)) {
    
    # Try to extract variable names from expressions
    for (i in 1:n_dims) {
      expr <- dot_expressions[[i]]
      var_names[i] <- extract_var_name(expr)
    }
    
    # TASK 5 & 6: FORMAT HEADERS - Set variable names and labels
    # If we found variable names, enhance the dimnames
    if (any(nchar(var_names) > 0)) {
      dimn <- dimnames(result)
      
      # TASK 5: FORMAT HEADERS - Set variable names as dimension names
      # These appear in margins when printed (column var on top, row var on left)
      # For 3D tables, the third dimension name appears in the slice headers
      for (i in 1:n_dims) {
        if (nchar(var_names[i]) > 0) {
          names(dimn)[i] <- var_names[i]
        }
      }
      
      # TASK 6: FORMAT HEADERS - Keep row and column labels as just the values
      # Row labels: dimn[[1]] contains just values (e.g., "A", "B", "C")
      # Column labels: dimn[[2]] contains just values (e.g., "1", "2", "3")
      # Third dimension labels: dimn[[3]] contains just values (e.g., "high", "low")
      # The variable names will appear in the margins via names(dimnames)
      dimnames(result) <- dimn
      
      # Update freq to have the enhanced dimnames
      freq <- result
    }
  }
  
  # TASK 7: Perform chi-square test if requested
  # Initialize chi_test_attr to NULL
  chi_test_attr <- NULL
  
  if (isTRUE(chi)) {
    # Chi-square test can be performed on 1D or 2D frequency tables
    # For 1D: tests for given probabilities (goodness of fit)
    # For 2D: tests for independence
    n_dims_chi <- length(dim(result))
    if (n_dims_chi == 1 || n_dims_chi == 2) {
      # Perform chi-square test on the frequency table
      # Capture warning about small expected counts
      chi_warning <- NULL
      chi_test <- tryCatch({
        withCallingHandlers(
          stats::chisq.test(result, correct = correct),
          warning = function(w) {
            if (grepl("Chi-squared approximation may be incorrect", w$message)) {
              chi_warning <<- TRUE
              invokeRestart("muffleWarning")
            }
          }
        )
      }, error = function(e) {
        # If chi-square test fails (e.g., all zeros, insufficient data), return NULL
        NULL
      })
      # Store chi-square test result and warning flag
      attr(result, "chi_test") <- chi_test
      attr(chi_test, "low_expected") <- isTRUE(chi_warning)
      chi_test_attr <- chi_test
      chisq <- chi_test
    } else {
      # For 3D tables, chi-square test could be performed on each slice
      # For now, we'll skip it or perform on the first slice
      # Store NULL to indicate chi-square test not performed
      attr(result, "chi_test") <- NULL
      chi_test_attr <- NULL
    }
  }
  
  # TASK 8: Convert to proportions if requested
  # Initialize orig_freq to NULL (will be set if prop is requested)
  orig_freq <- NULL
  
  if (!is.null(prop)) {
    # Mark this as a proportion table
    attr(result, "is_proportion") <- TRUE
    attr(result, "proportion_digits") <- digits
    # Convert character values to numeric
    if (is.character(prop)) {
      prop_lower <- tolower(prop)
      if (prop_lower == "all") {
        prop <- 0
      } else if (prop_lower %in% c("row", "rows")) {
        prop <- 1
      } else if (prop_lower %in% c("col", "cols", "column", "columns")) {
        prop <- 2
      } else {
        stop("table2(): prop must be 0, 1, 2, 'all', 'row'/'rows', or 'col'/'cols'/'column'/'columns'", call. = FALSE)
      }
    }
    
    if (!prop %in% c(0, 1, 2)) {
      stop("table2(): prop must be 0, 1, 2, 'all', 'row', or 'column'", call. = FALSE)
    }
    
    # Get original dimnames before modification
    orig_dimn <- dimnames(result)
    n_dims_orig <- length(dim(result))
    
    # Save original frequency table before any modifications (for all prop types)
    orig_freq <- result
    
    # Track if original was 1D
    was_1d <- (n_dims_orig == 1)
    
    # Handle 1D tables: keep as 1D, just calculate proportions
    if (was_1d) {
      # For 1D tables, calculate proportions and return early
      total_sum <- sum(result, na.rm = TRUE)
      prop_result <- result / total_sum
      prop_result <- round(prop_result, digits = digits)
      
      # Keep the same dimnames as the frequency table
      dimnames(prop_result) <- dimnames(result)
      class(prop_result) <- "table"
      
      # Store proportion table
      prop_out <- prop_result
      
      # Build output list
      output <- list2(freq = freq, prop = prop_out, chisq = chisq)
      
      # Store proportion metadata
      attr(output, "prop_type") <- prop
      attr(output, "proportion_digits") <- digits
      var1_name <- if (!is.null(names(orig_dimn)) && length(names(orig_dimn)) >= 1 && 
                       !is.na(names(orig_dimn)[1]) && nchar(names(orig_dimn)[1]) > 0) {
        names(orig_dimn)[1]
      } else {
        ""
      }
      attr(output, "var1_name") <- var1_name
      attr(output, "var2_name") <- ""
      
      class(output) <- c("table2", class(output))
      return(output)
    }
    
    # Handle 3D tables: calculate proportions within each slice
    if (n_dims_orig == 3) {
      # For 3D tables, calculate proportions within each 2D slice
      prop_result <- result
      n_slices <- dim(result)[3]
      
      for (k in seq_len(n_slices)) {
        slice <- result[, , k]
        
        if (prop == 0) {
          # Overall proportions within slice
          slice_sum <- sum(slice, na.rm = TRUE)
          if (slice_sum > 0) {
            prop_result[, , k] <- slice / slice_sum
          }
        } else if (prop == 1) {
          # Row proportions within slice
          row_sums <- rowSums(slice, na.rm = TRUE)
          row_sums[row_sums == 0] <- 1
          prop_result[, , k] <- slice / row_sums
        } else if (prop == 2) {
          # Column proportions within slice
          col_sums <- colSums(slice, na.rm = TRUE)
          col_sums[col_sums == 0] <- 1
          prop_result[, , k] <- sweep(slice, 2, col_sums, "/")
        }
      }
      
      prop_result <- round(prop_result, digits = digits)
      
      # Keep the same dimnames as the frequency table
      dimnames(prop_result) <- dimnames(result)
      class(prop_result) <- "table"
      
      # Store proportion table
      prop_out <- prop_result
      
      # Get variable names
      var1_name <- if (!is.null(names(orig_dimn)) && length(names(orig_dimn)) >= 1 && 
                       !is.na(names(orig_dimn)[1]) && nchar(names(orig_dimn)[1]) > 0) {
        names(orig_dimn)[1]
      } else {
        ""
      }
      var2_name <- if (!is.null(names(orig_dimn)) && length(names(orig_dimn)) >= 2 && 
                       !is.na(names(orig_dimn)[2]) && nchar(names(orig_dimn)[2]) > 0) {
        names(orig_dimn)[2]
      } else {
        ""
      }
      
      # Build output list
      output <- list2(freq = freq, prop = prop_out, chisq = chisq)
      
      # Store proportion metadata
      attr(output, "prop_type") <- prop
      attr(output, "proportion_digits") <- digits
      attr(output, "var1_name") <- var1_name
      attr(output, "var2_name") <- var2_name
      
      class(output) <- c("table2", class(output))
      return(output)
    }
    
    # Get variable names from dimnames for cat messages (for 2D tables)
    var1_name <- if (length(dim(result)) == 2 && !is.null(names(orig_dimn)) && length(names(orig_dimn)) >= 1 && !is.na(names(orig_dimn)[1]) && nchar(names(orig_dimn)[1]) > 0) {
      names(orig_dimn)[1]
    } else {
      ""
    }
    var2_name <- if (length(dim(result)) == 2 && !is.null(names(orig_dimn)) && length(names(orig_dimn)) >= 2 && !is.na(names(orig_dimn)[2]) && nchar(names(orig_dimn)[2]) > 0) {
      names(orig_dimn)[2]
    } else {
      ""
    }
    
    if (prop == 0) {
      # Overall proportions: divide by sum of all cells
      total_sum <- sum(result, na.rm = TRUE)
      
      result <- result / total_sum
      # Round to specified number of digits
      result <- round(result, digits = digits)
      prop_type <- "overall proportions"
      
      # Add row and column with marginal proportions, 100% only in bottom right
      # Skip margins for 1D tables (they would just duplicate the single column)
      if (length(dim(result)) == 2 && !was_1d) {
        n_rows <- nrow(result)
        n_cols <- ncol(result)
        dimn <- dimnames(result)
        # Preserve names of dimnames
        dimn_names <- names(dimn)
        
        # Calculate column totals from original frequency table, then convert to proportions
        col_totals_freq <- colSums(orig_freq, na.rm = TRUE)
        col_totals <- col_totals_freq / total_sum
        col_totals <- round(col_totals, digits = digits)
        
        # TASK 8: Add summary row with column marginal proportions
        summary_row <- matrix(col_totals, nrow = 1, ncol = n_cols)
        result <- rbind(result, summary_row)
        dimn[[1]] <- c(dimn[[1]], "Total")  # Add "Total" to row labels
        
        # Calculate row totals from original frequency table, then convert to proportions
        row_totals_freq <- rowSums(orig_freq, na.rm = TRUE)
        row_totals <- row_totals_freq / total_sum
        row_totals <- round(row_totals, digits = digits)
        
        # TASK 8: Add summary column with row marginal proportions
        summary_col <- matrix(c(row_totals, round(1.0, digits = digits)), nrow = n_rows + 1, ncol = 1)
        # Bottom right corner is 1.0 - sum of all proportions
        result <- cbind(result, summary_col)
        dimn[[2]] <- c(dimn[[2]], "Total")  # Add "Total" to column labels
        # Restore names of dimnames
        names(dimn) <- dimn_names
        dimnames(result) <- dimn
      }
      
    } else if (prop == 1) {
      # Row proportions: each row sums to 1
      row_sums <- rowSums(result, na.rm = TRUE)
      # Avoid division by zero
      row_sums[row_sums == 0] <- 1
      result <- result / row_sums
      # Round to specified number of digits
      result <- round(result, digits = digits)
      prop_type <- "row proportions"
      
      # TASK 8: Add column with 1.0 for each row
      # Skip for 1D tables (would just duplicate the single column)
      if (length(dim(result)) == 2 && !was_1d) {
        n_rows <- nrow(result)
        dimn <- dimnames(result)
        # Preserve names of dimnames
        dimn_names <- names(dimn)
        summary_col <- matrix(round(1.0, digits = digits), nrow = n_rows, ncol = 1)  # 1.0 for each row
        result <- cbind(result, summary_col)
        dimn[[2]] <- c(dimn[[2]], "Total")  # Add "Total" to column labels
        # Restore names of dimnames
        names(dimn) <- dimn_names
        dimnames(result) <- dimn
      }
      
    } else if (prop == 2) {
      # Column proportions: each column sums to 1
      col_sums <- colSums(result, na.rm = TRUE)
      # Avoid division by zero
      col_sums[col_sums == 0] <- 1
      result <- sweep(result, 2, col_sums, "/")
      # Round to specified number of digits
      result <- round(result, digits = digits)
      prop_type <- "column proportions"
      
      # TASK 8: Add row with 1.0 for each column
      if (length(dim(result)) == 2) {
        n_cols <- ncol(result)
        dimn <- dimnames(result)
        # Preserve names of dimnames
        dimn_names <- names(dimn)
        summary_row <- matrix(round(1.0, digits = digits), nrow = 1, ncol = n_cols)  # 1.0 for each column
        result <- rbind(result, summary_row)
        dimn[[1]] <- c(dimn[[1]], "Total")  # Add "Total" to row labels
        # Restore names of dimnames
        names(dimn) <- dimn_names
        dimnames(result) <- dimn
      }
    }
    
    # Re-set attributes after all modifications (rbind/cbind may have created new objects)
    if (!is.null(prop)) {
      attr(result, "is_proportion") <- TRUE
      attr(result, "proportion_digits") <- digits
      attr(result, "original_frequency") <- orig_freq
      attr(result, "prop_type") <- prop
      attr(result, "var1_name") <- var1_name
      attr(result, "var2_name") <- var2_name
      # Restore chi_test attribute if it was saved before prop calculations
      if (!is.null(chi_test_attr)) {
        attr(result, "chi_test") <- chi_test_attr
      }
      # Store proportion table as plain table (no custom attributes)
      # Keep only dim, dimnames, class
      prop_out <- result
      attr(prop_out, "is_proportion") <- NULL
      attr(prop_out, "proportion_digits") <- NULL
      attr(prop_out, "original_frequency") <- NULL
      attr(prop_out, "prop_type") <- NULL
      attr(prop_out, "var1_name") <- NULL
      attr(prop_out, "var2_name") <- NULL
      attr(prop_out, "chi_test") <- NULL
      class(prop_out) <- "table"
    }
  }
  
  # TASK 9: Return as list with class table2
  output <- list2(freq = freq, prop = prop_out, chisq = chisq)
  
  # Store proportion metadata if prop was specified
  if (!is.null(prop)) {
    attr(output, "prop_type") <- prop
    attr(output, "proportion_digits") <- digits
    attr(output, "var1_name") <- var1_name
    attr(output, "var2_name") <- var2_name
  }
  
  # Store missing data information and variable names
  attr(output, "var_lengths") <- var_lengths
  attr(output, "var_na_counts") <- var_na_counts
  attr(output, "var_names") <- var_names
  
  class(output) <- c("table2", class(output))
  return(output)
}

#' Print method for table2 objects
#'
#' @param x An object of class \code{table2}
#' @param ... Additional arguments (ignored)
#'
#' @return Invisibly returns the original object
#' @export
print.table2 <- function(x, ...) {
  # Print the frequency table
  if (!is.null(x$freq)) {
    print(x$freq)
  }
  
  # Print proportion table if present
  if (!is.null(x$prop)) {
    cat("\n")
    prop_type <- attr(x, "prop_type")
    if (!is.null(prop_type)) {
      if (prop_type == "all" || prop_type == 0) {
        cat("Overall proportions:\n")
      } else if (prop_type == "row" || prop_type == 1 || prop_type == "rows") {
        cat("Row proportions:\n")
      } else if (prop_type == "col" || prop_type == 2 || 
                 prop_type == "cols" || prop_type == "column" || prop_type == "columns") {
        cat("Column proportions:\n")
      }
    }
    # Format and print proportion values with the specified number of digits
    digits <- attr(x, "proportion_digits")
    if (is.null(digits)) digits <- 3  # Default
    
    # Create a formatted version of the prop table with fixed decimal places
    prop_table <- x$prop
    
    # Convert to character matrix with fixed decimal places using formatC
    prop_values <- as.numeric(prop_table)
    prop_formatted <- formatC(prop_values, format = "f", digits = digits)
    
    # Restore dimensions and dimnames
    dim(prop_formatted) <- dim(prop_table)
    dimnames(prop_formatted) <- dimnames(prop_table)
    
    # Print as a formatted table
    print(noquote(prop_formatted))
  }
  
  # Print chi-square test if present
  if (!is.null(x$chisq)) {
    cat("\n")
    print(x$chisq)
  }
  
  # Print missing data notes (mirroring t.test2 style)
  var_na_counts <- attr(x, "var_na_counts")
  var_lengths <- attr(x, "var_lengths")
  var_names <- attr(x, "var_names")
  
  if (!is.null(var_na_counts) && !is.null(var_lengths) && !is.null(var_names)) {
    # Check if any variables have missing data
    has_missing <- any(var_na_counts > 0, na.rm = TRUE)
    
    if (has_missing) {
      cat("\n")
      missing_msgs <- character(0)
      for (i in seq_along(var_na_counts)) {
        if (!is.na(var_na_counts[i]) && var_na_counts[i] > 0) {
          var_name <- if (i <= length(var_names) && !is.na(var_names[i]) && var_names[i] != "") {
            paste0("'", var_names[i], "'")
          } else {
            paste0("variable ", i)
          }
          missing_msgs <- c(missing_msgs, 
                           paste0(var_name, " is missing ", var_na_counts[i], 
                                 " of ", var_lengths[i], " values"))
        }
      }
      if (length(missing_msgs) > 0) {
        cat("note:", paste(missing_msgs, collapse = ", while "), "\n")
      }
    }
  }
  
  invisible(x)
}