R/crosstable.data.frame.R

Defines functions crosstable.data.frame crosstable_finalize_output crosstable_empty_output crosstable_loop_over_deps crosstable_process_dep crosstable_finalize_variable_output crosstable_merge_summaries crosstable_calculate_proportions crosstable_calculate_means crosstable_prepare_data crosstable_add_totals crosstable_validate_columns crosstable

crosstable <- function(data,
                       ...) {
  UseMethod("crosstable")
}


# Helper function: Validate column names
crosstable_validate_columns <- function(data, dep, indep) {
  invalid_deps <- dep[!dep %in% colnames(data)]
  if (length(invalid_deps) > 0) {
    cli::cli_abort("Column{?s} {.var {invalid_deps}} {?doesn't/don't} exist.")
  }
  invalid_indeps <- indep[!indep %in% colnames(data)]
  if (length(invalid_indeps) > 0) {
    cli::cli_abort("Column{?s} {.var {invalid_indeps}} {?doesn't/don't} exist.")
  }
}

# Helper function: Add totals for independent variables
crosstable_add_totals <- function(data, indep, translations) {
  for (indep_var in indep) {
    data_duplicate <- data
    data_duplicate[[indep_var]] <- forcats::fct_na_value_to_level(data_duplicate[[indep_var]], level = translations$by_total)
    levels(data_duplicate[[indep_var]]) <- rep(translations$by_total, length(levels(data_duplicate[[indep_var]])))

    if (is.ordered(data[[indep_var]])) {
      levels(data_duplicate[[indep_var]]) <- c(levels(data[[indep_var]]), levels(data_duplicate[[indep_var]]))
    }

    data <- dplyr::bind_rows(data, data_duplicate)
    for (i in seq_len(ncol(data))) {
      attr(data[[i]], "label") <- attr(data_duplicate[[i]], "label")
    }
  }
  return(data)
}


# Helper function: Prepare data for processing
crosstable_prepare_data <- function(data, dep_var, indep, showNA) {
  out <- data
  names(out)[names(out) == dep_var] <- ".category"
  col <- out$.category

  if (!is.factor(col) && dplyr::n_distinct(col, na.rm = FALSE) <= 10) {
    out$.category <- factor(col)
    col <- out$.category

  }

  if (showNA == "always" || (showNA == "ifany" && any(is.na(col)))) {
    out$.category <- forcats::fct_na_value_to_level(f = col, level = "NA")
  } else {
    out <- out[!is.na(out$.category), , drop = FALSE]
  }

  for (indep_var in indep) {
    indep_col <- out[[indep_var]]
    if (showNA == "always" || (showNA == "ifany" && any(is.na(indep_col)))) {
      out[[indep_var]] <- forcats::fct_na_value_to_level(f = indep_col, level = "NA")
    } else {
      out <- vctrs::vec_slice(out, !is.na(out[[indep_var]]))
    }
  }
  return(out)
}

# Helper function: Calculate mean values
crosstable_calculate_means <- function(data, indep) {
  data$.mean <- suppressWarnings(as.numeric(data$.category))
  tryCatch(
    stats::aggregate(.mean ~ ., data = data[, c(indep, ".mean"), drop = FALSE], FUN = mean, na.rm = TRUE),
              error = function(e) {
                cols <- c(indep, ".mean")
                data.frame(matrix(NA, ncol = length(cols), dimnames = list(NULL, cols)))
              }
  )
}

# Helper function: Calculate proportions
crosstable_calculate_proportions <- function(data, dep_var, fct_lvls, indep, showNA) {

  data$.count <- 1L
  summary_prop <- tryCatch(
    stats::aggregate(x = data$.count, by = data[, c(indep, ".category"), drop = FALSE], FUN = length, simplify = TRUE),
              error = function(e) {
                cols <- c(indep, ".category")
                data.frame(matrix(NA, ncol = length(cols), dimnames = list(NULL, cols)))
              }
  )
  names(summary_prop)[ncol(summary_prop)] <- ".count"


  # Summaries per dep variable (e.g. b_1, b_2)
  summary_prop[[".count_per_dep"]] <- sum(summary_prop$.count, na.rm = TRUE)

#browser()
  # Summaries per indep group (e.g. males, females)
  grouped_count <- tryCatch(
                 stats::aggregate(
                                x = summary_prop$.count,
                                by = summary_prop[, indep, drop = FALSE],
                                FUN = sum, na.rm = TRUE, simplify = TRUE
                                ),
                 error = function(e) {
                    data.frame(matrix(NA, ncol = max(c(1, length(indep))), dimnames = list(NULL, indep)))
              }
            )
  names(grouped_count)[ncol(grouped_count)] <- ".count_per_indep_group"
  summary_prop <- merge(summary_prop, grouped_count, by = indep)
  summary_prop$.proportion <- summary_prop$.count / summary_prop[[".count_per_indep_group"]]


  summary_prop$.category <- factor(
   x = summary_prop$.category,
   levels = fct_lvls,
   labels = fct_lvls,
    exclude = character()
  )
  summary_prop$.count_se <- NA_real_
  summary_prop$.count_per_dep_se <- NA_real_
  summary_prop$.count_per_indep_group_se <- NA_real_
  summary_prop$.proportion_se <- NA_real_
  summary_prop$.mean_se <- NA_real_
  return(summary_prop)
}

# Helper function: Merge summaries
crosstable_merge_summaries <- function(summary_prop, summary_mean, indep) {
  if (length(indep) > 0) {
    common_cols <- intersect(colnames(summary_prop), colnames(summary_mean))
    if (length(common_cols) == 0) cli::cli_abort("Internal error in `crosstable_merge_summaries`")
    dplyr::left_join(summary_prop, summary_mean, by = common_cols)
  } else {
    cbind(summary_prop, summary_mean)
  }

}

# Helper function: Finalize variable output
crosstable_finalize_variable_output <- function(out, dep_var, data) {
    
  out$.variable_name <- dep_var
  out$.variable_position <- match(dep_var, colnames(data))
  out$.variable_label <- unname(get_raw_labels(data = data, col_pos = dep_var))

  return(out)
}


# Helper function: Process a single dependent variable
crosstable_process_dep <- function(data, dep_var, indep, showNA, call) {

  out <- crosstable_prepare_data(data, dep_var, indep, showNA)
  if (nrow(out) == 0) return(crosstable_empty_output(dep_var, indep, data = data))
 
  fct_lvls <- if (is.factor(out[[".category"]])) levels(out[[".category"]]) else sort(unique(out[[".category"]]))
  
  #out <- out[rlang::inject(order(!!!out[, c(indep, ".category"), drop = FALSE])), , drop = FALSE]
  summary_mean <- crosstable_calculate_means(out, indep)
  summary_prop <- crosstable_calculate_proportions(out, dep_var, fct_lvls, indep, showNA)

  merged_output <- crosstable_merge_summaries(summary_prop, summary_mean, indep)
  crosstable_finalize_variable_output(merged_output, dep_var, data)
}


# Helper function: Generate crosstables for dependent variables
crosstable_loop_over_deps <- function(data, dep_cols, indep, showNA, call) {
  lapply(stats::setNames(dep_cols, dep_cols), function(dep_var) {
    crosstable_process_dep(data, dep_var, indep, showNA, call)
  })
}

# Helper function: Return empty output
crosstable_empty_output <- function(dep_var, indep, data) {
  out <- data.frame(
    .variable_name = dep_var,
    .variable_position = match(dep_var, colnames(data)),
    .variable_label = unname(get_raw_labels(data = data, col_pos = dep_var)),
    .category = factor(NA),
    .count = NA_integer_,
    .count_se = NA_real_,
    .count_per_dep = NA_integer_,
    .count_per_dep_se = NA_real_,
    .count_per_indep_group = NA_integer_,
    .count_per_indep_group_se = NA_real_,
    .proportion = NA_real_,
    .proportion_se = NA_real_,
    .mean = NA_real_,
    .mean_se = NA_real_
  )
  out[, indep] <- NA_character_
  return(out)
}

# Helper function: Finalize entire output
crosstable_finalize_output <- function(output, indep, indep_labels, dep_cols) {
  result <- do.call(rbind, output)
  for (indep_var in indep) {
    attr(result[[indep_var]], "label") <- indep_labels[[indep_var]]
  }

  result <-
      result[
        , 
        c(
          ".variable_name", 
          ".variable_position",
          ".variable_label",
          ".category",
          ".count", ".count_se",
          ".count_per_dep",
          ".count_per_dep_se",
          ".count_per_indep_group",
          ".count_per_indep_group_se",
          ".proportion", ".proportion_se",
          ".mean", ".mean_se",
          indep
        ), drop = FALSE]
  result <- dplyr::arrange(result, dplyr::pick(tidyselect::all_of(c(".variable_name", indep, ".category"))))
  rownames(result) <- NULL
  return(result)
}



#' @export
crosstable.data.frame <- function(data,
                                  dep = colnames(data),
                                  indep = NULL,
                                  showNA = eval(formals(makeme)$showNA),
                                  totals = eval(formals(makeme)$totals),
                                  translations = eval(formals(makeme)$translations),
                                  ...,
                                  call = rlang::caller_env()) {
  # Validate inputs
  showNA <- rlang::arg_match(showNA, values = eval(formals(makeme)$showNA), error_call = call)
  
  crosstable_validate_columns(data, dep, indep)

  # Prepare dependent and independent variable labels
  indep_labels <- get_raw_labels(data = data, col_pos = indep)
  dep_cols <- setdiff(dep, indep)
  if (length(dep_cols) == 0) return()

  # Handle totals for independent variables
  if (length(indep) > 0 && isTRUE(totals)) {
    data <- crosstable_add_totals(data, indep, translations)
  }

  # Generate crosstables for each dependent variable
  output <- crosstable_loop_over_deps(data, dep_cols, indep, showNA, call)

  # Combine and arrange results
  result <- crosstable_finalize_output(output, indep, indep_labels, dep_cols)
  return(result)
}


#' @export
crosstable.tbl_df <- crosstable.data.frame

Try the saros package in your browser

Any scripts or data that you put into this service are public.

saros documentation built on June 8, 2025, 10:43 a.m.