R/run_statdecide.R

Defines functions run_statdecide

Documented in run_statdecide

#' @importFrom utils globalVariables
utils::globalVariables(c("group", "max_val", "angle", "cld_var", "grouping", "mean_val"))

#' @title Example Data for Non-parametric test
#' @description An example dataset of pollen collection by honeybee at different times and different months.
#' @usage df_nonparam
#' @export
df_nonparam <- read.table(text = "Month	Time	Pollen
1	08:00	29
1	10:00	41.33
1	12:00	17.33
1	14:00	1
1	16:00	7
1	18:00	4
1	08:00	1.66
1	10:00	17.66
1	12:00	22
1	14:00	24.66
1	16:00	17
1	18:00	8
1	08:00	3
1	10:00	39
1	12:00	28.33
1	14:00	18.33
1	16:00	32
1	18:00	26.33
1	08:00	6
1	10:00	57.33
1	12:00	57
1	14:00	30.33
1	16:00	1.66
1	18:00	15.33
2	08:00	10.33
2	10:00	73
2	12:00	16.33
2	14:00	3.33
2	16:00	2
2	18:00	1.66
2	08:00	0.66
2	10:00	55.33
2	12:00	46.33
2	14:00	5.33
2	16:00	1.66
2	18:00	5.66
2	08:00	14.66
2	10:00	55.66
2	12:00	18.66
2	14:00	13.33
2	16:00	21
2	18:00	3.33
2	08:00	36.66
2	10:00	58.66
2	12:00	25.66
2	14:00	12.33
2	16:00	1
2	18:00	2.33
3	08:00	28.33
3	10:00	85
3	12:00	43.33
3	14:00	36.66
3	16:00	20.66
3	18:00	2.66
3	08:00	31.66
3	10:00	50.33
3	12:00	10.66
3	14:00	7.33
3	16:00	13
3	18:00	5.33
3	08:00	82.66
3	10:00	65
3	12:00	26
3	14:00	12
3	16:00	12.33
3	18:00	1.33
3	08:00	43.33
3	10:00	107.33
3	12:00	12.66
3	14:00	15.66
3	16:00	3
3	18:00	4.33
3	08:00	49
3	10:00	73.33
3	12:00	8.66
3	14:00	12
3	16:00	9
3	18:00	1
4	08:00	76.66
4	10:00	67.33
4	12:00	20.33
4	14:00	17
4	16:00	10.33
4	18:00	3.66
4	08:00	53.33
4	10:00	49
4	12:00	21.33
4	14:00	20.33
4	16:00	8.66
4	18:00	2.33
5	08:00	47.33
5	10:00	34.33
5	12:00	9.66
5	14:00	3
5	16:00	4.33
5	18:00	12.66
5	08:00	35.33
5	10:00	27.66
5	12:00	13
5	14:00	17
5	16:00	1.33
5	18:00	3.66
5	08:00	23.66
5	10:00	58.66
5	12:00	25.66
5	14:00	13
5	16:00	2.66
5	18:00	0.66
6	08:00	27.66
6	10:00	60.33
6	12:00	29.66
6	14:00	21.66
6	16:00	18.33
6	18:00	17
8	08:00	36
8	10:00	32.33
8	12:00	20.33
8	14:00	11.66
8	16:00	12.33
8	18:00	1
10	08:00	11.33
10	10:00	47.33
10	12:00	10.66
10	14:00	30
10	16:00	9.66
10	18:00	17.66
10	08:00	44
10	10:00	75.33
10	12:00	15.66
10	14:00	12
10	16:00	7
10	18:00	9.33", header = TRUE, row.names = NULL)

#' Run Statistical Decision Workflow
#'
#' Automatically checks normality, selects appropriate test (ANOVA or Kruskal-Wallis), performs post-hoc, and visualizes results with compact letter display (CLD). Returns all results as an object with optional console output.
#'
#' @param data A data frame.
#' @param dep_var Character. Name of the dependent variable.
#' @param group_vars Character vector. One or two grouping variables.
#' @param cld_offset Numeric. Vertical offset to place CLD labels above the boxplot (default: 5).
#' @param verbose Logical. Whether to print progress messages and results (default: TRUE).
#' @import agricolae ggplot2 dplyr effectsize stringr
#' @importFrom stats kruskal.test setNames aov as.formula shapiro.test
#'
#' @return A list containing:
#' \describe{
#'   \item{normality_test}{Results of Shapiro-Wilk test}
#'   \item{main_effects}{Results for each main effect}
#'   \item{interaction}{Interaction results (if 2 group_vars)}
#'   \item{plots}{List of ggplot objects}
#'   \item{facet_plot}{Faceted ggplot (if 2 group_vars)}
#'   \item{heatmap}{Heatmap ggplot (if 2 group_vars)}
#' }
#' @export
#' @examples
#' # Silent operation
#' results <- run_statdecide(data = df_nonparam, dep_var = "Pollen",
#'                          group_vars = c("Month","Time"), verbose = FALSE)
#'
#' # With console output
#' run_statdecide(data = df_nonparam, dep_var = "Pollen", group_vars = "Month")
run_statdecide <- function(data, dep_var, group_vars, cld_offset = 5, verbose = TRUE) {
  stopifnot(length(group_vars) %in% c(1, 2))

  # Initialize results object
  results <- list(
    normality_test = NULL,
    main_effects = list(),
    interaction = NULL,
    plots = list(),
    facet_plot = NULL,
    heatmap = NULL
  )

  if (verbose) message("\n===== Step 1: Normality Test =====")
  shapiro <- shapiro.test(data[[dep_var]])
  normal <- shapiro$p.value > 0.05

  # Store results
  results$normality_test <- list(
    test = "Shapiro-Wilk",
    p.value = shapiro$p.value,
    is.normal = normal
  )

  if (verbose) {
    message("Shapiro-Wilk p-value: ", signif(shapiro$p.value, 4))
    message("Normality? ", ifelse(normal, "Yes (parametric)", "No (non-parametric)"))
  }

  plot_with_cld <- function(data, dep_var, group_var, normal = TRUE, cld_offset = 5, verbose = TRUE) {
    analysis_results <- list(
      test = ifelse(normal, "ANOVA", "Kruskal-Wallis"),
      p.value = NA,
      posthoc = NULL,
      effect_size = NULL,
      plot = NULL
    )

    data$grouping <- factor(data[[group_var]])

    if (verbose) message("\n\n=============================\n",
                         "Analysis for: ", group_var, "\n",
                         "=============================\n")

    if (normal) {
      model <- aov(as.formula(paste(dep_var, "~ grouping")), data = data)
      anova_summary <- summary(model)
      p_val <- anova_summary[[1]][["Pr(>F)"]][1]
      analysis_results$p.value <- p_val

      if (verbose) {
        message("\nANOVA Results:")
        print(anova_summary)
        message("Significance: ", .get_sig_label(p_val))
      }

      if (!is.na(p_val) && p_val < 0.05) {
        tukey <- agricolae::HSD.test(model, "grouping", group = TRUE)
        cld <- tukey$groups
        cld$group <- gsub(" ", "", cld$groups)
        cld[[group_var]] <- rownames(cld)

        analysis_results$posthoc <- list(
          method = "Tukey HSD",
          results = cld[, c(group_var, "means", "group")]
        )
        analysis_results$effect_size <- effectsize::eta_squared(model)

        if (verbose) {
          message("\nTukey HSD Post-hoc Results with CLD:")
          print(analysis_results$posthoc$results)
          message("\nEffect Size (eta^2):")
          print(analysis_results$effect_size)
        }
      }
    } else {
      kruskal_test <- kruskal.test(as.formula(paste(dep_var, "~ grouping")), data = data)
      p_val <- kruskal_test$p.value
      analysis_results$p.value <- p_val

      if (verbose) {
        message("\nKruskal-Wallis Results:")
        print(kruskal_test)
        message("Significance: ", .get_sig_label(p_val))
      }

      if (!is.na(p_val) && p_val < 0.05) {
        kruskal <- agricolae::kruskal(data[[dep_var]], data[[group_var]], group = TRUE)
        cld <- kruskal$groups
        cld$group <- gsub(" ", "", cld$groups)
        cld[[group_var]] <- rownames(cld)

        if ("means" %in% names(cld)) names(cld)[names(cld) == "means"] <- "mean_val"

        analysis_results$posthoc <- list(
          method = "Kruskal post-hoc",
          results = cld[, c(group_var, intersect(c("mean_val", "group"), names(cld)))]
        )

        if (verbose) {
          message("\nKruskal Post-hoc Results with CLD:")
          print(analysis_results$posthoc$results)
        }
      }
    }

    if (!is.na(p_val) && p_val < 0.05) {
      means <- data |>
        dplyr::group_by(grouping) |>
        dplyr::summarise(
          mean_val = mean(.data[[dep_var]], na.rm = TRUE),
          max_val = max(.data[[dep_var]], na.rm = TRUE),
          .groups = "drop"
        ) |>
        dplyr::left_join(cld, by = setNames(group_var, "grouping")) |>
        dplyr::mutate(
          angle = ifelse(nchar(group) > 3, 90, 0),
          cld_var = max_val + cld_offset
        )

      p <- ggplot2::ggplot(data, aes(x = grouping, y = .data[[dep_var]], fill = grouping)) +
        ggplot2::geom_boxplot() +
        ggplot2::geom_text(
          data = means,
          aes(x = grouping, y = cld_var, label = group, angle = angle),
          size = 4,
          vjust = 0
        ) +
        ggplot2::theme_bw(base_size = 14) +
        ggplot2::theme(
          panel.grid.minor = element_blank(),
          axis.text.x = element_text(angle = 60, hjust = 1),
          legend.position = "none"
        ) +
        ggplot2::scale_x_discrete(labels = function(x) stringr::str_wrap(x, width = 10)) +
        ggplot2::labs(x = group_var, y = dep_var)

      if (verbose) print(p)
      analysis_results$plot <- p
    } else {
      if (verbose) message("No significant effect for '", group_var, "' (p = ", round(p_val, 4), ").")
    }

    return(analysis_results)
  }

  # Helper function for significance labels
  .get_sig_label <- function(p) {
    if (p < 0.001) return("***")
    else if (p < 0.01) return("**")
    else if (p < 0.05) return("*")
    else return("ns")
  }

  # Run main effect analyses
  for (grp in group_vars) {
    if (verbose) message("\n===== Analyzing: ", grp, " =====")
    analysis <- plot_with_cld(data, dep_var, grp, normal = normal,
                              cld_offset = cld_offset, verbose = verbose)
    results$main_effects[[grp]] <- analysis
    if (!is.null(analysis$plot)) results$plots[[grp]] <- analysis$plot
  }

  # Handle interaction if two grouping variables
  if (length(group_vars) == 2) {
    interaction_var <- "interaction"
    data[[interaction_var]] <- interaction(data[[group_vars[1]]], data[[group_vars[2]]], sep = ":")

    if (verbose) message("\n===== Analyzing Interaction: ", paste(group_vars, collapse = " X "), " =====")
    interaction_results <- plot_with_cld(data, dep_var, interaction_var, normal = normal,
                                         cld_offset = cld_offset, verbose = verbose)
    results$interaction <- interaction_results
    if (!is.null(interaction_results$plot)) {
      results$plots[[interaction_var]] <- interaction_results$plot
    }

    # Faceted boxplot
    results$facet_plot <- ggplot2::ggplot(
      data,
      ggplot2::aes(x = .data[[group_vars[2]]], y = .data[[dep_var]], fill = .data[[group_vars[2]]])
    ) +
      ggplot2::geom_boxplot() +
      ggplot2::facet_wrap(stats::as.formula(paste("~", group_vars[1])), ncol = 4) +
      ggplot2::theme_bw(base_size = 14) +
      ggplot2::theme(panel.grid.minor = element_blank()) +
      ggplot2::labs(
        title = paste("Faceted Boxplot of", dep_var),
        x = group_vars[2], y = dep_var
      )

    if (verbose) print(results$facet_plot)

    # Heatmap
    heatmap_data <- data |>
      dplyr::group_by_at(group_vars) |>
      dplyr::summarise(mean_val = mean(.data[[dep_var]], na.rm = TRUE), .groups = "drop")

    results$heatmap <- ggplot2::ggplot(
      heatmap_data,
      ggplot2::aes(x = .data[[group_vars[2]]], y = .data[[group_vars[1]]], fill = mean_val)
    ) +
      ggplot2::geom_tile(color = "white") +
      ggplot2::scale_fill_viridis_c(option = "C", name = paste("Mean", dep_var)) +
      ggplot2::theme_bw(base_size = 14) +
      ggplot2::theme(panel.grid.minor = element_blank()) +
      ggplot2::labs(
        title = paste("Heatmap of", dep_var),
        x = group_vars[2], y = group_vars[1]
      )

    if (verbose) print(results$heatmap)
  }

  invisible(results)
}

Try the statdecideR package in your browser

Any scripts or data that you put into this service are public.

statdecideR documentation built on June 8, 2025, 11:17 a.m.