In pstraforelli/tidytests: A tidy implementation of statistical testing

# Default knitr chunk options for every code chunk in this document.
knitr::opts_chunk$set(
  fig.path  = "man/figures/README-",
  out.width = "100%",
  collapse  = TRUE,
  comment   = "#>"
)

# tidytests

The goal of tidytests is to provide a tidy implementation of the output from t-tests and proportion tests.

## Installation

You can install the development version from GitHub with:

``` {r eval=FALSE}
# remotes supplies install_github(); install it first.
install.packages("remotes")

# Install the development version of tidytests from GitHub.
remotes::install_github("pstraforelli/tidytests")
```

## Example

The package's functions are meant to be used in sequence. First, use one of two functions, `pairwise_t_test()` or `pairwise_prop_test()`. Both take a raw data frame as input, along with the variables to be used for testing.

### `pairwise_t_test()`

```r
library(tidytests)

# Pairwise t-tests of Sepal.Length across the Species groups of iris.
pairwise_t_test(iris, Sepal.Length, Species)
```

### `pairwise_prop_test()`

# Fix the RNG seed so the simulated data below is reproducible.
set.seed(1)

# Simulated 0/1 smoking indicator for three regions of different sizes.
mydf <- data.frame(
  smokers = c(
    rbinom(100, 1, 0.8), # region A
    rbinom(70, 1, 0.6),  # region B
    rbinom(50, 1, 0.3)   # region C
  ),
  region = rep(c("A", "B", "C"), times = c(100, 70, 50))
)

# Pairwise proportion tests of smokers between regions.
test_df <- pairwise_prop_test(mydf, smokers, region)
test_df

# Same test, comparing each subgroup vs rest of the sample.
test_df2 <- pairwise_prop_test(mydf, smokers, region, vs_rest = TRUE)
test_df2

Then filter out the comparisons that you do not want to show, i.e. those that do not meet a threshold of your choice (such as a p-value below 0.05).

library(dplyr, warn.conflicts = FALSE)

# Keep the significant comparisons for smokers only (smokers == 1), and keep
# just the two group columns; the rest of the output is not needed for the
# following steps.
test_df_sig <- test_df |>
  filter(
    p_value < 0.05,
    smokers == 1
  ) |>
  select(higher_group, lower_group)

# Same filtering for the subgroup-vs-rest comparisons.
test_df_sig2 <- test_df2 |>
  filter(
    p_value < 0.05,
    smokers == 1
  ) |>
  select(higher_group, lower_group)

### `prep_sigmark()`

Use `prep_sigmark()` to combine the summary results of your data frame with the output of `pairwise_*_test()`.

# Creating summary output: the proportion of smokers within each region.
# The proportion must be computed per region *before* dropping the
# non-smoker rows; dividing by sum(n) after the filter would instead give
# each region's share of all smokers, which is not the quantity that
# pairwise_prop_test() compares.
mydf_summ <- mydf |>
  count(smokers, region) |>
  group_by(region) |>
  mutate(perc = n / sum(n)) |>
  ungroup() |>
  filter(smokers == 1)
mydf_summ

# Creating colour vector for upcoming chart (one named colour per region)
colour_vec <- c(A = "red", B = "blue", C = "green")

# Pairwise comparisons between regions:
mydf_prep <- prep_sigmark(
  mydf_summ, test_df_sig, region, perc, colour_vec,
  percent = TRUE, vs_rest = FALSE
)
mydf_prep

# With comparisons to rest of sample:
mydf_prep2 <- prep_sigmark(
  mydf_summ, test_df_sig2, region, perc, colour_vec,
  percent = TRUE, vs_rest = TRUE
)
mydf_prep2

### `geom_sigmark()`

Use `geom_sigmark()` as a geom layer in your ggplot2 chart.

library(ggplot2)

# Both charts share the same aesthetic mapping.
bar_mapping <- aes(x = perc, y = region, label = labels, fill = region)

# Pairwise comparisons between regions:
ggplot(mydf_prep, bar_mapping) +
  geom_col() +
  geom_sigmark("fill", colour_vec, hjust = 0) +
  xlim(0, 1)

# Comparing each subgroup to rest of sample:
ggplot(mydf_prep2, bar_mapping) +
  geom_col() +
  geom_sigmark(hjust = 0) +
  scale_fill_manual(values = colour_vec) +
  xlim(0, 1)