In andrewpbray/infer: Tidy Statistical Inference

# Chunk defaults for the rendered document: figure height and width.
knitr::opts_chunk$set(fig.height = 3.5, fig.width = 6)
# Limit printed numbers to 4 significant digits.
options(digits = 4)

Data preparation

library(nycflights13)
library(dplyr)
library(ggplot2)
library(stringr)
library(infer)
# Fix the RNG seed so the random sample drawn below is reproducible.
set.seed(2017)
# Example data: rows without missing values, a random sample of 500 flights,
# two derived two-level labels, and only the columns used in the examples.
fli_small <- flights %>%
  na.omit() %>%
  sample_n(size = 500) %>%
  mutate(
    # Months 10-12 and 1-3 are labelled "winter"; months 4-9 "summer".
    season = case_when(
      month %in% c(1:3, 10:12) ~ "winter",
      month %in% 4:9 ~ "summer"
    ),
    # Hours 1-12 are "morning"; hours 13-24 are "not morning".
    day_hour = case_when(
      between(hour, 1, 12) ~ "morning",
      between(hour, 13, 24) ~ "not morning"
    )
  ) %>%
  select(arr_delay, dep_delay, season, day_hour, origin, carrier)

Two numeric - arr_delay, dep_delay
Two categories
- season ("winter", "summer"),
- day_hour ("morning", "not morning")
Three categories - origin ("EWR", "JFK", "LGA")
Sixteen categories - carrier

Hypothesis tests

One numerical variable (mean)

Observed stat

# Observed statistic: sample mean of dep_delay.
x_bar <- specify(fli_small, response = dep_delay) %>%
  calculate(stat = "mean")
x_bar

# Null distribution of the mean under the point null mu = 10
# (1000 replicates; the resampling type is left for generate() to choose).
null_distn <- specify(fli_small, response = dep_delay) %>%
  hypothesize(null = "point", mu = 10) %>%
  generate(reps = 1000) %>%
  calculate(stat = "mean")

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = x_bar, direction = "two_sided")
# Two-sided p-value of the observed mean.
get_p_value(null_distn, obs_stat = x_bar, direction = "two_sided")

One numerical variable (standardized mean $t$)

Observed stat

# Observed one-sample t statistic for dep_delay, standardized against the
# hypothesized mean mu = 8 so it is on the same scale as the null distribution
# built below. Without the hypothesize() step the statistic would be computed
# against the default mu (0 for stats::t.test), which does not match the null
# being tested.
# NOTE(review): relies on calculate() accepting hypothesize() output directly,
# as the goodness-of-fit example in this document does -- confirm for the
# installed infer version.
( t_bar <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", mu = 8) %>%
  calculate(stat = "t") )

# Null distribution of the t statistic under the point null mu = 8
# (1000 replicates; the resampling type is left for generate() to choose).
null_distn <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", mu = 8) %>%
  generate(reps = 1000) %>%
  calculate(stat = "t")

# Plot the null distribution and shade the two-sided p-value region.
visualize(null_distn) +
  shade_p_value(obs_stat = t_bar, direction = "two_sided")
# Two-sided p-value of the observed t statistic.
null_distn %>%
  get_p_value(obs_stat = t_bar, direction = "two_sided")

One numerical variable (median)

Observed stat

# Observed statistic: sample median of dep_delay.
x_tilde <- specify(fli_small, response = dep_delay) %>%
  calculate(stat = "median")
x_tilde

# Null distribution of the median under the point null med = -1
# (1000 replicates; the resampling type is left for generate() to choose).
null_distn <- specify(fli_small, response = dep_delay) %>%
  hypothesize(null = "point", med = -1) %>%
  generate(reps = 1000) %>%
  calculate(stat = "median")

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = x_tilde, direction = "two_sided")
# Two-sided p-value of the observed median.
get_p_value(null_distn, obs_stat = x_tilde, direction = "two_sided")

One categorical (one proportion)

Observed stat

# Observed statistic: proportion of flights whose day_hour is "morning".
p_hat <- specify(fli_small, response = day_hour, success = "morning") %>%
  calculate(stat = "prop")
p_hat

# Null distribution of the proportion under the point null p = 0.5
# (1000 replicates; the resampling type is left for generate() to choose).
null_distn <- specify(fli_small, response = day_hour, success = "morning") %>%
  hypothesize(null = "point", p = 0.5) %>%
  generate(reps = 1000) %>%
  calculate(stat = "prop")

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = p_hat, direction = "two_sided")
# Two-sided p-value of the observed proportion.
get_p_value(null_distn, obs_stat = p_hat, direction = "two_sided")

Logical variables will be coerced to factors:

# Same one-proportion null distribution, but with a logical response column:
# day_hour_logical is TRUE where day_hour equals "morning", and the success
# level is given as the string "TRUE".
null_distn <- mutate(fli_small, day_hour_logical = day_hour == "morning") %>%
  specify(response = day_hour_logical, success = "TRUE") %>%
  hypothesize(null = "point", p = 0.5) %>%
  generate(reps = 1000) %>%
  calculate(stat = "prop")

One categorical variable (standardized proportion $z$)

Not yet implemented.

Two categorical (2 level) variables

Observed stat

# Observed statistic: difference in the proportion of "morning" flights
# between seasons, with groups taken in the order winter, summer.
d_hat <- specify(fli_small, day_hour ~ season, success = "morning") %>%
  calculate(stat = "diff in props", order = c("winter", "summer"))
d_hat

# Null distribution of the difference under independence of day_hour and season
# (1000 replicates; the resampling type is left for generate() to choose).
null_distn <- specify(fli_small, day_hour ~ season, success = "morning") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000) %>%
  calculate(stat = "diff in props", order = c("winter", "summer"))

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = d_hat, direction = "two_sided")
# Two-sided p-value of the observed difference in proportions.
get_p_value(null_distn, obs_stat = d_hat, direction = "two_sided")

Two categorical (2 level) variables (z)

Standardized observed stat

# Observed standardized statistic (z) for the "morning" proportion by season,
# with groups taken in the order winter, summer.
z_hat <- specify(fli_small, day_hour ~ season, success = "morning") %>%
  calculate(stat = "z", order = c("winter", "summer"))
z_hat

# Null distribution of z under independence of day_hour and season
# (1000 replicates; the resampling type is left for generate() to choose).
null_distn <- specify(fli_small, day_hour ~ season, success = "morning") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000) %>%
  calculate(stat = "z", order = c("winter", "summer"))

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = z_hat, direction = "two_sided")
# Two-sided p-value of the observed z statistic.
get_p_value(null_distn, obs_stat = z_hat, direction = "two_sided")

Note the similarities in this plot and the previous one.

One categorical (>2 level) - GoF

Observed stat

Note the need to add in the hypothesized values here to compute the observed statistic.

# Observed goodness-of-fit Chisq statistic for origin. The hypothesized
# proportions are supplied through hypothesize() before calculate().
Chisq_hat <- specify(fli_small, response = origin) %>%
  hypothesize(
    null = "point",
    p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)
  ) %>%
  calculate(stat = "Chisq")
Chisq_hat

# Null distribution of Chisq: 1000 simulated replicates under the same
# hypothesized proportions.
null_distn <- specify(fli_small, response = origin) %>%
  hypothesize(
    null = "point",
    p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)
  ) %>%
  generate(reps = 1000, type = "simulate") %>%
  calculate(stat = "Chisq")

# Plot the null distribution and shade the upper-tail p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = Chisq_hat, direction = "greater")
# Upper-tail p-value of the observed Chisq statistic.
get_p_value(null_distn, obs_stat = Chisq_hat, direction = "greater")

Two categorical (>2 level) variables

Observed stat

# Observed Chisq statistic for the association between day_hour and origin.
Chisq_hat <- specify(fli_small, formula = day_hour ~ origin) %>%
  calculate(stat = "Chisq")
Chisq_hat

# Null distribution of Chisq under independence, from 1000 permutations.
null_distn <- specify(fli_small, day_hour ~ origin) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")

# Plot the null distribution and shade the upper-tail p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = Chisq_hat, direction = "greater")
# Upper-tail p-value of the observed Chisq statistic.
get_p_value(null_distn, obs_stat = Chisq_hat, direction = "greater")

One numerical variable, one categorical (2 levels) (diff in means)

Observed stat

# Observed statistic: difference in mean dep_delay between seasons,
# with groups taken in the order summer, winter.
d_hat <- specify(fli_small, dep_delay ~ season) %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))
d_hat

# Null distribution of the difference under independence, from 1000 permutations.
null_distn <- specify(fli_small, dep_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = d_hat, direction = "two_sided")
# Two-sided p-value of the observed difference in means.
get_p_value(null_distn, obs_stat = d_hat, direction = "two_sided")

One numerical variable, one categorical (2 levels) (t)

Standardized observed stat

# Observed standardized statistic (t) for dep_delay by season,
# with groups taken in the order summer, winter.
t_hat <- specify(fli_small, dep_delay ~ season) %>%
  calculate(stat = "t", order = c("summer", "winter"))
t_hat

# Null distribution of t under independence, from 1000 permutations.
null_distn <- specify(fli_small, dep_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "t", order = c("summer", "winter"))

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = t_hat, direction = "two_sided")
# Two-sided p-value of the observed t statistic.
get_p_value(null_distn, obs_stat = t_hat, direction = "two_sided")

Note the similarities in this plot and the previous one.

One numerical variable, one categorical (2 levels) (diff in medians)

Observed stat

# Observed statistic: difference in median dep_delay between seasons,
# with groups taken in the order summer, winter.
d_hat <- specify(fli_small, dep_delay ~ season) %>%
  calculate(stat = "diff in medians", order = c("summer", "winter"))
d_hat

# Null distribution of the difference under independence, from 1000 permutations.
# The formula could alternatively be written as
# response = dep_delay, explanatory = season.
null_distn <- specify(fli_small, dep_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in medians", order = c("summer", "winter"))

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = d_hat, direction = "two_sided")
# Two-sided p-value of the observed difference in medians.
get_p_value(null_distn, obs_stat = d_hat, direction = "two_sided")

One numerical, one categorical (>2 levels) - ANOVA

Observed stat

# Observed F statistic for arr_delay across the levels of origin.
F_hat <- specify(fli_small, arr_delay ~ origin) %>%
  calculate(stat = "F")
F_hat

# Null distribution of F under independence, from 1000 permutations.
null_distn <- specify(fli_small, arr_delay ~ origin) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "F")

# Plot the null distribution and shade the upper-tail p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = F_hat, direction = "greater")
# Upper-tail p-value of the observed F statistic.
get_p_value(null_distn, obs_stat = F_hat, direction = "greater")

Two numerical vars - SLR

Observed stat

# Observed statistic: slope for arr_delay as a function of dep_delay.
slope_hat <- specify(fli_small, arr_delay ~ dep_delay) %>%
  calculate(stat = "slope")
slope_hat

# Null distribution of the slope under independence, from 1000 permutations.
null_distn <- specify(fli_small, arr_delay ~ dep_delay) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "slope")

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = slope_hat, direction = "two_sided")
# Two-sided p-value of the observed slope.
get_p_value(null_distn, obs_stat = slope_hat, direction = "two_sided")

Two numerical vars - correlation

Observed stat

# Observed statistic: correlation between arr_delay and dep_delay.
correlation_hat <- specify(fli_small, arr_delay ~ dep_delay) %>%
  calculate(stat = "correlation")
correlation_hat

# Null distribution of the correlation under independence, from 1000 permutations.
null_distn <- specify(fli_small, arr_delay ~ dep_delay) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "correlation")

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = correlation_hat, direction = "two_sided")
# Two-sided p-value of the observed correlation.
get_p_value(null_distn, obs_stat = correlation_hat, direction = "two_sided")

Two numerical vars - SLR (t)

Not currently implemented since $t$ could refer to standardized slope or standardized correlation.

# Standardized observed stat.
# NOTE(review): the accompanying text says stat = "t" is not currently
# implemented for two numerical variables, so this chunk is presumably shown
# for illustration only -- confirm before running.
t_hat <- specify(fli_small, arr_delay ~ dep_delay) %>%
  calculate(stat = "t")
t_hat

# Null distribution of t under independence, from 1000 permutations.
null_distn <- specify(fli_small, arr_delay ~ dep_delay) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "t")

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = t_hat, direction = "two_sided")
# Two-sided p-value of the observed t statistic.
get_p_value(null_distn, obs_stat = t_hat, direction = "two_sided")

Confidence intervals

One numerical (one mean)

Point estimate

# Point estimate: sample mean of arr_delay.
x_bar <- specify(fli_small, response = arr_delay) %>%
  calculate(stat = "mean")
x_bar

# Bootstrap distribution of the mean from 1000 resamples.
boot <- specify(fli_small, response = arr_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")
# Interval from get_ci() with its default settings.
percentile_ci <- boot %>% get_ci()
percentile_ci

# Plot the bootstrap distribution with that interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)
# Standard-error ("se") interval built around the point estimate.
standard_error_ci <- boot %>% get_ci(type = "se", point_estimate = x_bar)
standard_error_ci

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

One numerical (one mean - standardized)

Point estimate

# Point estimate: one-sample t statistic of arr_delay.
t_hat <- specify(fli_small, response = arr_delay) %>%
  calculate(stat = "t")
t_hat

# Bootstrap distribution of t from 1000 resamples.
boot <- specify(fli_small, response = arr_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t")
# Interval from get_ci() with its default settings.
percentile_ci <- boot %>% get_ci()
percentile_ci

# Plot the bootstrap distribution with that interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)
# Standard-error ("se") interval built around the point estimate.
standard_error_ci <- boot %>% get_ci(type = "se", point_estimate = t_hat)
standard_error_ci

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

One categorical (one proportion)

Point estimate

# Point estimate: proportion of flights whose day_hour is "morning".
p_hat <- specify(fli_small, response = day_hour, success = "morning") %>%
  calculate(stat = "prop")
p_hat

# Bootstrap distribution of the proportion from 1000 resamples.
boot <- specify(fli_small, response = day_hour, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "prop")
# Interval from get_ci() with its default settings.
percentile_ci <- boot %>% get_ci()
percentile_ci

# Plot the bootstrap distribution with that interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)
# Standard-error ("se") interval built around the point estimate.
standard_error_ci <- boot %>% get_ci(type = "se", point_estimate = p_hat)
standard_error_ci

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

One categorical variable (standardized proportion $z$)

Not yet implemented.

One numerical variable, one categorical (2 levels) (diff in means)

Point estimate

# Point estimate: difference in mean arr_delay between seasons,
# with groups taken in the order summer, winter.
d_hat <- specify(fli_small, arr_delay ~ season) %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))
d_hat

# Bootstrap distribution of the difference from 1000 resamples.
boot <- specify(fli_small, arr_delay ~ season) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))
# Interval from get_ci() with its default settings.
percentile_ci <- boot %>% get_ci()
percentile_ci

# Plot the bootstrap distribution with that interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)
# Standard-error ("se") interval built around the point estimate.
standard_error_ci <- boot %>% get_ci(type = "se", point_estimate = d_hat)
standard_error_ci

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

One numerical variable, one categorical (2 levels) (t)

Standardized point estimate

# Standardized point estimate: t statistic for arr_delay by season,
# with groups taken in the order summer, winter.
t_hat <- specify(fli_small, arr_delay ~ season) %>%
  calculate(stat = "t", order = c("summer", "winter"))
t_hat

# Bootstrap distribution of t from 1000 resamples.
boot <- specify(fli_small, arr_delay ~ season) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t", order = c("summer", "winter"))
# Interval from get_ci() with its default settings.
percentile_ci <- boot %>% get_ci()
percentile_ci

# Plot the bootstrap distribution with that interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)
# Standard-error ("se") interval built around the point estimate.
standard_error_ci <- boot %>% get_ci(type = "se", point_estimate = t_hat)
standard_error_ci

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

Two categorical variables (diff in proportions)

Point estimate

# Point estimate: difference in the proportion of "morning" flights between
# seasons, with groups taken in the order summer, winter.
d_hat <- specify(fli_small, day_hour ~ season, success = "morning") %>%
  calculate(stat = "diff in props", order = c("summer", "winter"))
d_hat

# Bootstrap distribution of the difference from 1000 resamples.
boot <- specify(fli_small, day_hour ~ season, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "diff in props", order = c("summer", "winter"))
# Interval from get_ci() with its default settings.
percentile_ci <- boot %>% get_ci()
percentile_ci

# Plot the bootstrap distribution with that interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)
# Standard-error ("se") interval built around the point estimate.
standard_error_ci <- boot %>% get_ci(type = "se", point_estimate = d_hat)
standard_error_ci

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

Two categorical variables (z)

Standardized point estimate

# Standardized point estimate: z statistic for the "morning" proportion by
# season, with groups taken in the order summer, winter.
z_hat <- specify(fli_small, day_hour ~ season, success = "morning") %>%
  calculate(stat = "z", order = c("summer", "winter"))
z_hat

# Bootstrap distribution of z from 1000 resamples.
boot <- specify(fli_small, day_hour ~ season, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "z", order = c("summer", "winter"))
# Interval from get_ci() with its default settings.
percentile_ci <- boot %>% get_ci()
percentile_ci

# Plot the bootstrap distribution with that interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)
# Standard-error ("se") interval built around the point estimate.
standard_error_ci <- boot %>% get_ci(type = "se", point_estimate = z_hat)
standard_error_ci

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

Two numerical vars - SLR

Point estimate

# Point estimate: slope for arr_delay as a function of dep_delay.
slope_hat <- specify(fli_small, arr_delay ~ dep_delay) %>%
  calculate(stat = "slope")
slope_hat

# Bootstrap distribution of the slope from 1000 resamples.
boot <- specify(fli_small, arr_delay ~ dep_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "slope")
# Interval from get_ci() with its default settings.
percentile_ci <- boot %>% get_ci()
percentile_ci

# Plot the bootstrap distribution with that interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)
# Standard-error ("se") interval built around the point estimate.
standard_error_ci <- boot %>% get_ci(type = "se", point_estimate = slope_hat)
standard_error_ci

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

Two numerical vars - correlation

Point estimate

# Point estimate: correlation between arr_delay and dep_delay.
correlation_hat <- specify(fli_small, arr_delay ~ dep_delay) %>%
  calculate(stat = "correlation")
correlation_hat

# Bootstrap distribution of the correlation from 1000 resamples.
boot <- specify(fli_small, arr_delay ~ dep_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "correlation")
# Interval from get_ci() with its default settings.
percentile_ci <- boot %>% get_ci()
percentile_ci

# Plot the bootstrap distribution with that interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)
# Standard-error ("se") interval built around the point estimate.
standard_error_ci <- boot %>%
  get_ci(type = "se", point_estimate = correlation_hat)
standard_error_ci

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

Two numerical vars - t

Not currently implemented since $t$ could refer to standardized slope or standardized correlation.

# Point estimate.
# NOTE(review): the accompanying text says stat = "t" is not currently
# implemented for two numerical variables, so this chunk is presumably shown
# for illustration only -- confirm before running.
t_hat <- specify(fli_small, arr_delay ~ dep_delay) %>%
  calculate(stat = "t")
t_hat

# Bootstrap distribution of t from 1000 resamples.
boot <- specify(fli_small, arr_delay ~ dep_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t")
# Interval from get_ci() with its default settings.
percentile_ci <- boot %>% get_ci()
percentile_ci

# Plot the bootstrap distribution with that interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)
# Standard-error ("se") interval built around the point estimate.
standard_error_ci <- boot %>% get_ci(type = "se", point_estimate = t_hat)
standard_error_ci

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

andrewpbray/infer documentation built on Aug. 29, 2019, 5:57 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

andrewpbray/infer
Tidy Statistical Inference

In andrewpbray/infer: Tidy Statistical Inference

Data preparation

Hypothesis tests

One numerical variable (mean)

One numerical variable (standardized mean $t$)

One numerical variable (median)

One categorical (one proportion)

One categorical variable (standardized proportion $z$)

Two categorical (2 level) variables

Two categorical (2 level) variables (z)

One categorical (>2 level) - GoF

Two categorical (>2 level) variables

One numerical variable, one categorical (2 levels) (diff in means)

One numerical variable, one categorical (2 levels) (t)

One numerical variable, one categorical (2 levels) (diff in medians)

One numerical, one categorical (>2 levels) - ANOVA

Two numerical vars - SLR

Two numerical vars - correlation

Two numerical vars - SLR (t)

Confidence intervals

One numerical (one mean)

One numerical (one mean - standardized)

One categorical (one proportion)

One categorical variable (standardized proportion $z$)

One numerical variable, one categorical (2 levels) (diff in means)

One numerical variable, one categorical (2 levels) (t)

Two categorical variables (diff in proportions)

Two categorical variables (z)

Two numerical vars - SLR

Two numerical vars - correlation

Two numerical vars - t

R Package Documentation

Browse R Packages

We want your feedback!

andrewpbray/infer Tidy Statistical Inference

In andrewpbray/infer: Tidy Statistical Inference

Data preparation

Hypothesis tests

One numerical variable (mean)

One numerical variable (standardized mean $t$)

One numerical variable (median)

One categorical (one proportion)

One categorical variable (standardized proportion $z$)

Two categorical (2 level) variables

Two categorical (2 level) variables (z)

One categorical (>2 level) - GoF

Two categorical (>2 level) variables

One numerical variable, one categorical (2 levels) (diff in means)

One numerical variable, one categorical (2 levels) (t)

One numerical variable, one categorical (2 levels) (diff in medians)

One numerical, one categorical (>2 levels) - ANOVA

Two numerical vars - SLR

Two numerical vars - correlation

Two numerical vars - SLR (t)

Confidence intervals

One numerical (one mean)

One numerical (one mean - standardized)

One categorical (one proportion)

One categorical variable (standardized proportion $z$)

One numerical variable, one categorical (2 levels) (diff in means)

One numerical variable, one categorical (2 levels) (t)

Two categorical variables (diff in proportions)

Two categorical variables (z)

Two numerical vars - SLR

Two numerical vars - correlation

Two numerical vars - t

R Package Documentation

Browse R Packages

We want your feedback!

andrewpbray/infer
Tidy Statistical Inference