$\$

SDS230::download_data("IPED_salaries_2016.rda")

# SDS230::download_data("Interactions_Categorical.csv")
# install.packages("latex2exp")

library(latex2exp)
library(dplyr)
library(ggplot2)

#options(scipen=999)


knitr::opts_chunk$set(echo = TRUE)

set.seed(123)

$\$

Overview

In this class, we will explore connections between ANOVA and regression, run pairwise comparisons after a one-way ANOVA, fit a two-way ANOVA, and examine how unbalanced designs affect sums of squares calculations.

$\$

Part 1: Connections between ANOVA and regression

Let's look at connections between the least squares fit we used when fitting linear regression models and our one-way ANOVA.

$\$

library(dplyr)
library(ggplot2)

load("IPED_salaries_2016.rda")

IPED_2 <- IPED_salaries |>
  filter(endowment > 0) |>
  mutate(log_salary = log10(salary_tot)) |>
  filter(CARNEGIE %in% c(15, 31)) |>
  filter(rank_name %in% c("Assistant", "Associate", "Full")) |>
  group_by(school) |>
  mutate(num_ranks = n()) |>
  filter(num_ranks == 3) |>   # only use schools that have all three ranks
  ungroup()                   # drop the school grouping so later analyses use the full data frame


# could look at the log salary instead...
# IPED_2$salary_tot <- IPED_2$log_salary

dim(IPED_2)


# visualize the data
IPED_2 |>
  ggplot(aes(rank_name, salary_tot, col = rank_name)) + 
  geom_jitter(position = position_jitter(width = .2)) + 
  xlab("Faculty rank") + 
  ylab("Salary ($)")

$\$

Step 1.1: Least squares offsets are the group means

Let's look at the mean salary for each rank and compare it to the least squares offsets that the lm() function finds. With R's default treatment coding, the intercept is the mean of the baseline group (Assistant), and each remaining coefficient is the offset of a group's mean from that baseline.

# fit a linear model 
fit_salary <- lm(salary_tot ~ rank_name, data = IPED_2)


# check that the least squares fit offsets are the means of each group
(summary_salary <- summary(fit_salary))

fit_coefs <- coef(fit_salary)


c(fit_coefs[1], fit_coefs[1] + fit_coefs[2], fit_coefs[1] + fit_coefs[3])



# get the mean and sd of the salary for each faculty rank
IPED_stats <- IPED_2 |>
  group_by(rank_name) |>
  summarize(mean_salary = mean(salary_tot),
            sd_salary = sd(salary_tot))


IPED_stats$mean_salary
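
To check the claim directly, we can compare the two sets of numbers (a quick sketch; all.equal() should return TRUE):

# verify that the means reconstructed from the coefficients equal the group means
all.equal(unname(fit_coefs[1] + c(0, fit_coefs[2], fit_coefs[3])),
          IPED_stats$mean_salary)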

$\$

Step 1.2: Using R to get an ANOVA table and to look at diagnostic plots

We can use the anova() function to create an ANOVA table, and we can use the plot() function to look at diagnostic plots to make sure our ANOVA conditions have been met.

# an easy way to get the ANOVA table is to use the anova() function
(anova_table <- anova(fit_salary))



# we can use regression diagnostic plots to assess if ANOVA conditions have been met
par(mfrow = c(2, 2))
plot(fit_salary)


# we should also check that the maximum and minimum standard deviations are not greater 
# than a factor of 2 apart
IPED_stats$sd_salary
max(IPED_stats$sd_salary)/min(IPED_stats$sd_salary)

$\$

Step 1.3: Kruskal–Wallis test to see if any of our groups stochastically dominate another group

If we are concerned that our one-way ANOVA conditions are not met, we can run a Kruskal–Wallis test, which does not rely on the assumptions of normality and homoscedasticity. We could also run a permutation test, which does not rely on these assumptions either (a sketch follows the code below).

# Kruskal–Wallis test
kruskal.test(salary_tot ~ rank_name, data = IPED_2)


# compare to the ANOVA 
# anova(fit_salary)
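
The permutation test mentioned above could be implemented as follows (a sketch: we shuffle the rank labels and use the F statistic as the test statistic; 10,000 shuffles is a choice, not a fixed rule).

# permutation test: shuffle the rank labels and recompute the F statistic
obs_F <- anova(fit_salary)$`F value`[1]

n_shuffles <- 10000
perm_Fs <- rep(NA, n_shuffles)
for (i in 1:n_shuffles) {
  shuffled_ranks <- sample(IPED_2$rank_name)
  perm_Fs[i] <- anova(lm(IPED_2$salary_tot ~ shuffled_ranks))$`F value`[1]
}

# p-value: proportion of shuffled F statistics at least as large as the observed one
mean(perm_Fs >= obs_F)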

$\$


Part 2: Pairwise comparisons after running a one-way ANOVA

Part 2.1: Pairwise comparisons in R

If we run a one-way ANOVA and the results are statistically significant, there are a number of tests we can use to see which pairs of groups differ significantly.

# test with no multiple comparisons adjustment (not great)
(pairwise_ttests <- pairwise.t.test(IPED_2$salary_tot, IPED_2$rank_name, p.adj = "none"))


# with the Bonferroni correction
(bonferroni <- pairwise.t.test(IPED_2$salary_tot, IPED_2$rank_name, p.adj = "bonferroni"))


# Note: the Bonferroni p-values are 3 times larger than the unadjusted p-values
# (capped at 1), since three pairwise comparisons are made
bonferroni$p.value/pairwise_ttests$p.value


# Tukey's HSD test using the TukeyHSD() function;
# it gives results similar to the Bonferroni correction
fit_salary_aov <- aov(salary_tot ~ rank_name, data = IPED_2)
TukeyHSD(fit_salary_aov)
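
We can also plot the Tukey HSD family-wise confidence intervals; any interval that does not cross zero corresponds to a significant pairwise difference.

# plot the Tukey HSD confidence intervals for the pairwise differences
plot(TukeyHSD(fit_salary_aov))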

$\$

Part 3: Two-way ANOVA

Let's use a two-way ANOVA to examine whether ants are more attracted to particular types of sandwiches! This is a balanced design because there is the same number of observations at each combination of factor levels (in this case, 4 observations per combination, as the check below confirms).

$\$

Part 3.1: Visualizing the data

Let's start by visualizing the data (you can practice stating the null and alternative hypotheses on the homework).

# install.packages("Stat2Data")

library(Stat2Data)

data(SandwichAnts)
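
# a quick sanity check that the design is balanced: each Bread-by-Filling
# combination should have the same number of observations (4 per cell)
table(SandwichAnts$Bread, SandwichAnts$Filling)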


# visualize the data
SandwichAnts |>
  ggplot(aes(Filling, Ants)) + 
  geom_point() + 
  facet_wrap(~Bread)


# Two-way interaction plot using base R
interaction.plot(x.factor = SandwichAnts$Bread, trace.factor = SandwichAnts$Filling, 
                 response = SandwichAnts$Ants, fun = mean, 
                 type = "b", legend = TRUE, 
                 xlab = "Break", ylab="Filling",
                 pch=c(1,19), col = c("#00AFBB", "#E7B800", "#FF0000"))

$\$

Part 3.2: Fitting the model

Let's now use a two-way ANOVA to run a hypothesis test to see if the differences are statistically significant.

# Create a model with main effects and their interaction
fit_ants1 <- lm(Ants ~ Bread*Filling, data = SandwichAnts)
anova(fit_ants1)


# Order doesn't matter for a balanced design
fit_ants2 <- lm(Ants ~ Filling*Bread, data = SandwichAnts)
anova(fit_ants2)


# Could look at diagnostic plots
par(mfrow = c(2, 2))
plot(fit_ants1)

$\$

Part 4: Examining unbalanced data

In an unbalanced design, there are different numbers of measured responses at the different factor-level combinations. When running an ANOVA on unbalanced data, one needs to be careful because there are different ways to calculate the sums of squares for the different factors, and this can lead to different conclusions about which factors are statistically significant. Let's examine this using the IPED faculty salary data.

# Factor A: lecturer, assistant, associate, full professor
# Factor B: liberal arts vs research university 

IPED_3 <- IPED_salaries |>
  filter(rank_name %in% c("Lecturer", "Assistant", "Associate", "Full")) |>
  mutate(rank_name  = droplevels(rank_name)) |>
  filter(CARNEGIE %in% c(15, 31)) |>
 # na.omit() |>
  mutate(Inst_type = dplyr::recode(CARNEGIE, "31" = "Liberal arts", "15" = "Research extensive"))


# examine properties of the data 
table(IPED_3$Inst_type, IPED_3$rank_name)

$\$

Part 4.1: Type I sums of squares

In Type I sums of squares, the sums of squares are calculated sequentially: SS(A) is computed first, then SS(B | A), and then SS(AB | A, B). For unbalanced data, this means the order in which the factors enter the model matters. In particular:

# Create a main effects and interaction model
fit_profs1 <- lm(salary_tot ~ Inst_type * rank_name, data = IPED_3)
fit_profs2 <- lm(salary_tot ~ rank_name * Inst_type, data = IPED_3)

anova(fit_profs1)
anova(fit_profs2)
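
To make "sequential" concrete, the Type I sum of squares for rank_name in fit_profs1 can be reproduced as a model comparison (a sketch using anova()'s two-model form; the "Sum of Sq" entry should match the rank_name row of anova(fit_profs1)).

# SS(rank_name | Inst_type): compare the model with only Inst_type
# to the model that also adds rank_name
fit_A  <- lm(salary_tot ~ Inst_type, data = IPED_3)
fit_AB <- lm(salary_tot ~ Inst_type + rank_name, data = IPED_3)
anova(fit_A, fit_AB)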

$\$

Part 4.2: Type III sums of squares

In Type III sums of squares, the full model SS(A, B, AB) is fit, and the sum of squares for each factor is determined by comparing the full model to the model with that factor removed (for example, the Type III sum of squares for A is SS(A | B, AB)). Because each factor is compared against the full model, the order in which the variables enter does not matter.

# with Type III sums of squares, the order in which variables are added does not matter
car::Anova(fit_profs1, type = "III")
car::Anova(fit_profs2, type = "III")
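
One caveat: with R's default treatment contrasts, Type III tests of main effects in a model with an interaction can be hard to interpret, and sum-to-zero contrasts are usually recommended. A sketch of refitting with contr.sum (the explicit factor() conversion is just a precaution in case Inst_type is stored as character):

# refit using sum-to-zero contrasts before computing Type III tests
IPED_3_sum <- IPED_3 |> mutate(Inst_type = factor(Inst_type))
fit_profs_sum <- lm(salary_tot ~ Inst_type * rank_name, data = IPED_3_sum,
                    contrasts = list(Inst_type = "contr.sum", rank_name = "contr.sum"))
car::Anova(fit_profs_sum, type = "III")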

$\$


