In emeyers/SDS100: Tools for the class Introductory Statistics

knitr::opts_chunk$set(echo = TRUE)

library(SDS100)

Should you choose B on a multiple choice test?

Multiple-choice questions on Advanced Placement exams have five options: A, B, C, D, and E. A random sample of the correct choice on 400 multiple-choice questions on a variety of AP shows that B was the correct answer 90 of the 400 questions. Does this provide evidence that B occurs at a different rate than what is expected if each question is equally likely?

Please run a hypothesis test to find out!

$\$

# step 1: state null and alternative hypotheses
# H0: pi = 1/5 = 0.2
# HA: pi != 1/5 = 0.2


# step 2: compute observed statistic
(b_phat <- 90/400)


# step 3: create the null distribution


null_dist <- do_it(10000) * {

  rflip_count(400, .2)/400

}


hist(null_dist, breaks = 30, 
     main = "Null distribution",
     xlab = "Proportion of B's")


abline(v = .2, col = "green")
abline(v = b_phat, col = "red")
abline(v = (.2 - (b_phat - .2)), col = "red")



# step 4: calculate the p-value

p_lower <- pnull(.2 - (b_phat - .2), null_dist, lower.tail = TRUE) 
p_upper <- pnull(b_phat, null_dist, lower.tail = FALSE) 

(p_value <- p_lower + p_upper)



# 5. decision?





# FYI: You don't need to look at the actual data to solve this problem, 
# but if you'd like it can be loaded using:
#  APMultipleChoice <- Lock5Data::APMultipleChoice



# Bonus, can you create a 95% CI for the proportion of answers that are B? 

answers <- Lock5Data::APMultipleChoice$Answer


boot_dist <- do_it(10000) * {

  boot_sample <- sample(answers, 400, replace = TRUE)
  prop.table(table(boot_sample))["B"]

}


hist(boot_dist, breaks = 30, main = "Bootstrap distribution")


SE_boot <- sd(boot_dist)

(CI <- b_phat + 2 * c(-SE_boot, SE_boot))

$\$

Do private colleges have the same graduation rates as public colleges?

$\$

# Null hypothesis is: public universities have the same completion rates on
#  average as private ones


# Alternative hypothesis is:
# Private universities have different completion rates than public universities?


# In symbols:
#  H0: mu_private - mu_public = 0
#  HA: mu_private - mu_public != 0

# significance level: alpha = 0.05  




# step 2: observed stat

college <- read.csv("https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv")

public <- subset(college, Control == "Public")
private <- subset(college, Control == "Private")

public_CompRate <- public$CompRate
private_CompRate  <- private$CompRate


# What's a good way to visualize the data for our question of interest? 
boxplot(public_CompRate, private_CompRate, 
        names = c("public", "private"),
        ylab = "Competion rate")



# bar{x}_{private} - bar{x}_{public}
obs_stat <- mean(private_CompRate, na.rm = TRUE) - mean(public_CompRate, na.rm = TRUE)




# step 3:

combo_data <- c(private_CompRate, public_CompRate)

n_private <- length(private_CompRate)
n_tot <- length(combo_data)

null_dist <- do_it(10000) * {

  shuff_data <- shuffle(combo_data)
  shuff_private <- shuff_data[1:n_private]
  shuff_public <- shuff_data[(n_private + 1):n_tot]

  mean(shuff_private, na.rm = TRUE) - mean(shuff_public, na.rm = TRUE)

}

hist(null_dist, breaks = 100, xlim = c(-6, 6))
abline(v = obs_stat, col = "red")
abline(v = -1 * obs_stat, col = "red")


# step 4:
p_right <- pnull(obs_stat, null_dist, lower.tail = FALSE)
p_left <- pnull(-1 * obs_stat, null_dist, lower.tail = TRUE)

(p_val <- p_right + p_left)



# step 5: Conclusion?

$\$

Do movies that pass the Bechdel test have the same average budgets as movies that fail?

$\$

# step 1:

# In words: 

# Null hypothesis is: Movies that pass the Bechdel test have the same budget 
#  on average as movies that pass the Bechdel test

# Alternative hypothesis is:
#  that pass the Bechdel test have do not have the budget 
#  on average as movies that pass the Bechdel test


# In symbols:
#  H0: mu_pass - mu_fail = 0
#  HA: mu_pass - mu_fail != 0

# significance level: alpha = 0.05  




# step 2: observed stat



library(fivethirtyeight)

bechdel <- bechdel


passed <- subset(bechdel, binary == "PASS")
failed <- subset(bechdel, binary == "FAIL")

passed_budget <- passed$budget_2013
failed_budget <- failed$budget_2013


# What's a good way to visualize the data for our question of interest? 
boxplot(passed_budget, failed_budget, 
        names = c("passed", "failed"),
        ylab = "Budget ($)")



# bar{x}_{private} - bar{x}_{public}
obs_stat <- mean(passed_budget, na.rm = TRUE) - mean(failed_budget, na.rm = TRUE)




# step 3:

combo_data <- c(passed_budget, failed_budget)

n_passed <- length(passed_budget)
n_tot <- length(combo_data)

null_dist <- do_it(10000) * {

  shuff_data <- shuffle(combo_data)
  shuff_private <- shuff_data[1:n_passed]
  shuff_public <- shuff_data[(n_passed + 1):n_tot]

  mean(shuff_private, na.rm = TRUE) - mean(shuff_public, na.rm = TRUE)

}

hist(null_dist, breaks = 100, xlim = c(-2 * 10^7, 2 * 10^7))
abline(v = obs_stat, col = "red")
abline(v = -1 * obs_stat, col = "red")


# step 4:
p_right <- pnull(-1 * obs_stat, null_dist, lower.tail = FALSE)
p_left <- pnull(obs_stat, null_dist, lower.tail = TRUE)

(p_val <- p_right + p_left)



# step 5: Conclusion?