knitr::opts_chunk$set(echo = TRUE)
library(SDS100)
Multiple-choice questions on Advanced Placement exams have five options: A, B, C, D, and E. A random sample of the correct choice on 400 multiple-choice questions on a variety of AP shows that B was the correct answer 90 of the 400 questions. Does this provide evidence that B occurs at a different rate than what is expected if each question is equally likely?
Please run a hypothesis test to find out!
$\$
# step 1: state null and alternative hypotheses # H0: pi = 1/5 = 0.2 # HA: pi != 1/5 = 0.2 # step 2: compute observed statistic (b_phat <- 90/400) # step 3: create the null distribution null_dist <- do_it(10000) * { rflip_count(400, .2)/400 } hist(null_dist, breaks = 30, main = "Null distribution", xlab = "Proportion of B's") abline(v = .2, col = "green") abline(v = b_phat, col = "red") abline(v = (.2 - (b_phat - .2)), col = "red") # step 4: calculate the p-value p_lower <- pnull(.2 - (b_phat - .2), null_dist, lower.tail = TRUE) p_upper <- pnull(b_phat, null_dist, lower.tail = FALSE) (p_value <- p_lower + p_upper) # 5. decision? # FYI: You don't need to look at the actual data to solve this problem, # but if you'd like it can be loaded using: # APMultipleChoice <- Lock5Data::APMultipleChoice # Bonus, can you create a 95% CI for the proportion of answers that are B? answers <- Lock5Data::APMultipleChoice$Answer boot_dist <- do_it(10000) * { boot_sample <- sample(answers, 400, replace = TRUE) prop.table(table(boot_sample))["B"] } hist(boot_dist, breaks = 30, main = "Bootstrap distribution") SE_boot <- sd(boot_dist) (CI <- b_phat + 2 * c(-SE_boot, SE_boot))
$\$
$\$
# Null hypothesis is: public universities have the same completion rates on # average as private ones # Alternative hypothesis is: # Private universities have different completion rates than public universities? # In symbols: # H0: mu_private - mu_public = 0 # HA: mu_private - mu_public != 0 # significance level: alpha = 0.05 # step 2: observed stat college <- read.csv("https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv") public <- subset(college, Control == "Public") private <- subset(college, Control == "Private") public_CompRate <- public$CompRate private_CompRate <- private$CompRate # What's a good way to visualize the data for our question of interest? boxplot(public_CompRate, private_CompRate, names = c("public", "private"), ylab = "Competion rate") # bar{x}_{private} - bar{x}_{public} obs_stat <- mean(private_CompRate, na.rm = TRUE) - mean(public_CompRate, na.rm = TRUE) # step 3: combo_data <- c(private_CompRate, public_CompRate) n_private <- length(private_CompRate) n_tot <- length(combo_data) null_dist <- do_it(10000) * { shuff_data <- shuffle(combo_data) shuff_private <- shuff_data[1:n_private] shuff_public <- shuff_data[(n_private + 1):n_tot] mean(shuff_private, na.rm = TRUE) - mean(shuff_public, na.rm = TRUE) } hist(null_dist, breaks = 100, xlim = c(-6, 6)) abline(v = obs_stat, col = "red") abline(v = -1 * obs_stat, col = "red") # step 4: p_right <- pnull(obs_stat, null_dist, lower.tail = FALSE) p_left <- pnull(-1 * obs_stat, null_dist, lower.tail = TRUE) (p_val <- p_right + p_left) # step 5: Conclusion?
$\$
$\$
# step 1: # In words: # Null hypothesis is: Movies that pass the Bechdel test have the same budget # on average as movies that pass the Bechdel test # Alternative hypothesis is: # that pass the Bechdel test have do not have the budget # on average as movies that pass the Bechdel test # In symbols: # H0: mu_pass - mu_fail = 0 # HA: mu_pass - mu_fail != 0 # significance level: alpha = 0.05 # step 2: observed stat library(fivethirtyeight) bechdel <- bechdel passed <- subset(bechdel, binary == "PASS") failed <- subset(bechdel, binary == "FAIL") passed_budget <- passed$budget_2013 failed_budget <- failed$budget_2013 # What's a good way to visualize the data for our question of interest? boxplot(passed_budget, failed_budget, names = c("passed", "failed"), ylab = "Budget ($)") # bar{x}_{private} - bar{x}_{public} obs_stat <- mean(passed_budget, na.rm = TRUE) - mean(failed_budget, na.rm = TRUE) # step 3: combo_data <- c(passed_budget, failed_budget) n_passed <- length(passed_budget) n_tot <- length(combo_data) null_dist <- do_it(10000) * { shuff_data <- shuffle(combo_data) shuff_private <- shuff_data[1:n_passed] shuff_public <- shuff_data[(n_passed + 1):n_tot] mean(shuff_private, na.rm = TRUE) - mean(shuff_public, na.rm = TRUE) } hist(null_dist, breaks = 100, xlim = c(-2 * 10^7, 2 * 10^7)) abline(v = obs_stat, col = "red") abline(v = -1 * obs_stat, col = "red") # step 4: p_right <- pnull(-1 * obs_stat, null_dist, lower.tail = FALSE) p_left <- pnull(obs_stat, null_dist, lower.tail = TRUE) (p_val <- p_right + p_left) # step 5: Conclusion?
$\$
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.