# Document setup: set the knitr chunk option echo = TRUE as the default for
# every chunk in this document.
knitr::opts_chunk$set(echo = TRUE)
# Attach the SDS100 package. do_it(), shuffle(), pnull() and get_MAD_stat()
# used below are presumably provided by it -- TODO confirm.
library(SDS100)

Do private colleges have the same graduation rates as public colleges?

$\\$

# Permutation test: do private and public colleges have the same average
# completion rate?

# step 1: state the hypotheses

# In words:

# Null hypothesis is: public universities have the same completion rates on
#  average as private ones

# Alternative hypothesis is:
#  private universities have different completion rates on average than
#  public universities

# In symbols:
#  H0: mu_private - mu_public = 0
#  HA: mu_private - mu_public != 0

# significance level: alpha = 0.05


# step 2: observed stat

college <- read.csv("https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv")

# keep only the rows whose Control column is "Public" / "Private"
public <- subset(college, Control == "Public")
private <- subset(college, Control == "Private")

public_CompRate <- public$CompRate
private_CompRate <- private$CompRate


# What's a good way to visualize the data for our question of interest?
boxplot(public_CompRate, private_CompRate,
        names = c("public", "private"),
        ylab = "Completion rate")


# bar{x}_{private} - bar{x}_{public}
# (na.rm = TRUE: missing completion rates are left out of each mean)
obs_stat <- mean(private_CompRate, na.rm = TRUE) -
  mean(public_CompRate, na.rm = TRUE)


# step 3: null distribution

# Pool both groups. Under H0 the group labels are interchangeable, so each
# shuffled vector is split back into groups of the original sizes.
combo_data <- c(private_CompRate, public_CompRate)

n_private <- length(private_CompRate)
n_tot <- length(combo_data)

# do_it(10000) * { ... } presumably evaluates the block 10000 times and
# collects the value of its last expression -- see the SDS100 documentation.
null_dist <- do_it(10000) * {

  shuff_data <- shuffle(combo_data)
  shuff_private <- shuff_data[seq_len(n_private)]
  shuff_public <- shuff_data[(n_private + 1):n_tot]

  mean(shuff_private, na.rm = TRUE) - mean(shuff_public, na.rm = TRUE)

}

# Null distribution with the observed statistic and its mirror image marked
hist(null_dist, breaks = 100, xlim = c(-6, 6))
abline(v = obs_stat, col = "red")
abline(v = -1 * obs_stat, col = "red")


# step 4: two-sided p-value
# abs() keeps the two tails correct whether obs_stat is positive or negative:
# the right tail starts at +|obs_stat| and the left tail ends at -|obs_stat|.
p_right <- pnull(abs(obs_stat), null_dist, lower.tail = FALSE)
p_left <- pnull(-1 * abs(obs_stat), null_dist, lower.tail = TRUE)

(p_val <- p_right + p_left)



# step 5: Conclusion? 

$\\$

Do movies that pass the Bechdel test have the same average budgets as movies that fail?

$\\$

# Permutation test: do movies that pass the Bechdel test have the same
# average budget as movies that fail it?

# step 1: state the hypotheses

# In words: 

# Null hypothesis is: Movies that pass the Bechdel test have the same budget 
#  on average as movies that fail the Bechdel test

# Alternative hypothesis is:
#  Movies that pass the Bechdel test do not have the same budget 
#  on average as movies that fail the Bechdel test


# In symbols:
#  H0: mu_pass - mu_fail = 0
#  HA: mu_pass - mu_fail != 0

# significance level: alpha = 0.05  


# step 2: observed stat

library(fivethirtyeight)

# copy the bechdel data set (presumably supplied by fivethirtyeight) into the
# workspace under the same name
bechdel <- bechdel


# split the movies by the value of the binary column
passed <- subset(bechdel, binary == "PASS")
failed <- subset(bechdel, binary == "FAIL")

passed_budget <- passed$budget_2013
failed_budget <- failed$budget_2013


# What's a good way to visualize the data for our question of interest? 
boxplot(passed_budget, failed_budget, 
        names = c("passed", "failed"),
        ylab = "Budget ($)")


# bar{x}_{pass} - bar{x}_{fail}
obs_stat <- mean(passed_budget, na.rm = TRUE) -
  mean(failed_budget, na.rm = TRUE)


# step 3: null distribution

# Pool both groups. Under H0 the pass/fail labels are interchangeable, so each
# shuffled vector is split back into groups of the original sizes.
combo_data <- c(passed_budget, failed_budget)

n_passed <- length(passed_budget)
n_tot <- length(combo_data)

# do_it(10000) * { ... } presumably evaluates the block 10000 times and
# collects the value of its last expression -- see the SDS100 documentation.
null_dist <- do_it(10000) * {

  shuff_data <- shuffle(combo_data)
  shuff_passed <- shuff_data[seq_len(n_passed)]
  shuff_failed <- shuff_data[(n_passed + 1):n_tot]

  mean(shuff_passed, na.rm = TRUE) - mean(shuff_failed, na.rm = TRUE)

}

# Null distribution with the observed statistic and its mirror image marked
hist(null_dist, breaks = 100, xlim = c(-2 * 10^7, 2 * 10^7))
abline(v = obs_stat, col = "red")
abline(v = -1 * obs_stat, col = "red")


# step 4: two-sided p-value
# abs() keeps the two tails correct whether obs_stat is positive or negative:
# the right tail starts at +|obs_stat| and the left tail ends at -|obs_stat|.
p_right <- pnull(abs(obs_stat), null_dist, lower.tail = FALSE)
p_left <- pnull(-1 * abs(obs_stat), null_dist, lower.tail = TRUE)

(p_val <- p_right + p_left)



# step 5: Conclusion? 

$\\$

Is there a significant correlation between movie budget and movie revenue?

# Step 1: hypotheses about the correlation rho between budget and revenue
#  (one-sided alternative)
#
#  H0: rho = 0
#  HA: rho > 0


# Step 2: get the data, visualize it, and compute the statistic of interest

# keep only the rows of bechdel that have no missing values
bechdel2 <- na.omit(bechdel)

budget <- bechdel2[["budget_2013"]]
revenue <- bechdel2[["domgross_2013"]]


# scatter plot of revenue against budget
plot(
  budget, revenue,
  xlab = "Budget ($)",
  ylab = "Revenue ($)"
)


# observed correlation: store it, then display it
obs_stat <- cor(budget, revenue)
obs_stat


# Step 3: create the null distribution
# Each repetition correlates budget with a shuffled copy of revenue; the
# 10000 resulting values are collected in null_dist.
null_dist <- do_it(10000) * {
  cor(budget, shuffle(revenue))
}


# histogram of the null distribution with the observed correlation in red
# (optionally add xlim = c(-.7, .7) to widen the x-axis)
hist(null_dist, breaks = 100)
abline(v = obs_stat, col = "red")


# Step 4: p-value from the upper tail only, matching HA: rho > 0
pnull(obs_stat, null_dist, lower.tail = FALSE)



# Step 5: Decision? 

$\\$

Does the price of the college differ depending on whether the college is in a City, Town, Rural, or Suburb location?

$\\$

# Step 1: hypotheses about the mean cost in each type of location
#
#  H0: mu_city = mu_town = mu_rural = mu_suburb
#  HA: mu_i != mu_j   for at least one pair of locations


# Step 2: get the data, visualize it, and compute the statistic of interest

# Read the data from the web and drop every row that has a missing value.
# (Alternative left over from the notes: download_data("CollegeScores4yr.csv"))
college <- na.omit(
  read.csv("https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv")
)

cost <- college[["Cost"]]
locale <- college[["Locale"]]

# how many colleges are in each type of location?
table(college[["Locale"]])


# side-by-side boxplots of cost for each locale - does there appear to be a
# difference?
boxplot(cost ~ locale)


# observed MAD statistic: get_MAD_stat(data_vector, grouping_vector)
# store it, then display it
obs_stat <- get_MAD_stat(cost, locale)
obs_stat


# Step 3: create the null distribution
# Each repetition shuffles the locale labels and recomputes the MAD statistic;
# the 10000 resulting values are collected in null_dist.
null_dist <- do_it(10000) * {

  shuffled_location <- shuffle(locale)
  get_MAD_stat(cost, shuffled_location)

}


# histogram of the null distribution with the observed statistic in red
hist(null_dist, breaks = 100)
abline(v = obs_stat, col = "red")


# Step 4: p-value from the upper tail of the null distribution
pnull(obs_stat, null_dist, lower.tail = FALSE)



# Step 5: Decision? 


emeyers/SDS100 documentation built on April 28, 2024, 5:07 p.m.