In emeyers/SDS100: Tools for the class Introductory Statistics

# Set the knitr chunk option `echo` to TRUE for the chunks that follow.
knitr::opts_chunk$set(echo = TRUE)

# Attach the course package. The analyses below call do_it(), shuffle(),
# pnull() and get_MAD_stat(), which are not defined in this file --
# presumably they come from SDS100 (confirm against the package reference).
library(SDS100)

Do private colleges have the same graduation rates as public colleges?

$\\$

# step 1: state the hypotheses

# In words:

# Null hypothesis is: public universities have the same completion rates on
#  average as private ones

# Alternative hypothesis is:
#  private universities have different completion rates on average than
#  public universities

# In symbols:
#  H0: mu_private - mu_public = 0
#  HA: mu_private - mu_public != 0

# significance level: alpha = 0.05




# step 2: observed stat

college <- read.csv("https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv")

public <- subset(college, Control == "Public")
private <- subset(college, Control == "Private")

public_CompRate <- public$CompRate
private_CompRate <- private$CompRate


# What's a good way to visualize the data for our question of interest?
boxplot(public_CompRate, private_CompRate,
        names = c("public", "private"),
        ylab = "Completion rate")



# bar{x}_{private} - bar{x}_{public}
# (missing completion rates are dropped inside each mean)
obs_stat <- mean(private_CompRate, na.rm = TRUE) - mean(public_CompRate, na.rm = TRUE)




# step 3: null distribution

# Pool the two groups. Each simulated statistic below re-splits a shuffled
# copy of the pool into a "private" group and a "public" group of the
# original sizes. The sizes are counted with length(), so they include any
# missing values; those are dropped again inside mean().
combo_data <- c(private_CompRate, public_CompRate)

n_private <- length(private_CompRate)
n_tot <- length(combo_data)

# do_it(10000) * { ... } presumably evaluates the block 10000 times and
# collects the value of its last expression (SDS100 helper -- confirm).
null_dist <- do_it(10000) * {

  shuff_data <- shuffle(combo_data)
  shuff_private <- shuff_data[1:n_private]
  shuff_public <- shuff_data[(n_private + 1):n_tot]

  mean(shuff_private, na.rm = TRUE) - mean(shuff_public, na.rm = TRUE)

}

# mark the observed statistic and its mirror image (two-sided test)
hist(null_dist, breaks = 100, xlim = c(-6, 6))
abline(v = obs_stat, col = "red")
abline(v = -1 * obs_stat, col = "red")


# step 4: two-sided p-value

# Take the tails at +|obs_stat| and -|obs_stat| so the right and left tail
# areas are correct whichever sign the observed difference has.
p_right <- pnull(abs(obs_stat), null_dist, lower.tail = FALSE)
p_left <- pnull(-abs(obs_stat), null_dist, lower.tail = TRUE)

(p_val <- p_right + p_left)



# step 5: Conclusion?

$\\$

Do movies that pass the Bechdel test have the same average budgets as movies that fail?

$\\$

# step 1: state the hypotheses

# In words:

# Null hypothesis is: Movies that pass the Bechdel test have the same budget
#  on average as movies that fail the Bechdel test

# Alternative hypothesis is:
#  Movies that pass the Bechdel test do not have the same budget
#  on average as movies that fail the Bechdel test


# In symbols:
#  H0: mu_pass - mu_fail = 0
#  HA: mu_pass - mu_fail != 0

# significance level: alpha = 0.05




# step 2: observed stat



library(fivethirtyeight)

# copy the package's bechdel data set into the workspace
bechdel <- bechdel


passed <- subset(bechdel, binary == "PASS")
failed <- subset(bechdel, binary == "FAIL")

passed_budget <- passed$budget_2013
failed_budget <- failed$budget_2013


# What's a good way to visualize the data for our question of interest?
boxplot(passed_budget, failed_budget,
        names = c("passed", "failed"),
        ylab = "Budget ($)")



# bar{x}_{pass} - bar{x}_{fail}
obs_stat <- mean(passed_budget, na.rm = TRUE) - mean(failed_budget, na.rm = TRUE)




# step 3: null distribution

# Pool the two groups. Each simulated statistic below re-splits a shuffled
# copy of the pool into a "passed" group and a "failed" group of the
# original sizes.
combo_data <- c(passed_budget, failed_budget)

n_passed <- length(passed_budget)
n_tot <- length(combo_data)

# do_it(10000) * { ... } presumably evaluates the block 10000 times and
# collects the value of its last expression (SDS100 helper -- confirm).
null_dist <- do_it(10000) * {

  shuff_data <- shuffle(combo_data)
  shuff_passed <- shuff_data[1:n_passed]
  shuff_failed <- shuff_data[(n_passed + 1):n_tot]

  mean(shuff_passed, na.rm = TRUE) - mean(shuff_failed, na.rm = TRUE)

}

# mark the observed statistic and its mirror image (two-sided test)
hist(null_dist, breaks = 100, xlim = c(-2 * 10^7, 2 * 10^7))
abline(v = obs_stat, col = "red")
abline(v = -1 * obs_stat, col = "red")


# step 4: two-sided p-value

# Take the tails at +|obs_stat| and -|obs_stat| so the right and left tail
# areas are correct whichever sign the observed difference has.
p_right <- pnull(abs(obs_stat), null_dist, lower.tail = FALSE)
p_left <- pnull(-abs(obs_stat), null_dist, lower.tail = TRUE)

(p_val <- p_right + p_left)



# step 5: Conclusion?

$\\$

Is there a significant correlation between movie budget and movie revenue?

# Step 1: hypotheses (one-sided: is the correlation positive?)
#
#   H0: rho = 0
#   HA: rho > 0


# Step 2: get the data, visualize it, and compute the statistic of interest

# keep only the rows of bechdel that have no missing values
bechdel2 <- na.omit(bechdel)

budget <- bechdel2[["budget_2013"]]
revenue <- bechdel2[["domgross_2013"]]


# visualize the data
plot(x = budget,
     y = revenue,
     xlab = "Budget ($)",
     ylab = "Revenue ($)")


# calculate the observed correlation, then display it
obs_stat <- cor(budget, revenue)
obs_stat


# Step 3: create the null distribution

# Shuffling revenue breaks its pairing with budget; each repetition records
# the correlation between budget and the shuffled revenue.
null_dist <- do_it(10000) * {

  shuffled_revenue <- shuffle(revenue)
  cor(budget, shuffled_revenue)

}


# visualize the null distribution with the observed correlation marked in red
# (optionally pass xlim = c(-.7, .7) to hist() to widen the x-axis)
hist(null_dist, breaks = 100)
abline(v = obs_stat, col = "red")


# Step 4: get the p-value (upper tail, matching HA: rho > 0)
pnull(obs_stat, null_dist, lower.tail = FALSE)


# Step 5: Decision?

$\\$

Does the price of the college differ depending on whether the college is in a City, Town, Rural, or Suburb location?

$\\$

# Step 1: hypotheses
#
#   H0: mu_city = mu_town = mu_rural = mu_suburb
#   HA: mu_i != mu_j   for at least one pair of locations


# Step 2: get the data, visualize it, and compute the statistic of interest

# load the data and delete rows with missing data
# (alternative: download_data("CollegeScores4yr.csv"))
college <- na.omit(
  read.csv("https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv")
)

cost <- college[["Cost"]]
locale <- college[["Locale"]]

# how many colleges are in each type of location?
table(college$Locale)


# visualize the data - does there appear to be a difference?
boxplot(cost ~ locale)


# calculate the MAD statistic, then display it
# usage: get_MAD_stat(data_vector, grouping_vector)
obs_stat <- get_MAD_stat(cost, locale)
obs_stat


# Step 3: create the null distribution

# Each repetition shuffles the location labels and recomputes the MAD
# statistic for cost against the shuffled labels.
null_dist <- do_it(10000) * {

  get_MAD_stat(cost, shuffle(locale))

}


# visualize the null distribution with the observed statistic marked in red
hist(null_dist, breaks = 100)
abline(v = obs_stat, col = "red")


# Step 4: get the p-value (upper tail of the null distribution)
pnull(obs_stat, null_dist, lower.tail = FALSE)


# Step 5: Decision?

emeyers/SDS100 documentation built on April 28, 2024, 5:07 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

emeyers/SDS100
Tools for the class Introductory Statistics

In emeyers/SDS100: Tools for the class Introductory Statistics

Do private colleges have the same graduation rates as public colleges?

Do movies that pass the Bechdel test have the same average budgets as movies that fail?

Is there a significant correlation between movie budget and movie revenue?

Does the price of the college differ depending on whether the college is in a City, Town, Rural, or Suburb location?

R Package Documentation

Browse R Packages

We want your feedback!

emeyers/SDS100 Tools for the class Introductory Statistics

In emeyers/SDS100: Tools for the class Introductory Statistics

Do private colleges have the same graduation rates as public colleges?

Do movies that pass the Bechdel test have the same average budgets as movies that fail?

Is there a significant correlation between movie budget and movie revenue?

Does the price of the college differ depending on whether the college is in a City, Town, Rural, or Suburb location?

R Package Documentation

Browse R Packages

We want your feedback!

emeyers/SDS100
Tools for the class Introductory Statistics