# Set the default knitr chunk option so that chunk source code is echoed in
# the knitted output.
knitr::opts_chunk$set(echo = TRUE)
# Course package; the get_MAD_stat(), do_it(), shuffle(), and pnull() calls
# used later in this file are presumably provided by it — TODO confirm.
library(SDS100)

$\\$

Hypothesis test for more than 2 means

Does the price of a college differ depending on whether it is located in a City, Town, Rural, or Suburb area?

$\\$

# Permutation test: does mean college cost differ across the four locale
# groups (City, Town, Rural, Suburb)?

# Step 1: State the null and alternative hypotheses ----

# H0: mu_city = mu_town = mu_rural = mu_suburb
# HA: mu_i != mu_j   for at least one pair of locations



# Step 2: Get the data, visualize it, and compute the statistic ----

# Load the data, downloading the file first if it is not already in the
# working directory, so the script also runs from a clean checkout.
if (!file.exists("CollegeScores4yr.csv")) {
  download_data("CollegeScores4yr.csv")
}
college <- read.csv("CollegeScores4yr.csv")

# Delete rows with missing data.
# NOTE(review): na.omit() drops a college when ANY column is missing, not only
# Cost or Locale — confirm this listwise deletion is intended.
college <- na.omit(college)

# Response (cost) and grouping (locale) vectors used throughout the test.
cost <- college$Cost
locale <- college$Locale

# How many colleges are in each type of location?
table(college$Locale)


# Visualize the data - does there appear to be a difference?
boxplot(cost ~ locale)



# Calculate the observed MAD statistic.
# Usage: get_MAD_stat(data_vector, grouping_vector)
# The outer parentheses print the value as well as storing it.
(obs_stat <- get_MAD_stat(cost, locale))




# Step 3: Create the null distribution ----

# Fix the random seed so the shuffles below, and therefore the histogram and
# the p-value, are the same every time the document is run or knit.
set.seed(100)

# Each repetition shuffles the locale labels, breaking any association with
# cost, and recomputes the MAD statistic on the shuffled labels.
# do_it(10000) * { ... } presumably evaluates the braces 10000 times and
# collects the results — TODO confirm against the SDS100 documentation.
null_dist <- do_it(10000) * {

  shuffled_location <- shuffle(locale)
  get_MAD_stat(cost, shuffled_location)

}


# Visualize the null distribution, marking the observed statistic in red.
hist(null_dist, breaks = 100)
abline(v = obs_stat, col = "red")



# Step 4: Get the p-value ----

# lower.tail = FALSE requests the upper tail of the null distribution
# relative to the observed statistic.
pnull(obs_stat, null_dist, lower.tail = FALSE)



# Step 5: Decision? ----

$\\$

Hypothesis test for correlation

Do movies that have larger budgets have larger profits?

Let's look at the correlation between budget and profit!

### Hypothesis test for correlation --------------

# Permutation test for a positive correlation between a movie's budget and
# its domestic gross.

# Presumably supplies the `bechdel` data frame used below — TODO confirm.
library(fivethirtyeight)



# Step 1: State the null and alternative hypotheses ----

# H0: rho = 0
# HA: rho > 0




# Step 2: Get the data, visualize it, and compute the statistic ----

# NOTE(review): na.omit() drops a movie when ANY column of bechdel is missing,
# not only the two columns used below — confirm this is intended.
bechdel3 <- na.omit(bechdel)

budget <- bechdel3$budget_2013
# NOTE(review): this column is domestic gross, not profit (gross minus
# budget) — confirm it is the intended measure for the question being asked.
gross <- bechdel3$domgross_2013

plot(budget, gross)

# Observed correlation; the outer parentheses print the value as well as
# storing it.
(obs_stat <- cor(budget, gross))





# Step 3: Create the null distribution ----

# Fix the random seed so the shuffles below, and therefore the histogram and
# the p-value, are the same every time the document is run or knit.
set.seed(100)

# Each repetition shuffles gross, breaking its pairing with budget, and
# recomputes the correlation.
# do_it(10000) * { ... } presumably evaluates the braces 10000 times and
# collects the results — TODO confirm against the SDS100 documentation.
null_dist <- do_it(10000) * {
  cor(budget, shuffle(gross))
}

# Visualize the null distribution, marking the observed correlation in red.
hist(null_dist, breaks = 100)
abline(v = obs_stat, col = "red")





# Step 4: Get the p-value ----

# lower.tail = FALSE requests the upper tail, matching the one-sided
# alternative HA: rho > 0.
pnull(obs_stat, null_dist, lower.tail = FALSE)




# Step 5: Decision ----
# Reject H0!


emeyers/SDS100 documentation built on April 28, 2024, 5:07 p.m.