In emeyers/SDS100: Tools for the class Introductory Statistics

knitr::opts_chunk$set(echo = TRUE)

library(SDS100)

$\\$

Is there a significant correlation between movie budget and movie revenue?

library(fivethirtyeight)



# Permutation test: is movie budget positively correlated with movie revenue?

# Step 1: State the null and alternative hypotheses

# H0: rho = 0
# HA: rho > 0



# Step 2: Get the data, visualize it, and compute the statistic of interest

# Keep only the movies that have both variables used in this analysis.
# Calling na.omit() on the whole data frame would also discard movies that
# are missing a value in some column this analysis never looks at.
analysis_cols <- c("budget_2013", "domgross_2013")
bechdel2 <- bechdel[complete.cases(bechdel[, analysis_cols]), ]

budget <- bechdel2$budget_2013
revenue <- bechdel2$domgross_2013


# visualize the data
plot(budget, revenue,
     xlab = "Budget ($)",
     ylab = "Revenue ($)")


# calculate the observed correlation (outer parentheses also print it)
(obs_stat <- cor(budget, revenue))



# Step 3: Create the null distribution

# Each repetition recomputes the correlation after shuffling revenue, which
# breaks any pairing between a movie's budget and its revenue (i.e., H0).
null_dist <- do_it(10000) * {

  cor(budget, shuffle(revenue))

}


# visualize the null distribution, marking the observed statistic in red
hist(null_dist, breaks = 100)
abline(v = obs_stat, col = "red")



# Step 4: Get the p-value

# lower.tail = FALSE requests the upper tail, matching the one-sided
# alternative HA: rho > 0
pnull(obs_stat, null_dist, lower.tail = FALSE)



# Step 5: Decision?

$\\$

Does the cost of a college differ depending on whether the college is in a City, Town, Rural, or Suburb location?

$\\$

# Permutation test: does mean college cost differ across locale types?

# Step 1: State the null and alternative hypotheses

# H0: mu_city = mu_town = mu_rural = mu_suburb
# HA: mu_i != mu_j   for at least one pair of locations



# Step 2: Get the data, visualize it, and compute the statistic of interest

# load the data
college <- read.csv("https://www.lock5stat.com/datasets3e/CollegeScores4yr.csv")

# Keep only the colleges that have both variables used in this analysis.
# Calling na.omit() on the whole data frame would also discard colleges that
# are missing a value in some column this analysis never looks at.
# Blank locale entries (if any) are treated as missing as well, so the groups
# are exactly the locale types named in the hypotheses.
has_cost <- !is.na(college$Cost)
has_locale <- !is.na(college$Locale) & college$Locale != ""
college <- college[has_cost & has_locale, ]

# how many colleges are in each type of location?
table(college$Locale)

cost <- college$Cost
locale <- college$Locale


# visualize the data - does there appear to be a difference?
boxplot(cost ~ locale)


# calculate the MAD statistic
# get_MAD_stat(data_vector, grouping_vector)
# (outer parentheses also print the value)
(obs_stat <- get_MAD_stat(cost, locale))



# Step 3: Create the null distribution

# Each repetition recomputes the MAD statistic after shuffling the locale
# labels, which breaks any link between a college's cost and its locale
# (i.e., H0).
null_dist <- do_it(10000) * {

  shuffled_location <- shuffle(locale)
  get_MAD_stat(cost, shuffled_location)

}


# visualize the null distribution, marking the observed statistic in red
hist(null_dist, breaks = 100)
abline(v = obs_stat, col = "red")



# Step 4: Get the p-value

# lower.tail = FALSE requests the upper tail of the null distribution
pnull(obs_stat, null_dist, lower.tail = FALSE)



# Step 5: Decision?