In emeyers/SDS100: Tools for the class Introductory Statistics

# Global knitr chunk option: echo = TRUE shows each chunk's R source in the
# knitted output.
knitr::opts_chunk$set(echo = TRUE)

library(SDS100)

$\\$

Inference for regression - Bechdel data

Step 1

$H_0: \beta = 0$ $H_A: \beta > 0$

# Step 2: compute the observed statistic (the fitted regression slope)

# the bechdel data set comes from the fivethirtyeight package
library(fivethirtyeight)

# drop every row that contains at least one missing value
bechdel <- na.omit(bechdel)

# predictor and response, each divided by 10^6 (i.e., expressed in millions)
budget <- bechdel[["budget"]] / 1e6
revenue <- bechdel[["domgross"]] / 1e6

# simple linear regression of revenue on budget
lm_fit <- lm(revenue ~ budget)

# slope on budget: the second coefficient (the first is the intercept)
obs_stat <- coef(lm_fit)["budget"]



# step 3: build the null distribution with randomization methods
# Each of the 10000 repetitions refits the regression with budget passed
# through shuffle() and keeps the second coefficient (the slope).
null_dist <- do_it(10000) * {
  coef(lm(revenue ~ shuffle(budget)))[2]
}

# plot the null distribution to see where the observed statistic falls
hist(null_dist, breaks = 100)

# step 4: upper-tail p-value (lower.tail = FALSE), matching the one-sided H_A
# NOTE(review): presumably pnull() returns the proportion of null_dist that is
# at least as large as obs_stat -- confirm against the SDS100 documentation
pnull(obs_stat, null_dist, lower.tail = FALSE)




# step 5





# using parametric methods with R's built-in functions
# summary() on the lm fit prints the coefficient table: estimates, standard
# errors, t values and p-values.
# NOTE: the p-values it reports are two-sided, whereas H_A above is one-sided
# (beta > 0).

summary(lm_fit)





# could also bootstrap a CI for the regression slope (the second coefficient)
# using SDS100::resample_pairs()

boot_dist <- do_it(1000) * {
  # refit on the resampled data; the formula uses its columns vector1 and
  # vector2 (presumably budget and revenue, in argument order -- confirm
  # against the SDS100 documentation)
  coef(lm(vector2 ~ vector1, data = resample_pairs(budget, revenue)))[2]
}

# 2.5th and 97.5th percentiles of the bootstrap slopes: a 95% percentile CI
quantile(boot_dist, probs = c(0.025, 0.975))

$\\$

Multiple regression

Try to predict a movie's revenue (domgross_2013) from: 1) the year, 2) whether it passed the Bechdel test, and 3) the movie's budget.

# Columns for the multiple regression: the three predictors followed by the
# response (domgross_2013)
columns_to_use <- c(
  "year",
  "binary",
  "budget_2013",
  "domgross_2013"
)

# keep only those columns so that `.` in the formula means "all the others"
bechdel2 <- bechdel[columns_to_use]

# regress domgross_2013 on every remaining column of bechdel2
# (note: this overwrites the earlier simple-regression lm_fit)
lm_fit <- lm(domgross_2013 ~ ., data = bechdel2)

# coefficient table for the multiple regression
summary(lm_fit)

$\\$

Inference for correlation - Bechdel data

We can run a parametric test for correlation using the following t-statistic:

$t ~=~ \frac{r\sqrt{n - 2}}{\sqrt{1 - r^2}}$

Let's examine again whether there is a correlation between budget and revenue in the Bechdel data.

$\\$

Step 1

$H_0: \rho = 0$ $H_A: \rho > 0$

# Step 2 - calculate the t-statistic

# sample correlation between budget and revenue
# (the outer parentheses print the value as well as assigning it)
(r_stat <- cor(budget, revenue))

# number of (budget, revenue) pairs
n <- length(budget)

# t = r * sqrt(n - 2) / sqrt(1 - r^2), with n - 2 degrees of freedom
t_stat <- r_stat * sqrt(n - 2) / sqrt(1 - r_stat^2)


# Steps 3-5: upper-tail p-value, because H_A is rho > 0

pt(t_stat, df = n - 2, lower.tail = FALSE)



# check with the cor.test() function
# alternative = "greater" runs the same one-sided test as above; the default
# ("two.sided") would report a two-sided p-value that does not match pt()
cor.test(budget, revenue, alternative = "greater")