# Set the default knitr chunk option `echo = TRUE` for the document.
knitr::opts_chunk$set(echo = TRUE)
library(SDS100)
$\\$
Step 1
$H_0: \beta = 0$ $H_A: \beta > 0$
# load the library with the data
library(fivethirtyeight)

# remove missing values
bechdel <- na.omit(bechdel)

# extract variables of interest (rescaled to millions of dollars)
budget <- bechdel$budget / 10^6
revenue <- bechdel$domgross / 10^6

# step 2: observed statistic = fitted slope of revenue on budget
# (coefficient 1 is the intercept, coefficient 2 is the slope)
lm_fit <- lm(revenue ~ budget)
obs_stat <- coef(lm_fit)[2]

# step 3: using randomization methods
# shuffling budget breaks any association with revenue, so the slopes
# collected here form the null distribution
null_dist <- do_it(10000) * {
  lm_shuff <- lm(revenue ~ shuffle(budget))
  coef(lm_shuff)[2]
}

# visualize the null distribution: where is the observed stat
hist(null_dist, breaks = 100)

# step 4: one-sided (upper tail) p-value, matching H_A: beta > 0
pnull(obs_stat, null_dist, lower.tail = FALSE)

# step 5

# using parametric methods with R's built-in functions
summary(lm_fit)

# could do the bootstrap to get a CI for the slope (beta) using
# SDS100::resample_pairs()
# NOTE(review): the formula below relies on resample_pairs() returning a data
# frame with columns vector1 (budget) and vector2 (revenue) -- confirm against
# the SDS100 documentation
boot_dist <- do_it(1000) * {
  resampled_data <- resample_pairs(budget, revenue)
  lm_boot <- lm(vector2 ~ vector1, data = resampled_data)
  coef(lm_boot)[2]
}

# 95% percentile bootstrap interval
quantile(boot_dist, c(.025, .975))
$\\$
Try to predict a movie's revenue (domgross_2013) from: 1) year 2) whether it passed the Bechdel test 3) the movie's budget
# keep only the response (domgross_2013) and the three predictors
columns_to_use <- c("year", "binary", "budget_2013", "domgross_2013")
bechdel2 <- bechdel[, columns_to_use]

# `.` on the right-hand side uses every other column of bechdel2 as a predictor
lm_fit <- lm(domgross_2013 ~ ., data = bechdel2)
summary(lm_fit)
$\\$
We can run a parametric test for correlation using the following t-statistic:
$t ~=~ \frac{r\sqrt{n - 2}}{\sqrt{1 - r^2}}$
Let's examine again if there is a correlation between budget and revenue on the Bechdel data.
$\\$
Step 1
$H_0: \rho = 0$ $H_A: \rho > 0$
# Step 2 - calculate the t-statistic
# (the outer parentheses print the sample correlation as well as storing it)
(r_stat <- cor(budget, revenue))
n <- length(budget)
t_stat <- r_stat * sqrt(n - 2) / sqrt(1 - r_stat^2)

# Steps 3-5: upper-tail p-value from a t distribution with n - 2 degrees of
# freedom, matching H_A: rho > 0
pt(t_stat, n - 2, lower.tail = FALSE)

# check with the cor.test() function
# cor.test() is two-sided by default; request the one-sided alternative so its
# p-value is comparable to the one computed by hand above
cor.test(budget, revenue, alternative = "greater")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.