# Set the knitr chunk option `echo` to TRUE as the document-wide default
knitr::opts_chunk$set(echo = TRUE)

$\\$

# Attach the SDS100 package (presumably the source of download_data() -- confirm)
library(SDS100)

# Fetch the CO2 data set; the same file name is later passed to read.csv()
download_data("CarbonDioxide.csv")

# Fetch the college scores data set; the same file name is later passed to read.csv()
download_data("CollegeScores4yr.csv")

Regression!

# Bechdel movie data: model domestic gross as a linear function of budget


library(fivethirtyeight)

# keep only the rows of bechdel that have no missing value in any column
bechdel <- na.omit(bechdel)

# pull out the two columns being modeled, rescaled to millions of $
domgross <- bechdel[["domgross"]] / 1e6
budget <- bechdel[["budget"]] / 1e6


# scatterplot with budget on the x-axis and domestic gross on the y-axis
plot(x = budget, y = domgross)

# least-squares fit of domestic gross on budget
lm_fit <- lm(domgross ~ budget)

# print the intercept and slope of the fitted line
coef(lm_fit)

# overlay the fitted line on the scatterplot in red
abline(reg = lm_fit, col = "red")

$\\$

Extra practice

Practice problem 1: Concentration of CO2 in the Atmosphere

Levels of carbon dioxide (CO2) in the atmosphere are rising rapidly, far above any levels ever before recorded. Levels were around 278 parts per million in 1800, before the Industrial Age, and had never, in the hundreds of thousands of years before that, gone above 300 ppm. Levels are now over 400 ppm.

Please use the data loaded below to predict CO2 levels in different years, and answer the questions in the comments below:

library(SDS100)

#download_data("CarbonDioxide.csv")

# Read the CO2 measurements from the CSV file in the working directory
carbon <- read.csv("CarbonDioxide.csv")

# 1. What is the explanatory variable? What is the response variable?
year <- carbon$Year   # explanatory variable (right-hand side of the model below)
# NOTE(review): the column name is spelled with a zero ("C02"), not the letter O;
# confirm that it matches the header in CarbonDioxide.csv
co2 <- carbon$C02     # response variable (left-hand side of the model below)


# 2. Create a scatterplot of the data. Does there appear to be a linear relationship?
plot(year, co2)


# 3. Find the correlation between year and CO2 levels. Does the value of the correlation support your answer to part 2?
cor(year, co2)


# 4. Compute the regression line to predict CO2 from year.
lm_fit <- lm(co2 ~ year)
the_coefficients <- coef(lm_fit)   # element 1 is the intercept, element 2 the slope


# 5. Interpret the slope of the regression line, in terms of carbon dioxide concentrations.
# (the outer parentheses print the value as well as assigning it)
(slope <- the_coefficients[2])   # A: 1.5 ppm increase per year


# 6. What is the intercept of the line? Does it make sense in context? Why or why not?
(intercept <- the_coefficients[1])  # A: -2701 ppm in the year 0 


# 7.  Use the regression line to predict the CO2 level in 2003. In 2025.
slope * c(2003, 2025) + intercept


# 8. Find the residual for 2010.
# Select the 2010 observation by its year rather than by a hard-coded row
# position, so the answer does not depend on the row order of the CSV.
# An empty result means the data contain no observation for 2010.
co2[which(year == 2010)] - (slope * 2010 + intercept)

Practice problem 2: Faculty Salary and Completion Rate

The CompRate variable in CollegeScores4yr records the percentage of students at each four-year school who graduate within six years (known as the completion or graduation rate). Another variable, FacSalary, gives the average monthly salary for faculty (in dollars) at each school.

#download_data("CollegeScores4yr.csv")

# Read the four-year college data from the CSV file in the working directory
salary_data <- read.csv("CollegeScores4yr.csv")

# response: completion rate; predictor: average monthly faculty salary
completion_rate <- salary_data[["CompRate"]]
salaries <- salary_data[["FacSalary"]]


# 1. Fit the regression line that predicts completion rate at four-year
# colleges from faculty salary, and keep its intercept and slope.

lm_fit <- lm(completion_rate ~ salaries)
the_coeffs <- coef(lm_fit)


# 2. Expected completion rate, according to the fitted line, at a school
# whose average monthly faculty salary is $5,000 (intercept + slope * 5000).

the_coeffs[1] + 5000 * the_coeffs[2]



# 3. Does it look like higher completion rates are associated with higher faculty salaries?

Practice problem 3: Does playing football affect brain size?

Let's examine whether playing football affects one's brain size...

library(Lock5Data)

# pull the three columns used below out of the FootballBrain data frame
group <- FootballBrain[["Group"]]
hippocampus_vol <- FootballBrain[["Hipp"]]
years_played <- FootballBrain[["Years"]]



# 1. scatterplot of, and correlation between, years played and hippocampus volume
plot(x = years_played, y = hippocampus_vol)
cor(x = years_played, y = hippocampus_vol)



# 2. side-by-side boxplots of hippocampus volume, one box per group
boxplot(hippocampus_vol ~ group)


emeyers/SDS100 documentation built on April 28, 2024, 5:07 p.m.