$\$

# makes sure you have all the packages we have used in class installed
#SDS230::update_installed_packages()

SDS230::download_data("IPED_salaries_2016.rda")
# install.packages("latex2exp")

library(latex2exp)
library(dplyr)
library(ggplot2)

#options(scipen=999)


knitr::opts_chunk$set(echo = TRUE)

set.seed(123)

$\$

Overview

$\$

Part 1: Polynomial regression

We can also fit models that include non-linear transformations of our explanatory variables. This allows us to fit a much broader range of models to our data. In particular, we can fit models that are polynomial functions of our original explanatory variables:

$y = \hat{\beta}_0 + \hat{\beta}_1 \cdot x + \hat{\beta}_2 \cdot x^2 + ... + \hat{\beta}_k \cdot x^k$

$\$

Let's start by recreating our assistant professor salary data...

load("IPED_salaries_2016.rda")

# get the assistant professor data
# we will keep all the variables and all schools in the data frame for this analysis
assistant_data <- filter(IPED_salaries,  endowment > 0,  rank_name == "Assistant")  
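
As a quick sanity check on the filtering (a minimal sketch; salary_tot, endowment, and rank_name are the columns used elsewhere in this document):

# check the number of schools and look at a few of the key columns
dim(assistant_data)
head(select(assistant_data, salary_tot, endowment, rank_name))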

$\$

Part 1.1: Non-linear transformations of the explanatory variable x

Let's explore fitting a model that predicts salary from log endowment, log endowment squared and log endowment cubed using our salary data from assistant professors:

$\text{salary} = \hat{\beta}_0 + \hat{\beta}_1 \cdot \text{log(endowment)} + \hat{\beta}_2 \cdot \text{log(endowment)}^2 + \hat{\beta}_3 \cdot \text{log(endowment)}^3$

To do this we can use the poly() function, which creates polynomial expansions of our explanatory variables that are "orthogonal", meaning that they will not suffer from multicollinearity. If we had instead created the polynomial expansions ourselves using, say, a dplyr mutate() operation, the predictor values could be highly correlated and thus lead to multicollinearity (e.g., if x = 1:10, then x and x^2 will be highly correlated, leading to multicollinearity).
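
As a quick illustration of this point (a small sketch using a made-up x = 1:10, not the salary data):

# raw polynomial terms are highly correlated for x = 1:10
x <- 1:10
cor(x, x^2)

# the columns created by poly() are orthogonal (pairwise correlations of essentially 0)
round(cor(poly(x, 3)), 10)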

Let's fit a degree 3 polynomial linear regression model to our data.

# refit our original degree 1 model
lm_fit_1 <- lm(salary_tot ~ log(endowment), data = assistant_data)

summary_1 <- summary(lm_fit_1)


# fit a cubic model
lm_fit_3 <- lm(salary_tot ~ poly(log(endowment), 3), 
               data = assistant_data)

# get statistics on our cubic fit
(summary_3 <- summary(lm_fit_3))


# compare the r^2
summary_1$r.squared
summary_3$r.squared

Question: Are the higher order terms statistically significant?
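
One way to check this directly is to pull the p-values out of the coefficient table of the cubic fit (a minimal sketch using the summary_3 object created above):

# p-values for each term in the cubic model
summary_3$coefficients[, "Pr(>|t|)"]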

$\$

Part 1.2: Let's visualize our model

predict_df <- data.frame(endowment = 10^(seq(0, 11, by = .1)))


plot(log(assistant_data$endowment), assistant_data$salary_tot)

points(log(predict_df$endowment), predict(lm_fit_3, newdata = predict_df), type = "l", col = "red")

Question: Does this model visually appear to fit the data better?
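
Since ggplot2 is loaded at the top of this document, a roughly equivalent plot could also be made as follows (a sketch using the same assistant_data, predict_df, and lm_fit_3 objects from above):

# the same visualization using ggplot2
ggplot(assistant_data, aes(log(endowment), salary_tot)) + 
  geom_point() + 
  geom_line(data = data.frame(endowment = predict_df$endowment, 
                              salary_tot = predict(lm_fit_3, newdata = predict_df)), 
            color = "red")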

$\$

Part 1.3: Using an ANOVA to compare the nested models

Because our linear model is nested within our cubic model, we can use an ANOVA to evaluate whether our cubic model accounts for significantly more of the variability than our linear model.

anova(lm_fit_1, lm_fit_3)
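
If we just want the p-value from this model comparison, we can pull it out of the ANOVA table (a minimal sketch):

# p-value from the F-test comparing the nested models
anova(lm_fit_1, lm_fit_3)$"Pr(>F)"[2]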

$\$

Part 1.4: Higher order polynomials

Let's try fitting even higher order polynomials!

# try degrees 5, 15, 25, 35, and 45
for (degree in seq(5, 50, 10)) {

  # fit a polynomial model of the current degree
  lm_fit_poly <- lm(salary_tot ~ poly(log(endowment), degree, raw = TRUE), data = assistant_data)
  the_summary <- summary(lm_fit_poly)

  print(the_summary$r.squared)

}
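
To see the trend more clearly, we could store the $R^2$ value for each degree and plot it as a function of the polynomial degree (a sketch that repeats the loop above with the results saved):

# store the R^2 for each degree and plot it as a function of the degree
degrees <- seq(5, 50, 10)
r_squared_by_degree <- NULL

for (i in seq_along(degrees)) {
  curr_fit <- lm(salary_tot ~ poly(log(endowment), degrees[i], raw = TRUE), 
                 data = assistant_data)
  r_squared_by_degree[i] <- summary(curr_fit)$r.squared
}

plot(degrees, r_squared_by_degree, type = "o", 
     xlab = "polynomial degree", ylab = "R^2")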

$\$

Part 1.5: Visualizing higher order polynomial fits

Let's visualize our degree 50 polynomial fit.

# fit 50 degree polynomial
poly_fit50 <- lm(salary_tot ~ poly(log(endowment), 50, raw = TRUE), data = assistant_data)


# data frame with x-values to get predicted y-hat values 
predict_df <- data.frame(endowment = 10^(seq(0, 11, by = .1)))


# predicted y values (i.e., the regression curve)
predicted_values <- predict(poly_fit50, newdata = predict_df)


# visualize the data
plot(log(assistant_data$endowment), assistant_data$salary_tot)


points(log(predict_df$endowment), predicted_values, type = "l", col = "red")

What would Michael Jackson say about this?

https://www.youtube.com/watch?v=DQWI1kvmwRg

$\$

Part 2: Comparing models with R^2, adjusted R^2, AIC, BIC and cross-validation

We can compare models using several statistics including $R^2$, $R^2_{adj}$, $AIC$, $BIC$ and cross-validation. Let's try comparing models using these statistics to see how well each statistic captures our intuition about the best model to use.

$\$

Part 2.1: Comparing models with R^2, adjusted R^2, AIC, BIC

Let's compare models using these statistics:

We can use either the synthetic x, y data we created above or the assistant professor data. We will fit models with different degree polynomials and see which model each statistic selects.

library(dplyr)


# fit models of degree 1, 3, and 5
#fit_1 <- lm(y ~ poly(x, 1, raw=TRUE))
#fit_3 <- lm(y ~ poly(x, 3, raw=TRUE))
#fit_5 <- lm(y ~ poly(x, 5, raw=TRUE))


fit_1 <- lm(salary_tot ~ poly(log(endowment), 1), data = assistant_data)
fit_3 <- lm(salary_tot ~ poly(log(endowment), 3), data = assistant_data)
fit_5 <- lm(salary_tot ~ poly(log(endowment), 5), data = assistant_data)



# summarizing these models
fit1_summary <- summary(fit_1)
fit3_summary <- summary(fit_3)
fit5_summary <- summary(fit_5)



# printing the R^2 values (higher is better)
cat('R^2 \n')
(rsquared_values <- c(fit1_summary$r.squared, fit3_summary$r.squared, fit5_summary$r.squared))
which.max(rsquared_values)


# printing the adjusted R^2 values (higher is better)
cat("\n Adjusted R^2 \n")
(adjusted_rsquared_values <- c(fit1_summary$adj.r.squared, 
                              fit3_summary$adj.r.squared,
                              fit5_summary$adj.r.squared))  
which.max(adjusted_rsquared_values)



# printing the AIC values (lower is better)
cat("\n AIC \n")
(aic_values <- c(AIC(fit_1), AIC(fit_3), AIC(fit_5)))
which.min(aic_values)


# printing the BIC values (lower is better)
cat("\n BIC \n")
(bic_values <- c(BIC(fit_1), BIC(fit_3), BIC(fit_5)))
which.min(bic_values)
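
We can also collect all of these statistics into a single comparison table (a sketch using the vectors computed above):

# put all of the model comparison statistics into one data frame
data.frame(degree        = c(1, 3, 5),
           r_squared     = rsquared_values,
           adj_r_squared = adjusted_rsquared_values,
           AIC           = aic_values,
           BIC           = bic_values)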

$\$

Part 2.2: Comparing models using cross-validation

Let's now compare the models using cross-validation. To keep things simple, we are just going to split the data once into a training set and a test set rather than do k-fold cross-validation.

We will compare the models based on their mean squared prediction error (MSPE) where a smaller MSPE is better.
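
For a test set with $n_{test}$ points, the MSPE is just the average squared difference between the actual and predicted salaries:

$MSPE = \frac{1}{n_{test}} \sum_{i=1}^{n_{test}} (y_i - \hat{y}_i)^2$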

# create the training set and the test set
total_num_points <- dim(assistant_data)[1]
num_training_points <- floor(nrow(assistant_data)/2)

training_data <- assistant_data[1:num_training_points, ]
test_data <- assistant_data[(num_training_points + 1):total_num_points, ]
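
Note that this split just puts the first half of the rows in the training set and the second half in the test set. If the rows were ordered in some systematic way, a random split might be preferable (a sketch, using the seed set at the top of this document; the *_random names are just for illustration):

# an alternative: randomly assign half of the rows to the training set
random_training_rows <- sample(total_num_points, num_training_points)

training_data_random <- assistant_data[random_training_rows, ]
test_data_random <- assistant_data[-random_training_rows, ]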



# fit both models on the training data, and calculate the MSPE based on their predictions on the test set

fit_cv_1 <- lm(salary_tot ~ poly(log(endowment), 3), data = training_data)
test_predictions_1 <- predict(fit_cv_1, newdata = test_data)
MSPE_smaller_model <- mean((test_data$salary_tot - test_predictions_1)^2)


fit_cv_2 <- lm(salary_tot ~ poly(log(endowment), 5), data = training_data)
test_predictions_2 <- predict(fit_cv_2, newdata = test_data)
MSPE_larger_model <- mean((test_data$salary_tot - test_predictions_2)^2)


MSPE_smaller_model   # smaller MSPE is better
MSPE_larger_model

which.min(c(MSPE_smaller_model, MSPE_larger_model))




# use a for loop to compare the MSPE for models of degree 1 to 7
cv_results <- NULL
for (i in 1:7) {
  curr_fit_cv <- lm(salary_tot ~ poly(log(endowment), i, raw = TRUE), data = training_data)
  curr_test_predictions <- predict(curr_fit_cv, newdata = test_data)
  cv_results[i] <- mean((test_data$salary_tot - curr_test_predictions)^2) 
 }
plot(cv_results, type = "o")
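
We can also report which degree gave the smallest prediction error (a small addition to the loop above):

# degree with the smallest MSPE on the test set
which.min(cv_results)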

$\$


