$\$
# makes sure you have all the packages we have used in class installed #SDS230::update_installed_packages() SDS230::download_data("IPED_salaries_2016.rda")
# install.packages("latex2exp") library(latex2exp) library(dplyr) library(ggplot2) #options(scipen=999) knitr::opts_chunk$set(echo = TRUE) set.seed(123)
$\$
$\$
We can also fit models that have non-linear transformations to our explanatory variables. This allows us to fit a much broader range of models to our data. In particular, we can fit models that are polynomial functions of our original explanatory variables:
$y = \hat{\beta}_0 + \hat{\beta}_1 \cdot x + \hat{\beta}_2 \cdot x^2 + ... + \hat{\beta}_k \cdot + x^k$
$\$
Let's start by recreating our assistant professor salary data...
load("IPED_salaries_2016.rda") # get the assistant professor data # we will keep all the variables and all schools in the data frame for this analysis assistant_data <- filter(IPED_salaries, endowment > 0, rank_name == "Assistant")
$\$
Let's explore fitting a model that predicts salary from log endowment, log endowment squared and log endowment cubed using our salary data from assistant professors:
$\text{salary} = \hat{\beta}_0 + \hat{\beta}_1 \cdot \text{log(endowment)} + \hat{\beta}_2 \cdot \text{log(endowment)}^2 + \hat{\beta}_2 \cdot + \text{log(endowment)}^3$
To do this we can use the poly()
function which creates polynomial expansions
of our explanatory variables that are "orthogonal", which means that they will
not have multicolinearity. If we had instead just created polynomial expansions
of our data using say a dplyr mutate()
operation, then the predictor values
could be highly correlated and thus lead to multicolinearity (e.g., if x = 1:10,
then x and x^2 will be highly correlated leading to multicolinearity).
Let's fit a degree 3 polynomial linear regression model to our data.
# refit our original degree 1 model lm_fit_1 <- lm(salary_tot ~ log(endowment), data = assistant_data) summary_1 <- summary(lm_fit_1) # fit a cubic model lm_fit_3 <- lm(salary_tot ~ poly(log(endowment), 3), data = assistant_data) # get statistics on our cubic fit (summary_3 <- summary(lm_fit_3)) # compare the r^2 summary_1$r.squared summary_3$r.squared
Question: Are the higher order terms statistically significant?
$\$
predict_df <- data.frame(endowment = 10^(seq(0, 11, by = .1))) predicted_values <- predict(lm_fit_3, newdata = predict_df) # visualize the model fit plot(log(assistant_data$endowment), assistant_data$salary_tot) points(log(predict_df$endowment), predicted_values, type = "l", col = "red")
Question: Does this model visually appear to fit the data better?
$\$
Because our linear model is nested within our cubic model, we can use an ANOVA to evaluate whether our cubic model accounts for more of the variability than our linear model.
anova(lm_fit_1, lm_fit_3)
$\$
Let's try fitting even higher order polynomials!
# try degrees, 5, 10, etc. for (degree in seq(5, 50, 10)) { # fit a cubic model lm_fit_poly <- lm(salary_tot ~ poly(log(endowment), degree, raw = TRUE), data = assistant_data) the_summary <- summary(lm_fit_poly) print(the_summary$r.squared) }
$\$
Let's visualize our 50 degree polynomial fit
# fit 50 degree polynomial poly_fit50 <- lm(salary_tot ~ poly(log(endowment), 50, raw = TRUE), data = assistant_data) # data frame with x-values to get predicted y-hat values predict_df <- data.frame(endowment = 10^(seq(0, 11, by = .1))) # predicted y values (i.e., the regression curve) predicted_values <- predict(poly_fit50, newdata = predict_df) # visualize the data plot(log(assistant_data$endowment), assistant_data$salary_tot) points(log(predict_df$endowment), predicted_values, type = "l", col = "red")
What would Michael Jackson say about this?
https://www.youtube.com/watch?v=DQWI1kvmwRg
$\$
We can compare models using several statistics including $R^2$, $R^2_{adj}$, $AIC$, $BIC$ and cross-validation. Let's try comparing models using these statistics to see how well each statistic captures our intuition about the best model to use.
$\$
Let's compare models using these statistics:
R^2
: where a higher value means a better fitAdjusted R^2
: where higher value means a better fitAIC
: where a lower value means a better modelBIC
: where a lower value means a better modelWe can use either the synthetic x, y, data we created above, or the assistant professor data. We will fit the model with different degree polynomials and wee which modle each statistic selects.
library(dplyr) # fit models of degree 1, 3, and 5 #fit_1 <- lm(y ~ poly(x, 1, raw=TRUE)) #fit_3 <- lm(y ~ poly(x, 3, raw=TRUE)) #fit_5 <- lm(y ~ poly(x, 5, raw=TRUE)) fit_1 <- lm(salary_tot ~ poly(log(endowment), 1), data = assistant_data) fit_3 <- lm(salary_tot ~ poly(log(endowment), 3), data = assistant_data) fit_5 <- lm(salary_tot ~ poly(log(endowment), 5), data = assistant_data) # summarizing these models fit1_summary <- summary(fit_1) fit3_summary <- summary(fit_3) fit5_summary <- summary(fit_5) # printing the R^2 values (higher is better) cat('R^2 \n') (rsquared_values <- c(fit1_summary$r.squared, fit3_summary$r.squared, fit5_summary$r.squared)) which.max(rsquared_values) # printing the adjusted R^2 values (higher is better) cat("\n Adjusted R^2 \n") (adjusted_rsquared_values <- c(fit1_summary$adj.r.squared, fit3_summary$adj.r.squared, fit5_summary$adj.r.squared)) which.max(adjusted_rsquared_values) # printing the AIC values (lower is better) cat("\n AIC \n") (aic_values <- c(AIC(fit_1), AIC(fit_3), AIC(fit_5))) which.min(aic_values) # printing the BIC values (lower is better) cat("\n BIC \n") (bic_values <- c(BIC(fit_1), BIC(fit_3), BIC(fit_5))) which.min(bic_values)
$\$
Let's now compare the models using cross-validation. To keep things simple, we are just going to split the data once into a training set and a test set rather than do k-fold cross-validation.
We will compare the models based on their mean squared prediction error (MSPE) where a smaller MSPE is better.
# create the training set and the test set total_num_points <- dim(assistant_data)[1] num_training_points <- floor(nrow(assistant_data)/2) training_data <- assistant_data[1:num_training_points, ] test_data <- assistant_data[(num_training_points + 1):total_num_points, ] # fit both models on the training data, and calculate the MSPE based on their predictions on the test set fit_cv_1 <- lm(salary_tot ~ poly(log(endowment), 3), data = training_data) test_predictions_1 <- predict(fit_cv_1, newdata = test_data) MSPE_smaller_model <- mean((test_data$salary_tot - test_predictions_1)^2) fit_cv_2 <- lm(salary_tot ~ poly(log(endowment), 5), data = training_data) test_predictions_2 <- predict(fit_cv_2, newdata = test_data) MSPE_larger_model <- mean((test_data$salary_tot - test_predictions_2)^2) MSPE_smaller_model # smaller MSPE is better MSPE_larger_model which.min(c(MSPE_smaller_model, MSPE_larger_model)) # use a for loop to compare the MSPE for models of degree 1 to 7 cv_results <- NULL for (i in 1:7) { curr_fit_cv <- lm(salary_tot ~ poly(log(endowment), i, raw = TRUE), data = training_data) curr_test_predictions <- predict(curr_fit_cv, newdata = test_data) cv_results[i] <- mean((test_data$salary_tot - curr_test_predictions)^2) } plot(cv_results, type = "o")
$\$
In prior classes we have visualized multiple regression models with categorical
predictors using base R graphics. In particular, we created scatter plots for
data in different categories using the plot()
and points()
and used the
col
argument to color the points. We then added on regression lines using the
abline()
function. This method was useful for educational purposes so that we
could see the connection between the model parameters that were estimated using
the lm()
function, and the underlying data. However, if we want to create
better looking visualizations in a more efficient manner, then it is better to
use ggplot.
Let's now use ggplot to visualize a multiple regression model that has one quantitative and one categorical variable. In particular, let's recreate the plot from class 19 part 3.6 where we display faculty salaries as a function of log endowment separately for different faculty ranks.
# load the data into R load("IPED_salaries_2016.rda") # create the IPED data subset used in class 19 part 3 IPED_2 <- IPED_salaries |> filter(endowment > 0) |> mutate(log_endowment = log10(endowment)) |> filter(CARNEGIE %in% c(15, 31)) |> filter(rank_name %in% c("Assistant", "Associate", "Full")) dim(IPED_2) # using ggplot IPED_2 |> ggplot(aes(log_endowment, salary_tot, col = rank_name)) + geom_point() + geom_smooth(method = "lm") + xlab("Log endowment") + ylab("Salary ($)")
$\$
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.