In CalvinData/ds202calvin:

library(learnr)
library(shiny)
options(dplyr.summarise.inform = FALSE)
library(tidyverse)
library(tidymodels)
data("ames", package = "modeldata")
knitr::opts_chunk$set(echo = FALSE, eval=TRUE)
set.seed(1)
ames <- ames %>%
  mutate(Sale_Price = Sale_Price / 1000) %>% 
  select(Lot_Area, Bsmt_Unf_SF, Sale_Price) %>%
  mutate(across(everything(), as.numeric))
ames_split <- rsample::initial_split(ames, prop = 1/10)
training_set <- rsample::training(ames_split)
test_set <- rsample::testing(ames_split)
min_intercept <- 0
max_intercept <- 250

Objectives

Experience the process of adjusting model parameters to fit data
Contrast different ways of measuring error, and how that affects what model you choose
See how models can perform better on their training data than unseen data

Simplest model

We'll start by looking at the simplest possible model, one that predicts a constant sale price for every home. This is analogous to saying something like "Homes around here generally go for around $150k".

We'll start by using a small subset of the home sale dataset.

training_set

summary(training_set)

Intercept only

Adjust the value of "intercept" to make your line give the best prediction.

sliderInput("intercept", "Intercept", min = min_intercept, max = max_intercept, value = 125)
plotOutput("plot_intercept_train")
tableOutput("error_intercept_train")

plotModel <- function(dataset, intercept, coef_lot_area) {
  ggplot(dataset, aes(x = Lot_Area, y = Sale_Price)) +
      geom_point() +
      geom_abline(intercept = intercept, slope = coef_lot_area)
}

errorTable <- function(dataset, intercept, coef_lot_area) {
  dataset %>% 
    mutate(prediction = intercept + Lot_Area * coef_lot_area, residual = Sale_Price - prediction) %>% 
    summarize(
      mean_error = mean(residual),
      mean_absolute_error = mean(abs(residual)),
      max_abs_error = max(abs(residual)),
      mean_squared_error = mean(residual * residual),
      rmse = sqrt(mean_squared_error)
    )
}
output$plot_intercept_train <- renderPlot({ plotModel(training_set, input$intercept, 0) })
output$plot_intercept_test <- renderPlot({  plotModel(test_set,     input$intercept, 0) })
output$plot_lotarea_train <- renderPlot({   plotModel(training_set, input$intercept2, input$coef_lot_area)})
output$plot_lotarea_test <- renderPlot({    plotModel(test_set,     input$intercept2, input$coef_lot_area) })

output$error_intercept_train <- renderTable({ errorTable(training_set, input$intercept, 0) })
output$error_intercept_test  <- renderTable({ errorTable(test_set, input$intercept, 0) })
output$error_lotarea_train   <- renderTable({ errorTable(training_set, input$intercept2, input$coef_lot_area) })
output$error_lotarea_test    <- renderTable({ errorTable(test_set, input$intercept2, input$coef_lot_area) })

question("We get the lowest Mean Absolute Error when we set the intercept to the ___ of the Sale_Price column (see summary table above)",
         answer("minimum"),
         answer("mean"),
         answer("exactly halfway between minimum and maximum"),
         answer("median", correct = TRUE),
         answer("maximum"),
         incorrect = "Try changing the intercept to each of these options and see what error you get!",
         allow_retry = TRUE)

question("We get the lowest Mean Squared Error when we set the intercept to the ___ of the Sale_Price column (see summary table above)",
         answer("minimum"),
         answer("mean", correct = TRUE),
         answer("exactly halfway between minimum and maximum"),
         answer("median"),
         answer("maximum"),
         incorrect = "Try changing the intercept to each of these options and see what error you get!",
         allow_retry = TRUE)

Here's the rest of the data, which we're calling the test set. It has r nrow(test_set) observations.

summary(test_set)

plotOutput("plot_intercept_test")
tableOutput("error_intercept_test")

More predictors

Start with "intercept" at the same value as your best value above. Now, adjust both "intercept" and "coef_lot_area" to minimize error on the training set.

sliderInput("intercept2", "Intercept", min = min_intercept, max = max_intercept, value = 125)
sliderInput("coef_lot_area", "Lot_Area", min = -.01, max = .01, value = 0, step = .0001)
plotOutput("plot_lotarea_train")
tableOutput("error_lotarea_train")

Consider these questions:

Did you need to adjust "intercept"? Why or why not?
Were you able to get a better error than when you just had intercept? Why or why not?

Now let's look at the test set:

plotOutput("plot_lotarea_test")
tableOutput("error_lotarea_test")

Evaluate error

Write code here to add a prediction and residual column to test_set. Make sure that you use the correct sign for residual.

intercept = 125
coef_lot_area <- .1

test_set_augmented <- test_set # Your code here

intercept = 125
coef_lot_area <- .1

test_set_augmented <- test_set %>%
  mutate(predicted = intercept + coef_lot_area * Lot_Area,
         residual = Sale_Price - predicted)

Now, write code that uses test_set_augmented to compute the mean error (bias), mean absolute error, and mean squared error. Use summarize.

test_set_augmented %>% 
  summarize(
    mean_error = 0, # your code here
    mean_absolute_error = 0, # etc.
    mean_squared_error = 0
  )

test_set_augmented %>% 
  summarize(
    mean_error = mean(residual),
    mean_absolute_error = mean(abs(residual)),
    mean_squared_error = mean(residual * residual)
  )