library(learnr)
library(shiny)
options(dplyr.summarise.inform = FALSE)
library(tidyverse)
library(tidymodels)
data("ames", package = "modeldata")
knitr::opts_chunk$set(echo = FALSE, eval=TRUE)
set.seed(1)
ames <- ames %>%
  mutate(Sale_Price = Sale_Price / 1000) %>% 
  select(Lot_Area, Bsmt_Unf_SF, Sale_Price) %>%
  mutate(across(everything(), as.numeric))
ames_split <- rsample::initial_split(ames, prop = 1/10)
training_set <- rsample::training(ames_split)
test_set <- rsample::testing(ames_split)
min_intercept <- 0
max_intercept <- 250

Objectives

Simplest model

We'll start by looking at the simplest possible model, one that predicts a constant sale price for every home. This is analogous to saying something like "Homes around here generally go for around $150k".

We'll start by using a small subset of the home sale dataset.

training_set
summary(training_set)

Intercept only

Adjust the value of "intercept" to make your line give the best prediction.

sliderInput("intercept", "Intercept", min = min_intercept, max = max_intercept, value = 125)
plotOutput("plot_intercept_train")
tableOutput("error_intercept_train")
plotModel <- function(dataset, intercept, coef_lot_area) {
  ggplot(dataset, aes(x = Lot_Area, y = Sale_Price)) +
      geom_point() +
      geom_abline(intercept = intercept, slope = coef_lot_area)
}

errorTable <- function(dataset, intercept, coef_lot_area) {
  dataset %>% 
    mutate(prediction = intercept + Lot_Area * coef_lot_area, residual = Sale_Price - prediction) %>% 
    summarize(
      mean_error = mean(residual),
      mean_absolute_error = mean(abs(residual)),
      max_abs_error = max(abs(residual)),
      mean_squared_error = mean(residual * residual),
      rmse = sqrt(mean_squared_error)
    )
}
output$plot_intercept_train <- renderPlot({ plotModel(training_set, input$intercept, 0) })
output$plot_intercept_test <- renderPlot({  plotModel(test_set,     input$intercept, 0) })
output$plot_lotarea_train <- renderPlot({   plotModel(training_set, input$intercept2, input$coef_lot_area)})
output$plot_lotarea_test <- renderPlot({    plotModel(test_set,     input$intercept2, input$coef_lot_area) })

output$error_intercept_train <- renderTable({ errorTable(training_set, input$intercept, 0) })
output$error_intercept_test  <- renderTable({ errorTable(test_set, input$intercept, 0) })
output$error_lotarea_train   <- renderTable({ errorTable(training_set, input$intercept2, input$coef_lot_area) })
output$error_lotarea_test    <- renderTable({ errorTable(test_set, input$intercept2, input$coef_lot_area) })
question("We get the lowest Mean Absolute Error when we set the intercept to the ___ of the Sale_Price column (see summary table above)",
         answer("minimum"),
         answer("mean"),
         answer("exactly halfway between minimum and maximum"),
         answer("median", correct = TRUE),
         answer("maximum"),
         incorrect = "Try changing the intercept to each of these options and see what error you get!",
         allow_retry = TRUE)
question("We get the lowest Mean Squared Error when we set the intercept to the ___ of the Sale_Price column (see summary table above)",
         answer("minimum"),
         answer("mean", correct = TRUE),
         answer("exactly halfway between minimum and maximum"),
         answer("median"),
         answer("maximum"),
         incorrect = "Try changing the intercept to each of these options and see what error you get!",
         allow_retry = TRUE)

Here's the rest of the data, which we're calling the test set. It has r nrow(test_set) observations.

summary(test_set)
plotOutput("plot_intercept_test")
tableOutput("error_intercept_test")

More predictors

Start with "intercept" at the same value as your best value above. Now, adjust both "intercept" and "coef_lot_area" to minimize error on the training set.

sliderInput("intercept2", "Intercept", min = min_intercept, max = max_intercept, value = 125)
sliderInput("coef_lot_area", "Lot_Area", min = -.01, max = .01, value = 0, step = .0001)
plotOutput("plot_lotarea_train")
tableOutput("error_lotarea_train")

Consider these questions:

Now let's look at the test set:

plotOutput("plot_lotarea_test")
tableOutput("error_lotarea_test")

Evaluate error

Write code here to add a prediction and residual column to test_set. Make sure that you use the correct sign for residual.

intercept = 125
coef_lot_area <- .1

test_set_augmented <- test_set # Your code here
intercept = 125
coef_lot_area <- .1

test_set_augmented <- test_set %>%
  mutate(predicted = intercept + coef_lot_area * Lot_Area,
         residual = Sale_Price - predicted)

Now, write code that uses test_set_augmented to compute the mean error (bias), mean absolute error, and mean squared error. Use summarize.

test_set_augmented %>% 
  summarize(
    mean_error = 0, # your code here
    mean_absolute_error = 0, # etc.
    mean_squared_error = 0
  )
test_set_augmented %>% 
  summarize(
    mean_error = mean(residual),
    mean_absolute_error = mean(abs(residual)),
    mean_squared_error = mean(residual * residual)
  )


CalvinData/ds202calvin documentation built on Nov. 24, 2022, 7:11 p.m.