# this is taken from https://cs.calvin.edu/courses/data/202/21fa/ex/ex07/ex07-modeling-inst.html library(learnr) # remotes::install_github("rstudio/learnr", build_vignettes = TRUE) knitr::opts_chunk$set(echo = TRUE, comment = "") tutorial_options(exercise.reveal_solution = FALSE) # https://bookdown.org/yihui/rmarkdown-cookbook/reuse-chunks.html#embed-chunk <<user-facing-setup>>
library(tidyverse) library(tidymodels) library(modelr) theme_set(theme_bw()) options(scipen = 5) # encourage metrics to print in fixed-point notation options(dplyr.summarise.inform = FALSE) # silence a warning message <<load-and-subset-data>>
Two goals here:
set.seed(1234) data(ames, package = "modeldata") ames <- ames %>% filter(Gr_Liv_Area < 4000, Sale_Condition == "Normal") %>% mutate(Sale_Price = Sale_Price / 1000) # Split our data randomly set.seed(364) ames_split <- initial_split(ames, prop = 0.8) ames_train <- training(ames_split) ames_test <- testing(ames_split) library(NHANES) people <- NHANES %>% select(Age, BMI, Diabetes) %>% drop_na() people_split <- initial_split(people, prop = 0.8) people_train <- training(people_split) people_test <- testing(people_split)
spec <- linear_reg() #spec <- decision_tree(mode = "regression") #spec <- rand_forest(mode = "regression") #spec <- boost_tree(mode = "regression") model_fit <- fit( spec, Sale_Price ~ Gr_Liv_Area + Bldg_Type, data = ames_train) # Plot how the model's predictions change as living area changes. # Overlay the test set data. augment( model_fit, data_grid(ames, Gr_Liv_Area = seq_range(Gr_Liv_Area, 500), Bldg_Type = levels(ames$Bldg_Type))) %>% ggplot(aes(x = Gr_Liv_Area, y = .pred, color = Bldg_Type)) + geom_line() + geom_point(aes(y = Sale_Price), data = ames_test, alpha = 0.5) # Measure test-set error. augment(model_fit, ames_test) %>% metrics(truth = Sale_Price, estimate = .pred)
Exercises:
linear_reg
by decision_tree
by commenting and uncommenting the spec
assignments accordingly. Repeat the first two questions. What do you notice about the differences between the models? Think about how those differences relate to how each model makes a prediction.decision_tree
by rand_forest
; repeat the previous question.boost_tree
.In this example, you might notice that the linear regression actually gets least error. After reflecting on the last question you can probably see why. You might try adding some other features to the formula; does linear regression still win? (You'll need to comment out the plotting code if you want to try that, though.)
Now we'll switch to an example from the book, which uses data about diabetes; read the explanation there.
spec <- logistic_reg() #spec <- decision_tree(mode = "classification", cost_complexity = .005) #spec <- rand_forest(mode = "classification") #spec <- boost_tree(mode = "classification") model_fit <- fit( spec, Diabetes ~ BMI + Age, data = people_train) # Show the model in "data space". augment( model_fit, data_grid( people, Age = seq_range(Age, 100), BMI = seq_range(BMI, 100) ) ) %>% ggplot(aes(x = Age, y = BMI)) + geom_tile(aes(fill = .pred_Yes), color = NA) + geom_count(aes(color = Diabetes), data = people, alpha = 0.3) + scale_fill_gradient("Prob of\nDiabetes", low = "white", high = "red") + scale_color_manual(values = c("gold", "black")) # Measure test-set error. confusion <- augment(model_fit, people_test) %>% conf_mat(truth = Diabetes, estimate = .pred_class) summary(confusion)
As before, comment and uncomment the spec
s to compare the models and think about how the shape of the predictions relates to the structure of the model.
#spec <- logistic_reg() #spec <- decision_tree(mode = "classification", cost_complexity = .005) spec <- rand_forest(mode = "classification") #spec <- boost_tree(mode = "classification") model_fit <- fit( spec, Diabetes ~ BMI + AgeDecade, data = people) # Plot how the model's predictions change as living area changes. # Overlay the test set data. sweep_model(model_fit, BMI, range(people$BMI), AgeDecade = levels(people$AgeDecade)) %>% ggplot(aes(x = BMI, y = .pred_Yes, color = AgeDecade)) + geom_line() #geom_point(aes(y = frac), data = people %>% group_by() %>% summarize(frac = mean(Diabetes == "Yes")), alpha = 0.5) # Measure test-set error. augment(model_fit, people) %>% conf_mat(truth = Diabetes, estimate = .pred_class)
To run any code chunk from this tutorial in your own environment, use:
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.