inst/doc/package.R

## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

options(rlang_backtrace_on_error = "none")

## ----setup--------------------------------------------------------------------
library(hardhat)
library(tibble)
library(modeldata)

data(penguins)
penguins <- na.omit(penguins)

## ----out.width = '100%', echo = FALSE-----------------------------------------
knitr::include_graphics("../man/figures/Fitting.png")

## ----out.width = '100%', echo = FALSE-----------------------------------------
knitr::include_graphics("../man/figures/Prediction.png")

## -----------------------------------------------------------------------------
new_simple_lm <- function(coefs, coef_names, blueprint) {
  
  if (!is.numeric(coefs)) {
    stop("`coefs` should be a numeric vector.", call. = FALSE)
  }
  
  if (!is.character(coef_names)) {
    stop("`coef_names` should be a character vector.", call. = FALSE)
  }
  
  if (length(coefs) != length(coef_names)) {
    stop("`coefs` and `coef_names` must have the same length.")
  }
  
  new_model(
    coefs = coefs, 
    coef_names = coef_names,
    blueprint = blueprint, 
    class = "simple_lm"
  )
}

## -----------------------------------------------------------------------------
manual_model <- new_simple_lm(1, "my_coef", default_xy_blueprint())

manual_model

names(manual_model)

manual_model$blueprint

## -----------------------------------------------------------------------------
simple_lm_impl <- function(predictors, outcomes) {
  lm_fit <- lm.fit(predictors, outcomes)
  
  coefs <- lm_fit$coefficients
  
  coef_names <- names(coefs)
  coefs <- unname(coefs)
  
  list(
    coefs = coefs,
    coef_names = coef_names
  )
}

## -----------------------------------------------------------------------------
predictors <- as.matrix(subset(penguins, select = bill_length_mm))
outcomes <- penguins$body_mass_g

simple_lm_impl(predictors, outcomes)

## -----------------------------------------------------------------------------
simple_lm_bridge <- function(processed) {
  
  validate_outcomes_are_univariate(processed$outcomes)
  
  predictors <- as.matrix(processed$predictors)
  outcomes <- processed$outcomes[[1]]
  
  fit <- simple_lm_impl(predictors, outcomes)
  
  new_simple_lm(
    coefs = fit$coefs,
    coef_names = fit$coef_names,
    blueprint = processed$blueprint
  )
}

## -----------------------------------------------------------------------------
# Simulate formula interface
processed_1 <- mold(bill_length_mm ~ body_mass_g + species, penguins)

# Simulate xy interface
processed_2 <- mold(x = penguins["body_mass_g"], y = penguins$bill_length_mm)

simple_lm_bridge(processed_1)

simple_lm_bridge(processed_2)

## ---- error=TRUE--------------------------------------------------------------
multi_outcome <- mold(bill_length_mm + bill_depth_mm ~ body_mass_g + species, penguins)

simple_lm_bridge(multi_outcome)

## -----------------------------------------------------------------------------
# Generic
simple_lm <- function(x, ...) {
  UseMethod("simple_lm")
}

# Default
simple_lm.default <- function(x, ...) {
  stop(
    "`simple_lm()` is not defined for a '", class(x)[1], "'.", 
    call. = FALSE
  )
}

# XY method - data frame
simple_lm.data.frame <- function(x, y, ...) {
  processed <- mold(x, y)
  simple_lm_bridge(processed)
}

# XY method - matrix
simple_lm.matrix <- function(x, y, ...) {
  processed <- mold(x, y)
  simple_lm_bridge(processed)
}

# Formula method
simple_lm.formula <- function(formula, data, ...) {
  processed <- mold(formula, data)
  simple_lm_bridge(processed)
}

# Recipe method
simple_lm.recipe <- function(x, data, ...) {
  processed <- mold(x, data)
  simple_lm_bridge(processed)
}

## -----------------------------------------------------------------------------
predictors <- penguins[c("bill_length_mm", "bill_depth_mm")]
outcomes_vec <- penguins$body_mass_g
outcomes_df <- penguins["body_mass_g"]

# Vector outcome
simple_lm(predictors, outcomes_vec)

# 1 column data frame outcome
simple_lm(predictors, outcomes_df)

# Formula interface
simple_lm(body_mass_g ~ bill_length_mm + bill_depth_mm, penguins)

## ---- warning=FALSE, message=FALSE--------------------------------------------
library(recipes)

# - Log a predictor
# - Generate dummy variables for factors
simple_lm(body_mass_g ~ log(bill_length_mm) + species, penguins)

# Same, but with a recipe
rec <- recipe(body_mass_g ~ bill_length_mm + species, penguins) %>%
  step_log(bill_length_mm) %>%
  step_dummy(species, one_hot = TRUE)

simple_lm(rec, penguins)

## -----------------------------------------------------------------------------
simple_lm <- function(x, ...) {
  UseMethod("simple_lm")
}

simple_lm.data.frame <- function(x, y, intercept = TRUE, ...) {
  blueprint <- default_xy_blueprint(intercept = intercept)
  processed <- mold(x, y, blueprint = blueprint)
  simple_lm_bridge(processed)
}

simple_lm.matrix <- function(x, y, intercept = TRUE,...) {
  blueprint <- default_xy_blueprint(intercept = intercept)
  processed <- mold(x, y, blueprint = blueprint)
  simple_lm_bridge(processed)
}

simple_lm.formula <- function(formula, data, intercept = TRUE, ...) {
  blueprint <- default_formula_blueprint(intercept = intercept)
  processed <- mold(formula, data, blueprint = blueprint)
  simple_lm_bridge(processed)
}

simple_lm.recipe <- function(x, data, intercept = TRUE, ...) {
  blueprint <- default_recipe_blueprint(intercept = intercept)
  processed <- mold(x, data, blueprint = blueprint)
  simple_lm_bridge(processed)
}

## -----------------------------------------------------------------------------
# By default an intercept is included
simple_lm(predictors, outcomes_df)

# But the user can turn this off
simple_lm(body_mass_g ~ log(bill_length_mm) + species, penguins, intercept = FALSE)

## -----------------------------------------------------------------------------
predict_simple_lm_numeric <- function(object, predictors) {
  
  coefs <- object$coefs
  
  pred <- as.vector(predictors %*% coefs)
  
  out <- spruce_numeric(pred)
  
  out
}

## -----------------------------------------------------------------------------
model <- simple_lm(bill_length_mm ~ body_mass_g + species, penguins)

predictors <- forge(penguins, model$blueprint)$predictors
predictors <- as.matrix(predictors)

predict_simple_lm_numeric(model, predictors)

## -----------------------------------------------------------------------------
predict_simple_lm_bridge <- function(type, object, predictors) {
  
  type <- rlang::arg_match(type, "numeric")
  
  predictors <- as.matrix(predictors)
  
  switch(
    type,
    numeric = predict_simple_lm_numeric(object, predictors)
  )
}

## ---- error=TRUE--------------------------------------------------------------
model <- simple_lm(bill_length_mm ~ body_mass_g + species, penguins)

# Pass in the data frame
predictors <- forge(penguins, model$blueprint)$predictors

predict_simple_lm_bridge("numeric", model, predictors)

# Partial matches are an error
predict_simple_lm_bridge("numer", model, predictors)

## -----------------------------------------------------------------------------
predict.simple_lm <- function(object, new_data, type = "numeric", ...) {
  
  # Enforces column order, type, column names, etc
  processed <- forge(new_data, object$blueprint)
  
  out <- predict_simple_lm_bridge(type, object, processed$predictors)
  
  validate_prediction_size(out, new_data)
  
  out
}

## -----------------------------------------------------------------------------
model <- simple_lm(bill_length_mm ~ log(body_mass_g) + species, penguins)

predict(model, penguins)

## ---- warning=TRUE, error=TRUE------------------------------------------------
# `new_data` isn't a data frame
predict(model, penguins$species)

# Missing a required column
predict(model, subset(penguins, select = -body_mass_g))

# In this case, 'species' is a character, 
# but can be losslessy converted to a factor.
# That happens for you automatically and silently.
penguins_chr_species <- transform(penguins, species = as.character(species))

predict(model, penguins_chr_species)

# Slightly different from above. Here, 'species' is a character, 
# AND has an extra unexpected factor level. It is 
# removed with a warning, but you still get a factor 
# with the correct levels
penguins_chr_bad_species <- penguins_chr_species
penguins_chr_bad_species$species[1] <- "new_level"

predict(model, penguins_chr_bad_species)

# This case throws an error.
# Here, 'species' is a double and
# when it should have been a factor.
# You can't cast a double to a factor!
penguins_dbl_species <- transform(penguins, species = 1)

predict(model, penguins_dbl_species)

Try the hardhat package in your browser

Any scripts or data that you put into this service are public.

hardhat documentation built on March 31, 2023, 10:21 p.m.