inst/doc/regressinator.R

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## -----------------------------------------------------------------------------
library(regressinator)

linear_pop <- population(
  x1 = predictor(rnorm, mean = 4, sd = 10),
  x2 = predictor(runif, min = 0, max = 10),
  y = response(
    0.7 + 2.2 * x1 - 0.2 * x2, # relationship between X and Y
    family = gaussian(),       # link function and response distribution
    error_scale = 1.5          # errors are scaled by this amount
  )
)

## -----------------------------------------------------------------------------
logistic_pop <- population(
  x1 = predictor(rnorm, mean = 0, sd = 10),
  x2 = predictor(runif, min = 0, max = 10),
  y = response(0.7 + 2.2 * x1 - 0.2 * x2,
               family = binomial(link = "logit"))
)

## -----------------------------------------------------------------------------
heteroskedastic_pop <- population(
  x1 = predictor(rnorm, mean = 5, sd = 4),
  x2 = predictor(runif, min = 0, max = 10),
  y = response(
    4 + 2 * x1 - 3 * x2, # relationship between X and Y
    family = ols_with_error(rnorm), # distribution of the errors
    error_scale = 0.5 + x2 / 10 # errors depend on x2
  )
)

## -----------------------------------------------------------------------------
heavy_tail_pop <- population(
  x1 = predictor(rnorm, mean = 5, sd = 4),
  x2 = predictor(runif, min = 0, max = 10),
  y = response(
    4 + 2 * x1 - 3 * x2, # relationship between X and Y
    family = ols_with_error(rt, df = 3), # distribution of the errors
    error_scale = 1.0 # errors are multiplied by this scale factor
  )
)

## -----------------------------------------------------------------------------
# 40% of draws have lambda = 0, the rest have lambda given by the inverse link
zeroinfpois <- function(ys) {
  n <- length(ys)
  rpois(n, lambda = ys * rbinom(n, 1, prob = 0.4))
}

pop <- population(
  x1 = predictor(rnorm, mean = 2, sd = 2),
  y = response(
    0.7 + 0.8 * x1,
    family = custom_family(zeroinfpois, exp)
  )
)

## -----------------------------------------------------------------------------
# default is equally likely levels
rfactor(5, c("foo", "bar", "baz"))

# but probabilities per level can be specified
rfactor(5, c("foo", "bar", "baz"), c(0.4, 0.3, 0.3))

## -----------------------------------------------------------------------------
intercepts <- c("foo" = 2, "bar" = 30, "baz" = 7)

factor_intercept_pop <- population(
  group = predictor(rfactor,
                    levels = c("foo", "bar", "baz"),
                    prob = c(0.1, 0.6, 0.3)),
  x = predictor(runif, min = 0, max = 10),
  y = response(by_level(group, intercepts) + 0.3 * x,
               error_scale = 1.5)
)

## -----------------------------------------------------------------------------
slopes <- c("foo" = 2, "bar" = 30, "baz" = 7)

factor_slope_pop <- population(
  group = predictor(rfactor,
                    levels = c("foo", "bar", "baz"),
                    prob = c(0.1, 0.6, 0.3)),
  x = predictor(runif, min = 0, max = 10),
  y = response(7 + by_level(group, slopes) * x,
               error_scale = 1.5)
)

## -----------------------------------------------------------------------------
linear_samp <- sample_y(sample_x(linear_pop, n = 10))

## -----------------------------------------------------------------------------
logistic_pop |>
  sample_x(n = 10) |>
  sample_y()

## -----------------------------------------------------------------------------
fit <- lm(y ~ x1 + x2, data = linear_samp)

## -----------------------------------------------------------------------------
fit <- lm(linear_samp$y ~ linear_samp$x1 + linear_samp$x2)

## ----fig.width=4, fig.height=4------------------------------------------------
library(broom)

nonlinear_pop <- population(
  x1 = predictor(runif, min = 1, max = 8),
  x2 = predictor(runif, min = 0, max = 10),
  y = response(0.7 + x1**2 - x2, family = gaussian(),
               error_scale = 4.0)
)

nonlinear_data <- nonlinear_pop |>
  sample_x(n = 100) |>
  sample_y()

nonlinear_fit <- lm(y ~ x1 + x2, data = nonlinear_data)

# to see the kind of information we get from an augmented fit
augment(nonlinear_fit)

library(ggplot2)

ggplot(augment(nonlinear_fit),
       aes(x = .fitted, y = .resid)) +
  geom_point() +
  labs(x = "Fitted value", y = "Residual")

## ----fig.height=4-------------------------------------------------------------
ggplot(augment_longer(nonlinear_fit),
       aes(x = .predictor_value, y = .resid)) +
  geom_point() +
  facet_wrap(vars(.predictor_name), scales = "free") +
  labs(x = "Predictor value", y = "Residual")

## ----fig.height=4-------------------------------------------------------------
ggplot(partial_residuals(nonlinear_fit),
       aes(x = .predictor_value, y = .partial_resid)) +
  geom_point() + # partial residuals
  geom_smooth(se = FALSE) + # smoothed residuals
  geom_line(aes(x = .predictor_value, y = .predictor_effect)) + # effects
  facet_wrap(vars(.predictor_name), scales = "free") +
  labs(x = "Predictor value", y = "Partial residual")

## ----fig.height=4-------------------------------------------------------------
quadratic_fit <- lm(y ~ poly(x1, 2) + x2, data = nonlinear_data)

ggplot(partial_residuals(quadratic_fit),
       aes(x = .predictor_value, y = .partial_resid)) +
  geom_point() + # partial residuals
  geom_smooth(se = FALSE) + # smoothed residuals
  geom_line(aes(x = .predictor_value, y = .predictor_effect)) + # effects
  facet_wrap(vars(.predictor_name), scales = "free") +
  labs(x = "Predictor value", y = "Partial residual")

## ----fig.width=6, fig.height=6------------------------------------------------
model_lineup(nonlinear_fit) |>
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point() +
  facet_wrap(vars(.sample)) +
  labs(x = "Fitted value", y = "Residual")

## ----fig.width=6, fig.height=6------------------------------------------------
heavy_tail_pop <- population(
  x1 = predictor(rnorm, mean = 5, sd = 4),
  x2 = predictor(runif, min = 0, max = 10),
  y = response(
    4 + 2 * x1 - 3 * x2,
    family = ols_with_error(rt, df = 3),
    error_scale = 1.0
  )
)

heavy_tail_sample <- heavy_tail_pop |>
  sample_x(n = 100) |>
  sample_y()

fit <- lm(y ~ x1 + x2, data = heavy_tail_sample)

model_lineup(fit) |>
  ggplot(aes(sample = .resid)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(vars(.sample)) +
  labs(x = "Theoretical quantiles", y = "Observed quantiles")

## -----------------------------------------------------------------------------
d <- linear_pop |>
  sample_x(n = 30) |>
  sample_y()

fit <- lm(y ~ x1 + x2, data = d)
tidy(fit)

sampling_distribution(fit, d, nsim = 4)

## ----fig.height=4-------------------------------------------------------------
samples <- sampling_distribution(fit, d, nsim = 1000)

samples |>
  ggplot(aes(x = estimate)) +
  geom_histogram() +
  facet_wrap(vars(term), scales = "free") +
  labs(x = "Estimate", y = "Frequency")

## ----message=FALSE------------------------------------------------------------
library(dplyr)

samples |>
  group_by(term) |>
  summarize(mean = mean(estimate),
            sd = sd(estimate))

## -----------------------------------------------------------------------------
quadratic_fit <- lm(y ~ x1 + I(x1^2) + x2, data = nonlinear_data)

anova(nonlinear_fit, quadratic_fit)

## ----fig.height=4-------------------------------------------------------------
quadratic_coefs <- parametric_boot_distribution(nonlinear_fit, quadratic_fit,
                                                data = nonlinear_data) |>
  filter(term == "I(x1^2)")

ggplot(quadratic_coefs, aes(x = estimate)) +
  geom_histogram() +
  geom_vline(xintercept = coef(quadratic_fit)["I(x1^2)"],
             color = "red") +
  labs(x = "Quadratic term coefficient",
       y = "Count",
       title = "Null distribution of quadratic term")

Try the regressinator package in your browser

Any scripts or data that you put into this service are public.

regressinator documentation built on Sept. 11, 2024, 6:50 p.m.