## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
## -----------------------------------------------------------------------------
library(regressinator)
linear_pop <- population(
  x1 = predictor(rnorm, mean = 4, sd = 10),
  x2 = predictor(runif, min = 0, max = 10),
  y = response(
    0.7 + 2.2 * x1 - 0.2 * x2, # relationship between X and Y
    family = gaussian(),       # link function and response distribution
    error_scale = 1.5          # errors are scaled by this amount
  )
)
## -----------------------------------------------------------------------------
logistic_pop <- population(
  x1 = predictor(rnorm, mean = 0, sd = 10),
  x2 = predictor(runif, min = 0, max = 10),
  y = response(0.7 + 2.2 * x1 - 0.2 * x2,
               family = binomial(link = "logit"))
)
## -----------------------------------------------------------------------------
heteroskedastic_pop <- population(
  x1 = predictor(rnorm, mean = 5, sd = 4),
  x2 = predictor(runif, min = 0, max = 10),
  y = response(
    4 + 2 * x1 - 3 * x2,            # relationship between X and Y
    family = ols_with_error(rnorm), # distribution of the errors
    error_scale = 0.5 + x2 / 10     # errors depend on x2
  )
)
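## -----------------------------------------------------------------------------
# A minimal sketch, not in the original script: sample from the
# heteroskedastic population and plot residuals against x2, the predictor the
# error scale depends on; uses only base R
het_data <- heteroskedastic_pop |>
  sample_x(n = 200) |>
  sample_y()
het_fit <- lm(y ~ x1 + x2, data = het_data)
plot(het_data$x2, residuals(het_fit), xlab = "x2", ylab = "Residual")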
## -----------------------------------------------------------------------------
heavy_tail_pop <- population(
  x1 = predictor(rnorm, mean = 5, sd = 4),
  x2 = predictor(runif, min = 0, max = 10),
  y = response(
    4 + 2 * x1 - 3 * x2,                 # relationship between X and Y
    family = ols_with_error(rt, df = 3), # distribution of the errors
    error_scale = 1.0                    # errors are multiplied by this scale factor
  )
)
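## -----------------------------------------------------------------------------
# A minimal sketch, not in the original script: ols_with_error() draws errors
# from the supplied RNG, so the heavy-tailed error distribution above can be
# previewed directly with rt()
errs <- rt(1000, df = 3)
qqnorm(errs)
qqline(errs)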
## -----------------------------------------------------------------------------
# 60% of draws have lambda = 0 (the rbinom() factor is 0); the rest have
# lambda given by the inverse link applied to the linear predictor
zeroinfpois <- function(ys) {
  n <- length(ys)
  rpois(n, lambda = ys * rbinom(n, 1, prob = 0.4))
}

pop <- population(
  x1 = predictor(rnorm, mean = 2, sd = 2),
  y = response(
    0.7 + 0.8 * x1,
    family = custom_family(zeroinfpois, exp)
  )
)
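## -----------------------------------------------------------------------------
# A minimal sketch, not in the original script: check the zero inflation
# empirically via the fraction of zero responses in a large sample
zip_data <- pop |>
  sample_x(n = 1000) |>
  sample_y()
mean(zip_data$y == 0)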
## -----------------------------------------------------------------------------
# default is equally likely levels
rfactor(5, c("foo", "bar", "baz"))
# but probabilities per level can be specified
rfactor(5, c("foo", "bar", "baz"), c(0.4, 0.3, 0.3))
## -----------------------------------------------------------------------------
intercepts <- c("foo" = 2, "bar" = 30, "baz" = 7)
factor_intercept_pop <- population(
  group = predictor(rfactor,
                    levels = c("foo", "bar", "baz"),
                    prob = c(0.1, 0.6, 0.3)),
  x = predictor(runif, min = 0, max = 10),
  y = response(by_level(group, intercepts) + 0.3 * x,
               error_scale = 1.5)
)
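## -----------------------------------------------------------------------------
# A minimal sketch, not in the original script: lm() dummy-codes the factor,
# so the fitted group coefficients should roughly recover the differences
# between the per-level intercepts defined above
intercept_data <- factor_intercept_pop |>
  sample_x(n = 200) |>
  sample_y()
lm(y ~ group + x, data = intercept_data)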
## -----------------------------------------------------------------------------
slopes <- c("foo" = 2, "bar" = 30, "baz" = 7)
factor_slope_pop <- population(
  group = predictor(rfactor,
                    levels = c("foo", "bar", "baz"),
                    prob = c(0.1, 0.6, 0.3)),
  x = predictor(runif, min = 0, max = 10),
  y = response(7 + by_level(group, slopes) * x,
               error_scale = 1.5)
)
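## -----------------------------------------------------------------------------
# A minimal sketch, not in the original script: the per-level slopes can be
# recovered by fitting an interaction between group and x
slope_data <- factor_slope_pop |>
  sample_x(n = 200) |>
  sample_y()
lm(y ~ group * x, data = slope_data)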
## -----------------------------------------------------------------------------
linear_samp <- sample_y(sample_x(linear_pop, n = 10))
## -----------------------------------------------------------------------------
logistic_pop |>
  sample_x(n = 10) |>
  sample_y()
## -----------------------------------------------------------------------------
fit <- lm(y ~ x1 + x2, data = linear_samp)
## -----------------------------------------------------------------------------
# The same model fit without the `data` argument; this works for lm() itself,
# but the form above, with a data frame passed to `data`, is preferable so
# that diagnostic and simulation tools can locate the variables
fit <- lm(linear_samp$y ~ linear_samp$x1 + linear_samp$x2)
## ----fig.width=4, fig.height=4------------------------------------------------
library(broom)
nonlinear_pop <- population(
  x1 = predictor(runif, min = 1, max = 8),
  x2 = predictor(runif, min = 0, max = 10),
  y = response(0.7 + x1^2 - x2, family = gaussian(),
               error_scale = 4.0)
)

nonlinear_data <- nonlinear_pop |>
  sample_x(n = 100) |>
  sample_y()

nonlinear_fit <- lm(y ~ x1 + x2, data = nonlinear_data)

# to see the kind of information we get from an augmented fit
augment(nonlinear_fit)

library(ggplot2)

ggplot(augment(nonlinear_fit),
       aes(x = .fitted, y = .resid)) +
  geom_point() +
  labs(x = "Fitted value", y = "Residual")
## ----fig.height=4-------------------------------------------------------------
ggplot(augment_longer(nonlinear_fit),
       aes(x = .predictor_value, y = .resid)) +
  geom_point() +
  facet_wrap(vars(.predictor_name), scales = "free") +
  labs(x = "Predictor value", y = "Residual")
## ----fig.height=4-------------------------------------------------------------
ggplot(partial_residuals(nonlinear_fit),
       aes(x = .predictor_value, y = .partial_resid)) +
  geom_point() +                                                # partial residuals
  geom_smooth(se = FALSE) +                                     # smoothed residuals
  geom_line(aes(x = .predictor_value, y = .predictor_effect)) + # effects
  facet_wrap(vars(.predictor_name), scales = "free") +
  labs(x = "Predictor value", y = "Partial residual")
## ----fig.height=4-------------------------------------------------------------
quadratic_fit <- lm(y ~ poly(x1, 2) + x2, data = nonlinear_data)

ggplot(partial_residuals(quadratic_fit),
       aes(x = .predictor_value, y = .partial_resid)) +
  geom_point() +                                                # partial residuals
  geom_smooth(se = FALSE) +                                     # smoothed residuals
  geom_line(aes(x = .predictor_value, y = .predictor_effect)) + # effects
  facet_wrap(vars(.predictor_name), scales = "free") +
  labs(x = "Predictor value", y = "Partial residual")
## ----fig.width=6, fig.height=6------------------------------------------------
model_lineup(nonlinear_fit) |>
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point() +
  facet_wrap(vars(.sample)) +
  labs(x = "Fitted value", y = "Residual")
## ----fig.width=6, fig.height=6------------------------------------------------
heavy_tail_pop <- population(
  x1 = predictor(rnorm, mean = 5, sd = 4),
  x2 = predictor(runif, min = 0, max = 10),
  y = response(
    4 + 2 * x1 - 3 * x2,
    family = ols_with_error(rt, df = 3),
    error_scale = 1.0
  )
)

heavy_tail_sample <- heavy_tail_pop |>
  sample_x(n = 100) |>
  sample_y()

fit <- lm(y ~ x1 + x2, data = heavy_tail_sample)

model_lineup(fit) |>
  ggplot(aes(sample = .resid)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(vars(.sample)) +
  labs(x = "Theoretical quantiles", y = "Observed quantiles")
## -----------------------------------------------------------------------------
d <- linear_pop |>
  sample_x(n = 30) |>
  sample_y()

fit <- lm(y ~ x1 + x2, data = d)

tidy(fit)

sampling_distribution(fit, d, nsim = 4)
## ----fig.height=4-------------------------------------------------------------
samples <- sampling_distribution(fit, d, nsim = 1000)
samples |>
  ggplot(aes(x = estimate)) +
  geom_histogram() +
  facet_wrap(vars(term), scales = "free") +
  labs(x = "Estimate", y = "Frequency")
## ----message=FALSE------------------------------------------------------------
library(dplyr)
samples |>
  group_by(term) |>
  summarize(mean = mean(estimate),
            sd = sd(estimate))
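## -----------------------------------------------------------------------------
# A minimal sketch, not in the original script: quantile-based intervals from
# the simulated sampling distribution, for comparison with the true linear_pop
# coefficients (intercept 0.7, x1 coefficient 2.2, x2 coefficient -0.2)
samples |>
  group_by(term) |>
  summarize(lower = quantile(estimate, 0.025),
            upper = quantile(estimate, 0.975))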
## -----------------------------------------------------------------------------
quadratic_fit <- lm(y ~ x1 + I(x1^2) + x2, data = nonlinear_data)
anova(nonlinear_fit, quadratic_fit)
## ----fig.height=4-------------------------------------------------------------
quadratic_coefs <- parametric_boot_distribution(nonlinear_fit, quadratic_fit,
                                                data = nonlinear_data) |>
  filter(term == "I(x1^2)")

ggplot(quadratic_coefs, aes(x = estimate)) +
  geom_histogram() +
  geom_vline(xintercept = coef(quadratic_fit)["I(x1^2)"],
             color = "red") +
  labs(x = "Quadratic term coefficient",
       y = "Count",
       title = "Null distribution of quadratic term")