## Tutorial setup: load learnr/gradethis, cap each exercise at 5 seconds,
## route exercise checking through gradethis, and hide chunk source by default.
library(learnr)
library(gradethis)
tutorial_options(exercise.timelimit = 5, exercise.checker = gradethis::grade_learnr)
knitr::opts_chunk$set(echo = FALSE)
In this activity, we'll be exploring the F-test and how it applies to linear regression. To begin, run the code below that creates a data-generation function.
## Reproducible setup: fix the seed and simulate a design matrix with
## p + 1 = 4 columns (so there are four candidate predictors).
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n * (p + 1)), nrow = n)

## Generate responses from a Gaussian linear model.
##   X           - design matrix (one column per predictor)
##   sig.epsilon - (optional) sd of the N(0, sig.epsilon^2) noise; default 1
## The true coefficients are ncol(X):2 followed by a final 0, so the LAST
## predictor has no effect; the intercept is 3.
## Returns a data frame whose first column is the response y, followed by
## the columns of X (named X1, X2, ...).
generate.data <- function(X, sig.epsilon = 1) {
  n.obs <- nrow(X)
  ## single random draw, made first so the RNG stream matches the original
  noise <- rnorm(n.obs, sd = sig.epsilon)
  true.beta <- c(ncol(X):2, 0)
  intercept <- 3
  response <- intercept + X %*% true.beta + noise
  data.frame(y = response, X)
}
Now create the data and save it as `dat`. Use the default `sig.epsilon = 1`.
## -- Exercise setup: seed, design matrix, and data-generating function.
## Note X has p + 1 = 4 columns, and the true coefficient on X4 is 0.
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}

## -- Exercise template: the student fills in the blank --
dat = ____________
dat

## -- Grading code: recompute the reference solution, then compare the
## -- student's `.result` against it.
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
grade_result(
  pass_if(~ identical(.result, dat), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)

## -- Solution preview: regenerate the data and show the first rows --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
head(dat)
The first 6 rows of the data are displayed above. Estimate 2 linear models: (1) on all four predictors, (2) on the first 3 predictors.
## -- Setup: data and generating function available to the exercise --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)

## -- Exercise template: fit the full model (all four predictors) --
lm.large = lm(__________,data=dat)
coef(lm.large)

## -- Grading code for the full model: the reference stores the
## -- COEFFICIENTS (the exercise prints coef(...)), not the lm object --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = coef(lm(y~.,data=dat))
grade_result(
  pass_if(~ identical(.result, lm.large), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)

## -- Setup repeated for the reduced-model exercise --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)

## -- Exercise template: fit the reduced model (drop X4) --
lm.small = lm(__________, data=dat)
coef(lm.small)

## -- Grading code for the reduced model; flags the common mistake of
## -- submitting the full model's coefficients instead --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = coef(lm(y~.,data=dat))
lm.small = coef(lm(y~.-X4, data=dat))
grade_result(
  pass_if(~ identical(.result, lm.small), "Correct!"),
  fail_if(~ identical(.result, lm.large), "Did you forget to drop the fourth predictor?"),
  fail_if(~ TRUE, "Incorrect.")
)
For each model, calculate the RSS. Find the difference between RSS(small)-RSS(large).
## -- Setup: data plus both fitted models (lm objects this time) --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)

## -- Exercise template: compute each RSS and their difference.
## -- RSS.diff >= 0 always, since the small model is nested in the large --
RSS.large = _________________________
RSS.small = _________________________
RSS.diff = RSS.small - RSS.large
RSS.diff

## -- Grading code: reference RSS values via sum of squared residuals --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
grade_result(
  pass_if(~ identical(.result, RSS.diff), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)
Find the residual degrees of freedom ($n - p - 1$, i.e. $n$ minus the number of estimated coefficients, intercept included) for both models.
## -- Setup: data plus both fitted models --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)

## -- Exercise template: residual degrees of freedom for each model --
res.df.large = _________________
res.df.small = _________________
c(res.df.large, res.df.small)

## -- Grading code: reference values from the lm objects' df.residual --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
sol <- c(res.df.large, res.df.small)
grade_result(
  pass_if(~ identical(.result, sol), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)
The following objects are available for you to reference:
dat
- the data
lm.large
- the linear regression of the full model
lm.small
- the linear regression of the smaller model (excludes $x_4$)
RSS.large
- the residual sum of squares from the full model
RSS.small
- the residual sum of squares from the smaller model
res.df.large
- the residual degrees of freedom from the large model
res.df.small
- the residual degrees of freedom from the smaller model
Write a function which calculates the F-statistic given two fitted, nested lm
objects.
\[
F = \frac{(RSS_{small}-RSS_{large})/(df_{small}-df_{large})}{RSS_{large}/df_{large}}
\]
Calculate the statistic on your large vs. small model.
## -- Setup: every quantity computed so far (data, models, RSS, df) --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual

## -- Exercise template: fill in the F-statistic's numerator
## -- (RSS drop per df dropped) and denominator (large-model MSE) --
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = _________________________________
  denom = _________________________________
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)
fs

## -- Grading code: reference implementation of the F statistic --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = (sum(residuals(small)^2)-sum(residuals(large)^2))/
    (small$df.residual-large$df.residual)
  denom = sum(residuals(large)^2)/large$df.residual
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)
grade_result(
  pass_if(~ identical(.result, fs), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)
Calculate the p-value using `pf(Fstat, dfnum, dfdenom, lower.tail = FALSE)`.
## -- Setup: everything through the observed F statistic `fs` --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = (sum(residuals(small)^2)-sum(residuals(large)^2))/
    (small$df.residual-large$df.residual)
  denom = sum(residuals(large)^2)/large$df.residual
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)

## -- Exercise template: upper-tail F probability of the observed statistic --
pval = __________________________________
pval

## -- Grading code: df1 = 1 here because the models differ by one
## -- predictor (res.df.small - res.df.large = 1) --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = (sum(residuals(small)^2)-sum(residuals(large)^2))/
    (small$df.residual-large$df.residual)
  denom = sum(residuals(large)^2)/large$df.residual
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)
pval = pf(fs, 1, lm.large$df.residual, lower.tail = FALSE)
grade_result(
  pass_if(~ identical(.result, pval), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)
Verify your answer using the built-in `anova` function.
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = (sum(residuals(small)^2)-sum(residuals(large)^2))/
    (small$df.residual-large$df.residual)
  denom = sum(residuals(large)^2)/large$df.residual
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)
pval = pf(fs, 1, lm.large$df.residual, lower.tail = FALSE)
## anova() on the two nested models should reproduce the F statistic and
## p-value computed by hand above.
anova(lm.small,lm.large)
If done correctly, the F-stat and p-value should match the output of the anova
applied to the two models.
Now, follow the slides and use the code below. Simulate the test statistic 1000 times.
## -- Setup: all quantities through the exact p-value --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = (sum(residuals(small)^2)-sum(residuals(large)^2))/
    (small$df.residual-large$df.residual)
  denom = sum(residuals(large)^2)/large$df.residual
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)
pval = pf(fs, 1, lm.large$df.residual, lower.tail = FALSE)

## Draw a new response vector from the fitted model `mdl`: fitted values
## plus N(0, sigma-hat) noise, written back into df's response column.
simulate.from.lm <- function(df, mdl) { # altered to work with any lm output
  yhat <- predict(mdl)
  newy <- yhat + rnorm(nrow(df), sd = summary(mdl)$sigma)
  df[[names(mdl$model)[1]]] <- newy # the names part, gets the response from the df
  return(df)
}
# Simulate from an estimated linear model, and refit both the large and small lm
# Inputs: data frame with covariates (df), fitted small lm (small), fitted large lm (large)
# Output: the Fstat
# Note: simulating from `small` makes this a draw from the NULL distribution.
Fstat.sim <- function (df, small, large) {
  sim.df <- simulate.from.lm(df, small)
  small.sim <- lm(formula(small),data=sim.df) # used formulas instead
  large.sim = lm(formula(large), data=sim.df)
  return(Fstat(small.sim,large.sim))
}
## 1000 Monte Carlo draws of the F statistic under the null (small) model
testDist = replicate(1000, Fstat.sim(dat, lm.small, lm.large))
Use the code below to plot the simulated density, the true F-density, and the observed test statistic. Print both p-values.
## -- Setup: full pipeline including the simulated null distribution,
## -- so the plotting exercise has testDist, fs, pval, and the df's --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = (sum(residuals(small)^2)-sum(residuals(large)^2))/
    (small$df.residual-large$df.residual)
  denom = sum(residuals(large)^2)/large$df.residual
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)
pval = pf(fs, 1, lm.large$df.residual, lower.tail = FALSE)
simulate.from.lm <- function(df, mdl) { # altered to work with any lm output
  yhat <- predict(mdl)
  newy <- yhat + rnorm(nrow(df), sd = summary(mdl)$sigma)
  df[[names(mdl$model)[1]]] <- newy # the names part, gets the response from the df
  return(df)
}
# Simulate from an estimated linear model, and refit both the large and small lm
# Inputs: data frame with covariates (df), fitted small lm (small), fitted large lm (large)
# Output: the Fstat
Fstat.sim <- function (df, small, large) {
  sim.df <- simulate.from.lm(df, small)
  small.sim <- lm(formula(small),data=sim.df) # used formulas instead
  large.sim = lm(formula(large), data=sim.df)
  return(Fstat(small.sim,large.sim))
}
testDist = replicate(1000, Fstat.sim(dat, lm.small, lm.large))
## Compare the simulated null distribution of the F statistic (orange
## density) with the theoretical F density (green curve) and mark the
## observed statistic (blue vertical line); then print the exact p-value
## next to the Monte Carlo p-value.
library(tidyverse)
library(cowplot)
ggplot(data.frame(x=testDist), aes(x)) +
  geom_density(color="darkorange") +
  ## stats::df is the F density; df1/df2 come from the nested-model dfs
  stat_function(fun = stats::df, color="darkgreen",
                args=list(df1=res.df.small-res.df.large, df2=res.df.large)) +
  ## expansion() replaces the deprecated expand_scale() (ggplot2 >= 3.3);
  ## both return the same zero-expansion spec here
  coord_cartesian(ylim=c(0,.8), expand = expansion()) +
  theme_cowplot() +
  geom_vline(xintercept = fs, color="darkblue")
## exact F-test p-value vs. simulation-based p-value
c(pval,mean(testDist>fs))
## Comprehension check: identify which plotted curve is the true F density.
quiz(
  question("Which line represents the true F density",
    answer("dark green", correct=TRUE, message="The dark orange line is the simulated density and the dark blue line represents the test statistic."),
    answer("dark orange"),
    answer("dark blue"),
    answer("black"),
    allow_retry = TRUE
  )
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.