## Tutorial setup: load learnr/gradethis, cap each exercise at 5 seconds,
## route exercise checking through gradethis, and hide chunk source by default.
library(learnr)
library(gradethis)
tutorial_options(exercise.timelimit = 5, exercise.checker = gradethis::grade_learnr)
knitr::opts_chunk$set(echo = FALSE)
In this activity, we'll be exploring the F-test and how it applies to linear regression. To begin, run the code below that creates a data-generation function.
## Reproducible setup: fix the seed and simulate a design matrix with
## p + 1 = 4 columns (so there are four candidate predictors).
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n * (p + 1)), nrow = n)

## Generate responses from a Gaussian linear model.
##   X           - design matrix (one column per predictor)
##   sig.epsilon - (optional) sd of the N(0, sig.epsilon^2) noise; default 1
## The true coefficients are ncol(X):2 followed by a final 0, so the LAST
## predictor has no effect; the intercept is 3.
## Returns a data frame whose first column is the response y, followed by
## the columns of X (named X1, X2, ...).
generate.data <- function(X, sig.epsilon = 1) {
  n.obs <- nrow(X)
  ## single random draw, made first so the RNG stream matches the original
  noise <- rnorm(n.obs, sd = sig.epsilon)
  true.beta <- c(ncol(X):2, 0)
  intercept <- 3
  response <- intercept + X %*% true.beta + noise
  data.frame(y = response, X)
}
Now create the data and save it as `dat`. Use the default `sig.epsilon = 1`.
## -- Exercise setup: seed, design matrix, and data-generating function.
## Note X has p + 1 = 4 columns, and the true coefficient on X4 is 0.
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}

## -- Exercise template: the student fills in the blank --
dat = ____________
dat

## -- Grading code: recompute the reference solution, then compare the
## -- student's `.result` against it.
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
grade_result(
  pass_if(~ identical(.result, dat), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)

## -- Solution preview: regenerate the data and show the first rows --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
head(dat)
The first 6 rows of the data are displayed above. Estimate 2 linear models: (1) on all four predictors, (2) on the first 3 predictors.
## -- Setup: data and generating function available to the exercise --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)

## -- Exercise template: fit the full model (all four predictors) --
lm.large = lm(__________,data=dat)
coef(lm.large)

## -- Grading code for the full model: the reference stores the
## -- COEFFICIENTS (the exercise prints coef(...)), not the lm object --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = coef(lm(y~.,data=dat))
grade_result(
  pass_if(~ identical(.result, lm.large), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)

## -- Setup repeated for the reduced-model exercise --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)

## -- Exercise template: fit the reduced model (drop X4) --
lm.small = lm(__________, data=dat)
coef(lm.small)

## -- Grading code for the reduced model; flags the common mistake of
## -- submitting the full model's coefficients instead --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = coef(lm(y~.,data=dat))
lm.small = coef(lm(y~.-X4, data=dat))
grade_result(
  pass_if(~ identical(.result, lm.small), "Correct!"),
  fail_if(~ identical(.result, lm.large), "Did you forget to drop the fourth predictor?"),
  fail_if(~ TRUE, "Incorrect.")
)
For each model, calculate the RSS. Find the difference between RSS(small)-RSS(large).
## -- Setup: data plus both fitted models (lm objects this time) --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)

## -- Exercise template: compute each RSS and their difference.
## -- RSS.diff >= 0 always, since the small model is nested in the large --
RSS.large = _________________________
RSS.small = _________________________
RSS.diff = RSS.small - RSS.large
RSS.diff

## -- Grading code: reference RSS values via sum of squared residuals --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
grade_result(
  pass_if(~ identical(.result, RSS.diff), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)
Find the residual degrees of freedom ($n - p - 1$, i.e. $n$ minus the number of estimated coefficients, intercept included) for both models.
## -- Setup: data plus both fitted models --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)

## -- Exercise template: residual degrees of freedom for each model --
res.df.large = _________________
res.df.small = _________________
c(res.df.large, res.df.small)

## -- Grading code: reference values from the lm objects' df.residual --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
sol <- c(res.df.large, res.df.small)
grade_result(
  pass_if(~ identical(.result, sol), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)
The following objects are available for you to reference:
dat
- the data
lm.large
- the linear regression of the full model
lm.small
- the linear regression of the smaller model (excludes $x_4$)
RSS.large
- the residual sum of squares from the full model
RSS.small
- the residual sum of squares from the smaller model
res.df.large
- the residual degrees of freedom from the large model
res.df.small
- the residual degrees of freedom from the smaller model
Write a function which calculates the F-statistic given two fitted, nested lm
objects.
\[
F = \frac{(RSS_{small}-RSS_{large})/(df_{small}-df_{large})}{RSS_{large}/df_{large}}
\]
Calculate the statistic on your large vs. small model.
## -- Setup: every quantity computed so far (data, models, RSS, df) --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual

## -- Exercise template: fill in the F-statistic's numerator
## -- (RSS drop per df dropped) and denominator (large-model MSE) --
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = _________________________________
  denom = _________________________________
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)
fs

## -- Grading code: reference implementation of the F statistic --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = (sum(residuals(small)^2)-sum(residuals(large)^2))/
    (small$df.residual-large$df.residual)
  denom = sum(residuals(large)^2)/large$df.residual
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)
grade_result(
  pass_if(~ identical(.result, fs), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)
Calculate the p-value using `pf(Fstat, dfnum, dfdenom, lower.tail = FALSE)`.
## -- Setup: everything through the observed F statistic `fs` --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = (sum(residuals(small)^2)-sum(residuals(large)^2))/
    (small$df.residual-large$df.residual)
  denom = sum(residuals(large)^2)/large$df.residual
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)

## -- Exercise template: upper-tail F probability of the observed statistic --
pval = __________________________________
pval

## -- Grading code: df1 = 1 here because the models differ by one
## -- predictor (res.df.small - res.df.large = 1) --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = (sum(residuals(small)^2)-sum(residuals(large)^2))/
    (small$df.residual-large$df.residual)
  denom = sum(residuals(large)^2)/large$df.residual
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)
pval = pf(fs, 1, lm.large$df.residual, lower.tail = FALSE)
grade_result(
  pass_if(~ identical(.result, pval), "Correct!"),
  fail_if(~ TRUE, "Incorrect.")
)
Verify your answer using the built-in `anova` function.
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = (sum(residuals(small)^2)-sum(residuals(large)^2))/
    (small$df.residual-large$df.residual)
  denom = sum(residuals(large)^2)/large$df.residual
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)
pval = pf(fs, 1, lm.large$df.residual, lower.tail = FALSE)
## anova() on the two nested models should reproduce the F statistic and
## p-value computed by hand above.
anova(lm.small,lm.large)
If done correctly, the F-stat and p-value should match the output of the anova
applied to the two models.
Now, follow the slides and use the code below. Simulate the test statistic 1000 times.
## -- Setup: all quantities through the exact p-value --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = (sum(residuals(small)^2)-sum(residuals(large)^2))/
    (small$df.residual-large$df.residual)
  denom = sum(residuals(large)^2)/large$df.residual
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)
pval = pf(fs, 1, lm.large$df.residual, lower.tail = FALSE)

## Draw a new response vector from the fitted model `mdl`: fitted values
## plus N(0, sigma-hat) noise, written back into df's response column.
simulate.from.lm <- function(df, mdl) { # altered to work with any lm output
  yhat <- predict(mdl)
  newy <- yhat + rnorm(nrow(df), sd = summary(mdl)$sigma)
  df[[names(mdl$model)[1]]] <- newy # the names part, gets the response from the df
  return(df)
}
# Simulate from an estimated linear model, and refit both the large and small lm
# Inputs: data frame with covariates (df), fitted small lm (small), fitted large lm (large)
# Output: the Fstat
# Note: simulating from `small` makes this a draw from the NULL distribution.
Fstat.sim <- function (df, small, large) {
  sim.df <- simulate.from.lm(df, small)
  small.sim <- lm(formula(small),data=sim.df) # used formulas instead
  large.sim = lm(formula(large), data=sim.df)
  return(Fstat(small.sim,large.sim))
}
## 1000 Monte Carlo draws of the F statistic under the null (small) model
testDist = replicate(1000, Fstat.sim(dat, lm.small, lm.large))
Use the code below to plot the simulated density, the true F-density, and the observed test statistic. Print both p-values.
## -- Setup: full pipeline including the simulated null distribution,
## -- so the plotting exercise has testDist, fs, pval, and the df's --
set.seed(000012345)
n = 100
p = 3
X = matrix(rnorm(n*(p+1)),nrow=n)
generate.data <- function(X, sig.epsilon=1){
  ## This function takes 2 inputs (1 mandatory, 1 optional)
  ## X - the design matrix
  ## sig.epsilon - (optional) the sd of the normal noise (default=1 if omitted)
  n = nrow(X)
  p = ncol(X)
  epsilon = rnorm(n, sd = sig.epsilon) ## noise ~ N(0, sig.epsilon)
  beta = c(p:2,0) # p beta coefficients
  beta.0 = 3 # an intercept
  y = beta.0 + X %*% beta + epsilon # the linear model
  df = data.frame(y, X) # put it all in a data frame
  return(df) # output
}
dat = generate.data(X)
lm.large = lm(y~.,data=dat)
lm.small = lm(y~.-X4, data=dat)
RSS.large = sum(residuals(lm.large)^2)
RSS.small = sum(residuals(lm.small)^2)
RSS.diff = RSS.small - RSS.large
res.df.large = lm.large$df.residual
res.df.small = lm.small$df.residual
Fstat <- function(small, large){
  ## Takes in 2 fitted, nested lm objects
  num = (sum(residuals(small)^2)-sum(residuals(large)^2))/
    (small$df.residual-large$df.residual)
  denom = sum(residuals(large)^2)/large$df.residual
  return(num/denom)
}
fs = Fstat(lm.small, lm.large)
pval = pf(fs, 1, lm.large$df.residual, lower.tail = FALSE)
simulate.from.lm <- function(df, mdl) { # altered to work with any lm output
  yhat <- predict(mdl)
  newy <- yhat + rnorm(nrow(df), sd = summary(mdl)$sigma)
  df[[names(mdl$model)[1]]] <- newy # the names part, gets the response from the df
  return(df)
}
# Simulate from an estimated linear model, and refit both the large and small lm
# Inputs: data frame with covariates (df), fitted small lm (small), fitted large lm (large)
# Output: the Fstat
Fstat.sim <- function (df, small, large) {
  sim.df <- simulate.from.lm(df, small)
  small.sim <- lm(formula(small),data=sim.df) # used formulas instead
  large.sim = lm(formula(large), data=sim.df)
  return(Fstat(small.sim,large.sim))
}
testDist = replicate(1000, Fstat.sim(dat, lm.small, lm.large))
## Compare the simulated null distribution of the F statistic (orange
## density) with the theoretical F density (green curve) and mark the
## observed statistic (blue vertical line); then print the exact p-value
## next to the Monte Carlo p-value.
library(tidyverse)
library(cowplot)
ggplot(data.frame(x=testDist), aes(x)) +
  geom_density(color="darkorange") +
  ## stats::df is the F density; df1/df2 come from the nested-model dfs
  stat_function(fun = stats::df, color="darkgreen",
                args=list(df1=res.df.small-res.df.large, df2=res.df.large)) +
  ## expansion() replaces the deprecated expand_scale() (ggplot2 >= 3.3);
  ## both return the same zero-expansion spec here
  coord_cartesian(ylim=c(0,.8), expand = expansion()) +
  theme_cowplot() +
  geom_vline(xintercept = fs, color="darkblue")
## exact F-test p-value vs. simulation-based p-value
c(pval,mean(testDist>fs))
## Comprehension check: identify which plotted curve is the true F density.
quiz(
  question("Which line represents the true F density",
    answer("dark green", correct=TRUE, message="The dark orange line is the simulated density and the dark blue line represents the test statistic."),
    answer("dark orange"),
    answer("dark blue"),
    answer("black"),
    allow_retry = TRUE
  )
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.