library(learnr) library(gradethis) tutorial_options(exercise.timelimit = 5, exercise.checker = gradethis::grade_learnr) knitr::opts_chunk$set(echo = FALSE)
## Note: this is the same data as in Lecture 8. trueFunction <- function(x) sin(x) + 1/sqrt(x) + 3 set.seed(1234) n = 100 x = 1:n/n*2*pi wiggly = data.frame(x = x, y = trueFunction(x) + rnorm(n, 0, .75)) library(ggplot2) ggplot(wiggly, aes(x, y)) + geom_point() + xlim(0,2*pi) + ylim(0,max(wiggly$y)) + stat_function(fun=trueFunction, color='red')
Write a function that estimates the linear model of $y$ on $x$ for any data set. Call your function wiggly.estimator.lm
. It should return the slope and intercept.
## Note: this is the same data as in Lecture 8. trueFunction <- function(x) sin(x) + 1/sqrt(x) + 3 set.seed(1234) n = 100 x = 1:n/n*2*pi wiggly = data.frame(x = x, y = trueFunction(x) + rnorm(n, 0, .75))
wiggly.estimator.lm <- function(newdata=wiggly){ lm.coef = _________________________ lm.coef } wiggly.estimator.lm()
trueFunction <- function(x) sin(x) + 1/sqrt(x) + 3 set.seed(1234) n = 100 x = 1:n/n*2*pi wiggly = data.frame(x = x, y = trueFunction(x) + rnorm(n, 0, .75)) wiggly.estimator.lm <- function(newdata){ lm.coef = coef(lm(y~x, data=newdata)) lm.coef } sol <- wiggly.estimator.lm(wiggly) grade_result( pass_if(~ identical(.result, sol), "Correct!"), fail_if(~ TRUE, "Incorrect.") )
Write a function that computes the residuals, resamples from those residuals, and then outputs a data.frame with the original $x$'s but new $y$'s. Call it wiggly.resids.resamp
.
trueFunction <- function(x) sin(x) + 1/sqrt(x) + 3 set.seed(1234) n = 100 x = 1:n/n*2*pi wiggly = data.frame(x = x, y = trueFunction(x) + rnorm(n, 0, .75))
wiggly.resids.resamp <- function(newdata=wiggly){ newdata.ml = lm(y~x, data=newdata) resids = __________________ # compute the residuals from the linear model newResids = __________________ # resample the residuals from the original model new.wiggles = data.frame( # create a new dataframe x = ______________, # with the original x's y = fitted(newdata.ml)+newResids) # but new y's return(new.wiggles) } set.seed(1234) wiggly.resids.resamp()
trueFunction <- function(x) sin(x) + 1/sqrt(x) + 3 set.seed(1234) n = 100 x = 1:n/n*2*pi wiggly = data.frame(x = x, y = trueFunction(x) + rnorm(n, 0, .75)) wiggly.resids.resamp <- function(newdata=wiggly){ newdata.ml = lm(y~x, data=newdata) resids = residuals(newdata.ml) newResids = sample(resids, replace=TRUE) # resample the residuals from the original model new.wiggles = data.frame( # create a new dataframe x = newdata$x, # with the original x's but new y's y = fitted(newdata.ml)+newResids) return(new.wiggles) } set.seed(1234) wiggles = wiggly.resids.resamp(wiggly) grade_result( pass_if(~ identical(.result, wiggles), "Correct!"), fail_if(~ TRUE, "Incorrect.") )
Check your code with the following. They should be similar.
wiggly.resids.resamp <- function(newdata=wiggly){ newdata.ml = lm(y~x, data=newdata) resids = residuals(newdata.ml) newResids = sample(resids, replace=TRUE) # resample the residuals from the original model new.wiggles = data.frame( # create a new dataframe x = wiggly$x, # with the original x's but new y's y = fitted(newdata.ml)+newResids) return(new.wiggles) }
Write a function called new.wiggly.resamp
to resample the data frame. The following functions from your book can be referenced: resample()
, resample.data.frame()
, rboot()
, bootstrap()
, equitails()
, bootstrap.ci()
.
Hint: To write this function, it can be as easy as calling one of the functions above.
resample <- function(x) { sample(x, replace = TRUE) } resample.data.frame <- function(data) { sample.rows <- resample(1:nrow(data)) return(data[sample.rows, ]) } rboot <- function(statistic, simulator, B) { tboots <- replicate(B, statistic(simulator())) if (is.null(dim(tboots))) { tboots <- array(tboots, dim = c(1, B)) } return(tboots) } bootstrap <- function(tboots, summarizer, ...) { summaries <- apply(tboots, 1, summarizer, ...) return(t(summaries)) } equitails <- function(x, alpha) { lower <- quantile(x, alpha/2) upper <- quantile(x, 1 - alpha/2) return(c(lower, upper)) } bootstrap.ci <- function(statistic = NULL, simulator = NULL, tboots = NULL, B = if (!is.null(tboots)) { ncol(tboots) }, t.hat, level) { if (is.null(tboots)) { stopifnot(!is.null(statistic)) stopifnot(!is.null(simulator)) stopifnot(!is.null(B)) tboots <- rboot(statistic, simulator, B) } alpha <- 1 - level intervals <- bootstrap(tboots, summarizer = equitails, alpha = alpha) upper <- t.hat + (t.hat - intervals[, 1]) lower <- t.hat + (t.hat - intervals[, 2]) CIs <- cbind(lower = lower, upper = upper) return(CIs) } trueFunction <- function(x) sin(x) + 1/sqrt(x) + 3 set.seed(1234) n = 100 x = 1:n/n*2*pi wiggly = data.frame(x = x, y = trueFunction(x) + rnorm(n, 0, .75))
new.wiggly.resamp <- function(newdata=wiggly){ __________________________ #resample the data frame } new.wiggly.resamp()
resample <- function(x) { sample(x, replace = TRUE) } resample.data.frame <- function(data) { sample.rows <- resample(1:nrow(data)) return(data[sample.rows, ]) } trueFunction <- function(x) sin(x) + 1/sqrt(x) + 3 set.seed(1234) n = 100 x = 1:n/n*2*pi wiggly = data.frame(x = x, y = trueFunction(x) + rnorm(n, 0, .75)) new.wiggly.resamp <- function(newdata){ resample.data.frame(newdata) #resample the data frame } wiggles = new.wiggly.resamp(wiggly) grade_result( pass_if(~ identical(.result, wiggles), "Correct!"), fail_if(~ TRUE, "Incorrect.") )
Use both samplers to get model-based and nonparametric confidence intervals for your linear model.
resample <- function(x) { sample(x, replace = TRUE) } resample.data.frame <- function(data) { sample.rows <- resample(1:nrow(data)) return(data[sample.rows, ]) } rboot <- function(statistic, simulator, B) { tboots <- replicate(B, statistic(simulator())) if (is.null(dim(tboots))) { tboots <- array(tboots, dim = c(1, B)) } return(tboots) } bootstrap <- function(tboots, summarizer, ...) { summaries <- apply(tboots, 1, summarizer, ...) return(t(summaries)) } equitails <- function(x, alpha) { lower <- quantile(x, alpha/2) upper <- quantile(x, 1 - alpha/2) return(c(lower, upper)) } bootstrap.ci <- function(statistic = NULL, simulator = NULL, tboots = NULL, B = if (!is.null(tboots)) { ncol(tboots) }, t.hat, level) { if (is.null(tboots)) { stopifnot(!is.null(statistic)) stopifnot(!is.null(simulator)) stopifnot(!is.null(B)) tboots <- rboot(statistic, simulator, B) } alpha <- 1 - level intervals <- bootstrap(tboots, summarizer = equitails, alpha = alpha) upper <- t.hat + (t.hat - intervals[, 1]) lower <- t.hat + (t.hat - intervals[, 2]) CIs <- cbind(lower = lower, upper = upper) return(CIs) } trueFunction <- function(x) sin(x) + 1/sqrt(x) + 3 set.seed(1234) n = 100 x = 1:n/n*2*pi wiggly = data.frame(x = x, y = trueFunction(x) + rnorm(n, 0, .75)) wiggly.estimator.lm <- function(newdata=wiggly){ lm.coef = coef(lm(y~x, data=newdata)) lm.coef } wiggly.resids.resamp <- function(newdata=wiggly){ newdata.ml = lm(y~x, data=newdata) resids = residuals(newdata.ml) newResids = sample(resids, replace=TRUE) # resample the residuals from the original model new.wiggles = data.frame( # create a new dataframe x = wiggly$x, # with the original x's but new y's y = fitted(newdata.ml)+newResids) return(new.wiggles) } new.wiggly.resamp <- function(newdata=wiggly){ resample.data.frame(newdata) #resample the data frame }
mbb.wiggly = bootstrap.ci( statistic = wiggly.estimator.lm, simulator = wiggly.resids.resamp, B = 1000, t.hat = wiggly.estimator.lm(wiggly), level = 0.95) npb.wiggly = bootstrap.ci( statistic = wiggly.estimator.lm, simulator = new.wiggly.resamp, B = 1000, t.hat = wiggly.estimator.lm(wiggly), level = 0.95) mbb.wiggly npb.wiggly
Which bootstrap is more appropriate to use in this case? Why?
Answer: The model-based bootstrap is wrong. Data did not actually come from this linear model. So resampling rows is a more accurate measure of uncertainty.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.