In lbelzile/hecstatmod: Statistical Modelling at HEC Montréal

library(learnr)
# Global knitr chunk defaults for the tutorial: every chunk is rendered with
# `echo = TRUE` and `highlight = TRUE` unless it overrides these options.
knitr::opts_chunk$set(
  highlight = TRUE,
  echo = TRUE
)

Hypothesis tests

# Checkbox question: interpretation of a p-value of 0.06 obtained from a
# one-sided one-sample t-test. Only the fourth answer is flagged as correct;
# answers are shuffled and the learner may retry.
question_checkbox(
  text = "Suppose that we perform a one-sample *t*-test for a one-sided hypothesis and that we obtain a *p*-value of 0.06. Which of the following statement(s) is correct?",
  answer(
    text = "The type I error is 0.06.",
    message = "The type I error is equal to the level $\\alpha$ if the null distribution is correctly specified."
  ),
  answer(
    text = "At level 5%, we conclude that the null hypothesis is true.",
    message = "We never conclude in favor of the hypothesis, because the *p*-value is the conditional probability given the null is true. The alternative could be equally likely, but lack of power prevents us from rejecting the null."
  ),
  answer(
    text = "At level 10%, we fail to reject the null hypothesis",
    correct = FALSE,
    message = "We reject the null hypothesis if the _p_-value is less than the level."
  ),
  answer(
    text = "If the null hypothesis was true, we would expect a result as extreme 6% of the time",
    correct = TRUE
  ),
  allow_retry = TRUE,
  random_answer_order = TRUE,
  try_again_button = "Try again",
  submit_button = "Submit an answer"
)

# Single-answer question: which factor increases the power of a test for a
# parameter theta. The first answer is the one flagged as correct; answers
# are shuffled and the learner may retry.
question(
  text = "Consider an hypothesis test for a parameter $\\theta$. Which factor lead to an increase of the power?",
  answer(
    text = "An increase of the difference between the postulated value  $\\theta_0$ and the true value of the parameter.",
    correct = TRUE
  ),
  answer(
    text = "A decrease of the sample size.",
    message = "If the sample size decrease, we have less information and thus its harder to detect a difference."
  ),
  answer(
    text = "The asymptotic normality of the null distribution.",
    message = "While the sample size impacts the null distribution, it is uncertain (and unlikely) that it would impact power."
  ),
  answer(
    text = "An increase of the variability of $\\hat{\\theta}$.",
    message = "The more variable the estimator is, the less precise our measurement and so we reject the null less often."
  ),
  allow_retry = TRUE,
  random_answer_order = TRUE,
  try_again_button = "Try again",
  submit_button = "Submit an answer"
)

# Single-answer question: interpretation of a 95% confidence interval [12,16]
# around an estimated mean of 14. Answer order is left as written (no
# `random_answer_order`), so "None of the above" stays the last option listed.
question(
  text = "We obtain an estimate of 14 for the mean; suppose that the 95% confidence interval is [12,16]. Which of the following interpretation of the confidence interval is correct?",
  answer(
    text = "We expect that, 95% of the time in repeated samples, the true mean will fall between 12 and 16.",
    message = "The bounds of the confidence interval are random!"
  ),
  answer(
    text = "The probability that the true mean is between 12 and 16 is 95%.",
    message = "The interval either contains the true mean, or it doesn't. The definition is that of a credible interval (Bayesian paradigm)."
  ),
  answer(
    text = "In repeated samples, 95% of these intervals would contain the mean, i.e., 14.",
    message = "The true mean is unknown and 14 is the estimate."
  ),
  answer(
    text = "None of the above",
    correct = TRUE
  ),
  try_again_button = "Try again",
  submit_button = "Submit an answer",
  allow_retry = TRUE
)

We want to know whether the price of holiday homes increased over the past year. To this end, we compare the prices of all properties sold on Centris in 2018 and in 2019.

# Checkbox question: nature of the sample formed by all Centris listings.
# Only the third answer is flagged as correct. Retry is enabled and the
# "Try again" button label is set, matching the other questions of this
# tutorial (this question previously offered no retry).
question_checkbox(
  "Which of the following statements are valid?",
  answer(
    "We have a census, because we have all of the listings on Centris.",
    message = "The population of interest is all holiday homes, and these may not all be listed on Centris. "
  ),
  answer(
    "We are considering a random sample.",
    message = "No. The sample is not random because it includes a single seller (for example, more expensive houses may be sold by independent brokers)."
  ),
  answer(
    "We cannot say anything about the quality of the sample.",
    correct = TRUE
  ),
  allow_retry = TRUE,
  submit_button = "Submit an answer",
  try_again_button = "Try again"
)

# Checkbox question: formulation of the null and alternative hypotheses for
# comparing the expected sale price of holiday homes in 2018 and 2019.
# Two answers are flagged as correct: the one-sided test, and the pair
# H_a: mu_2018 < mu_2019 versus H_0: mu_2018 >= mu_2019.
question_checkbox(
  # Stem: "sale price" (the original read "sale prize").
  "Let $\\mu_{2018}$ and $\\mu_{2019}$ denote the expected sale price of holiday homes in 2018 and 2019, respectively. Which of the following statements are valid?",
  answer(
    "The hypothesis of interest calls for a one-sided test.",
    correct = TRUE
  ),
  answer("The hypothesis of interest calls for a two-sided test."),
  answer(
    "The alternative hypothesis is $\\mathscr{H}_a: \\mu_{2018} = \\mu_{2019}$ and the null hypothesis is $\\mathscr{H}_0: \\mu_{2018} > \\mu_{2019}$",
    message = "The opposite: the null hypothesis is a pointwise hypothesis (Devil's advocate)."
  ),
  answer(
    "The alternative hypothesis is $\\mathscr{H}_a: \\mu_{2018} < \\mu_{2019}$ and the null hypothesis is $\\mathscr{H}_0: \\mu_{2018} \\geq \\mu_{2019}$",
    correct = TRUE
  ),
  answer("The alternative hypothesis is $\\mathscr{H}_a: \\mu_{2018} = \\mu_{2019}$ and the null hypothesis is $\\mathscr{H}_0: \\mu_{2018} \\neq \\mu_{2019}$"),
  answer("The alternative hypothesis is $\\mathscr{H}_a: \\mu_{2018} \\neq \\mu_{2019}$ and the null hypothesis is $\\mathscr{H}_0: \\mu_{2018} = \\mu_{2019}$"),
  allow_retry = TRUE,
  submit_button = "Submit an answer",
  try_again_button = "Try again"
)

Exploratory data analysis

# Checkbox question: which table layouts are in long format. The second and
# third answers are flagged as correct.
question_checkbox(
  "Which of the following statements accurately describes a data frame in long format if each line of the table contains:",
  # First option: one date column plus 17 regional price columns gives the
  # 18 columns quoted (the original text read "the data" instead of "the date").
  answer(
    "the date and the weekly average price of gasoline in each of the 17 regions of Quebec (18 columns)",
    message = "Region is a categorical variable, so it should be one response measurement by line!"
  ),
  answer(
    "reaction time to an obstacle and a binary indicator for the source of distraction (texting or phone call).",
    correct = TRUE
  ),
  answer(
    "the satisfaction rate of an individual, along with socio-demographic characteristics of that person.",
    correct = TRUE
  ),
  allow_retry = TRUE,
  submit_button = "Submit an answer",
  try_again_button = "Try again"
)

# Single-answer question: most adequate graph for professors' salary as a
# function of rank. The boxplot answer is the one flagged as correct.
# Text fixes relative to the original: missing "as" in the stem, the
# dangling "to avoid overlaid" completed with "points", and a stray closing
# parenthesis removed after "full".
question(
  "Suppose that we want to represent graphically the salary of professors as a function of their rank. Which graphical representation would be the most adequate amongst the following options?",
  answer(
    "a scatterplot of salary (*y*-axis) as a function of ranking, including transparency for points to avoid overlaid points.",
    message = "The scale variable is ordinal, but it should be categorical!"
  ),
  answer(
    "a boxplot of salary (*y*-axis) as a function of rank (*x*-axis), ordered in increasing rank (assistant, associate, full)",
    correct = TRUE
  ),
  answer(
    "three histograms of salary (one per rank), with different colors.",
    message = "If the histograms are overlaid, it would be difficult to distinguish between the groups. Better would be to add them to different panels."
  ),
  answer(
    "a bar plot for average salary per rank",
    message = "This visual representation doesn't make adequate use of information, because it only considers the mean."
  ),
  allow_retry = TRUE,
  submit_button = "Submit an answer",
  try_again_button = "Try again"
)

# Single-answer question: components of a box-and-whiskers plot. The third
# answer (quartiles for the box, whiskers up to 1.5 times the interquartile
# range) is the one flagged as correct.
question(
  text = "What are the elements of a box-and-whiskers plot?",
  answer(
    text = "the box gives the average plus or minus two standard errors; the whiskers extend until the minimum and maximum of the sample."
  ),
  answer(
    text = "the box gives the mean, the 25th and 75th percentiles of the distribution. The whiskers' length extend up to twice the standard error on each side of the box and any point beyond the whiskers is flagged as outlying."
  ),
  answer(
    text = "the box contains the first, second and third quartiles; the whiskers extend from the box until 1.5 times the interquartile range.",
    correct = TRUE
  ),
  answer(
    text = "the box contains the mean and the 25th and 75th percentiles of the distribution; the whiskers extend from the box until 1.5 times the interquartile range and values beyond these are flagged as outlying."
  ),
  try_again_button = "Try again",
  submit_button = "Submit an answer",
  allow_retry = TRUE
)

Consider the following graph of the renfe data.

library(poorman)
library(ggplot2)
library(hecstatmod)
# Scatterplot of the renfe data, excluding rows whose type is "REXPRESS":
# price on the x-axis, duration on the y-axis, points coloured by type and
# the legend suppressed.
# NOTE(review): the quiz that follows asks which changes would improve this
# graphic, so its shortcomings look intentional -- do not "fix" the plot.
ggplot(
  data = subset(renfe, type != "REXPRESS"),
  mapping = aes(x = price, y = duration, col = type)
) +
  geom_point() +
  labs(
    col = "type of train",
    y = "duration of travel (in minutes)",
    x = "price"
  ) +
  theme(legend.position = "none")

# Checkbox question: which changes would improve the renfe scatterplot.
# Six answers are flagged as correct (axis permutation, legend, monetary
# unit, title, alternative geometry, transparency); answers are shuffled
# and the learner may retry.
question_checkbox(
  text = "Which of the following elements would improve the graphic?",
  answer(
    text = "axes should be permuted so that the response variable is on the *y*-axis",
    correct = TRUE
  ),
  answer(
    text = "a legend for the color code",
    correct = TRUE
  ),
  answer(
    text = "the monetary unit for the price label",
    correct = TRUE
  ),
  answer(
    text = "a plot title providing context for the data",
    correct = TRUE
  ),
  answer(
    text = "removing the color that indicates the type of high speed train",
    message = "use of color avoids having two side-by-side panels and allows for a better comparison."
  ),
  answer(
    text = "a different type of geometry to replace the scatterplot, for example a boxplot",
    correct = TRUE
  ),
  answer(
    text = "mapping travel duration into an unordered categorical variable"
  ),
  answer(
    text = "inclusion of transparency to better see ties",
    correct = TRUE
  ),
  answer(
    text = "jitting observations (vertically), i.e. adding noise to avoid overlaid points",
    message = "If this option is often desirable, the small difference between duration of travel is likely to make this misleading."
  ),
  answer(
    text = "including zero on the axis",
    message = "Bad idea: neither price nor duration is close to zero and this would lead to clutter of observations, making comparisons more difficult."
  ),
  random_answer_order = TRUE,
  allow_retry = TRUE,
  try_again_button = "Try again",
  submit_button = "Submit an answer"
)