# Ensure that libraries are loaded.
library(tidyverse)
library(learnr)
library(gradethis)
library(knitr)
library(kableExtra)
# New packages (must be installed before taking this tutorial)

tutorial_options(exercise.timelimit = 60, exercise.checker = gradethis::grade_learnr)
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
# Ensure that the data is loaded for the remainder of this tutorial.
flights4 <- UsingRTutorials::flights4
# Store the results of a t test, so the htest object is available in the tutorial.
result_t <- flights4 %>% t.test(arr_delay0 ~ origin, data = ., na.action = "na.omit")
# Ensure that the final function report_ttest() is available in the tutorial.
report_ttest <- function(result_t) { 
  # if (is.null(result_t) || class(result_t) != "htest") { 
  #   return("#### Input is not a result of t.test()! ######") 
  # } else { 
      paste0("*t* (", format(round(result_t$parameter, digits = 2), nsmall = 2), ") = ",
             format(round(result_t$statistic, digits = 2), nsmall = 2), ", *p* ", 
             ifelse(result_t$p.value >= 0.0005, 
                    paste0("= ", format(round(result_t$p.value, digits = 3), nsmall = 3)),
                    "< 0.001"), 
             ", 95%CI[", format(round(result_t$conf.int[1], digits = 2), nsmall = 2), 
             ", ", format(round(result_t$conf.int[2], digits = 2), nsmall = 2), "]") 
  # } 
}

Overview

First 1.5 hours: Course content

Second 1.5 hours: Data project

Piping

Pipe more: Cut out intermediary results.

Let us quickly rehearse piping.

The code below creates and shows a plot. Replace it by one pipe that is short as possible.
# Show a plot of the logarithm of arrival delays for flights to 
# Atlanta, Boston, or Buffalo in January.
flights4_january <- filter(flights4, month == 1)
flights4_jan_atlantabostonbuffalo <- filter(flights4_january, dest %in% c("ATL", "BOS", "BUF"))
flights4_jan_ATLBOSBUF_logdelay <- mutate(flights4_jan_atlantabostonbuffalo, log_arr_delay0 = log(arr_delay0 + 1))
ggplot(flights4_jan_ATLBOSBUF_logdelay, aes(x = log_arr_delay0)) +
  geom_area(stat = "count")
__Hint:__ What do we do with the data argument in a pipe?
flights4 %>% filter(month == 1 & dest %in% c("ATL", "BOS", "BUF")) %>% mutate(log_arr_delay0 = log(arr_delay0 + 1)) %>% ggplot(aes(x = log_arr_delay0)) + geom_area(stat = "count")
gradethis::grade_code(
  correct = "The (input) data argument disappears in a pipe if it is the first argument. And you correctly combined the two filter steps.", 
  incorrect = "Don't save the plot as a data object, send it to the screen. Perhaps you should join the two filter functions."
  )

The pipe:

. in a pipe

Piping:

Test if average arrival delay (`arr_delay0`) differs between the two airports of origin (`origin`) included in the `flights4` tibble. Pipe this tibble into the `t.test()` function (of the `stats` package).

# Check help on the `t.test()` function.
# Use a dot to specify the piped-in tibble as input data.
flights4 %>% t.test(arr_delay0 ~ origin, )
flights4 %>% t.test(arr_delay0 ~ origin, data = .)
gradethis::grade_code(
  correct = "The `data` argument is not the first argument, so it must be specified and the input data tibble must be represented by a dot. As you have done!", 
  incorrect = "Did you use the data argument in the t.test() function?"
  )

In a pipe, . represents the input object. Use it if:

# Get the value of the t statistic of an independent samples t test.
flights4 %>% 
  t.test(arr_delay0 ~ origin, data = .) %>%
  .$statistic #not a function

We will see soon where $statistic comes from.

Lists

A List may store anything (your perfect cupboard?)

Examples:

  1. Data frame (or tibble): list of (logic/num/char) vectors (variables).
    Restriction: All vectors have the same length.
  2. Statistical results: list with coefficients, etc.
  3. Nested data: More than one value per variable per case.

Extracting statistical results

A t test yields a results object, which is a list.

# Store the results of a t test.
result_t <- flights4 %>% 
  t.test(arr_delay0 ~ origin, data = .)
result_t #Default print method for these results.

Function str() shows the contents of a list.

Try to recognize the contents of this list.
str(result_t)

Note:

Extracting an element from a list

We get elements from a list:

Extract the confidence interval from the t test results data object (`result_t`) with `[[]]` and show (print) it (instead of saving it as a new data object).

__Hint:__ The element name must be in quotation marks. See the book section on _Recursive Vectors (Lists)_.
result_t[["conf.int"]]
gradethis::grade_code(
  correct = "", 
  incorrect = "Perhaps you used the element number in the list instead of the element name to extract the confidence level. That is OK."
  )
Do the same thing but now use `$`.

__Hint:__ See the book section on _Recursive Vectors (Lists)_.
result_t$conf.int
gradethis::grade_code(
  correct = "`$conf.int` is shorthand for `[['conf.int']]`.", 
  incorrect = ""
  )

Let's go one level down in the list of t test results:

str(result_t[4])
Show the confidence level (as a number) using only dollar signs (`$`), not square brackets (`[[]]`). Show the result on the screen.

# The confidence level is an attribute (`attr` in the structure overview presented above), so use the `attributes()` function.
attributes()
# Pull out the confidence interval from the results. Complete this code.
attributes(result_t)
# Have a look at the output of the attributes() function. How can you get the number of the confidence level?
attributes(result_t$conf.int)
attributes(result_t$conf.int)$conf.level
gradethis::grade_code(
  correct = "We can use the dolar sign directly after the attributes() function. Isn't that nice?", 
  incorrect = "If you just get the number 0.95, you are fine. Probably, you used [[]] instead of $. If you get more than just the number, you have to go down one level in the list that `attributes()` generates."
  )

Instead of the attribute value, we can get the attribute name with the names() function.

Extract the label "conf.level" from the list of t test results.

# If you want the name of an attribute, apply the `names()` function to an attribute that you extract with attributes().
names(attributes( ))
names(attributes(result_t$conf.int))
gradethis::grade_code(
  correct = "", 
  incorrect = "Did you use [[]] instead of $?"
  )

Let us practice some more.

Get the name and value (rounded to one decimal place) of the mean of the second group in the results of the _t_ test (`result_t`).

# Build your code in steps. First, find the group means in the results.
str(result_t)
# Second, pull the mean from the results data object. You know how to get only the second mean (complete the code below).
result_t$estimate
# Third, round the mean with the round() function (use Help). Complete the code below.
round(result_t$estimate[[2]] )
# Fourth, pull the name of the second group from the results data object. Complete the code below.
names(result_t$estimate)[2]
names(result_t$estimate)[2]
round(result_t$estimate[[2]], digits = 1)
gradethis::grade_code(
  correct = "", 
  incorrect = "Check out the hints to this exercise. And mind the blanks in the resulting sentence."
  )

__Programming Tips__ - Attribute names and attribute values are nested within a list element. - `attributes()` creates a list; select from this list with $ or [[]]. - 'Drill' into lower levels step by step: + `attributes(result_t)` returns all first-level names and attributes (as a list); + `attributes(result_t$conf.int)` returns all attributes of this element; + `attributes(result_t$conf.int)$conf.level` gives the value of the conf.level attribute; + `names(attributes(result_t$conf.int))` returns the name of the conf.int attribute. - `result_t$conf.int[1]` yields the lower limit of the confidence interval

Extract test results in APA style

The code below extracts the results of the t test in APA style.

What does the `digits` argument do?
paste0(                       # base R function to concatenate strings
  "*t* (", 
  round(result_t$parameter, digits = 2), ") = ",        # df
  round(result_t$statistic, digits = 2),                # t
  ", *p* = ", round(result_t$p.value, digits = 3),      # p
  ", 95%CI[", round(result_t$conf.int[1], digits = 2),  # 95%CI lower
  ", ", round(result_t$conf.int[2], digits = 2), "]")   # 95%CI upper

The stars will turn t and p into italics in a knitted document.

Conditions with ifelse()

Improving the reported p value:

1. What does the `digits` argument do? 2. To which function does it belong? 3. Explain how 'ifelse()` works. Experiment with the code to check your answers.
paste0("*t* (", 
  format(round(result_t$parameter, digits = 2), nsmall = 2), ") = ",
  format(round(result_t$statistic, digits = 2), nsmall = 2), 
  ", *p* ", 
  ifelse(result_t$p.value < 0.0005, 
         paste0("= ", format(round(result_t$p.value, digits = 3), nsmall = 2)),
         "< 0.001"),
  ", 95%CI[", format(round(result_t$conf.int[1], digits = 2), nsmall = 2),
  ", ", format(round(result_t$conf.int[2], digits = 2), nsmall = 2), "]")
__Hint:__ Use help on functions `format()` and `ifelse()`.

Functions

Functions in mathematics: y = f(x).

Functions in R: y <- f(x).

Meaning: Do something to data object x -- f(x) -- and store result in data object y.

For short: Transform x into y.

Left-hand data object (y):
- Does not exist: new data object created. Can be a function!
- Exists: data object overwritten.
- Not named: output to screen (console).

Creating a function for reporting t tests in APA style

Step 1: Add function(), enclose code within { and }, and store.

report_ttest <- function() {
  paste0("*t* (", format(round(result_t$parameter, digits = 2), nsmall = 2), ") = ",
    format(round(result_t$statistic, digits = 2), nsmall = 2), ", *p* ", 
    ifelse(result_t$p.value >= 0.0005, 
      paste0("= ", format(round(result_t$p.value, digits = 3), nsmall = 3)),"< 0.001"),
    ", 95%CI[", format(round(result_t$conf.int[1], digits = 2), nsmall = 2),
    ", ", format(round(result_t$conf.int[2], digits = 2), nsmall = 2), "]")
}

report_ttest is the name of the new function.

__Programming Tip__ - Start function creation with code that works.

Identify inputs as arguments

Step 2: Specify user input.

The user must specify the data object containing the results of t.test().

Specify the results data object as __x__ in the below code.
report_ttest <- function() {
  paste0("*t* (", format(round(result_t$parameter, digits = 2), nsmall = 2), ") = ",
    format(round(result_t$statistic, digits = 2), nsmall = 2), ", *p* ", 
    ifelse(result_t$p.value >= 0.0005, 
      paste0("= ", format(round(result_t$p.value, digits = 3), nsmall = 3)),"< 0.001"),
    ", 95%CI[", format(round(result_t$conf.int[1], digits = 2), nsmall = 2),
    ", ", format(round(result_t$conf.int[2], digits = 2), nsmall = 2), "]")
}
report_ttest <- function(x) {
  paste0("*t* (", format(round(x$parameter, digits = 2), nsmall = 2), ") = ",
    format(round(x$statistic, digits = 2), nsmall = 2), ", *p* ", 
    ifelse(x$p.value >= 0.0005, 
      paste0("= ", format(round(x$p.value, digits = 3), nsmall = 3)),"< 0.001"),
    ", 95%CI[", format(round(x$conf.int[1], digits = 2), nsmall = 2),
    ", ", format(round(x$conf.int[2], digits = 2), nsmall = 2), "]")
}
gradethis::grade_code(
  correct = "", 
  incorrect = "Don't forget to add x as an argument to the function. Replace result_t everywhere by x!"
  )

Check encapsulation

Encapsulation:

Step 3: Ensure that all data object names in the body are:

  1. named as arguments (for example: x), or
  2. created within the body (none in this example).
What goes wrong and why in the `report_ttest()` function? Can you solve the problem?
# The function is created here.
report_ttest <- function(x) {
  paste0("*t* (", format(round(x$parameter, digits = 2), nsmall = 2), ") = ",
    format(round(x$statistic, digits = 2), nsmall = 2), ", *p* ", 
    ifelse(x$p.value >= 0.0005, 
      paste0("= ", format(round(x$p.value, digits = 3), nsmall = 3)),"< 0.001"),
    ", 95%CI[", format(round(result_t$conf.int[1], digits = 2), nsmall = 2),
    ", ", format(round(result_t$conf.int[2], digits = 2), nsmall = 2), "]")
}
# Execute another t test: average arrival delay of carriers AA and UA.
result_t2 <- flights4 %>% 
  filter(carrier %in% c("AA", "UA")) %>%
  t.test(arr_delay0 ~ carrier, data = .)
# New test results.
result_t2
# New test results with function report_ttest().
report_ttest(result_t2)
__Hint:__ Carefully compare the results. Note that `result_t` is present in the environment (you can check this with the `ls()` function.

Dot-Dot-Dot

... is a special argument for R functions:

What is the input for the dots in the last line of code and what does the function do with this input?
report_ttest <- function(x, ...) {
  print(paste(...))
  paste0("*t* (", format(round(x$parameter, digits = 2), nsmall = 2), ") = ",
    format(round(x$statistic, digits = 2), nsmall = 2), ", *p* ", 
    ifelse(x$p.value >= 0.0005, 
      paste0("= ", format(round(x$p.value, digits = 3), nsmall = 3)),"< 0.001"),
    ", 95%CI[", format(round(x$conf.int[1], digits = 2), nsmall = 2),
    ", ", format(round(x$conf.int[2], digits = 2), nsmall = 2), "]")
}
# Use the function.
report_ttest(result_t, "The", "difference", "was", "significant.", sep=' ')

Named arguments and default values

1. Add a named argument `digits =` to the `report_ttest()` function that specifies the number of decimal places for numeric results other than _p_ values. 2. Set the default number of decimal places to 2 in this function. 3. And ensure that the requested number of decimal places are used in the function output.
report_ttest <- function(x) {
  paste0("*t* (", format(round(x$parameter, digits = 2), nsmall = 2), ") = ",
    format(round(x$statistic, digits = 2), nsmall = 2), ", *p* ", 
    ifelse(x$p.value >= 0.0005, 
      paste0("= ", format(round(x$p.value, digits = 3), nsmall = 3)),"< 0.001"),
    ", 95%CI[", format(round(x$conf.int[1], digits = 2), nsmall = 2),
    ", ", format(round(x$conf.int[2], digits = 2), nsmall = 2), "]")
}
# Test the function.
report_ttest(result_t, digits = 5)
__Hint:__ The name of the new argument must be used within the function everywhere the number of digits should follow the choice of the user.
report_ttest <- function(x, digits = 2) { paste0("*t* (", format(round(x$parameter, digits = digits), nsmall = digits), ") = ", format(round(x$statistic, digits = digits), nsmall = digits), ", *p* ",  ifelse(x$p.value >= 0.0005,  paste0("= ", format(round(x$p.value, digits = 3), nsmall = 3)),"< 0.001"), ", 95%CI[", format(round(x$conf.int[1], digits = digits), nsmall = digits), ", ", format(round(x$conf.int[2], digits = digits), nsmall = digits), "]") }
report_ttest(result_t, digits = 5)
gradethis::grade_code(
  correct = "", 
  incorrect = "You must use digits instead of the number 2 everywhere. And don't forget to specify the default number of digits in the function's argument."
  )

Embedding output in R Markdown

Function report_ttest() is meant to display t test results in APA style to the reader.

This means that the result of the function must be displayed within a sentence in the report.

This is called i

Inline code:

Example R Markdown text with inline code:

There is a statistically significant difference in average delay between the two airports, `r report_ttest(result_t)`.  

The R Markdown text shown above if the document is knitted:

There is a statistically significant difference in average delay between the two airports, r report_ttest(result_t).

Note the italics of t and p.

__Programming Tip__ - This is a valuable way to report correct numerical results. If the data or analysis change, the new results will be shown.

Control Flow: Conditions

Our function still has flaws: It gives errors or wrong output if the input is not a t.test() result (class htest).

#Run a regression.
result_lm <- flights4 %>% lm(arr_delay0 ~ origin, data = .)
#Run a regression.
result_lm <- flights4 %>% lm(arr_delay0 ~ origin, data = .)
#Use regression results in function report_ttest().
report_ttest(result_lm)

Using if () {} else {}

Let's fix it.

Use `if (class(x) != "htest") {} else {}` to print either #### Input is not a result of t.test()! ###### or the APA-style formatted t test result.
report_ttest <- function(x) {
  paste0("*t* (", format(round(x$parameter, digits = 2), nsmall = 2), ") = ",
    format(round(x$statistic, digits = 2), nsmall = 2), ", *p* ", 
    ifelse(x$p.value >= 0.0005, 
      paste0("= ", format(round(x$p.value, digits = 3), nsmall = 3)),"< 0.001"),
    ", 95%CI[", format(round(x$conf.int[1], digits = 2), nsmall = 2),
    ", ", format(round(x$conf.int[2], digits = 2), nsmall = 2), "]")
}
# Applied to t test results.
report_ttest(result_t)
# Applied to regression results.
report_ttest(result_lm)
# Applied to empty object.
report_ttest(NULL) 
# Check the use of `return()` in a function.
# The if - else flow control must be at the start of the function, because it makes no sense to pull results from a data object containing something else than a t test result.
report_ttest <- function(result_t) { 
  if (class(result_t) != "htest") { 

  } else {

  }
}
# Just return a message if the data object is not of class htest.
report_ttest <- function(result_t) { 
  if (class(result_t) != "htest") { 
    return("")
  } else {

  }
}
# Put the code to create the formatted results in the else{} part.
report_ttest <- function(result_t) { 
  if (class(result_t) != "htest") { 
    return("#### Input is not a result of t.test()! ######")
  } else {
    # add code here
  }
}
report_ttest <- function(x) { if (class(x) != "htest") { return("#### Input is not a result of t.test()! ######") } else { paste0("*t* (", format(round(x$parameter, digits = 2), nsmall = 2), ") = ", format(round(x$statistic, digits = 2), nsmall = 2), ", *p* ",  ifelse(x$p.value >= 0.0005, paste0("= ", format(round(x$p.value, digits = 3), nsmall = 3)),"< 0.001"), ", 95%CI[", format(round(x$conf.int[1], digits = 2), nsmall = 2), ", ", format(round(x$conf.int[2], digits = 2), nsmall = 2), "]") } }
report_ttest(result_t)
report_ttest(result_lm)
report_ttest(NULL) 
gradethis::grade_code(
  correct = "A function returns the result of the last step in the code or what is marked by `return()`.", 
  incorrect = "If you get the message 'non-numeric argument to mathematical function', input other than t test results is still being treated as if it contains t test results." 
  )

A function returns:

__Programming Tip__ - Empty data objects may cause problems. Always include them in a function test.

Conditions and logical operators

An F test also yields results as a htest class.

How can we ensure that report_ttest() does not report F test results?

Explain how the code below excludes _F_ test results. What does `||` mean?
report_ttest <- function(x) { 
  if (class(x) != "htest" || x$method == "F test to compare two variances") { 
    return("#### Input is not a result of at.test()! ######") 
  } else { 
      paste0("*t* (", format(round(x$parameter, digits = 2), nsmall = 2), ") = ",
             format(round(x$statistic, digits = 2), nsmall = 2), ", *p* ", 
             ifelse(x$p.value >= 0.0005, 
                    paste0("= ", format(round(x$p.value, digits = 3), nsmall = 3)),
                    "< 0.001"), 
             ", 95%CI[", format(round(x$conf.int[1], digits = 2), nsmall = 2), 
             ", ", format(round(x$conf.int[2], digits = 2), nsmall = 2), "]") 
  } 
}
# Applied to previous t test results.
report_ttest(result_t)
# Applied to F test results.
flights4 %>% var.test(arr_delay0 ~ origin, data = .) %>% report_ttest()

In the report_ttest() function, we deal with single values instead of vectors:

Fancy Stuff

Sorry, no fancy stuff in this tutorial.

Data Project Today

It is time to finish tidying your group's project data.

Use the remaining time to design and create your data visualization.



WdeNooy/UsingRTutorials documentation built on Jan. 25, 2023, 2:39 a.m.