In PsyTeachR/reprores-v2: Data Skills for Reproducible Research

# please do not alter this code chunk
knitr::opts_chunk$set(echo = TRUE,
                      message = FALSE, 
                      error = TRUE)

library(tidyverse)
library(reprores)

# install the class package reprores to access built-in data
# devtools::install_github("psyteachr/reprores-v2")
# or download data from the website
# https://psyteachr.github.io/reprores/data/data.zip

Load built-in datasets

List the datasets in dplyr.

# With only a package argument, data() lists the datasets in that package
utils::data(package = "dplyr")

Load the built-in dataset starwars and use glimpse() to see an overview.

# Load starwars into the workspace, then print a compact overview of its columns
data(list = "starwars")
dplyr::glimpse(starwars)

Convert the built-in base R mtcars dataset to a tibble (you will need to find the function for this; it isn't in the chapter), and store it in the object mt.

# Convert the base R data frame mtcars to a tibble.
# NOTE(review): as_tibble() discards row names unless `rownames` is supplied,
# so the car model names are not carried into mt -- confirm that is intended.
mt <- tibble::as_tibble(x = mtcars)

Import data from CSV and Excel files

Using the data directory created by reprores::getdata() (or download the zip file), read "disgust_scores.csv" into a table.

# Import the disgust scores, letting readr guess the column types
disgust <- readr::read_csv(file = "data/disgust_scores.csv")

Override the default column specifications to skip the id column.

# "_" is readr's compact code for col_skip(): leave the id column out on
# import; columns not named here keep their guessed types
my_cols <- cols(id = "_")

disgust_skip <- read_csv(
  file = "data/disgust_scores.csv",
  col_types = my_cols
)

How many rows and columns are in the disgust dataset?

## Option 1: dim() gives c(rows, cols), so pick each count by position
dimensions <- dim(disgust)
disgust_rows <- dimensions[[1]]
disgust_cols <- dimensions[[2]]

## Option 2: ask for each count directly
disgust_rows <- nrow(disgust)  # number of rows
disgust_cols <- ncol(disgust)  # number of columns

Load the data in "data/stroop.csv" as stroop1 and "data/stroop.xlsx" as stroop2.

# Stroop data stored in two file formats: CSV via readr, Excel via readxl
stroop1 <- readr::read_csv(file = "data/stroop.csv")
stroop2 <- readxl::read_xlsx(path = "data/stroop.xlsx")

Use glimpse() to figure out the difference between the two data tables and fix the problem.

# Compare the column types of the two imports:
# rt is a double in stroop1 but a character column in stroop2
dplyr::glimpse(stroop1)
dplyr::glimpse(stroop2)

# The Excel file spells missing values as the text "NA"; declaring that
# string as the NA marker at import time fixes the type of rt
stroop2b <- readxl::read_xlsx(path = "data/stroop.xlsx", na = "NA")

Create a data table

Create a tibble with the columns name, age, and country of origin for 2 people you know.

# Column-wise construction: tibble() takes one vector per column
people <- tibble(
  name    = c("Lisa", "Robbie"),
  age     = c(43, 12),
  country = c("US", "UK")
)

# Row-wise construction: tribble() takes ~headers followed by one line of
# values per row, and builds the same table
people <- tribble(
  ~name,    ~age, ~country,
  "Lisa",   43,   "US",
  "Robbie", 12,   "UK"
)

Create a tibble that has the structure of the table below, using the minimum typing possible. (Hint: rep()). Store it in the variable my_tbl.

ID | A | B | C
--|-----|-----|---
1 | A1 | B1 | C1
2 | A1 | B2 | C1
3 | A1 | B1 | C1
4 | A1 | B2 | C1
5 | A2 | B1 | C1
6 | A2 | B2 | C1
7 | A2 | B1 | C1
8 | A2 | B2 | C1

# rep() builds the repeating columns: `each` repeats every value in a run,
# `times` repeats the whole pattern; the single "C1" is recycled to all rows
my_tbl <- tibble(
  ID = seq_len(8),
  A  = rep(c("A1", "A2"), each = 4),
  B  = rep(c("B1", "B2"), times = 4),
  C  = "C1"
)

Understand and use the basic data types

Set the following objects to the number 1 with the indicated data type:

one_int (integer)
one_dbl (double)
one_chr (character)

# The number 1 stored as three different data types
one_int <- as.integer(1)    # integer
one_dbl <- as.double(1)     # double
one_chr <- as.character(1)  # character, i.e. "1"

Set the objects T_log, T_chr, T_int and T_dbl to logical, character, integer and double values that will all be equal to TRUE.

# TRUE expressed as each of the four basic types
T_log <- TRUE
T_chr <- as.character(TRUE)  # "TRUE"
T_int <- as.integer(TRUE)    # 1L
T_dbl <- as.double(TRUE)     # 1

Check your answers with this code:

# Every entry should be TRUE: the first four compare each object with TRUE,
# the last four confirm each object has the requested type
tests <- c(
  list(
    T_log_is_TRUE = TRUE == T_log,
    T_chr_is_TRUE = TRUE == T_chr,
    T_int_is_TRUE = TRUE == T_int,
    T_dbl_is_TRUE = TRUE == T_dbl
  ),
  list(
    T_log_is_log = is.logical(T_log),
    T_chr_is_chr = is.character(T_chr),
    T_int_is_int = is.integer(T_int),
    T_dbl_is_dbl = is.double(T_dbl)
  )
)

# str() prints a condensed view of the list
str(tests)

Understand and use the basic container types

Create a vector of the numbers 3, 6, and 9.

# The first three multiples of 3, built with vectorised arithmetic
threes <- 3 * 1:3

The built-in vector letters contains the letters of the English alphabet. Use an indexing vector of integers to extract the letters that spell 'cat'.

# c, a and t sit at positions 3, 1 and 20 of the alphabet
cat <- letters[c(3L, 1L, 20L)]

The function colors() returns all of the color names that R is aware of. What is the length of the vector returned by this function? (Use code to find the answer.)

# Count how many color names colors() returns
col_length <- length(grDevices::colors())

Create a named list called col_types where the name is each column in the built-in dataset table1 and the value is the column data type (e.g., "double", "character", "integer", "logical").

# Option 1: type the list out by hand
# NOTE(review): these hand-written types assume the numeric columns of table1
# are stored as integers; check them against the typeof() results below
col_types <- list(
  country    = "character",
  year       = "integer",
  cases      = "integer",
  population = "integer"
)

# Option 2: ask R for the type of each column, then attach the column names
col_types <- setNames(
  list(
    typeof(table1[[1]]),
    typeof(table1[[2]]),
    typeof(table1[[3]]),
    typeof(table1[[4]])
  ),
  names(table1)
)

# Option 3: do it all in one step
# lapply() calls typeof() on every column of table1 and returns a list
# that keeps the column names
col_types <- lapply(table1, typeof)

Use vectorized operations

Set the object x to the integers 1 to 100. Use vectorised operations to set y to x squared. Use plot(x, y) to visualise the relationship between these two numbers.

# x holds the integers 1 to 100, as the exercise asks
x <- 1:100
# ^ is vectorised: every element of x is squared in a single step
y <- x^2

# scatterplot of each number against its square
plot(x, y)

Set t to the numbers 0 to 100 in increments of 0.1. Set x to the sine of t and y to the cosine of t (you will need to find the functions for sine and cosine). Plot x against y.

# 0, 0.1, 0.2, ..., 100
t <- seq(from = 0, to = 100, by = 0.1)

# sin() and cos() are vectorised, so each call transforms all of t at once
y <- cos(t)
x <- sin(t)
plot(x, y)

The function call runif(n, min, max) will draw n numbers from a uniform distribution from min to max. If you set n to 10000, min to 0 and max to 1, this simulates the p-values that you would get from 10000 experiments where the null hypothesis is true. Create the following objects:

pvals: 10000 simulated p-values using runif()
is_sig: a logical vector that is TRUE if the corresponding element of pvals is less than .05, FALSE otherwise
sig_vals: a vector of just the significant p-values
prop_sig: the proportion of those p-values that were significant

# Fixed seed: the same random numbers are drawn every time this chunk runs
set.seed(8675309)

# 10000 draws from a uniform distribution between 0 and 1
pvals <- runif(n = 10000, min = 0, max = 1)
# TRUE wherever the simulated p-value is below .05
is_sig <- pvals < .05
# logical indexing keeps only the significant values
sig_vals <- pvals[is_sig]
# number of significant values divided by the total number of values
prop_sig <- sum(is_sig) / length(is_sig)

# alternatively: the mean of a logical vector is the proportion of TRUEs
prop_sig <- mean(is_sig)
prop_sig <- mean(pvals < .05)