library(learnr)
library(r4np)
library(tidyverse)
knitr::opts_chunk$set(echo = FALSE)
tutorial_options(exercise.blanks = TRUE)

Importing data

Exercise 1

quiz(
  question(
    "Which of the following ways can you import data into your RStudio session?",
    answer("The 'Import Dataset' button in the 'Environment' pane does the trick.",
           correct = TRUE),
    answer("I can drag and drop files into the 'Source' window.",
           message = "I am afraid it is not possible to drag and drop files. It would be nice, though."),
    answer("I can use the function 'import_dataset()' from the 'readr' package.",
           message = "Unfortunately, the `readr` package has no function called 'import_dataset()."),
    answer("readr::read_csv('mydata.csv') imports data directly into the 'Environment' pane.",
           correct = TRUE),
    allow_retry = TRUE
  )
)
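
As a quick illustration of the last correct answer, here is a minimal sketch of importing a file with readr. The file name 'mydata.csv' is just the hypothetical example from the question; replace it with a file that actually exists in your project folder.

library(readr)

# Read a comma-separated file and store it as an object in the 'Environment' pane
my_data <- read_csv("mydata.csv")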

Exercise 2

quiz(
  question(
    "What is a delimiter in datasets?",
    answer("It limits the amount of data shown in the console.",
           message = "Not quite. Delimiters do not affect what is shown in the console."),
    answer("It enables computers to identify were a column starts and where it ends in our data frame.",
           correct = TRUE),
    answer("A comma is the delimiter in a '.csv' file.",
           correct = TRUE),
    answer("There is a pre-defined set of delimiters we have to use.",
           message = "You can define any character or symbol as a delimiter. The choice is yours."),
    allow_retry = TRUE
  )
)
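
The ideas from this quiz translate directly into readr code. Below is a minimal sketch assuming two hypothetical files: one comma-separated and one using a semicolon as its delimiter.

library(readr)

# A '.csv' file uses a comma as its delimiter
survey <- read_csv("survey.csv")

# For other delimiters we can specify the character ourselves, e.g. a semicolon
survey <- read_delim("survey_semicolon.txt", delim = ";")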

Inspecting data

Exercise 3

How can we find out what kind of variables are included in the imdb_top_250 dataset? Change the code below accordingly.

imdb_top_250
# You might want to take a 'glimpse' at the dataset.
# Solution
glimpse(imdb_top_250)

Exercise 4

How can we view the synopsis (a column in imdb_top_250)? Provide the correct code below.


# We have to use '$' to select a specific column/variable in a dataset
# Solution
imdb_top_250$synopsis

Cleaning column names

Exercise 5

quiz(
  question(
    "What makes for good column/variable names?",
    answer("They should be as detailed as possible.",
           message = "While it is important a name captures the meaning of column/variable, we always should aim for being succinct, even if it means we have to sacrifice a bit on detail."),
    answer("They should be standardised, for example using lower-case only.",
           correct = TRUE),
    answer("If other people can easily make sense of them.",
           correct = TRUE),
    answer("They do not exceed a certain character limit.",
           message = "The length of names is relevant, but there is no particular character limit. Remember: Short is good, but nobodoy really defined what is 'short' and what is not."),
    allow_retry = TRUE
  )
)
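
If you want to see what standardised names look like in practice, the janitor package can also clean a plain character vector of names. This is only a sketch, using names similar to the ones you will meet in the next exercise.

library(janitor)

# Turn mixed-case names into lower-case, snake_case names
make_clean_names(c("ID_number", "Gender_of_Participant", "Name"))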

Exercise 6

# Install janitor if not installed yet
if("janitor" %in% rownames(installed.packages()) == FALSE){install.packages("janitor")}

# Create messy dataset
data <- tibble(ID_number = c("N006", "N007", "N008"),
               Gender_of_Participant = c("female", "male", "female"),
               emotional_resilience = c(4, 8, 10),
               Name = c("Kirk", "Bond", "The Rock")
               )

A colleague gave us a data frame saved in the object data. Let's first inspect data using glimpse() to see what it includes.
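
If you are unsure where to start, a one-line inspection is all that is needed here; this sketch simply applies glimpse() to the object defined above.

# Inspect the data frame we received from our colleague
glimpse(data)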


quiz(
  caption = "Follow up question",
  question(
    "What can we say about this data frame?",
    answer("The data frame is not correct.",
           message = "Technically, this is a fully functional dataset. However, there are other things we can improve."),
    answer("There are 3 variables and 4 observations",
           message = "This dataset has 4 variables and 3 observations. Be aware of the difference between variable (a column) and an observation (a row) in a dataset"),
    answer("The column names are difficult to read.",
           correct = TRUE),
    answer("We could use a package like `janitor` to clean this dataset.",
           correct = TRUE),
    allow_retry = TRUE
  )
)

Exercise 7

With these insights, how can we clean the column names and save the result in the new object data_clean? Inspect the result with glimpse().

library(janitor)

# Clean the column names to make them more uniform
data_clean <- 

# Inspect the results
# The 'janitor' package has a function to 'clean names'
# The following function should help
clean_names(___)
# Solution
data_clean <- clean_names(data)

Exercise 8

There are some more improvements we could make to data_clean. For example, we could shorten the column names id_number and gender_of_participant.

Clean these column names using the dplyr function rename() and save the changes in the object data_clean. Review the changes using glimpse().

# Create messy dataset
data <- tibble(ID_number = c("N006", "N007", "N008"),
               Gender_of_Participant = c("female", "male", "female"),
               emotional_resilience = c(4, 8, 10),
               Name = c("Kirk", "Bond", "The Rock")
               )

data_clean <- janitor::clean_names(data)
# Rename columns


# Review changes
# When using dplyr we can specify the dataset first.
data_clean <-
  data_clean %>%
  ___
# Then we can use the 'rename()' function.
data_clean <-
  data_clean %>%
  rename(___)
# Remember, we first need to specify the new and then the old name
data_clean <-
  data_clean %>%
  rename(new_name = old_name)
# Here is the example for the column 'id_number'
data_clean <-
  data_clean %>%
  rename(id = id_number)
# The solution for both columns
data_clean <-
  data_clean %>%
  rename(id = id_number,
         gender = gender_of_participant)

glimpse(data_clean)

Data types

Exercise 10

quiz(
  question(
    "What types of data can we usually find in Social Sciences projects.",
    answer("Valided and unvalidaded data.",
           message = "Valided/unvalidaded data is not a data type."),
    answer("Nominal data, which refers to categorical data with no particular order.",
           correct = TRUE),
    answer("Quantitative data, which consists exclusively of numbers.",
           correct = TRUE),
    answer("Ordinal data, which can include categories and numbers to show their order.",
           message = "Ordinal data is categorical and its categories can be placed in a meaningful order."),
    allow_retry = TRUE
  )
)
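
To make the distinction between these data types more concrete, here is a small sketch with made-up values showing how nominal, ordinal and quantitative data can be represented in R.

# Nominal: categories without any inherent order
nationality <- factor(c("UK", "Austria", "UK"))

# Ordinal: categories with a meaningful order
agreement <- factor(c("low", "high", "medium"),
                    levels = c("low", "medium", "high"),
                    ordered = TRUE)

# Quantitative: plain numbers
age <- c(25, 31, 47)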

Exercise 11

quiz(
  question(
    "Which of the following statements are correct?",
    answer("`<chr>` and `<fct>` refer to columns in a dataset that are categorical.",
           correct = TRUE),
    answer("`<dbl>` and `<int>` refer to numeric variables and can be used interchangeably.",
           message = "It is true that `<dbl>` and `<int>` refer to numeric variables, but they are not interchangeable. <dbl> allows decimals."),
    answer("Logical variables always only two values: `TRUE` or `FALSE`.",
           correct = TRUE),
    answer("When importing data with `readr`, factors are always imported as `<chr>`",
           correct = TRUE),
    allow_retry = TRUE
  )
)
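
The type abbreviations from this quiz are easiest to spot in the output of glimpse(). The following sketch builds a small toy tibble (the values are invented) containing one column of each type mentioned above.

library(tidyverse)

types_demo <- tibble(name     = c("Kirk", "Bond"),       # <chr>
                     team     = as_factor(c("A", "B")),  # <fct>
                     score    = c(4.5, 8.2),             # <dbl>
                     attempts = c(2L, 5L),                # <int>
                     passed   = c(TRUE, FALSE))          # <lgl>

glimpse(types_demo)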

Exercise 12

students <- tibble(study_level = c("UG", "PG", "PG"),
                   study_experience = c(32, 83, 95),
                   team = c("Alpha", "Beta", "Gamma")
                   )

Inspect the dataset students and answer the questions below.


quiz(
  question(
    "What can we say about this dataset?",
    answer("There are three categorical variables as indicated by `<chr>`.",
           message = "There are only two categorical variables: `study_level` and `team`"),
    answer("Two of the variables should not be `<chr>`, but a factor `<fct>`.",
           correct = TRUE),
    answer("The variable 'study_experience' could be considered as an integer `<int>`.",
           correct = TRUE),
    answer("It is important to convert the categorical variables into factors `<fct>`.",
           correct = TRUE),
    allow_retry = TRUE
  )
)

Exercise 13

Changing data types is an essential aspect of data wrangling and cleaning. Take a look at the dataset halloween and change the data type of those variables that are not entirely correct.

# Inspect dataset

# Change data types
halloween_clean <-
  halloween %>%
  ___

# Inspect the new dataset
glimpse(halloween_clean)

# We need to mutate the variable 'country' into a factor <fct>,
# because it is currently a character <chr>
glimpse(halloween)

halloween_clean <-
  halloween %>%
  mutate(___)
glimpse(halloween)

halloween_clean <- 
  halloween %>%
  mutate(country = as_factor(country))
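
If a dataset contains several character columns that should all become factors, we do not have to list them one by one. A more general sketch (whether it is appropriate depends on which columns in halloween really are categorical) uses across() together with where().

halloween_clean <-
  halloween %>%
  mutate(across(where(is.character), as_factor))

glimpse(halloween_clean)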

Exercise 14

The dataset gep contains categorical and numerical data. We are asked to clean the dataset and ensure that data types are correct. First, take a look at the variables included in this data frame.


quiz(
  question(
    "Which of the following steps should we take?",
    answer("'gender' and 'level_of_study' should be converted to `<fct>`.",
           correct = TRUE),
    answer("'age' needs to be converted to an integer `<int>`.",
           message = "While 'age' is an integer, it is not necessary to change to conduct further analysis."),
    answer("All numeric variables can be converted to an `<int>`, but it is not necessary.",
           correct = TRUE),
    answer("We can create a new object to save the changes we made.",
           correct = TRUE),
    allow_retry = TRUE
  )
)

Let's implement these steps, store the changes in the object gep_clean and review the changes using glimpse().


gep_clean <-
  gep %>%
  mutate(gender = as_factor(gender),
         level_of_study = as_factor(level_of_study))

glimpse(gep_clean)

Handling factors

famous_actors <- tibble(name = c("Zendaya", "Will Poulter", "Jamie Lee Curtis", "Oscar Isaac"),
                   gender = c(1, 0, 1, 0),
                   country_of_origin = c(1, 2, 1, 3),
                   birth_place = c("Oakland", "London", "Santa Monica", "Guatemala City")
                   )

We received a small dataset famous_actors which contains information about well-known actors working in Hollywood. However, some data cleaning is required to ensure the data is correctly represented in R for further processing. There are several steps we should perform.

Exercise 15

First, we need to inspect the dataset to get an idea of which variables we need to correct.

# Inspect the dataset 'famous_actors'
quiz(
  question(
    "Which of the following steps should we take?",
    answer("We need to correct the variable `gender`, because it is considered as `dbl` but should be a factor, i.e. `fct`.",
           correct = TRUE),
    answer("The variable `country_of_origin` is `dbl` and therefore cannot be used. We should remove it.",
           message = "We do not have to remove `country_of_origin`, but need to find out what these numbers stand for, i.e. which countries."),
    answer("The variable `birth_place` is an example of a `chr` variable and therefore needs not further treatment.",
           message = "The variable `birth_place` shows a limited number of categories and therefore qualifies as a factor (`fct`)."),
    answer("Except for `name`, all variables should be cleaned up.",
           correct = TRUE),
    allow_retry = TRUE
  )
)

Exercise 16

We should usually avoid using numbers as factor levels, because meaningful labels make tables and data visualisations much easier to read. To provide such labels we have to 'recode' the factor levels. In the famous_actors dataset we find two factors which are missing appropriate labels, i.e. gender and country_of_origin. Make the required adjustments considering the following lines from the coding booklet of this dataset:

- Gender: 1 stands for female and 0 stands for male.
- Country: 1 stands for USA, 2 stands for United Kingdom and 3 stands for Guatemala.


# We need to create an object to save the outcome and use the 'mutate()' function.
famous_actors_clean <-
  famous_actors %>%
  mutate(___)
# First we need to change the data type to 'factor'.
famous_actors_clean <-
  famous_actors %>%
  mutate(gender = as_factor(gender),
         country_of_origin = as_factor(country_of_origin)
         )
# Then we use the function 'fct_recode()' to replace the numbers
famous_actors_clean <-
  famous_actors %>%
  mutate(gender = as_factor(gender),
         country_of_origin = as_factor(country_of_origin),
         gender = fct_recode(___)
         )
# The same function is used to change the labels for 'country_of_origin'.
famous_actors_clean <-
  famous_actors %>%
  mutate(gender = as_factor(gender),
         country_of_origin = as_factor(country_of_origin),
         gender = fct_recode(gender,
                             "male" = "0",
                             "female" = "1"),
         country_of_origin = fct_recode(___)
         )
# Solution
famous_actors_clean <-
  famous_actors %>%
  mutate(gender = as_factor(gender),
         country_of_origin = as_factor(country_of_origin),
         gender = fct_recode(gender,
                             "male" = "0",
                             "female" = "1"),
         country_of_origin = fct_recode(country_of_origin,
                                        "USA" = "1",
                                        "United Kingdom" = "2",
                                        "Guatemala" = "3")
         )

# Inspect the results
glimpse(famous_actors_clean)
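
A quick way to confirm that the recoding worked is to look at the factor levels directly.

# The numbers should now be replaced by meaningful labels
levels(famous_actors_clean$gender)
levels(famous_actors_clean$country_of_origin)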

Dealing with missing values

set.seed(2345)
student_feedback <- tibble(teacher_rating = sample(1:10, size = 175, replace = TRUE),
                           student_rating = sample(1:10, size = 175, replace = TRUE))
student_feedback <- missForest::prodNA(student_feedback, noNA = 0.09)

Exercise 17

We made use of a new 360 feedback tool which allows students to evaluate a lecturer, who in return evaluates students as well. The dataset student_feedback includes the results. However, before digging deeper into the data we have to check whether there is any missing data we have to consider.

Inspect student_feedback and determine how much data is missing by using naniar::vis_miss().
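
In case you want to check your approach, here is a minimal sketch using the naniar package: vis_miss() plots where values are missing, and miss_var_summary() returns the share of missing values per variable.

library(naniar)

# Visualise missing values across the whole dataset
vis_miss(student_feedback)

# Summarise how many values are missing per variable
miss_var_summary(student_feedback)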


[Add a MC question here]

Exercise 18

(currently under development)

Exercise 19

(currently under development)

Latent constructs and their reliability

set.seed(1234)
happiness_at_work_high <-  sample(3:6, size = 150, replace = TRUE)

set.seed(1234)
happiness_at_work2_low <- sample(1:4, size = 150, replace = TRUE)

set.seed(1234)
happiness_family_high <-  sample(4:6, size = 150, replace = TRUE)

set.seed(1234)
happiness_family_low <- sample(1:3, size = 150, replace = TRUE)

set.seed(1234)
happiness_health_high <-  sample(4:6, size = 150, replace = TRUE)
set.seed(1234)
happiness_health_low <- sample(1:4, size = 150, replace = TRUE)

set.seed(1234)
stress_lots_of_work_high <-  sample(3:6, size = 150, replace = TRUE)
set.seed(1234)
stress_lots_of_work_low <- sample(1:4, size = 150, replace = TRUE)

set.seed(1234)
stress_cannot_keep_up_high <-  sample(3:6, size = 150, replace = TRUE)
set.seed(1234)
stress_cannot_keep_up_low <- sample(1:4, size = 150, replace = TRUE)

set.seed(1234)
stress_busy_schedule_high <-  sample(3:6, size = 150, replace = TRUE)
set.seed(1234)
stress_busy_schedule_low <- sample(1:4, size = 150, replace = TRUE)

happy_data <- tibble(age = sample(25:64, 300, replace = TRUE),
                     happiness_at_work = append(happiness_at_work_high,
                                                happiness_at_work2_low),
                     happiness_family = append(happiness_family_high,
                                               happiness_family_low),
                     happiness_health = append(happiness_health_high,
                                               happiness_health_low),
                     stress_lots_of_work = append(stress_lots_of_work_low,
                                                  stress_lots_of_work_high),
                     stress_cannot_keep_up = append(stress_cannot_keep_up_low,
                                                    stress_cannot_keep_up_high),
                     stress_busy_schedule = append(stress_busy_schedule_low,
                                                   stress_busy_schedule_high)
                     )

Exercise 20

We collected data about a team's level of happiness and stress, stored in the object happy_data. We are asked to report the average happiness and stress in the team. Inspect the dataset first and then answer the questions below.


quiz(
  question(
    "Which of the following statements apply?",
    answer("We need to compute the two latent variables each based on three questions asked in the questionnaire.",
           correct = TRUE),
    answer("We need to check the internal consistency before computing a latent variable.",
           correct = TRUE),
    answer("There is no need to compute a latent variable, because the sample is too small.",
           message = "The sample size does not affect the computation of latent variables."),
    answer("We need to consider latent variables if what we want to measure is not normally measured as a number.",
           correct = TRUE),
    answer("Often, the computation of latent variables implies taking the average score of each participant across multiple questions.",
           correct = TRUE),
    allow_retry = TRUE
  )
)

Exercise 21

In a first step, we need to check the internal consistency of those items, i.e. questions in the questionnaire that belong to happiness and stress.

Compute the Cronbach's $\alpha$ for those variables which reflect the latent construct happiness.

happy_data %>%
  ___
# We need to select the right variables
happy_data %>%
  select(happiness_at_work,
         happiness_family,
         happiness_health) %>%
  ___
# Solution: We use psych::alpha() to compute the Cronbach's alpha in R
happy_data %>%
  select(happiness_at_work,
         happiness_family,
         happiness_health) %>%
  psych::alpha()

Also, compute the Cronbach's $\alpha$ for those variables which reflect the latent construct stress.

happy_data %>%
  ___
# We first need to select the correct variables
happy_data %>%
  select(stress_lots_of_work,
         stress_cannot_keep_up,
         stress_busy_schedule) %>%
  ___
# Solution: We use the function alpha() from the 'psych' package
happy_data %>%
  select(stress_lots_of_work,
         stress_cannot_keep_up,
         stress_busy_schedule) %>%
  psych::alpha()
quiz(
  question(
    "How do you rate the level of internal consistency for `happiness` and `stress`?",
    answer("It is good, because alpha > 0.80",
           message = "The alpha score for stress does not lie above 0.8"),
    answer("It is good, because alpha > 0.70",
           correct = TRUE),
    answer("It is good, because alpha > 0.65",
           message = "It is necessary that alpha is at least 0.7."),
    answer("The Cronbach's alpha is not high enough.",
           message = "An alpha score of at least 0.7 would be enough."),
    allow_retry = TRUE
  )
)
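
If you only need the coefficient itself rather than the full output, psych::alpha() returns an object whose 'total' element contains the raw alpha. This is just a convenience sketch for the happiness items; the object name happiness_alpha is ours.

happiness_alpha <-
  happy_data %>%
  select(happiness_at_work,
         happiness_family,
         happiness_health) %>%
  psych::alpha()

# Extract the raw Cronbach's alpha from the result
happiness_alpha$total$raw_alpha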

Exercise 22

Even though the Cronbach's $\alpha$ looks promising for both latent variables, it is essential to also run a confirmatory factor analysis to ensure that these factors are independent latent variables.

Perform a confirmatory factor analysis and include both latent variables when specifying the model we have to test.


# We need to load the lavaan package to perform a CFA
library(lavaan)
# In a first step we need to define a model
library(lavaan)

model <- '
___
'
# We need to list all items and use '=~' to define a latent variable
library(lavaan)

model <- '
happiness =~
happiness_at_work +
happiness_family +
happiness_health
'
# The full specification of the required model.
library(lavaan)

model <- '
happiness =~
happiness_at_work +
happiness_family +
happiness_health

stress =~
stress_lots_of_work +
stress_cannot_keep_up +
stress_busy_schedule
'
# Perform CFA
library(lavaan)

model <- '
happiness =~
happiness_at_work +
happiness_family +
happiness_health

stress =~
stress_lots_of_work +
stress_cannot_keep_up +
stress_busy_schedule
'

fit <- cfa(model, data = happy_data)
# Solution
library(lavaan)

model <- '
happiness =~
happiness_at_work +
happiness_family +
happiness_health

stress =~
stress_lots_of_work +
stress_cannot_keep_up +
stress_busy_schedule
'

fit <- cfa(model, data = happy_data)

# Here we compute the fit indices to check model fit
fit_indices <- fitmeasures(fit)

# Optional if we want to only inspect certain indices
fit_indices %>%
  enframe() %>%
  filter(name == "cfi" |
         name == "srmr" |
         name == "rmsea") %>%
  mutate(value = round(value, 3))
quiz(
  question(
    "Based on the result of our CFA, which conclusions can we draw if we looked at the CFI, SRMR and RMSEA?",
    answer("The indicators suggest a good model fit and therefore we can compute the latent variables",
           message = "Unfortunately, the CFI is not as high as required, i.e. >= 0.95"),
    answer("The Root Mean Square Error of Approximation suggests a good fit our model.",
           correct = TRUE),
    answer("Two of the indicators suggest that our model fits our data well, but one of them is just a bit too low.",
           correct = TRUE),
    answer("The results of the CFA cannot be fully trusted, because our data set is too small.",
           message = "While sample size does play a role when performing a CFA, for the number of factors we are interested in, this is a large enough sample."),
    allow_retry = TRUE
  )
)
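
Instead of filtering fitmeasures() ourselves, lavaan can also print the fit measures as part of the model summary. This sketch assumes the object fit from the previous step.

library(lavaan)

# Report the fitted model together with its fit measures
summary(fit, fit.measures = TRUE, standardized = TRUE)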

Exercise 23

Considering the fairly good results of our CFA in combination with the good Cronbach's alpha scores, we need to undertake one final step: Compute the latent variables happiness and stress.

happy_data_latent <-
  happy_data %>%
  ___
# We want to compute a new score for each participant, i.e. for each row.
happy_data_latent <-
  happy_data %>%
  rowwise() %>%
  ___
# We need to compute the mean of the corresponding variables
# to obtain a combined score for the latent variables
happy_data_latent <-
  happy_data %>%
  rowwise() %>%
  mutate(___)
# Solution
happy_data_latent <-
  happy_data %>%
  rowwise() %>%
  mutate(happiness = mean(c(happiness_at_work,
                            happiness_family,
                            happiness_health)),
         stress = mean(c(stress_lots_of_work,
                         stress_cannot_keep_up,
                         stress_busy_schedule)))

glimpse(happy_data_latent)
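
As an alternative to spelling out every column, we could select the items by their common prefix with c_across(). This is only a sketch and assumes the happiness and stress items are the only columns starting with these prefixes; ungroup() removes the row-wise grouping afterwards.

happy_data_latent <-
  happy_data %>%
  rowwise() %>%
  mutate(happiness = mean(c_across(starts_with("happiness"))),
         stress = mean(c_across(starts_with("stress")))) %>%
  ungroup()

glimpse(happy_data_latent)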

