# Setup: load required packages and configure the learnr tutorial.
# (Statements were fused onto one line, which is invalid R; restored here.)
library(learnr)
library(r4np)
library(tidyverse)

# Hide code chunks in the rendered tutorial and enable fill-in-the-blank exercises
knitr::opts_chunk$set(echo = FALSE)
tutorial_options(exercise.blanks = TRUE)
# Quiz: ways of importing data into an RStudio session.
# Fix: added the missing closing quote after 'import_dataset()' in the feedback message.
quiz(
  question(
    "Which of the following ways can you import data into your RStudio session?",
    answer("The 'Import Dataset' button in the 'Environment' pane does the trick.", correct = TRUE),
    answer("I can drag and drop files into the 'Source' window.", message = "I am afraid it is not possible to drag and drop files. It would be nice, though."),
    answer("I can use the function 'import_dataset()' from the 'readr' package.", message = "Unfortunately, the `readr` package has no function called 'import_dataset()'."),
    answer("readr::read_csv('mydata.csv') imports data directly into the 'Environment' pane.", correct = TRUE),
    allow_retry = TRUE
  )
)
# Quiz: understanding delimiters in data files.
# Fix: typo "were" -> "where" in the correct answer's text.
quiz(
  question(
    "What is a delimiter in datasets?",
    answer("It limits the amount of data shown in the console.", message = "Not quite. Delimiters do not affect what is shown in the console."),
    answer("It enables computers to identify where a column starts and where it ends in our data frame.", correct = TRUE),
    answer("A comma is the delimiter in a '.csv' file.", correct = TRUE),
    answer("There is a pre-defined set of delimiters we have to use.", message = "You can define any character or symbol as a delimiter. The choice is yours."),
    allow_retry = TRUE
  )
)
How can we find out what kind of variables are included in the imdb_top_250
dataset? Change the code below accordingly.
# Exercise starter: the dataset to inspect
imdb_top_250

# You might want to take a 'glimpse' at the dataset.

# Solution
# Fix: the solution call was swallowed into the "# Solution" comment when the
# chunk was flattened; restored so the code actually runs.
glimpse(imdb_top_250)
How can we view the synopsis
(a column in imdb_top_250
)? Provide the correct code below.
# We have to use '$' to select a specific column/variable in a dataset

# Solution
# Fix: the solution expression was swallowed into the "# Solution" comment
# when the chunk was flattened; restored here.
imdb_top_250$synopsis
# Quiz: characteristics of good column/variable names.
# Fix: typo "nobodoy" -> "nobody" in the feedback message.
quiz(
  question(
    "What makes for good column/variable names?",
    answer("They should be as detailed as possible.", message = "While it is important a name captures the meaning of column/variable, we always should aim for being succinct, even if it means we have to sacrifice a bit on detail."),
    answer("They should be standardised, for example using lower-case only.", correct = TRUE),
    answer("If other people can easily make sense of them.", correct = TRUE),
    answer("They do not exceed a certain character limit.", message = "The length of names is relevant, but there is no particular character limit. Remember: Short is good, but nobody really defined what is 'short' and what is not."),
    allow_retry = TRUE
  )
)
# Install janitor if not installed yet.
# Fix: statements were fused onto one line (invalid R); also replaced the
# `... == FALSE` comparison with the idiomatic `!` negation.
if (!("janitor" %in% rownames(installed.packages()))) {
  install.packages("janitor")
}

# Create messy dataset with deliberately inconsistent column names
data <- tibble(
  ID_number = c("N006", "N007", "N008"),
  Gender_of_Participant = c("female", "male", "female"),
  emotional_resilience = c(4, 8, 10),
  Name = c("Kirk", "Bond", "The Rock")
)
A colleague gave us a data frame saved in the object data
. Let's first inspect data
using glimpse()
to see what it includes.
# Follow-up quiz about the messy `data` tibble; one call per argument line
# for readability. All question/answer strings are unchanged.
quiz(
  caption = "Follow up question",
  question(
    "What can we say about this data frame?",
    answer("The data frame is not correct.", message = "Technically, this is a fully functional dataset. However, there are other things we can improve."),
    answer("There are 3 variables and 4 observations", message = "This dataset has 4 variables and 3 observations. Be aware of the difference between variable (a column) and an observation (a row) in a dataset"),
    answer("The column names are difficult to read.", correct = TRUE),
    answer("We could use a package like `janitor` to clean this dataset.", correct = TRUE),
    allow_retry = TRUE
  )
)
With these insights, how can we clean the column names and save it in the new object data_clean
? Inspect the result with glimpse()
.
library(janitor)

# Clean the column names to make them more uniform
data_clean <- ___

# Inspect the results
# NOTE(review): this exercise scaffold was flattened onto one line, leaving a
# dangling `data_clean <-`; restored the blank (`___`) used by the other
# exercise chunks in this tutorial — confirm against the original .Rmd.
___
# The 'janitor' package has a function to 'clean names'

# The following function should help
# Fix: the code below was swallowed into the comment when the chunk was
# flattened; restored onto its own line.
clean_names(___)

# Solution
data_clean <- clean_names(data)
There are some more improvements we could make to data_clean
. For example, we could shorten the following column names:
id_number
to id
, and
gender_of_participant
to gender
.
Clean these column names using the dplyr
function rename()
and save the changes in the object data_clean
. Review the changes using glimpse()
.
# Create messy dataset (fused statements restored onto separate lines)
data <- tibble(
  ID_number = c("N006", "N007", "N008"),
  Gender_of_Participant = c("female", "male", "female"),
  emotional_resilience = c(4, 8, 10),
  Name = c("Kirk", "Bond", "The Rock")
)

# Standardise the column names as the starting point for this exercise
data_clean <- janitor::clean_names(data)
# Rename columns

# Review changes
# Fix: in every hint below the code was swallowed into the preceding comment
# when the chunks were flattened; restored onto separate lines.

# When using dplyr we can specify the dataset first.
data_clean <- data_clean %>% ___

# Then we can use the 'rename()' function.
data_clean <- data_clean %>% rename(___)

# Remember, we first need to specify the new and then the old name
data_clean <- data_clean %>% rename(new_name = old_name)

# Here is the example for the column 'id_number'
data_clean <- data_clean %>% rename(id = id_number)

# The solution for both columns
data_clean <- data_clean %>%
  rename(id = id_number,
         gender = gender_of_participant)
glimpse(data_clean)
# Quiz: data types in Social Sciences projects.
# Fixes: question ends with "?" instead of "."; typos "Valided"/"unvalidaded"
# corrected to "Validated"/"unvalidated" in the answer and its feedback.
quiz(
  question(
    "What types of data can we usually find in Social Sciences projects?",
    answer("Validated and unvalidated data.", message = "Validated/unvalidated data is not a data type."),
    answer("Nominal data, which refers to categorical data with no particular order.", correct = TRUE),
    answer("Quantitative data, which consists exclusively of numbers.", correct = TRUE),
    answer("Ordinal data, which can include categories and numbers to show their order.", message = "Ordinal data is categorical and its categories can be placed in a meaningful order."),
    allow_retry = TRUE
  )
)
# Quiz: interpreting column type codes (<chr>, <fct>, <dbl>, <int>, <lgl>).
# Fix: missing verb — "Logical variables always only two values" ->
# "Logical variables always have only two values".
quiz(
  question(
    "Which of the following statements are correct?",
    answer("`<chr>` and `<fct>` refer to columns in a dataset that are categorical.", correct = TRUE),
    answer("`<dbl>` and `<int>` refer to numeric variables and can be used interchangeably.", message = "It is true that `<dbl>` and `<int>` refer to numeric variables, but they are not interchangeable. <dbl> allows decimals."),
    answer("Logical variables always have only two values: `TRUE` or `FALSE`.", correct = TRUE),
    answer("When importing data with `readr`, factors are always imported as `<chr>`", correct = TRUE),
    allow_retry = TRUE
  )
)
# Create the small example dataset for the next quiz.
# Fix: column typo `study_experiece` -> `study_experience`, matching the name
# the follow-up quiz (and its feedback text) refers to.
students <- tibble(
  study_level = c("UG", "PG", "PG"),
  study_experience = c(32, 83, 95),
  team = c("Alpha", "Beta", "Gamma")
)
Inspect the dataset students
and answer the questions below.
# Quiz about the `students` dataset; reformatted with one argument per line.
# All question/answer strings are unchanged.
quiz(
  question(
    "What can we say about this dataset?",
    answer("There are three categorical variables as indicated by `<chr>`.", message = "There are only two categorical variables: `study_level` and `team`"),
    answer("Two of the variables should not be `<chr>`, but a factor `<fct>`.", correct = TRUE),
    answer("The variable 'study_experience' could be considered as an integer `<int>`.", correct = TRUE),
    answer("It is important to convert the categorical variables into factors `<fct>`.", correct = TRUE),
    allow_retry = TRUE
  )
)
Changing data types is an essential aspect of data wrangling and cleaning. Take a look at the dataset halloween
and change the data type of those variables that are not entirely correct.
# Inspect dataset

# Change data types
# Fix: restored line breaks — comments and code were fused onto one line.
halloween_clean <- halloween %>% ___

# Inspect the new dataset
# Fix: restored line breaks in the hints below — statements and comments were
# fused onto single lines when the chunks were flattened.
glimpse(halloween)
# We need to mutate the variable 'country' into a factor <fct>,
# because it is currently a character <chr>

glimpse(halloween)
halloween_clean <- halloween %>% mutate(___)

# Solution
glimpse(halloween)
halloween_clean <- halloween %>%
  mutate(country = as_factor(country))
The dataset gep
contains categorical and numerical data. We are asked to clean the dataset and ensure that data types are correct. First, take a look at the variables included in this data frame.
# Quiz about cleaning steps for the `gep` dataset; reformatted with one
# argument per line. All question/answer strings are unchanged.
quiz(
  question(
    "Which of the following steps should we take?",
    answer("'gender' and 'level_of_study' should be converted to `<fct>`.", correct = TRUE),
    answer("'age' needs to be converted to an integer `<int>`.", message = "While 'age' is an integer, it is not necessary to change to conduct further analysis."),
    answer("All numeric variables can be converted to an `<int>`, but it is not necessary.", correct = TRUE),
    answer("We can create a new object to save the changes we made.", correct = TRUE),
    allow_retry = TRUE
  )
)
Let's implement these steps, store the changes in the object gep_clean
and review the changes using glimpse()
.
# Solution: convert the categorical variables to factors and review the result.
# Fix: two statements were fused onto one line (invalid R); restored.
gep_clean <- gep %>%
  mutate(gender = as_factor(gender),
         level_of_study = as_factor(level_of_study))
glimpse(gep_clean)
# Create the example dataset of actors with numerically coded factors
# (gender and country_of_origin use codes explained later in the tutorial).
famous_actors <- tibble(
  name = c("Zendaya", "Will Poulter", "Jamie Lee Curtis", "Oscar Isaac"),
  gender = c(1, 0, 1, 0),
  country_of_origin = c(1, 2, 1, 3),
  birth_place = c("Oakland", "London", "Santa Monica", "Guatemala City")
)
We received a small dataset famous_actors
which contains information about commonly known actors working in Hollywood. However, some data cleaning is required to ensure data is correctly reflected in R for further processing. There are several steps we should perform.
First, we need to inspect the dataset to get an idea which variables we need to correct.
# Inspect the dataset 'famous_actors'
# Quiz about cleaning steps for `famous_actors`.
# Fix: "needs not further treatment" -> "needs no further treatment".
quiz(
  question(
    "Which of the following steps should we take?",
    answer("We need to correct the variable `gender`, because it is considered as `dbl` but should be a factor, i.e. `fct`.", correct = TRUE),
    answer("The variable `country_of_origin` is `dbl` and therefore cannot be used. We should remove it.", message = "We do not have to remove `country_of_origin`, but need to find out what these numbers stand for, i.e. which countries."),
    answer("The variable `birth_place` is an example of a `chr` variable and therefore needs no further treatment.", message = "The variable `birth_place` shows a limited number of categories and therefore qualifies as a factor (`fct`)."),
    answer("Except for `name`, all variables should be cleaned up.", correct = TRUE),
    allow_retry = TRUE
  )
)
We usually should try to avoid using numbers as factor levels to make it easier to read tables and data visualisations. Instead, we can provide meaningful labels. To do so, we have to 'recode' these factor levels.
In the famous_actors
dataset we find two factors which are missing appropriate labels, i.e. gender
and country_of_origin
. Make the required adjustments considering the following lines from the coding booklet of this dataset:
- Gender: 1
stands for female
and 0
stands for male
.
- Country: 1
stands for USA
, 2
stands for United Kingdom
and 3
stands for Guatemala
.
# Fix: in every hint below the code was swallowed into the preceding comment
# when the chunks were flattened; restored onto separate lines.

# We need to create an object to save the outcome and use the 'mutate()' function.
famous_actors_clean <- famous_actors %>% mutate(___)

# First we need to change the data type to 'factor'.
famous_actors_clean <- famous_actors %>%
  mutate(gender = as_factor(gender),
         country_of_origin = as_factor(country_of_origin))

# Then we use the function 'fct_recode()' to replace the numbers
famous_actors_clean <- famous_actors %>%
  mutate(gender = as_factor(gender),
         country_of_origin = as_factor(country_of_origin),
         gender = fct_recode(___))

# The same function is used to change the labels for 'country_of_origin'.
famous_actors_clean <- famous_actors %>%
  mutate(gender = as_factor(gender),
         country_of_origin = as_factor(country_of_origin),
         gender = fct_recode(gender, "male" = "0", "female" = "1"),
         country_of_origin = fct_recode(___))

# Solution
famous_actors_clean <- famous_actors %>%
  mutate(gender = as_factor(gender),
         country_of_origin = as_factor(country_of_origin),
         gender = fct_recode(gender, "male" = "0", "female" = "1"),
         country_of_origin = fct_recode(country_of_origin,
                                        "USA" = "1",
                                        "United Kingdom" = "2",
                                        "Guatemala" = "3"))

# Inspect the results
glimpse(famous_actors_clean)
# Simulate 360-degree feedback ratings, then inject ~9% missing values.
# Fix: statements were fused onto one line (invalid R); restored.
set.seed(2345)
student_feedback <- tibble(
  teacher_rating = sample(1:10, size = 175, replace = TRUE),
  student_rating = sample(1:10, size = 175, replace = TRUE)
)
student_feedback <- missForest::prodNA(student_feedback, noNA = 0.09)
We made use of a new 360 feedback tool which allows students to evaluate a lecturer, who in return evaluates students as well. The dataset student_feedback
includes the results. However, before digging deeper into the data we have to check whether there is any missing data we have to consider.
Inspect student_feedback
and determine how much data is missing by using naniar::vis_miss()
.
[Add a MC question here]
(currently under development)
(currently under development)
# Simulate item responses for two latent constructs (happiness and stress);
# "high" and "low" halves are concatenated so the constructs correlate.
# Fixes: restored line breaks (statements were fused onto one line); corrected
# the bug where stress_busy_schedule reused stress_lots_of_work_high instead
# of stress_busy_schedule_high (which was created but never used); removed a
# leftover "# CONTINUE FROM HERE" work-in-progress marker.
set.seed(1234)
happiness_at_work_high <- sample(3:6, size = 150, replace = TRUE)
set.seed(1234)
happiness_at_work2_low <- sample(1:4, size = 150, replace = TRUE)
set.seed(1234)
happiness_family_high <- sample(4:6, size = 150, replace = TRUE)
set.seed(1234)
happiness_family_low <- sample(1:3, size = 150, replace = TRUE)
set.seed(1234)
happiness_health_high <- sample(4:6, size = 150, replace = TRUE)
set.seed(1234)
happiness_health_low <- sample(1:4, size = 150, replace = TRUE)
set.seed(1234)
stress_lots_of_work_high <- sample(3:6, size = 150, replace = TRUE)
set.seed(1234)
stress_lots_of_work_low <- sample(1:4, size = 150, replace = TRUE)
set.seed(1234)
stress_cannot_keep_up_high <- sample(3:6, size = 150, replace = TRUE)
set.seed(1234)
stress_cannot_keep_up_low <- sample(1:4, size = 150, replace = TRUE)
set.seed(1234)
stress_busy_schedule_high <- sample(3:6, size = 150, replace = TRUE)
set.seed(1234)
stress_busy_schedule_low <- sample(1:4, size = 150, replace = TRUE)

happy_data <- tibble(
  age = sample(25:64, 300, replace = TRUE),
  happiness_at_work = append(happiness_at_work_high, happiness_at_work2_low),
  happiness_family = append(happiness_family_high, happiness_family_low),
  happiness_health = append(happiness_health_high, happiness_health_low),
  stress_lots_of_work = append(stress_lots_of_work_low, stress_lots_of_work_high),
  stress_cannot_keep_up = append(stress_cannot_keep_up_low, stress_cannot_keep_up_high),
  stress_busy_schedule = append(stress_busy_schedule_low, stress_busy_schedule_high)
)
We collected data about a team's level of happiness and stress stored in the object happy_data
. We are asked to report the average happiness
and stress in the team. Inspect the dataset first and then answer the questions below.
# Quiz about computing latent variables; reformatted with one argument per
# line. All question/answer strings are unchanged.
quiz(
  question(
    "Which of the following statements apply?",
    answer("We need to compute the two latent variables each based on three questions asked in the questionnaire.", correct = TRUE),
    answer("We need to check the internal consistency before computing a latent variable.", correct = TRUE),
    answer("There is no need to compute a latent variable, because the sample is too small.", message = "The sample size does not affect the computation of latent variables."),
    answer("We need to consider latent variables if what we want to measure is not normally measured as a number.", correct = TRUE),
    answer("Often, the computation of latent variables implies taking the average score of each participant across multiple questions.", correct = TRUE),
    allow_retry = TRUE
  )
)
In a first step, we need to check the internal consistency of those items, i.e. questions in the questionnaire that belong to happiness
and stress
.
Compute the Cronbach's $\alpha$ for those variables which reflect the latent construct happiness
# Exercise starter
happy_data %>%
  ___

# We need to select the right variables
# Fix: hint/solution code below was swallowed into the comments when the
# chunks were flattened; restored onto separate lines.
happy_data %>%
  select(happiness_at_work, happiness_family, happiness_health) %>%
  ___

# Solution: We use psych::alpha() to compute the Cronbach's alpha in R
happy_data %>%
  select(happiness_at_work, happiness_family, happiness_health) %>%
  psych::alpha()
Also, compute the Cronbach's $\alpha$ for those variables which reflect the latent construct stress
.
# Exercise starter
happy_data %>%
  ___

# We first need to select the correct variables
# Fix: hint/solution code below was swallowed into the comments when the
# chunks were flattened; restored onto separate lines.
happy_data %>%
  select(stress_lots_of_work, stress_cannot_keep_up, stress_busy_schedule) %>%
  ___

# Solution: We use the function alpha() from the 'psych' package
happy_data %>%
  select(stress_lots_of_work, stress_cannot_keep_up, stress_busy_schedule) %>%
  psych::alpha()
# Quiz about interpreting the Cronbach's alpha results; reformatted with one
# argument per line. All question/answer strings are unchanged.
quiz(
  question(
    "How do you rate the level of internal consistency for `happiness` and `stress`?",
    answer("It is good, because alpha > 0.80", message = "The alpha score for stress does not lie above 0.8"),
    answer("It is good, because alpha > 0.70", correct = TRUE),
    answer("It is good, because alpha > 0.65", message = "It is necessary that alpha is at least 0.7."),
    answer("The Cronbach's alpha is not high enough.", message = "An alpha score of at least 0.7 would be enough."),
    allow_retry = TRUE
  )
)
Even though the Cronbach's $\alpha$ looks promising for both latent variables, it is essential to also run a confirmatory factor analysis to ensure that these factors are independent latent variables.
Perform a confirmatory factor analysis and include both latent variables when specifying the model we have to test.
# Fix: throughout these hints, `library(lavaan)` and other code was swallowed
# into the preceding comments when the chunks were flattened; restored.

# We need to load the lavaan package to perform a CFA
library(lavaan)

# In a first step we need to define a model
library(lavaan)
model <- '
___
'

# We need to list all items and use '=~' to define a latent variable
library(lavaan)
model <- '
happiness =~ happiness_at_work + happiness_family + happiness_health
'

# The full specification of the required model.
library(lavaan)
model <- '
happiness =~ happiness_at_work + happiness_family + happiness_health
stress =~ stress_lots_of_work + stress_cannot_keep_up + stress_busy_schedule
'

# Perform CFA
library(lavaan)
model <- '
happiness =~ happiness_at_work + happiness_family + happiness_health
stress =~ stress_lots_of_work + stress_cannot_keep_up + stress_busy_schedule
'
fit <- cfa(model, data = happy_data)

# Solution
library(lavaan)
model <- '
happiness =~ happiness_at_work + happiness_family + happiness_health
stress =~ stress_lots_of_work + stress_cannot_keep_up + stress_busy_schedule
'
fit <- cfa(model, data = happy_data)

# Here we compute the fit indices to check model fit
fit_indices <- fitmeasures(fit)

# Optional if we want to only inspect certain indices
fit_indices %>%
  enframe() %>%
  filter(name == "cfi" | name == "srmr" | name == "rmsea") %>%
  mutate(value = round(value, 3))
# Quiz about interpreting CFA fit indices.
# Fix: "suggests a good fit our model" -> "suggests a good fit for our model".
quiz(
  question(
    "Based on the result of our CFA, which conclusions can we draw if we looked at the CFI, SRMR and RMSEA?",
    answer("The indicators suggest a good model fit and therefore we can compute the latent variables", message = "Unfortunately, the CFI is not as high as required, i.e. >= 0.95"),
    answer("The Root Mean Square Error of Approximation suggests a good fit for our model.", correct = TRUE),
    answer("Two of the indicators suggest that our model fits our data well, but one of them is just a bit too low.", correct = TRUE),
    answer("The results of the CFA cannot be fully trusted, because our data set is too small.", message = "While sample size does play a role when performing a CFA, for the number of factors we are interested in, this is a large enough sample."),
    allow_retry = TRUE
  )
)
Considering the fairly good results of our CFA in combination with the good Cronbach's alpha scores, we need to undertake one final step: Compute the latent variables happiness
and stress
.
# Exercise starter
happy_data_latent <- happy_data %>% ___

# We want to compute a new score for each participant, i.e. for each row.
# Fix: hint/solution code below was swallowed into the comments when the
# chunks were flattened; restored onto separate lines.
happy_data_latent <- happy_data %>%
  rowwise() %>%
  ___

# We need to compute the mean of the corresponding variables
# to obtain a combined score for the latent variables
happy_data_latent <- happy_data %>%
  rowwise() %>%
  mutate(___)

# Solution
happy_data_latent <- happy_data %>%
  rowwise() %>%
  mutate(happiness = mean(c(happiness_at_work, happiness_family, happiness_health)),
         stress = mean(c(stress_lots_of_work, stress_cannot_keep_up, stress_busy_schedule)))
glimpse(happy_data_latent)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.