# Ensure that libraries are loaded.
library(tidyverse) # dplyr, tidyr, ggplot2, tibble, and the packages below
# includes stringr:: for string manipulation
# includes forcats:: for factor manipulation
# includes readr:: for (importing and) parsing data
# does not include broom:: for presenting statistical results
library(lubridate) # for date&time manipulation
# New packages (must be installed before taking this tutorial)
# library(tm) # to create & handle text corpora
# library(wordcloud) # for word clouds
# library(RColorBrewer) # color palettes (for word clouds; also loaded by wordcloud package)
# library(topicmodels) # topic modelling

# Ensure that the data is loaded for the remainder of this tutorial.
# Both data sets are copied from the UsingRTutorials package namespace:
# - Brexit: used in the factor, date-time, and text sections below.
# - surveyweek1: used in the section on changing variable names.
Brexit <- UsingRTutorials::Brexit
surveyweek1 <- UsingRTutorials::surveyweek1


Tidying tasks (reminder)

For your complex datasets, ask yourself one of the following:

  1. Does the data set contain info on more than one type of unit?

    • Or: Is the same information repeated in different rows?
    • If so, transfer the info on each type of unit to a separate table.
  2. Do different variables address the same information?

    • If so, gather (pivot_longer()) repeated information into one variable.
  3. Is there a variable with values that should be variables by themselves?

    • If so, spread (pivot_wider()) the values into variables.
  4. Does a variable contain more than one piece of information?

    • If so, separate the pieces into different variables.


Character variables

Example data set Brexit: 600 articles about Brexit from the (former) news website nujij.nl.

Create a frequency table of the variable `category` in the `Brexit` dataset.

# Use the count() function and a pipe.
Brexit %>% count(category)
  correct = "That wasn't hard, was it?" 
  incorrect = ""

Note the order of values in this frequency table: alphabetical.

In addition, note the negligible number of observations for media, etc.

Character variable in statistical analysis

Below are the results of a regression model predicting the number of votes for a post from the post category.

# Linear regression: more in Session 6.
# Regress the number of votes on the post category and print the coefficient
# table. kable() is called as knitr::kable() because knitr is not attached
# with library() in the setup code shown in this file; this matches the
# namespace-qualified calls to broom:: and kableExtra:: in the same pipe.
lm(votes ~ category, data = Brexit) %>%
  broom::tidy() %>% # retrieve the main results as a tibble
  knitr::kable(
    digits = c(0, 2, 2, 2, 3), # one entry per table column
    col.names = c("Parameter", "b", "SE", "t", "p"),
    align = "lrrrr"
  ) %>%
  kableExtra::kable_classic(full_width = FALSE)

question("What is the reference category here?",
  answer("Algemeen", correct = TRUE),
__Programming Tip__ - Having the values in an order other than alphabetical is the main reason for using a factor instead of a character variable.

Create a factor

Create a factor called `cat_fact` from the variable `category` and add it to tibble `Brexit`.
Brexit <- Brexit %>% 
# Use mutate().
Brexit <- Brexit %>% mutate()
# Use the (base::) `factor()` function.
factor(x = )
Brexit <- Brexit %>% mutate(cat_fact = factor(x = category))
  correct = "" 
  incorrect = ""

Have a close look at the description of the new factor:

# Info shown in Environment tab if you work in RStudio.

Change the first factor category

Use the previously created factor `cat_fact` to add a new factor called `cat_new` to tibble `Brexit` in which `Economie` (_Economy_) is the first category.

# Use `fct_relevel()`, which is part of the `forcats` package, 
# which is loaded by the `tidyverse` package.
# Oh yes, use mutate() again. You should know that by now.
Brexit <- Brexit %>% 
Brexit <- Brexit %>% mutate(cat_new = fct_relevel(cat_fact, "Economie"))
  correct = "" 
  incorrect = "There are more ways of getting the result but we want you to use the function you learned in the book."

Now check that `Economie` is the reference category in a regression model.
# Recreate the factors cat_fact and cat_new (built in the two previous
# exercises) before fitting the regression model.
Brexit <- Brexit %>%
  mutate(
    cat_fact = factor(x = category),
    # move Economie to the front so it becomes the reference category
    cat_new = fct_relevel(cat_fact, "Economie")
  )
# Adapt the code for a regression model from a previous exercise. 
# Use only the `lm()` function, which will send the results to the screen. 
# That suffices for checking the reference category.
__Hint:__ Regression function: lm(y ~ x, data = ).
lm(votes ~ cat_new, data = Brexit)
  correct = "Details on statistical analysis in Session 6." 
  incorrect = ""


Recode factor levels

Some categories contain very few posts: `Media`, `Overig`, `Showbizz`, and `Wetenschap`. Merge these with category `Algemeen` (_General_) in a new variable `cat_fact2` in tibble `Brexit`. Inspect the result.

# Use variable `cat_fact` as your starting point.
# Use function `fct_recode()`.
mutate(cat_fact2 = fct_recode(cat_fact, ))
Brexit <- Brexit %>% mutate(cat_fact2 = fct_recode(cat_fact, Algemeen = "Media", Algemeen = "Overig", Algemeen = "Showbizz", Algemeen = "Wetenschap"))
  correct = "In the `fct_recode()`function, the new value is to the left, the old value is to the right. As in y <- x." 
  incorrect = "Perhaps your order of factor values is different but your result can be OK. The code checker can only deal with one order. Poor soul that it is."

Grouping a numeric variable into a factor

Unfortunately, variable reactions is not numeric.

Use function `parse_number()` to create a new numeric variable `react_num` in tibble `Brexit`, which gives us the number of reactions to a post. Check the results with a frequency table of `reactions` - `react_num` combinations.
Brexit <- Brexit %>% mutate( ____ )
Brexit %>% count()
# Remember the structure of mutate:
mutate(new_var = function(old_var))
# Function `parse_number()` is part of the `readr` package, which is loaded by
# the `tidyverse` package.
# Use help on function parse_number() if you need it.
# Select two variables to be counted in count().
Brexit <- Brexit %>% mutate(react_num = parse_number(reactions))
Brexit %>% count(reactions, react_num)
  correct = "We use `parse_number()` to extract the (first) number from a string. Your first string manipulation in this tutorial! More to follow." 
  incorrect = "Use tidyverse code with piping! And don't forget to adapt the second line of code."

Group the number of reactions in three bins with (nearly) the same number of cases and store this variable as a factor named `react_cat` in tibble `Brexit`.

# There are two steps. In the first step, bin the numeric variable.
# In the second step, create a factor from the bins. 
# The two steps can be performed in one `mutate()` function. It is even possible
# to use the same name for the new variable in both steps.
  react_cat = ntile(react_num, 3), 
  react_cat = factor()
Brexit <- Brexit %>% mutate(react_cat = ntile(react_num, 3), react_cat = factor(react_cat, levels = c(1, 2, 3), labels = c("low", "medium", "high")))
  correct = "" 
  incorrect = "Note the `levels` (sets the order) and `labels` (sets the label/value) arguments of the `factor()` function."

The three reaction bins should be labeled "low", "medium", "high". But the substantive order from low to high does not match the alphabetical order. The bins variable must therefore be a factor.

Plot a bar chart of the factor representing the reactions bins. Are the bins in the desired order?
Brexit <- Brexit %>% mutate(react_cat = ntile(parse_number(reactions), 3), react_cat = factor(react_cat, levels = c(1, 2, 3), labels = c("low", "medium", "high")))

# Use ggplot() and geom_bar().
ggplot(data = Brexit, mapping = aes(x = react_cat)) + geom_bar()
  correct = "The bins are in the right order: low - medium - high. And yes indeed, all three bins have the same number of observations (posts)." 
  incorrect = "Maybe you specified aes(x) in the geom_bar() step. That is OK.
__Programming Tip__ - An ordered factor assigns ordinal measurement level. In a regression model, no ordinary dummy variables are created for an ordered factor. So avoid ordered factors, unless you really need an ordinal variable.

Dates and Times

Date-time data object

Example: Publication date of post on nujij.nl in tibble Brexit.

# Show the structure (type and first values) of the publication date variable.
# str() takes no date format; the format string belongs to the parse functions
# used in the exercises below.
str(Brexit$pubDate)

Date plus time of day is a character string here.

To work with dates and times change to POSIXct data type.

String to date-time

Create a date-time variable `pubDateTime` in tibble `Brexit` using a parse function in package `readr` (p. 134 {Ch. 11.3.4}). Check the results with a frequency table.
# Create and add the date-time variable.
Brexit <- Brexit %>% ____
# Check the results
Brexit %>% _____
# The parse function is `parse_datetime()`.
parse_datetime(pubDate, format = "")
# Carefully specify the date and time formatting codes. 
# See help for `strptime()` under __Details__.
Brexit <- Brexit %>% mutate(pubDateTime = parse_datetime(pubDate, format = "%B %d, %Y %H:%M:%S"))
Brexit %>% count(pubDate, pubDateTime)

Now do the same with the `lubridate` package (p. 239 {Ch. 16.2.1}), storing the results as a new variable `pubDateTime2` in tibble `Brexit`.
Brexit <- Brexit %>% 
# Select the correct function. Check out help for `ymd_hms()`.
Brexit <- Brexit %>% mutate(pubDateTime2 = mdy_hms(pubDate))
  correct = "Again, compare the new variable to the original publication date variable: Is the new variable correct?" 
  incorrect = ""
__Programming Tip__ - Especially when you let `readr` guess the date and time format, the results can be wrong. Always compare the new date-time variable to the original string variable.

Retrieve info from date-time

It is easy to get the date from a date-time (POSIXct) variable with the lubridate package.

Add a variable `date` to the `Brexit` tibble; derive it from the recently created variable `pubDateTime`.
Brexit <- Brexit %>% 
# How about the `date()` function in package lubridate?
Brexit <- Brexit %>% mutate(date = date(pubDateTime))
  correct = "" 
  incorrect = ""

Inspect the dates of the posts with a frequency polygon.
Brexit <- Brexit %>% mutate(date = date(parse_datetime(pubDate, format = "%B %d, %Y %H:%M:%S")))

# Pay attention to the `stat` argument in `geom_freqpoly()`.
# Look the value up under __Computed Variables__ in the RStudio help on
# `geom_freqpoly()`.
Brexit %>% ggplot(aes(x = date)) + geom_freqpoly(stat = "count")
  correct = "If you do not use the stat = 'count' argument, geom_freqpoly bins the dates, so you won't get counts for separate days." 
  incorrect = "The aesthetics for the horizontal axis can be specified in the `ggplot()` function or in the `geom_freqpoly()` geom. Both are OK."

It is also easy to get the weekday with the lubridate package.

Use the `pubDateTime` variable to calculate the weekday at which a post was published as a new variable `weekday` containing the names of the weekdays. Store it in the `Brexit` tibble and inspect a frequency table of the `weekday` variable.
# This time, start from scratch...
__Hint:__ How about the `wday()` function?
Brexit %>% mutate(weekday = wday(pubDateTime, label=TRUE)) %>% select(weekday)
  correct = "" 
  incorrect = ""

Brexit  %>% mutate(weekday = lubridate::wday(parse_datetime(pubDate, "%B %d, %Y %H:%M:%S"), label=TRUE)) %>% ggplot(aes(weekday)) + geom_bar()

Variable weekday must be a factor. How can you tell?

Apply this trick to the `pubDateTime` variable in tibble `Brexit` to create a variable `time`. Show the variable as an area plot; don't save it. Put all code in one pipe!
Brexit %>% 
  mutate(____) %>%
# Use the `update()` function.
# Use `geom_area()`.
  + geom_area()
# And use `scale_x_datetime` to set the labels of the horizontal axis to hour.
  + geom_area()
  + scale_x_datetime()
Brexit %>% mutate(time = update(pubDateTime, yday = 1)) %>% ggplot(aes(time)) + geom_area(stat = "bin", bins = 30) + scale_x_datetime(date_labels = "%H")
  correct = "" 
  incorrect = ""

Calculating with date-time

Subtract the `pubDateTime` of the previous post from the `pubDateTime` of the current post and store the result as variable `lapse` in tibble `Brexit`.
Brexit <- Brexit %>% 
  arrange(____) %>%
# Sort the data to be sure that they are in chronological order before you
# calculate the time lapse.
# Use the `lag()` function.
Brexit <- Brexit %>% arrange(pubDateTime) %>% mutate(lapse = pubDateTime - lag(pubDateTime, n = 1L))
  correct = "" 
  incorrect = ""

# Show publication date-time and lapse for first few observations.
# The date-time variable is parsed again here, so this code does not rely on
# pubDateTime having been stored in an earlier exercise.
Brexit %>%
  mutate(pubDateTime = parse_datetime(pubDate, "%B %d, %Y %H:%M:%S")) %>%
  # chronological order is required before differencing with lag()
  arrange(pubDateTime) %>%
  mutate(lapse = pubDateTime - lag(pubDateTime, n = 1L)) %>%
  select(pubDateTime, lapse) %>%
  # keep only the first few rows; the first lapse is NA (no previous post)
  head()
Show the difference in days (dates) between subsequent posts (new variable `lapse2`) for the first six posts in a table.
Brexit %>% 
  arrange(____) %>% 
  mutate(____) %>% 
  select(___) %>% 
# Don't save the new variable lapse2, just show it in a table.
# Show only date and lapse2 in the table.
# Use slice() to select the first six rows.
Brexit %>% arrange(date) %>% mutate(lapse2 = date - lag(date, n = 1L)) %>% select(date, lapse2) %>% slice(1:6)
  correct = "" 
  incorrect = ""

Subtracting two date-times:

We don't like unpredictable results!

Data type duration:

__Programming Tips__ - Work with data type `duration` instead of `difftime` unless you have to take into account daylight saving and leap seconds. - Note: For minutes, hours, days, and weeks, a `duration` can be divided (ignoring leap seconds) by the appropriate number of seconds.


String manipulation

Main applications:

  1. Change string into another type (factor, date, number) that is more convenient for analysis.
    • Examples in previous topics of this tutorial (string to factor or date-time).
  2. Query and quantify natural text contents.
    • Find texts with particular contents.
    • Change content characteristics into variables.

Topic 2 is especially relevant for students working on a complex data set including text (natural language): Alcohol Posts and Use, German Chancellor Debate.

Change variable names

Friends & Families weekly survey: nasty variable names. Use rename()?


Objective: Simplify the variable name to the date for which the sad/depressed score was entered.

__Programming Tip__ - If possible, first create code that works for one example. Next, extend the code to deal with all cases.
# Get a variable name as a character variable with the names() function.
# [4] selects item 4 from the vector of variable names

Manipulate one variable name

Step 1: Remove all up to (and including) the first " - ".

Run the below code and inspect the intermediary results to check that you understand what each line of code does. For example, why is there `+ 1` within the `str_sub()` function?
# Store variable name in a data object, so we can use it.
varname <- names(surveyweek1)[4]
# Show result to check that the code works.
varname # Intermediary result. Comment out in your Data Project code!

# Find the first position of " - ".
# Show results for understanding before using it below.
str_locate(varname, " - ") #start and end position!
# Intermediary result. Comment out in your Data Project code!

# Select everything after the end position of " - ".
varname <- str_sub(varname, start = str_locate(varname, " - ")[2] + 1)

# Show result to check that the code works.
varname # Intermediary result. Comment out in your Data Project code!

The book focuses on finding and replacing particular strings/words:

We now have:

str_sub(names(surveyweek1)[4], start = str_locate(names(surveyweek1)[4], " - ")[2] + 1)
varname <- str_sub(names(surveyweek1)[4], start = str_locate(names(surveyweek1)[4], " - ")[2] + 1)

Step 2: Get rid of the weekday name, so remove all up to (and including) ", ".

Run the below code and inspect the intermediary results to check that you understand what each line of code does.
# Select substring after the end position of ", ".
varname <- str_sub(varname, start = str_locate(varname, ", ")[2] + 1)
# Show result: see the difference. 
varname # Intermediary result. Comment out in your Data Project code!

varname <- str_sub(names(surveyweek1)[4], start = str_locate(names(surveyweek1)[4], " - ")[2] + 1)
varname <- str_sub(varname, start = str_locate(varname, ", ")[2] + 1)

Step 3: Get rid of the "Sad or depressed" part, so remove all from (and including) " - ".

Run the below code and inspect the intermediary results to check that you understand what each line of code does.
# Show the name that we start with.
# Select substring up to start position of " - ".
varname <- str_sub(varname, end = str_locate(varname, " - ")[1] - 1)
# Show result: see the difference.

This is the variable name that we were after.

__Programming Tip__ - It is good practice to check intermediary results, so you know that your code performs as expected. - It is equally good practice to comment out code that is only used for checking intermediary results when the code is definitive.

(for code experts...)

Note that the `start` and `end` arguments can be combined in `str_sub()`. Steps 2 and 3 can thus be done in one command.

Write the `str_sub()` function call that combines Steps 2 and 3.
str_sub(varname, )
__Hint:__ You should be able to figure this out. Use help on the `str_sub()` and `str_locate()` functions.
str_sub(varname, start = str_locate(varname, ", ")[2] + 1, end = str_locate(varname, " - ")[1] - 1)
  correct = "You added the `end` argument at the correct position." 
  incorrect = ""

Manipulate all variable names

Let us apply the commands to all relevant variables (sequence 4 to 10).

Run the below code. What do `names()` and `str_locate()` produce now?
# Get all selected variable names.
varname <- names(surveyweek1)[4:10] 
# Check result.

# Get the start and end positions.
str_locate(varname, " - ")

# Get all selected variable names.
varname <- names(surveyweek1)[4:10]
# Get the end positions from the matrix.
str_locate(varname, " - ")[,2]

Adjust the commands that we have created for changing one variable name to changing the names of all relevant variables.
# Store variable name in a data object, so we can use it.
varname <- names(surveyweek1)[4]
# Select everything after the end position of " - ".
varname <- str_sub(varname, start = str_locate(varname, " - ")[2] + 1)
# Select substring after the end position of ", ".
varname <- str_sub(varname, start = str_locate(varname, ", ")[2] + 1)
# Select substring up to start position of " - ".
varname <- str_sub(varname, end = str_locate(varname, " - ")[1] - 1)
# Check result.
# Start selecting all variable names instead of only the first.
varname <- names(surveyweek1)[4:10]
# Check intermediate steps, as we have done before. 
# For example: What does str_locate() yield now?
str_locate(varname, " - ")
# Instead of a single end position, we need a column of end positions.
# Check out the difference between:
str_locate(varname, " - ")[2]
str_locate(varname, " - ")[,2]
# A comma makes a big difference...
  fail_if(~ identical(.result, "March 8"), "Did you select variables 4 to 10 in the first command?"),
  fail_if(~ length(.result) == 7 && .result[2] == " March ", "Did you use columns instead of single values for start and end positions?"),
  pass_if(~ identical(.result, c("March 8", "March 9", "March 10", "March 11", "March 12", "March 13", "March 14")), "You have selected all seven variables (`[4:10]`) and the correct columns of start (`[,1]` instead of `[1]`) and end positions (`[,2]` instead of `[2]`).")

Once we have the correct variable names, we can use them to replace the names in the surveyweek1 tibble.

# Store variable name in a data object, so we can use it.
varname <- names(surveyweek1)[4:10]
# Select everything after the end position of " - ".
varname <- str_sub(varname, start = str_locate(varname, " - ")[,2] + 1)
# Select substring after the end position of ", ".
varname <- str_sub(varname, start = str_locate(varname, ", ")[,2] + 1)
# Select substring up to start position of " - ".
varname <- str_sub(varname, end = str_locate(varname, " - ")[,1] - 1)
# Replace variable names in tibble.
names(surveyweek1)[4:10] <- varname
# Inspect results

Tidyverse alternative

Because the variable names are so well structured in this particular example, we can achieve the same with the tidyverse separate() function.

Inspect the results of each step in the pipe with `View()`. Note that the View window may be hidden behind another screen.
# tidyverse requires a data frame
varnames <- tibble(name = names(surveyweek1)[4:10]) %>%
  # split each name into three parts
  separate(name, into = c("humbug", "date", "topic"), sep = " - ") %>%
  # split the date into weekday and date
  separate(date, into = c("weekday", "date"), sep = ", ") %>%
  # extract the date variable as a vector; names<- needs a character vector,
  # not a tibble
  pull(date)
# Replace variable names in tibble.
names(surveyweek1)[4:10] <- varnames
# Show result.

Fancy Stuff

Natural text such as the header or body text of posts is used for classifying texts (topic modelling), for example, using sentiment analysis, or investigating relations between words, e.g., as visualized in a word cloud.

Word cloud

Let us use this package and the wordcloud package to create some word clouds.

Use help and inspect intermediary results to understand what happens here.
# Load the required packages.
library(tm) # to create & handle text corpora
library(wordcloud) # for word clouds
# Create a corpus for the headings (titles) of the Brexit articles.
# tm helpers are namespace-qualified, consistent with the tm:: calls below.
docs <- Brexit %>%
  # extract the titles as a vector
  pull(title) %>%
  # use tm functions to create a corpus
  tm::VectorSource() %>%
  tm::Corpus() %>%
  # use tm functions to clean the data
  tm::tm_map(tm::removeNumbers) %>% # remove numbers (assuming they are not informative)
  tm::tm_map(tm::removePunctuation) %>% # remove punctuation (assuming it is not informative)
  tm::tm_map(tm::stripWhitespace) %>% # remove blanks
  tm::tm_map(tm::content_transformer(tolower)) %>% # only lower case letters
  tm::tm_map(tm::removeWords, tm::stopwords("dutch")) # remove Dutch stopwords
# Create our first wordcloud.
# The corpus is passed as the first argument; no freq argument is supplied.
wordcloud::wordcloud(
  docs,
  random.order = FALSE,
  colors = RColorBrewer::brewer.pal(11, "Spectral")
)

Play around with the selection of words and the arguments of the `wordcloud()` function.
# Create a corpus for the headings (titles) of the Brexit articles.
# All tm helpers are namespace-qualified so this chunk does not depend on
# tm being attached with library().
docs <- Brexit %>%
  # extract the titles as a vector
  pull(title) %>%
  # use tm functions to create a corpus
  tm::VectorSource() %>%
  tm::Corpus() %>%
  # use tm functions to clean the data
  tm::tm_map(tm::removeNumbers) %>% # remove numbers (assuming they are not informative)
  tm::tm_map(tm::removePunctuation) %>% # remove punctuation (assuming it is not informative)
  tm::tm_map(tm::stripWhitespace) %>% # remove blanks
  tm::tm_map(tm::content_transformer(tolower)) %>% # only lower case letters
  tm::tm_map(tm::removeWords, tm::stopwords("dutch")) # remove Dutch stopwords
docs %>%
  # remove word "brexit"
  tm::tm_map(tm::removeWords, c("brexit")) %>%
  # create wordcloud; the corpus is piped in as the first argument
  wordcloud::wordcloud(
    random.order = FALSE,
    colors = RColorBrewer::brewer.pal(11, "Spectral")
  )

Topic modelling


The tm package can create document-term-matrices from a text corpus.

The code below identifies two topics in the Brexit texts and shows these as two word clouds.

Play around with the number of topics in the `LDA()` function. Note that the code only displays the first two topics.
# The topicmodels and tm functions are called with their namespace (pkg::),
# so the packages must be installed but need not be attached.
# Create a document-term-matrix from the corpus.
dtm <- tm::DocumentTermMatrix(docs)
# Run a Latent Dirichlet Allocation topic model with 2 topics.
# The random seed is set through the `control` list; `seed` is not a
# documented direct argument of LDA(), so passing it directly presumably has
# no effect. Change or drop the seed to see how much the solution varies
# between runs.
lda <- topicmodels::LDA(dtm, k = 2, control = list(seed = 234))
# Calculate the probabilities that a text or word belongs to Topic 1 or Topic 2.
pred <- topicmodels::posterior(lda, dtm)

# Show word clouds of terms that are more strongly related to one topic than to the other.
# First, create a tibble with the ratio of the probability that a term links
# with Topic 2 over the probability that a term links with Topic 1.
probRatio <- pred$terms %>%
  as_tibble() %>%
  rownames_to_column(var = "topic") %>% # preserve topic number as variable
  # turn term variables into one variable
  pivot_longer(!topic, names_to = "term", values_to = "prob") %>%
  # calculate the probability ratio within each term:
  # after sorting, lag() on the topic 2 row refers to the topic 1 row
  group_by(term) %>%
  arrange(term, topic) %>%
  mutate(pratio = prob / lag(prob, n = 1L)) %>%
  # only retain the rows with a valid probability ratio (namely, topic 2)
  filter(topic == 2) %>%
  # drop the grouping so later steps work on the whole table
  ungroup()

# Show two plots side by side.
par(mar = c(0, 0, 0, 0), mfcol = c(1, 2))
# Wordcloud Topic 1: terms that are at least 5 times more likely for topic 1.
# pratio is P(term | topic 2) / P(term | topic 1), so every term with
# pratio > 0.2 is removed and only terms with pratio <= 0.2 remain.
nottopic1 <- probRatio %>%
  filter(pratio > 0.2) %>%
  pull(term) # we need a vector for the removeWords() function
docs %>%
  # remove all terms that are not characteristic of topic 1
  tm::tm_map(tm::removeWords, nottopic1) %>%
  # create wordcloud; the corpus is piped in as the first argument
  wordcloud::wordcloud(
    random.order = FALSE,
    colors = RColorBrewer::brewer.pal(11, "Spectral"),
    min.freq = 2
  )
# Wordcloud Topic 2: terms that are at least 5 times more likely for topic 2.
# Every term with pratio < 5 is removed and only terms with pratio >= 5 remain.
nottopic2 <- probRatio %>%
  filter(pratio < 5) %>%
  pull(term) # we need a vector for the removeWords() function
docs %>%
  # remove all terms that are not characteristic of topic 2
  tm::tm_map(tm::removeWords, nottopic2) %>%
  # create wordcloud; the corpus is piped in as the first argument
  wordcloud::wordcloud(
    random.order = FALSE,
    colors = RColorBrewer::brewer.pal(11, "Spectral"),
    min.freq = 2
  )

Well, what's the difference between the two topics? Note that the results may change dramatically from one run to the next. Perhaps, there is just one story.

__Programming Tip__ - Never assume that dates and times are correctly read.

