# Ensure that libraries are loaded.
library(tidyverse)   # includes stringr:: for string manipulation
                     # includes forcats:: for factor manipulation
                     # includes readr:: for (importing and) parsing data
                     # does not include broom:: for presenting statistical results
library(lubridate)   # for date & time manipulation
library(learnr)
library(gradethis)
library(knitr)
library(kableExtra)
# New packages (must be installed before taking this tutorial)
# library(tm)            # to create & handle text corpora
# library(wordcloud)     # for word clouds
# library(RColorBrewer)  # color palettes (for word clouds; also loaded by wordcloud package)
# library(topicmodels)   # topic modelling
tutorial_options(exercise.timelimit = 60, exercise.checker = gradethis::grade_learnr)
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
# Ensure that the data is loaded for the remainder of this tutorial.
Brexit <- UsingRTutorials::Brexit
surveyweek1 <- UsingRTutorials::surveyweek1
First 1.5 hours: Course content
Second 1.5 hours: Data project
For your complex datasets, ask yourself one of the following:
Does the data set contain info on more than one type of unit?
Do different variables address the same information?
Is there a variable with values that should be variables by themselves?
Does a variable contain more than one piece of information?
Example data set `Brexit`: 600 articles about Brexit from the (former) news website nujij.nl.
# Use the count() function and a pipe.
count()
Brexit %>% count(category)
gradethis::grade_code( correct = "That wasn't hard, was it?", incorrect = "" )
Note the order of values in this frequency table: alphabetical.
In addition, note the negligible number of observations for Media and some other categories.
Below are the results of a regression model predicting the number of votes for a post from the post category.
# Linear regression: more in Session 6.
lm(votes ~ category, data = Brexit) %>%
  broom::tidy() %>% # retrieving the main results
  kable(
    digits = c(0, 2, 2, 2, 3),
    col.names = c("Parameter", "b", "SE", "t", "p"),
    align = "lrrrr"
  ) %>%
  kableExtra::kable_classic(full_width = FALSE)
question("What is the reference category here?", answer("(Intercept)"), answer("Algemeen", correct = TRUE), answer("Economie"), answer("Media"), answer("Overig"), answer("Politiek"), answer("Showbizz"), answer("Wetenschap") )
Brexit <- Brexit %>%
# Use mutate().
Brexit <- Brexit %>% mutate()
# Use the (base::) `factor()` function.
factor(x = )
Brexit <- Brexit %>% mutate(cat_fact = factor(x = category))
gradethis::grade_code( correct = "", incorrect = "" )
Have a close look at the description of the new factor:
# Info shown in Environment tab if you work in RStudio.
str(Brexit$cat_fact)
# Use `fct_relevel()`, which is part of the `forcats` package,
# which is loaded by the `tidyverse` package.
fct_relevel()
# Oh yes, use mutate() again. You should know that by now.
Brexit <- Brexit %>% mutate()
Brexit <- Brexit %>% mutate(cat_new = fct_relevel(cat_fact, "Economie"))
gradethis::grade_code( correct = "", incorrect = "There are more ways of getting the result but we want you to use the function you learned in the book." )
Brexit <- Brexit %>%
  mutate(
    cat_fact = factor(x = category),
    cat_new = fct_relevel(cat_fact, "Economie")
  )
# Adapt the code for a regression model from a previous exercise.
# Use only the `lm()` function, which will send the results to the screen.
# That suffices for checking the reference category.
lm(votes ~ cat_new, data = Brexit)
gradethis::grade_code( correct = "Details on statistical analysis in R follow in Session 6.", incorrect = "" )
Notes:
The new factor is saved in the `Brexit` tibble because we want to use the reordered factor later on in this tutorial.
The categories `Media`, `Overig` (Other), `Showbizz`, and `Wetenschap` (Science) contain only one observation.
# Use variable `cat_fact` as your starting point.
# Use function `fct_recode()`.
mutate(cat_fact2 = fct_recode(cat_fact, ))
Brexit <- Brexit %>%
  mutate(cat_fact2 = fct_recode(cat_fact,
    Algemeen = "Media",
    Algemeen = "Overig",
    Algemeen = "Showbizz",
    Algemeen = "Wetenschap"
  ))
gradethis::grade_code( correct = "In the `fct_recode()`function, the new value is to the left, the old value is to the right. As in y <- x.", incorrect = "Perhaps your order of factor values is different but your result can be OK. The code checker can only deal with one order. Poor soul that it is." )
`reactions`: the number of reactions that a post received. Unfortunately, variable `reactions` is not numeric.
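As a quick, hedged illustration of what `readr::parse_number()` does (the string "24 reacties" below is just a made-up example, not an actual value from the data):
# Sketch: parse_number() drops the non-numeric text and keeps the (first) number.
parse_number("24 reacties")  # returns 24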
# Brexit <- Brexit %>% mutate( ____ )
Brexit %>% count()
# Remember the structure of mutate:
mutate(new_var = function(old_var))
# Function `parse_number()` is part of the `readr` package, which is loaded by
# the `tidyverse` package.
# Use help on function parse_number() if you need it.
# Select two variables to be counted in count().
Brexit <- Brexit %>% mutate(react_num = parse_number(reactions))
Brexit %>% count(reactions, react_num)
gradethis::grade_code( correct = "We use `parse_number()` to extract the (first) number from a string. Your first string manipulation in this tutorial! More to follow.", incorrect = "Use tidyverse code with piping! And don't forget to adapt the second line of code." )
# There are two steps. In the first step, bin the numeric variable.
ntile()
# In the second step, create a factor from the bins.
# The two steps can be performed in one `mutate()` function. It is even possible
# to use the same name for the new variable in both steps.
mutate(
  react_cat = ntile(react_num, 3),
  react_cat = factor()
)
Brexit <- Brexit %>%
  mutate(
    react_cat = ntile(react_num, 3),
    react_cat = factor(react_cat, levels = c(1, 2, 3), labels = c("low", "medium", "high"))
  )
gradethis::grade_code( correct = "", incorrect = "Note the `levels` (sets the order) and `labels` (sets the label/value) arguments of the `factor()` function." )
The three reaction bins should be labeled "low", "medium", and "high". But the substantive order from low to high does not match the alphabetical order. The bins variable must therefore be a factor.
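To see why, here is a small sketch (not part of the exercise; `react_chr` is just a throwaway name): if the bins are left as character strings, `geom_bar()` orders the bars alphabetically as high, low, medium.
# Sketch: character bins end up in alphabetical order on the x axis.
Brexit %>%
  mutate(react_chr = c("low", "medium", "high")[ntile(parse_number(reactions), 3)]) %>%
  ggplot(aes(x = react_chr)) +
  geom_bar()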
Brexit <- Brexit %>%
  mutate(
    react_cat = ntile(parse_number(reactions), 3),
    react_cat = factor(react_cat, levels = c(1, 2, 3), labels = c("low", "medium", "high"))
  )
# Use ggplot() and geom_bar().
ggplot(data = Brexit, mapping = aes(x = react_cat)) + geom_bar()
gradethis::grade_code( correct = "The bins are in the right order: low - medium - high. And yes indeed, all three bins have the same number of observations (posts).", incorrect = "Maybe you specified aes(x) in the geom_bar() step. That is OK. )
Example: Publication date of post on nujij.nl in tibble `Brexit`.
str(Brexit$pubDate)
Date plus time of day is a character string here. To work with dates and times, change to the `POSIXct` data type.
In tibble `Brexit`, variable `pubDate` gives the date and time an article was posted as a string, e.g., "`r Brexit$pubDate[1]`".
# Create and add the date-time variable.
Brexit <- Brexit %>% ____
# Check the results.
Brexit %>% _____
# The parse function is `parse_datetime()`.
parse_datetime(pubDate, format = "")
# Carefully specify the date and time formatting codes.
# See help for `strptime()` under __Details__.
Brexit <- Brexit %>% mutate(pubDateTime = parse_datetime(pubDate, format = "%B %d, %Y %H:%M:%S"))
Brexit %>% count(pubDate, pubDateTime)
gradethis::grade_code()
Brexit <- Brexit %>% mutate(____)
# Select the correct function. Check out help for `ymd_hms()`.
Brexit <- Brexit %>% mutate(pubDateTime2 = mdy_hms(pubDate))
gradethis::grade_code( correct = "Again, compare the new variable to the original publication date variable: Is the new variable correct?", incorrect = "" )
`lubridate` simplifies the identification of date/time elements. It is easy to get the date from a date-time (POSIXct) variable with the `lubridate` package.
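For instance, here is a hedged sketch of a few lubridate accessor functions applied to the parsed publication date (the names `pub_year`, `pub_month`, and `pub_hour` are made up for illustration):
# Sketch: pull individual date/time elements out of a date-time variable.
Brexit %>%
  mutate(
    pubDateTime = parse_datetime(pubDate, format = "%B %d, %Y %H:%M:%S"),
    pub_year    = year(pubDateTime),                 # year as a number
    pub_month   = month(pubDateTime, label = TRUE),  # month as a labelled factor
    pub_hour    = hour(pubDateTime)                  # hour of the day
  ) %>%
  select(pubDateTime, pub_year, pub_month, pub_hour) %>%
  slice(1:3)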
Brexit <- Brexit %>% mutate(____)
# How about the `date()` function in package lubridate?
Brexit <- Brexit %>% mutate(date = date(pubDateTime))
gradethis::grade_code( correct = "", incorrect = "" )
Brexit <- Brexit %>% mutate(date = date(parse_datetime(pubDate, format = "%B %d, %Y %H:%M:%S")))
# Pay attention to the `stat` argument in `geom_freqpoly()`.
# Look the value up under __Computed Variables__ in the RStudio help on ?geom_freqpoly()
Brexit %>% ggplot(aes(x = date)) + geom_freqpoly(stat = "count")
gradethis::grade_code( correct = "If you do not use the stat = 'count' argument, geom_freqpoly bins the dates, so you won't get counts for separate days.", incorrect = "The aesthetics for the horizontal axis can be specified in the `ggplot()` function or in the `geom_freqpoly()` geom. Both are OK." )
`date` is a variable type in R. It behaves just like a date-time variable. `date()` is also a base R function. When you load the `lubridate` package, the base R `date()` function is overridden.
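To make the overriding concrete, here is a minimal sketch (the example timestamp is made up): base R's `date()` returns the current system date-time as a character string, whereas lubridate's `date()` extracts the date part of a date-time.
# Sketch: base::date() versus the lubridate date() that masks it.
base::date()                                      # current date-time as a character string
lubridate::date(ymd_hms("2019-03-29 23:00:00"))   # the Date part: "2019-03-29"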
It is also easy to get the weekday with the `lubridate` package.
# This time, start from scratch...
Brexit %>% mutate(weekday = wday(pubDateTime, label=TRUE)) %>% select(weekday)
gradethis::grade_code( correct = "", incorrect = "" )
Brexit %>%
  mutate(weekday = lubridate::wday(parse_datetime(pubDate, "%B %d, %Y %H:%M:%S"), label = TRUE)) %>%
  ggplot(aes(weekday)) +
  geom_bar()
Variable `weekday` must be a factor. How can you tell?
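One way to check, as a small sketch (not part of the exercise): inspect the class of the new variable; `wday(..., label = TRUE)` returns an ordered factor.
# Sketch: check the class of the weekday variable.
Brexit %>%
  mutate(weekday = wday(parse_datetime(pubDate, "%B %d, %Y %H:%M:%S"), label = TRUE)) %>%
  pull(weekday) %>%
  class()  # "ordered" "factor"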
Brexit %>% mutate(____) %>% ggplot(____)
# Use the `update()` function.
update()
# Use `geom_area()`.
ggplot(aes(time)) + geom_area()
# And use `scale_x_datetime` to set the labels of the horizontal axis to hour.
ggplot(aes(time)) + geom_area() + scale_x_datetime()
Brexit %>%
  mutate(time = update(pubDateTime, yday = 1)) %>%
  ggplot(aes(time)) +
  geom_area(stat = "bin", bins = 30) +
  scale_x_datetime(date_labels = "%H")
gradethis::grade_code( correct = "", incorrect = "" )
Brexit <- Brexit %>% arrange(____) %>% mutate(____)
# Sort the data to be sure that they are in chronological order before you
# calculate the time lapse.
# Use the `lag()` function.
lag()
Brexit <- Brexit %>% arrange(pubDateTime) %>% mutate(lapse = pubDateTime - lag(pubDateTime, n = 1L))
gradethis::grade_code( correct = "", incorrect = "" )
# Show publication date-time and lapse for first few observations.
Brexit %>%
  mutate(pubDateTime = parse_datetime(pubDate, "%B %d, %Y %H:%M:%S")) %>%
  arrange(pubDateTime) %>%
  mutate(lapse = pubDateTime - lag(pubDateTime, n = 1L)) %>%
  select(pubDateTime, lapse) %>%
  slice(1:6)
Brexit %>% arrange(____) %>% mutate(____) %>% select(___) %>% ____(____)
# Don't save the new variable lapse2, just show it in a table.
# Show only date and lapse2 in the table.
# Use slice() to select the first six rows.
Brexit %>% arrange(date) %>% mutate(lapse2 = date - lag(date, n = 1L)) %>% select(date, lapse2) %>% slice(1:6)
gradethis::grade_code( correct = "", incorrect = "" )
Subtracting two date-times yields data type `difftime`. There is also a data type `duration`: convert a `difftime` into a `duration` with the `as.duration()` function. See the book for more details.
Main applications:
Topic 2 is especially relevant for students working on a complex data set including text (natural language): Alcohol Posts and Use, German Chancellor Debate.
Friends & Families weekly survey: nasty variable names. Use `rename()`?
knitr::include_graphics("images/surveyweek1.png")
With `rename()`, you would have to change each name one by one.
Objective: Simplify the variable name to the date for which the sad/depressed score was entered.
Example tibble: `surveyweek1`.
# Get a variable name as a character variable with the names() function.
names(surveyweek1)[4] # [4] selects item 4 from the vector of variable names
Step 1: Remove all up to (and including) the first " - ".
# Store variable name in a data object, so we can use it.
varname <- names(surveyweek1)[4]
# Show result to check that the code works.
varname # Intermediary result. Comment out in your Data Project code!
# Find the first position of " - ".
# Show results for understanding before using it below.
str_locate(varname, " - ") #start and end position! # Intermediary result. Comment out in your Data Project code!
# Select everything after the end position of " - ".
varname <- str_sub(varname, start = str_locate(varname, " - ")[2] + 1)
# Show result to check that the code works.
varname # Intermediary result. Comment out in your Data Project code!
The book focuses on finding and replacing particular strings/words. Here, `str_locate()` combined with `str_sub()` is nice for cutting up a string. The `[2]` behind `str_locate()` selects the second item of the vector: the end position. We now have:
str_sub(names(surveyweek1)[4], start = str_locate(names(surveyweek1)[4], " - ")[2] + 1)
varname <- str_sub(names(surveyweek1)[4], start = str_locate(names(surveyweek1)[4], " - ")[2] + 1)
Step 2: Get rid of the weekday name, so remove all up to (and including) ", ".
# Select substring after the end position of ", ". varname <- str_sub(varname, start = str_locate(varname, ", ")[2] + 1) # Show result: see the difference. varname # Intermediary result. Comment out in your Data Project code!
varname <- str_sub(names(surveyweek1)[4], start = str_locate(names(surveyweek1)[4], " - ")[2] + 1)
varname <- str_sub(varname, start = str_locate(varname, ", ")[2] + 1)
Step 3: Get rid of the "Sad or depressed" part, so remove all from (and including) " - ".
# Show the name that we start with.
varname
# Select substring up to start position of " - ".
varname <- str_sub(varname, end = str_locate(varname, " - ")[1] - 1)
# Show result: see the difference.
varname
This is the variable name that we were after.
Note that the `start` and `end` arguments can be combined in `str_sub()`. Steps 2 and 3 can thus be done in one command.
str_sub(varname, )
str_sub(varname, start = str_locate(varname, ", ")[2] + 1, end = str_locate(varname, " - ")[1] - 1)
gradethis::grade_code( correct = "You added the `end` argument at the correct position.", incorrect = "" )
Let us apply the commands to all relevant variables (sequence 4 to 10).
# Get all selected variable names.
varname <- names(surveyweek1)[4:10]
# Check result.
varname
# Get the start and end positions.
str_locate(varname, " - ")
`str_locate()` creates a matrix with start and end position columns for each variable: start positions in `[,1]`; end positions in `[,2]`.
# Get all selected variable names.
varname <- names(surveyweek1)[4:10]
# Get the end positions from the matrix.
str_locate(varname, " - ")[,2]
# Store variable name in a data object, so we can use it.
varname <- names(surveyweek1)[4]
# Select everything after the end position of " - ".
varname <- str_sub(varname, start = str_locate(varname, " - ")[2] + 1)
# Select substring after the end position of ", ".
varname <- str_sub(varname, start = str_locate(varname, ", ")[2] + 1)
# Select substring up to start position of " - ".
varname <- str_sub(varname, end = str_locate(varname, " - ")[1] - 1)
# Check result.
varname
# Start by selecting all variable names instead of only the first.
varname <- names(surveyweek1)[4:10]
# Check intermediate steps, as we have done before.
# For example: What does str_locate() yield now?
str_locate(varname, " - ")
# Instead of a single end position, we need a column of end positions.
# Check out the difference between:
str_locate(varname, " - ")[2]
str_locate(varname, " - ")[,2]
# A comma makes a big difference...
gradethis::grade_result(
  fail_if(~ identical(.result, "March 8"), "Did you select variables 4 to 10 in the first command?"),
  fail_if(~ length(.result) == 7 && .result[2] == " March ", "Did you use columns instead of single values for start and end positions?"),
  pass_if(~ identical(.result, c("March 8", "March 9", "March 10", "March 11", "March 12", "March 13", "March 14")),
    "You have selected all seven variables (`[4:10]`) and the correct columns of start (`[,1]` instead of `[1]`) and end positions (`[,2]` instead of `[2]`).")
)
Once we have the correct variable names, we can use them to replace the names in the `surveyweek1` tibble.
# Store variable name in a data object, so we can use it.
varname <- names(surveyweek1)[4:10]
# Select everything after the end position of " - ".
varname <- str_sub(varname, start = str_locate(varname, " - ")[,2] + 1)
# Select substring after the end position of ", ".
varname <- str_sub(varname, start = str_locate(varname, ", ")[,2] + 1)
# Select substring up to start position of " - ".
varname <- str_sub(varname, end = str_locate(varname, " - ")[,1] - 1)
# Replace variable names in tibble.
names(surveyweek1)[4:10] <- varname
# Inspect results.
str(surveyweek1)
Because the variable names are so well structured in this particular example, we can achieve the same with the tidyverse `separate()` function.
# tidyverse requires a data frame
varnames <- tibble(name = names(surveyweek1)[4:10]) %>%
  # split each name into three parts
  separate(name, into = c("humbug", "date", "topic"), sep = " - ") %>%
  # split the date into weekday and date
  separate(date, into = c("weekday", "date"), sep = ", ") %>%
  # extract the date variable as a vector
  pull(date)
# Replace variable names in tibble.
names(surveyweek1)[4:10] <- varnames
# Show result.
str(surveyweek1)
New functions: `data_frame()` (here: `tibble()`) and `pull()`.
Natural text, such as the header or body text of posts, can be used for classifying texts (topic modelling), for sentiment analysis, or for investigating relations between words, e.g., as visualized in a word cloud.
A `Corpus` handles sets of texts. The `tm` package offers tools for creating, cleaning, and manipulating a corpus of texts. Let us use this package and the `wordcloud` package to create some word clouds.
# Load the required packages.
library(tm)
library(wordcloud)
library(RColorBrewer)
# Create a corpus for the headings (titles) of the Brexit articles.
docs <- Brexit %>%
  # extract the titles as a vector
  pull(title) %>%
  # use tm functions to create a corpus
  tm::VectorSource() %>%
  tm::Corpus() %>%
  # use tm functions to clean the data
  tm::tm_map(removeNumbers) %>%                 # remove numbers (assuming they are not informative)
  tm::tm_map(removePunctuation) %>%             # remove punctuation (assuming it is not informative)
  tm::tm_map(stripWhitespace) %>%               # remove blanks
  tm::tm_map(content_transformer(tolower)) %>%  # only lower case letters
  tm::tm_map(removeWords, stopwords("dutch"))   # remove Dutch stopwords
# Create our first wordcloud.
wordcloud::wordcloud(
  docs,
  random.order = FALSE,
  colors = RColorBrewer::brewer.pal(11, "Spectral")
)
# Create a corpus for the headings (titles) of the Brexit articles.
docs <- Brexit %>%
  # extract the titles as a vector
  pull(title) %>%
  # use tm functions to create a corpus
  tm::VectorSource() %>%
  tm::Corpus() %>%
  # use tm functions to clean the data
  tm::tm_map(removeNumbers) %>%                 # remove numbers (assuming they are not informative)
  tm::tm_map(removePunctuation) %>%             # remove punctuation (assuming it is not informative)
  tm::tm_map(stripWhitespace) %>%               # remove blanks
  tm::tm_map(content_transformer(tolower)) %>%  # only lower case letters
  tm::tm_map(removeWords, stopwords("dutch"))   # remove Dutch stopwords
docs %>%
  # remove word "brexit"
  tm::tm_map(removeWords, c("brexit")) %>%
  # create wordcloud
  wordcloud::wordcloud(
    random.order = FALSE,
    colors = brewer.pal(11, "Spectral")
  )
Document-term-matrix:
The `tm` package can create document-term matrices from a text corpus. The code below identifies two topics in the `Brexit` texts and shows these as two word clouds.
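As a small sketch of what such a matrix looks like (assuming the corpus `docs` created above; `dtm_sketch` is just an illustrative name), you can build one and inspect a corner of it:
# Sketch: build a document-term matrix and look at a few documents and terms.
dtm_sketch <- tm::DocumentTermMatrix(docs)
tm::inspect(dtm_sketch[1:5, 1:8])  # first five documents, first eight terms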
# Load the required package.
library(topicmodels)
# Create a document-term-matrix from the corpus.
dtm = tm::DocumentTermMatrix(docs)
# Run a Latent Dirichlet Allocation topic model with 2 topics.
lda = topicmodels::LDA(dtm, k = 2, seed = 234)
# Calculate the probabilities that a text or word belongs to Topic 1 or Topic 2.
pred = topicmodels::posterior(lda, dtm)
# Show word clouds of terms that are more strongly related to one topic than to the other.
# First, create a tibble with the ratio of the probability that a term links with Topic 2
# over the probability that a term links with Topic 1.
probRatio <- pred$terms %>%
  as_tibble() %>%
  rownames_to_column(var = "topic") %>% # preserve topic number as variable
  # turn term variables into one variable
  pivot_longer(!topic, names_to = "term", values_to = "prob") %>%
  # calculate the probability ratio
  group_by(term) %>%
  arrange(term, topic) %>%
  mutate(pratio = prob / lag(prob, n = 1L)) %>%
  # only retain the rows with a valid probability ratio (namely, topic 2)
  filter(topic == 2)
# Show two plots side by side.
par(mar = c(0, 0, 0, 0), mfcol = c(1, 2))
# Wordcloud Topic 1: terms that are at least 5 times more likely for topic 1.
nottopic1 <- probRatio %>%
  filter(pratio > 0.2) %>%
  pull(term) # we need a vector for the removeWords() function
docs %>%
  # remove the terms that do not clearly belong to topic 1
  tm::tm_map(removeWords, nottopic1) %>%
  # create wordcloud
  wordcloud::wordcloud(
    random.order = FALSE,
    colors = RColorBrewer::brewer.pal(11, "Spectral"),
    min.freq = 2
  )
# Wordcloud Topic 2: terms that are at least 5 times more likely for topic 2.
nottopic2 <- probRatio %>%
  filter(pratio < 5) %>%
  pull(term) # we need a vector for the removeWords() function
docs %>%
  # remove the terms that do not clearly belong to topic 2
  tm::tm_map(removeWords, nottopic2) %>%
  # create wordcloud
  wordcloud::wordcloud(
    random.order = FALSE,
    colors = RColorBrewer::brewer.pal(11, "Spectral"),
    min.freq = 2
  )
Well, what's the difference between the two topics? Note that the results may change dramatically from one run to the next. Perhaps there is just one story.
Follow the instructions in the Expert Meeting Instructions document (attached to the Session 4 module on Canvas).
Duration: 20 minutes.
Last 15 minutes of the session.