knitr::opts_chunk$set(
  tidy = TRUE,
  tidy.opts = list(blank = FALSE, width.cutoff = 70),
  cache = TRUE
)

This session

  1. Basic text manipulation tools
    • REGEX
    • stringr
  2. Dictionary-based approach to text analysis
    • tm
    • tidytext
  3. Topic-based approach to text analysis
    • lda

REGEX

| REGEX  | Match(es)   | Notes                          |
|--------|-------------|--------------------------------|
| z      | spoon7      | No matches                     |
| .      | s p o o n 7 | Each character matches         |
| [sp]   | s p oon7    | s and p are matched separately |
| sp     | sp oon7     | sp is matched as a whole       |
| sp\|on | sp o on 7   | Means "or"                     |
| [^sp]  | sp o o n 7  | ^ negates the set [sp]         |
| [m-q]  | s p o o n 7 | A range of characters          |
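
The matches in the table can be checked directly in R; here is a minimal sketch using base R's gregexpr() and regmatches() (the stringr equivalents appear later in this session):

s <- 'spoon7'
regmatches(s, gregexpr('z', s))[[1]]      # character(0): no matches
regmatches(s, gregexpr('.', s))[[1]]      # "s" "p" "o" "o" "n" "7"
regmatches(s, gregexpr('[sp]', s))[[1]]   # "s" "p"
regmatches(s, gregexpr('sp|on', s))[[1]]  # "sp" "on"
regmatches(s, gregexpr('[m-q]', s))[[1]]  # "p" "o" "o" "n"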

Repetitions

| REGEX  | Definition          |
|--------|---------------------|
| a*     | Zero or more a's    |
| a+     | One or more a's     |
| a?     | Zero or one a's     |
| a{5}   | Exactly 5 a's       |
| a{3,}  | 3 or more a's       |
| a{1,3} | Between 1 and 3 a's |

| REGEX  | Definition                             |
|--------|----------------------------------------|
| a+?    | As few a's as possible                 |
| a{2,}? | At least 2 a's, but as few as possible |
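
Greedy vs. lazy matching can be seen with base R as well (a small sketch; perl = TRUE ensures the lazy ? modifier is recognised):

x <- 'aaaa'
regmatches(x, regexpr('a+', x, perl = TRUE))      # "aaaa" -- greedy: as many as possible
regmatches(x, regexpr('a+?', x, perl = TRUE))     # "a"    -- lazy: as few as possible
regmatches(x, regexpr('a{2,}?', x, perl = TRUE))  # "aa"   -- at least 2, but no more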

Special characters

| Character | REGEX | Character | REGEX |
|-----------|-------|-----------|-------|
| tab       | \t    | ^         | \^    |
| new line  | \n    | $         | \$    |
| .         | \.    | [         | \[    |
| \         | \\    | ]         | \]    |
| +         | \+    | (         | \(    |
| *         | \*    | )         | \)    |
| ?         | \?    | /         | \/    |
| {         | \{    | }         | \}    |

cat('\\n')  # prints \n -- the string holds a backslash followed by n, not a newline
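
Because the backslash is also R's string escape character, regex escapes have to be doubled in R source. A short illustration with base R's grepl():

grepl('.', 'price: $5')    # TRUE  -- unescaped '.' matches any character
grepl('\\.', 'price: $5')  # FALSE -- '\\.' matches only a literal period
grepl('\\$', 'price: $5')  # TRUE  -- '\\$' matches the literal dollar sign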

stringr

library(stringr)
s <- 'This is the string to be matched'
r <- 'm'
str_detect(s, r)  # TRUE: 'm' appears somewhere in s

library(stringr)
s <- 'This is the string to be matched'
r <- ' [^ ] '     # a single non-space character surrounded by spaces
str_detect(s, r)  # FALSE: s contains no one-character words
library(stringr)
s <- 'This is the string to be matched'
r <- ' [^ ]+ '    # one or more non-space characters surrounded by spaces
str_detect(s, r)  # TRUE

s <- 'This is the string to be matched'
r <- ' [^ ]+ '
str_match(s, r)[[1]]  # first match: " is "
r <- '(is ){2}'
str_match(s, r)[[1]]  # "is is " -- the group repeated exactly twice
r <- '(is )+?'
str_match(s, r)[[1]]  # "is " -- lazy repetition matches the group as few times as possible

Task

library(stringr)
s <- 'This is hi$ fist'
str_match_all(s, 'is') %>% unlist        # "is" "is" "is"
str_match_all(s, 'i[s\\$]') %>% unlist   # "is" "is" "i$" "is"
str_match_all(s, '[hf]ist?') %>% unlist  # "his" "fist"

Text mining tools

Text mining with tm

  1. Collect -- Organize texts from many sources and formats into a single repository
  2. Clean -- Pre-process the texts to make them easier to work with
  3. Transform -- Generate numerical quantities based on the underlying texts for use in subsequent analysis
  4. Subsequent analysis steps...

The tm package


ufo_text <- 
  reports %>%
  filter(!str_detect(report, '^MADAR')) %>%  # drop reports that begin with 'MADAR'
  .$report

library(tm)
ufo_corp <- Corpus(VectorSource(ufo_text)) %>%  # create corpus
  tm_map(content_transformer(tolower)) %>%      # lower case
  tm_map(removePunctuation) %>%                 # remove punctuation
  tm_map(removeWords, stopwords('english')) %>% # remove common words
  tm_map(stemDocument) %>%                      # reduce to word stems
  tm_map(stripWhitespace)                       # remove extra space 
ufo_corp
ufo_text[1557] %>% str_wrap %>% cat  # an example raw report, for comparison with the cleaned corpus

tdm <- ufo_corp %>% TermDocumentMatrix
# the TDM is a sparse (simple triplet) matrix: i, j, and v hold the term index,
# document index, and count of each non-zero cell
tdm_long <- with(tdm, tibble(term = i, document = j, count = v)) 
tdm_long

inspect(tdm[1:10, 1:6])

inspect(tdm[535:545, 1:6])

tdm_long <- tdm_long %>% mutate(term = factor(term, labels = Terms(tdm)))
tdm_long %>% group_by(term) %>% summarise(count = sum(count)) %>% 
  arrange(desc(count))  

tdm %>% tidytext::tidy()

tdm %>% findFreqTerms(400)  # terms occurring at least 400 times across the corpus

tdm %>% findAssocs("shape", .25) # cor(x,y) >= .25
tdm %>% findAssocs("sky", .25)

More about tidytext

library(tidytext)
tidy_reports <- tibble(report = ufo_text) %>% 
  mutate(id = row_number()) %>%
  unnest_tokens(word, report)
tidy_reports

Removing stop words with tidytext

cleaned_reports <- 
  tidy_reports %>%
  anti_join(get_stopwords(), by = 'word')
cleaned_reports %>%
  count(word, sort = TRUE)

"Bag-of-words" approaches to text tagging

Scoring a term-document matrix

* Sums occurrences of terms within each document (very quickly)
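
A minimal sketch of this idea (the names below, such as term_scores, are hypothetical): align a numeric score for each term with the rows of the TDM, then let matrix algebra sum the scores over each document.

m <- as.matrix(tdm)                       # terms x documents count matrix (fine at this corpus size)
term_scores <- c(light = 1, sky = 1)      # hypothetical per-term scores
score_vec <- term_scores[rownames(m)]     # one score per term, NA where no score exists
score_vec[is.na(score_vec)] <- 0
doc_scores <- as.numeric(crossprod(m, score_vec))  # t(m) %*% score_vec: one total per document
head(sort(doc_scores, decreasing = TRUE))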

Sentiment

Word lists

word_scores_url <- 'http://crr.ugent.be/papers/Ratings_Warriner_et_al.csv'
(words <- readr::read_csv(word_scores_url) %>% 
    select(Word, valence = V.Mean.Sum, arousal = A.Mean.Sum, dominance = D.Mean.Sum))

words %>% gather(variable, value, -Word) %>% 
  group_by(variable) %>% 
  filter(percent_rank(value) > .99) %>% 
  ungroup %>% 
  arrange(Word)

Arousal scores for word stems

(arousal_stems <- words %>% select(Word, arousal) %>% 
   mutate(term = stemDocument(Word)))

Merge scores and TDM

tdm_long %>% inner_join(arousal_stems)

UFO report with highest arousal

d <-
  tdm_long %>% 
  inner_join(arousal_stems) %>% 
  group_by(document) %>% 
  summarise(score = sum(arousal)) %>% 
  arrange(desc(score)) %>% 
  top_n(1) %>% 
  getElement('document')
ufo_text[d] %>% 
  str_wrap(width = 70) %>%
  cat()

UFO reports with "lowest" arousal

d <-
  tdm_long %>% 
  inner_join(arousal_stems) %>% 
  group_by(document) %>% 
  summarise(score = -sum(arousal)) %>% 
  arrange(desc(score)) %>% 
  top_n(5) %>% 
  getElement('document')
ufo_text[d] %>%
  str_wrap(width = 70, exdent = 2) %>%
  str_c('* ', ., collapse = '\n') %>%
  cat()

tidytext approach (without stemming)

cleaned_reports
scored_reports <- 
  words %>% select(word = Word, arousal) %>%
  inner_join(cleaned_reports) %>%
  group_by(id) %>%
  summarise(arousal = sum(arousal))

Bottom 5 based on sum of arousal score

scored_reports %>% top_n(5, -arousal) %>% getElement('id') %>% ufo_text[.] %>% .[1:5] %>% 
  str_trunc(width = 250) %>%
  str_wrap(width = 70, exdent = 2) %>% 
  str_c("* ", .) %>% 
  cat(sep = "\n")

Top 5 based on sum of arousal score

scored_reports %>% top_n(5, arousal) %>% getElement('id') %>% ufo_text[.] %>% .[1:5] %>% 
  str_trunc(width = 250) %>%
  str_wrap(width = 70, exdent = 2) %>% 
  str_c("* ", .) %>% 
  cat(sep = "\n")

Sentences instead of words

d0 <- tibble(report = ufo_text) %>% mutate(id = row_number())
d1 <- d0 %>%
  unnest_tokens(sentence, report, token = "sentences")
d1

d2 <- 
  d1 %>% group_by(id) %>% mutate(sentence_id = row_number()) %>%
  unnest_tokens(word, sentence, token = "words") %>% ungroup()
d2

NRC sentiments

http://sentiment.nrc.ca/lexicons-for-research/

surprise <- 
  readr::read_tsv('~/Dropbox_RSM/Datasets/NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', col_names = c('word', 'affect', 'association')) %>%
  filter(affect %in% c('surprise') & association == 1) %>%
  transmute(word, surprise = 1L)
head(surprise)
top_5_surprise <- 
  d2 %>%
  left_join(surprise, by = 'word') %>%
  mutate(surprise = coalesce(surprise, 0L)) %>%
  group_by(id, sentence_id) %>%
  summarise(score = mean(surprise)) %>% ungroup() %>%
  group_by(id) %>%
  summarise(score = median(score)) %>%
  inner_join(d0, by = 'id') %>%
  top_n(5, score) %>%
  arrange(desc(score))

Top 5 based on surprise

top_5_surprise %>%
  getElement('report') %>%
  # str_trunc(width = 250) %>%
  str_wrap(width = 70, exdent = 2) %>% 
  str_c("* ", .) %>% 
  cat(sep = "\n")

Latent Dirichlet allocation

txt <- c('Abe is funny',
         'Abe is fat',
         'Bob is fat',
         'Cam is fat',
         'Cam is funny',
         'Cam is short',
         'Dee is funny')
library(lda)
lc <- 
  txt %>% 
  removeWords(stopwords()) %>% 
  stripWhitespace %>% 
  lexicalize  # create a special structure used by the lda package
K <- 4  # number of topics to detect
set.seed(1234)
f1 <- lda.collapsed.gibbs.sampler(lc$documents, K, lc$vocab, 500, .5, .1)  # 500 iterations, alpha = .5, eta = .1
(top.words <- top.topic.words(f1$topics, 1, by.score = TRUE))
K <- 2  # number of topics to detect
set.seed(1234)
f1 <- lda.collapsed.gibbs.sampler(lc$documents, K, lc$vocab, 500, .5, .1)
(top.words <- top.topic.words(f1$topics, 1, by.score = TRUE))
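
To see how the sampler assigned the toy sentences, inspect document_sums, which the sampler also returns: rows are topics, columns are documents, and each entry counts the words in that document assigned to that topic.

f1$document_sums                        # topics x documents word-assignment counts
apply(f1$document_sums, 2, which.max)   # most heavily loaded topic for each sentence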


lda

library(lda)
lda_corp <-
  ufo_corp %>% 
  plyr::laply(as.character) %>% 
  str_replace('(^| )[0-9]( |$)', ' [number] ') %>% 
  str_replace('^ ', '') %>% 
  lexicalize
K <- 14  # number of topics to detect
set.seed(1234)
fit <-
  lda.collapsed.gibbs.sampler(lda_corp$documents, K, lda_corp$vocab, 500, .5, .1)
(top.words <- top.topic.words(fit$topics, 4, by.score = TRUE))

get_topic_names <- function(fit) {
  # code copied from lda::top.topic.words:
  normalized.topics <- fit$topics / (rowSums(fit$topics) + 1e-05)
  scores <- apply(normalized.topics, 2, function(x) x * (log(x + 1e-05) - sum(log(x + 1e-05)) / length(x)))
  # end
  cutoff <- min(apply(scores, 1, function(x) max(x)))
  scores %>% 
    as.data.frame %>% 
    mutate(f = 1:nrow(.)) %>% 
    gather(key, value, -f) %>% 
    filter(value >= cutoff) %>% 
    group_by(f) %>% 
    arrange(f, desc(value)) %>% 
    summarise(name = paste(key, collapse = ' ')) %>% 
    getElement('name')
}
get_topic_names(fit)

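The chunk that defined the exemp_tab() helper did not survive conversion. The sketch below is a hypothetical reconstruction, assuming the helper tabulates each topic's top words alongside one exemplary report, built from lda's top.topic.words() and top.topic.documents():

exemp_tab <- function(fit, n_words = 5) {
  # hypothetical reconstruction of the lost helper
  top_words <- apply(top.topic.words(fit$topics, n_words, by.score = TRUE),
                     2, paste, collapse = ' ')
  exemplars <- top.topic.documents(fit$document_sums, num.documents = 1)[1, ]
  tibble(topic = seq_along(top_words),
         top_words = top_words,
         exemplar = str_trunc(ufo_text[exemplars], 80))
}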

K <- 5
set.seed(1234)
fit <- lda.collapsed.gibbs.sampler(lda_corp$documents, K, lda_corp$vocab, 500, .5, .1)
exemp_tab(fit) 

ufo_corp_with_stops_no_stemming <- Corpus(VectorSource(ufo_text)) %>%  # create corpus
  tm_map(content_transformer(tolower)) %>%      # lower case
  tm_map(removePunctuation) %>%                 # remove punctuation
  tm_map(stripWhitespace)                       # remove extra space 
lda_corp_with_stops_no_stemming <- 
  ufo_corp_with_stops_no_stemming %>% 
  plyr::laply( as.character ) %>% 
  str_replace('(^| )[0-9]( |$)', ' [number] ') %>% 
  str_replace('^ ','') %>% 
  lexicalize

K <- 10
set.seed(1234)
fit <- lda.collapsed.gibbs.sampler(lda_corp_with_stops_no_stemming$documents,
    K, lda_corp_with_stops_no_stemming$vocab, 500, .5, .1)
exemp_tab(fit) 

ufo_corp <- Corpus(VectorSource(ufo_text)) %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(removePunctuation) %>% 
  tm_map(removeWords, stopwords('english')) %>%
  # tm_map(stemDocument) %>%
  tm_map(stripWhitespace) 

lda_corp <- ufo_corp %>% plyr::laply( as.character ) %>% 
  str_replace('(^| )[0-9]( |$)', ' [number] ') %>% str_replace('^ ','') %>% lexicalize

K <- 10 
set.seed(1234)
fit <- lda.collapsed.gibbs.sampler(lda_corp$documents, K, lda_corp$vocab, 500, .5, .1)
exemp_tab(fit) 

The topicmodels package

library(topicmodels)
control <- list(seed = 1, iter = 1500, burnin = 500)
dtm <- DocumentTermMatrix(ufo_corp)
set.seed(1234)
fit2 <- LDA(dtm, k = 10, method = 'Gibbs', control = control)

Output from topicmodels

post <- posterior(fit2)
output <- 
  post$terms %>% 
  as_tibble %>%
  mutate(topic = row_number()) %>%
  gather(term, value, -topic) %>%
  group_by(topic) %>%
  top_n(5, value) %>%
  ungroup %>%
  arrange(topic, desc(value), term) %>%
  group_by(topic) %>%
  summarise(terms = paste(term, collapse = ', '))

output
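
The document-level view is available too: topicmodels' topics() returns the most likely topic for each report, and posterior() gives the full document-topic distribution (a quick check using fit2 from above):

head(topics(fit2), 10)         # most likely topic for the first ten reports
posterior(fit2)$topics[1:5, ]  # document-topic probabilities for the first five reports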

Task


