Packages used in this section:

* `stringr`
* `tm`
* `tidytext`
* `lda`
Matches of each REGEX against the string `spoon7` (matched characters in bold):

| REGEX | Match(es) | Notes |
|-------|-----------|-------|
| `z` | spoon7 | No matches |
| `.` | **spoon7** | Each character matches |
| `[sp]` | **sp**oon7 | s and p are matched separately |
| `sp` | **sp**oon7 | sp is matched as a whole |
| `sp\|on` | **sp**o**on**7 | `\|` means "or" |
| `[^sp]` | sp**oon7** | `^` negates the set `[sp]` |
| `[m-q]` | s**poon**7 | A range of characters |
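As a quick check of the table, the same patterns can be run against `spoon7` in base R; this snippet is illustrative and not part of the original notes:

```r
s <- 'spoon7'
regmatches(s, gregexpr('[sp]', s))[[1]]    # "s" "p"
regmatches(s, gregexpr('sp|on', s))[[1]]   # "sp" "on"
regmatches(s, gregexpr('[^sp]', s))[[1]]   # "o" "o" "n" "7"
regmatches(s, gregexpr('[m-q]', s))[[1]]   # "p" "o" "o" "n"
```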
| REGEX | Definition |
|-------|------------|
| `a*` | Zero or more a's |
| `a+` | One or more a's |
| `a?` | Zero or one a's |
| `a{5}` | Exactly 5 a's |
| `a{3,}` | 3 or more a's |
| `a{1,3}` | Between 1 and 3 a's |
Quantifiers are greedy by default: against the string `aaaa`, the REGEX `a{2,4}` matches `aaaa`, and not `aaa` or `aa`.
Lazy quantifiers match as few characters as possible:

| REGEX | Definition |
|-------|------------|
| `a+?` | As few a's as possible |
| `a{2,}?` | At least 2 a's, but as few as possible |
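A minimal check of greedy vs. lazy matching in base R (the string and patterns are illustrative):

```r
x <- 'aaaa'
# Greedy: takes as many repetitions as allowed
regmatches(x, regexpr('a{2,4}', x, perl = TRUE))   # "aaaa"
# Lazy: takes as few repetitions as allowed
regmatches(x, regexpr('a{2,4}?', x, perl = TRUE))  # "aa"
```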
| Character | REGEX | Character | REGEX |
|-----------|-------|-----------|-------|
| tab | `\t` | `^` | `\^` |
| new line | `\n` | `$` | `\$` |
| `.` | `\.` | `[` | `\[` |
| `\` | `\\` | `]` | `\]` |
| `+` | `\+` | `(` | `\(` |
| `*` | `\*` | `)` | `\)` |
| `?` | `\?` | `/` | `\/` |
| `{` | `\{` | `}` | `\}` |
In R, `\` must itself be written as `\\` inside of strings: `cat('\\n')` prints `\n`. So to match `c:\` as a REGEX inside an R string, we write `'c:\\\\'`.
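A minimal sketch of the double-escaping rule (the path and pattern here are illustrative, not from the notes):

```r
library(stringr)

path <- 'c:\\temp'            # the source text \\ is the single character \ in the string
cat(path, '\n')               # prints: c:\temp
str_detect(path, 'c:\\\\')    # TRUE: the string 'c:\\\\' is the REGEX c:\\, which matches a literal c:\
```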
* The `stringr` package includes tools for detecting whether a string matches a REGEX pattern
* It is built on the `stringi` package, which is more flexible, but harder to work with
* `stringr::str_detect(s, r)` returns `TRUE` if `s` can be matched by `r`
```r
library(stringr)
s <- 'This is the string to be matched'
r <- 'm'
str_detect(s, r)
```
```r
library(stringr)
s <- 'This is the string to be matched'
r <- ' [^ ] '
str_detect(s, r)
```
```r
library(stringr)
s <- 'This is the string to be matched'
r <- ' [^ ]+ '
str_detect(s, r)
```
`str_match(s, r)[[1]]` will return the portion of `s` that was matched by `r`:
```r
s <- 'This is the string to be matched'

r <- ' [^ ]+ '
str_match(s, r)[[1]]

r <- '(is ){2}'
str_match(s, r)[[1]]

r <- '(is )+?'
str_match(s, r)[[1]]
```
Practice: write a REGEX that matches each of the following:

* `Expression`, `expression`, and `expressions`
* `/` and `/g`
* `Text` and `text`
* `Save` and `Favorites`, but not `available`
* `Welcome`, `RegExr`, `Media`, `Temple`, `Edit`, ... and all words that begin with a capital letter, but no spaces or punctuation
* `testing` and `test`, but no punctuation

```r
library(stringr)
s <- 'This is hi$ fist'
```
Write REGEXes `r` for `str_match_all(s, r)` that produce the following matches:

```r
str_match_all(s, 'is') %>% unlist
str_match_all(s, 'i[s\\$]') %>% unlist
str_match_all(s, '[hf]ist?') %>% unlist
```
* `stringr` and `stringi` --- operate over vectors of character strings
* `tm` --- for applying the same extraction or transformation operations to large collections of text
* `tidytext` --- provides a somewhat more `%>%`-friendly approach, and has a free book that goes with it: http://tidytextmining.com
* `lda` --- topical analysis of an already-cleaned collection of documents (e.g., previously prepared via `tm`)
Text mining with the `tm` package:

* A `Corpus` is a collection of related documents
* `tm` works with a variety of repository and file types (aka `Sources`), e.g. a `VectorSource` (i.e., texts stored in character vectors)
* A `Document` is the basic unit of analysis, and might be a few short sentences (e.g., tweets) or a much longer text (e.g., a manuscript)
* With a `VectorSource`, each element of the character vector is considered a separate "document"
* `TermDocumentMatrix` and `DocumentTermMatrix` are sparse matrices whose cells contain the number of occurrences of each term in each document in the corpus (see the toy sketch below)
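To make these objects concrete, here is a minimal sketch with a made-up three-document corpus (not part of the UFO example that follows):

```r
library(tm)

docs <- c('the cat sat', 'the cat ran', 'a dog ran')   # three tiny "documents"
toy_corp <- Corpus(VectorSource(docs))                 # each vector element becomes one document
toy_tdm  <- TermDocumentMatrix(toy_corp)
inspect(toy_tdm)   # rows = terms, columns = documents, cells = term counts
```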
Extract the free-text UFO reports (dropping reports that begin with "MADAR"):

```r
ufo_text <- reports %>%
  filter(!str_detect(report, '^MADAR')) %>%
  .$report
```
```r
library(tm)

ufo_corp <- Corpus(VectorSource(ufo_text)) %>%    # create corpus
  tm_map(content_transformer(tolower)) %>%        # lower case
  tm_map(removePunctuation) %>%                   # remove punctuation
  tm_map(removeWords, stopwords('english')) %>%   # remove common words
  tm_map(stemDocument) %>%                        # reduce to word stems
  tm_map(stripWhitespace)                         # remove extra space

ufo_corp

ufo_text[1557] %>% str_wrap %>% cat
```
```r
tdm <- ufo_corp %>% TermDocumentMatrix

# long (triplet) representation of the sparse matrix:
# i = term index, j = document index, v = count
tdm_long <- with(tdm, tibble(term = i, document = j, count = v))
tdm_long
```
```r
inspect(tdm[1:10, 1:6])
inspect(tdm[535:545, 1:6])
```
Use `dplyr` to aggregate counts across documents:

```r
tdm_long <- tdm_long %>%
  mutate(term = factor(term, labels = Terms(tdm)))

tdm_long %>%
  group_by(term) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))
```
Or use the `tidytext` package:

```r
tdm %>% tidytext::tidy()
```
```r
tdm %>% findFreqTerms(400)   # terms that appear at least 400 times in the corpus
```
```r
tdm %>% findAssocs("shape", .25)   # terms with cor(x, y) >= .25
tdm %>% findAssocs("sky", .25)
```
The `tidytext` approach:

```r
library(tidytext)

tidy_reports <- tibble(report = ufo_text) %>%
  mutate(id = row_number()) %>%
  unnest_tokens(word, report)   # one row per word per report

tidy_reports
```
Remove stop words with an anti-join:

```r
cleaned_reports <- tidy_reports %>%
  anti_join(get_stopwords(), by = 'word')
```
```r
cleaned_reports %>% count(word, sort = TRUE)
```
```r
# Warriner et al. word norms: valence, arousal, and dominance ratings
word_scores_url <- 'http://crr.ugent.be/papers/Ratings_Warriner_et_al.csv'

(words <- readr::read_csv(word_scores_url) %>%
   select(Word,
          valence = V.Mean.Sum,
          arousal = A.Mean.Sum,
          dominance = D.Mean.Sum))
```
```r
words %>%
  gather(variable, value, -Word) %>%
  group_by(variable) %>%
  filter(percent_rank(value) > .99) %>%   # top 1% of words on each dimension
  ungroup %>%
  arrange(Word)
```
```r
# stem the rated words so they can be joined to the stemmed corpus terms
(arousal_stems <- words %>%
   select(Word, arousal) %>%
   mutate(term = stemDocument(Word)))
```
```r
tdm_long %>% inner_join(arousal_stems)
```
```r
# report with the highest total arousal score
d <- tdm_long %>%
  inner_join(arousal_stems) %>%
  group_by(document) %>%
  summarise(score = sum(arousal)) %>%
  arrange(desc(score)) %>%
  top_n(1) %>%
  getElement('document')

ufo_text[d] %>% str_wrap(width = 70) %>% cat()
```
```r
# five reports with the lowest total arousal score
d <- tdm_long %>%
  inner_join(arousal_stems) %>%
  group_by(document) %>%
  summarise(score = -sum(arousal)) %>%
  arrange(desc(score)) %>%
  top_n(5) %>%
  getElement('document')

ufo_text[d] %>%
  str_wrap(width = 70, exdent = 2) %>%
  str_c('* ', ., collapse = '\n') %>%
  cat()
```
The `tidytext` approach (without stemming):

```r
cleaned_reports

scored_reports <- words %>%
  select(word = Word, arousal) %>%
  inner_join(cleaned_reports) %>%
  group_by(id) %>%
  summarise(arousal = sum(arousal))
```
```r
# five lowest-arousal reports
scored_reports %>%
  top_n(5, -arousal) %>%
  getElement('id') %>%
  ufo_text[.] %>%
  .[1:5] %>%
  str_trunc(width = 250) %>%
  str_wrap(width = 70, exdent = 2) %>%
  str_c("* ", .) %>%
  cat(sep = "\n")
```
```r
# five highest-arousal reports
scored_reports %>%
  top_n(5, arousal) %>%
  getElement('id') %>%
  ufo_text[.] %>%
  .[1:5] %>%
  str_trunc(width = 250) %>%
  str_wrap(width = 70, exdent = 2) %>%
  str_c("* ", .) %>%
  cat(sep = "\n")
```
```r
d0 <- tibble(report = ufo_text) %>% mutate(id = row_number())

# tokenize each report into sentences
d1 <- d0 %>% unnest_tokens(sentence, report, token = "sentences")
d1
```
```r
# then tokenize each sentence into words, keeping track of the sentence id
d2 <- d1 %>%
  group_by(id) %>%
  mutate(sentence_id = row_number()) %>%
  unnest_tokens(word, sentence, token = "words") %>%
  ungroup()
d2
```
The NRC word-emotion association lexicons: http://sentiment.nrc.ca/lexicons-for-research/

```r
surprise <- readr::read_tsv('~/Dropbox_RSM/Datasets/NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
                            col_names = c('word', 'affect', 'association')) %>%
  filter(affect %in% c('surprise') & association == 1) %>%   # keep words associated with "surprise"
  transmute(word, surprise = 1L)
```
```r
head(surprise)

# score each sentence by the share of "surprise" words, then summarise each
# report by its median sentence score and keep the top 5 reports
top_5_surprise <- d2 %>%
  left_join(surprise, by = 'word') %>%
  mutate(surprise = coalesce(surprise, 0L)) %>%
  group_by(id, sentence_id) %>%
  summarise(score = mean(surprise)) %>%
  ungroup() %>%
  group_by(id) %>%
  summarise(score = median(score)) %>%
  inner_join(d0, by = 'id') %>%
  top_n(5, score) %>%
  arrange(desc(score))
```
```r
top_5_surprise %>%
  getElement('report') %>%
  # str_trunc(width = 250) %>%
  str_wrap(width = 70, exdent = 2) %>%
  str_c("* ", .) %>%
  cat(sep = "\n")
```
```r
txt <- c('Abe is funny', 'Abe is fat',
         'Bob is fat',
         'Cam is fat', 'Cam is funny', 'Cam is short',
         'Dee is funny')
```
```r
library(lda)

lc <- txt %>%
  removeWords(stopwords()) %>%
  stripWhitespace %>%
  lexicalize           # create a special structure used by the lda package

K <- 4                 # number of topics to detect
set.seed(1234)
f1 <- lda.collapsed.gibbs.sampler(lc$documents, K, lc$vocab, 500, .5, .1)
(top.words <- top.topic.words(f1$topics, 1, by.score = TRUE))

K <- 2                 # number of topics to detect
set.seed(1234)
f1 <- lda.collapsed.gibbs.sampler(lc$documents, K, lc$vocab, 500, .5, .1)
(top.words <- top.topic.words(f1$topics, 1, by.score = TRUE))
```
Now apply `lda` to the UFO corpus:

```r
library(lda)

lda_corp <- ufo_corp %>%
  plyr::laply(as.character) %>%
  str_replace('(^| )[0-9]( |$)', ' [number] ') %>%   # replace a bare digit with a [number] token
  str_replace('^ ', '') %>%
  lexicalize

K <- 14   # number of topics to detect
set.seed(1234)
fit <- lda.collapsed.gibbs.sampler(lda_corp$documents, K, lda_corp$vocab, 500, .5, .1)

(top.words <- top.topic.words(fit$topics, 4, by.score = TRUE))
```
```r
# name each topic by concatenating its highest-scoring words
get_topic_names <- function(fit) {
  # code copied from lda::top.topic.words:
  normalized.topics <- fit$topics / (rowSums(fit$topics) + 1e-05)
  scores <- apply(normalized.topics, 2, function(x)
    x * (log(x + 1e-05) - sum(log(x + 1e-05)) / length(x)))
  # end

  cutoff <- min(apply(scores, 1, function(x) max(x)))

  scores %>%
    as.data.frame %>%
    mutate(f = 1:nrow(.)) %>%
    gather(key, value, -f) %>%
    filter(value >= cutoff) %>%
    group_by(f) %>%
    arrange(f, desc(value)) %>%
    summarise(name = paste(key, collapse = ' ')) %>%
    getElement('name')
}

get_topic_names(fit)
```
```r
K <- 5
set.seed(1234)
fit <- lda.collapsed.gibbs.sampler(lda_corp$documents, K, lda_corp$vocab, 500, .5, .1)
exemp_tab(fit)
```
```r
# same pipeline, but keeping stop words and skipping stemming
ufo_corp_with_stops_no_stemming <- Corpus(VectorSource(ufo_text)) %>%   # create corpus
  tm_map(content_transformer(tolower)) %>%                              # lower case
  tm_map(removePunctuation) %>%                                         # remove punctuation
  tm_map(stripWhitespace)                                               # remove extra space

lda_corp_with_stops_no_stemming <- ufo_corp_with_stops_no_stemming %>%
  plyr::laply(as.character) %>%
  str_replace('(^| )[0-9]( |$)', ' [number] ') %>%
  str_replace('^ ', '') %>%
  lexicalize

K <- 10
set.seed(1234)
fit <- lda.collapsed.gibbs.sampler(lda_corp_with_stops_no_stemming$documents, K,
                                   lda_corp_with_stops_no_stemming$vocab, 500, .5, .1)
exemp_tab(fit)
```
```r
# remove stop words, but still skip stemming
ufo_corp <- Corpus(VectorSource(ufo_text)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords('english')) %>%
  # tm_map(stemDocument) %>%
  tm_map(stripWhitespace)

lda_corp <- ufo_corp %>%
  plyr::laply(as.character) %>%
  str_replace('(^| )[0-9]( |$)', ' [number] ') %>%
  str_replace('^ ', '') %>%
  lexicalize

K <- 10
set.seed(1234)
fit <- lda.collapsed.gibbs.sampler(lda_corp$documents, K, lda_corp$vocab, 500, .5, .1)
exemp_tab(fit)
```
The `topicmodels` package:

* Similar functionality, just a different implementation
* Takes a `DocumentTermMatrix` object as input
```r
library(topicmodels)

control <- list(seed = 1, iter = 1500, burnin = 500)
dtm <- DocumentTermMatrix(ufo_corp)

set.seed(1234)
fit2 <- LDA(dtm, k = 10, method = 'Gibbs', control = control)
```
Extract the five most probable terms per topic from the `topicmodels` fit:

```r
post <- posterior(fit2)

output <- post$terms %>%
  as_tibble %>%
  mutate(topic = row_number()) %>%
  gather(term, value, -topic) %>%
  group_by(topic) %>%
  top_n(5, value) %>%
  ungroup %>%
  arrange(topic, desc(value), term) %>%
  group_by(topic) %>%
  summarise(terms = paste(term, collapse = ', '))

output
```