                 doc_names = movie_review$id, # document names, optional
                 ngram_window = c(1, 2), # minimum and maximum n-gram length
                 stopword_vec = c(stopwords::stopwords("en"), # stopwords from the stopwords package
                                  stopwords::stopwords(source = "smart")), # this is the default value
                 lower = TRUE, # lowercase - this is the default value
                 remove_punctuation = TRUE, # punctuation - this is the default
                 remove_numbers = TRUE, # numbers - this is the default
                 verbose = FALSE, # turn off status bar for this demo
                 cpus = 2) # by default, this will be the max number of cpus available

Even though a dgCMatrix isn't a traditional matrix, it has methods that make it behave like a standard R matrix.

dim(dtm)
nrow(dtm)
ncol(dtm)

head(colnames(dtm))

knitr::kable(head(colnames(dtm)), col.names = "colnames(dtm)") # tokens

head(rownames(dtm))

knitr::kable(head(rownames(dtm)), col.names = "rownames(dtm)") # document IDs

# Basic corpus statistics

The code below calculates some basic corpus statistics. textmineR has a built-in function, TermDocFreq, for getting term frequencies across the corpus. It returns term frequencies (equivalent to colSums(dtm)), the number of documents in which each term appears (equivalent to colSums(dtm > 0)), and an inverse document frequency (IDF) vector. The IDF vector can be used to create a TF-IDF matrix.

# get counts of tokens across the corpus
tf_mat <- TermDocFreq(dtm = dtm)

str(tf_mat)

# look at the most frequent tokens
head(tf_mat[ order(tf_mat$term_freq, decreasing = TRUE) , ], 10)

knitr::kable(head(tf_mat[ order(tf_mat$term_freq, decreasing = TRUE) , ], 10),
             caption = "Ten most frequent tokens")

# look at the most frequent bi-grams
tf_bigrams <- tf_mat[ stringr::str_detect(tf_mat$term, "_") , ]

head(tf_bigrams[ order(tf_bigrams$term_freq, decreasing = TRUE) , ], 10)

knitr::kable(head(tf_bigrams[ order(tf_bigrams$term_freq, decreasing = TRUE) , ], 10),
             caption = "Ten most frequent bi-grams")


It looks like we have stray HTML tags ("\<br>") in the documents. These don't give us any relevant information about the content. (Except, perhaps, that these documents were originally part of web pages.)

The most intuitive approach, perhaps, is to strip these tags from our documents, re-construct the document term matrix, and re-calculate the objects above. However, a simpler approach is to remove the tokens containing "br" from the DTM we already have. This is much more computationally efficient and gives the same result.

# remove offending tokens from the DTM
dtm <- dtm[ , ! stringr::str_detect(colnames(dtm),
"(^br$)|(_br$)|(^br_)") ]

# re-construct tf_mat and tf_bigrams
tf_mat <- TermDocFreq(dtm)

tf_bigrams <- tf_mat[ stringr::str_detect(tf_mat$term, "_") , ]

head(tf_mat[ order(tf_mat$term_freq, decreasing = TRUE) , ], 10)

knitr::kable(head(tf_mat[ order(tf_mat$term_freq, decreasing = TRUE) , ], 10),
             caption = "Ten most frequent terms, '\\<br\\>' removed")

head(tf_bigrams[ order(tf_bigrams$term_freq, decreasing = TRUE) , ], 10)

knitr::kable(head(tf_bigrams[ order(tf_bigrams$term_freq, decreasing = TRUE) , ], 10),
             caption = "Ten most frequent bi-grams, '\\<br\\>' removed")

We can also calculate how many tokens each document contains directly from the DTM. Note that this reflects the modifications we made when constructing the DTM (removing stop words, punctuation, numbers, etc.).

# summary of document lengths
doc_lengths <- rowSums(dtm)

summary(doc_lengths)

Often, it's useful to prune your vocabulary by removing tokens that appear in only a small number of documents. This greatly reduces the vocabulary size (see Zipf's law) and improves computation time.

# remove any tokens that appear in 3 or fewer documents
dtm <- dtm[ , colSums(dtm > 0) > 3 ] # alternatively: dtm[ , tf_mat$doc_freq > 3 ]

tf_mat <- tf_mat[ tf_mat$term %in% colnames(dtm) , ]

tf_bigrams <- tf_bigrams[ tf_bigrams$term %in% colnames(dtm) , ]
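The Zipf's law point is easy to visualize. The rank-frequency plot below is purely illustrative and is not needed for anything that follows: even after pruning, term frequencies drop off steeply with rank, which is why removing rare tokens cuts the vocabulary so sharply.

# illustrative only: term frequency vs. rank on log-log axes shows the heavy-tailed Zipf shape
plot(sort(tf_mat$term_freq, decreasing = TRUE), log = "xy",
     xlab = "term rank", ylab = "term frequency")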


The movie review data set contains more than just the text of the reviews. It also contains a variable tagging each review as positive (movie_review$sentiment == 1) or negative (movie_review$sentiment == 0). We can examine the terms associated with positive and negative reviews. If we wanted to, we could even use them to build a simple classifier; a rough sketch of that idea appears at the end of this section.

However, as we will see immediately below, looking at only the most frequent terms in each category is not helpful. Because of Zipf's law, the most frequent terms in just about any category will be the same.

# what words are most associated with sentiment?
tf_sentiment <- list(positive = TermDocFreq(dtm[ movie_review$sentiment == 1 , ]),
                     negative = TermDocFreq(dtm[ movie_review$sentiment == 0 , ]))


As the tables below show, the two lists are basically the same. Not helpful at all.

head(tf_sentiment$positive[ order(tf_sentiment$positive$term_freq, decreasing = TRUE) , ], 10)

knitr::kable(head(tf_sentiment$positive[ order(tf_sentiment$positive$term_freq, decreasing = TRUE) , ], 10),
             caption = "Ten most-frequent positive tokens")

head(tf_sentiment$negative[ order(tf_sentiment$negative$term_freq, decreasing = TRUE) , ], 10)

knitr::kable(head(tf_sentiment$negative[ order(tf_sentiment$negative$term_freq, decreasing = TRUE) , ], 10),
             caption = "Ten most-frequent negative tokens")


That was unhelpful. Instead, we need to re-weight the terms in each class. We'll use a probabilistic reweighting, described below.

The most frequent words in each class are proportional to $P(word|sentiment_j)$. As we saw above, that puts the words in essentially the same order as the overall $P(word)$. However, we can use the difference between those probabilities to get a new ordering. That difference is

\begin{align} P(word|sentiment_j) - P(word) \end{align}

You can interpret the difference in (1) as follows: a positive value means the word is more probable in that sentiment class than in the corpus overall; a negative value means it is less probable; a value close to zero means the word is roughly independent of sentiment. Since most of the top words under $P(word|sentiment_j)$ are the same in both classes, they are roughly independent of sentiment and get pushed towards zero.

For those paying close attention, this difference should give a similar ordering to pointwise mutual information (PMI), defined as $PMI = \log\frac{P(word|sentiment_j)}{P(word)}$. However, I prefer the difference because it is bounded between $-1$ and $1$.

The code below applies the difference re-weighting to both single words and bi-grams.

# let's reweight by probability by class
p_words <- colSums(dtm) / sum(dtm) # alternatively: tf_mat$term_freq / sum(tf_mat$term_freq)

tf_sentiment$positive$conditional_prob <-
tf_sentiment$positive$term_freq / sum(tf_sentiment$positive$term_freq)

tf_sentiment$positive$prob_lift <- tf_sentiment$positive$conditional_prob - p_words

tf_sentiment$negative$conditional_prob <-
tf_sentiment$negative$term_freq / sum(tf_sentiment$negative$term_freq)

tf_sentiment$negative$prob_lift <- tf_sentiment$negative$conditional_prob - p_words

# let's look again with new weights
head(tf_sentiment$positive[ order(tf_sentiment$positive$prob_lift, decreasing = TRUE) , ], 10)

knitr::kable(head(tf_sentiment$positive[ order(tf_sentiment$positive$prob_lift, decreasing = TRUE) , ], 10),
             caption = "Reweighted: ten most relevant terms for positive sentiment")

head(tf_sentiment$negative[ order(tf_sentiment$negative$prob_lift, decreasing = TRUE) , ], 10)

knitr::kable(head(tf_sentiment$negative[ order(tf_sentiment$negative$prob_lift, decreasing = TRUE) , ], 10),
             caption = "Reweighted: ten most relevant terms for negative sentiment")
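If you want to check the PMI comparison made earlier for yourself, you can also rank terms by the log ratio instead of the difference. This snippet is an optional side check only, and it relies on the same alignment between tf_sentiment and p_words that the code above assumes:

# optional: rank positive-class terms by log(P(word|positive) / P(word))
pmi_positive <- log(tf_sentiment$positive$conditional_prob / p_words)

head(tf_sentiment$positive$term[ order(pmi_positive, decreasing = TRUE) ], 10)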

# what about bi-grams?
tf_sentiment_bigram <- lapply(tf_sentiment, function(x){
x <- x[ stringr::str_detect(x$term, "_") , ]
x[ order(x$prob_lift, decreasing = TRUE) , ]
})

head(tf_sentiment_bigram$positive, 10)

knitr::kable(head(tf_sentiment_bigram$positive, 10),
             caption = "Reweighted: ten most relevant bigrams for positive sentiment")

head(tf_sentiment_bigram$negative, 10)

knitr::kable(head(tf_sentiment_bigram$negative, 10),
             caption = "Reweighted: ten most relevant bigrams for negative sentiment")

