This is my first test with Rmd notebooks, and am looking at it as an alternative to notes in, e.g., Evernote. A bunch of code is copied-and-pasted from the history, because I was exploring in the console while the big test of optimal K for LDA was running.

# library(broom)
library(dplyr)
library(ggplot2)
library(hcphb)
# library(ldatuning)
library(parallel)
library(readr)
library(stringr)
library(tm)
library(topicmodels)
library(viridis)
# library(wordnet)

data("hcp_rev_sent")
data("hcp_cur_sent")

hcp_cur <- unlist(hcp_cur_sent, use.names = FALSE)
hcp_rev <- unlist(hcp_rev_sent, use.names = FALSE)

Data prep

# # first the CUR handbook
n_5 <- ceiling(length(hcp_cur) / 5)
cur_5 <- list()
for(i in 1:n_5) {
  st <- (i * 5) - 4
  en <- i * 5
  cur_5[[i]] <- paste(hcp_cur[st:en], collapse = " ")
}
cur_5_corp <- VCorpus(VectorSource(cur_5))

n_10 <- ceiling(length(hcp_cur) / 10)
cur_10 <- list()
for(i in 1:n_10) {
  st <- (i * 10) - 9
  en <- i * 10
  cur_10[[i]] <- paste(hcp_cur[st:en], collapse = " ")
}
cur_10_corp <- VCorpus(VectorSource(cur_10))

# # now the REV handbook
n_5 <- ceiling(length(hcp_rev) / 5)
rev_5 <- list()
for(i in 1:n_5) {
  st <- (i * 5) - 4
  en <- i * 5
  rev_5[[i]] <- paste(hcp_rev[st:en], collapse = " ")
}
rev_5_corp <- VCorpus(VectorSource(rev_5))

n_10 <- ceiling(length(hcp_rev) / 10)
rev_10 <- list()
for(i in 1:n_10) {
  st <- (i * 10) - 9
  en <- i * 10
  rev_10[[i]] <- paste(hcp_rev[st:en], collapse = " ")
}
rev_10_corp <- VCorpus(VectorSource(rev_10))

Now we clean up the corpuses, e.g., removing stopwords, punctuation, and other miscellany.

cleanup <- function(x) {
  x <- tm_map(x, content_transformer(tolower))
  x <- tm_map(x, removeWords, stopwords("english"))
  x <- tm_map(x, removePunctuation)
}

cur_5_clean <- cleanup(cur_5_corp)
cur_10_clean <- cleanup(cur_10_corp)
rev_5_clean <- cleanup(rev_5_corp)
rev_10_clean <- cleanup(rev_10_corp)

LDA is performed on a document-term matrix (DTM), and we construct the DTMs for all four corpuses using tm::DocumentTermMatrix. Infrequent words rarely add much understanding to topic models, so we also remove terms with a frequency < 4.

cur_5_DTM <- DocumentTermMatrix(cur_5_clean,
                                control = list(minWordLength = 3))
cur_10_DTM <- DocumentTermMatrix(cur_10_clean,
                                 control = list(minWordLength = 3))
rev_5_DTM <- DocumentTermMatrix(rev_5_clean,
                                control = list(minWordLength = 3))
rev_10_DTM <- DocumentTermMatrix(rev_10_clean,
                                 control = list(minWordLength = 3))

cat(paste("# cur_5_DTM terms:", cur_5_DTM$ncol, "\n"))
cat(paste("# cur_10_DTM terms:", cur_10_DTM$ncol, "\n"))
cat(paste("# rev_5_DTM terms:", rev_5_DTM$ncol, "\n"))
cat(paste("# rev_10_DTM terms:", rev_10_DTM$ncol, "\n"))

So there are many more terms in CUR than in REV...

cur_5_DTM_slim <- cur_5_DTM[ , which(table(cur_5_DTM$j) >= 5)]
cur_10_DTM_slim <- cur_10_DTM[ , which(table(cur_10_DTM$j) >= 5)]
rev_5_DTM_slim <- rev_5_DTM[ , which(table(rev_5_DTM$j) >= 5)]
rev_10_DTM_slim <- rev_10_DTM[ , which(table(rev_10_DTM$j) >= 5)]

cat(paste("# cur_5_DTM_slim terms:", cur_5_DTM_slim$ncol, "\n"))
cat(paste("# cur_10_DTM_slim terms:",cur_10_DTM_slim$ncol, "\n"))
cat(paste("# rev_5_DTM_slim terms:", rev_5_DTM_slim$ncol, "\n"))
cat(paste("# rev_10_DTM_slim terms:",rev_10_DTM_slim$ncol, "\n"))

And that holds after filtering the low-frequency terms.

An LDA test

First we do LDA with k = 25 and 50, then focus on k = 25 to compare topics. One way to do topic comparison is by converting the words associated with each topic into a document-term matrix (DTM; with topic ~ document), then calculating distances (euclidean by default) from the DTM:

# Uncomment the block below to-rerun analysis.
Ks <- c(25)
test <- parallel::mclapply(Ks,
            LDA,
            x = cur_10_DTM_slim,
            method = "Gibbs",
            control = list(seed = 742,
                           burnin = 1000,
                           thin = 100,
                           iter = 1000),
            mc.cores = 2)
z <- data.frame(terms(test[[1]], 30), stringsAsFactors = FALSE)
head(z)
w <- as.list(z)
w <- lapply(w, paste, collapse = " ")
w[[1]]
w_corp <- VCorpus(VectorSource(w))
w_DTM <- DocumentTermMatrix(w_corp)
dim(w_DTM)
w_dist <- dist(w_DTM)
head(w_dist)
plot(hclust(w_dist))

Based on this simple approach, it looks like there are six clusters of topics. We can compare the lists of some pairs of topics to see if they make sense:

w[[1]]
w[[15]]
cor.test(as.vector(w_DTM[1,]), as.vector(w_DTM[15,]))
cor.test(as.vector(w_DTM[1,]), as.vector(w_DTM[2,]))

So there is a statistically significant correlation between the presence-absence data for topics 1 and 15, but 1 and 2 (in different topic clusters) are uncorrelated. Across all topics the distribution of correlations is:

w_DTM_mat <- as.matrix(w_DTM)
w_cor_mat <- cor(t(w_DTM_mat))
for_hist <- data.frame(cor = w_cor_mat[w_cor_mat < 1])
ggplot2::qplot(data = for_hist, x = cor, geom = "histogram", bins = 7) + 
  ggthemes::theme_hc()

We see that most correlations are centered around zero, so even though the clusters appear in the dendrogram, there's not a whole lot of separation. And the correlations as a heatmap:

# cor_mat <- reshape::melt(w_cor_mat)
# cor_mat <- filter(cor_mat, value < 1)
# ggplot(data=cor_mat, aes(x = X1, y = X2)) +
#   geom_tile(aes(fill = value)) +
#   labs(x = "Topic",
#        y = "Topic") +
#   scale_fill_viridis() +
#   theme_bw()

As expected, the patterns here (e.g., topics 1 & 15, 12 & 17) are captured in the dendrogram. Note that even the strongest correlations are still relatively weak: the correlation for 1 & 15 is just 0.18. Whether this speaks to the power of separation of topics from LDA or a weakness because topics are so different is unclear. Actually...beyond the correlation matrix, there's a role here for PCA. In theory, LDA will find K unique topics from the corpus. If analyzed with PCA, we should have 25 maximally orthogonal dimensions.

# top_pca <- FactoMineR::PCA(t(w_DTM_mat))
# eig <- data.frame(top_pca$eig)
# plot(eig$percentage.of.variance, type = "b")

Interim conclusion

There is probably some utility in examining the similarities of topics as done above. There may be opportunities to improve this analysis, such as:

WordNet on top of LDA

So this is going to take more "doing" than I feel like I have time for at the moment...

# setDict("/usr/local/Cellar/wordnet/3.1/dict")
# tmp <- z$Topic.1
# res <- list()
# for(i in 1:length(tmp)) {
#   print(paste(i, tmp[i]))
#   res[[i]] <- NA
#   filt <- getTermFilter("ExactMatchFilter", tmp[i], TRUE)
#   term <- getIndexTerms(c("NOUN", "VERB"), 5, filt)
#   if(identical(term, NULL)) {
#     res[[i]] <- NA
#   } else {
#     for(j in length(term)) {
#       res[[i]] <- c(res[[i]], getSynonyms(term[[j]]))
#     }
#   }
# }

Topics @ K = 90

# cur_10_lda_90 <- LDA(x = cur_10_DTM_slim, 
#                      k = 90, 
#                      method = "Gibbs",
#                      control = list(seed = 742, 
#                                     burnin = 1000, 
#                                     thin = 100,
#                                     iter = 1000))
# save(cur_10_lda_90, file = "cur_10_lda_90.rda")
# 
# rev_10_lda_90 <- LDA(x = rev_10_DTM_slim, 
#                      k = 90, 
#                      method = "Gibbs",
#                      control = list(seed = 742, 
#                                     burnin = 1000, 
#                                     thin = 100,
#                                     iter = 1000))
# save(rev_10_lda_90, file = "rev_10_lda_90.rda")

cur_10_ctm_90 <- CTM(x = cur_10_DTM_slim, 
                     k = 90, method = "VEM",
                     control = list(seed = 742))
save(cur_10_ctm_90, file = "cur_10_ctm_90.rda")

rev_10_ctm_90 <- CTM(x = rev_10_DTM_slim, 
                     k = 90, method = "VEM",
                     control = list(seed = 742))
save(rev_10_ctm_90, file = "rev_10_ctm_90.rda")
load("rev_10_lda_90.rda")
load("cur_10_lda_90.rda")
rev_top30 <- data.frame(topicmodels::terms(rev_10_lda_90, 30), 
                        stringsAsFactors = FALSE)
names(rev_top30) <- paste0("rev_", names(rev_top30))
cur_top30 <- data.frame(topicmodels::terms(cur_10_lda_90, 30), 
                        stringsAsFactors = FALSE)
names(cur_top30) <- paste0("cur_", names(cur_top30))
top30 <- cbind(rev_top30, cur_top30)

# Now convert to a df to get the DTM
top30_ls <- as.list(top30)
top30_cat <- lapply(top30_ls, paste, collapse = " ")
top30_cat[[1]]
top30_corp <- VCorpus(VectorSource(top30_cat))
top30_dtm <- DocumentTermMatrix(top30_corp)
dim(top30_dtm)
top30_dist <- dist(top30_dtm)
plot(hclust(top30_dist))
plot(hclust(top30_dist), hang = -1)
top30_mat <- as.matrix(top30_dist)

nam <- colnames(top30_mat)
cut <- 7
res <- data.frame(r = NA, c = NA, val = NA)
for(i in 1:length(nam)) {
  for(j in 1:length(nam)) {
    if(j < i) {
      if(top30_mat[i, j] < cut & top30_mat[i, j] > 0) {
        dat <- c(r = nam[i], c = nam[j], val = top30_mat[i, j])
        res <- rbind(res, dat)
      }
    }
  }
}
res <- res[-1, ]
dim(res)

rev_topic_assign <- topics(rev_10_lda_90)
rev_T64 <- rev_topic_assign[rev_topic_assign == 64]
rev_T64_blk <- rev_10[as.numeric(names(rev_T64))]
rev_towrite <- paste(unlist(rev_T64_blk), collapse = "\n\n")
write(rev_towrite, file = "rev_topic64_chunks.txt")

cur_topic_assign <- topics(cur_10_lda_90)
cur_T19 <- cur_topic_assign[cur_topic_assign == 19]
cur_T19_blk <- cur_10[as.numeric(names(cur_T19))]
cur_towrite <- paste(unlist(cur_T19_blk), collapse = "\n\n")
write(cur_towrite, file = "cur_topic19_chunks.txt")
cur_uncert <- cur_10[grep(cur_10, pattern = "uncertain")]
rev_uncert <- rev_10[grep(rev_10, pattern = "uncertain")]


jacob-ogre/hcphb documentation built on May 18, 2019, 8:01 a.m.