knitr::opts_chunk$set( collapse = TRUE, warning = FALSE, message = FALSE, echo = FALSE, comment = "#>", fig.path = "figures" ) library(saaabstracts) library(tm) library(topicmodels) library(readxl) library(Rmpfr) library(ggplot2)
# read in the abstracts general_2018 <- read_excel("../data/raw_data/General Abstracts_2018.xlsx") general_2018_posters <- general_2018 %>% filter(grepl("Poster", `Presentation Format`)) general_2018_papers <- general_2018 %>% filter(grepl("Paper", `Presentation Format`))
# prepare a mapping structure, first item is column with text to model, others are metadata m <- list(content = "Abstract", # content id = "Abstract Id", # metadata title = "Title", author_first = "First Name", author_last = "Last Name", geo = "Geographic Focus", keyword1 = "Keyword1", keyword2 = "Keyword2", keyword3 = "Keyword3", org = "Affiliation")
# Create a corpus object from the abstracts general_2018_papers_corpus <- tm::Corpus(tm::DataframeSource(general_2018_papers), readerControl = list(reader = tm::readTabular(mapping = m))) # create a document term matrix general_2018_papers_tm <- tm::DocumentTermMatrix(general_2018_papers_corpus, control = list(stemming = TRUE, stopwords = TRUE, minWordLength = 2, removeNumbers = TRUE, removePunctuation = TRUE))
# Use TF-IDF to wieght terms and remove rare ones term_2018_papers_tfidf <- tapply(general_2018_papers_tm$v/slam::row_sums(general_2018_papers_tm)[general_2018_papers_tm$i], general_2018_papers_tm$j, mean) * log2(tm::nDocs(general_2018_papers_tm)/slam::col_sums(general_2018_papers_tm > 0)) summary(term_2018_papers_tfidf) # Median = 0.08120
## Keeping the rows with tfidf >= median general_2018_papers_reduced_dtm <- general_2018_papers_tm[,term_2018_papers_tfidf >= summary(term_2018_papers_tfidf)[3]] summary(slam::col_sums(general_2018_papers_reduced_dtm)) # save saveRDS(general_2018_papers_reduced_dtm, "../data/derived_data/general_2018_papers_reduced_dtmrds")
general_2018_papers_reduced_dtm <- readRDS("../data/derived_data/general_2018_papers_reduced_dtm.rds") number_of_paper_abstracts <- general_2018_papers_reduced_dtm$nrow
# Determine k number of topics harmonicMean <- function(logLikelihoods, precision = 2000L) { llMed <- median(logLikelihoods) as.double(llMed - log(Rmpfr::mean(exp(-Rmpfr::mpfr(logLikelihoods, prec = precision) + llMed)))) }
# run a loop over the abstract with different numbers of topics to find which number of topics is the best seqk <- seq(2, 100, 1) burnin <- 1000 iter <- 1000 keep <- 50
system.time(fitted_many_2018_papers <- lapply(seqk, function(k) topicmodels::LDA(general_2018_papers_reduced_dtm, k = k, method = "Gibbs", control = list(burnin = burnin, iter = iter, keep = keep) ))) saveRDS(fitted_many_2018_papers, "../data/derived_data/fitted_many_2018_papers.rds") # rerun with k = 60 to force the abstracts into 60 topics
fitted_many_2018_papers <- readRDS("../data/derived_data/fitted_many_2018_papers.rds") # how many abstracts, if we just start from here? how_many_paper_abstracts <- fitted_many_2018_papers[[1]]@Dim[1]
# extract logliks from each topic logLiks_many_2018_papers <- lapply(fitted_many_2018_papers, function(L) L@logLiks[-c(1:(burnin/keep))]) # compute harmonic means hm_many_2018_papers <- sapply(logLiks_many_2018_papers, function(h) harmonicMean(h)) # to know the optimal number of topics: optimal_number_of_topics_2018_papers <- seqk[which.max(hm_many_2018_papers)] # plot it library(ggplot2) lda_plot <- ggplot(data.frame(seqk, hm_many_2018_papers), aes(x=seqk, y=hm_many_2018_papers)) + geom_path(lwd=1.5) + xlab('Number of Topics') + ylab('Harmonic Mean') + geom_vline(xintercept = optimal_number_of_topics_2018_papers, colour = "red") + annotate("text", x = 55, y = -150000, label = paste("The optimal number of topics is", optimal_number_of_topics_2018_papers)) + theme_bw() + ggtitle("Latent Dirichlet Allocation Analysis of SAA General Abstracts", subtitle = paste0("How many distinct topics are present in the general paper abstracts (n = ", how_many_paper_abstracts, ") for 2018?")) ggsave("../figures/saa2018_general_abstracts_papers_opti_topics.png")
# now run the model with the optimum number of topics system.time(general_model_papers <- topicmodels::LDA(general_2018_papers_reduced_dtm, optimal_number_of_topics_2018_papers, method = "Gibbs", control = list(iter=2000, seed = 0622))) # explore the model general_topics_papers <- topicmodels::topics(general_model_papers, 1) ## In this case I am returning the top 30 terms. general_terms_papers <- as.data.frame(topicmodels::terms(general_model_papers, 30), stringsAsFactors = FALSE)
library(tidytext) tidy_topics_papers <- tidy(general_model_papers, matrix = "beta") library(ggplot2) library(dplyr) tidy_topics_papers_top_terms <- tidy_topics_papers %>% group_by(topic) %>% top_n(10, beta) %>% ungroup() %>% arrange(topic, -beta)
# show top keywords for each topic tidy_topics_papers_top_terms %>% mutate(term = reorder(term, beta)) %>% ggplot(aes(term, beta, fill = factor(topic))) + geom_col(show.legend = FALSE) + facet_wrap(~ topic, scales = "free") + coord_flip() ggsave("../figures/keywords_for_saa2018_general_papers_topics.png", h = 14, w = 14)
tidy_documents_papers <- tidy(general_model_papers, matrix = "gamma") library(tidyr) tidy_documents_papers_topics_gamma <- tidy_documents_papers %>% spread(topic, gamma) abstract_classifications_papers <- tidy_documents_papers %>% group_by(document) %>% top_n(1, gamma) %>% ungroup() %>% arrange(desc(gamma)) %>% mutate(document = as.numeric(document)) abstract_classifications_papers
# put topics back onto abstract spreadsheet # abstract_ids <- dimnames(general_2018_reduced_dtm)$Docs # general_2018$`Abstract Id` general_2018_papers_with_topics <- general_2018_papers %>% left_join(abstract_classifications_papers, by = c("Abstract Id" = "document")) write.csv(general_2018_papers_with_topics, "../data/derived_data/general_2018_papers_with_topics.csv")
This report was generated on r Sys.time()
using the following computational environment and dependencies:
# which R packages and versions? devtools::session_info()
The current Git commit details are:
# what commit is this file at? You may need to change the path value # if your Rmd is not in analysis/paper/ git2r::repository("../..")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.