knitr::opts_chunk$set(
  collapse = TRUE,
  warning = FALSE,
  message = FALSE,
  echo = FALSE,
  comment = "#>",
  fig.path = "figures/"
)

library(saaabstracts)
library(tm)
library(topicmodels)
library(readxl)
library(Rmpfr)
library(ggplot2)
library(dplyr)

Introduction

This analysis uses Latent Dirichlet Allocation (LDA) topic modelling to explore the general paper abstracts submitted to the SAA 2018 annual meeting, find a suitable number of topics for grouping them, and assign each abstract to its most probable topic.

# read in the abstracts
general_2018 <- read_excel("../data/raw_data/General Abstracts_2018.xlsx")

general_2018_posters <- general_2018 %>% filter(grepl("Poster", `Presentation Format`))
general_2018_papers  <- general_2018 %>% filter(grepl("Paper", `Presentation Format`))
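
As a quick sanity check (a sketch using the two data frames created above), we can count how the submissions split between the two presentation formats:

# how many posters and how many papers (quick check)
nrow(general_2018_posters)
nrow(general_2018_papers)
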
# prepare a mapping structure, first item is column with text to model, others are metadata
m <- list(content = "Abstract",  # content
          id = "Abstract Id",      # metadata
          title = "Title", 
          author_first = "First Name",
          author_last = "Last Name",
          geo = "Geographic Focus",
          keyword1 = "Keyword1", 
          keyword2 = "Keyword2", 
          keyword3 = "Keyword3", 
          org = "Affiliation")
# Create a corpus object from the abstracts
general_2018_papers_corpus <- tm::Corpus(tm::DataframeSource(general_2018_papers), 
                             readerControl = list(reader = tm::readTabular(mapping = m)))

# create a document term matrix
general_2018_papers_tm <- tm::DocumentTermMatrix(general_2018_papers_corpus, 
                                     control = list(stemming = TRUE, 
                                                    stopwords = TRUE,
                                                    minWordLength = 2, 
                                                    removeNumbers = TRUE, 
                                                    removePunctuation = TRUE))
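
Before weighting, it can be worth a quick look at the raw document-term matrix; this optional sketch uses standard tm functions:

# optional sanity check on the raw document-term matrix (sketch)
general_2018_papers_tm                                    # prints dimensions and sparsity
tm::findFreqTerms(general_2018_papers_tm, lowfreq = 50)   # stemmed terms appearing at least 50 times overall
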
# Use TF-IDF to weight terms and drop those with little discriminating power
term_2018_papers_tfidf <- 
  tapply(general_2018_papers_tm$v / slam::row_sums(general_2018_papers_tm)[general_2018_papers_tm$i], 
         general_2018_papers_tm$j, 
         mean) * 
  log2(tm::nDocs(general_2018_papers_tm) / slam::col_sums(general_2018_papers_tm > 0))

summary(term_2018_papers_tfidf)
# Median =   0.08120
## Keep only the terms (columns) with tf-idf >= the median
general_2018_papers_reduced_dtm <- 
  general_2018_papers_tm[, term_2018_papers_tfidf >= summary(term_2018_papers_tfidf)[3]]
summary(slam::col_sums(general_2018_papers_reduced_dtm))
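
Because the cut-off is the median tf-idf value, the filter keeps roughly half of the vocabulary; an optional check (a sketch) compares the number of terms before and after:

# optional check: vocabulary size before and after the tf-idf filter (sketch)
ncol(general_2018_papers_tm)             # terms before filtering
ncol(general_2018_papers_reduced_dtm)    # terms kept (tf-idf >= median)
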
# save
saveRDS(general_2018_papers_reduced_dtm, "../data/derived_data/general_2018_papers_reduced_dtm.rds")
general_2018_papers_reduced_dtm <- readRDS("../data/derived_data/general_2018_papers_reduced_dtm.rds")
number_of_paper_abstracts <- general_2018_papers_reduced_dtm$nrow
# Determine k, the number of topics, using the harmonic mean of the model log-likelihoods
harmonicMean <- function(logLikelihoods, precision = 2000L) {
  llMed <- median(logLikelihoods)
  as.double(llMed - log(Rmpfr::mean(exp(-Rmpfr::mpfr(logLikelihoods,
                                       prec = precision) + llMed))))
}
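
Following the harmonic-mean approach described by Griffiths & Steyvers (2004), this function approximates the marginal likelihood of the data for a given number of topics from the log-likelihoods kept during Gibbs sampling. A minimal illustration with made-up values (not model output) shows how it is called:

# illustrative call on made-up log-likelihood values (not real model output)
example_logliks <- c(-120500, -120450, -120480, -120470)
harmonicMean(example_logliks)
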
# fit models over a range of topic numbers to find which number of topics fits the abstracts best
seqk <- seq(2, 100, 1)
burnin <- 1000
iter <- 1000
keep <- 50
system.time(fitted_many_2018_papers <- 
              lapply(seqk, function(k) topicmodels::LDA(general_2018_papers_reduced_dtm, 
                                                        k = k,
                                                        method = "Gibbs",
                                                        control = list(burnin = burnin,
                                                                       iter = iter, 
                                                                       keep = keep) )))

saveRDS(fitted_many_2018_papers, "../data/derived_data/fitted_many_2018_papers.rds")
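
Fitting a Gibbs-sampled model for every value of k from 2 to 100 is slow, which is why the fitted models are saved above. If a quicker first pass is wanted, a coarser grid could be used instead (a sketch, not part of the original run):

# optional quicker first pass over a coarser grid of k values (sketch)
seqk_coarse <- seq(10, 100, 10)
fitted_coarse <- lapply(seqk_coarse, function(k)
  topicmodels::LDA(general_2018_papers_reduced_dtm, k = k, method = "Gibbs",
                   control = list(burnin = burnin, iter = iter, keep = keep)))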

# read the fitted models back in, so the steps below can start from the saved results
fitted_many_2018_papers <- readRDS("../data/derived_data/fitted_many_2018_papers.rds")

# how many abstracts were modelled (recovered from the fitted models, in case we start from the saved results)
how_many_paper_abstracts <- fitted_many_2018_papers[[1]]@Dim[1]
# extract logliks from each topic
logLiks_many_2018_papers <- lapply(fitted_many_2018_papers, function(L)  L@logLiks[-c(1:(burnin/keep))])

# compute harmonic means
hm_many_2018_papers <- sapply(logLiks_many_2018_papers, function(h) harmonicMean(h))

# the optimal number of topics is the one that maximises the harmonic mean
optimal_number_of_topics_2018_papers <- seqk[which.max(hm_many_2018_papers)]

# plot it
library(ggplot2)
lda_plot <- ggplot(data.frame(seqk, 
                              hm_many_2018_papers), 
                   aes(x = seqk, 
                       y = hm_many_2018_papers)) + 
  geom_path(size = 1.5) +
  xlab('Number of Topics') +
  ylab('Harmonic Mean') +
  geom_vline(xintercept = optimal_number_of_topics_2018_papers,
             colour = "red") +
  annotate("text",
           x = 55, 
           y = -150000, 
           label = paste("The optimal number of topics is", 
                         optimal_number_of_topics_2018_papers)) +
  theme_bw() +
  ggtitle("Latent Dirichlet Allocation Analysis of SAA General Abstracts",
          subtitle = paste0("How many distinct topics are present in the general paper abstracts (n = ", how_many_paper_abstracts, ") for 2018?"))

ggsave("../figures/saa2018_general_abstracts_papers_opti_topics.png")
# now run the model with the optimum number of topics
system.time(general_model_papers <- topicmodels::LDA(general_2018_papers_reduced_dtm, 
                                              optimal_number_of_topics_2018_papers,
                                              method = "Gibbs", 
                                              control = list(iter=2000, 
                                                             seed = 0622)))

# explore the model
general_topics_papers <- topicmodels::topics(general_model_papers, 1)

## Return the top 30 terms for each topic.
general_terms_papers <- as.data.frame(topicmodels::terms(general_model_papers, 30), 
                               stringsAsFactors = FALSE)
library(tidytext)
tidy_topics_papers <- tidy(general_model_papers, matrix = "beta")

library(ggplot2)
library(dplyr)

tidy_topics_papers_top_terms <- tidy_topics_papers %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
# show top keywords for each topic
tidy_topics_papers_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

ggsave("../figures/keywords_for_saa2018_general_papers_topics.png", h = 14, w = 14)
tidy_documents_papers <- tidy(general_model_papers, matrix = "gamma")

library(tidyr)
tidy_documents_papers_topics_gamma <-
  tidy_documents_papers %>% 
    spread(topic, gamma)

abstract_classifications_papers <- 
  tidy_documents_papers %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup() %>% 
  arrange(desc(gamma)) %>% 
  mutate(document = as.numeric(document))

abstract_classifications_papers
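
Note that top_n() keeps ties, so an abstract whose two highest topics have exactly equal gamma values will appear twice. If a strictly one-to-one classification is needed, a sketch like the following (the _unique name is introduced here for illustration) keeps a single row per abstract:

# keep exactly one topic per abstract in case of tied gamma values (sketch)
abstract_classifications_papers_unique <- 
  abstract_classifications_papers %>% 
  group_by(document) %>% 
  slice(1) %>% 
  ungroup()
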
# put topics back onto abstract spreadsheet
# abstract_ids <- dimnames(general_2018_reduced_dtm)$Docs
# general_2018$`Abstract Id`

general_2018_papers_with_topics <- 
  general_2018_papers %>% 
  left_join(abstract_classifications_papers, 
            by = c("Abstract Id" = "document"))

write.csv(general_2018_papers_with_topics, "../data/derived_data/general_2018_papers_with_topics.csv")
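
As a quick check (a sketch), we can confirm that every paper abstract received a topic assignment after the join:

# quick check: how many abstracts ended up without a topic after the join? (sketch)
sum(is.na(general_2018_papers_with_topics$topic))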

Colophon

This report was generated on `r Sys.time()` using the following computational environment and dependencies:

# which R packages and versions?
devtools::session_info()

The current Git commit details are:

# what commit is this file at? You may need to change the path value
# if your Rmd is not in analysis/paper/
git2r::repository("../..")

