knitr::opts_chunk$set(
  collapse = TRUE,
  warning = FALSE,
  message = FALSE,
  echo = FALSE,
  comment = "#>",
  fig.path = "figures/"
)

library(saaabstracts)
library(tm)
library(topicmodels)
library(readxl)
library(Rmpfr)
library(ggplot2)

Introduction

# read in the abstracts
general <- read_excel("../data/raw_data/General Abstracts.xlsx")
# prepare a mapping structure: the first item is the column holding the text to model, the others are metadata
m <- list(content = "Abstract",  # content
          id = "Abstract Id",      # metadata
          title = "Title", 
          author_first = "First Name",
          author_last = "Last Name",
          geo = "Geographic Focus",
          keyword1 = "Keyword1", 
          keyword2 = "Keyword2", 
          keyword3 = "Keyword3", 
          org = "Affiliation")
# Create a corpus object from the abstracts
general_corpus <- tm::Corpus(tm::DataframeSource(general), 
                             readerControl = list(reader = tm::readTabular(mapping = m)))

# create a document term matrix
general_tm <- tm::DocumentTermMatrix(general_corpus, 
                                     control = list(stemming = TRUE, 
                                                    stopwords = TRUE,
                                                    minWordLength = 2, 
                                                    removeNumbers = TRUE, 
                                                    removePunctuation = TRUE))
# Use TF-IDF to weight terms and remove low-information ones
term_tfidf <- 
  tapply(general_tm$v/slam::row_sums(general_tm)[general_tm$i], 
         general_tm$j, mean) * log2(tm::nDocs(general_tm)/slam::col_sums(general_tm > 0))
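# (this works on the sparse triplet slots of the DTM: $v holds the counts,
# $i the document index and $j the term index of each non-zero cell; for each
# term we take the mean length-normalised term frequency over the documents
# that contain it and multiply by log2(nDocs / document frequency),
# giving a mean tf-idf score per term)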

summary(term_tfidf)
# Median =  0.08989
## Keep only the terms (columns) with mean tf-idf >= the median
general_reduced_dtm <- general_tm[,term_tfidf >= summary(term_tfidf)[3]]
summary(slam::col_sums(general_reduced_dtm))
# save
saveRDS(general_reduced_dtm, "../data/derived_data/general_reduced_dtm.rds")
general_reduced_dtm <- readRDS("../data/derived_data/general_reduced_dtm.rds")
library(ldatuning)
result <- FindTopicsNumber(
  general_reduced_dtm,
  topics = seq(from = 2, to = 100, by = 2),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)

knitr::kable(result)
FindTopicsNumber_plot(result)
# Determine k, the optimal number of topics
harmonicMean <- function(logLikelihoods, precision = 2000L) {
  llMed <- median(logLikelihoods)
  as.double(llMed - log(Rmpfr::mean(exp(-Rmpfr::mpfr(logLikelihoods,
                                       prec = precision) + llMed))))
}
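# (this is the harmonic mean estimator of the marginal log-likelihood;
# shifting by the median log-likelihood keeps the exponents near zero so
# exp() does not overflow, and Rmpfr's arbitrary-precision numbers handle
# anything that still falls outside double range)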
# run a loop over the abstracts with different numbers of topics to find which number of topics fits best
seqk <- seq(2, 100, 1)
burnin <- 1000
iter <- 1000
keep <- 50
system.time(fitted_many <- 
              lapply(seqk, function(k) topicmodels::LDA(general_reduced_dtm, 
                                                        k = k,
                                                        method = "Gibbs",
                                                        control = list(burnin = burnin,
                                                                       iter = iter, 
                                                                       keep = keep) )))

saveRDS(fitted_many, "../data/derived_data/fitted_many.rds")
fitted_many <- readRDS("../data/derived_data/fitted_many.rds")
# extract the log-likelihoods from each fitted model
logLiks_many <- lapply(fitted_many, function(L)  L@logLiks[-c(1:(burnin/keep))])
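# (the log-likelihood is stored every keep-th iteration, so the first
# burnin/keep stored values come from the burn-in period and are dropped)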

# compute harmonic means
hm_many <- sapply(logLiks_many, function(h) harmonicMean(h))

# to know the optimal number of topics:
optimal_number_of_topics <- seqk[which.max(hm_many)]

# plot it
library(ggplot2)
lda_plot <- ggplot(data.frame(seqk, 
                             hm_many), 
                  aes(x=seqk, 
                      y=hm_many)) + 
  geom_path(lwd=1.5)  +
  xlab('Number of Topics') +
  ylab('Harmonic Mean') +
  geom_vline(xintercept = seqk[which.max(hm_many)],
             colour = "red") +
  annotate("text",
           x = 55, 
           y = -150000, 
           label = paste("The optimal number of topics is", 
                         seqk[which.max(hm_many)])) +
  theme_bw() +
  ggtitle(expression(atop("Latent Dirichlet Allocation Analysis of SAA General Abstracts", atop(italic("How many distinct topics in the abstracts?"), ""))))
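# display the plot
lda_plot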
# now run the model with the optimum number of topics
system.time(general_model <- topicmodels::LDA(general_reduced_dtm, 
                                              optimal_number_of_topics,
                                              method = "Gibbs", 
                                              control = list(iter=2000, 
                                                             seed = 0622)))

# explore the model
general_topics <- topicmodels::topics(general_model, 1)

## In this case I am returning the top 30 terms.
general_terms <- as.data.frame(topicmodels::terms(general_model, 30), 
                               stringsAsFactors = FALSE)
library(tidytext)
tidy_topics <- tidy(general_model, matrix = "beta")

library(ggplot2)
library(dplyr)

tidy_topics_top_terms <- tidy_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
tidy_topics_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
tidy_documents <- tidy(general_model, matrix = "gamma")

library(tidyr)
tidy_documents_topics_gamma <-
  tidy_documents %>% 
    spread(topic, gamma)

abstract_classifications <- 
  tidy_documents %>%
  group_by(document) %>%
  top_n(1, gamma) %>%
  ungroup() %>% 
  arrange(desc(gamma))
mydata <- tidy_documents_topics_gamma[,-1]
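# (the first column is the document id; we drop it so only the per-topic
# gamma values go into the clustering)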

# Determine number of clusters
library(NbClust)
nclust <- NbClust(mydata, method = "kmeans")

library(fpc)
pamk.best <- pamk(mydata)

# Determine number of clusters with the elbow method (within-groups sum of squares)
wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(mydata, 
    centers=i)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",
  ylab="Within groups sum of squares")

# K-Means Cluster Analysis
fit <- kmeans(mydata, 5) # 5 cluster solution
# get cluster means 
aggregate(mydata,by=list(fit$cluster),FUN=mean)
# append cluster assignment
mydata <- data.frame(mydata, fit$cluster)

Knapsack

How many sessions do we have?

library(readxl)
org_sessions <- read_excel("../data/raw_data/Organized Session Abstracts_2018_AB.xlsx")

gen_sessions <- read_excel("../data/raw_data/01 - gen_paper_sessions_FINAL(oct5)_DG_including_moves v2.xlsx")

# how many unique organised sessions? 
n_org_sessions <- length(unique(org_sessions$`Session Id`))

# how many unique general sessions? 
n_gen_sessions <- length(unique(gen_sessions$SESSION))

# total number of sessions
total_number_of_sessions <-  n_org_sessions + n_gen_sessions

For scheduling into rooms, we don't include the posters, because they go in the ballroom. How many 4-hour slots do we have? I looked at the grid in 'Rooom&Time_DG.xlsx' and counted up the 4-hour slots, excluding poster sessions because they are shorter than 4 hours.

library(tidyverse)
four_h_slots <- 
  tribble(~time,       ~Th, ~Fr, ~Sa, ~Su,
          "morning",    36,  31,  30,  28,
          "afternoon",  35,  32,  30,  0,
          "evening",    36,  0,    0,  0)

four_h_slots_long <- 
  four_h_slots %>% 
  gather(times, slot, -time)

# how many 4 h slots
n_four_h_slots <-  sum(four_h_slots_long$slot)

So that's r n_four_h_slots 4-hour slots, where we consider each slot a knapsack.

Now we need to pack the sessions into these slots. First we need to set the upper bounds on session duration, according to the SAA rules, and then compute how much time each session is likely to take, given the number of papers/speakers.
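As a quick sanity check of the arithmetic we use below (15 minutes per paper, capped at the session type's maximum duration; the paper count here is hypothetical):

n_papers <- 14                # a hypothetical session size
min(n_papers * 15 / 60, 4)    # 3.5 hours, which fits within the 4-hour Symposium cap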

# combine organised and general sessions
org_sessions$`Session Id` <- as.character(org_sessions$`Session Id`)
gen_sessions$`Session Id` <- gen_sessions$SESSION
gen_sessions$`Session Type` <- "Symposium"

gen_sessions_to_join <- 
gen_sessions %>% 
  select(names(gen_sessions)[names(gen_sessions) %in% names(org_sessions)])

all_nonposter_sessions <- 
  bind_rows(gen_sessions_to_join, org_sessions)

# how many papers per session
papers_per_session <- 
all_nonposter_sessions %>% 
  group_by(`Session Id`) %>% 
  tally(sort = TRUE)

ggplot(papers_per_session,
       aes(n)) +
  geom_histogram() +
  xlab("Papers per non-poster session") +
  theme_bw()

# what's the duration of each session, and the max allowed duration? 
session_types <- unique(all_nonposter_sessions$`Session Type`)

# this comes from 'Conflicts.docx'
session_time_limits <- 
  tribble(~`Session Type`,             ~max_time,
          "Poster Symposium",           2,
          "Symposium",                  4,
          "Electronic Symposium",       2, 
          "Forum" ,                     2, 
          "Lightning Rounds",           2)

# join time limits onto session data
session_with_limits <- 
  all_nonposter_sessions %>% 
  filter(`Session Type` != "Poster Symposium") %>% 
  left_join(session_time_limits)

# now let's see which sessions exceed the time limits
session_with_limits <- 
session_with_limits %>% 
  group_by(`Session Id`, `Session Type`, max_time ) %>% 
  tally(sort = TRUE) %>% 
  mutate(paper_time = n * 15/60) %>% 
  mutate(paper_time = if_else(paper_time > max_time,  max_time, paper_time))

Now we have a known number of 4-hour slots (knapsacks), and a known number of sessions of various lengths. We turn to http://www.sumsar.net/blog/2016/06/how-to-cut-your-planks-with-r/ to see how to arrange them most efficiently. Here is the original knapsack example that we'll try to adapt:

# Here is Rasmus' example: http://www.sumsar.net/blog/2016/06/how-to-cut-your-planks-with-r/

planks_we_have <- c(120, 137, 220, 420, 480)
planks_we_want <- c(19, 19, 19, 19, 79, 79, 79, 103, 103,
                    103, 135, 135, 135, 135, 160)

library(adagio)
# mknapsack calling signature is: mknapsack(values, weights, capacities)
solution <- mknapsack(planks_we_want, planks_we_want + 1, planks_we_have)
# Above I added +1 cm  to each length to compensate for the loss when sawing.
solution$ksack

# That is, cut plank length 1 from plank 1, plank length 2 from plank 4, etc.

# Now pretty printing what to cut so that we don't make mistakes...
assignment <- data.frame(
  cut_this = planks_we_have[solution$ksack],
  into_this = planks_we_want)
t(assignment[order(assignment[,1]), ])

Now let's substitute with our data:

# Convert the durations to minutes so we get integers; mknapsack doesn't work with decimals

# the capacities: the 4-hour slots we need to fill, in minutes
sessions_we_have <- rep(4, n_four_h_slots) * 60
# the items: the duration of each session, in minutes
sessions_we_want <- session_with_limits$paper_time * 60

library(adagio)
# mknapsack calling signature is: mknapsack(values, weights, capacities)
solution <- mknapsack(sessions_we_want, sessions_we_want, sessions_we_have)
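# (using each session's duration as both its value and its weight means the
# solver tries to pack each 4-hour slot as full as possible)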

solution$ksack
# That is, the i-th element of solution$ksack gives the slot that session i is assigned to

# Now pretty printing which session to put in which slot so that we don't make mistakes...
assignment <- data.frame(
  fill_this_slot = sessions_we_have[solution$ksack],
  with_this_session = sessions_we_want)
t(assignment[order(assignment[,1]), ])

# put the computed slot back onto the full details of the sessions
session_with_limits$computed_slot <- solution$ksack

That seems to work well, assuming we've got the time limits right. Now we arbitrarily assign the packed slots to the schedule grid, then check for conflicts.

four_h_slots_long$cumsum <- cumsum(four_h_slots_long$slot)

xx <- 
four_h_slots_long %>% 
  mutate(day_part = paste0(times, "-", time)) %>% 
  select(-time, -times) %>% 
  rename(end = cumsum) %>% 
  filter( slot != 0)

# get the number of the start slot to go with the number of the end slot
for (i in seq_len(nrow(xx))) {

  # the first range starts at slot 1; each later range starts one slot
  # after the previous range ends
  xx$start[i] <- if (i == 1) 1 else xx$end[i - 1] + 1

}

intervals <- 
xx %>% 
  select(-slot) %>% 
  select(day_part, start, end ) 

elements <- session_with_limits$computed_slot

library(fuzzyjoin)
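# fuzzy_left_join matches each computed slot number to the day-part whose
# start-end range contains it (start <= slot number <= end)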
join_intervals_and_elements <- 
fuzzy_left_join(data.frame(elements), 
                intervals, 
                by = c("elements" = "start", 
                       "elements" = "end"), 
                match_fun = list(`>=`, `<=`)) %>% 
  distinct()

# Which slot goes into which part of what day
slot_day_part <- 
join_intervals_and_elements %>% 
  arrange(elements, day_part)

# now we put the sessions on also
session_slot_day_part <- 
  session_with_limits %>% 
  left_join(slot_day_part, by  = c("computed_slot" = "elements"))

Working in a local Docker container

To run this project in a local Docker container, I start a bash shell in the project directory:

Do this one time only: build the Docker image from the Dockerfile at the top level of my project:

docker build - < Dockerfile
docker images

Take note of the image ID, which you get from docker images

Then run my Docker container, access it via the web browser, and link the project directory into the container. Make sure you're in a local directory that is shareable (C:\Users\yourname is usually good). Note that ${pwd} works in PowerShell; in bash use $(pwd):

docker run -dp 8787:8787 -v ${pwd}:/home/rstudio/ -e ROOT=TRUE  <image ID>

Then go to http://192.168.99.100:8787/ or localhost:8787 in your browser, log in (rstudio/rstudio), and start your RStudio project by double-clicking on the .Rproj file

When you're done, in the shell, stop all docker containers with:

docker stop $(docker ps -a -q)

Or do docker ps and docker stop <container ID> just to stop one container.

Colophon

This report was generated on r Sys.time() using the following computational environment and dependencies:

# which R packages and versions?
devtools::session_info()

The current Git commit details are:

# what commit is this file at? You may need to change the path value
# if your Rmd is not in analysis/paper/
git2r::repository("../..")

