In benmarwick/confschedlr: Organise submitted papers into time slots to schedule a conference

# Global knitr chunk options for this report: hide code, warnings and
# messages, and prefix printed output with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  warning = FALSE,
  message = FALSE,
  echo = FALSE,
  comment = "#>",
  # fig.path is a prefix that knitr pastes directly in front of the chunk
  # label, so the trailing slash is needed for plots to be written inside a
  # figures/ directory rather than as "figures<label>-1.png" next to the
  # document.
  fig.path = "figures/"
)

library(saaabstracts)
library(readxl)
library(tidyverse)
library(knitr)
library(glue)

Introduction

How many sessions do we have?

We can combine general and organised sessions, for the purpose of scheduling. But we have to get the after hours sessions and put them aside.

library(readxl)

# Both poster sheets are read from the same workbook, so name its path once.
poster_sessions_workbook <-
  "../data/raw_data/02 - poster_sessions_FINAL(oct31).xlsx"

# Here we get all the general session posters
# session number is cuts column
gen_poster_session_posters_individual <- read_excel(poster_sessions_workbook)
gen_poster_session_posters_individual$`Session Id` <-
  as.character(gen_poster_session_posters_individual$cuts)
# 422 posters in the general session

# In an attempt to solve the scheduling problem...
# Let's make each gen session poster a unique session to see if we can allocate them ok into the slots
# gen_poster_session_posters_individual$`Session Id` <-
#   as.character(glue('9999_{1:nrow(gen_poster_session_posters_individual)}_{gen_poster_session_posters_individual$`Session Id`}'))

# The after-hours posters sit on a separate sheet of the same workbook
# (very inconvenient!); the first 11 rows of that sheet are skipped.
ah_poster_session_posters_individual <-
  read_excel(poster_sessions_workbook, sheet = "AFTER_HOURS", skip = 11)
ah_poster_session_posters_individual$`Session Id` <-
  as.character(ah_poster_session_posters_individual$cuts)

# Keep only the columns that the general sheet also has, so rbind() can
# match the two tables by column name.
ah_poster_session_posters_individual <-
  ah_poster_session_posters_individual[
    ,
    names(ah_poster_session_posters_individual) %in%
      names(gen_poster_session_posters_individual)
  ]

gen_poster_session_posters_individual <-
  rbind(gen_poster_session_posters_individual,
        ah_poster_session_posters_individual)

# In here we have org and after hours posters
# but not general posters
all_abstracts <-
  read_excel("../data/raw_data/Organized Session Abstracts_2018_AB.xlsx")

org_and_after_hours_posters_individual <-
  all_abstracts %>%
  filter(`Session Type` == "Poster Symposium") %>%
  filter(Role == "Presenter") %>%
  mutate(`Session Id` = as.character(`Session Id`))
# 196 posters in org + ah sessions

# combine general, organised and after hours individual posters
common_names <-
  names(gen_poster_session_posters_individual)[
    names(gen_poster_session_posters_individual) %in%
      names(org_and_after_hours_posters_individual)
  ]

# Subset by the character vector explicitly. select(common_names) would be
# read as a column called "common_names" if one existed, and passing an
# external vector as a bare name to select() is deprecated.
gen_poster_session_posters_individual <-
  gen_poster_session_posters_individual[, common_names, drop = FALSE]

org_and_after_hours_posters_individual <-
  org_and_after_hours_posters_individual[, common_names, drop = FALSE]

# One table of every individual poster; rows without a title are dropped.
all_posters_individual <-
  bind_rows(gen_poster_session_posters_individual,
            org_and_after_hours_posters_individual) %>%
  filter(!is.na(Title))

There are a small number of people with posters who are also discussants, etc. So we need to be sure we don't schedule posters at the same time as those activities.

# Do people giving posters have other roles?

# Every abstract row that belongs to a person (first + last name) who is
# listed at least twice. The result stays grouped by person.
rows_for_people_listed_twice <-
  all_abstracts %>%
  group_by(`First Name`, `Last Name`) %>%
  add_tally() %>%
  filter(n >= 2)

# ... their rows that are NOT in a poster session
people_with_2_non_poster_roles <-
  rows_for_people_listed_twice %>%
  filter(!grepl("Poster", `Session Type`)) %>%
  mutate(uid = glue('{`First Name`} {`Last Name`}'))

# ... and their rows that ARE in a poster session
people_in_posters_with_2_roles <-
  rows_for_people_listed_twice %>%
  filter(grepl("Poster", `Session Type`)) %>%
  mutate(uid = glue('{`First Name`} {`Last Name`}'))

# any overlap? i.e. poster people who also appear in a non-poster role
poster_people_uids <- people_in_posters_with_2_roles$uid
people_with_posters_and_another_role <-
  poster_people_uids[
    poster_people_uids %in% people_with_2_non_poster_roles$uid
  ]

Here's an experiment to break up the general sessions to see if we can improve the fit into the SAA session limits. We split every general session that has at least n_allowed posters into smaller sub-sessions of at most n_allowed posters each.

# Distribution of session sizes before splitting
gen_poster_session_posters_individual %>%
  group_by(`Session Id`) %>%
  tally() %>%
  ggplot(aes(n)) +
  geom_histogram()

# Largest number of posters we allow in one (sub-)session
n_allowed <- 8

# Ids of the sessions that have n_allowed posters or more
gen_poster_sessions_n_more_than_n <-
  gen_poster_session_posters_individual %>%
  group_by(`Session Id`) %>%
  tally() %>%
  filter(n >= n_allowed) %>%
  pull(`Session Id`)

# Cut each of those sessions into consecutive chunks of at most n_allowed
# posters and rename the chunks "<Session Id>_<chunk number>".
# The chunk number is computed in a grouped mutate() so that it counts
# within each session. Creating it inside group_by() does not do that:
# dplyr documents that expressions given to group_by() are computed on the
# ungrouped data, which makes the numbering depend on the row order of the
# whole table.
gen_posters_in_sessions_n_more_than_n <-
  gen_poster_session_posters_individual %>%
  filter(`Session Id` %in% gen_poster_sessions_n_more_than_n) %>%
  group_by(`Session Id`) %>%
  mutate(idx = as.integer((seq_len(n()) - 1L) %/% n_allowed) + 1L) %>%
  ungroup() %>%
  mutate(`Session Id` = glue('{`Session Id`}_{idx}')) %>%
  select(-idx)

# Distribution of session sizes after splitting
gen_posters_in_sessions_n_more_than_n %>%
  group_by(`Session Id`) %>%
  tally() %>%
  ggplot(aes(n)) +
  geom_histogram()

# now update the all_posters_individual data frame
all_posters_individual <-
  all_posters_individual %>%
  mutate(`Session Id` = as.character(`Session Id`)) %>%
  # first remove the old gen sessions with n_allowed or more posters each
  filter(!`Session Id` %in% gen_poster_sessions_n_more_than_n) %>%
  # then add in those same posters, but with their new session ids
  bind_rows(gen_posters_in_sessions_n_more_than_n)

We have r nrow(all_posters_individual) posters altogether, with r nrow(gen_poster_session_posters_individual) in the general session, and r nrow(org_and_after_hours_posters_individual) in the organised sessions (including after hours posters).

Let's remove the after hours sessions from all the posters, since we know they will be assigned to Th-evening:

# how many ah sessions?
# The first 9 rows of the AFTER_HOURS sheet are used as the list of
# after-hours sessions; the session id is the digits found in the ID column.
after_hours_poster_sessions_only <-
  read_excel("../data/raw_data/02 - poster_sessions_FINAL(oct31).xlsx",
             sheet = "AFTER_HOURS") %>%
  slice(1:9) %>%
  mutate(`Session Id` = as.numeric(gsub("\\D", "", ID)))

# `Session Id` in all_posters_individual is character, so compare as
# character explicitly instead of relying on implicit coercion.
after_hours_session_ids <-
  as.character(after_hours_poster_sessions_only$`Session Id`)

# TRUE for posters whose session is an after-hours session.
# Sessions that were split into sub-sessions upstream are named
# "<Session Id>_<k>", so the "_<k>" suffix is removed before matching;
# otherwise a split after-hours session would never match and its posters
# would stay in the daytime pool.
# NOTE(review): assumes original session ids never end in "_<digits>"
# themselves -- confirm against the raw data.
is_after_hours_session <- function(session_id) {
  sub("_[0-9]+$", "", as.character(session_id)) %in% after_hours_session_ids
}

# how many ah posters?
after_hours_poster_sessions_posters_individual <-
  all_posters_individual %>%
  filter(is_after_hours_session(`Session Id`)) %>%
  mutate(poster_session_time = "Th-evening")

# remove these from all posters
all_posters_individual_no_ah <-
  all_posters_individual %>%
  filter(!is_after_hours_session(`Session Id`))

# put out the after hours sessions and posters in a CSV
write_csv(after_hours_poster_sessions_posters_individual,
          "../data/derived_data/after_hours_poster_sessions_all_posters.csv")

# Make sure that we don't have the Thu-evening posters in the remaining
# posters (this should print an empty vector)
all_posters_individual_no_ah$`Session Id`[all_posters_individual_no_ah$`Session Id` %in% after_hours_poster_sessions_posters_individual$`Session Id`]

We have r length(unique(after_hours_poster_sessions_only$'Session Id')) sessions of after-hours posters, with a total of r nrow(after_hours_poster_sessions_posters_individual) posters. This leaves r nrow(all_posters_individual_no_ah) remaining to allocate in the day-parts.

100 posters are permitted in the after-hours session. We have r nrow(after_hours_poster_sessions_posters_individual) posters in the after hours session. That's r ifelse(nrow(after_hours_poster_sessions_posters_individual) <= 100, "great", "not ideal").

We have from the SAA information about how many posters can go into each session

# poster session durations
library(stringi)
poster_session_durations <-
  read_excel("../data/raw_data/saa all organizedsession_timeslotABv3.xls",
             sheet = "Posters1")

# add day-part to compare with papers
poster_session_durations <-
  poster_session_durations %>%
  mutate(start = as.character(start),
         end = as.character(end)) %>%
  # classify each slot by its start time; any other start time gives NA
  mutate(day_part = case_when(
    start %in% c("1899-12-31 08:00:00", "1899-12-31 10:30:00") ~ "morning",
    start == "1899-12-31 14:00:00" ~ "afternoon",
    start == "1899-12-31 17:00:00" ~ "evening"
  )) %>%
  # keep only the clock time. Character 11 is the space between the date and
  # the time, so trim it; otherwise the values become " 08:00:00" with a
  # leading blank.
  mutate(start = trimws(substr(start, 11, nchar(start))),
         end = trimws(substr(end, 11, nchar(end)))) %>%
  # prefix with the first two letters of the day, e.g. "Th-morning"
  mutate(day_part = glue('{stri_trans_totitle(substr(day, 1, 2))}-{day_part}'))

poster_session_durations

# positive: more posters than spaces; negative: spaces left over
diff_between_need_and_have <-
  nrow(all_posters_individual_no_ah) - sum(poster_session_durations$'N max')

There are spaces for a total of r sum(poster_session_durations$'N max') posters. We have received r nrow(all_posters_individual_no_ah) poster submissions (not counting the after-hours posters). So we have r ifelse(diff_between_need_and_have < 0, glue('{abs(diff_between_need_and_have)} free spaces'), glue('{abs(diff_between_need_and_have)} more posters than available spaces')).

Thu-evening is the 'after hours' session, so that's already got the after-hours poster sessions in there. We'll just omit it from the following available schedules, and we don't need to worry about clashes because it's after all the papers.

And we'll combine day-parts that are in the same paper day-part, which will make it simpler to avoid clashes.

# exclude Thurs evening from now on
daytime_poster_slots <-
  poster_session_durations %>%
  filter(day_part != "Th-evening")

# First version: pool the capacity of slots that share a day-part.
# (This table is replaced by the per-slot version built right after it.)
poster_session_durations_no_th_ev <-
  daytime_poster_slots %>%
  group_by(day_part) %>%
  summarise(n_max = sum(`N max`))

# see if we can do the poster day-parts individually:
# one row per slot, labelled "poster-<day_part>_<k>", where k numbers the
# slots that fall within the same day-part
poster_session_durations_no_th_ev <-
  daytime_poster_slots %>%
  mutate(poster_day_part = glue('poster-{day_part}')) %>%
  group_by(poster_day_part) %>%
  mutate(label = seq_len(n())) %>%
  ungroup() %>%
  mutate(poster_day_part = glue('{poster_day_part}_{label}'),
         n_max = `N max`) %>%
  select(-label)

We want to avoid people giving a poster and a talk at the same time, so we need to check with the paper schedule:

# Read the paper schedule from the derived-data folder.
# The poster allocation loop later in this document uses its
# `day_part_from_while.y` and `uid.y` columns to find people who already
# have a paper in a given day-part.
# NOTE(review): presumably written by the paper-scheduling step -- confirm.
sessions_in_day_parts_data_with_all_names_df <- 
  read_csv("../data/derived_data/sessions_in_day_parts_data_with_all_names_df.csv")

Now we need to put the poster sessions into the remaining slots while checking to see if one of the poster authors is already scheduled to give a paper.

# Greedy allocation of poster sessions to the daytime poster slots.
# For each slot (largest first) we walk through the sessions that are still
# unplaced and add a session when none of its people (uid) is already in the
# slot and none of them has a paper in the same day-part. We stop adding to a
# slot once it holds more than (n_max - buffer) posters.

# to avoid overfilling day-parts
# A session is added whole after the capacity check, so the last session
# added can push a slot past the target; the buffer leaves room for that.
buffer <- 7 # is the largest to avoid overfill

# day-parts to fill
poster_session_durations_no_th_ev <-
  poster_session_durations_no_th_ev %>%
  arrange(desc(n_max))

# make a unique value for names for all posters
all_posters_individual_no_ah <-
  all_posters_individual_no_ah %>%
  mutate(uid = glue('{`First Name`}_{`Last Name`}'))

# get table of session id's only
poster_session_ids <-
  unique(all_posters_individual_no_ah$`Session Id`)

# posters still waiting for a slot; this shrinks as sessions are placed
all_posters_individual_no_ah_tmp <- all_posters_individual_no_ah

# per-slot bookkeeping: posters and sessions placed, and running totals
poster_session_durations_no_th_ev$n_posters <- 0
poster_session_durations_no_th_ev$n_sessions <- 0
poster_session_durations_no_th_ev$c_sessions <- 0
poster_session_durations_no_th_ev$c_posters <- 0

# All placed posters with the slot they went to. Start from an empty table
# with the right columns so that the result is a valid table even when no
# slot is processed.
all_of_the_day_parts <-
  all_posters_individual_no_ah[0, ] %>%
  mutate(day_part_from_while = character(0))

# loop over each day-slot available for poster sessions
# i is each day-part
for (i in seq_len(nrow(poster_session_durations_no_th_ev))) {

  # stop loop gracefully when we have assigned all people to day-parts
  if (nrow(all_posters_individual_no_ah_tmp) == 0) {
    break
  }

  # need a buffer to avoid overfilling day-parts
  target_capacity <- poster_session_durations_no_th_ev$n_max[i] - buffer

  # People who already have a paper in this day-part. This does not depend
  # on the poster session, so look it up once per slot.
  this_day_part <- as.character(poster_session_durations_no_th_ev$day_part[i])
  names_papers_in_this_day_part <-
    sessions_in_day_parts_data_with_all_names_df %>%
    filter(day_part_from_while.y == this_day_part) %>%
    pull(uid.y)

  # Posters placed in this slot so far. Starting from an empty table (not a
  # bare 0) means the first session goes through the same clash checks as
  # every other session, and the code after the inner loop still works when
  # nothing could be placed.
  container_for_this_day_part <- all_posters_individual_no_ah_tmp[0, ]
  cumulative_number_of_posters <- 0

  # Only sessions that are not yet placed are candidates. They come out in
  # the same relative order as in poster_session_ids.
  candidate_session_ids <-
    unique(all_posters_individual_no_ah_tmp$`Session Id`)

  for (this_session in candidate_session_ids) {

    # this slot has reached its target, move on to the next slot
    if (cumulative_number_of_posters > target_capacity) {
      break
    }

    # all the posters (and so all the people) in this session
    this_session_posters <-
      all_posters_individual_no_ah_tmp %>%
      filter(`Session Id` == this_session)

    # is anyone in the incoming session already in this slot ...
    clashes_with_posters <-
      any(this_session_posters$uid %in% container_for_this_day_part$uid)
    # ... or already giving a paper in this day-part?
    clashes_with_papers <-
      any(this_session_posters$uid %in% names_papers_in_this_day_part)

    # if there are no name clashes, add the session to this slot;
    # otherwise leave it for a later slot
    if (!clashes_with_posters && !clashes_with_papers) {
      container_for_this_day_part <-
        bind_rows(this_session_posters,
                  container_for_this_day_part)
      cumulative_number_of_posters <- nrow(container_for_this_day_part)
    }
  }

  # add an identifier so we can see where we have assigned each session
  container_for_this_day_part_with_session_from_while <-
    container_for_this_day_part %>%
    mutate(day_part_from_while =
             as.character(poster_session_durations_no_th_ev$poster_day_part[i]))

  # bind rows to previous day-parts we might have done
  all_of_the_day_parts <-
    bind_rows(container_for_this_day_part_with_session_from_while,
              all_of_the_day_parts)

  # store the results for this slot, and the running totals up to this slot
  poster_session_durations_no_th_ev$n_posters[i] <-
    nrow(container_for_this_day_part)
  poster_session_durations_no_th_ev$n_sessions[i] <-
    length(unique(container_for_this_day_part$`Session Id`))
  poster_session_durations_no_th_ev$c_sessions[i] <-
    sum(poster_session_durations_no_th_ev$n_sessions[seq_len(i)])
  poster_session_durations_no_th_ev$c_posters[i] <-
    sum(poster_session_durations_no_th_ev$n_posters[seq_len(i)])

  print(poster_session_durations_no_th_ev %>% select(n_max,
                                                     poster_day_part,
                                                     n_posters,
                                                     n_sessions,
                                                     c_sessions,
                                                     c_posters))

  # for the next day-part, we need to make sure
  # we don't add the same session twice to different day-parts
  # so let's update our big table of posters, and drop the sessions that
  # we've already got in the day-part
  all_posters_individual_no_ah_tmp <-
    all_posters_individual_no_ah_tmp %>%
    filter(!`Session Id` %in% unique(container_for_this_day_part$`Session Id`))

}
# end of for loop

# output the results
write.csv(all_of_the_day_parts,
          "../data/derived_data/posters_output_of_session_classification.csv")

write.csv(poster_session_durations_no_th_ev, "../data/derived_data/poster_session_durations_no_th_ev.csv")

The SAA has given us daytime space for r sum(poster_session_durations_no_th_ev$n_max) posters, and the allocation above placed r sum(poster_session_durations_no_th_ev$n_posters) posters into those slots.

It's tricky to allocate the posters so they don't exceed the number allowed in each day part.

We have no clash with paper presenters.

What do we still need to do?

Colophon

This report was generated on r Sys.time() using the following computational environment and dependencies:

# which R packages and versions?
# Print the session information via devtools so the rendered report records
# the computational environment it was built with.
devtools::session_info()

The current Git commit details are:

# what commit is this file at?
# Open the Git repository located two directories above this document with
# git2r; the returned repository object is printed in the report.
git2r::repository("../..")

benmarwick/confschedlr documentation built on May 20, 2019, 4:26 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

benmarwick/confschedlr
Organise submitted papers into time slots to schedule a conference

In benmarwick/confschedlr: Organise submitted papers into time slots to schedule a conference

Introduction

How many sessions do we have?

Colophon

R Package Documentation

Browse R Packages

We want your feedback!

benmarwick/confschedlr Organise submitted papers into time slots to schedule a conference

In benmarwick/confschedlr: Organise submitted papers into time slots to schedule a conference

Introduction

How many sessions do we have?

Colophon

R Package Documentation

Browse R Packages

We want your feedback!

benmarwick/confschedlr
Organise submitted papers into time slots to schedule a conference