inst/slidedecks/shiny/archive/webscraping.R

library(magrittr)
library(rvest)
library(data.table)
library(stringr)
library(plyr)

# All (accepted) submitted sessions
allsubmissions <- html("https://sqlbits.com/information/PublicSessions.aspx") %>% 
  html_node(".session") %>% 
  html_table() %>% 
    extract2(1) %>% 
  data.table(raw = .)

dataclean <- function(x) {
    # Changes datatype, trims excess whitespace, and puts any carriage returns as spaces in the abstracts
    str_replace_all(str_trim(unlist(x)), "\r\n", " ")
}

allsubmissions[
  , `:=`(split = str_split(raw, pattern = "\r\n     "))][
  , `:=`(SessionLevel = dataclean(pluck(split, 1))
         , SessionTrack = dataclean(pluck(split,4))
         , SessionTitle = dataclean(pluck(split, 8))
         , SessionSpeaker = dataclean(pluck(split, 17))
         , SessionAbstract = dataclean(pluck(split, 20))
         )]

desiredcols <- c("SessionLevel", "SessionTrack", "SessionTitle", "SessionSpeaker", "SessionAbstract")

allsubmissions <- allsubmissions[, desiredcols, with = FALSE]

# All selected sessions
getsessions <- function(url) {
    html(url) %>% 
      html_nodes(".AgendaSession") %>% 
      html_table(fill = TRUE) %>% 
      rbind.fill %>% 
      data.table %>%
      .[,day:=as.Date(str_sub(url,-8,-1),format="%Y%m%d")]
}

days <- 20150304:20150307
urls <- paste0("https://sqlbits.com/information/event14/Agenda.aspx?", days)

selectedsessions <- lapply(urls, getsessions) %>% 
  rbind.fill %>% 
  data.table

nonsessiontitles <- "Registration|Keynote|DitBits|Lunch|Break|Prize-Giving|End of Day"

# http://txt2re.com/
timeregex <- "((?:(?:[0-1][0-9])|(?:[2][0-3])|(?:[0-9])):(?:[0-5][0-9])(?::[0-5][0-9])?(?:\\s?(?:am|AM|pm|PM))?)"
timerow <- paste0(timeregex, "(.)", timeregex)

selectedsessions <- selectedsessions[(str_detect(X1, nonsessiontitles) | str_detect(X1, timerow)) == FALSE][
  , `:=`(ID = .I
         , Pair = (.I -  1)%/%2)]

setkey(selectedsessions, Pair)

selectedsessions.titles <- selectedsessions[ID%%2 == 1]
selectedsessions.speakers <- selectedsessions[ID%%2 == 0]

selectedsessions <- selectedsessions.titles[selectedsessions.speakers,
                                            .(SessionTitle = X1,
                                              SessionSpeaker = i.X1, 
                                              SessionLevel = i.X2,
                                              day)]

# Joining the data
setkey(allsubmissions, SessionTitle, SessionSpeaker)
setkey(selectedsessions, SessionTitle, SessionSpeaker)
# removes multiple instances for pre-cons etc
selectedsessions <- unique(selectedsessions)

finaldataset <- selectedsessions[allsubmissions]

sessionswithoutsubmissions <- allsubmissions[selectedsessions][is.na(SessionTrack)] 

write.csv(finaldataset, "finaldataset.csv")
write.csv(sessionswithoutsubmissions, "sessionswithoutsubmissions.csv")
save(finaldataset,sessionswithoutsubmissions,file = "webscrapingresults.RData")
stephlocke/Rtraining documentation built on May 30, 2019, 3:36 p.m.