#' Extract the faculty listing lines from a Williams bulletin PDF.
#'
#' Reads the bulletin PDF bundled with the package for the given year,
#' locates the faculty section between known marker lines, narrows it to the
#' span from the first professor line through the last line containing school
#' information, and then drops page-number lines, blank lines, and
#' interspersed "Faculty" header lines.
#'
#' @param year The starting year of the academic year of interest. The file
#'   naming scheme used here is stated to cover 2000 through 2013.
#' @return The remaining text lines of the faculty section, from the first
#'   professor listed through the last, in the form returned by `content()`.
#' @export
find_data_by_year <- function(year) {
  # NOTE(review): this changes the locale for the whole R session and is not
  # restored on exit; left as-is in case later processing relies on it.
  Sys.setlocale(category = "LC_ALL", locale = "C")

  # General filename for years 2000:2013, e.g. bulletin2005_6_small.pdf.
  filename <- paste0("bulletin", year, "_", (year %% 100) + 1, "_small.pdf")
  pdf_path <- system.file("extdata", filename, package = "WilliamsStaff")
  if (!nzchar(pdf_path)) {
    # system.file() returns "" when the file is not shipped with the package.
    stop("Bulletin file not found in package data: ", filename, call. = FALSE)
  }

  read_pdf <- tm::readPDF(control = list(text = "-layout -nopgbrk"))
  pdf <- read_pdf(elem = list(uri = pdf_path), language = "en", id = "id1")
  wide_data <- content(pdf) # convert data into a usable form in R

  # Collect a fuzzy range on professor data within the wide data set.
  start_hits <- grep("On leave second semester", wide_data)
  end_marker <- if (year %in% 2002:2013) "LIBRARIES" else "ATHLETIC COACHES"
  end_hits <- grep(end_marker, wide_data)
  if (length(start_hits) == 0L || length(end_hits) == 0L) {
    stop(
      "Could not locate the faculty section markers in ", filename,
      call. = FALSE
    )
  }
  # Only the first occurrence of each marker is used (this is what `:` did
  # implicitly, with a warning, when grep returned several hits).
  wide_start <- start_hits[1L]
  wide_end <- end_hits[1L]

  # Narrow down the boundaries. -1 marks "not found yet".
  start <- -1
  end <- -1

  # Scan forward: the first relevant line is a professor line.
  # Not very efficient, but presumably only has to hit the first few lines.
  for (index in wide_start:wide_end) {
    if (is_this_a_professor(wide_data[index])) {
      start <- index
      break
    }
  }

  # Scan backward: the last relevant line should be schooling information
  # for the last professor (which may or may not share a line with the name).
  for (index in wide_end:wide_start) {
    if (contains_school_info(wide_data[index])) {
      end <- index
      break
    }
  }

  if (start < 0 || end < 0) {
    # Without this check the -1 sentinels would be used as (negative) indices
    # and either drop a single line silently or fail with an obscure error.
    stop(
      "Could not find the first professor or the last school-info line in ",
      filename,
      call. = FALSE
    )
  }
  wide_data <- wide_data[start:end]

  # Clean up data -- remove non-professor lines.
  # Drop lines containing a standalone three-digit number (page numbers).
  # A logical mask is used because x[-grep(...)] returns an empty vector
  # when nothing matches.
  wide_data <- wide_data[!grepl("\\b[0-9]{3}\\b", wide_data)]
  # Remove blanks.
  wide_data <- wide_data[which(nchar(wide_data) > 0)]
  # Remove interspersed "Faculty" lines.
  wide_data <- wide_data[wide_data != "Faculty"]

  wide_data
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.