R/find_data_by_year.R

#' Extract the faculty listing lines from a bundled course bulletin PDF.
#'
#' Reads the bulletin PDF for `year` from this package's `extdata` directory,
#' narrows the text to the span that runs from the first line recognised as a
#' professor through the last line containing schooling information, and then
#' drops page-number lines, empty lines and standalone "Faculty" header lines.
#'
#' Side effect: the session locale is set to "C" (all categories) and is NOT
#' restored when the function returns.
#'
#' @param year The year in interest. It is used to build the file name
#'   `bulletin<year>_<(year %% 100) + 1>_small.pdf`.
#' @return The text lines of the faculty section of the bulletin (a subset of
#'   what `content()` returns for the parsed PDF), with page-number, empty and
#'   "Faculty" lines removed.
#' @export
find_data_by_year <- function(year) {
  # Deliberately left unrestored: the original code sets the "C" locale for the
  # rest of the session, and later text handling may depend on that.
  Sys.setlocale(category = "LC_ALL", locale = "C")

  # general filename for years 2000:2013
  filename <- paste0("bulletin", year, "_", (year %% 100) + 1, "_small.pdf")
  pdf_path <- system.file("extdata", filename, package = "WilliamsStaff")
  if (!nzchar(pdf_path)) {
    # system.file() returns "" when the file is not found in the package
    stop("Bulletin file not found in package 'WilliamsStaff': ", filename,
         call. = FALSE)
  }

  pdf_reader <- tm::readPDF(control = list(text = "-layout -nopgbrk"))
  pdf <- pdf_reader(elem     = list(uri = pdf_path),
                    language = "en",
                    id       = "id1")
  wide_data <- content(pdf) # convert data into a usable form in R

  # collect a fuzzy range on professor data within the wide data set
  end_marker <- if (year %in% 2002:2013) "LIBRARIES" else "ATHLETIC COACHES"
  wide_start <- grep("On leave second semester", wide_data)
  wide_end <- grep(end_marker, wide_data)
  if (length(wide_start) == 0L || length(wide_end) == 0L) {
    stop("Could not locate the faculty section markers in ", filename,
         call. = FALSE)
  }
  # `:` only ever used the first match (with a warning); make that explicit.
  wide_start <- wide_start[1]
  wide_end <- wide_end[1]

  #######
  # Narrow down the boundaries
  #######
  start <- -1
  end <- -1

  # Scan forward: the first relevant information will be a professor.
  # Not vectorised, but presumably only has to look at a few lines.
  for (index in wide_start:wide_end) {
    if (is_this_a_professor(wide_data[index])) {
      start <- index
      break
    }
  }
  # Scan backward: the last relevant line should be the schooling information
  # of the last professor (which may or may not share a line with the name).
  for (index in wide_end:wide_start) {
    if (contains_school_info(wide_data[index])) {
      end <- index
      break
    }
  }
  if (start < 0 || end < 0) {
    # Without this guard, start:end would contain -1 and mis-subset the data.
    stop("Could not determine the bounds of the faculty listing in ", filename,
         call. = FALSE)
  }
  wide_data <- wide_data[start:end]

  ########
  # clean up data -- remove non professor lines
  ########
  # Remove page-number lines (any line containing a standalone three digit
  # number). A logical mask is used because x[-grep(...)] would drop EVERY
  # line when nothing matches.
  wide_data <- wide_data[!grepl("\\b[0-9]{3}\\b", wide_data)]
  # remove blanks
  wide_data <- wide_data[which(nchar(wide_data) > 0)]
  # remove interspersed "Faculty" lines
  wide_data <- wide_data[wide_data != "Faculty"]

  wide_data
}
PhilBrockman/WilliamsStaff documentation built on May 8, 2019, 1:33 a.m.