# WilliamsStaff: Williams Staff

# Functions for fetching and formatting faculty-related information
#
#
#' Returns a list of faculty from the college with one professor per entry
#'
#' The raw lines returned by \code{find_data_by_year()} are glued together
#' (separated by ", ") until \code{is_suitable_faculty_candidate()} accepts the
#' accumulated text. That entry is closed when the next raw line is recognised
#' by \code{is_this_a_professor()}, or when the input runs out. Every closed
#' entry is finally passed through \code{general_format()}.
#'
#' @param year Desired year of interest
#' @param append_year If true, appends '|||####' where #### is the year to each
#'   faculty member item before it is formatted.  Otherwise, it does not
#' @return The result of calling \code{general_format()} on every collected
#'   professor via \code{sapply()} (named by the unformatted entries)
#' @export
collect_faculty <- function(year, append_year=FALSE){
  raw_data <- find_data_by_year(year)
  fragment_bucket <- c() # used for picking up partial lines
  faculty <- c()         # initially empty faculty list

  # Applies the optional '|||year' suffix to an entry that is about to be stored.
  # NOTE(review): the suffix is added *before* general_format() runs, so for the
  # 2000-2012 format the 4-digit year inside the suffix is also wrapped in
  # commas. Kept as-is for compatibility; confirm whether callers rely on it.
  close_entry <- function(entry){
    if(append_year){
      entry <- paste0(entry, "|||", year)
    }
    entry
  }

  for(current in raw_data){
    # Only start a new entry when the bucket already holds a complete candidate
    # AND the incoming line begins another professor; otherwise keep gluing.
    if(is_suitable_faculty_candidate(fragment_bucket) && is_this_a_professor(current)){
      faculty <- c(faculty, close_entry(fragment_bucket))
      fragment_bucket <- current
    } else{
      fragment_bucket <- paste(fragment_bucket, current, sep=", ")
    }
  }

  # The loop only stores an entry when a *following* professor is seen, so the
  # final professor would otherwise be silently dropped. Flush it here.
  if(length(fragment_bucket) > 0 && is_suitable_faculty_candidate(fragment_bucket)){
    faculty <- c(faculty, close_entry(fragment_bucket))
  }

  # sapply (not vapply) is kept deliberately: general_format() yields NULL for
  # years it does not support, and callers may depend on that shape.
  return(sapply(faculty, function(x) general_format(x, year)))
}

#' Converts the formatting of the different PDFs to a common form
#'
#' That form is Name, Title, degree, year, institution, degree, year,
#' institution, etc.
#'
#' @param person A line of content to describe a person
#' @param year The year the person information was found. 2013 and 2000-2012
#'   use different source layouts; any other year is not handled.
#' @return general_form A single comma separated string in which every degree
#'   is followed by its year, or NULL when \code{year} is outside 2000-2013
#' @export
general_format <- function(person, year){
  #2013: William G. Wagner, Brown Professor of History, 1974, BPhil, Oxford University, 1981, PHD, Oxford University
  #2012: "    Magnus T. Bernhardsson,Professor of History--B.A. (1990) University of Iceland, Ph.D. (1999) Yale"
  # first step is to clean off any non-letters from the beginning of the string (*)
  person <- gsub("^[^[:alpha:]]+", "", person)
  separator <- ", "

  if(year == 2013){
    clean <- stringr::str_split(person, ",")[[1]]
    clean <- stringr::str_trim(iconv(clean, "latin1", "ASCII", sub=""))

    # In 2013 each year is immediately followed by its degree; swap the two so
    # the degree comes first. A year sitting in the very last field has no
    # degree after it and is left alone (previously it wrapped around and was
    # swapped with the first field, i.e. the person's name).
    year_idx   <- grep("^[0-9]{4}$", clean)
    year_idx   <- year_idx[year_idx < length(clean)]
    degree_idx <- year_idx + 1L

    temp <- clean
    temp[year_idx]   <- clean[degree_idx]
    temp[degree_idx] <- clean[year_idx]
    return(stringr::str_c(temp, collapse = separator))
  } else if(year %in% 2000:2012){
    person <- gsub("\\(([0-9]{4})\\)", " \\1 ", person) #removes the '(' and the ')' around a year
    person <- gsub("\\b([0-9]{4})\\b", ", \\1,", person) #adds commas around years
    #turn the dash/semicolon into a regular separator
    person <- gsub("--", separator, person)
    person <- gsub(";", separator, person)

    #remove any trailing/leading spaces from each field, then re-join the
    #trimmed fields (the untrimmed input string used to be returned instead)
    clean <- stringr::str_split(person, separator)[[1]]
    clean <- stringr::str_trim(clean)
    return(stringr::str_c(clean, collapse = separator))
  }

  # Years outside 2000-2013 have no known layout; no formatting is attempted.
  invisible(NULL)
}