# R/utility_functions.R

# Degree abbreviations recognized by the helpers in this file, grouped by
# level.  Entries are upper-case with no punctuation.
phd_degrees <- c("PHD", "DMA")
ma_degrees <- c("MA", "MFA", "MED", "MS", "MM", "MMA", "PHM")
ba_degrees <- c("BA", "BS", "AB", "BM", "BE", "BFA", "BPHIL")

#' Standard error of a set of numbers
#'
#' Divides the sample standard deviation of `l` by the square root of the
#' number of elements in `l`.
#' @param l a numeric vector
#' @return se the standard error
#' @export
std_error <- function(l) {
  spread <- sd(l)
  n <- length(l)
  spread / sqrt(n)
}

#' Accepted abbreviations for an undergraduate degree
#' @return ba_degrees the file-level `ba_degrees` character vector
get_ba_degrees <- function() {
  ba_degrees
}

#' Finds the year that professor was registered as faculty.  Needs append_year=TRUE
#'
#' Strips every non-digit character from `person` and reads the last four
#' remaining digits as the year.
#' @param person a faculty member
#' @return year the year that person taught, as an integer
get_prof_year <- function(person) {
  digits_only <- gsub("[^0-9]", "", person)
  n_digits <- nchar(digits_only)
  # the year is appended at the end, so it is the final four digits
  year_chars <- substr(digits_only, n_digits - 3, n_digits)
  as.integer(year_chars)
}

#' Determines whether a given string contains information about schooling
#'
#' The string qualifies when, after splitting on spaces and hyphens, at least
#' one token contains four consecutive digits and at least one token matches a
#' known degree abbreviation once sanitized.
#' @param str String in question
#' @return bool Whether or not the string contains a type of degree and a year
contains_school_info <- function(str) {
  # drop characters that are neither visible ([:graph:]) nor whitespace
  cleaned <- gsub("[^[:graph:]^[:space:]]", "", str)
  # treat hyphens as token separators, like spaces
  cleaned <- gsub("[-]", " ", cleaned)
  # only the first element of `str` is inspected
  tokens <- stringr::str_split(cleaned, " ")[[1]]
  known_degrees <- sanitize(c(phd_degrees, ma_degrees, ba_degrees))
  has_year <- any(grepl("[0-9]{4}", tokens))
  has_degree <- length(intersect(known_degrees, sanitize(tokens))) > 0
  has_year & has_degree
}

#' Helper function to determine if a string is a page number.
#'
#' Assumes that page numbers are three digits, as all page numbers are three
#' digits long in this section of the data.  A match is a run of exactly three
#' digits delimited by word boundaries anywhere in the string.
#' @param str String in question
#' @return bool Page number found?
is_page_number <- function(str) {
  three_digit_run <- "\\b[0-9]{3}\\b"
  grepl(three_digit_run, str)
}

#' Helper function to determine if a string is a professor based on the presence of professor keywords
#'
#' Each keyword is searched for as a whole word (delimited by word
#' boundaries).  The result is a single logical even when `person` has several
#' elements; zero-length input yields `FALSE` rather than an error.
#' @param person String in question
#' @return bool `TRUE` if at least one "professor word" occurs in `person`
is_this_a_professor <- function(person) {
  professor_keywords <- c("Professor", "rofessor", "Professorship", "Fellow", "Lecturer", "Librarian", "Theatre Production Manager", "Director", "Instructor", "Artist-in-Residence", "Artist in Residence")
  # vapply() fixes the result type: sapply() returned a list for zero-length
  # input, which made the subsequent sum() fail.
  keyword_found <- vapply(
    professor_keywords,
    function(keyword) any(grepl(sprintf("\\b%s\\b", keyword), person)),
    logical(1)
  )
  any(keyword_found)
}

#' Makes input suitable for searching
#'
#' Upper-cases the input and then strips every non-letter character.
#' @param str String to mold
#' @return str Upper-case string with no non-letter characters
sanitize <- function(str) {
  gsub("[^[:alpha:]]", "", toupper(str))
}

#' Determines whether a string contains sufficient information to be considered a full faculty member
#'
#' That is: does the string contain a title (per `is_this_a_professor`) and
#' schooling information (per `contains_school_info`)?  `NULL` is never
#' suitable.
#' @param person a string
#' @return bool Whether person meets the requirements
is_suitable_faculty_candidate <- function(person) {
  if (!is.null(person)) {
    has_title <- sum(is_this_a_professor(person)) > 0
    has_schooling <- sum(contains_school_info(person)) > 0
    return(has_title & has_schooling)
  }
  FALSE
}

#' applies stringr::str_trim to each element of an array
#' @param a Array (containing strings)
#' @return trim_a a list holding the trimmed version of each element of `a`
str_trim_arr <- function(a) {
  trim_one <- function(x) stringr::str_trim(x)
  lapply(a, trim_one)
}

#' splits a string on commas and trims each piece
#'
#' Only the first element of `str` is split.
#' @param str string to manipulate
#' @return arr the trimmed pieces, as returned by `str_trim_arr`
split_trim <- function(str) {
  pieces <- stringr::str_split(str, ",")[[1]]
  str_trim_arr(pieces)
}

#' returns a RegEx to match any of the undergraduate degrees
#'
#' Each entry of `ba_degrees` is anchored with `^` and `$`, and the anchored
#' alternatives are joined with `|`.
#' @return pattern
ba_pattern <- function() {
  anchored <- paste0("^", ba_degrees, "$")
  paste(anchored, collapse = "|")
}

#' returns a RegEx to match any master degrees
#'
#' Each entry of `ma_degrees` is anchored with `^` and `$`, and the anchored
#' alternatives are joined with `|`.
#' @return pattern
ma_pattern <- function() {
  anchored <- paste0("^", ma_degrees, "$")
  paste(anchored, collapse = "|")
}

#' returns a RegEx to match any doctorates
#'
#' Each entry of `phd_degrees` is anchored with `^` and `$`, and the anchored
#' alternatives are joined with `|`.
#' @return pattern
phd_pattern <- function() {
  anchored <- paste0("^", phd_degrees, "$")
  paste(anchored, collapse = "|")
}
# PhilBrockman/WilliamsStaff documentation built on May 8, 2019, 1:33 a.m.