data-raw/professors/02_munge-affs.R

# Merge salaries and affiliations based on a key constructed from the
# last_name first_name middle_initial; use fuzzy join to account for missing middle initials

library(dplyr) # for %>%
library(readr)


clean_punc <- function(x) {
  stringr::str_replace_all(x, "\\'", "") %>%
    stringr::str_replace_all("\\.", " ") %>%
    stringr::str_replace_all("[^[:alnum:]]", " ") %>%
    stringr::str_trim(.) %>%
    stringr::str_squish(.)
}


test <- "j.  -gordon "
clean_punc(test)


# Load the data ------------------------------------------------------------

data("affiliation")

# keep only colelges, simplify depts----------------------------------------

patterns <- c("-agls|-las|-hsci-a|-e")

aff0 <-
  affiliation %>%
  mutate(
    dept_short_name = stringr::str_remove_all(dept_short_name, patterns)
    ) %>%
  filter(grepl("college", org_short_name))


# 1-3 clean up names ------------------------------------

aff1 <-
  aff0 %>%
  dplyr::mutate(year = as.integer(year)) %>%
  dplyr::mutate(
    last_name = clean_punc(last_name),
    mid_name = clean_punc(mid_name),
    first_name = clean_punc(first_name))

#--fix middle initials (make aff match sals) when I know they are problems
aff2 <-
  aff1 %>%
  mutate(
    first_name = case_when(
      grepl("sternberg", last_name) ~ "henrik",
      grepl("dalessandro", last_name) ~ "domenic",
      TRUE ~ first_name),
    mid_name = case_when(
      grepl("arritt", last_name) & grepl("raymond", first_name) ~ "w",
      grepl("booth", last_name) & grepl("larry", first_name) ~ "c",
      grepl("cerfogli", last_name) & grepl("jennifer", first_name) ~ "ann",
      grepl("constant", last_name) & grepl("kristen", first_name) ~ "p",
      grepl("crum", last_name) & grepl("michael", first_name) ~ "robert",
      grepl("ellingson", last_name) & grepl("laura", first_name) ~ "d",
      grepl("jeong", last_name) & grepl("hyungseok", first_name) ~ "davi",
      grepl("lewis", last_name) & grepl("calvin", first_name) ~ "f",
      grepl("oconnor", last_name) & grepl("annette", first_name) ~ "m",
      grepl("oneal", last_name) & grepl("matthew", first_name) ~ "ellio",
      grepl("pineyro", last_name) & grepl("pablo", first_name) ~ "enriqu",
      grepl("quam", last_name) & grepl("andrea", first_name) ~ "l",
      grepl("sternberg", last_name) ~ "ste",
      grepl("stevens", last_name) & grepl("julie", first_name) ~ "l",
      grepl("stevenson", last_name) & grepl("gregory", first_name) ~ "w",
      grepl("taylor", last_name) & grepl("gary", first_name) ~ "d",
      grepl("anderson", last_name) & grepl("e", first_name)
      & dept1 == 4140 ~ "walter",
      grepl("windus", last_name) & grepl("theresa", first_name) ~ "l",
      TRUE ~ mid_name)
  )

# create what might look like the salary names
aff3 <-
  aff2 %>%
  dplyr::mutate(
    name_lf = paste(last_name, first_name, sep = " "),
    name_lfm = ifelse(is.na(mid_name), name_lf, paste(name_lf, mid_name, sep = " ")),
    name_lfm = clean_punc(name_lfm),
    name_lfm20 = stringr::str_sub(name_lfm, start = 1, end = 20),
    ) %>%
  select(-name_lf, -name_lfm)

aff3 %>%
  filter(grepl("zhang hongwei", name_lfm20))

# 4-X fix people in 2019 ------------------------------------------------------

# some people have affs 2012-2018 but not 2019.
# aff_na19 <- read_csv("data-raw/professors/aff19-nas.csv") %>%
#   pull(sal_lfm20)
#
# #--these people are nas in 2019, but exist in 2018
# aff_here18 <-
#   aff4 %>%
#   filter(year == 2018) %>%
#   filter(name_lfm20 %in% aff_na19) %>%
#   pull(name_lfm20)
#
# aff_here19 <-
#   aff4 %>%
#   filter(year == 2019) %>%
#   filter(name_lfm20 %in% aff_na19) %>%
#   pull(name_lfm20)

#--check, but these could be copy-pasted from 2018?
#--yes. 11/15
#--write it so if the nas change, I still have this happening for hard coded peeps
#copythese <- setdiff(aff_here18, aff_here19)
#tibble(here18 = copythese) %>% write_csv("data-raw/professors/aff19-nas-here18.csv")
# aff4 %>%
#   filter(year == 2018|2019) %>%
#   filter(name_lfm20 %in% copythese[16])

copythese <- read_csv("data-raw/professors/aff19-nas-here18.csv") %>% pull(here18)

aff_replace19 <-
  aff3 %>%
  filter(year == 2018) %>%
  filter(name_lfm20 %in% copythese) %>%
  mutate(year = 2019)

#--laura has aff data 2012-2015, but salary data from 2012-2019.
#--kristen has aff data up to 2017
aff_smar <-
  aff3 %>%
  filter(year == 2015) %>%
  filter(grepl("smarandescu laura", name_lfm20)) %>%
  select(-year) %>%
  tidyr::crossing(year = c(2016, 2017, 2018, 2019))

aff_constant <-
  aff3 %>%
  filter(year == 2015) %>%
  filter(grepl("constant kristen p", name_lfm20)) %>%
  select(-year) %>%
  tidyr::crossing(year = c(2018, 2019))

aff_crum <-
  aff3 %>%
  filter(year == 2014) %>%
  filter(grepl("crum michael robert", name_lfm20)) %>%
  select(-year) %>%
  tidyr::crossing(year = c(2015, 2016, 2017, 2018, 2019))


aff4 <-
  aff3 %>%
  bind_rows(aff_replace19) %>%
  bind_rows(aff_smar) %>%
  bind_rows(aff_constant) %>%
  bind_rows(aff_crum)


#--fix other things?
# aff_na19 <- read_csv("data-raw/professors/aff19-nas.csv") %>%
#   pull(sal_lfm20)

#
# setdiff(aff_na19, copythese)
# qs <- tibble(qs = setdiff(aff_na19, copythese)) %>%
#   tidyr::separate(qs, into = c("last", "first", "mid"))
# qs
#
data("affiliation")
affiliation %>%
  filter(grepl("bender", last_name)) %>%
  filter(grepl("holly", first_name))
#
#sals <- read_csv("data-raw/professors/sals-profs.csv")

# sals %>%
#   filter(grepl("oneal", name_lfm20)) %>%
#   select(year, name_lfm20)

#--arcand
#--not sure why strenberg isn't working

aff5 <- aff4


# 6 center reassignments ------------------------------------------------

#--only one prof in cds, google says this guy is in psychology
professors %>%
  filter(grepl("lbrl art/sc cds", dept)) %>%
  select(year, college, dept, gender, name, title, dept_chair, base_salary)


aff6  <-
  aff5 %>%
  mutate(
    dept_short_name = case_when(
      grepl("goggi alcira susana", name_lfm20) ~ "agronomy",
      grepl("kirschenmann frederi", name_lfm20) ~ "phil/relig st",
      grepl("johnson lawrence", name_lfm20) ~ "food sc/hn",
      grepl("niederhauser dale s", name_lfm20) ~ "school of ed",
      grepl("thompson elizabeth a", name_lfm20) ~ "school of ed",
      grepl("kedrowski karen m", name_lfm20) ~ "political sc",
      grepl("venkatagiri horabail", name_lfm20) ~ "psychology",
      TRUE ~ dept_short_name)
    ) %>%
  mutate(
    org_short_name = case_when(
    grepl("kirschenmann frederi", name_lfm20) ~ "college of liberal arts & sciences",
    grepl("johnson lawrence", name_lfm20) ~ "college of human sciences",
    TRUE ~ org_short_name)
  )



# 7 manual clean up ---------------------------------------------------------
# elim david peterson in DEPT1 12170, prob a typo as it has no dept assigned
# li wang in eeob is a post-doc (why included in affs ?!)
# qian wang in accounting is prof, others are post-docs
# zhang hongwei is elec eng, agronomy one is post-doc
# Dept 7090 existed 2013-2015 and has no DEPT_SHORT_NAME
# The people associated w/7090 all transitioned to 7040 in 2016, which is ART/VISUAL CULT
# This assigns the name from 7040 to 7090
# we know prashant jha is agronomy, was hired 2019 but has no affil

aff7 <-
  aff6 %>%
  mutate(dept_short_name = ifelse(dept1 == 7090,
                                  "art/visual cult",
                                  dept_short_name),
         org_short_name = ifelse(dept1 == 7090,
                                 "college of design",
                                 org_short_name)) %>%
  filter(!(stringr::str_detect("peterson david", name_lfm20) &
             dept1 == 12170)) %>%
  filter(!(last_name == "johnson" &
             first_name == "duane" &
             org_short_name == "college of business")) %>%
  filter(!(last_name == "wang" &
             first_name == "li" &
             dept_short_name == "eeob")) %>%
  filter(!(last_name == "wang" &
             first_name == "qian" &
             dept_short_name == "physics/astron")) %>%
  filter(!(last_name == "wang" &
             first_name == "qian" &
             dept_short_name == "food sc/hn-hsci")) %>%
  filter(!(last_name == "zhang" &
             first_name == "hongwei" &
             dept_short_name == "agronomy")) %>%
  dplyr::add_row(
    year = 2019,
    dept1 = 1150,
    org_short_name = "college of agriculture & life sciences",
    dept_short_name = "agronomy",
    last_name = "jha",
    first_name = "prashant",
    mid_name = NA,
    name_lfm20 = paste(last_name, first_name))



# write it ----------------------------------------------------------------

write_csv(aff7, "data-raw/professors/02_munged-affs.csv")
vanichols/CyChecks3 documentation built on Jan. 19, 2021, 5:14 p.m.