knitr::opts_chunk$set(echo = TRUE,  
                      fig.align = 'center', 
                      message = FALSE)

# Copying code to highlight lines
hook_source <- knitr::knit_hooks$get('source')
knitr::knit_hooks$set(source = function(x, options) {
  x <- stringr::str_replace(x, "^[[:blank:]]?([^*].+?)[[:blank:]]*#<<[[:blank:]]*$", "*\\1")
  hook_source(x, options)
})

# Load packages
library(tidyverse)
library(lme4)
library(lmerTest)
library(CyChecks)
library(CyChecks2)

Overview


.pull-left[

.large[ 1. Motivation

  1. Data cleaning & issues

  2. Data visualization with Shiny

  3. Data modeling (help!) ]]

.pull-right[ ]


Motivation


knitr::include_graphics("figures/phdcomics.gif")

Motivation


.large[ - Final project for STAT 585, Spring 2019]

-- .large[ - How does pay compare for women and men in the Agronomy department? ]


Data cleaning & issues


.large[Where does this data come from?]

database

.center[]


Data cleaning & issues


Where does this data come from?

.tinyer[

sal_df<- function(limit= 1000, offset = 0, fiscal_year = 2007, token = NULL){
  checkmate::assertNumber(limit, lower = 0)
  checkmate::assertNumber(offset, lower = 0)
  checkmate::assertNumber(fiscal_year, lower =2007, upper = 2018)
  if (!is.null(token)){
 url <- sprintf("https://data.iowa.gov/resource/s3p7-wy6w.json?%s&$limit=%d&$offset=%d&$order=:id&department=Iowa%%20State%%20University&fiscal_year=%d", #<<
                token, limit, offset, fiscal_year) #<<
  } 
  else {
    url <- sprintf("https://data.iowa.gov/resource/s3p7-wy6w.json?$limit=%d&$offset=%d&$order=:id&department=Iowa%%20State%%20University&fiscal_year=%d", limit, offset, fiscal_year)
  }
  s <- tibble::as_tibble(jsonlite::fromJSON(url))
  checkmate::assertTibble(s, min.rows = 1, ncols =10)

  sals <- s %>%
    dplyr::select(-c(base_salary_date,department))%>%
    dplyr::mutate_at(dplyr::vars(fiscal_year, total_salary_paid, travel_subsistence), as.numeric)%>%
    dplyr::mutate(name = gsub(",","",name)) %>%
    dplyr::mutate(position = stringr::str_trim(position, side = "right"))

  checkmate::assertTibble(sals, min.rows = 1, ncols = 8)
  return(sals)
}

]

Data cleaning & issues


Where does this data come from?

.tiny[

{{ex <- sal_df(limit = 15, offset = 4500, fiscal_year = 2018)}}
knitr::kable(ex %>% select(-travel_subsistence), align = 'c', 'html') %>%
  kableExtra::kable_styling(font_size = 11, full_width = FALSE)

]

Data cleaning & issues


.large[Tidying government salary data]

--

--

--

--


Data cleaning & issues


--

--

--

--

--


Data visualization with Shiny


shiny!


Data modeling (help!)


--

--


Data modeling (help!)


.large[Response variable:] .pull-left[ .large[log(Salary)] - Gender needs to be included in the model ]
.pull-right[ .large[log(Ratio of Salaries)] - Gender "effect" emcompassed in the response term ]


Data modeling (help!)


.large[Response variable = log(salary) ]

data("cyd_salprofs")
ex <- cyd_salprofs %>% 
  mutate(college = replace_na(college, "college of combos")) %>% # added this so we can include soc, eeob, etc. 
  filter(grepl("college", college)) %>% 
  mutate(lsal = log(base_salary)) %>% 
  filter(base_salary > 0) 

goodcmp <- 
  ex %>% 
  # make it so each person only counts once per position
  group_by(college, dept, prof_simp, gender, id) %>% 
  summarise(base_salary = mean(base_salary)) %>% 
  # keep only depts w/m and f in both positions AT SOME POINT in the dataset
  group_by(college, dept, prof_simp, gender) %>% 
  summarise(n = n()) %>% 
  spread(gender, value = n) %>% 
  # if NA get rid of that line
  filter(!is.na(`F`)) %>% 
  filter(!is.na(M)) %>% 
  select(college, dept, prof_simp)

ex_all <- # including ALL years
  goodcmp %>% 
  left_join(ex)
m1a <- lmerTest::lmer(lsal ~ prof_simp * gender + (1 + gender|dept), data = ex_all)

Data modeling (help!)


sjPlot::tab_model(m1a, show.df = TRUE, show.aic = TRUE)

Data modeling (help!)


.large[Response variable = log(salary) ]

# making ratio table
lrat_all <- ex %>% 
  group_by(fiscal_year, college, dept, prof_simp, gender) %>% 
  summarise(base_salary = mean(base_salary)) %>% 
  spread(gender, base_salary) %>% 
  mutate(rat = M/`F`,
         lrat = log(rat))
m2a <- lmerTest::lmer(lrat ~ prof_simp + (1 | dept), data = lrat_all)

Data modeling (help!)


sjPlot::tab_model(m2a, show.df = TRUE, show.aic = TRUE)

Data modeling (help!)


# which depts have biggest effects on lrat? Show this...??!
as_tibble(ranef(m2a)) %>% 
  ggplot(aes(grp, condval)) +
  geom_col() + 
  labs(x = NULL, y = "Conditional Variance")+
  coord_flip()+
  theme_bw()


lydiaPenglish/CyChecks2 documentation built on Nov. 26, 2020, 5:34 a.m.