# Libraries library(div) library(gridExtra) library(kableExtra) library(grid) library(lattice) # Options and defaults: options(knitr.table.format = "latex") knitr::opts_chunk$set(echo = FALSE) knitr::opts_chunk$set(warning = FALSE) knitr::opts_chunk$set(message = FALSE) knitr::opts_chunk$set(fig.width=12, fig.height=6) # The fake team: teamSize <- 400 pctFemales <- 0.35 maleSalaryBias <- 1.035 d <- div_fake_team(seed = 18901228, N = teamSize, genders = c("F", "M"), gender_prob = c(pctFemales, 1 - pctFemales), gender_salaryBias = c(1, maleSalaryBias) ) # Gender diversity tibble per grade: tbl_gender_div <- table(d$gender, d$grade) %>% apply(2, diversity, prior = c(50.2, 49.8)) %>% tibble(value = ., label = paste("Grade", names(.))) # Average gender diversity tibble: tbl_gender_div_all <- table(d$gender) %>% as.vector %>% diversity(prior = c(50.2, 49.8)) %>% tibble(value = ., label = "Gender Diversity") # Age diversity tibble: ageRef <- c(15.94, 27.11, 23.99, 20.68, 12.28) / 100 ad <- table(cut(d$age, breaks = c(20, 30, 40, 50, 60, 65))) ad <- ad / sum(ad) # not needed, diversity() does this, bu useful for the table tbl_age_div <- tibble(value = diversity(ad, prior = ageRef), label = "Age Diversity") # Gender PayGap tibble: pg_gender <- div_paygap(d) # Citizenship PayGap tibble pg_citizenship <- d %>% div_paygap(x = "citizenship" , x_ctrl = "Polish")
\tableofcontents[hideallsubsections]
The data used for this report is randomly generated with the following characteristics:
r teamSize
r pctFemales
r maleSalaryBias
\small
f1 <- tibble(Nbr = 1, Area = "Gender", Finding = "Where we can calculate the paygap between females and non-females, we find that the females generally earn less in similar roles and similar grades.", Suggestion = "Check the gender-paygap table and identify the grade/role combinations where an the paygap has most stars. Check if the salary differences are justified.") f2 <- tibble(Nbr = 2, Area = "Age", Finding = "The team is predominantly younger than the surrounding population (Poland).", Suggestion = "Consider hiring older people to balance. Focus on retention.") f3 <- tibble(Nbr = 3, Area = "Gender", Finding = "The diversity is good in grade 1 and 2, but under par in grade 3", Suggestion = "Consider if females have barriers to apply to grade 3 jobs and remove the barriers.") f4 <- tibble(Nbr = 4, Area = "Gender", Finding = "Males in Grade 2 seem to have been promoted faster.", Suggestion = "Understand unconscious bias, coach everyone (and specially females), work on trust.") findings <- rbind(f1, f2, f3, f4) colnames(findings) <- c("Nbr", "Area", "Finding", "Suggestion") knitr::kable(findings) %>% column_spec(3:4, width = "12em") write_csv(findings,'findings.csv')
f_wrap <- function(x) { strwrap(x, width = 55, simplify = FALSE) %>% sapply(paste, collapse = "\n") } lay1 <- rbind(c(1,1,2), c(1,1,2) ) p1_1 <- div_gauge_plot(tbl_gender_div, ncol = 2, nbrSize = 4) p1_2 <- div_gauge_plot(tbl_gender_div_all, ncol = 1) p1 <- arrangeGrob(p1_1, p1_2, layout_matrix = lay1) p2 <- textGrob(f_wrap(f3$Finding)) p3 <- div_gauge_plot(tbl_age_div, ncol = 1) p4 <- textGrob(f_wrap(f2$Finding)) p5 <- div_plot_paygap_distribution(pg_gender$data$paygap, label = "Gender") p6 <- textGrob(f_wrap(f1$Finding)) p7 <- div_plot_paygap_distribution(pg_citizenship$data$paygap, label = "Citizenship") p8 <- textGrob(f_wrap("No Bias detetected for Citizenship (in salary) -- both team and unbiased distribution are virtually the same")) lay <- rbind(c(2,2,4,4), c(1,1,3,3), c(1,1,3,3), c(1,1,3,3), c(6,6,8,8), c(5,5,7,7), c(5,5,7,7), c(5,5,7,7) ) grid.arrange(p1, p2, p3, p4, p5, p6, p7, p8, layout_matrix = lay)
```r"} t <- table(d$gender, d$grade) t1 <- apply(t, 2, diversity, prior = c(50.2, 49.8)) t2 <- tibble(value = t1, label = paste("Grade", names(t1))) div_gauge_plot(t2, ncol = 3)
## Age diversity ```r"} # ages in Poland p1 <- div_gauge_plot(tbl_age_div) t <- rbind(ad, ageRef) rownames(t) <- c("team", "reference") #knitr::kable(round(t,2), # caption = "The percentages of different age groups in this team compared to the population of Poland. An equal distribution would yield a diversity of 1.\\label{tab:div:age}") t2 <- t %>% t() %>% as_tibble %>% mutate(age_group = rownames(t(t))) %>% as_tibble %>% gather("population", "percentage", 1:2) p2 <- ggplot(t2, aes(x = age_group, y = percentage, fill = population)) + geom_bar(stat="identity", position=position_dodge()) + geom_text(aes(label=round(percentage,2)), vjust=1.6, color="black", position = position_dodge(0.9), size=3.5) + theme(legend.position="bottom") grid.arrange(p1, p2, ncol=2, widths = 1:2)
\framesubtitle{Distribution for the team}
#t <- table(d$citizenship, d$grade) t <- d %>% group_by(citizenship) %>% summarise(n = n()) %>% arrange(desc(n)) %>% ggplot( aes(x = citizenship, y = n)) + geom_col(aes(fill = n), width = 0.7) + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5), axis.title.x=element_blank()) t
\framesubtitle{Breakdown per Grade}
t <- d %>% group_by(citizenship, grade) %>% summarise(n = n()) %>% ggplot( aes(x = grade, y = n)) + geom_col(aes(fill = n), width = 0.7) + facet_wrap(~ citizenship, scales="free_y") + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5), axis.title.x=element_blank()) t
div_parse_paygap(pg_gender)
div_parse_paygap(pg_citizenship)
div_paygap(d, x = "age") %>% div_parse_paygap
div_paygap(d, x = "tenure_firm") %>% div_parse_paygap
```r)."} ylim1 = boxplot.stats(1/d$tenure_job)$stats[c(1, 5)] p <- ggplot(data = d, aes(x=gender, y=1/tenure_job, fill=gender)) + geom_boxplot() + facet_wrap(~grade) + coord_cartesian(ylim = ylim1*1.05) p
## Promotions per Year per Gender ```r"} ylim1 = boxplot.stats(1/d$tenure_job)$stats[c(1, 5)] p <- ggplot(data = d, aes(x=gender, y=1/tenure_grade, fill=gender)) + geom_boxplot() + facet_wrap(~grade) + coord_cartesian(ylim = ylim1*1.05) p
conclusions <- rbind(tibble(Nbr = 0, Suggestion = 'Learn more by reading e.g. "The Essentials of Diversification & Inclusion", Dabrowska (2019)' ), findings[,c(1,4)]) knitr::kable(conclusions) %>% column_spec(2, width = "22em")
nocite: | @diversityhub2019
paygap
= the ratio of median salaries of one group divided by the median of the salaries of the other groupNA
} = numbers are too small, please look at individuals;.
} = maybe there is some bias, but the numbers are low, check individuals;*
} = you should check for bias;**
} = bias is probably there;***
} = most certainly there is biasSo, there will be more stars if the probability of a bias is higher: this can be due to a higher bias and/or due to a larger sample size.
\small
grade
= the salary grade as used in the companyjobID
= a unique identifier of the job category (can be abbreviated)sal_F
= the median salary of the females (F)sal_oth
= the median salary of the other groups (non F). The tool is open to use more than one gender.age_F
= the median age of the females (or age_Pol
could be the median age of the team members with Polish citizenship)age_oth
= the median age of the other groups take together (e.g. the median age of non females)paygap
= the ratio of median salary earned by the selected group (e.g. females) divided by the median of the other people. If this is lower than $1$, then median female earns less than the median non-female.conf.
= the confidence level that this paygap is significant.We express diversity as a number between zero and one. Our calculation is based on @debrouwer2020 and more in particular section 36.3.1 ``The Business Case: a Diversity Dashboard''. Details can be found in the book. The method is:
```r"} x <- seq(length.out = 500, from = 0, to = 1) t <- cbind(x, 1-x) tt <- tibble(diversity = apply(t, 1, FUN = diversity), x = x) p1 <- ggplot(data=tt, aes(x=x, y=diversity, color = diversity)) + geom_line(lwd = 3) + ggtitle("Diversity index for binary variables and prior probabilities of 50/50") + xlab("Proportion of one of the two classes")
tt <- tibble(diversity = apply(t, 1, FUN = diversity, prior = c(.7,.3)), x = x) p2 <- ggplot(data=tt, aes(x=x, y=diversity, color = diversity)) + geom_line(lwd = 3) + ggtitle("Diversity index for binary variables and prior probabilities of 70/30") + xlab("Proportion of one of the two classes") grid.arrange(p1, p2, ncol = 1)
## The confidence level and p-value The p-value is the probability that we make a mistake by assuming that there is no paygap. It is calculated by splitting the data on a variable in binary factors (e.g. Females and others) and then checking how likely it is that a random person from the first group earns less than a random person from the second group. This is done by a method known as Mann-Whitney U test: [see Wikipedia](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test)[^1] [^1]: The Mann–Whitney U test (aka. Mann–Whitney–Wilcoxon (MWW), Wilcoxon rank-sum test, or Wilcoxon–Mann–Whitney test) is a nonparametric test of the null hypothesis that, for randomly selected values X and Y from two populations, the probability of X being greater than Y is equal to the probability of Y being greater than X. If we assume that the distributions are symmetric, it boils down to a test that the medians are different. ## Another view on the PayGap ```r."} # Transformation function for 2 decimals scaleFUN <- function(x) sprintf("%.0f", x) p <- ggplot(data = d, aes(x=gender, y=salary, fill=gender)) + geom_boxplot() + facet_wrap(~grade + jobID, scale="free") + scale_y_continuous(labels=scaleFUN) p
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.