#' Plotting function for more comprehensible gender effects
#'
#' @param
#' @keywords flat; violin
#' @export
#' @examples
#' library(AER)
#' data("PhDPublications")
#'
#' genderplot::gender_plot(PhDPublications, varname="prestige")
#' genderplot::gender_plot(PhDPublications, varname="prestige", lb=-1) #need to adjust lower bound
#' # to depict larger gender differences
#'
#' data(STAR) #data on effect of reducing class size on test scores in the early grades
#' genderplot::gender_plot(STAR, varname="read1") #read1 = reading in first grade
#' genderplot::gender_plot(STAR, varname="math3") #math3= math in 3rd grade
#'
#' data("TeachingRatings") #Data on course evaluations, course characteristics,
#' # and professor characteristics (beauty)
#' genderplot::gender_plot(TeachingRatings, varname="age") #more old professors are male
#' genderplot::gender_plot(TeachingRatings, varname="age", lb=-1, ub=2.5) #need to adjust
#' #lower and upper bound
#' genderplot::gender_plot(TeachingRatings, varname="beauty", lb=-1) #more beautiful
#' # professors are female
#' genderplot::gender_plot(TeachingRatings, varname="eval") # higher evaluated professors
#' # are more often male than female
gender_plot <- function(data, varname, gender="gender",
color =NA, lb=NA, ub=NA, varname_title=NA){
if (!requireNamespace("ggplot2", quietly = TRUE)) {"The `ggplot2` package is not installed but required. You can install it using install.packages('ggplot2')"}
if (!requireNamespace("dplyr", quietly = TRUE)) {"The `dplyr` package is not installed but required. You can install it using install.packages('dplyr')"}
if (!requireNamespace("tidyr", quietly = TRUE)) {"The `tidyr` package is not installed but required. You can install it using install.packages('tidyr')"}
if (!requireNamespace("see", quietly = TRUE)) {"The `see` package is not installed but required. You can install it using install.packages('see')"}
if (!requireNamespace("bayestestR", quietly = TRUE)) {"The `bayestestR` package is not installed but required. You can install it using install.packages('bayestestR')"}
if (!requireNamespace("overlapping", quietly = TRUE)) {"The `overlapping` package is not installed but required. You can install it using install.packages('overlapping')"}
requireNamespace("ggplot2"); requireNamespace("tidyr"); requireNamespace("dplyr");
requireNamespace("see"); requireNamespace("bayestestR"); requireNamespace("overlapping");
if(is.na(varname_title)) varname_title <- varname
if(is.na(color)){color <- "black"} #"#B9001EFF"
if(is.na(lb)){lb <- -0.5}
if(is.na(ub)){ub <- 1 }
data <- as.data.frame(data)
tmp <- data.frame(gender=data[,gender])
tmp$Sum <- data[,varname]
if(!is.numeric(tmp$Sum)){
stop(paste0(varname," must be a numeric vector"))
}
if(length(tmp$Sum) == 1){
warning(paste0(varname," has length 1, will return NA"))
}
if(!all(levels(tmp$gender) %in% c("male","female"))){
if (!requireNamespace("gendercoder", quietly = TRUE)){"The `gendercoder` package is not installed. It is optional: you can install it via devtools::install_github('ropenscilabs/gendercodeR'), or you provide a gender column with factors labelled 'male' and 'female'"}
tmp$gender <- recode_gender(tmp$gender, dictionary = broad)
}
tmp$gender <- factor(tmp$gender, levels=c("male","female"),
ordered=TRUE)
breaks_left <- c(0,.10, .25, .49)
breaks_right <- c(.51, .75,.90,1)
tmp <- tmp %>%
dplyr::mutate(quant= dplyr::ntile(Sum,100) / 100)
dd <- data.frame()
for(p in c(.10, .25, .49, .51, .75,.90)){ #c(.01, .05, .10, .25, .49, .51, .75,.90, .95, .99)
if(p < .5){
tmp2 <- tmp[which(tmp$quant <= p ),]
N_boys <- length(which(tmp2$gender=="male"))
N_girls <- length(which(tmp2$gender=="female"))
if(is.na(sd(tmp2$Sum,na.rm=T))){cd <- 0.00; cd_low <- NA; cd_high <- NA
} else if(sd(tmp2$Sum, na.rm=T) ==0 ){cd <- 0.00; cd_low <- NA; cd_high <- NA
} else {
cd <- effsize::cohen.d(Sum ~gender, tmp2)
cd_low <- as.numeric(cd$conf.int[1])
cd_high <- as.numeric(cd$conf.int[2])
cd <- round(cd$estimate,2)}
}else{
tmp2 <- tmp[which(tmp$quant > p ), ]
N_boys <- length(which(tmp2$gender=="male"))
N_girls <- length(which(tmp2$gender=="female"))
if(is.na(sd(tmp2$Sum,na.rm=T))){cd <- 0.00; cd_low <- NA; cd_high <- NA
}else if(sd(tmp2$Sum) ==0){cd <- 0.00; cd_low <- NA; cd_high <- NA} else {
cd <- effsize::cohen.d(Sum ~gender, tmp2)
cd_low <- as.numeric(cd$conf.int[1])
cd_high <- as.numeric(cd$conf.int[2])
cd <- round(cd$estimate,2)}
}
row <- data.frame( #grade=g,
percentile=p, d=cd, d_low=cd_low, d_high=cd_high,
N_boys=N_boys, N_girls=N_girls)
dd <- rbind(dd,row)
}
dd2 <- dd %>%
dplyr::group_by(percentile) %>%
dplyr::summarise(Nsum_boys=sum(N_boys), Nsum_girls = sum(N_girls)) %>%
dplyr::mutate(N_total = Nsum_boys+Nsum_girls, MF_ratio = round(Nsum_boys/Nsum_girls,2)) %>%
dplyr::ungroup()
dd3 <- dd2 %>%
dplyr::select(percentile, Nsum_boys, Nsum_girls) %>%
tidyr::pivot_longer(-percentile)
# Overlap measures
Y <- data.frame(tmp)[tmp$gender == "female", "Sum"] # 1 = FEMALE = Y
X <- data.frame(tmp)[tmp$gender == "male", "Sum"]
d_obs <- (mean(X)- mean(Y)) / (sqrt( (((sd(X)^2) + (sd(Y)^2))) /2 ))
##overlap_Bayes
ov_Bayes <- bayestestR::overlap(X,Y, method_density="KernSmooth",
method_auc = "spline")
##overlap_Reiser
ov_Reiser <- 2* pnorm( - abs(d_obs) / 2)
##overlap_Pastore
ov_Pastore <- overlapping::overlap(list(X,Y))$OV
# for plotting distributions
tmp$section1 <- NA; tmp$section2 <- NA
tmp$section1 <- cut(tmp$quant, breaks_left)
tmp$section2 <- cut(tmp$quant, breaks_right)
tmp$section <- dplyr::coalesce(tmp$section1, tmp$section2)
tmp$percentile <- car::recode(tmp$section, "'(0,0.1]' = 0.10; '(0.1,0.25]' =0.25;'(0.25,0.49]'=0.49; '(0.51,0.75]'=0.51; '(0.75,0.9]'=0.75;'(0.9,1]'=0.90")
tmp$percentile <- as.numeric(as.character(tmp$percentile))
tmp <- tmp[!is.na(tmp$percentile),]
# z-score of sum
tmp <- tmp %>% dplyr::group_by(percentile) %>% dplyr::mutate(Sum_z = scale(Sum))
tmp$Sum_z <- tmp$Sum_z/10
#plot
pd <- ggplot2::position_dodge(.02)
dodge <- ggplot2::position_dodge(1)
dd2$label_pos <- ifelse(dd2$percentile == .49, .465,
ifelse(dd2$percentile == .51, .535, dd2$percentile))
#
ggplot2::ggplot(data=dd, ggplot2::aes(x=percentile,y=d))+
ggplot2::geom_hline(yintercept=0)+
ggplot2::geom_hline(yintercept=(1/4)+.5, linetype="dashed", col="grey65")+
#geom_flat_violin(data=tmp[tmp$gender== "female" , ], ggplot2::aes(x=percentile, y = Sum_z, group=factor(percentile)), fill="#D95F02", col=NA,size=.7,width=.2, alpha = 0.2)+
#geom_flat_violin(data=tmp[tmp$gender== "male", ], ggplot2::aes(x=percentile, y = Sum_z, group=factor(percentile)), fill="#1B9E77", col=NA, size=.7,width=.2, alpha = 0.2) +
see::geom_violinhalf(data=tmp[tmp$gender=="female" & tmp$percentile < .50,], flip=TRUE, ggplot2::aes(x=percentile, y= Sum_z, group=factor(percentile)), fill="#D95F02", col=NA, size=.7, width=0.2, alpha=0.2)+
see::geom_violinhalf(data=tmp[tmp$gender=="male" & tmp$percentile < .50,], flip=TRUE, ggplot2::aes(x=percentile, y= Sum_z, group=factor(percentile)), fill="#1B9E77", col=NA, width=0.2, alpha=0.2)+
see::geom_violinhalf(data=tmp[tmp$gender=="female" & tmp$percentile > .50,], ggplot2::aes(x=percentile, y= Sum_z, group=factor(percentile)), fill="#D95F02", col=NA, size=.7, width=0.2, alpha=0.2)+
see::geom_violinhalf(data=tmp[tmp$gender=="male" & tmp$percentile > .50,], ggplot2::aes(x=percentile, y= Sum_z, group=factor(percentile)), fill="#1B9E77", col=NA, width=0.2, alpha=0.2)+
ggplot2::geom_errorbar( ggplot2::aes( ymin=d_low, ymax=d_high),position=pd,
colour="grey35", width=.02, alpha=.4) +
ggplot2::geom_line(data=dd2, ggplot2::aes(x=percentile, y=(MF_ratio/4)+.5), col="grey65" )+
ggplot2::geom_point(col=color, size=2.5, position=pd, alpha=.7)+
ggplot2::geom_line(col=color,size=.9,linetype="solid", position=pd, alpha=.7)+
ggplot2::geom_text(data=dd2, ggplot2::aes(x=label_pos, y=-.4, label=Nsum_boys),
col="#1B9E77")+
ggplot2::geom_text(data=dd2, ggplot2::aes(x=label_pos, y=-.48, label=Nsum_girls),
col="#D95F02")+
ggplot2::geom_text(data=dd2, ggplot2::aes(x=0.00, y=-.44, label="N ="))+
ggplot2::geom_text(data=dd2, ggplot2::aes(x=percentile, y=(MF_ratio/4)+.5, label=MF_ratio),
col="grey55", fontface="bold")+
ggplot2::geom_label(ggplot2::aes(.22,1.25, label="Differences in favor of boys"),
col="gray30", fill="#1B9E77", size=2.5)+
ggplot2::geom_label(ggplot2::aes(.22,-1.25, label="Differences in favor of girls"),
col="gray30", fill="#D95F02", size=2.5)+
ggplot2::theme_bw()+
ggplot2::scale_color_manual(guide="none", values=color)+
ggplot2::ggtitle(paste("Gender effects across percentiles for",varname_title))+
# labs(subtitle="Colored points: Differences in Cohen's d and 95% CI; Grey: Male/female ratio (second y-axis); \nbottom: sample size per percentile (red:girls; blue: boys)")+
ggplot2::scale_y_continuous(name=" \t Cohen's d", limits=c(lb, ub), breaks=seq(lb,0.5, ub-0.5), sec.axis=ggplot2::sec_axis( ~(.*4)-2,
breaks=c(0,1,2,3),
name="Male:female ratio")) +
ggplot2::scale_x_continuous(limits=c(0,1), breaks=c(0.10, 0.25,0.49, 0.50, 0.51, 0.75, 0.90), labels=c("0.10", "0.25","", "0.50", "", "0.75", "0.90"))+
ggplot2::theme(panel.grid.minor.x = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_blank(),
axis.title.y = ggplot2::element_text(hjust=.33))+
ggplot2::theme(axis.text=ggplot2::element_text(size=13),
axis.title = ggplot2::element_text(size=14),
title = ggplot2::element_text(size=14))+
ggplot2::labs(caption = paste0("Overlap: eta (Reiser & Faraggi) = ", round(ov_Reiser,2),", eta_hat (Pastore) = " ,round(ov_Pastore,2),", Bayesian (Makowski et al.) = ",round(ov_Bayes,2)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.