Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = FALSE,
comment = "#>"
)
options(rmarkdown.html_vignette.check_title = FALSE)
## ----eval = FALSE-------------------------------------------------------------
# install.packages("multid")
## ----eval = FALSE-------------------------------------------------------------
# install.packages("devtools")
# devtools::install_github("vjilmari/multid")
## ----setup,warning = FALSE, message = FALSE-----------------------------------
library(rio)
library(multid)
library(dplyr)
library(overlapping)
## ----importdata---------------------------------------------------------------
# create a temporary directory
td <- tempdir()
# create a temporary file
tf <- tempfile(tmpdir=td, fileext=".zip")
# download file from internet into temporary location
download.file("http://openpsychometrics.org/_rawdata/BIG5.zip", tf)
# list zip archive
file_names <- unzip(tf, list=TRUE)
# extract files from zip file
unzip(tf, exdir=td, overwrite=TRUE)
# use when zip file has only one file
dat <- import(file.path(td, "BIG5/data.csv"))
# delete the files and directories
unlink(td)
## -----------------------------------------------------------------------------
# save names of personality items to a vector
per.items<-names(dat)[which(names(dat)=="E1"):
which(names(dat)=="O10")]
# code item response 0 (zero) to not available (NA) on all these items
dat[,per.items]<-
sapply(dat[,per.items],function(x){ifelse(x==0,NA,x)})
# check that the numerical range makes sense
range(dat[,per.items],na.rm=T)
# calculate sum scores for Big Five (reverse coded items subtracted from 6)
# see codebook from https://openpsychometrics.org for item content and keys
dat$N<-rowMeans(
cbind(dat[,c("N1","N3","N5","N6","N7","N8","N9","N10")],
6-dat[,c("N2","N4")]),na.rm=T)
dat$E<-rowMeans(
cbind(dat[,c("E1","E3","E5","E7","E9")],
6-dat[,c("E2","E4","E6","E8","E10")]),na.rm=T)
dat$O<-rowMeans(
cbind(dat[,c("O1","O3","O5","O7","O8","O9","O10")],
6-dat[,c("O2","O4","O6")]),na.rm=T)
dat$A<-rowMeans(
cbind(dat[,c("A2","A4","A6","A8","A9","A10")],
6-dat[,c("A1","A3","A5","A7")]),na.rm=T)
dat$C<-rowMeans(
cbind(dat[,c("C1","C3","C5","C7","C9","C10")],
6-dat[,c("C2","C4","C6","C8")]),na.rm=T)
## -----------------------------------------------------------------------------
US.dat<-
na.omit(dat[dat$country=="US" & (dat$gender==1 | dat$gender==2),
c("gender",per.items,"N","E","O","A","C")])
# code categorical gender variables
US.dat$gender.cat<-
ifelse(US.dat$gender==1,"Male","Female")
## -----------------------------------------------------------------------------
d.N<-d_pooled_sd(data = US.dat,var = "N",
group.var = "gender.cat",group.values = c("Male","Female"),
infer=T)
d.E<-d_pooled_sd(data = US.dat,var = "E",
group.var = "gender.cat",group.values = c("Male","Female"),
infer=T)
d.O<-d_pooled_sd(data = US.dat,var = "O",
group.var = "gender.cat",group.values = c("Male","Female"),
infer=T)
d.A<-d_pooled_sd(data = US.dat,var = "A",
group.var = "gender.cat",group.values = c("Male","Female"),
infer=T)
d.C<-d_pooled_sd(data = US.dat,var = "C",
group.var = "gender.cat",group.values = c("Male","Female"),
infer=T)
# compile to a table
uni.d<-rbind(d.N,d.E,d.O,d.A,d.C)
rownames(uni.d)<-c("N","E","O","A","C")
round(uni.d,2)
## -----------------------------------------------------------------------------
# let's use the previously obtained univariate d-values
d_vector<-uni.d[,"D"]
# calculate intercorrelations
cor_mat<-cor(US.dat[,c("N","E","O","A","C")])
# test that the ordering of these variables matches
names(d_vector)==colnames(cor_mat)
# print the correlation matrix
round(cor_mat,2)
# Calculate D
D_maha<-sqrt(t(d_vector) %*% solve(cor_mat) %*% d_vector)
D_maha
## -----------------------------------------------------------------------------
# set seed number for reproducibility
set.seed(37127)
# decide the size of the regularization (training data fold)
# here, half of the smaller of the male and female groups is used
size=round(min(table(US.dat$gender.cat))/2)
# run D_regularized
D_out<-
D_regularized(data=US.dat,
mv.vars=c("N","E","O","A","C"),
group.var="gender.cat",
group.values = c("Male","Female"),
out = T,
size=size,
pcc = T,
auc = T,
pred.prob = T)
# print the output
round(D_out$D,2)
## -----------------------------------------------------------------------------
round(100*D_out$P.table,1)
## -----------------------------------------------------------------------------
coefficients(D_out$cv.mod,
s = "lambda.min")
coefficients(D_out$cv.mod,
s = "lambda.1se")
plot(D_out$cv.mod)
## ----fig.height=4, fig.width=8------------------------------------------------
# Obtain the D-estimate
D<-unname(D_out$D[,"D"])
# Overlap relative to a single distribution (OVL)
OVL<-2*pnorm((-D/2))
# Proportion of overlap relative to the joint distribution
OVL2<-OVL/(2-OVL)
# non-parametric overlap with overlapping package
np.overlap<-
overlap(x = list(D_out$pred.dat[
D_out$pred.dat$group=="Male","pred"],
D_out$pred.dat[
D_out$pred.dat$group=="Female","pred"]),
plot=T)
# this corresponds to Proportion of overlap relative to the joint distribution (OVL2)
np.OVL2<-unname(np.overlap$OV)
# from which Proportion of overlap relative to a single distribution (OVL) is approximated at
np.OVL<-(2*np.OVL2)/(1+np.OVL2)
# comparison of different overlap-estimates
round(cbind(OVL,np.OVL,OVL2,np.OVL2),2)
## -----------------------------------------------------------------------------
# number of redraws
reps=10
# placeholder for D-estimates
D_out_boot<-rep(NA,reps)
# repeat the procedure to obtain a distribution
t1<-Sys.time()
for (i in 1:reps){
# draw replaced samples with the same size as the original data
boot.dat<-sample_n(US.dat,size=nrow(US.dat),replace=T)
# run D_regularized for each sample
D_out_boot[i]<-
D_regularized(data=boot.dat,
mv.vars=c("N","E","O","A","C"),
group.var="gender.cat",
group.values = c("Male","Female"),
out = T,
size=size)$D[,"D"]
}
t2<-Sys.time()
t2-t1
# print 95% CI percentile interval
round(quantile(D_out_boot,c(.025,.975)),2)
## -----------------------------------------------------------------------------
# place-holder vector for univariate Cohen d's for items
d_vector_items<-rep(NA,length(per.items))
# loop through the items and obtain d for each item
for (i in 1:length(per.items)){
d_vector_items[i]<-
d_pooled_sd(data=US.dat,
var=per.items[i],
group.var="gender.cat",
group.values=c("Male","Female"))[,"D"]
}
cor_mat_items<-cor(US.dat[,per.items])
# Calculate D
D_maha_items<-
sqrt(t(d_vector_items) %*% solve(cor_mat_items) %*% d_vector_items)
D_maha_items
## -----------------------------------------------------------------------------
D_items_out<-
D_regularized(data=US.dat,
mv.vars=per.items,
group.var="gender.cat",
group.values = c("Male","Female"),
out = T,
size=size,pcc = T,auc = T,pred.prob = T)
# Item-based D
round(D_items_out$D,2)
# Big Five domain-based D
round(D_out$D,2)
## -----------------------------------------------------------------------------
round(100*D_items_out$P.table,1)
round(100*D_out$P.table,1)
## ----fig.height=4, fig.width=8------------------------------------------------
# Obtain the D-estimate
D_items<-unname(D_items_out$D[,"D"])
# Overlap relative to a single distribution (OVL)
OVL_items<-2*pnorm((-D_items/2))
# Proportion of overlap relative to the joint distribution
OVL2_items<-OVL_items/(2-OVL_items)
# non-parametric overlap
np.overlap_items<-
overlap(x = list(D_items_out$pred.dat[
D_items_out$pred.dat$group=="Male","pred"],
D_items_out$pred.dat[
D_items_out$pred.dat$group=="Female","pred"]),
plot=T)
# this corresponds to Proportion of overlap relative to the joint distribution (OVL2)
np.OVL2_items<-unname(np.overlap_items$OV)
# from which Proportion of overlap relative to a single distribution (OVL) is approximated at
np.OVL_items<-(2*np.OVL2_items)/(1+np.OVL2_items)
# compare the different overlap-estimates
round(cbind(OVL_items,np.OVL_items,
OVL2_items,np.OVL2_items),2)
# print Big Five domain based overlaps for comparison
round(cbind(OVL,np.OVL,OVL2,np.OVL2),2)
## -----------------------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.