# devtools::install_github("peterhalpin/cirt")
# library("cirt")
rm(list = ls())
library("ggplot2")
library("dplyr")
source("~/github/cirt/R/cIRF_functions.R")
source("~/github/cirt/R/IRF_functions.R")
# NYU purple, defined for use as a plot color (not referenced in this chunk)
NYU <- rgb(87, 6, 140, maxColorValue = 255)
# Sanity plot: Beta(1.025, 1.025) density over a fine grid on [0, 1],
# with the uniform density (horizontal line at 1) for reference
a <- seq(0, 1, by = 0.01)
beta_dens <- dbeta(a, 1.025, 1.025)
plot(a, beta_dens, type = "l")
abline(h = 1, col = 2)
# ------------------------------------------------------------
# Load data and estimate individual thetas
# ------------------------------------------------------------
# Load calibrated item parms
setwd("~/Dropbox/Academic/Projects/CA/Data/response_matrices")
parms <- read.csv("calibration_parms.csv", row.names = 1)
summary(parms)
# Drop DIF items (045 and 065; these are the items flagged by the Mplus
# scalar-model DIF analysis at the bottom of this script).
# BUG FIX: `dif_items` was used here before it was defined anywhere in a
# fresh session (the definition was commented out and only re-created at
# the very end of the file), so this line errored. Define it explicitly.
dif_items <- "045|065"
parms <- parms[!grepl(dif_items, row.names(parms)), ]
# Regex alternation of the retained item names, used for grep'ing columns
items <- paste(row.names(parms), collapse = "|")
# Load collaboration data and split into forms
# check.names = F keeps the item-name columns exactly as in the CSV header
collab <- read.csv("collaboration_2016_0.csv", check.names = F)
#collab <- collab[!collab$group_id%in%collab$group_id[drops], ]
head(collab)
# format_resp (cirt) extracts the COL / IND item columns as response matrices
col_form <- format_resp(collab, row.names(parms), "COL")
ind_form <- format_resp(collab, row.names(parms), "IND")
# Apply conjunctive scoring rule: pair members sit in adjacent rows
# (odd = member 1, odd + 1 = member 2); a collaborative item is scored
# correct only if both members got it right (the product), and the
# product is written back into both members' rows.
odd <- seq(1, nrow(col_form), by = 2)
col_form[odd,] <- col_form[odd+1,] <- col_form[odd,]*col_form[odd+1,]
# Drop 13 unusable response patterns (all 1 or all 0): groups whose
# conjunctive or individual form has row mean exactly 0 or 1 carry no
# information for ML scoring (theta estimates diverge).
drop_groups <- c(
  collab$group_id[apply(col_form, 1, mean, na.rm = TRUE) %in% c(1, 0)],
  collab$group_id[apply(ind_form, 1, mean, na.rm = TRUE) %in% c(1, 0)])
col_form <- col_form[!collab$group_id %in% drop_groups, ]
ind_form <- ind_form[!collab$group_id %in% drop_groups, ]
# Reset odd row indices after the drops
odd <- seq(1, nrow(col_form), by = 2)
# Estimate theta for ind forms
# WMLE = T presumably requests weighted (bias-corrected) ML — confirm in cirt
ind <- MLE(ind_form, parms, WMLE = T)
plot(ind$theta, ind$se)
# Split estimates by pair member: odd rows = member 1, odd + 1 = member 2
theta1 <- ind$theta[odd]
theta2 <- ind$theta[odd+1]
theta1_se <- ind$se[odd]
theta2_se <- ind$se[odd+1]
resp <- col_form[odd,]
# NOTE(review): the next line immediately overwrites the previous one —
# resp ends up as IND and COL forms bound column-wise, over ALL rows.
resp <- cbind(ind_form, col_form)
head(resp)
# Joint estimation of (w, theta1, theta2) via est_WA (cirt), MAP method
q <- est_WA(resp, parms, SE = "exp", method = "map", parallel = F)
hist(q$w)
plot(q$w, q$w_se)
plot(q$theta1, q$theta1_se)
plot(q$theta2, q$theta2_se)
# ------------------------------------------------------------
# WA parameter estimation
# ------------------------------------------------------------
set.seed(101)
# Number of plausible-value / bootstrap replications used below
n_reps <- 250
# Number of pairs
n_obs <- length(theta1)
# NOTE(review): this random w is dead code — it is overwritten by the
# fixed grid a few lines down before anything reads it.
w <- runif(n_obs, 0, 1)
# Profile log-likelihood of the weight parameter for pair i: evaluates
# the WA model at every candidate weight in w, holding theta1/theta2
# fixed at pair i's individual estimates. Reads the globals theta1,
# theta2, theta1_se, theta2_se, resp, parms and col.
# NOTE(review): `col` (COL column indices of resp) is only defined
# further down the script — this function errors if called before then.
ell <- function(i, w){
# One row of pair i's COL responses per candidate weight
nw <- length(w)
t1 <- rep(theta1[i], nw)
t2 <- rep(theta2[i], nw)
# NOTE(review): s1 and s2 are computed but never used below
s1 <- rep(theta1_se[i], nw)
s2 <- rep(theta2_se[i], nw)
R <- matrix(as.numeric(resp[i,col]), nrow = nw, ncol = nrow(parms), byrow = T)
# m_WA (cirt): WA-model log-likelihood at each weight — see package docs
m_WA(R, w, parms, t1, t2)
}
# Grid of weights just inside (0, 1) to avoid boundary likelihoods
w <- seq(0.0001, .99999, by = .1)
# Interactive block: inspect one pair at a time by re-running from here;
# i is incremented at the bottom of the block.
i = 1
y <- ell(i, w)
# Title shows grid argmax next to the ML estimate for pair i.
# NOTE(review): ml is only created below (mle_WA) — run that line first.
plot(w, y, type = "l", main = round(c(w[which.max(y)], ml$w[i]), 3))
# Raw score summaries for the pair and each member
sum(resp[i,], na.rm = T)
sum(ind_form[odd[i],], na.rm = T)
sum(ind_form[odd[i]+1,], na.rm = T)
resp[i,]
theta1[i]
theta2[i]
# Grid argmax of the profile likelihood
w[which.max(y)]
i = i + 1
# Column indices of the collaborative (COL) items within resp
col <- grep("COL", names(resp))
# MAP and ML weight estimates with thetas fixed at the individual MLEs
map <- map_WA(resp[odd,col], parms, theta1, theta2, SE = "exp")
map
ml <- mle_WA(resp, parms, theta1, theta2, SE = "exp")
# MAP vs ML weights, with identity line for reference
plot(map$w, ml$w)
abline(a = 0, b = 1)
# Posterior SDs and ML SEs against each other
plot(map$w, map$psd)
plot(map$psd, ml$se)
abline(a = 0, b = 1)
# Negative full-model log-likelihood, shaped for optim():
# par = (w, theta1, theta2); resp = the pair's two response rows.
m <- function(par, resp) {
  w_par <- par[1]
  t1 <- par[2]
  t2 <- par[3]
  -m_full(resp, w_par, parms, t1, t2)
}
# Spot-check the negative log-likelihood at an extreme parameter point.
# NOTE(review): this runs with whatever i is left over from the loop
# above; i is only set to 7 on the NEXT line.
m(c(.999, -5, -5), resp[c(odd[i], odd[i]+ 1),])
i = 7
# Box-constrained minimization over (w, theta1, theta2);
# hessian = T so asymptotic SEs can be pulled from the inverse Hessian
q <- optim(c(.5,0,0),
m,
resp = resp[c(odd[i], odd[i]+ 1),],
#method = "Nelder-Mead",
method = "L-BFGS-B",
lower = c(.00001, -4, -4),
upper = c(.99999, 4, 4),
hessian = T
)
q$par
# SEs from the inverse Hessian of the negative log-likelihood
sqrt(diag(solve(q$hessian)))
# Compare against the fixed-theta estimates for the same pair
theta1[i]
theta2[i]
theta1_se[i]
theta2_se[i]
map[i,]
# NOTE(review): points() assumes one of the earlier scatterplots is
# still the active graphics device
points(ml$w, ml$se, col = 2)
hist(map$w)
# NOTE(review): ad-hoc rescaling of Hessian-based SEs (/10) — confirm
# intent before reusing
sqrt(1/diag(map$hessian))/10
#MLE(ind_form[odd[i + 1],], parms, WMLE = T)
# Estimate weights for empirical data
ml <- mle_WA(resp, parms, theta1, theta2, SE = "exp")
plot(ml$w, ml$se)
# mle_WA2 also takes the theta SEs — presumably propagating theta
# uncertainty into the weight SEs; confirm against cirt docs
ml <- mle_WA2(resp, parms, theta1, theta2, theta1_se, theta2_se, SE = "obs")
# How many weight estimates sit on the upper boundary?
sum(ml$w == 1)
plot(ml$w, ml$se)
# Weight vs within-pair ability discrepancy
plot(abs(theta1-theta2), ml$w)
# Per-pair mean of item_delta (cirt) vs the estimated weight
q <- apply(item_delta(parms, theta1, theta2, NA_pattern = resp), 1, mean, na.rm = T)
plot(q, ml$w)
hist(ml$w)
# Generate plausible values for theta estimates
# pv_gen (cirt): n_reps plausible-value draws per pair
pv <- pv_gen(n_reps, resp, parms, theta1, theta2, theta1_se, theta2_se, weights = NA)
# Estimate weights for generated data
ml_pv <- mle_WA(pv[grep(items, names(pv))], parms, pv$theta1, pv$theta2, SE = "exp", starts = pv$w, parallel = T)
pv <- cbind(pv, ml_pv)
# Rubin-style pooling over plausible values, per pair:
# point estimate, within-replication variance, between-replication variance
pv_w <- tapply(pv$w, pv$pairs, mean)
var_w <- tapply(pv$se^2, pv$pairs, mean)
var_b <- tapply(pv$w, pv$pairs, var)
# Total SE with the (1 + 1/n_reps) finite-replication correction
pv_se <- sqrt(var_w + (1 + 1/n_reps) * var_b)
# Relative increase in variance due to theta uncertainty, and the
# proportion of total variance it accounts for
var_increase <- (1 + 1/n_reps) * var_b/var_w
var_prop <- (1 + 1/n_reps) * var_b/pv_se^2
summary(var_prop)
summary(var_increase)
# SEs Figure ---------------------
# Compare ML standard errors to the pooled plausible-value (PV) SEs,
# both plotted against the PV point estimates of w.
# BUG FIX: the weight column used rep(pv_w, times = n_obs*2), giving a
# vector of length 2*n_obs^2; the other two columns have length 2*n_obs.
# Repeating the n_obs pooled weights twice — once for the "ML" rows and
# once for the "PV" rows — makes the columns conform.
gg_v <- data.frame(c(ml$se, pv_se), rep(pv_w, times = 2), rep(c("ML", "PV"), each = n_obs))
names(gg_v) <- c("SE", "PV_W", "Type")
ggplot(gg_v, aes(x = PV_W, y = SE, group = Type)) + geom_point(size = 3, aes(shape = Type, color = Type)) + ylim(c(0,.8)) +
  scale_color_manual(values = c("grey50", "black")) +
  scale_shape_discrete(solid=F) +
  theme_set(theme_grey(base_size = 15))
# ------------------------------------------------------------
# Goodness of fit
# ------------------------------------------------------------
# Observed per-pair log-likelihoods from the ML fit
logL <- ml$logL
# Parametric bootstrap: simulate response data at the pooled weights,
# then re-estimate weights on each simulated data set
sim <- data_gen(n_reps, pv_w, parms, theta1, theta2, NA_pattern = resp)
temp <- mle_WA(sim[grep(items, names(sim))], parms, sim$theta1, sim$theta2, SE = "exp", starts = sim$w, parallel = T)
sim <- cbind(sim, temp)
head(sim)
# Per-pair reference distribution: after the transpose, column 1 holds
# the simulated median logL and column 2 the 2.5th percentile
quant <- tapply(sim$logL, sim$pairs, function(x) quantile(x, p = c(.5, .025))) %>% unlist %>% matrix(, nrow = 2, ncol = length(theta1)) %>% t()
# Visual key for misfit: flag pairs whose observed logL falls below the
# simulated 2.5th percentile
fit <- rep("<.95", times = length(theta1))
fit[logL < quant[,2]] <- ">.95"
fit <- ordered(fit, c(">.95", "<.95"))
# Set up and plot: one row per simulated replication per pair, ordered
# by the simulated median deviance
gg <- data.frame(-2*sim$logL, -2*rep(logL, each = n_reps), rep(fit, each = n_reps), rep(-2*quant[,1], each = n_reps))
names(gg) <- c("l_dist", "l_obs", "fit", "median")
gg <- gg[order(gg$median), ]
gg$pair <- rep(1:length(theta1), each = n_reps)
# Boxplots of the simulated -2 logL per pair with the observed value
# overlaid; misfitting pairs get a distinct point shape and size
ggplot(gg, aes(x = pair, y = l_dist, group = pair)) +
geom_boxplot(outlier.size = 0, outlier.color = "grey90", aes(fill = fit)) +
geom_point(aes(x = pair, y = l_obs, pch = fit, size = fit)) +
scale_shape_manual(values = c(4, 20)) +
scale_fill_grey(start = 0.1, end = 0.8) +
xlab("Groups") +
ylab("-2 * log-likelihood") +
scale_size_manual(values = c(4, 1)) +
theme(legend.title=element_blank()) +
theme_set(theme_grey(base_size = 15))
# ------------------------------------------------------------
# Process loss
# ------------------------------------------------------------
# Sanity checks on dimensions before the expected-likelihood section
dim(pv)
dim(resp)
dim(parms)
head(pv)
# Expected log likelihoods under each collaboration model, evaluated at
# the additive-model response probabilities (p_add)
p_add <- format_NA(cIRF("Add", parms, pv$theta1, pv$theta2), NA_pattern = resp)
e_max <- likelihood("Max", p_add, parms, pv$theta1, pv$theta2)
e_ai <- likelihood("Add", p_add, parms, pv$theta1, pv$theta2)
e_ind <- likelihood("Ind", p_add, parms, pv$theta1, pv$theta2)
# WA model at the pooled weights, and at the per-replication weights
e_wa <- l_WA(p_add, rep(pv_w, each = n_reps), parms, pv$theta1, pv$theta2)
e_wa2 <- l_WA(p_add, pv$w, parms, pv$theta1, pv$theta2)
# Process loss: 1 = no loss relative to the additive ideal,
# 0 = indistinguishable from independent responding
pl_wa <- 1 - (e_ai - e_wa)/(e_ai - e_ind)
pl_wa2 <- 1 - (e_ai - e_wa2)/(e_ai - e_ind)
plot(pl_wa, pl_wa2)
# BUG FIX: the denominator was on its own line starting with "/", which
# is a syntax error in R (the assignment terminated at the end of the
# previous line, leaving pl_max = e_ai - e_max and an orphaned "/..."
# expression). Keep the whole ratio on one line.
pl_max <- (e_ai - e_max)/(e_ai - e_ind)
# quantiles for wa model: per-pair median and lower 2.5% bound
quant_wa <- tapply(pl_wa2, pv$pairs, function(x) quantile(x, p = c(.5, .025))) %>% unlist %>% matrix(, nrow = 2, ncol = length(theta1)) %>% t()
# quantiles for max model: per-pair median and upper 97.5% bound
quant_max <- tapply(pl_max, pv$pairs, function(x) quantile(x, p = c(.5, .975))) %>% unlist %>% matrix(, nrow = 2, ncol = length(theta1)) %>% t()
# order groups by wa medians
# NOTE(review): this shadows the earlier `ind` (the MLE results object)
ind <- order(rep(quant_wa[,1], each = n_reps))
# BUG FIX: quant_max[2] indexed a single matrix cell, so the per-pair
# vector quant_wa[,2] was compared against one recycled scalar; the
# element-wise comparison needs the matching column quant_max[,2].
# Proportion of pairs whose WA lower bound clears the Max upper bound:
sum(quant_wa[,2] > quant_max[,2])/151
# overlapping 95% confidence intervals for wa and max?
overlap <- rep(as.factor(quant_wa[,2] > quant_max[,2]), each = n_reps)
levels(overlap) <- c("< Max", "> Max")
gg <- data.frame(pl_max[ind], pl_wa[ind], overlap[ind], rep(1:n_obs, each = n_reps))
names(gg) <- c("max", "obs", "overlap", "pair")
head(gg)
# Boxplots of 1 - process loss per pair (WA model), filled by whether
# the pair's WA interval clears the Max-model interval
ggplot(gg, aes(x = pair, y = obs, fill = overlap)) +
geom_boxplot(aes(group = pair), outlier.size = 0, outlier.color = "grey90", size = .2) +
scale_fill_manual(values = c("grey50", 5)) +
ylab("1 - process loss") +
xlab("Pair") +
theme(legend.title=element_blank()) +
theme_set(theme_grey(base_size = 15))
# Same plot, with a rectangle marking the region shown in the zoomed
# figure below
ggplot(gg, aes(x = pair, y = obs, fill = overlap)) +
geom_boxplot(aes(group = pair), outlier.size = 0, outlier.color = "grey90", size = .2) +
scale_fill_manual(values = c("grey50", 5)) +
ylab("1 - process loss") +
xlab("Pair") +
theme(legend.title=element_blank()) +
geom_rect(aes(xmin = 70, xmax = 151, ymin = .8, ymax = 1.05),
fill = "transparent", color = 5, size = 1.5)
# Zoom: pairs above 70 only
ggplot(gg[gg$pair > 70, ], aes(x = pair, y = obs, fill = overlap)) +
geom_boxplot(aes(group = pair), outlier.size = 0, outlier.color = "grey90", size = .3) +
scale_fill_manual(values = c("grey50", 5)) +
ylab("1 - process loss") +
xlab("Pair") +
theme(legend.title=element_blank()) +
theme(panel.border = element_rect(colour = 5, fill = NA, size = 1.5))
# Proportion of pairs flagged (24 of the 151 pairs)
24/151
# ------------------------------------------------------------
# Sample Demographics
#------------------------------------------------------------
setwd("~/Dropbox/Academic/Projects/CA/Data")
demo <- read.csv("Long_AMT_IDs_Match(1.30.17).csv")
names(demo)
vars <- c("Age", "Gender", "Ethnicity", "English", "leveledu", "liveusa")
temp <- demo[vars]
# Dichotomize Ethnicity: code 10 -> 1, everything else -> 0
# (first zero out the non-10s, then flag what remains)
temp$Ethnicity[temp$Ethnicity != 10] <- 0
temp$Ethnicity[temp$Ethnicity != 0] <- 1
# Recode 1/2 codings to 0/1 indicators
temp$Gender <- temp$Gender - 1
temp$English <- abs(temp$English - 2)
# leveledu: below 3 -> 0, 3 and above -> 1
temp$leveledu[temp$leveledu < 3] <- 0
temp$leveledu[temp$leveledu != 0] <- 1
temp$liveusa <- abs(temp$liveusa - 2)
head(temp)
# Column means = proportions for the 0/1 indicators.
# NOTE(review): no na.rm here — any NA in a column yields an NA mean
apply(temp, 2, mean)
summary(temp$Age)
#------------------------------------------------------------
# DIF with Mplus: Items 45 and 65 identified as problematic in
# scalar model. After dropping, scalar model fits OK
# Metric 36.971 37 0.4704
# Scalar 101.071 74 0.0200
# Scalar (w/ drop) 84.358 70 0.1161
# ------------------------------------------------------------
# Load calibrated item parms (reloaded WITHOUT dropping DIF items, since
# this section is what identifies them)
setwd("~/Dropbox/Academic/Projects/CA/Data/response_matrices")
parms <- read.csv("calibration_parms.csv", row.names = 1)
grep_items <- paste0(row.names(parms), collapse = "|")
# Parse item difficulties out of the Mplus scalar-model output file
scalar.out <- "./DIF/collaboration_DIF_scalar.out"
temp <- readLines(scalar.out)
# Each group's difficulties sit between an "Item Difficulties" header
# and the next "Variances" line
Begin <- grep("Item Difficulties", temp)
End <- grep("Variances", temp)
# NOTE(review): the first extraction lacks the [1] used in the second —
# if more than one End falls in the range this indexes with a vector;
# confirm against the actual .out file structure
calib_temp <- temp[Begin[1]:End[End > Begin[1] & End < Begin[2]]]
collab_temp <- temp[Begin[2]:End[End > Begin[2] & End < Begin[3]][1]]
# Fixed-width extraction from the Mplus output: difficulty estimates in
# columns 23-28, item names in columns 6-10
calib_beta <- substr(calib_temp[grepl(grep_items, calib_temp)], 23, 28) %>% as.numeric
collab_beta <- substr(collab_temp[grepl(grep_items, collab_temp)], 23, 28) %>% as.numeric
item_names <- substr(collab_temp[grepl(grep_items, collab_temp)], 6, 10)
gg <- data.frame(item_names, collab_beta, calib_beta)
# Flag items whose collab difficulty sits > 1 above the regression line
gg$dif <- resid(lm(collab_beta ~ calib_beta, gg)) > 1
# NOTE(review): dif_items is consumed near the TOP of this script, so in
# a fresh session this section must be run before the top section.
dif_items <- paste0(gg$item_names[gg$dif == T], collapse = "|")
# DIF Figure: collaboration vs calibration difficulties with the fitted
# regression line; flagged DIF items drawn as large red crosses
ggplot(gg, aes(x = calib_beta, y = collab_beta)) +
geom_point(aes(pch = dif, color = dif, size = dif)) +
stat_smooth(method = "lm", col = "black", se = F) +
xlab("Calibration sample") +
ylab("Collaborative testing condition") +
theme(legend.position = "none") +
scale_shape_manual(values = c(20, 4)) +
scale_color_manual(values = c("black", "red")) +
scale_size_manual(values = c(2, 4))
# NOTE(review): the two lines below are stray website boilerplate (embed
# instructions) accidentally pasted into the script, not R code; they are
# commented out so the file parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.