test-6-primerReg.R
In Dire: Linear Regressions with a Latent Outcome Variable

skip_on_cran()
skip_if_not_installed("EdSurvey")
require(EdSurvey)
# paramTab$a/slope <-
# paramTab$g/asymptote/guessing <- 
# paramTab$d/difficulty <- 
# param$D <- 1.7
dichotParamTab <- structure(list(ItemID = structure(1:11,
                                                    .Label = c("m109801", "m020001", "m111001", "m046301", "m046501", "m051501", "m111601", "m111301", "m111201", "m110801", "m110101"), class = "factor"),
                                 test = rep("composite",11),
                                 subtest = c(rep("num",6),rep("alg",5)),
                                 slope = c(0.96, 0.69, 0.83, 0.99, 1.03, 0.97, 1.45, 0.59, 0.34, 0.18, 1.20),
                                 difficulty = c(-0.37, -0.55, 0.85 , -0.97, -0.14, 1.21, 0.53, -1.84, -0.46, 2.43, 0.70),
                                 guessing = c(0.16,0.00,0.17,0.31,0.37,0.18, 0.28, 0.15, 0.09, 0.05, 0.18),
                                 D = rep(1.7, 11),
                                 MODEL = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "3pl", class = "factor")), class = "data.frame", row.names = c(NA, -11L))

## GPCM
# a / slope
# d1,2,...,(param-1)
# b/itemLocation, then map to d1, d2
# D / D
polyParamTab <- structure(list(ItemID = structure(c(1:2), .Label = c("m0757cl", "m066501"), class = "factor"),
                               test = rep("composite",2),
                               subtest = c(rep("alg",2)),
                               slope = c( 0.43, 0.52 ), # could also be called "a"
                               itemLocation = c( -1.21, -0.96), # could also not exist, added to d1 ... dn
                               d1 = c(2.38, -0.56), 
                               d2 = c(-0.57, 0.56),
                               d3 = c(-1.18, NA),
                               D = c(1.7, 1.7),
                               scorePoints = c(4L, 3L) # number of score points, read d1 to d(n-1)
), class = "data.frame", row.names=c(1L,2L))
# read-in NAEP Primer data 
sdf <- readNAEP(system.file("extdata/data", "M36NT2PM.dat", package = "NAEPprimer"))
items <- c(as.character(dichotParamTab$ItemID), as.character(polyParamTab$ItemID))
edf <- getData(data=sdf, varnames=c(items, "dsex", "origwt", "repgrp1", "jkunit", "sdracem"), omittedLevels = FALSE)
edf$sid <- paste0("S",1:nrow(edf))
# apply super basic scoring
for(i in 1:length(items)) {
  coli <- items[i]
  # save the original
  edf[,paste0(coli,"raw")] <- edf[,coli]
  if( coli %in% dichotParamTab$ItemID) {
    edf[,coli] <- ifelse(grepl("[ABCDE]", edf[,paste0(coli,"raw")]), 0, NA)
    edf[,coli] <- ifelse(grepl("*", edf[,paste0(coli,"raw")], fixed=TRUE), 1, edf[,coli])
  } else {
    edf[,coli] <- ifelse(grepl("Incorrect", edf[,paste0(coli,"raw")], fixed=TRUE), 0, NA)
    edf[,coli] <- ifelse(grepl("None correct", edf[,paste0(coli,"raw")], fixed=TRUE), 0, edf[,coli])
    edf[,coli] <- ifelse(grepl("Incorrect", edf[,paste0(coli,"raw")], fixed=TRUE), 0, edf[,coli])
    edf[,coli] <- ifelse(grepl("One correct", edf[,paste0(coli,"raw")], fixed=TRUE), 1, edf[,coli])
    edf[,coli] <- ifelse(grepl("Partial", edf[,paste0(coli,"raw")], fixed=TRUE), 1, edf[,coli])
    edf[,coli] <- ifelse(grepl("Two correct", edf[,paste0(coli,"raw")], fixed=TRUE), 2, edf[,coli])
    edf[,coli] <- ifelse(grepl("Correct", edf[,paste0(coli,"raw")], fixed=TRUE), 2, edf[,coli])
    edf[,coli] <- ifelse(grepl("Three correct", edf[,paste0(coli,"raw")], fixed=TRUE), 3, edf[,coli])
  }
  # print results
  edf[,paste0(coli,"raw")] <- NULL # delete original
}

# dummify
indepVars <- c("dsex","sdracem")
dummyVars <- c() 
for (indep in indepVars) {
  dummyMatrix <- model.matrix(~get(indep) - 1, data = edf)
  edf <- cbind(edf, dummyMatrix)
  levs <- levels(edf[[indep]])
  colnames(edf) <- sub('^get\\(indep\\)', indep, colnames(edf)) # change to variableLEVEL
  edf <- edf[,!colnames(edf)%in% c(indep)] # drop original column and first dummy
  # record dummy names for subset
  newNames <- paste0(indep,levs)
  dummyVars <- c(dummyVars, newNames)
}

# AM doesn't like certain overlap of variable names so make a correction here 
colnames(edf)[colnames(edf) == "sdracemAmer Ind/Alaska Natv"] <- "sdracemInd"
dummyVars[dummyVars == "sdracemAmer Ind/Alaska Natv"] <- "sdracemInd"

# make stuItems (long) and stuDat 
stuItems <- stuItems1 <- edf[,c("sid", items)]
stuItems <- reshape(data=stuItems, varying=c(items), idvar=c("sid"), direction="long", v.names="score", times=items, timevar="key")
stuDat <- edf[,c("sid", "origwt", "repgrp1", "jkunit", dummyVars)]

# make testDat 
testDat <- data.frame(test=c("composite", "composite", "composite") ,
                      subtest=c("num", "alg", NA),
                      location=c(277.1563, 280.2948, sum(c(0.3,0.7) * c(277.1563, 280.2948))),
                      scale=c(37.7297, 36.3887, sum(c(0.3,0.7) * c(37.7297, 36.3887))),
                      subtestWeight=c(0.3,0.7, NA))

context("NAEPPrimer Regression Non-composite Algebra")
# algebra
mmlA <- mml(alg ~ dsexFemale + sdracemBlack + sdracemHispanic + `sdracemAsian/Pacific Island` + 
              sdracemInd + sdracemOther, stuItems=stuItems, stuDat=stuDat, dichotParamTab=dichotParamTab, polyParamTab=polyParamTab,
            Q=34, idVar="sid", testScale=testDat, composite=FALSE, weightVar="origwt",
            minNode = -4, maxNode = 4, 
            strataVar="repgrp1", PSUVar="jkunit")
mmlATaylor <- summary(mmlA, varType="Taylor", gradientHessian=TRUE, strataVar="repgrp1", PSUVar="jkunit")

# similarly, the PV results should be approximately equal
set.seed(2)
pvA <- drawPVs(mmlA, construct="alg")
stuDatA <- merge(stuDat, pvA[["data"]], by.x="sid", by.y="id", all.x=TRUE, all.y=FALSE)
lmA <- summary(lm(alg_dire1 ~  dsexFemale + sdracemBlack + sdracemHispanic + `sdracemAsian/Pacific Island` + 
                  sdracemInd + sdracemOther, data=stuDatA, weights=stuDatA$origwt))
expect_equal(coef(mmlATaylor)[1:6, 1], lmA$coef[1:6, 1], 0.1)

# check stochastic beta works
pvAs <- drawPVs(mmlATaylor, construct="alg", stochasticBeta=TRUE)
stuDatAs <- merge(stuDat, pvAs[["data"]], by.x="sid", by.y="id", all.x=TRUE, all.y=FALSE)
lmAs <- summary(lm(alg_dire1 ~  dsexFemale + sdracemBlack + sdracemHispanic + `sdracemAsian/Pacific Island` + 
                   sdracemInd + sdracemOther, data=stuDatAs, weights=stuDatA$origwt))
expect_equal(coef(mmlATaylor)[1:6, 1], lmAs$coef[1:6, 1], 0.1)

# check the posterior means agree with the MML estimate
pvApos <- drawPVs(mmlA, construct="alg", returnPosterior =TRUE)
pvApos <- pvApos[["posterior"]]
pvApos$mus <- pvApos$mu * mmlA$scale + mmlA$location
stuDatApos <- merge(stuDat, pvApos, by.x="sid", by.y="id", all.x=TRUE, all.y=FALSE)
lmApos <- summary(lm(mus ~  dsexFemale + sdracemBlack + sdracemHispanic + `sdracemAsian/Pacific Island` + 
                     sdracemInd + sdracemOther, data=stuDatApos, weights=stuDatApos$origwt))
# this should be exact
expect_equal(coef(mmlATaylor)[1:6, 1], lmApos$coef[1:6, 1])

# MML Regression	
# Run completed on Wednesday, December 09, 2020. 09:54:09 AM
# 
# Selection:         	ALL
# Observations:      	8826
# Strata Variable:   	REPGRP1
# Cluster Variable:  	JKUNIT
# Weight Variable:   	ORIGWT
# 
# Iterations: 37 
# Log Likelihood: -14495.6
# 
# Adjusted Wald Test
# F(6,57) = 22.4731
# p(F > f) = 2.17826e-013	
# Dependent Variable: composite, alg	
# Parameter Name	Estimate	Standard Error	t Statistic	p > |t|	
# Constant	284.0244475898	1.5620600420	181.8268440084 0.0000000000	
# DSEXFEMA	4.6425240064	  1.6651582440	2.7880377274	0.0070480262	
# SDRACEMB	-26.4952368029	3.1195434525	-8.4933058976	0.0000000000	
# SDRACEMH	-20.3300619109	2.9454209419	-6.9022602582	0.0000000033	
# SDRACEMA	 13.4723776156	4.7778038300	2.8197845904	0.0064602479	
# SDRACEMI	-24.0877598983	7.1463851501	-3.3706215649	0.0013014462	
# SDRACEMO	 -2.9570604020	10.8693889849	-0.2720539679	0.7864836263	
# Root MSE	 29.9108170108	1.1684294466	---	---	
#   AM Statistical Software Beta Version 0.06.04. (c) The American Institutes for Research and Jon Cohen	
# compare estimate and weight 
expect_equal(unname(mmlATaylor$coefficients[,1]), 
             c(284.0244475898, 4.6425240064, -26.4952368029,-20.3300619109,
               13.4723776156, -24.0877598983, -2.9570604020, 29.9108170108), 
             tolerance=50*sqrt(.Machine$double.eps))
expect_equal(unname(mmlATaylor$coef[,2]),
             c(1.5620600420, 1.6651582440,3.1195434525, 2.9454209419,
               4.7778038300, 7.1463851501, 10.8693889849, 1.1684294466),
             tolerance=5*(.Machine$double.eps)^0.25)



context("NAEPPrimer Regression Non-composite Numeric")
# Numeric 
mmlN <- mml(num ~ dsexFemale + sdracemBlack + sdracemHispanic + `sdracemAsian/Pacific Island` + sdracemInd + sdracemOther, 
            stuItems=stuItems, 
            stuDat=stuDat, 
            dichotParamTab=dichotParamTab, 
            polyParamTab=polyParamTab,
            Q=34, 
            idVar="sid", 
            testScale=testDat, 
            composite=FALSE, 
            weightVar="origwt",
            minNode = -4, 
            maxNode = 4,  
            strataVar="repgrp1", 
            PSUVar="jkunit")
mmlNTaylor <- summary(mmlN, varType="Taylor", gradientHessian=TRUE, strataVar="repgrp1", PSUVar="jkunit")


# MML Regression	
# Run completed on Wednesday, December 09, 2020. 10:00:59 AM
# 
# Selection:         	ALL
# Observations:      	6378
# Strata Variable:   	REPGRP1
# Cluster Variable:  	JKUNIT
# Weight Variable:   	ORIGWT
# 
# Iterations: 12 
# Log Likelihood: -8647.33
# 
# Adjusted Wald Test
# F(6,57) = 25.0037
# p(F > f) = 2.63123e-014	
# Dependent Variable: composite, num	
# Parameter Name	Estimate	Standard Error	t Statistic	p > |t|	
# Constant	295.4743460911	1.5484819999	190.8154864667	0.0000000000	
# DSEXFEMA	-2.6901100945	1.7664688656	-1.5228743324	0.1328644564	
# SDRACEMB	-26.8599727157	2.5390553373	-10.5787267893	0.0000000000	
# SDRACEMH	-22.3144730714	2.8487217696	-7.8331528582	0.0000000001	
# SDRACEMA	4.9481736285	5.9500552409	0.8316180990	0.4088183655	
# SDRACEMI	-17.7347803395	7.2513021373	-2.4457373316	0.0173294853	
# SDRACEMO	-4.9037488612	13.9236962237	-0.3521872915	0.7259035127	
# Root MSE	29.6919958757	1.3943320344	---	---	
#   AM Statistical Software Beta Version 0.06.04. (c) The American Institutes for Research and Jon Cohen	

# compare estimate and SE 
expect_equal(unname(mmlNTaylor$coefficients[,1]), 
             c(295.4743460911, -2.6901100945, -26.8599727157,-22.3144730714,
               4.9481736285, -17.7347803395, -4.9037488612, 29.6919958757), 
             tolerance=200*sqrt(.Machine$double.eps))
expect_equal(unname(mmlNTaylor$coef[,2]),
             c(1.5484819999, 1.7664688656,2.5390553373, 2.8487217696,
               5.9500552409, 7.2513021373, 13.9236962237, 1.3943320344),
             tolerance=3*(.Machine$double.eps)^0.25)


context("NAEPPrimer Regression Composite")
# Composite 
mmlC <- mml(composite ~ dsexFemale + sdracemBlack + sdracemHispanic + `sdracemAsian/Pacific Island` + 
              `sdracemInd` + sdracemOther, stuItems=stuItems, stuDat=stuDat, dichotParamTab=dichotParamTab, polyParamTab=polyParamTab,
            Q=34, idVar="sid", testScale=testDat, composite=TRUE, weightVar="origwt",
            minNode = -4, maxNode = 4, 
            strataVar="repgrp1", PSUVar="jkunit")
mmlCTaylor <- summary(mmlC, varType="Taylor", gradientHessian=TRUE)

### similarly, the PV results should be approximately equal
# draw the PVs
pvC <- drawPVs(mmlC)
stuDatC <- merge(stuDat, pvC[["data"]], by.x="sid", by.y="id", all.x=TRUE, all.y=FALSE)
# first for a subscale
lmA2 <- summary(lm(alg_dire1 ~  dsexFemale + sdracemBlack + sdracemHispanic + `sdracemAsian/Pacific Island` + 
                   sdracemInd + sdracemOther, data=stuDatC, weights=stuDatC$origwt))
# this is the above, just Algebra mml estimate
expect_equal(coef(mmlATaylor)[1:6, 1], lmA2$coef[1:6, 1], 0.05)
# now use full composite
lmC2 <- summary(lm(composite_dire1 ~  dsexFemale + sdracemBlack + sdracemHispanic + `sdracemAsian/Pacific Island` + 
                   sdracemInd + sdracemOther, data=stuDatC, weights=stuDatC$origwt))
# this is the above, just Algebra mml estimate
expect_equal(coef(mmlCTaylor)[1:6, 1], lmC2$coef[1:6, 1], 0.05)

# now draw with stochastic beta
pvCs <- drawPVs(mmlCTaylor, stochasticBeta=TRUE)
stuDatCs <- merge(stuDat, pvCs[["data"]], by.x="sid", by.y="id", all.x=TRUE, all.y=FALSE)
# first for a subscale
lmA2s <- summary(lm(alg_dire1 ~  dsexFemale + sdracemBlack + sdracemHispanic + `sdracemAsian/Pacific Island` + 
                    sdracemInd + sdracemOther, data=stuDatCs, weights=stuDatC$origwt))
expect_equal(coef(mmlATaylor)[1:6, 1], lmA2s$coef[1:6, 1], 0.05)
# now composite
lmC2s <- summary(lm(composite_dire1 ~  dsexFemale + sdracemBlack + sdracemHispanic + `sdracemAsian/Pacific Island` + 
                    sdracemInd + sdracemOther, data=stuDatCs, weights=stuDatC$origwt))
expect_equal(coef(mmlCTaylor)[1:6, 1], lmC2s$coef[1:6, 1], 0.05)


# MML Composite Regression	
# Run completed on Wednesday, December 09, 2020. 12:54:46 PM
# 
# Selection:         	ALL
# Observations:      	16915
# Strata Variable:   	REPGRP1
# Cluster Variable:  	JKUNIT
# Weight Variable:   	ORIGWT
# 
# Adjusted Wald Test
# F(6,57) = 22.1717
# p(F > f) = 2.83107e-013	
# Composite Results	
# Parameter Name	Estimate	Standard Error	t Statistic	p > |t|	
#   Constant	287.4594172767	1.5217659456	5.3268139977	0.0000014991	
# DSEXFEMA	2.4427336887	1.6550653899	1.4759137033	0.1450151434	
# SDRACEMB	-26.6046575811	2.8827206928	-9.2290098195	0.0000000000	
# SDRACEMH	-20.9253852827	2.8492971588	-7.3440515737	0.0000000006	
# SDRACEMA	10.9151163178	5.0000511649	2.1830009250	0.0328517195	
# SDRACEMI	-22.1818659549	7.0094097999	-3.1645839790	0.0024164736	
# SDRACEMO	-3.5410669629	11.4857615243	-0.3083005820	0.7588921216	
# Root MSE	29.1509253186	(--)	
# AM Statistical Software Beta Version 0.06.04. (c) The American Institutes for Research and Jon Cohen	

# Dependent Variable: composite, alg	
# Parameter Name	Estimate	Standard Error	
# Constant	     0.102	     0.043	
# DSEXFEMA	     0.128	     0.046	
# SDRACEMB	    -0.728	     0.086	
# SDRACEMH	    -0.559	     0.081	
# SDRACEMA	     0.370	     0.131	
# SDRACEMI	    -0.662	     0.196	
# SDRACEMO	    -0.081	     0.299	
# Root MSE	     0.822	     0.032	
# Dependent Variable: composite, num	
# Parameter Name	Estimate	Standard Error	
# Constant	     0.486	     0.041	
# DSEXFEMA	    -0.071	     0.047	
# SDRACEMB	    -0.712	     0.067	
# SDRACEMH	    -0.591	     0.076	
# SDRACEMA	     0.131	     0.158	
# SDRACEMI	    -0.470	     0.192	
# SDRACEMO	    -0.130	     0.369	
# Root MSE	     0.787	     0.037	

# DIRE to AM
expect_equal(unname(mmlCTaylor$coefficients[-8,1]), 
             c(287.4594172767, 2.4427336887, -26.6046575811, -20.9253852827, 
               10.9151163178, -22.1818659549, -3.5410669629), 
             tolerance=50*sqrt(.Machine$double.eps)) 

# AM gets wrong correlations, reported for posterity
# expect_equal(unname(mmlCTaylor$coefficients[1:7,2]), 
#              c(1.5217659456, 1.6550653899, 2.8827206928, 
#                2.8492971588, 5.0000511649, 7.0094097999,11.4857615243), 
#              tolerance=5*(.Machine$double.eps)^0.25)