test-5-primerIntercept.R
In Dire: Linear Regressions with a Latent Outcome Variable

# AM Results updated on 10/15/2020
skip_on_cran()
context("NAEPPrimer composite")
skip_if_not_installed("EdSurvey")
require(EdSurvey)

## Dichotomous Parameter Tab 
# paramTab$test
# paramTab$subtest
# paramTab$a/slope <-
# paramTab$g/asymptote/guessing <- 
# paramTab$d/difficulty <- 
# param$D <- 1.7
dichotParamTab <- structure(list(ItemID = structure(1:11,
                                                    .Label = c("m109801", "m020001", "m111001", "m046301", "m046501", "m051501", "m111601", "m111301", "m111201", "m110801", "m110101"), class = "factor"),
                                 test = rep("composite",11),
                                 subtest = c(rep("num",6),rep("alg",5)),
                                 slope = c(0.96, 0.69, 0.83, 0.99, 1.03, 0.97, 1.45, 0.59, 0.34, 0.18, 1.20),
                                 difficulty = c(-0.37, -0.55, 0.85 , -0.97, -0.14, 1.21, 0.53, -1.84, -0.46, 2.43, 0.70),
                                 guessing = c(0.16,0.00,0.17,0.31,0.37,0.18, 0.28, 0.15, 0.09, 0.05, 0.18),
                                 D = rep(1.7, 11),
                                 MODEL = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "3pl", class = "factor")), class = "data.frame", row.names = c(NA, -11L))


## GPCM Parameter Tab 
# a / slope
# d1,2,...,(param-1)
# b/itemLocation, then map to d1, d2
# D / D
polyParamTab <- structure(list(ItemID = structure(c(1:2), .Label = c("m0757cl", "m066501"), class = "factor"),
                               test = rep("composite",2),
                               subtest = c(rep("alg",2)),
                               slope = c( 0.43, 0.52 ), # could also be called "a"
                               itemLocation = c( -1.21, -0.96), # could also not exist, added to d1 ... dn
                               d1 = c(2.38, -0.56), 
                               d2 = c(-0.57, 0.56),
                               d3 = c(-1.18, NA),
                               D = c(1.7, 1.7),
                               scorePoints = c(4L, 3L) # number of score points, read d1 to d(n-1)
), class = "data.frame", row.names=c(1L,2L))

## Test Data information
testDat <- data.frame(
  subtest=c("num", "alg"),
  location=c(277.1563, 280.2948),
  scale=c(37.7297, 36.3887),
  subtestWeight=c(0.3,0.7),
  test=c("composite","composite"))

## Read-in NAEP Primer and Subset to items in Parameter Tabs 
sdf <- readNAEP(system.file("extdata/data", "M36NT2PM.dat", package = "NAEPprimer"))
items <- c(as.character(dichotParamTab$ItemID), as.character(polyParamTab$ItemID))
edf <- getData(data=sdf, varnames=c(items, "dsex", "origwt", "repgrp1", "jkunit"), omittedLevels = FALSE)
edf$stu_id <- paste0("S",1:nrow(edf))

# apply super basic scoring to Primer 
for(i in 1:length(items)) {
  coli <- items[i]
  # save the original
  edf[,paste0(coli,"raw")] <- edf[,coli]
  if( coli %in% dichotParamTab$ItemID) {
    edf[,coli] <- ifelse(grepl("[ABCDE]", edf[,paste0(coli,"raw")]), 0, NA)
    edf[,coli] <- ifelse(grepl("*", edf[,paste0(coli,"raw")], fixed=TRUE), 1, edf[,coli])
  } else {
    edf[,coli] <- ifelse(grepl("Incorrect", edf[,paste0(coli,"raw")], fixed=TRUE), 0, NA)
    edf[,coli] <- ifelse(grepl("None correct", edf[,paste0(coli,"raw")], fixed=TRUE), 0, edf[,coli])
    edf[,coli] <- ifelse(grepl("Incorrect", edf[,paste0(coli,"raw")], fixed=TRUE), 0, edf[,coli])
    edf[,coli] <- ifelse(grepl("One correct", edf[,paste0(coli,"raw")], fixed=TRUE), 1, edf[,coli])
    edf[,coli] <- ifelse(grepl("Partial", edf[,paste0(coli,"raw")], fixed=TRUE), 1, edf[,coli])
    edf[,coli] <- ifelse(grepl("Two correct", edf[,paste0(coli,"raw")], fixed=TRUE), 2, edf[,coli])
    edf[,coli] <- ifelse(grepl("Correct", edf[,paste0(coli,"raw")], fixed=TRUE), 2, edf[,coli])
    edf[,coli] <- ifelse(grepl("Three correct", edf[,paste0(coli,"raw")], fixed=TRUE), 3, edf[,coli])
  }
  # print results
  #print(table(edf[,paste0(coli,"raw")], edf[,coli], useNA="always"))
  edf[,paste0(coli,"raw")] <- NULL # delete original
}

# Separate items and student data from eachother 
stuItems <- edf[,c("stu_id", items)]
stuDat <- edf[,c("stu_id", "dsex", "origwt", "repgrp1", "jkunit")]

testDat <- data.frame(
  subtest=c("num", "alg"),
  location=c(277.1563, 280.2948),
  scale=c(37.7297, 36.3887),
  subtestWeight=c(0.3,0.7),
  test=c("composite","composite"))

# make stuItems long
stuItems <- reshape(data=stuItems, 
                    varying=c(items), idvar=c("stu_id"), direction="long", v.names="score", times=items, timevar="key")

if(FALSE) {
  ## Run to output direct estimation ready data to AM ready data (dct and sav file)
  saveSav(stuItems, edf, "IRTparams", getwd(), idVar="stu_id")
  paramTab2AMdct(dichotParamTab,
                 polyParamTab,
                 testDat,
                 weightVar="origwt",
                 stratumVar="repgrp1",
                 PSUVar="jkunit",
                 testName="IRTparams",
                 saveFilePath=getwd())
}

# algebral score
mmlA <- mml(alg ~ 1, stuItems=stuItems, stuDat=stuDat, dichotParamTab=dichotParamTab, polyParamTab=polyParamTab,
            Q=34, idVar="stu_id", testScale=testDat, composite=FALSE, weightVar="origwt",
            minNode = -5, maxNode = 5,
            strataVar="repgrp1", PSUVar="jkunit")
# Eric Buehler updated results on 10/15/2020
mmlATaylor <- summary(mmlA, varType="Taylor")

# AM output on 10/21/2020
#  Observations:        8826
#  Strata Variable:     REPGRP1
#  Cluster Variable:    JKUNIT
#  Weight Variable:     ORIGWT
# Iterations: 7 
# Log Likelihood: -16394.5  
# Dependent Variable: composite, alg  
# --  Weighted N  Estimate  Standard Error  
# All 8782.713764235  278.334734151 1.035753180 
# Root MSE  ... 32.173725366  1.132270387 
# AM Statistical Software Beta Version 0.06.04. (c) The American Institutes for Research and Jon Cohen  

expect_equal(unname(mmlATaylor$coef[,1]), c(278.334734151, 32.173725366), tolerance=25*sqrt(.Machine$double.eps))
expect_equal(unname(mmlATaylor$coef[1,2]), c(1.035753180), tolerance=5*(.Machine$double.eps)^0.25)
expect_equal(mmlA$LogLik, -16394.5, tol=1E-5)

# numeracy score
mmlN <- mml(num ~ 1, stuItems=stuItems, stuDat=stuDat, dichotParamTab=dichotParamTab, polyParamTab=polyParamTab,
            Q=34, idVar="stu_id", testScale=testDat, composite=FALSE, weightVar="origwt",
            minNode = -5, maxNode = 5, strataVar="repgrp1", PSUVar="jkunit", verbose=TRUE)

# AM output, 10/21/2020
#  Observations:        6378
#  Strata Variable:     REPGRP1
#  Cluster Variable:    JKUNIT
#  Weight Variable:     ORIGWT
# Iterations: 8 
# Log Likelihood: -10299.1  
# Dependent Variable: composite, num  
# --  Weighted N  Estimate  Standard Error  
# All 6254.724474587  285.351404123 1.244369007 
# Root MSE  ... 32.354923506  1.366182380 
# AM Statistical Software Beta Version 0.06.04. (c) The American Institutes for Research and Jon Cohen  
# Eric Buehler updated results on 10/15/2020
mmlNTaylor <- summary(mmlN, varType="Taylor")

# composite
mmlC <- mml(composite ~ 1, stuItems=stuItems, stuDat=stuDat, dichotParamTab=dichotParamTab, polyParamTab=polyParamTab,
            Q=34, idVar="stu_id", testScale=testDat, composite=TRUE, weightVar="origwt",
            minNode = -5, maxNode = 5, 
            strataVar="repgrp1", PSUVar="jkunit", fast=TRUE, verbose=TRUE)
# check that composite coefficients really are composite
# only [1] because the SD is not formed in this way
expect_equal(0.3* coef(mmlN)[1] + 0.7*coef(mmlA)[1], coef(mmlC)[1])

# AM run 10/21/2020
#  Observations:        16915
#  Strata Variable:     REPGRP1
#  Cluster Variable:    JKUNIT
#  Weight Variable:     ORIGWT  
# Composite Results 
# --  Weighted N  Mean  Standard Error  Standard Deviation  
# All 16932.463329904 280.439735226 1.098169602 32.223451497  
# use AM means
# means correctly estimates correlation, AM's regression finds a false maximum for the correlation
expect_equal(unname(coef(mmlC)[1]), c(280.4397365300, 32.223451497)[1], tolerance=1e-6)
expect_equal(unname(coef(mmlC)[2]), c(280.4397365300, 32.223451497)[2], tolerance=0.001)

context("NAEPPrimer composite variance")

# Taylor variance
mmlCTaylor <- summary(mmlC, varType="Taylor", gradientHessian=TRUE)
# updated on 1/20/2022 to mml means; new "full Taylor" result
expect_equal(mmlCTaylor$coef[1,2], c(0.97841726915504), 
             tolerance=20*(.Machine$double.eps)^0.25)

mmlCpTaylor <- summary(mmlC, varType="Partial Taylor", gradientHessian=TRUE)
# 10/2020 regression test result for partial Taylor
expect_equal(mmlCpTaylor$coef[1,2], c(1.098169521), 
             tolerance=20*(.Machine$double.eps)^0.25)