Nothing
#' @title Connect to NAEP Data
#'
#' @description Opens a connection to a Main NAEP, or Long-Term Trend NAEP data file residing
#' on the disk. Returns an \code{edsurvey.data.frame} with
#' information about the file and data.
#'
#' @param path a character value indicating the full filepath location and name
#' of the (.dat) data file
#' @param defaultWeight a character value that indicates the default weight
#' specified in the resulting \code{edsurvey.data.frame}.
#' Default value is \code{origwt} if not specified.
#' @param defaultPvs a character value that indicates the default plausible value
#' specified in the resulting \code{edsurvey.data.frame}.
#' Default value is \code{composite} if not specified.
#' @param omittedLevels a character vector indicating which factor levels/labels
#' should be excluded. When set to the default value of
#' \code{c('Multiple',NA,'Omitted')}, adds the vector to
#' the \code{edsurvey.data.frame}.
#' @param frPath a character value indicating the file location of the \code{.fr2}
#' parameter layout file included with the data companion to
#' parse the specified \code{path} data file.
#' The default value of \code{NULL} will attempt to search the parent directory for the corresponding \code{.fr2} file for the specified \code{path} data file.
#' @param xmlPath a character value indicating the file path of the \code{.xml} parameter layout file included as part of the NAEPEX companion. This file provides necessary
#' information required to read and parse the (.dat) data file. The default value of \code{NULL} will attempt to search
#' the parent directory for the corresponding \code{.xml} file for the specified \code{path} data file.
#'
#' @details
#' The \code{frPath} file layout information will take precedence over the \code{xmlPath} file when the \code{xmlPath} is not explicitly set, or when the \code{xmlPath} file cannot be located.
#'
#' The \code{readNAEP} function includes both scaled scores and theta scores, with the latter having names ending in \code{\_theta}.
#'
#' When a NAEP administration includes a linking error variable those variables are included and end in \code{_linking}.
#' When present, simply use the \code{_linking} version of a variable to get a standard error estimate that includes linking error.
#'
#' This function supports both the Main NAEP data files, and Long-Term Trend NAEP data files.
#' A table outlining the differences can be found on the \href{https://nces.ed.gov/nationsreportcard/about/ltt_main_diff.aspx}{NAEP Nations Report Card website}.
#'
#' @return An \code{edsurvey.data.frame} for a NAEP data file.
#'
#' @seealso \code{\link{edsurvey.data.frame}} \code{\link{getData}}
#' @author Tom Fink and Ahmad Emad
#' @example \man\examples\readNAEP.R
#' @importFrom xml2 read_xml as_list xml_find_all
#' @export
readNAEP <- function(path, defaultWeight = "origwt", defaultPvs = "composite", omittedLevels = c("Multiple", NA, "Omitted"), frPath = NULL, xmlPath = NULL) {
# temporarily adjust any necessary option settings; revert back when done
userOp <- options(OutDec = ".")
on.exit(options(userOp), add = TRUE)
filePathInfo <- validate_NAEPFilePaths(path, frPath, xmlPath)
# check if this is a long-term trend file
isLTT <- grepl("long.*term.*trend", filePathInfo$FileDesc$Assessment_Code, ignore.case = TRUE)
dataSchLaf <- NULL # default
schLabelsFile <- NULL
if (isLTT) {
if (filePathInfo$PreferredLayoutMethod == "XML") {
xmlLayout <- parseNAEP_XML(filePathInfo$StudentXML)
labelsFile <- xmlLayout$FileFormat # only assign the 'FileFormat", as xml returns a list object with much more data
} else { # FR2
labelsFile <- readMRC_LTT(filePathInfo$StudentFR2, filePathInfo$FileDesc$Subject) # FR2 FILE
}
# build the weights and pvvars list
weights <- buildNAEP_LTT_WeightList(labelsFile)
pvs <- buildNAEP_LTT_PVList(labelsFile)
dataLaf <- laf_open_fwf(
filename = filePathInfo$StudentDAT, column_types = labelsFile$dataType,
column_names = labelsFile$variableName, column_widths = as.numeric(labelsFile$Width)
)
# in some rare cases the LaF cleaned names differ from the fileFormat, ensure they are consistent
labelsFile$variableName <- names(dataLaf)
} else { # parse as main NAEP data file
if (filePathInfo$PreferredLayoutMethod == "XML") {
xmlLayout <- parseNAEP_XML(filePathInfo$StudentXML)
labelsFile <- xmlLayout$FileFormat # only assign the 'FileFormat", as xml returns a list object with much more data
} else { # FR2
labelsFile <- readMRC(filePathInfo$StudentFR2) # FR2 FILE
}
if (!is.null(filePathInfo$SchoolDAT)) {
if (filePathInfo$PreferredLayoutMethod == "XML") {
xmlLayoutSch <- parseNAEP_XML(filePathInfo$SchoolXML)
schLabelsFile <- xmlLayoutSch$FileFormat # only assign the 'FileFormat", as xml returns a list object with much more data
} else { # FR2
schLabelsFile <- readMRC(filePathInfo$SchoolFR2) # FR2 FILE
}
schLabelsFile <- schLabelsFile[order(schLabelsFile$Start), ]
widths <- schLabelsFile$Width
dataTypes <- as.character(schLabelsFile$dataType)
varNames <- as.character(tolower(schLabelsFile$variableName))
dataSchLaf <- laf_open_fwf(
filename = filePathInfo$SchoolDAT, column_types = dataTypes,
column_names = varNames, column_widths = widths
)
}
labelsFile <- labelsFile[order(labelsFile$Start), ]
widths <- labelsFile$Width
dataTypes <- as.character(labelsFile$dataType)
varNames <- as.character(tolower(labelsFile$variableName))
dataLaf <- laf_open_fwf(
filename = filePathInfo$StudentDAT, column_types = dataTypes,
column_names = varNames, column_widths = widths
)
# Defining PVs and JKs
##############################################################
## Accomodation not permitted weights and PVs
pvs <- list()
pv_subset <- subset(labelsFile, select = c("Type", "variableName"), labelsFile$Labels %in% c("PV", "PV2", "PVT"))
uniquePvTypes <- unique(pv_subset$Type)
for (i in uniquePvTypes) {
vars <- tolower(pv_subset$variableName[pv_subset$Type == i])
temp_list <- list(varnames = vars)
pvs[[i]] <- temp_list
}
# gets the weights (can have 'origwt' or 'aorigwt' or both)
weights <- list()
if (sum("JK" %in% labelsFile$Labels) > 0) { # origwt
weight_temp <- tolower(varNames[labelsFile$Labels == "JK"])
jksuffix <- gsub("[^0-9]", "", weight_temp)
base <- gsub(jksuffix[1], "", weight_temp[1])
weights[["origwt"]] <- list(jkbase = base, jksuffixes = jksuffix)
}
if (sum("JK2" %in% labelsFile$Labels) > 0) { # aorigwt (a = accomodations)
weight_temp <- tolower(varNames[labelsFile$Labels == "JK2"])
jksuffix <- gsub("[^0-9]", "", weight_temp)
base <- gsub(jksuffix[1], "", weight_temp[1])
weights[["aorigwt"]] <- list(jkbase = base, jksuffixes = jksuffix)
}
if(sum("JK-N" %in% labelsFile$Labels) > 0){ #norigwt (n = national?)
weight_temp <- tolower(varNames[labelsFile$Labels == "JK-N"])
jksuffix <- gsub("[^0-9]","", weight_temp)
base <- gsub(jksuffix[1],"", weight_temp[1])
weights[["norigwt"]] <- list(jkbase = base, jksuffixes = jksuffix)
}
if(sum("JK-V" %in% labelsFile$Labels) > 0){ #vorigwt (v = vocabulary)
weight_temp <- tolower(varNames[labelsFile$Labels == "JK-V"])
jksuffix <- gsub("[^0-9]","", weight_temp)
base <- gsub(jksuffix[1],"", weight_temp[1])
weights[["vorigwt"]] <- list(jkbase = base, jksuffixes = jksuffix)
}
} #end if(isLTT)
# set default weight
if (missing(defaultWeight) || !any(defaultWeight %in% names(weights))) {
attributes(weights)$default <- names(weights)[1]
} else {
attributes(weights)$default <- defaultWeight
}
if (all(is.null(defaultPvs)) || all(is.na(defaultPvs))) {
warning(paste0("Argument ", dQuote("defaultPvs"), " not specified. There will not be a default PV value."))
} else {
if (!defaultPvs[1] %in% names(pvs)) {
defPV <- names(pvs[1])
defi <- 1
while (grepl("theta", defPV, fixed = TRUE) && length(pvs) > defi) {
defi <- defi + 1
defPV <- names(pvs[defi])
}
if (length(pvs) > 0) {
warning(paste0("Updating name of default plausible value since ", sQuote(defaultPvs[1]), " not found. Setting to ", sQuote(defPV), "."))
defaultPvs <- defPV
} else {
warning(paste0("No plausible values found. If plausible value(s) expected, check the ", sQuote("defaultPvs"), " argument."))
}
} # end if(!defaultPvs %in% names(pvs))
}
if (length(pvs) > 0) {
attributes(pvs)$default <- defaultPvs[1]
}
# achievementLevelsHelp located in descriptionOfFile.R
# each file will only have one achievement level scale associated with the file
levels <- achievementLevelsHelp(
filePathInfo$FileDesc$Grade_Level,
filePathInfo$FileDesc$Year,
filePathInfo$FileDesc$Subject,
filePathInfo$FileDesc$Assessment_Code
)
# apply the achievement levels only to the non-theta pvs
# levels will only have one row here for NAEP data files
for (i in seq_along(pvs)) {
if (!grepl("_theta$", names(pvs)[i], ignore.case = TRUE)) {
pvs[[i]]$achievementLevel <- levels # there will be only one value in levels
}
}
# convert achievementLevels to list, this bypasses the automatic apply to all pvs
levelName <- filePathInfo$FileDesc$Subject
levels <- list(levels)
names(levels) <- levelName
# add reporting sample default condition if the column exists
if ("rptsamp" %in% tolower(names(dataLaf))) {
defaultConditions <- quote(tolower(rptsamp) == "reporting sample")
} else {
defaultConditions <- NULL
}
# get psu and stratum variables for Taylor series variance
checkVars <- tolower(labelsFile$variableName)
psCheck <- TRUE
if (psCheck && all(c("jkpair", "jkrepl") %in% checkVars)) {
ps <- c(stratum = "jkpair", psu = "jkrepl")
psCheck <- FALSE
}
if (psCheck && all(c("repgrp", "dropwt") %in% checkVars)) {
ps <- c(stratum = "repgrp", psu = "dropwt")
psCheck <- FALSE
}
if (psCheck && all(c("jkpair", "jkunit") %in% checkVars)) {
ps <- c(stratum = "jkpair", psu = "jkunit")
psCheck <- FALSE
}
# note: this is the default psu/stratum vars for all normal NAEP files and newer LTT files
if (psCheck && all(c("repgrp1", "jkunit") %in% checkVars)) {
ps <- c(stratum = "repgrp1", psu = "jkunit")
psCheck <- FALSE
}
if (psCheck) { # no match was found!
ps <- NULL
}
if (all(is.null(ps)) || all(is.na(ps))) {
psuVar <- NULL
stratumVar <- NULL
} else {
psuVar <- ps["psu"]
stratumVar <- ps["stratum"]
}
# provide more detail as part of the assessment code for users
assessCode <- filePathInfo$FileDesc$Assessment_Code
if (grepl(".*", filePathInfo$FileDesc$Assessment_Sample)) {
assessCode <- paste0(assessCode, " - ", filePathInfo$FileDesc$Assessment_Sample)
}
# build the result list and return
res <- edsurvey.data.frame(
userConditions = list(),
defaultConditions = list(defaultConditions),
dataList = buildNAEP_dataList(
dataLaf,
labelsFile,
dataSchLaf,
schLabelsFile
),
weights = weights,
pvvars = pvs,
subject = filePathInfo$FileDesc$Subject,
year = filePathInfo$FileDesc$Year,
assessmentCode = assessCode,
dataType = filePathInfo$FileDesc$Data_Type,
gradeLevel = filePathInfo$FileDesc$Grade_Level,
achievementLevels = levels,
omittedLevels = omittedLevels,
survey = "NAEP",
country = "USA",
psuVar = psuVar,
stratumVar = stratumVar,
jkSumMultiplier = 1,
fr2Path = ifelse(filePathInfo$PreferredLayoutMethod == "XML", filePathInfo$StudentXML, filePathInfo$StudentFR2)
)
f <- filePathInfo$FileDesc # reassign for the shorthand below
# apply the linking-error parameters
if ("dbapba" %in% colnames(res)) {
foundScale <- FALSE
if (res$subject == "Science") {
foundScale <- TRUE
subscaleWeights <- c(1, 0, 0, 0)
subscales <- c("univariate_scale", "physical", "earth", "life")
composite <- c("univariate_scale")
}
if (res$subject == "Civics") {
foundScale <- TRUE
subscaleWeights <- 1
subscales <- c("civics")
composite <- c("civics")
}
if (res$subject == "Geography") {
if (f[["Grade_Level"]] %in% c("Grade 4", "Grade 8", "Grade 12")) {
foundScale <- TRUE
subscaleWeights <- c(0.40, 0.30, 0.30)
subscales <- c("spaceplace", "envsociety", "spatial_dyn")
composite <- c("composite")
}
}
if (res$subject == "History") {
if (f[["Grade_Level"]] == "Grade 4") {
foundScale <- TRUE
subscaleWeights <- c(0.25, 0.35, 0.25, 0.15)
subscales <- c("democracy", "cultures", "technology", "world_role")
composite <- c("composite")
}
if (f[["Grade_Level"]] == "Grade 8") {
foundScale <- TRUE
subscaleWeights <- c(0.3, 0.30, 0.20, 0.20)
subscales <- c("democracy", "cultures", "technology", "world_role")
composite <- c("composite")
}
if (f[["Grade_Level"]] == "Grade 12") {
foundScale <- TRUE
subscaleWeights <- c(0.25, 0.25, 0.25, 0.25)
subscales <- c("democracy", "cultures", "technology", "world_role")
composite <- c("composite")
}
}
if (res$subject == "Mathematics") {
if (f[["Grade_Level"]] == "Grade 4") {
foundScale <- TRUE
subscaleWeights <- c(0.40, 0.10, 0.15, 0.2, 0.15)
subscales <- c("num_oper", "da_stat_prob", "algebra", "measurement", "geom")
composite <- c("composite")
}
if (f[["Grade_Level"]] == "Grade 8") {
foundScale <- TRUE
subscaleWeights <- c(0.2, 0.15, 0.30, 0.15, 0.20)
subscales <- c("num_oper", "da_stat_prob", "algebra", "measurement", "geom")
composite <- c("composite")
}
if (f[["Grade_Level"]] == "Grade 12") {
foundScale <- TRUE
subscaleWeights <- c(0.1, 0.25, 0.35, 0.3)
subscales <- c("num_oper", "da_stat_prob", "algebra", "measurementgeom")
composite <- c("composite")
}
}
if (res$subject == "Reading") {
if (f[["Grade_Level"]] == "Grade 4") {
foundScale <- TRUE
subscaleWeights <- c(0.50, 0.50)
subscales <- c("literary", "information")
composite <- c("composite")
}
if (f[["Grade_Level"]] == "Grade 8") {
foundScale <- TRUE
subscaleWeights <- c(0.45, 0.55)
subscales <- c("literary", "information")
composite <- c("composite")
}
if (f[["Grade_Level"]] == "Grade 12") {
foundScale <- TRUE
subscaleWeights <- c(0.30, 0.70)
subscales <- c("literary", "information")
composite <- c("composite")
}
}
if (res$subject == "Geography") {
if (f[["Grade_Level"]] %in% c("Grade 4", "Grade 8", "Grade 12")) {
foundScale <- TRUE
subscaleWeights <- c(0.40, 0.30, 0.30)
subscales <- c("spaceplace", "envsociety", "spatial_dyn")
composite <- c("composite")
}
}
if (!foundScale) {
warning("Could not find construct weights for this NAEP. Linking error cannot be calculated.")
} else {
res <- linkVarAugment(res, subscaleWeights = subscaleWeights, subscales = subscales, composite = composite)
}
} # end if("dbapba" %in% colnames(res))
# applicable for main NAEP at this point
if (!isLTT) {
if (filePathInfo$PreferredLayoutMethod == "XML") {
# XML file already contains the IRT parameters and the XML parser returns the formats as needed for EdSurvey
res$scoreCard <- xmlLayout$IRTParams$ScoreCard
res$dichotParamTab <- xmlLayout$IRTParams$DichotParamTab
res$polyParamTab <- xmlLayout$IRTParams$PolyParamTab
res$adjustedData <- xmlLayout$IRTParams$AdjsutedData
res$testData <- xmlLayout$IRTParams$TestData
res$scoreDict <- xmlLayout$IRTParams$ScoreDict
res$scoreFunction <- scoreDefault
} else {
res <- appendIRTAttributes_NAEP(
esdf = res,
year = filePathInfo$FileDesc$Year,
subject = filePathInfo$FileDesc$Subject,
region = filePathInfo$FileDesc$Assessment_Code,
level = filePathInfo$FileDesc$Grade_Level,
fr2Path = filePathInfo$StudentFR2,
filePathInfo = filePathInfo
)
}
}
return(res)
}
linkVarAugment <- function(data, subscaleWeights, subscales, composite) {
PVVars <- getAttributes(data = data, attribute = "pvvars")
PVVarsDefault <- attributes(PVVars)$default
# actual variables
scaledPVs <- list()
thetaPVs <- list()
for (i in seq_along(subscales)) {
scaledPVs <- c(scaledPVs, list(getPlausibleValue(subscales[i], data = data)))
thetaPVs <- c(thetaPVs, list(getPlausibleValue(paste0(subscales[i], "_theta"), data = data)))
}
PVs <- getPlausibleValue(composite, data = data)
# use default weights
W <- attr(getAttributes(data = data, attribute = "weights"), "default")
repWs <- getWeightJkReplicates(W, data)
stratumVar <- getStratumVar(data = data)
PSUvar <- getPSUVar(data)
# the DBA subset
if (!"dbapba" %in% colnames(data)) {
stop(paste0("Cannot find variable ", dQuote("dbapba"), " on data."))
}
if (!stratumVar %in% colnames(data)) {
stop(paste0("Cannot find variable ", dQuote("stratumVar"), " on data."))
}
if (!PSUvar %in% colnames(data)) {
stop(paste0("Cannot find variable ", dQuote("PSUvar"), " on data."))
}
DBASS <- data$dbapba %in% "DBA"
nDBA <- sum(DBASS) # number in DBA subset
# the PBA subset
PBASS <- data$dbapba %in% "PBA"
nPBA <- sum(PBASS) # number in the PBA subset
if (nDBA + nPBA < length(data$dbapba)) {
warning("Some rows are neither PBA nor DBA.")
}
# a map from the DBA PV to the PBA PB
RAM <- getRAM()
# number of subscales
K <- length(subscaleWeights)
if (K != length(scaledPVs)) {
stop(paste0("The argument ", dQuote("scaledPVs"), " is a list with one element per subscale, so it should have the same length as ", dQuote("subscaleWeights"), " and ", dQuote("thetaPVs"), ". Each element of ", dQuote("scaledPVs"), " is a vector of plausible values."))
}
if (K != length(thetaPVs)) {
stop(paste0("The argument ", dQuote("thetaPVs"), " is a list with one element per subscale, so it should have the same length as ", dQuote("subscaleWeights"), " and ", dQuote("scaledPVs"), ". Each element of ", dQuote("thetaPVs"), " is a vector of plausible values."))
}
if (length(unique(sapply(c(thetaPVs, scaledPVs, list(PVs)), length))) != 1) {
stop(paste0("The arguments ", dQuote("thetaPVs"), " and ", dQuote("scaledPVs"), " are lists with one element per subscale, each a vector of the same length. Each vector should also be the same length as the vector ", dQuote("PVs"), "."))
}
# subscale weights called "beta" in documentation
beta <- subscaleWeights
# DBA weights
wd <- data[DBASS, W]
# PBA weights
wp <- data[PBASS, W]
### measurement variance agumentation
# uses full sample weights and varries PVs according to the RAM (Table 1, page 3)
# create PV data frame to minimize calls to getData
pvData <- data[, PVs]
# for each PV (column in the RAM)
RAMi <- 1
thetaPVscores <- data[DBASS, unlist(thetaPVs)]
scaledPVscores <- data[PBASS, unlist(scaledPVs)]
repWsDBA <- data[DBASS, repWs]
repWsPBA <- data[PBASS, repWs]
for (n in 1:ncol(RAM)) {
# for each RAM column (5)
for (j in 1:nrow(RAM)) {
ynj <- rep(0, nDBA)
znj <- rep(0, nPBA)
# RAM says which PBA PV to use
PBApv <- RAM[j, n]
for (k in 1:K) {
# DBA PV is the jth one
# scaledPVs[[k]] is for the kth subscale, in that [j] is for the jth plausible value
theta_k <- thetaPVscores[, thetaPVs[[k]][j]]
# PBApv from RAM
# PBAPVs[[k]], similar to DBA, inside of that we select the plausilbe value from the RAM for row j
x_k <- scaledPVscores[, scaledPVs[[k]][PBApv]]
# mean, std estimates for DBA (theta scale)
mu_t <- getMu(theta_k, wd)
s_t <- getS(theta_k, wd, mu = mu_t)
# mean, std estimates for PBA (reporting scale)
mu_x <- getMu(x_k, wp)
s_x <- getS(x_k, wp, mu = mu_x)
# variable transformations
a <- s_x / s_t
b <- mu_x - a * mu_t
data[DBASS, paste0(subscales[k], "_linking_imp_", nrow(RAM) * (n - 1) + j)] <- (a * theta_k + b)
data[PBASS, paste0(subscales[k], "_linking_imp_", nrow(RAM) * (n - 1) + j)] <- x_k
# y is the sum over the k subscales
ynj <- ynj + beta[k] * (a * theta_k + b)
znj <- znj + beta[k] * x_k
}
# write composite variable for DBA
data[DBASS, paste0(composite, "_linking_imp_", nrow(RAM) * (n - 1) + j)] <- ynj
RAMi <- RAMi + 1
# write composite variable for PBA
compositePBA <- pvData[PBASS, PVs[PBApv]]
data[PBASS, paste0(composite, "_linking_imp_", nrow(RAM) * (n - 1) + j)] <- compositePBA
if (max(abs(znj - compositePBA)) > 0.02) {
warning("Inacurate data or subscale weights, linking error is inacurately estimated due to low precision inputs. Disagreement on composite of ", max(abs(znj - data[PBASS, paste0("comp", nrow(RAM) * (n - 1) + j)])), "\n")
}
}
}
for (i in seq_along(repWs)) {
# y1 is for DBA
y1 <- rep(0, nDBA)
# z1 is for PBA
z1 <- rep(0, nPBA)
# DBA ith replicate weight
wd <- repWsDBA[, repWs[i]]
# PBA ith replicate weight
wp <- repWsPBA[, repWs[i]]
for (k in 1:K) {
# DBA is theta scale
theta_k <- thetaPVscores[, thetaPVs[[k]][1]]
# get mean, stdev, DBA, use ith replicate weight
mu_t <- getMu(theta_k, wd)
s_t <- getS(theta_k, wd, mu = mu_t)
# get mean, stdev, PBA, use ith replicate weight
x_k <- scaledPVscores[, scaledPVs[[k]][1]]
mu_x <- getMu(x_k, wp)
s_x <- getS(x_k, wp, mu = mu_x)
# variablle transformations
a <- s_x / s_t
b <- mu_x - a * mu_t
data[DBASS, paste0(subscales[k], "_linking_samp_", i)] <- (a * theta_k + b)
data[!DBASS, paste0(subscales[k], "_linking_samp_", i)] <- x_k
# sum y1
y1 <- y1 + beta[k] * (a * theta_k + b)
z1 <- z1 + beta[k] * x_k
}
data[DBASS, paste0(composite, "_linking_samp_", i)] <- y1
compositePBA <- pvData[PBASS, PVs[1]]
data[PBASS, paste0(composite, "_linking_samp_", i)] <- compositePBA # z1
if (max(abs(z1 - compositePBA)) > 0.02) {
warning("Inacurate data or subscale weights, linking error is inacurately estimated due to low precision inputs. Disagreement on composite of ", max(abs(z1 - data[PBASS, paste0(composite, "_linking_samp_", i)])), "\n")
}
}
# final RAM index
RAMiFinal <- RAMi - 1
# construct composite PV var
newPV <- list(list(
estVarnames = PVs,
impVarnames = paste0(composite, "_linking_imp_", 1:RAMiFinal),
sampVarnames = paste0(composite, "_linking_samp_", seq_along(repWs)),
achievementLevel = data[["pvvars"]][[composite]][["achievementLevel"]]
))
names(newPV) <- paste0(composite, "_linking")
for (k in 1:K) {
# sometimes
if (subscales[k] != composite) {
newPVi <- list(
estVarnames = scaledPVs[[k]],
impVarnames = paste0(subscales[k], "_linking_imp_", 1:RAMiFinal),
sampVarnames = paste0(subscales[k], "_linking_samp_", seq_along(repWs))
)
newPV[[paste0(subscales[k], "_linking")]] <- newPVi
}
}
pvv <- c(data[["pvvars"]], newPV)
attributes(pvv)$default <- PVVarsDefault
data[["pvvars"]] <- pvv
return(data)
}
# get RAM table for how the two sets of plausible values are permuted to approximate
# integration over the entire possible set of 20! cases.
getRAM <- function() {
RAM <- matrix(NA, nrow = 20, ncol = 5)
RAM[, 1] <- 1:20
RAM[, 2] <- c(14, 11, 18, 15, 20, 17, 3, 16, 12, 6, 19, 2, 1, 13, 5, 7, 4, 9, 8, 10)
RAM[, 3] <- c(6, 4, 3, 9, 11, 10, 2, 17, 19, 13, 7, 8, 15, 18, 20, 12, 5, 16, 14, 1)
RAM[, 4] <- c(2, 14, 7, 9, 20, 13, 16, 8, 4, 11, 19, 1, 18, 10, 6, 12, 15, 3, 5, 17)
RAM[, 5] <- c(19, 10, 13, 8, 17, 12, 1, 11, 5, 6, 18, 9, 15, 2, 3, 7, 14, 20, 16, 4)
return(RAM)
}
# @author Paul Bailey & Ahmad Emad
# this function
readMRC <- function(filename) {
# read NAEP machine readable file.
# this has layout information on it
t <- try(mrcFile <- readLines(filename), silent = TRUE)
# Split mrcFile by "\n"
mrcFile <- strsplit(mrcFile, "\n", fixed = TRUE)
mrcFile <- unlist(mrcFile)
# read in the variables from the file,t his is based on the information ETS
# shared with us
if (any(nchar(mrcFile) < 90)) {
stop(paste0("Malformed fr2 file: some lines have fewer than the required 90 characters. Lines ", pasteItems(which(nchar(mrcFile) < 90)), "."))
}
specs <- getNAEP_FR2Specs(mrcFile)
variableName <- specs$variableName # name of the variable
Start <- specs$Start # start column in file
Width <- specs$Width # number of characters in variable
End <- specs$End # end column in file
Decimal <- specs$Decimal # digits (from the right) to be considered decimals
Labels <- specs$Labels # variable label
NumValue <- specs$NumValue # number of numeric codes (e.g. one could would be 1="Yes")
zeroLenVars <- nchar(variableName) == 0
if (any(zeroLenVars)) {
warning(paste0("Unnamed variables in .fr2 file on row(s) ", pasteItems((seq_along(variableName))[zeroLenVars]), ". File located at ", filename, ". These variables renamed sequentially, starting with V1."))
variableName[zeroLenVars] <- paste0("v", 1:sum(zeroLenVars))
}
# parse the numeric codes
labelValues <- specs$labelValues
# keep the original labels
oLabels <- Labels
# Finding the plausible weights and jacknife replicates.
# normally there is just one set of PVs (with multiple subjects or subscales).
# When there are multiple sets then one starts with an "A" and is the
# accommodations permitted values
# main PVs:
Labels[grepl("plausible", tolower(Labels)) & grepl("value", tolower(Labels)) & "A" != substring(variableName, 1, 1)] <- "PV"
# theta PVs, these do not say "value"
Labels[grepl("plausible", tolower(Labels)) & grepl("theta", tolower(Labels))] <- "PVT"
# accommodations perimted PVs:
Labels[grepl("plausible", tolower(Labels)) & grepl("value", tolower(Labels)) & "A" == substring(variableName, 1, 1)] <- "PV2"
# normally there is just one set of weights. When there are multiple sets
# then one starts with an "A" and is the accommodations permitted values
# for very specific files it will have an 'norigwt' or a 'vorigwt'
Labels[grepl("weight", tolower(Labels)) & grepl("replicate", tolower(Labels)) & !toupper(substring(variableName, 1, 1)) %in% c("A", "N", "V")] <- "JK"
Labels[grepl("weight", tolower(Labels)) & grepl("replicate", tolower(Labels)) & "A" == toupper(substring(variableName, 1, 1))] <- "JK2"
Labels[grepl("weight", tolower(Labels)) & grepl("replicate", tolower(Labels)) & "N" == toupper(substring(variableName, 1, 1))] <- "JK-N"
Labels[grepl("weight", tolower(Labels)) & grepl("replicate", tolower(Labels)) & "V" == toupper(substring(variableName, 1, 1))] <- "JK-V"
pvWt <- character(length(mrcFile)) # the number of the PV (e.g. for a subject or subscale with five PVs this would show values from 1 to 5 for five variables)
Type <- character(length(mrcFile)) # this is the subject or subscale
# Check if there is at least one Plausible Value, and then finding their names
tempValue <- applyPV("PV", Labels, pvWt, oLabels, Type)
Labels <- tempValue[["Labels"]]
pvWt <- tempValue[["pvWt"]]
Type <- tempValue[["Type"]]
# note theta PVs
tempValue <- applyPV("PVT", Labels, pvWt, oLabels, Type)
Labels <- tempValue[["Labels"]]
pvWt <- tempValue[["pvWt"]]
Type <- tempValue[["Type"]]
# note AP PVs
tempValue <- applyPV("PV2", Labels, pvWt, oLabels, Type)
Labels <- tempValue[["Labels"]]
pvWt <- tempValue[["pvWt"]]
Type <- tempValue[["Type"]]
# Check if there is at least one JK replicate.
if(any(grepl("^(JK|JK2|JK-N|JK-V)$", Labels))) {
# get the number of the JK replicate
pvWt[Labels == "JK"] <- as.numeric(gsub("[^\\d]+", "", oLabels[Labels=="JK"], perl=TRUE))
pvWt[Labels == "JK2"] <- as.numeric(gsub("[^\\d]+", "", oLabels[Labels=="JK2"], perl=TRUE))
pvWt[Labels == "JK-N"] <- as.numeric(gsub("[^\\d]+", "", oLabels[Labels=="JK-N"], perl=TRUE))
pvWt[Labels == "JK-V"] <- as.numeric(gsub("[^\\d]+", "", oLabels[Labels=="JK-V"], perl=TRUE))
}
# identify weights
labels <- tolower(Labels)
weights <- grepl("^(a|n|v){0,1}(origwt)$", variableName, ignore.case = TRUE) #T/F if variable is a weight
#only do this check if the above does not locate any weights, this will be only applicable for old NAEP files
if (!any(weights)){
weights <- grepl("^(weight)$", variableName, ignore.case = TRUE)
}
# For now, assume all variables are characters.
dataType <- rep("character", length(mrcFile))
# Create appropriate data type for variables.
dataType[Decimal > 0 & Width < 8] <- "integer"
dataType[Decimal > 0 & Width >= 8] <- "numeric"
labelValues <- as.character(labelValues)
mrcFileCSV <- data.frame(variableName, Start, End, Width, Decimal, Labels, labelValues, pvWt, Type, dataType, weights, stringsAsFactors = FALSE)
mrcFileCSV <- mrcFileCSV[order(mrcFileCSV$Start), ]
# mrcFileCSV$labelValues <- as.character(mrcFileCSV$labelValues)
mrcFileCSV$variableName <- make.names(mrcFileCSV$variableName, unique = TRUE) # make.names will ensure names match to LaF object and are valid for R
return(mrcFileCSV)
}
# @author Tom Fink
# parses the MRC file and returns a file format data.frame specifically for the long-term trend data files/layout
# this is to ensure the original readMRC function doesn't break compatibility for previously QCed data files
readMRC_LTT <- function(filename, subject) {
# read NAEP machine readable file.
# this has layout information on it
t <- try(mrcFile <- readLines(filename), silent = TRUE)
# Split mrcFile by "\n"
mrcFile <- strsplit(mrcFile, "\n", fixed = TRUE)
mrcFile <- unlist(mrcFile)
# read in the variables from the file,t his is based on the information ETS
# shared with us
if (any(nchar(mrcFile) < 90)) {
stop(paste0("Malformed fr2 file: some lines have fewer than the required 90 characters. Lines ", pasteItems(which(nchar(mrcFile) < 90)), "."))
}
specs <- getNAEP_FR2Specs(mrcFile)
variableName <- specs$variableName # name of the variable
Start <- specs$Start # start column in file
Width <- specs$Width # number of characters in variable
End <- specs$End # end column in file
Decimal <- specs$Decimal # digits (from the right) to be considered decimals
Labels <- specs$Labels # variable label
NumValue <- specs$NumValue # number of numeric codes (e.g. one could would be 1="Yes")
zeroLenVars <- nchar(variableName) == 0
if (any(zeroLenVars)) {
warning(paste0("Unnamed variables in .fr2 file on row(s) ", pasteItems((seq_along(variableName))[zeroLenVars]), ". File located at ", filename, ". These variables renamed sequentially, starting with V1."))
variableName[zeroLenVars] <- paste0("v", 1:sum(zeroLenVars))
}
# parse the numeric codes
labelValues <- specs$labelValues
labels <- tolower(Labels)
weights <- rep(FALSE, times = length(variableName)) # default to all false, to be calculated later
pvWt <- rep("", times = length(variableName))
Type <- rep("", times = length(variableName))
# For now, assume all variables are characters.
dataType <- rep("character", length(mrcFile))
# Create appropriate data type for variables.
# integer isn't really the right type here, but no way to determine accurate type
# the default of 'character' should be sufficient
dataType[Decimal > 0 & Width < 8] <- "integer"
dataType[Decimal > 0 & Width >= 8] <- "numeric"
labelValues <- as.character(labelValues)
mrcFileCSV <- data.frame(variableName, Start, End, Width, Decimal, Labels, labelValues, pvWt, Type, dataType, weights, stringsAsFactors = FALSE)
mrcFileCSV <- mrcFileCSV[order(mrcFileCSV$Start), ]
# apply the Weights and PVVars (to identify and mark them)
mrcFileCSV <- identifyNAEP_LTT_Weights(mrcFileCSV)
mrcFileCSV <- identifyNAEP_LTT_PV(mrcFileCSV, subject = subject)
mrcFileCSV$variableName <- make.names(mrcFileCSV$variableName, unique = TRUE) # make.names will ensure names match to LaF object and are valid for R
return(mrcFileCSV)
}
# identifies the NAEP Long-Term Trend Weights and set's the logical value to TRUE in the fileFormat
# the modified file format object is then returned
identifyNAEP_LTT_Weights <- function(fileFormatDF) {
# check first if origwt is present, 'weight' is sometimes on newer LTT files but not the overall JK weight we want with the replicates
isWgt <- grepl("^origwt$", fileFormatDF$variableName, ignore.case = TRUE)
if (!any(isWgt)) {
isWgt <- grepl("^weight$", fileFormatDF$variableName, ignore.case = TRUE)
}
fileFormatDF$weights <- isWgt
return(fileFormatDF)
}
# returns a list of weights based on the identified weights from the file format for use with package
buildNAEP_LTT_WeightList <- function(fileFormatDF) {
wgtVars <- fileFormatDF$variableName[fileFormatDF$weights == TRUE]
res <- list()
# should only be one weight defined, but just in case
for (i in seq_along(wgtVars)) {
# this is true for all LTT weights. the replicates use the same base variable name regardless of the weight name
# the number of replicates differs from year to year
repVars <- grep("^srwt\\d{1,2}$", fileFormatDF$variableName, ignore.case = TRUE, value = TRUE)
repVars <- gsub("srwt", "", repVars, ignore.case = TRUE) # leave the numeric value only
res[[i]] <- list(jkbase = "srwt", jksuffixes = repVars)
}
names(res) <- tolower(wgtVars)
# mark the replicate
return(res)
}
# identifies the NAEP Long-Term Trend plausible values and sets the appropriate pvWT and Type values in the file format
# the modified file format object is then returned
identifyNAEP_LTT_PV <- function(fileFormatDF, subject) {
subject <- tolower(subject)
subject <- ifelse(subject == "mathematics", "math", subject)
# RLTTR
pvvars <- c()
if (subject == "reading") {
vars1 <- grep("^(rrp|rth)sct\\d{1,2}$", fileFormatDF$variableName, ignore.case = TRUE, value = TRUE)
vars2 <- grep("^(theta|lmrval|redtht|redval|rltt(th|rp|r|t))\\d{1,2}$", fileFormatDF$variableName, ignore.case = TRUE, value = TRUE)
pvvars <- c(vars1, vars2)
}
if (subject == "math") {
vars1 <- grep("^(mrp|mth)sct\\d{1,2}$", fileFormatDF$variableName, ignore.case = TRUE, value = TRUE)
vars2 <- grep("^(theta|lmmval|mtht|medval|mltt(th|rp|r|t))\\d{1,2}$", fileFormatDF$variableName, ignore.case = TRUE, value = TRUE)
pvvars <- c(vars1, vars2)
}
if (length(pvvars) == 0) {
warning(paste0("No plausible values found in this NAEP file."))
}
baseVars <- unique(gsub("\\d+$", "", pvvars))
for (i in seq_along(baseVars)) {
repVars <- pvvars[grepl(paste0("^", baseVars[i]), pvvars, ignore.case = TRUE)]
repVarsNum <- as.numeric(gsub(baseVars[i], "", repVars, ignore.case = TRUE))
fileFormatDF$pvWt[fileFormatDF$variableName %in% repVars] <- repVarsNum
isTheta <- all(grepl("theta", fileFormatDF$Labels[fileFormatDF$variableName %in% repVars], ignore.case = TRUE))
fileFormatDF$Type[fileFormatDF$variableName %in% repVars] <- ifelse(isTheta, paste0(subject, "_theta"), subject)
}
return(fileFormatDF)
}
# build the NAEP Long-Term Trend plausible value list
buildNAEP_LTT_PVList <- function(fileFormatDF) {
res <- list()
pvBase <- unique(fileFormatDF$Type[nchar(fileFormatDF$Type) > 0])
for (i in seq_along(pvBase)) {
repVars <- fileFormatDF$variableName[fileFormatDF$Type == pvBase[i]]
res[[i]] <- list(varnames = tolower(repVars))
}
names(res) <- pvBase
return(res)
}
# @author Paul Bailey & Ahmad Emad
applyPV <- function(pv, Labels, pvWt, oLabels, Type) {
# the instances where Labels == pv is the cases we are looking at.
if (sum(Labels == pv) > 0) {
# there are some relevant variables
# example text for oLabels: "Plausible NAEP math value #3 (num & oper)"
pvWt[Labels == pv] <- sapply(strsplit(oLabels[Labels == pv], "#", fixed = TRUE), function(x) gsub("[^0-9]", "", x[[2]]))
# make oLabels lower case
oLabels[Labels == pv] <- tolower(oLabels[Labels == pv])
# edit oLabels to Keep only letters
# 1) remove punctuation
oLabels[Labels == pv] <- gsub("[[:punct:]]", "", oLabels[Labels == pv])
# 2) remove numbers
oLabels[Labels == pv] <- gsub("[0-9]", "", oLabels[Labels == pv])
# break oLabels into words and remove common words
temp <- strsplit(oLabels[Labels == pv], " ", fixed = TRUE)
temp <- lapply(temp, function(x) x[x != ""])
# this code gets rid of words that are common to all Plausible values,
# for example, "plausible", "value", and "NAEP"
words <- temp[[1]]
summation <- sapply(temp, function(x) words %in% x)
summation[summation == TRUE] <- 1
summation[summation == FALSE] <- 0
summation <- rowSums(summation)
removeWords <- words[summation == length(temp)]
# check if there is more than one plausible value (and so all words are "common")
if (length(removeWords) != length(words)) {
# Remove common words
temp <- lapply(temp, function(x) {
x[!x %in% removeWords]
})
# concatenate remaining words with "_"
temp <- sapply(temp, function(x) paste0(x, collapse = "_"))
# in the case of thetas, theta will be removed by removeWords, which is fine
# we want to be sure it is at the end anyways
if (pv == "PVT") {
temp <- paste0(temp, "_theta")
}
Type[Labels == pv] <- temp
} else {
# if there is only one, get rid of words like plausible, value, naep
Type[Labels == pv] <- gsub("plausible|naep|value", "", oLabels[Labels == pv])
Type[Labels == pv] <- trimws(Type[Labels == pv])
Type[Labels == pv] <- gsub(" ", "_", Type[Labels == pv])
# all plausible values for a particular subscale/composite now all share a name
# and that name has no spaces but underscores in it.
}
}
# these are accommodations permitted PVs
Type[Labels == "PV2"] <- paste0(Type[Labels == "PV2"], "_ap")
return(list(Labels = Labels, Type = Type, pvWt = pvWt))
}
# for a case-sensitive file system this will change the path to resolve
# correctly when the name is known up to the case.
# Assumes that there is not another file with the same name in the directory
# when ignoring the case. Essentially, this makes a case-sensitive file
# system work as if case-insensitive (when file names would not be overloaded)
# Note that for the NAEP data this is a safe assumption.
# @author Paul Bailey
ignoreCaseFileName <- function(f) {
bn <- basename(f) # the name of the file in the folder
dr <- dirname(f) # the name of the directory
ff <- list.files(dr, pattern = bn, ignore.case = TRUE, full.names = TRUE)
if (length(ff) == 0) {
stop(paste0("Could not find file ", dQuote(f), "."))
}
if (length(ff) > 1) {
stop(paste0("File name ", dQuote(f), " is ambigious with respect to capitalization. Please remove one of the following files and try again.", pasteItems(dQuote(ff), final = "or"), "."))
}
ff
}
# builds the NAEP dataList object
buildNAEP_dataList <- function(stuLaf, stuFF, schLaf, schFF) {
dataList <- list()
# build the list hierarchical based on the order in which the data levels would be merged in getData
dataList[["Student"]] <- dataListItem(
lafObject = stuLaf,
fileFormat = stuFF,
levelLabel = "Student",
forceMerge = TRUE,
parentMergeLevels = NULL,
parentMergeVars = NULL,
mergeVars = NULL,
ignoreVars = NULL,
isDimLevel = TRUE
)
# school datafile won't always be present, only add it if applicable
if (!is.null(schLaf)) {
dataList[["School"]] <- dataListItem(
lafObject = schLaf,
fileFormat = schFF,
levelLabel = "School",
forceMerge = FALSE,
parentMergeLevels = c("Student"),
parentMergeVars = c("scrpsu"),
mergeVars = c("sscrpsu"),
ignoreVars = NULL,
isDimLevel = FALSE
)
}
return(dataList)
}
# 1) path: validates and returns the 'path' argument for a valid Data (.dat) fixed-width data file for as well as the 'school' data (.dat) file
# 2) frPath: validates and returns information about the .fr2 layout file, and it's full file path (if available)
# 3) xmlPath: validates and returns information about the .xml layout file, and it's full file path (if available)
validate_NAEPFilePaths <- function(path, frPath, xmlPath) {
# ==== .dat File checking ====
if (length(path) != 1) {
stop(paste0("The argument ", dQuote("path"), " must be of length 1."))
}
if (!file.exists(path)) {
stop(paste0("The specified ", dQuote("path"), " is invalid or cannot be found: ", dQuote(path), "."))
}
if (!grepl("[.]dat$", path, ignore.case = TRUE)) {
stop(paste0("The specified ", dQuote("path"), " must have a .dat file extension: ", dQuote(path), "."))
}
path <- suppressWarnings(normalizePath(unique(path), winslash = "/"))
origDir <- dirname(path)
searchDir <- file.path(dirname(path), "..") # search up one directory level from where file specified
fileBN <- basename(path)
fileBN_NoExt <- toupper(sub("[.]dat$", "", fileBN, ignore.case = TRUE))
# ==== .fr2 File checking ====
fr2Specified <- FALSE
# check if frPath or xmlPath were passed
# flags for if the paremter was passed by user explicitly
if (!is.null(frPath) && is.character(frPath)) {
fr2Specified <- TRUE
if (length(frPath) != 1) {
stop(paste0("The argument ", dQuote("frPath"), " must be of length 1."))
}
if (!file.exists(frPath)) {
stop(paste0("The specified ", dQuote("frPath"), " is invalid or cannot be found: ", dQuote(frPath), "."))
}
if (!grepl("[.]fr2$", frPath, ignore.case = TRUE)) {
stop(paste0("The specified ", dQuote("frPath"), " must have a .fr2 file extension: ", dQuote(frPath), "."))
}
frPath <- suppressWarnings(normalizePath(unique(frPath), winslash = "/"))
searchDir <- c(searchDir, dirname(frPath)) # expand out searchDir as well
}
# ==== .xml File checking ====
xmlSpecified <- FALSE
# check if frPath or xmlPath were passed
# flags for if the paremter was passed by user explicitly
if (!is.null(xmlPath) && is.character(xmlPath)) {
xmlSpecified <- TRUE
if (length(xmlPath) != 1) {
stop(paste0("The argument ", dQuote("xmlPath"), " must be of length 1."))
}
if (!file.exists(xmlPath)) {
stop(paste0("The specified ", dQuote("xmlPath"), " is invalid or cannot be found: ", dQuote(xmlPath), "."))
}
if (!grepl("[.]xml$", xmlPath, ignore.case = TRUE)) {
stop(paste0("The specified ", dQuote("xmlPath"), " must have a .xml file extension: ", dQuote(xmlPath), "."))
}
xmlPath <- suppressWarnings(normalizePath(unique(xmlPath), winslash = "/"))
searchDir <- c(searchDir, dirname(xmlPath)) # expand out searchDir as well
}
# this function will throw specific NAEP filename errors if present
datFileDesc <- descriptionOfFile(fileBN_NoExt) # function in descriptionOfFile.R file, does more error checking of specific filename there
# prep default ooutputs
studentFileOut <- NULL
schoolFileOut <- NULL
# if LTT file there is no school level file that we will need to search
isLTT <- grepl("Long.*Term.*Trend", datFileDesc$Assessment_Code, ignore.case = TRUE)
searchSchool <- datFileDesc$CheckSchoolFile
searchStudent <- FALSE # assumes
# check if the school file was specified and search for the student file instead
if (searchSchool && !grepl("student", datFileDesc$Data_Type, ignore.case = TRUE)) {
warning("Input file was a school level file. Attempting to read in student level file instead. EdSurvey will automatically link the school file.")
searchStudent <- TRUE
} else {
studentFileOut <- path # user passed correct student level file!
}
# do one single list.files for both student/school .fr2 and .xml files here in case folders are slow or have lots of contents
dataFileRegex <- paste0("^", substr(fileBN_NoExt, 1, 4), "(C|T)", substr(fileBN_NoExt, 6, 99), "[.]dat$")
dataFileMatches <- list.files(origDir, dataFileRegex, full.names = TRUE, recursive = FALSE, ignore.case = TRUE)
dataFileBN <- basename(dataFileMatches)
if (searchSchool) {
fileRegex <- paste0("^", substr(fileBN_NoExt, 1, 4), "C", substr(fileBN_NoExt, 6, 99), "[.]dat$") # the 'C' signifies the school level file in the 5th character index
foundFiles <- dataFileMatches[grepl(fileRegex, dataFileBN, ignore.case = TRUE)]
if (length(foundFiles) > 0) {
schoolFileOut <- foundFiles[1] # default to first if more than one, but should only be one
} else {
if (!isLTT) {
warning(paste0(
"Could not find school level data file matching filename pattern: ", fileRegex, "\n",
"No school level data will be available for analysis."
))
}
}
}
if (searchStudent) {
fileRegex <- paste0("^", substr(fileBN_NoExt, 1, 4), "T", substr(fileBN_NoExt, 6, 99), "[.]dat$") # the 'T' signifies the student level file in the 5th character index
foundFiles <- dataFileMatches[grepl(fileRegex, dataFileBN, ignore.case = TRUE)]
if (length(foundFiles) > 0) {
studentFileOut <- foundFiles[1] # default to first if more than one, but should only be one
} else {
stop(paste0(
"Could not find student level data file matching filename pattern: ", fileRegex, "\n",
"Be sure to specify the student level data file."
))
}
}
# do one single list.files for both student/school .fr2 and .xml files here in case folders are slow or have lots of contents
layoutFileRegex <- paste0("^", substr(fileBN_NoExt, 1, 4), "(C|T)", substr(fileBN_NoExt, 6, 99), "[.](fr2|xml)$")
layoutFileMatches <- list.files(searchDir, layoutFileRegex, full.names = TRUE, recursive = TRUE, ignore.case = TRUE)
layoutFileBN <- basename(layoutFileMatches)
# ==== .fr2 Layout Files ====
fr2StudentFileOut <- NULL # default
if (fr2Specified) { # if specified, ensure it's the student level .fr2 file
fr2BN <- toupper(basename(frPath))
isStudentFR2 <- substr(fr2BN, 5, 5) == "T"
}
if (fr2Specified && isStudentFR2) {
fr2StudentFileOut <- frPath
} else { # do the search
fileRegex <- paste0("^", substr(fileBN_NoExt, 1, 4), "T", substr(fileBN_NoExt, 6, 99), "[.]fr2$") # the 'T' signifies the student level file in the 5th character index
foundFiles <- layoutFileMatches[grepl(fileRegex, layoutFileBN, ignore.case = TRUE)]
if (length(foundFiles) > 0) {
fr2StudentFileOut <- foundFiles[1] # default to first if more than one, but should only be one
}
}
fr2SchoolFileOut <- NULL # default
if (searchSchool) {
fileRegex <- paste0("^", substr(fileBN_NoExt, 1, 4), "C", substr(fileBN_NoExt, 6, 99), "[.]fr2$") # the 'C' signifies the school level file in the 5th character index
foundFiles <- layoutFileMatches[grepl(fileRegex, layoutFileBN, ignore.case = TRUE)]
if (length(foundFiles) > 0) {
fr2SchoolFileOut <- foundFiles[1] # default to first if more than one, but should only be one
}
}
# ==== .xml Layout Files ====
xmlStudentFileOut <- NULL # default
if (xmlSpecified) { # if specified, ensure it's the student level .fr2 file
xmlBN <- toupper(basename(xmlPath))
isStudentXML <- substr(xmlBN, 4, 4) == "T"
}
if (xmlSpecified && isStudentXML) {
xmlStudentFileOut <- frPath
} else { # do the search
fileRegex <- paste0("^", substr(fileBN_NoExt, 1, 4), "T", substr(fileBN_NoExt, 6, 99), "[.]xml$") # the 'T' signifies the student level file in the 5th character index
foundFiles <- layoutFileMatches[grepl(fileRegex, layoutFileBN, ignore.case = TRUE)]
if (length(foundFiles) > 0) {
xmlStudentFileOut <- foundFiles[1] # default to first if more than one, but should only be one
}
}
xmlSchoolFileOut <- NULL # default
if (searchSchool) {
fileRegex <- paste0("^", substr(fileBN_NoExt, 1, 4), "C", substr(fileBN_NoExt, 6, 99), "[.]xml$") # the 'C' signifies the school level file in the 5th character index
foundFiles <- layoutFileMatches[grepl(fileRegex, layoutFileBN, ignore.case = TRUE)]
if (length(foundFiles) > 0) {
xmlSchoolFileOut <- foundFiles[1] # default to first if more than one, but should only be one
}
}
# recalc the file description with the true student file
datFileDesc <- descriptionOfFile(toupper(sub("[.]dat$", "", basename(studentFileOut), ignore.case = TRUE)))
yr <- datFileDesc$Year
# determine what layout method to use based on what was passed, as well as what is available
if (fr2Specified && !is.null(fr2StudentFileOut)) {
layoutMethod <- "FR2"
} else if (xmlSpecified && !is.null(xmlStudentFileOut)) {
layoutMethod <- "XML"
} else {
# determine default if not specified based on the file year, starting NAEP 2022 the XML file will be the default method as the .FR2 file is phased out
if (!is.null(fr2StudentFileOut) && (yr < 2022)) { # FR2 is still preferred method if neither is specified for now, can be swapped later if designed to have XML as default
layoutMethod <- "FR2"
} else if (!is.null(xmlStudentFileOut) && (yr >= 2022)) {
layoutMethod <- "XML"
} else if (!is.null(fr2StudentFileOut)) {
layoutMethod <- "FR2"
} else if (!is.null(xmlStudentFileOut)) {
layoutMethod <- "XML"
} else {
stop(paste0(
"Unable to locate a matching .fr2 or .xml layout file for: ", dQuote(path), "\n",
"Try setting the ", dQuote("frPath"), " or ", dQuote("xmlPath"), " parameter to the known layout file location."
))
}
}
# output list
retList <- list(
FileDesc = datFileDesc,
StudentDAT = studentFileOut,
StudentFR2 = fr2StudentFileOut,
StudentXML = xmlStudentFileOut,
SchoolDAT = schoolFileOut,
SchoolFR2 = fr2SchoolFileOut,
SchoolXML = xmlSchoolFileOut,
FR2Specified = fr2Specified,
XMLSpecified = xmlSpecified,
PreferredLayoutMethod = layoutMethod
)
return(retList)
}
appendIRTAttributes_NAEP <- function(esdf, year, subject, region, level, fr2Path, filePathInfo) {
# grab only the digits from the level (e.g. Grade 4 ==> 4)
levelVal <- as.integer(regmatches(level, regexpr("\\d{1,2}", level, ignore.case = TRUE)))
levelType <- tolower(regmatches(level, regexpr("\\w+", level, ignore.case = TRUE)))
# use NAEPirtparams package and filter to needed params
params <- NAEPirtparams::parameters
# filter parameters for our file details
paramsFilter <- params[params$year == year & tolower(params$subject) == tolower(subject) & params$level == levelVal & tolower(params$levelType) == levelType, ]
hasNAEPirtparams <- nrow(paramsFilter) > 0 # eval to T/F
if (hasNAEPirtparams) {
paramsFilter$NAEPid <- tolower(paramsFilter$NAEPid)
if ("accom" %in% paramsFilter$accommodations) {
# Everything no-accom
paramsFilter <- paramsFilter[!paramsFilter$accommodations == "accom", ]
} else {
paramsFilter <- paramsFilter[!paramsFilter$accommodations == "accom", ]
}
if (tolower(region) == "state") {
paramsFilter <- paramsFilter[tolower(paramsFilter$assessmentCode) == "state", ]
} else {
# This is National or not indicated
paramsFilter <- paramsFilter[!tolower(paramsFilter$assessmentCode) == "state", ]
}
# no parameters exist for thie file, break early
if (nrow(paramsFilter) == 0) {
return(esdf)
}
# get deleted and/or adjusted items
adjust <- NAEPirtparams::adjustments
adjustData <- adjust[adjust$level == levelVal & tolower(adjust$subject) == tolower(subject) & adjust$year == year, ]
if ("accom" %in% adjustData$accommodations) {
# Everything no-accom for now
adjustData <- adjustData[!adjustData$accommodations == "accom", ]
} else {
adjustData <- adjustData[!adjustData$accommodations == "accom", ]
}
deletedItems <- tolower(adjustData[adjustData$adjustment == "Deleted", "NAEPid"])
deletedItems <- deletedItems[deletedItems %in% colnames(esdf)]
adjustedData <- adjustData[adjustData$adjustment == "Collapsed", c("NAEPid", "from", "to")]
adjustedData$NAEPid <- tolower(adjustedData$NAEPid)
if (length(deletedItems) > 0) {
paramsFilter <- paramsFilter[!paramsFilter$NAEPid %in% deletedItems, ]
}
# get paramTabs
paramTabs <- naepParamTabs(paramsFilter)
polyParamTab <- paramTabs$polyParamTab
dichotParamTab <- paramTabs$dichotParamTab
# building testDat
# filter transformation constants
transf <- NAEPirtparams::transformations
transFilter <- transf[transf$level == levelVal & transf$year == year & tolower(transf$subject) == tolower(subject), ]
if ("accom" %in% transFilter$accommodations) {
# Everything no-accom for now
transFilter <- transFilter[!transFilter$accommodations == "accom", ]
} else {
transFilter <- transFilter[!transFilter$accommodations == "accom", ]
}
if ("State" == region) {
transFilter <- transFilter[tolower(transFilter$assessmentCode) == "state", ]
} else {
# This is National or not indicated
transFilter <- transFilter[!tolower(transFilter$assessmentCode) == "state", ]
}
# filter to testDat
testDat <- transFilter[, c("subtest", "location", "scale", "subtestWeight")]
# more than one test scale
if (nrow(testDat) > 1 && anyNA(testDat$subtestWeight)) {
compositeVars <- grepl("(composite|univariate)", testDat$subtest, ignore.case = TRUE)
if (any(compositeVars)) {
testDat <- testDat[!(compositeVars & is.na(testDat$subtestWeight)), ] # drop any composite vars with no subtest weight (ensure ! is evaluated after the & with the quotes!)
}
}
# test that if there are subtest weights, that they equal up to 100%
if (!all(is.na(testDat$subtestWeight))) {
testSum <- sum(testDat$subtestWeight[!is.na(testDat$subtestWeight)])
if (abs(testSum - 1) > 0.02) {
warning(paste0("The IRT Test subscales don't equal 100% when it is expected."))
}
}
# default scoreCard
scoreCard <- defaultNAEPScoreCard()
# check that there are no duplicate items in your paramTabs
hasPolyDupes <- anyDuplicated(unique(polyParamTab[, c("ItemID", "subtest")]))
hasDichotDupes <- anyDuplicated(unique(polyParamTab[, c("ItemID", "subtest")]))
hasTestDupes <- anyDuplicated(unique(polyParamTab[, c("subtest")]))
if (hasPolyDupes > 0 || hasDichotDupes > 0 || hasTestDupes > 0) {
# THIS SHOULD NEVER HIT SINCE WE ARE ONLY USING NAEPirtparameters package
warning("You have duplicate data in your NAEP IRT parameters table(s). Use a DCT file with the function setNAEPScoreCard to use mml.sdf.")
return(esdf) # return unmodified edsurvey.data.frame
} else {
# NAEP score card
scoreDict <- getNAEPScoreCard(fr2Path, polyParamTab$ItemID, dichotParamTab$ItemID, adjustedData, scoreCard)
# update dichotparamtab: 1/k for missing value
dichotScores <- scoreDict[scoreDict$key %in% dichotParamTab$ItemID & !scoreDict$answer %in% defaultNAEPScoreCard()$resCat, ]
if (nrow(dichotScores) > 0) {
Ks <- aggregate(answer ~ key, dichotScores, length)
Ks$answer <- 1 / Ks$answer
dichotParamTab <- merge(dichotParamTab, Ks, by.x = "ItemID", by.y = "key", all.x = TRUE)
dichotParamTab$missingValue <- ifelse(is.na(dichotParamTab$answer), dichotParamTab$missingValue, dichotParamTab$answer)
dichotParamTab$answer <- NULL
}
} # end if (hasPolyDupes > 0 || hasDichotDupes > 0 || hasTestDupes > 0)
# check if composite/overall and add that in as the "test" for the param tabs
pvvars <- names(esdf$pvvars)
pvvars <- pvvars[!grepl("(_theta|_linking|_ap)$", pvvars, ignore.case = TRUE)] # remove thetas, linking, and accomodation pvvars from checks!
compositePV <- pvvars[grepl("(composite|composite_ap|univariate|overall|scale_compos)", pvvars, ignore.case = TRUE)] # should only have 1 composite record!
if (length(compositePV) > 0 && nrow(testDat) > 0) {
if (length(compositePV) > 1) {
warning(paste0("Multiple Composite Plausible Values found! Only first one will be used."))
compositePV <- compositePV[1]
}
if (!any(compositePV %in% testDat$subtest)) {
if (nrow(dichotParamTab) > 0) {
dichotParamTab$test <- compositePV
} else {
dichotParamTab$test <- character(0)
}
if (nrow(polyParamTab) > 0) {
polyParamTab$test <- compositePV
} else {
polyParamTab$test <- character(0)
}
testDat$test <- compositePV
}
}
# set the scorePoints for the polyParamTab here
polyScorePoints <- unique(scoreDict[scoreDict$key %in% polyParamTab$ItemID, c("key", "scorePoints")])
polyParamTab <- merge(polyParamTab, polyScorePoints, by.x = "ItemID", by.y = "key", all.x = TRUE)
# get unique list of the subtest/subscales for validation
scaleNames <- unique(c(testDat$subtest, testDat$test))
# validate that all of test names match the pvvar names
if (!all(pvvars %in% scaleNames)) {
badPV <- pvvars[!(pvvars %in% scaleNames)]
badScales <- scaleNames[!(scaleNames %in% pvvars)]
if (length(badPV) > 0) {
fixDF <- data.frame(
pvName = c(
"alg_fnctns", "alg_functns",
"da_stat_prob", "data_an_stat", "data_anal_prob", "data_analstat", "data_anal_stat",
"envsociety",
"measurementgeom",
"num_oper",
"spaceplace",
"spatial_dyn",
"world_role"
),
scaleName = c(
"algebra", "algebra",
"data", "data", "data", "data", "data",
"environment",
"measurement",
"number",
"space",
"spatial",
"world"
)
)
fixDF <- fixDF[fixDF$pvName %in% badPV & fixDF$scaleName %in% badScales, ]
for (i in seq_len(nrow(fixDF))) {
testDat$subtest[testDat$subtest == fixDF[i, "scaleName"]] <- fixDF[i, "pvName"]
dichotParamTab$subtest[dichotParamTab$subtest == fixDF[i, "scaleName"]] <- fixDF[i, "pvName"]
polyParamTab$subtest[polyParamTab$subtest == fixDF[i, "scaleName"]] <- fixDF[i, "pvName"]
}
# recheck again if any issues still present
scaleNames <- unique(c(testDat$subtest, testDat$test))
badPV <- pvvars[!(pvvars %in% scaleNames)]
badScales <- scaleNames[!(scaleNames %in% pvvars)]
if (length(badPV) > 0 && length(badScales) > 0) {
warning(paste0("Mismatched Plausible Value names (", paste0(sQuote(badPV), collapse = ", "), ") and IRT Scale names (", paste0(sQuote(badScales), collapse = ", "), ")!"))
}
} # end if(length(badPV) > 0)
} # end if (!all(pvvars %in% scaleNames))
# set mml.sdf related objects as attributes
esdf <- setAttributes(esdf, "scoreCard", scoreCard)
esdf <- setAttributes(esdf, "dichotParamTab", dichotParamTab)
esdf <- setAttributes(esdf, "polyParamTab", polyParamTab)
esdf <- setAttributes(esdf, "adjustedData", adjustedData)
esdf <- setAttributes(esdf, "testData", testDat)
esdf <- setAttributes(esdf, "scoreDict", scoreDict)
esdf <- setAttributes(esdf, "scoreFunction", scoreDefault)
} # end if (year %in% unique(params$year) & subject %in% unique(params$subject) & level %in% unique(params$level))
# search for an associated .dct file and apply it if found
if (!hasNAEPirtparams) {
searchDir <- file.path(dirname(filePathInfo$StudentDAT), "../AM") # search AM folder specifically as STATA files have .dct format that's completely different
searchFN <- gsub("[.]dat$", "", basename(filePathInfo$StudentDAT), ignore.case = TRUE)
dctFP <- list.files(searchDir, paste0("^", searchFN, "[.]dct$"), full.names = TRUE, ignore.case = TRUE, recursive = TRUE)
if (length(dctFP) > 0) {
dctFP <- dctFP[1] # use the first one if multiple found(they should be the same)
tryCatch(esdf <- suppressWarnings(setNAEPScoreCard(esdf, dctFP)), error = function(e) {}) # use this function to set the IRT attributes, check why getNAEPScorecard throwing NA coercion warnings?
}
}
return(esdf) # return original if data not found in NAEPirtparams, or the modified esdf if data located
}
#' default scorer
#' scores column on edf identified by polyParamTab$ItemID, dichotParamTab$ItemID using a crosswalk in scoreDict
#' @title Assessment scoring
#'
#' @description Score assessments
#'
#' @param edf the data
#' @param polyParamTab see \code{\link{mml.sdf}}
#' @param dichotParamTab see \code{\link{mml.sdf}}
#' @param scoreDict a data frame; see Details.
#' @details
#' the \code{scoreDict} is a data frame in long format with columns \code{key}, \code{answer}, and \code{score}.
#' the function maps, within the item identified by \code{key} from \code{answer} to \code{score}.
#'
#' @return a data frame with the columns in the \code{scoreDict} \code{key} column mapped from \code{answer} to \code{score}.
#'
#' @seealso \code{\link{mml.sdf}}
#' @author Paul Bailey and Tom Fink
#'
#' @export
scoreDefault <- function(edf, polyParamTab, dichotParamTab, scoreDict) {
# for NAEP
edf$score <- NULL
for (item in c(polyParamTab$ItemID, dichotParamTab$ItemID)) {
itemDict <- scoreDict[scoreDict$key %in% item, c("answer", "score")]
if (nrow(itemDict) < 1) {
stop("could not score item ", item, " check that it is on the scoreDict")
}
edf <- merge(edf, itemDict, by.x = item, by.y = "answer", all.x = TRUE, all.y = FALSE)
# update to score
edf[, item] <- edf$score
edf$score <- NULL
}
return(edf)
}
# determines the .FR2 file layout and returns a list of vectors with the .fr2 file details (e.g., Variable Names, Start Position, Width, End Position, Decimal, Labels, Num Values)
getNAEP_FR2Specs <- function(txt) {
# original fr2 file column specifications
variableName <- trimws(substring(txt, 1, 8)) # name of the variable
Start <- trimws(substring(txt, 9, 12)) # start column in file
Width <- trimws(substring(txt, 13, 14)) # number of characters in variable
Decimal <- trimws(substring(txt, 15, 15)) # digits (from the right) to be considered decimals
Labels <- trimws(substring(txt, 21, 70)) # variable label
scoreKey <- trimws(substring(txt, 71, 80)) # score key (e.g. 00010, 0100, 1000)
NumValue <- trimws(substring(txt, 89, 90)) # number of numeric codes (e.g. one could would be 1="Yes")
startOK <- all(grepl("\\d+", Start))
widthOK <- all(grepl("\\d+", Width[nchar(Width) > 0]))
decimalOK <- all(grepl("(\\d+|[*])", Decimal[nchar(Decimal) > 0])) # sometimes a '*' character is used, be sure to allow for that
numValueOK <- all(grepl("\\d+", NumValue))
if (startOK && widthOK && decimalOK && numValueOK) {
# get the value labels from the foils
labelValues <- rep("", times = length(variableName))
valLblIndex <- which(NumValue > 0, arr.ind = TRUE) # only process items with defined value labels
NumValue <- as.numeric(NumValue)
Decimal[!grepl("\\d+", Decimal)] <- NA # convert non-numeric values to NA
for (i in valLblIndex) {
# if it has numeric codes
# read in up to 26 character per code, plus 2 characters for the number
Ncodes <- NumValue[i] - 1 # be cautious here, for the seq call to work properly, a 0 value indicates 1 value label item
codeValSeq <- seq(91, (91 + Ncodes * 28), by = 28)
codeLabelSeq <- seq(93, (93 + Ncodes * 28), by = 28)
values <- as.numeric(trimws(substring(txt[i], codeValSeq, codeValSeq + 1)))
labels <- trimws(substring(txt[i], codeLabelSeq, codeLabelSeq + 19))
labelValues[i] <- paste(values, labels, collapse = "^", sep = "=")
}
return(list(
variableName = variableName,
Start = as.numeric(Start),
Width = as.numeric(Width),
End = (as.numeric(Start) + as.numeric(Width) - 1),
Decimal = as.numeric(Decimal),
Labels = Labels,
NumValue = NumValue, # converted to numeric earlier
labelValues = labelValues,
scoreKey = scoreKey,
fr2Version = 1
))
}
# the 'updated .fr2 file column specifications as of NAEP 2022 files
variableName <- trimws(substring(txt, 1, 32)) # name of the variable
Start <- trimws(substring(txt, 33, 36)) # start column in file
Width <- trimws(substring(txt, 37, 38)) # number of characters in variable
Decimal <- trimws(substring(txt, 39, 39)) # digits (from the right) to be considered decimals
Labels <- trimws(substring(txt, 45, 94)) # variable label
scoreKey <- trimws(substring(txt, 95, 104)) # score key (e.g. 00010, 0100, 1000)
NumValue <- trimws(substring(txt, 113, 114)) # number of numeric codes (e.g. one could would be 1="Yes")
startOK <- all(grepl("\\d+", Start))
widthOK <- all(grepl("\\d+", Width[nchar(Width) > 0]))
decimalOK <- all(grepl("(\\d+|[*])", Decimal[nchar(Decimal) > 0])) # sometimes a '*' character is used, be sure to allow for that
numValueOK <- all(grepl("\\d+", NumValue))
if (startOK && widthOK && decimalOK && numValueOK) {
# get the value labels from the foils
labelValues <- rep("", times = length(variableName))
valLblIndex <- which(NumValue > 0, arr.ind = TRUE) # only process items with defined value labels
NumValue <- as.numeric(NumValue)
Decimal[!grepl("\\d+", Decimal)] <- NA # convert non-numeric values to NA
for (i in valLblIndex) {
# if it has numeric codes
# read in up to 26 character per code, plus 2 characters for the number
Ncodes <- NumValue[i] - 1 # be cautious here, for the seq call to work properly, a 0 value indicates 1 value label item
codeValSeq <- seq(115, (115 + Ncodes * 28), by = 28)
codeLabelSeq <- seq(117, (117 + Ncodes * 28), by = 28)
values <- as.numeric(trimws(substring(txt[i], codeValSeq, codeValSeq + 1)))
labels <- trimws(substring(txt[i], codeLabelSeq, codeLabelSeq + 19))
labelValues[i] <- paste(values, labels, collapse = "^", sep = "=")
}
return(list(
variableName = variableName,
Start = as.numeric(Start),
Width = as.numeric(Width),
End = (as.numeric(Start) + as.numeric(Width) - 1),
Decimal = as.numeric(Decimal),
Labels = Labels,
NumValue = NumValue, # converted to numeric earlier
labelValues = labelValues,
scoreKey = scoreKey,
fr2Version = 2
))
}
# throw error if .FR2 is not expected format
stop(paste0("The NAEP .FR2 layout is corrupt and unable to be parsed. Consider trying the NAEP .XML layout version if available."))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.