#' @title Connect to HSTS Data
#'
#' @description Opens a connection to the High School Transcript Study (HSTS) data files for the year 2019.
#' Returns an \code{edsurvey.data.frame} with
#' information about the file and data.
#'
#' @param dataFilePath a character value of the root directory path of the extracted set of ASCII data files (.txt or .dat file extension).
#' \code{readHSTS} will search within sub-directories of this parameter for expected data files based on the specified \code{year} parameter.
#'
#' @param spssPrgPath a character value of the directory path where the extracted set of .sps program files is located.
#' The data file and associated SPSS program filenames *must match* (aside from their file extensions) so that the associated files can be paired.
#' \code{readHSTS} will search within sub-directories of this parameter for the expected SPSS program files based on the specified \code{year} parameter.
#'
#' @param year a character value to indicate the year of the dataset. Only one year is supported for a single \code{readHSTS} data call.
#' The year is required to help determine specific study information. Only the 2019 study is currently supported.
#'
#' @param verbose a logical value that determines whether \code{readHSTS} prints progress messages while it runs.
#' The default value is \code{TRUE}.
#'
#' @details The HSTS data has a complex structure and unique characteristics that are all handled internally within \code{EdSurvey}.
#' The structure allows for automatic dynamic linking across the various data 'levels' based on the requested variables. The \code{student} data level is the primary analysis unit.
#' Dynamic linking for variables that include both \code{tests} and \code{transcript} level details will result in an error, as they cannot be simultaneously returned in a single call.
#' Situations may arise where the analyst must derive variables for analysis. See the documentation for \code{merge} and \code{$<-} functions for more detail. All merge operations are done at the \code{student} level (the main analysis unit).
#'
#' File Layout for HSTS 2019:
#' \itemize{
#' \item School (school.dat) - School level variables.
#' \itemize{
#' \item School Catalog (catalog.dat) - Catalog variables joined to School data. Variables are renamed to begin with \code{schcat_} to distinguish them from the Transcript Catalog. Cannot be merged with any \code{Student} data.
#' }
#' \item Student (student.dat) - Student level variables. Primary analysis unit, all merged/cached data must be at this level.
#' \itemize{
#' \item NAEP Math (naepmath.dat) - Subset of students containing NAEP Math variables. Variables begin with \code{math_} to ensure they are unique from the NAEP Science variables.
#' \item NAEP Science (naepsci.dat) - Subset of students containing NAEP Science variables. Variables begin with \code{sci_} to ensure they are unique from the NAEP Math variables.
#' \item Tests (tests.dat) - Students may have many test records. Contains ACT/SAT testing score details for students. Cannot be merged together with any Transcript or Transcript Catalog data.
#' \item Transcripts (trnscrpt.dat) - Students may have many transcript records. Contains transcript level details. Cannot be merged together with Test data.
#' \itemize{
#' \item Transcript Catalog (catalog.dat) - Each transcript record is associated with a catalog record to give context to the transcript record. 2019 uses \href{https://nces.ed.gov/scedfinder}{SCED codes} for categorizing courses.
#' }
#' }
#' }
#' @return
#' an \code{edsurvey.data.frame} for the HSTS dataset.
#'
#' @seealso \code{\link{showCodebook}}, \code{\link{searchSDF}}, \code{\link{edsurvey.data.frame}}, \code{\link{merge.edsurvey.data.frame}}, and \code{\link{getData}}
#' @author Tom Fink
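#' @examples
#' \dontrun{
#' # a minimal usage sketch; the directory paths here are hypothetical
#' hsts <- readHSTS(dataFilePath = "~/HSTS/2019/data",
#'                  spssPrgPath = "~/HSTS/2019/spss",
#'                  year = "2019")
#' dim(hsts)
#' }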
#' @export
readHSTS <- function(dataFilePath = getwd(),
spssPrgPath = dataFilePath,
year = c("2019"),
verbose = TRUE) {
# temporarily adjust any necessary option settings; revert back when done
userOp <- options(OutDec = ".")
on.exit(options(userOp), add = TRUE)
dataFilePath <- suppressWarnings(normalizePath(unique(dataFilePath), winslash = "/"))
spssPrgPath <- suppressWarnings(normalizePath(unique(spssPrgPath), winslash = "/"))
# gather datafiles and .sps syntax files
txtDataFiles <- list.files(path = dataFilePath, pattern = "[.](txt|dat)$", full.names = TRUE, recursive = TRUE, include.dirs = TRUE)
spssFormatFiles <- list.files(path = spssPrgPath, pattern = "[.]sps$", full.names = TRUE, recursive = TRUE, include.dirs = TRUE)
linkingErrorFiles <- list.files(path = unique(c(dataFilePath, spssPrgPath)), pattern = "^Linking_Factors_HSTS[.]xlsx$", full.names = TRUE, recursive = TRUE, include.dirs = TRUE)
# validate that we have data and SPSS program files
if (length(txtDataFiles) == 0 || is.null(txtDataFiles)) {
stop(paste0("Cannot find any data files in path ", sQuote(dataFilePath), "."))
}
if (length(spssFormatFiles) == 0 || is.null(spssFormatFiles)) {
stop(paste0("Cannot find any SPSS (.sps) files in path ", sQuote(spssPrgPath), "."))
}
year <- match.arg(as.character(year), c("2019"), several.ok = FALSE)
# get the filenames without the extension
fnTxt <- tolower(gsub("[.](txt|dat)$", "", basename(txtDataFiles), ignore.case = TRUE))
fnSPSS <- tolower(gsub("[.]sps$", "", basename(spssFormatFiles), ignore.case = TRUE))
chkFiles <- getHSTS_DataFileNames(year)
if (year == 2019) {
fileFound <- chkFiles %in% fnTxt
if (!all(fileFound)) {
stop(paste0("Expected 2019 HSTS .dat Data File(s) not found: ", pasteItems(chkFiles[!fileFound])))
}
layoutFound <- chkFiles %in% fnSPSS
if (!all(layoutFound)) {
stop(paste0("Expected 2019 HSTS .sps SPSS Script File(s) not found: ", pasteItems(chkFiles[!layoutFound])))
}
spsFileEnc <- "UTF8"
# special case for 2019 where we want to create essentially TWO catalogs (one for School and one for Transcripts)
chkFiles <- c(chkFiles, "school_catalog")
txtDataFiles <- c(txtDataFiles, txtDataFiles[which(grepl("catalog", fnTxt, ignore.case = TRUE))])
fnTxt <- c(fnTxt, "school_catalog")
spssFormatFiles <- c(spssFormatFiles, spssFormatFiles[which(grepl("catalog", fnSPSS, ignore.case = TRUE))])
fnSPSS <- c(fnSPSS, "school_catalog")
}
# placeholders to store LaF objects and the parsed File Format objects from the .sps file
spssFileFormat <- vector(mode = "list", length = length(txtDataFiles))
lafObj <- vector(mode = "list", length = length(txtDataFiles))
for (i in seq_along(chkFiles)) {
spssIn <- spssFormatFiles[which(fnSPSS == chkFiles[i])[1]] # match by filename rather than position in case extra files were found in the search
datIn <- txtDataFiles[which(fnTxt == chkFiles[i])[1]]
ff <- parseScript_SPSS(spssIn, verbose = FALSE, outputFormat = "data.frame", encoding = spsFileEnc)
ff$variableName <- tolower(ff$variableName)
# special case for 2019 where field names can be same between both math and science naep files
if (chkFiles[i] %in% c("naepmath") && year == 2019) {
ff$variableName <- ifelse(ff$variableName %in% c("schoolid", "studenti"), ff$variableName, paste0("math_", ff$variableName)) # exclude merge vars, but ensure var names are unique across subjects
}
if (chkFiles[i] %in% c("naepsci") && year == 2019) {
ff$variableName <- ifelse(ff$variableName %in% c("schoolid", "studenti"), ff$variableName, paste0("sci_", ff$variableName)) # exclude merge vars, but ensure var names are unique across subjects
}
if (chkFiles[i] %in% c("school_catalog") && year == 2019) {
ff$variableName <- ifelse(ff$variableName %in% c("schoolid"), ff$variableName, paste0("schcat_", ff$variableName)) # exclude merge vars, but ensure var names are unique across subjects
}
lo <- laf_open_fwf(datIn, ff$dataType, ff$Width, ff$variableName)
lafObj[[i]] <- lo
names(lafObj)[i] <- chkFiles[i]
spssFileFormat[[i]] <- ff
names(spssFileFormat)[i] <- chkFiles[i]
}
if (year == 2019) {
dataList <- buildHSTS_2019_dataList(fnTxt, lafObj, spssFileFormat)
# precalculate dim0 for speed so the edsurvey.data.frame constructor doesn't recompute it
studentN <- dataList$Student$nrow
allCols <- unique(unlist(lapply(dataList, function(x){x$fileFormat$variableName})))
colN <- length(allCols) + 1 # add one for the ROWID from the 'cache' to match colnames()
dim0 <- c(studentN, colN) # passed to the edsurvey.data.frame constructor
linkingErrorSpecs <- NULL
if (length(linkingErrorFiles) > 0 && file.exists(linkingErrorFiles[1])) {
if (length(linkingErrorFiles) > 1) {
warning("Multiple HSTS 2019 Linking Error Data files located; only the first will be used.")
}
linkingErrorSpecs <- getHSTS19_LinkingErrorSpecs(linkingErrorFiles[1])
}
}
yrAttrib <- getDataAttributes_HSTS(year) #year specific attributes
esdf <- edsurvey.data.frame(userConditions = list(),
defaultConditions = NULL,
dataList = dataList,
weights = yrAttrib$weights,
pvvars = yrAttrib$pvvars,
subject = c("Mathematics", "Science"),
year = year,
assessmentCode = "Transcript Study",
dataType = "Transcript Study",
gradeLevel = "High School",
achievementLevels = yrAttrib$achievementLevels,
omittedLevels = yrAttrib$omittedLevels,
survey = "HSTS",
country = "USA",
psuVar = yrAttrib$psuVar,
stratumVar = yrAttrib$stratumVar,
jkSumMultiplier = 1,
validateFactorLabels = FALSE, #the validateFactorLabels will check in `getData` if all values have a defined label, any missing labels will be automatically added.
reqDecimalConversion = FALSE, #decimal conversion is not needed
dim0 = dim0,
cacheDataLevelName = "Student")
# apply the linking errors to the edsurvey.data.frame
if (!is.null(linkingErrorSpecs) && year == 2019) {
# math linking error
esdf <- linkVarAugmentAB(data = esdf,
linkingErrorSpecs = linkingErrorSpecs$math,
subscaleWeights = c(0.1, 0.3, 0.25, 0.35),
subscales = c("math_num_oper", "math_meas_geo", "math_data_anal", "math_algebra"),
composite = "math_composite",
dbapbaVarName = "math_dbapba",
weightVar = "math_finlnkwt")
# science linking error
esdf <- linkVarAugmentAB(data = esdf,
linkingErrorSpecs = linkingErrorSpecs$sci,
subscaleWeights = c(1),
subscales = c("sci_univariate"),
composite = "sci_univariate",
dbapbaVarName = "sci_dbapba",
weightVar = "sci_finlnkwt")
}
return(esdf)
}
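# Builds the hierarchical dataList for HSTS 2019. A brief orientation, based on the
# calls below: each dataListItem declares how a level merges up to its parent
# (parentMergeLevels/parentMergeVars), which inherited variable names to ignore
# (ignoreVars), and which levels it can never be combined with in a single getData
# call (conflictLevels); isDimLevel marks the Student level as the main analysis unit.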
buildHSTS_2019_dataList <- function(filenames, lafList, ffList) {
# for HSTS the main STUDENT file will be the 'base' level
# The 'catalog' and 'transcript' data cannot be commingled in a single data call
dataList <- list()
dataList[["School"]] <- dataListItem(
lafObject = lafList[["school"]],
fileFormat = ffList[["school"]],
levelLabel = "School",
forceMerge = FALSE,
parentMergeLevels = NULL,
parentMergeVars = NULL,
mergeVars = NULL,
ignoreVars = NULL,
isDimLevel = FALSE
)
iVar <- unique(c(ffList[["school"]]$variableName))
iVar <- intersect(iVar, ffList[["school_catalog"]]$variableName)
dataList[["School_Catalog"]] <- dataListItem(
lafObject = lafList[["school_catalog"]],
fileFormat = ffList[["school_catalog"]],
levelLabel = "School_Catalog",
forceMerge = FALSE,
parentMergeLevels = c("School"),
parentMergeVars = c("schoolid"),
mergeVars = c("schoolid"),
ignoreVars = iVar,
conflictLevels = c(
"Student", "NAEP_Math", "NAEP_Sci",
"Test", "Transcript", "Transcript_Catalog"
),
isDimLevel = FALSE
)
# The BASE data level (main unit of analysis)
iVar <- unique(c(ffList[["school"]]$variableName, ffList[["school_catalog"]]$variableName))
iVar <- intersect(iVar, ffList[["student"]]$variableName)
dataList[["Student"]] <- dataListItem(
lafObject = lafList[["student"]],
fileFormat = ffList[["student"]],
levelLabel = "Student",
forceMerge = FALSE,
parentMergeLevels = c("School"),
parentMergeVars = c("schoolid"),
mergeVars = c("schoolid"),
ignoreVars = iVar,
isDimLevel = TRUE
)
# get ignore vars
iVar <- unique(c(ffList[["student"]]$variableName))
iVar <- intersect(iVar, ffList[["naepmath"]]$variableName)
dataList[["NAEP_Math"]] <- dataListItem(
lafObject = lafList[["naepmath"]],
fileFormat = ffList[["naepmath"]],
levelLabel = "NAEP_Math",
forceMerge = FALSE,
parentMergeLevels = c("Student"),
parentMergeVars = c("studenti"),
mergeVars = c("studenti"),
ignoreVars = iVar,
isDimLevel = FALSE
)
iVar <- unique(c(
ffList[["student"]]$variableName,
ffList[["naepmath"]]$variableName
))
iVar <- intersect(iVar, ffList[["naepsci"]]$variableName)
dataList[["NAEP_Science"]] <- dataListItem(
lafObject = lafList[["naepsci"]],
fileFormat = ffList[["naepsci"]],
levelLabel = "NAEP_Science",
forceMerge = FALSE,
parentMergeLevels = c("Student"),
parentMergeVars = c("studenti"),
mergeVars = c("studenti"),
ignoreVars = iVar,
isDimLevel = FALSE
)
# TEST DATA CANNOT BE SIMULTANEOUSLY MERGED WITH CATALOG OR TRANSCRIPT DATA
iVar <- unique(c(
ffList[["school"]]$variableName,
ffList[["student"]]$variableName,
ffList[["naepmath"]]$variableName, ffList[["naepsci"]]$variableName
))
iVar <- intersect(iVar, ffList[["tests"]]$variableName)
dataList[["Test"]] <- dataListItem(
lafObject = lafList[["tests"]],
fileFormat = ffList[["tests"]],
levelLabel = "Test",
forceMerge = FALSE,
parentMergeLevels = c("Student"),
parentMergeVars = c("studenti"),
mergeVars = c("studenti"),
ignoreVars = iVar,
conflictLevels = c("Transcript", "Transcript_Catalog", "School_Catalog"),
isDimLevel = FALSE
)
# CATALOG DATA CANNOT BE SIMULTANEOUSLY MERGED WITH TEST OR TRANSCRIPT DATA
iVar <- unique(c(
ffList[["school"]]$variableName,
ffList[["student"]]$variableName,
ffList[["naepmath"]]$variableName, ffList[["naepsci"]]$variableName,
ffList[["catalog"]]$variableName
))
iVar <- intersect(iVar, ffList[["trnscrpt"]]$variableName)
dataList[["Transcript"]] <- dataListItem(
lafObject = lafList[["trnscrpt"]],
fileFormat = ffList[["trnscrpt"]],
levelLabel = "Transcript",
forceMerge = FALSE,
parentMergeLevels = c("Student"),
parentMergeVars = c("studenti"),
mergeVars = c("studenti"),
ignoreVars = iVar,
conflictLevels = c("Test", "School_Catalog"),
isDimLevel = FALSE
)
iVar <- unique(c(
ffList[["school"]]$variableName,
ffList[["student"]]$variableName,
ffList[["naepmath"]]$variableName, ffList[["naepsci"]]$variableName
))
iVar <- intersect(iVar, ffList[["catalog"]]$variableName)
# duplicate variable names that we want to keep rather than ignore; they are renamed with a trncat_ prefix below
renameVar <- c("catsrce", "cattype")
iVar <- iVar[!iVar %in% renameVar]
#rename LaF object column names
idx <- which(lafList[["catalog"]]@column_names %in% renameVar)
lafList[["catalog"]]@column_names[idx] <- paste0("trncat_", lafList[["catalog"]]@column_names[idx])
#rename FileFormat variable names
idx <- which(ffList[["catalog"]]$variableName %in% renameVar)
ffList[["catalog"]]$variableName[idx] <- paste0("trncat_", ffList[["catalog"]]$variableName[idx])
dataList[["Transcript_Catalog"]] <- dataListItem(lafObject = lafList[["catalog"]],
fileFormat = ffList[["catalog"]],
levelLabel = "Transcript_Catalog",
forceMerge = FALSE,
parentMergeLevels = c("Transcript"),
parentMergeVars = c("catlogid"),
mergeVars = c("catlogid"),
ignoreVars = iVar,
conflictLevels = c("Test", "School_Catalog"),
isDimLevel = FALSE)
return(dataList)
}
getHSTS_DataFileNames <- function(year) {
# define a named list that we will try to grab with the "year" being the name
chkList <- list("2019" = c("catalog", "naepmath", "naepsci", "school", "student", "tests", "trnscrpt"))
year <- as.character(year) # coerce to character for name check
res <- chkList[[year]]
if (is.null(res)) {
stop("Undefined Year Specified in getHSTS_DataFileNames: ", dQuote(year), ".")
}
return(res)
}
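# Returns the year-specific study attributes (weights, plausible-value definitions,
# achievement levels, omitted levels, and PSU/stratum variables) used when
# constructing the edsurvey.data.frame in readHSTS.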
getDataAttributes_HSTS <- function(year) {
weights <- list()
pvvars <- list()
aL <- list()
omittedLevels <- c()
psuVar <- c()
stratumVar <- c()
if(year == 2019) {
# get weight details; 2019 doesn't include any Taylor-series PSU/stratum variables and only has JK variance
weights[["finstuwt"]] <- list(jkbase = "repwt", jksuffixes = as.character(1:62)) #student file (unlinked weight)
weights[["math_finlnkwt"]] <- list(jkbase = "math_lrepwt", jksuffixes = as.character(1:62)) #math linked weight (for math NAEP plausible values)
weights[["sci_finlnkwt"]] <- list(jkbase = "sci_lrepwt", jksuffixes = as.character(1:62)) #science linked weight (for science NAEP plausible values)
# don't set a default weight, forcing user selection in analysis functions; do this by setting an empty string value
attributes(weights)$default <- ""
#mathematics
alMath <- achievementLevelsHelp("Grade 12", "2019", "Mathematics", "N") #from descriptionOfFile.R file used for readNAEP
pvvars[["math_num_oper"]] <- list(varnames = paste0("math_pv1", sprintf("%02d", 1:20)), achievementLevel = alMath)
pvvars[["math_meas_geo"]] <- list(varnames = paste0("math_pv2", sprintf("%02d", 1:20)), achievementLevel = alMath)
pvvars[["math_data_anal"]] <- list(varnames = paste0("math_pv3", sprintf("%02d", 1:20)), achievementLevel = alMath)
pvvars[["math_algebra"]] <- list(varnames = paste0("math_pv4", sprintf("%02d", 1:20)), achievementLevel = alMath)
pvvars[["math_composite"]] <- list(varnames = paste0("math_pvc", sprintf("%02d", 1:20)), achievementLevel = alMath)
# science
alSci <- achievementLevelsHelp("Grade 12", "2019", "Science", "N") # from descriptionOfFile.R file used for readNAEP
pvvars[["sci_univariate"]] <- list(varnames = paste0("sci_pvun", paste0(1:20)), achievementLevel = alSci)
aL <- list(Mathematics = alMath,
Science = alSci)
omittedLevels <- c(NA, "(Missing)", "Missing", "Omitted", "No recorded grade point average",
"Not reported", "Multiple responses/Don't know/Omitted/Missing",
"Missing graduation credits data", "NAEP assessment not taken",
"Not applicable", "Incomplete transcript")
psuVar <- NULL
stratumVar <- NULL
}
return(list(weights = weights,
pvvars = pvvars,
achievementLevels = aL,
omittedLevels = omittedLevels,
psuVar = psuVar,
stratumVar = stratumVar))
}
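# Reads the HSTS 2019 linking-error workbook (Linking_Factors_HSTS.xlsx).
# Expected layout, as inferred from the parsing below: one sheet per math subscale
# (MRPS1, MRPS6, MRPS4, MRPS5) plus SRPS for science; each sheet carries
# Asamp*/Bsamp* columns for the replicate-weight (sampling) adjustments and
# Ameas*/Bmeas*/Cmeas* columns for the plausible-value imputation adjustments,
# with DBAPBA distinguishing digital (1) from paper (2) rows.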
getHSTS19_LinkingErrorSpecs <- function(filePath) {
math1 <- readxl::read_excel(filePath, sheet = "MRPS1") # number properties & operations
math2 <- readxl::read_excel(filePath, sheet = "MRPS6") # measurement & geometry
math3 <- readxl::read_excel(filePath, sheet = "MRPS4") # data analysis
math4 <- readxl::read_excel(filePath, sheet = "MRPS5") # algebra
sci1 <- readxl::read_excel(filePath, sheet = "SRPS")
linkErr <- list(math = vector("list", length = 4), sci = vector("list", length = 1))
names(linkErr$math) <- c("math_num_oper", "math_meas_geo", "math_data_anal", "math_algebra")
names(linkErr$sci) <- c("sci_univariate")
#for the math items (4 subscale score plausible values)
for(i in 1:4){
m <- get(paste0("math", i))
m <- setDT(m)
dba <- subset(m, m$DBAPBA == 1) #subset to digital based assessment
pba <- subset(m, m$DBAPBA == 2) #subset to paper based assessment
#build matrix for replicate weight sample adjustments (A and B with 62 records for each replicate weight)
sampdba <- data.table::melt(dba, id.vars = "DBAPBA", measure.vars = list(A = grep("^[A]samp\\d{1,2}$", colnames(dba), value = TRUE, ignore.case = TRUE),
B = grep("^[B]samp\\d{1,2}$", colnames(dba), value = TRUE, ignore.case = TRUE)),
variable.name = "JKRep")
samppba <- data.table::melt(pba, id.vars = "DBAPBA", measure.vars = list(A = grep("^[A]samp\\d{1,2}$", colnames(pba), value = TRUE, ignore.case = TRUE),
B = grep("^[B]samp\\d{1,2}$", colnames(pba), value = TRUE, ignore.case = TRUE)),
variable.name = "JKRep")
sampMatrix <- list(Digital = data.matrix(sampdba[ , c("A", "B")]),
Paper = data.matrix(samppba[ , c("A", "B")]))
# build matrices for plausible value imputation adjustments (A, B, and C, each 20 replicates by 5 RAM columns)
matAdba <- matrix(nrow = 20, ncol = 5)
matBdba <- matrix(nrow = 20, ncol = 5)
matCdba <- matrix(nrow = 20, ncol = 5)
matApba <- matrix(nrow = 20, ncol = 5)
matBpba <- matrix(nrow = 20, ncol = 5)
matCpba <- matrix(nrow = 20, ncol = 5)
for(ii in 1:5){
zdba <- data.table::melt(dba, id.vars = "DBAPBA", measure.vars = list(A = grep(paste0("^[A]meas", ii, "_\\d{1,2}$"), colnames(dba), value = TRUE, ignore.case = TRUE),
B = grep(paste0("^[B]meas", ii, "_\\d{1,2}$"), colnames(dba), value = TRUE, ignore.case = TRUE),
C = grep(paste0("^[C]meas", ii, "_\\d{1,2}$"), colnames(dba), value = TRUE, ignore.case = TRUE)),
variable.name = "PVRep")
zpba <- data.table::melt(pba, id.vars = "DBAPBA", measure.vars = list(A = grep(paste0("^[A]meas", ii, "_\\d{1,2}$"), colnames(pba), value = TRUE, ignore.case = TRUE),
B = grep(paste0("^[B]meas", ii, "_\\d{1,2}$"), colnames(pba), value = TRUE, ignore.case = TRUE),
C = grep(paste0("^[C]meas", ii, "_\\d{1,2}$"), colnames(pba), value = TRUE, ignore.case = TRUE)),
variable.name = "PVRep")
matAdba[ , ii] <- zdba$A
matBdba[ , ii] <- zdba$B
matCdba[ , ii] <- zdba$C
matApba[ , ii] <- zpba$A
matBpba[ , ii] <- zpba$B
matCpba[ , ii] <- zpba$C
} #end for(ii in 1:5)
linkErr$math[[i]] <- list(repWSamp = sampMatrix, pvvarImp = list(Digital = list(A = matAdba, B = matBdba, C = matCdba),
Paper = list(A = matApba, B = matBpba, C = matCpba)))
} #end for(i in 1:4)
#for the science items (1 univariate scale score)
for(i in 1:1){
s <- get(paste0("sci", i))
s <- setDT(s)
dba <- subset(s, s$DBAPBA == 1) #digital based assessment
pba <- subset(s, s$DBAPBA == 2) #paper based assessment
#build matrix for replicate weight sample adjustments (A and B with 62 records for each replicate weight)
sampdba <- data.table::melt(dba, id.vars = "DBAPBA", measure.vars = list(A = grep("^[A]samp\\d{1,2}$", colnames(dba), value = TRUE, ignore.case = TRUE),
B = grep("^[B]samp\\d{1,2}$", colnames(dba), value = TRUE, ignore.case = TRUE)),
variable.name = "JKRep")
samppba <- data.table::melt(pba, id.vars = "DBAPBA", measure.vars = list(A = grep("^[A]samp\\d{1,2}$", colnames(pba), value = TRUE, ignore.case = TRUE),
B = grep("^[B]samp\\d{1,2}$", colnames(pba), value = TRUE, ignore.case = TRUE)),
variable.name = "JKRep")
sampMatrix <- list(Digital = data.matrix(sampdba[ , c("A", "B")]),
Paper = data.matrix(samppba[ , c("A", "B")]))
# build matrices for plausible value imputation adjustments for dba and pba (A, B, and C, each 20 replicates by 5 RAM columns)
matAdba <- matrix(nrow = 20, ncol = 5)
matBdba <- matrix(nrow = 20, ncol = 5)
matCdba <- matrix(nrow = 20, ncol = 5)
matApba <- matrix(nrow = 20, ncol = 5)
matBpba <- matrix(nrow = 20, ncol = 5)
matCpba <- matrix(nrow = 20, ncol = 5)
for(ii in 1:5){
zdba <- data.table::melt(dba, id.vars = "DBAPBA", measure.vars = list(A = grep(paste0("^[A]meas", ii, "_\\d{1,2}$"), colnames(dba), value = TRUE, ignore.case = TRUE),
B = grep(paste0("^[B]meas", ii, "_\\d{1,2}$"), colnames(dba), value = TRUE, ignore.case = TRUE),
C = grep(paste0("^[C]meas", ii, "_\\d{1,2}$"), colnames(dba), value = TRUE, ignore.case = TRUE)),
variable.name = "PVRep")
zpba <- data.table::melt(pba, id.vars = "DBAPBA", measure.vars = list(A = grep(paste0("^[A]meas", ii, "_\\d{1,2}$"), colnames(pba), value = TRUE, ignore.case = TRUE),
B = grep(paste0("^[B]meas", ii, "_\\d{1,2}$"), colnames(pba), value = TRUE, ignore.case = TRUE),
C = grep(paste0("^[C]meas", ii, "_\\d{1,2}$"), colnames(pba), value = TRUE, ignore.case = TRUE)),
variable.name = "PVRep")
matAdba[ , ii] <- zdba$A
matBdba[ , ii] <- zdba$B
matCdba[ , ii] <- zdba$C
matApba[ , ii] <- zpba$A
matBpba[ , ii] <- zpba$B
matCpba[ , ii] <- zpba$C
} #end for(ii in 1:5)
linkErr$sci[[i]] <- list(repWSamp = sampMatrix, pvvarImp = list(Digital = list(A = matAdba, B = matBdba, C = matCdba),
Paper = list(A = matApba, B = matBpba, C = matCpba)))
} #end for(i in 1:1)
return(linkErr)
}
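# Augments an edsurvey.data.frame with linking-error variables for the DBA/PBA
# (digital/paper) mode link. In sketch, per the code below: DBA plausible values
# are transformed with subscale-specific linear constants (A * x + B), composites
# are rebuilt from the beta-weighted subscales, and new *_linking pvvars entries
# (with imputation and sampling variance variables) are appended to the data.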
linkVarAugmentAB <- function(data, linkingErrorSpecs, subscaleWeights, subscales, composite, weightVar, dbapbaVarName = "dbapba") {
PVVars <- getAttributes(data = data, attribute = "pvvars")
PVVarsDefault <- attributes(PVVars)$default
# actual variables
scaledPVs <- list()
for(i in seq_along(subscales)) {
scaledPVs <- c(scaledPVs, list(getPlausibleValue(subscales[i], data = data)))
}
names(scaledPVs) <- subscales
PVs <- getPlausibleValue(composite, data = data)
# the DBA subset
if(!dbapbaVarName %in% colnames(data)) {
stop(paste0("Cannot find variable ", dQuote(dbapbaVarName), " on data."))
}
dbapba <- getData(data, varnames = dbapbaVarName, drop = TRUE, omittedLevels = FALSE) # keep omitted levels (omittedLevels = FALSE); only a subset of HSTS students have a NAEP math or science score
DBASS <- dbapba %in% c("DBA", "Digital")
nDBA <- sum(DBASS) # number in DBA subset
# the PBA subset
PBASS <- dbapba %in% c("PBA", "Paper")
nPBA <- sum(PBASS) # number in the PBA subset
# number of subscales
K <- length(subscaleWeights)
if (K != length(scaledPVs)) {
stop(paste0("The argument ", dQuote("subscales"), " should have the same length as ", dQuote("subscaleWeights"), "; each subscale corresponds to one vector of scaled plausible values."))
}
#get the weight to be used
wList <- getAttributes(data = data, attribute = "weights")
W <- wList[[weightVar]]
repWs <- paste0(W$jkbase, W$jksuffixes)
repWsDBA <- data[DBASS, repWs]
repWsPBA <- data[PBASS, repWs]
# subscale weights called "beta" in documentation
beta <- subscaleWeights
### linking variance augmentation
# create PV data frames up front to minimize calls to getData
pvData <- data[ , PVs]
scaledPV <- data[ , unlist(scaledPVs)]
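# Sampling-variance augmentation, as implemented below: for each replicate weight i,
# the first plausible value of every DBA subscale is linearly linked (A * y + B, with
# A and B drawn from row i of the replicate-weight sheet), the composite is rebuilt
# from the beta-weighted subscales, and the PBA values pass through unchanged.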
for(i in seq_along(repWs)) {
# y1 is for DBA
y1 <- rep(0, nDBA)
# x1 is for PBA
x1 <- rep(0, nPBA)
for(k in 1:K) {
# first plausible value for subscale k, for the DBA and PBA subsets (the ith replicate-weight adjustment is applied below)
y_k <- scaledPV[DBASS, scaledPVs[[k]][1] ]
x_k <- scaledPV[PBASS, scaledPVs[[k]][1] ]
# variable transformations; the linking error spec subscales must match the ordering of the subscales object and subscaleWeights
A <- linkingErrorSpecs[[subscales[k]]]$repWSamp$Digital[i, "A"]
B <- linkingErrorSpecs[[subscales[k]]]$repWSamp$Digital[i, "B"]
data[DBASS, paste0(subscales[k], "_linking_samp_", i)] <- (A*y_k + B)
data[PBASS, paste0(subscales[k], "_linking_samp_", i)] <- x_k
# accumulate the beta-weighted, linked subscale into the composite
y1 <- y1 + beta[k] * (A*y_k + B)
x1 <- x1 + beta[k] * x_k
}
data[DBASS, paste0(composite, "_linking_samp_", i)] <- y1
compositePBA <- pvData[PBASS, PVs[1]]
data[PBASS, paste0(composite, "_linking_samp_", i)] <- compositePBA
if (max(abs(x1 - compositePBA)) > 0.02) {
warning("Inaccurate data or subscale weights; the linking error is inaccurately estimated due to low-precision inputs. Disagreement on composite of ", max(abs(x1 - compositePBA)), "\n")
}
}
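# Imputation-variance augmentation, as implemented below: the 5 RAM columns by
# 20 replicates yield 100 *_linking_imp_* variables; for cell (j, n), DBA plausible
# value j is linked with A and B from the imputation sheets, while C selects which
# PBA plausible value is paired with that cell.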
# for each RAM column (5)
for (n in 1:5) {
for (j in 1:20) {
y1 <- rep(0, nDBA)
x1 <- rep(0, nPBA)
for(k in 1:K) {
A <- linkingErrorSpecs[[subscales[k]]]$pvvarImp$Digital$A[j, n]
B <- linkingErrorSpecs[[subscales[k]]]$pvvarImp$Digital$B[j, n]
C <- linkingErrorSpecs[[subscales[k]]]$pvvarImp$Paper$C[j, n]
y_k <- scaledPV[DBASS, scaledPVs[[k]][j] ]
x_k <- scaledPV[PBASS, scaledPVs[[k]][C] ] # C, from the paper RAM, selects which PBA plausible value is used
# variable transformations
data[DBASS, paste0(subscales[k], "_linking_imp_", 20*(n-1)+j)] <- (A*y_k + B)
data[PBASS, paste0(subscales[k], "_linking_imp_", 20*(n-1)+j)] <- x_k
# y is the sum over the k subscales
y1 <- y1 + beta[k] * (A*y_k + B)
x1 <- x1 + beta[k] * x_k
}
# write composite variable for DBA
data[DBASS, paste0(composite, "_linking_imp_", 20*(n-1)+j)] <- y1
# write composite variable for PBA
jpv <- linkingErrorSpecs[[subscales[k]]]$pvvarImp$Paper$C[j, n]
compositePBA <- pvData[PBASS ,PVs[jpv]]
data[PBASS, paste0(composite, "_linking_imp_", 20*(n-1)+j)] <- compositePBA
if (max(abs(x1 - compositePBA)) > 0.02) {
warning("Inaccurate data or subscale weights; the linking error is inaccurately estimated due to low-precision inputs. Disagreement on composite of ", max(abs(x1 - compositePBA)), "\n")
}
}
}
# construct composite PV var
newPV <- list(list(estVarnames = PVs,
impVarnames = paste0(composite, "_linking_imp_", 1:100),
sampVarnames = paste0(composite, "_linking_samp_", seq_along(repWs)),
achievementLevel = data[["pvvars"]][[composite]][["achievementLevel"]]))
names(newPV) <- paste0(composite, "_linking")
for (k in 1:K) {
# skip when the subscale is also the composite (e.g., the science univariate scale)
if (subscales[k] != composite) {
newPVi <- list(estVarnames = scaledPVs[[k]],
impVarnames = paste0(subscales[k], "_linking_imp_", 1:100),
sampVarnames = paste0(subscales[k], "_linking_samp_", seq_along(repWs)))
newPV[[paste0(subscales[k], "_linking")]] <- newPVi
}
}
pvv <- c(data[["pvvars"]], newPV)
attributes(pvv)$default <- PVVarsDefault
data[["pvvars"]] <- pvv
return(data)
}