R/readNAEP_XML.R
In EdSurvey: Analysis of NCES Education Survey and Assessment Data

# parses the NAEP XML layout file and returns a list object of the following items:
# 1) 'Contents' - A list of data.frames containing all of the XML data in an easy to use format of data.frames
# 2) 'FileFormat' - An 'EdSurvey' fileFormat data.frame that mirrors a fileFormat data.frame from an .fr2 file
# 3) 'XML' - The XML node structure as originally parsed from the XML layout file with the xml2 package
parseNAEP_XML <- function(filename, mmlScoreDict = defaultNAEPScoreCard()) {
  if (!file.exists(filename)) {
    stop(paste0("Specified XML File Not Found: ", filename))
  }

  # read in the text using readLines first as raw.
  # if read_xml is used with a filepath it can cause parsing errors for these specific XML files.
  txt <- readLines(filename) # read in as raw text
  txt <- paste0(txt, collapse = "\n")

  # ASCII encoding for all NAEP XML layout files
  xml <- read_xml(txt, encoding = "ASCII", as_html = FALSE, options = c("RECOVER", "NOBLANK"))
  catalog <- as_list(xml_find_all(xml, "//NAEPDataCatalog")) # main/root node here, must be only be one entry per file

  # check catalog length
  if (length(catalog) < 1) {
    stop(paste0("No NAEP Data Catalog Entry Found in File: ", filename))
  } else if (length(catalog) > 2) {
    stop(paste0("Multiple NAEP Data Catalog Entry Found in File: ", filename, "\n\n", "Only 1 is expected."))
  }

  catalog <- catalog[[1]] # will only be one entry for the 'NAEPDataCatalog'

  # list object to hold information at the entire file level
  dataFileDetail <- attributes(catalog$DataFile)
  # datafields returned as a list of data.frames(DataFields, DataCounts, ClassItems, IRTItems)
  dataFields <- parseNAEP_XML_DataFields(catalog$DataFields)
  # dataformats returned as data.frame
  dataFormats <- parseNAEP_XML_DataFormats(catalog$DataFormats)
  # IRT scaling information returned as data.frame
  IRTScales <- parseNAEP_XML_IRTScales(catalog$IRTScales)
  # Classifications returned as data.frame
  classifications <- parseNAEP_XML_Classifications(catalog$Classifications)
  # Samples
  samples <- parseNAEP_XML_Samples(catalog$Samples)
  # Groups
  groups <- parseNAEP_XML_Groups(catalog$Groups)

  fileContents <- list(
    DataFileDetails = dataFileDetail,
    DataFields = dataFields,
    DataFormats = dataFormats,
    IRTScales = IRTScales,
    Classifications = classifications,
    Samples = samples,
    Groups = groups
  )

  # fileFormat object will be same format as the 'readMRC' function with the .fr2 file
  # note that the data types are going to be slightly off due to knowing more about the data fields
  fileFormat <- parseNAEP_XML_ToEdSurveyFileFormat(fileContents)

  # pvvars are needed ahead of time to properly name the scales to match the pvvars
  fileFormat_pvvars <- trimws(unique(fileFormat$Type)) # get unique pvvars
  fileFormat_pvvars <- fileFormat_pvvars[nchar(fileFormat_pvvars) > 0] # filter out blanks/empty
  fileFormat_pvvars <- fileFormat_pvvars[!grepl("_theta", fileFormat_pvvars, ignore.case = TRUE)] # filter out theta pvvars
  irtParam <- parseNAEP_XML_mmlIRT_Params(fileContents, mmlScoreDict, fileFormat_pvvars)

  return(list(Contents = fileContents, FileFormat = fileFormat, IRTParams = irtParam, xml = xml))
}

# converts the processed data.frame XML layout file contents to a fileFormat object for EdSurvey use
parseNAEP_XML_ToEdSurveyFileFormat <- function(fileContents) {
  nfields <- nrow(fileContents$DataFields$DataFields)

  ff <- data.frame(
    variableName = make.names(tolower(fileContents$DataFields$DataFields$fieldname), unique = TRUE), # make.names here will ensure names match to LaF object and are valid for R
    Start = fileContents$DataFields$DataFields$start,
    End = cumsum(fileContents$DataFields$DataFields$width),
    Width = fileContents$DataFields$DataFields$width,
    Decimal = fileContents$DataFields$DataFields$decimal,
    Labels = trimws(fileContents$DataFields$DataFields$fieldLabel),
    labelValues = rep("", times = nfields),
    pvWt = rep("", times = nfields),
    Type = rep("", times = nfields),
    dataType = rep("", times = nfields),
    weights = rep("", times = nfields)
  )

  # yes, these datatypes are strange, but it's to match the existing NAEP fileformat FR2 parser
  # there are strange cases of NAEP files having NULL (ascii code 0) character instead of a blank space and LaF throws error trying to convert
  # getData will convert to numerics where applicable
  ff$dataType <- "character"
  ff$dataType[ff$Width >= 8 & ff$Decimal > 0] <- "numeric"
  ff$dataType[ff$Width < 8 & ff$Decimal > 0] <- "integer"

  # value label matching based on the format code
  vlCode <- fileContents$DataFields$DataFields$formatname
  vlIdx <- which(nchar(vlCode) > 0, arr.ind = TRUE)

  for (i in vlIdx) {
    tmpCode <- tolower(vlCode[i])
    ff$labelValues[i] <- fileContents$DataFormats$formatdefinition[tmpCode == tolower(fileContents$DataFormats$formatname)]
  }

  # ==== THIS BLOCK OF CODE IS UTILIZED FROM readMRC function in the `readNAEP.R` file. Perhaps better methods exist in the XML file, but unable to find documentation, so re-using prior method====
  # keep the original labels
  oLabels <- ff$Labels
  # Finding the plausible weights and jacknife replicates.
  # normally there is just one set of PVs (with multiple subjects or subscales).
  # When there are multiple sets then one starts with an "A" and is the
  # accommodations permitted values
  # main PVs:
  ff$Labels[grepl("plausible", tolower(ff$Labels)) & grepl("value", tolower(ff$Labels)) & "A" != toupper(substring(ff$variableName, 1, 1))] <- "PV"
  # theta PVs, these do not say "value"
  ff$Labels[grepl("plausible", tolower(ff$Labels)) & grepl("theta", tolower(ff$Labels))] <- "PVT"
  # accommodations perimted PVs:
  ff$Labels[grepl("plausible", tolower(ff$Labels)) & grepl("value", tolower(ff$Labels)) & "A" == toupper(substring(ff$variableName, 1, 1))] <- "PV2"

  # normally there is just one set of weights. When there are multiple sets
  # then one starts with an "A" and is the accommodations permitted values
  # for very specific files it will have an 'norigwt' or a 'vorigwt'
  ff$Labels[grepl("weight", tolower(ff$Labels)) & grepl("replicate", tolower(ff$Labels)) & !toupper(substring(ff$variableName, 1, 1)) %in% c("A", "N", "V")] <- "JK"
  ff$Labels[grepl("weight", tolower(ff$Labels)) & grepl("replicate", tolower(ff$Labels)) & "A" == toupper(substring(ff$variableName, 1, 1))] <- "JK2"
  ff$Labels[grepl("weight", tolower(ff$Labels)) & grepl("replicate", tolower(ff$Labels)) & "N" == toupper(substring(ff$variableName, 1, 1))] <- "JK-N"
  ff$Labels[grepl("weight", tolower(ff$Labels)) & grepl("replicate", tolower(ff$Labels)) & "V" == toupper(substring(ff$variableName, 1, 1))] <- "JK-V"
  
  # Check if there is at least one Plausible Value, and then finding their names
  tempValue <- applyPV("PV", ff$Labels, ff$pvWt, oLabels, ff$Type) # from readMRC
  ff$Labels <- tempValue[["Labels"]]
  ff$pvWt <- tempValue[["pvWt"]]
  ff$Type <- tempValue[["Type"]]

  # note theta PVs
  tempValue <- applyPV("PVT", ff$Labels, ff$pvWt, oLabels, ff$Type) # from readMRC
  ff$Labels <- tempValue[["Labels"]]
  ff$pvWt <- tempValue[["pvWt"]]
  ff$Type <- tempValue[["Type"]]

  # note AP PVs
  tempValue <- applyPV("PV2", ff$Labels, ff$pvWt, oLabels, ff$Type) # from readMRC
  ff$Labels <- tempValue[["Labels"]]
  ff$pvWt <- tempValue[["pvWt"]]
  ff$Type <- tempValue[["Type"]]

  # Check if there is at least one JK replicate.
  if(any(grepl("^(JK|JK2|JK-N|JK-V)$", ff$Labels))) {
    # get the number of the JK replicate
    ff$pvWt[ff$Labels == "JK"] <- as.numeric(gsub("[^\\d]+", "", oLabels[ff$Labels=="JK"], perl=TRUE))
    ff$pvWt[ff$Labels == "JK2"] <- as.numeric(gsub("[^\\d]+", "", oLabels[ff$Labels=="JK2"], perl=TRUE))
    ff$pvWt[ff$Labels == "JK-N"] <- as.numeric(gsub("[^\\d]+", "", oLabels[ff$Labels=="JK-N"], perl=TRUE))
    ff$pvWt[ff$Labels == "JK-V"] <- as.numeric(gsub("[^\\d]+", "", oLabels[ff$Labels=="JK-V"], perl=TRUE))
  }

  # identify weights (tries to use new method first, then old method of )
  # the weight variable will be 'preselected' and have a 'designrole' of "W"
  varDF <- fileContents$DataFields$DataFields
  wgtVar <- varDF$fieldname[trimws(tolower(varDF$designrole))=="w"]
  
  ff$weights <- grepl("^(a|n|v){0,1}(origwt)$", ff$variableName, ignore.case = TRUE) #T/F if variable is a weight
  
  #only do this check if the above does not locate any weights, this will be only applicable for old NAEP files
  if (!any(ff$weights)){
    ff$weights <- grepl("^(weight)$", ff$variableName, ignore.case = TRUE)
  }
  return(ff)
}

parseNAEP_XML_DataFields <- function(fieldList) {
  n <- length(fieldList)

  fieldname <- character(n)
  naepid <- character(n)
  start <- numeric(n)
  width <- numeric(n)
  decimal <- numeric(n)
  format <- character(n)
  type <- character(n)
  block <- character(n)
  item <- character(n)
  accnum <- character(n)
  validn <- numeric(n)
  minvalue <- numeric(n)
  maxvalue <- numeric(n)
  mean <- numeric(n)
  stddev <- numeric(n)
  usage <- character(n)
  formatname <- character(n)
  nummiss <- numeric(n)
  fieldLabel <- character(n)
  preselect <- character(n)
  designrole <- character(n)
  datacounts <- vector("list", n)
  irtitems <- vector("list", n)
  classitems <- vector("list", n)

  # build out sequence for loop
  if (n > 0) {
    n <- 1:n
  } else {
    n <- NULL
  }

  # need to parse record by record since not all entries will have all attributes
  for (i in seq_along(n)) {
    rec <- fieldList[[i]]

    fieldname[i] <- parseNAEP_XML_GetAttribute(rec, "fieldname")
    naepid[i] <- parseNAEP_XML_GetAttribute(rec, "naepid")
    start[i] <- parseNAEP_XML_GetAttribute(rec, "start", "numeric")
    width[i] <- parseNAEP_XML_GetAttribute(rec, "width", "numeric")
    decimal[i] <- parseNAEP_XML_GetAttribute(rec, "decimal", "numeric")
    format[i] <- parseNAEP_XML_GetAttribute(rec, "format")
    type[i] <- parseNAEP_XML_GetAttribute(rec, "type")
    block[i] <- parseNAEP_XML_GetAttribute(rec, "block")
    item[i] <- parseNAEP_XML_GetAttribute(rec, "item")
    accnum[i] <- parseNAEP_XML_GetAttribute(rec, "accnum")
    validn[i] <- parseNAEP_XML_GetAttribute(rec, "validn", "numeric")
    minvalue[i] <- parseNAEP_XML_GetAttribute(rec, "minvalue", "numeric")
    maxvalue[i] <- parseNAEP_XML_GetAttribute(rec, "maxvalue", "numeric")
    mean[i] <- parseNAEP_XML_GetAttribute(rec, "mean", "numeric")
    stddev[i] <- parseNAEP_XML_GetAttribute(rec, "stddev", "numeric")
    usage[i] <- parseNAEP_XML_GetAttribute(rec, "usage")
    formatname[i] <- parseNAEP_XML_GetAttribute(rec, "formatname")
    nummiss[i] <- parseNAEP_XML_GetAttribute(rec, "nummiss", "numeric")
    fieldLabel[i] <- parseNAEP_XML_GetAttribute(rec, "fieldlabel")
    preselect[i] <- parseNAEP_XML_GetAttribute(rec, "preselect")
    designrole[i] <- parseNAEP_XML_GetAttribute(rec, "designrole")

    if (!is.null(rec$DataCounts)) {
      datacounts[[i]] <- rec$DataCounts
    }
    if (!is.null(rec$IRTItems)) {
      irtitems[[i]] <- rec$IRTItems
    }
    if (!is.null(rec$ClassItems)) {
      classitems[[i]] <- rec$ClassItems
    }
  } # end for(i in seq_along(n))

  decimal[is.na(decimal)] <- 0 # to match existing readMRC function output only

  # give names to items for better linking later
  names(datacounts) <- fieldname # for our purposes we don't need the datacounts information, can be ignored
  names(irtitems) <- fieldname
  names(classitems) <- fieldname

  df <- data.frame(fieldname, naepid, start, width, decimal, format, type, validn, minvalue, maxvalue,
    mean, stddev, usage, formatname, nummiss, fieldLabel, preselect, designrole, block, item, accnum,
    stringsAsFactors = FALSE
  )

  charCols <- sapply(df, is.character)
  df[charCols][is.na(df[charCols])] <- "" # for character columns convert NA to blanks

  # ==== DATA COUNT ====
  # the data count details the frequency counts for discrete variables.
  datacountDF <- data.frame(
    fieldname = character(0),
    datavalue = character(0),
    datafreq = numeric(0),
    stringsAsFactors = FALSE
  )

  for (i in seq_along(datacounts)) {
    rec <- datacounts[[i]]
    if (is.null(rec)) { # if null no class item definition
      next
    }

    tempDF <- datacountDF[FALSE, ] # prep template
    for (ii in seq_along(rec)) {
      tempDF[ii, "fieldname"] <- names(datacounts)[i]
      tempDF[ii, "datavalue"] <- parseNAEP_XML_GetAttribute(rec[[ii]], "datavalue")
      tempDF[ii, "datafreq"] <- parseNAEP_XML_GetAttribute(rec[[ii]], "datafreq", "numeric")
    }
    datacountDF <- rbind(datacountDF, tempDF)
  }

  # ==== CLASS ITEMS ====
  # Process the classitems lists into a data.frame, filter out NULL entries
  # don't know exactly how many items we will have, so need to rbind here since a data field can have multiple class items associated with it
  classItemDF <- data.frame(
    fieldname = character(0),
    classcode = character(0),
    levelcode = character(0),
    stringsAsFactors = FALSE
  )

  for (i in seq_along(classitems)) {
    rec <- classitems[[i]]
    if (is.null(rec)) { # if null no class item definition
      next
    }

    tempDF <- classItemDF[FALSE, ] # prep template
    for (ii in seq_along(rec)) {
      tempDF[ii, "fieldname"] <- names(classitems)[i]
      tempDF[ii, "classcode"] <- parseNAEP_XML_GetAttribute(rec[[ii]], "classcode")
      tempDF[ii, "levelcode"] <- parseNAEP_XML_GetAttribute(rec[[ii]], "levelcode")
    }
    classItemDF <- rbind(classItemDF, tempDF)
  }

  # ==== IRT Item Parameters ====
  # grabs the IRT Item details as well as the parameter details and generates a data.frame of the information
  irtItemDF <- data.frame(
    fieldname = character(0),
    scale = character(0),
    sample = character(0),
    numalt = character(0),
    numcat = character(0),
    model = character(0),
    score = character(0),
    numparms = character(0),
    parametercode = character(0),
    IRTValue = numeric(0),
    stderror = numeric(0),
    stringsAsFactors = FALSE
  )

  for (i in seq_along(irtitems)) {
    for (j in seq_along(irtitems[[i]])) { # three levels of nesting here
      rec <- irtitems[[i]][[j]]

      if (is.null(rec)) { # if null no IRT Items definition
        next
      }

      fldName <- names(irtitems)[i]
      scale <- parseNAEP_XML_GetAttribute(rec, "scale")
      sample <- parseNAEP_XML_GetAttribute(rec, "sample")
      numalt <- parseNAEP_XML_GetAttribute(rec, "numalt")
      numcat <- parseNAEP_XML_GetAttribute(rec, "numcat")
      model <- parseNAEP_XML_GetAttribute(rec, "model")
      score <- parseNAEP_XML_GetAttribute(rec, "score")
      numparms <- parseNAEP_XML_GetAttribute(rec, "numparms")

      tempDF <- irtItemDF[FALSE, ] # prep template
      for (ii in seq_along(rec)) {
        tempDF[ii, "fieldname"] <- fldName
        tempDF[ii, "scale"] <- scale
        tempDF[ii, "sample"] <- sample
        tempDF[ii, "numalt"] <- numalt
        tempDF[ii, "numcat"] <- numcat
        tempDF[ii, "model"] <- model
        tempDF[ii, "score"] <- score
        tempDF[ii, "numparms"] <- numparms
        tempDF[ii, "parametercode"] <- parseNAEP_XML_GetAttribute(rec[[ii]], "parametercode")
        tempDF[ii, "IRTValue"] <- parseNAEP_XML_GetAttribute(rec[[ii]], "IRTvalue", "numeric")
        tempDF[ii, "stderror"] <- parseNAEP_XML_GetAttribute(rec[[ii]], "stderror", "numeric")
      }
      irtItemDF <- rbind(irtItemDF, tempDF)
    } # end for(j in seq_along(irtitems[i]))
  } # end for(i in seq_along(irtitems))

  retList <- list(DataFields = df, DataCounts = datacountDF, ClassItems = classItemDF, IRTItems = irtItemDF)
  return(retList)
}

# helper function to get the attribute of a list item
# if attribute does not exist in the item an 'NA' value is returned
# if dataType is NULL it will be ignored and a character will be returned (default for attributes), otherwise it will convert by setting class type
parseNAEP_XML_GetAttribute <- function(item, attributeName, dataType = NULL) {
  x <- attr(item, attributeName)
  # NULL if attribute does not exist
  if (is.null(x) || is.na(x)) {
    return(NA)
  }
  if (!is.null(dataType)) {
    class(x) <- dataType
  }
  return(x)
}

# parse the dataformats section and stack into a formatted data.frame
parseNAEP_XML_DataFormats <- function(formatList) {
  n <- length(formatList)

  formatname <- character(n)
  numcodes <- numeric(n)
  formatdefinition <- character(n)

  # build out sequence for loop
  if (n > 0) {
    n <- 1:n
  } else {
    n <- NULL
  }

  # need to parse record by record since not all entries will have all attributes
  for (i in seq_along(n)) {
    rec <- formatList[[i]]

    formatname[i] <- parseNAEP_XML_GetAttribute(rec, "formatname")
    numcodes[i] <- parseNAEP_XML_GetAttribute(rec, "numcodes", "numeric")
    nCodes <- length(rec)
    val <- character(nCodes)
    lbl <- character(nCodes)
    nCodes <- seq_along(rec)

    for (ii in nCodes) {
      fmt <- rec[[ii]]
      val[ii] <- parseNAEP_XML_GetAttribute(fmt, "formatvalue")
      lbl[ii] <- parseNAEP_XML_GetAttribute(fmt, "formatlabel")
      lbl[ii] <- gsub("^", "", lbl[ii], fixed = TRUE) # need to ensure carrot is not in labels for collapse scheme!!
    }

    # special case for the label 'multiple' to be moved to the end of the item string to retain consistency with .FR2 format
    if (any(grepl("^multiple$", lbl, ignore.case = TRUE))) {
      idx <- which(grepl("^multiple$", lbl, ignore.case = TRUE))
      # remove the item, then order it at the tail of the vector
      tmpVal <- val[-idx]
      tmpLbl <- lbl[-idx]
      val <- c(tmpVal, val[idx])
      lbl <- c(tmpLbl, lbl[idx])
    }

    formatdefinition[i] <- paste(val, lbl, sep = "=", collapse = "^")
  }

  df <- data.frame(formatname, numcodes, formatdefinition, stringsAsFactors = FALSE)

  charCols <- sapply(df, is.character)
  df[charCols][is.na(df[charCols])] <- "" # for character columns convert NA to blanks

  return(df)
}

# parse the IRT Scales section and stack into a formatted data.frame
parseNAEP_XML_IRTScales <- function(scales) {
  n <- length(scales)

  subject <- character(n)
  scalecode <- character(n)
  scalelabel <- character(n)
  scalesample <- character(n)
  transform_a <- numeric(n)
  transform_b <- numeric(n)
  compositeweight <- numeric(n)

  # build out sequence for loop
  if (n > 0) {
    n <- 1:n
  } else {
    n <- NULL
  }

  subj <- parseNAEP_XML_GetAttribute(scales, "subject")

  # need to parse record by record since not all entries will have all attributes
  for (i in seq_along(n)) {
    rec <- scales[[i]]

    subject[i] <- subj
    scalecode[i] <- parseNAEP_XML_GetAttribute(rec, "scalecode")
    scalelabel[i] <- parseNAEP_XML_GetAttribute(rec, "scalelabel")
    scalesample[i] <- parseNAEP_XML_GetAttribute(rec, "scalesample")
    transform_a[i] <- parseNAEP_XML_GetAttribute(rec, "transform-a", "numeric")
    transform_b[i] <- parseNAEP_XML_GetAttribute(rec, "transform-b", "numeric")
    compositeweight[i] <- parseNAEP_XML_GetAttribute(rec, "compositeweight", "numeric")
  }

  df <- data.frame(subject, scalecode, scalelabel, scalesample, transform_a, transform_b, compositeweight, stringsAsFactors = FALSE)

  charCols <- sapply(df, is.character)
  df[charCols][is.na(df[charCols])] <- "" # for character columns convert NA to blanks

  return(df)
}

# parse the IRT Scales section and stack into a formatted data.frame
parseNAEP_XML_Classifications <- function(classifications) {
  df <- data.frame(
    classcode = character(0),
    classlabel = character(0),
    numlevels = numeric(0),
    levelcode = character(0),
    levellabel = character(0)
  )

  # need to parse record by record since not all entries will have all attributes
  for (i in seq_along(classifications)) {
    rec <- classifications[[i]]
    tempDF <- df[FALSE, ]

    classcode <- parseNAEP_XML_GetAttribute(rec, "classcode")
    classlabel <- parseNAEP_XML_GetAttribute(rec, "classlabel")
    numlevels <- parseNAEP_XML_GetAttribute(rec, "numlevels", "numeric")

    for (ii in seq_along(rec)) {
      tempDF[ii, "classcode"] <- classcode
      tempDF[ii, "classlabel"] <- classlabel
      tempDF[ii, "numlevels"] <- numlevels
      tempDF[ii, "levelcode"] <- parseNAEP_XML_GetAttribute(rec[[ii]], "levelcode")
      tempDF[ii, "levellabel"] <- parseNAEP_XML_GetAttribute(rec[[ii]], "levellabel")
    }

    df <- rbind(df, tempDF)
  }

  charCols <- sapply(df, is.character)
  df[charCols][is.na(df[charCols])] <- "" # for character columns convert NA to blanks

  return(df)
}

# parse the Samples section and stack into a formatted data.frame
parseNAEP_XML_Samples <- function(samples) {
  n <- length(samples)
  df <- data.frame(
    samplecode = character(n),
    samplevar = character(n),
    samplevalue = character(n),
    samplelabel = character(n)
  )

  # need to parse record by record since not all entries will have all attributes
  for (i in seq_along(samples)) {
    rec <- samples[[i]]

    df[i, "samplecode"] <- parseNAEP_XML_GetAttribute(rec, "samplecode")
    df[i, "samplevar"] <- parseNAEP_XML_GetAttribute(rec, "samplevar")
    df[i, "samplevalue"] <- parseNAEP_XML_GetAttribute(rec, "samplevalue")
    df[i, "samplelabel"] <- parseNAEP_XML_GetAttribute(rec, "samplelabel")
  }

  charCols <- sapply(df, is.character)
  df[charCols][is.na(df[charCols])] <- "" # for character columns convert NA to blanks

  return(df)
}

# parse the Samples section and stack into a formatted data.frame
parseNAEP_XML_Groups <- function(groups) {
  n <- length(groups)
  df <- data.frame(
    usagecode = character(n),
    grouplabel = character(n)
  )

  # need to parse record by record since not all entries will have all attributes
  for (i in seq_along(groups)) {
    rec <- groups[[i]]

    df[i, "usagecode"] <- parseNAEP_XML_GetAttribute(rec, "usagecode")
    df[i, "grouplabel"] <- parseNAEP_XML_GetAttribute(rec, "grouplabel")
  }

  charCols <- sapply(df, is.character)
  df[charCols][is.na(df[charCols])] <- "" # for character columns convert NA to blanks

  return(df)
}

# reformats the xml layout IRT parameter information into the format needed for mml functions
# returns a list object consisting of these elements:
# 1: TestData
# 2: DichotParamTab
# 3: PolyParamTab
# 4: AdjsutedData
# 5: ScoreDict
# 6: ScoreCard
parseNAEP_XML_mmlIRT_Params <- function(fileContents, scoreDict, fileFormat_pvvars) {
  compositeRegex <- "composite"

  testData <- parseNAEP_XML_IRT_TestData(fileContents, fileFormat_pvvars)

  pvvarsNotInScales <- fileFormat_pvvars[!fileFormat_pvvars %in% testData$subtest]
  hasComposite <- grepl(compositeRegex, pvvarsNotInScales, ignore.case = TRUE)


  dichotParamTab <- parseNAEP_XML_IRT_DichotParam(fileContents, fileFormat_pvvars)
  polyParamTab <- parseNAEP_XML_IRT_PolyParam(fileContents, fileFormat_pvvars)
  adjustedData <- parseNAEP_XML_IRT_AdjustedData(fileContents) # returns formatted empty data.frame with matching columns

  # score dict required for processing score card
  scoreCard <- parseNAEP_XML_IRT_ScoreCard(fileContents, scoreDict)

  res <- list(
    TestData = testData,
    DichotParamTab = dichotParamTab,
    PolyParamTab = polyParamTab,
    AdjsutedData = adjustedData,
    ScoreDict = scoreDict,
    ScoreCard = scoreCard
  )
  return(res)
}

# prepares the dichotomous item data.frame for mml
parseNAEP_XML_IRT_DichotParam <- function(fileContents, fileFormat_pvvars) {
  irtItems <- fileContents$DataFields$IRTItems
  irtItems <- irtItems[irtItems$numcat == 1, ] # filter here for 3PL and 2PL items

  irtScales <- fileContents$IRTScales
  scaleInfo <- irtScales[ , c("subject", "scalecode", "scalelabel")]
  scaleInfo$convertedCode <- parseNAEP_XML_CodeConvert(irtScales, fileFormat_pvvars)

  # get our data subset
  subIRT <- unique(irtItems[ , c("fieldname", "scale", "sample", "numalt", "numcat", "model", "score")]) # numcat==1 gives us 2PL and 3PL items

  # split these out into separate data frames for easier merging
  matchFields <- c("fieldname", "scale", "sample")
  dataFields <- c("IRTValue", "stderror")
  aparams <- irtItems[irtItems$parametercode == "A", c(matchFields, dataFields)]
  bparams <- irtItems[irtItems$parametercode == "B", c(matchFields, dataFields)]
  cparams <- irtItems[irtItems$parametercode == "C", c(matchFields, dataFields)]
  abcParams <- unique(rbind(
    aparams[ , matchFields],
    bparams[ , matchFields],
    cparams[ , matchFields]
  ))

  # slope
  abcParams <- merge(abcParams, aparams, by = matchFields, all.x = TRUE)
  abcParams$slope <- abcParams$IRTValue
  abcParams[ , dataFields] <- NULL
  # difficulty
  abcParams <- merge(abcParams, bparams, by = matchFields, all.x = TRUE)
  abcParams$difficulty <- abcParams$IRTValue
  abcParams[ , dataFields] <- NULL
  # guessing
  abcParams <- merge(abcParams, cparams, by = matchFields, all.x = TRUE)
  abcParams$guessing <- abcParams$IRTValue
  abcParams[ , dataFields] <- NULL
  abcParams$guessing[is.na(abcParams$guessing)] <- 0 # set any 2PL items to 0 here

  # these fields are to match existing IRT 'dichotParamTab' value and will be what gets returned
  subIRT$ItemID <- tolower(subIRT$fieldname)
  subIRT$subtest <- scaleInfo$convertedCode[match(tolower(subIRT$scale), tolower(scaleInfo$scalecode))]
  if (nrow(subIRT) > 0) { # might not have data
    subIRT$D <- 1.7
    subIRT$missingValue <- 1 / nchar(subIRT$score)
    subIRT$missingCode <- 8
  } else {
    subIRT$D <- numeric(0)
    subIRT$missingValue <- character(0)
    subIRT$missingCode <- numeric(0)
  }

  subIRT <- merge(subIRT, abcParams, by = matchFields, all.x = TRUE)

  # to match existing IRT specs
  res <- subIRT[ , c("ItemID", "subtest", "slope", "difficulty", "guessing", "D", "missingValue", "missingCode")]
  return(res)
}

parseNAEP_XML_IRT_PolyParam <- function(fileContents, fileFormat_pvvars) {
  irtItems <- fileContents$DataFields$IRTItems
  irtItems <- irtItems[irtItems$numcat > 1, ] # filter here for poly items (having numcat > 1)

  irtScales <- fileContents$IRTScales
  scaleInfo <- irtScales[ , c("subject", "scalecode", "scalelabel")]
  scaleInfo$convertedCode <- parseNAEP_XML_CodeConvert(irtScales, fileFormat_pvvars)

  # get our data subset
  subIRT <- unique(irtItems[ , c("fieldname", "scale", "sample", "numalt", "numcat", "model", "score")])

  matchFields <- c("fieldname", "scale", "sample")
  dataFields <- c("IRTValue", "stderror")

  aparam <- irtItems[irtItems$parametercode == "A", c(matchFields, dataFields)]
  bparam <- irtItems[irtItems$parametercode == "B", c(matchFields, dataFields)]
  d1 <- irtItems[irtItems$parametercode == "D01", c(matchFields, dataFields)]
  d2 <- irtItems[irtItems$parametercode == "D02", c(matchFields, dataFields)]
  d3 <- irtItems[irtItems$parametercode == "D03", c(matchFields, dataFields)]
  d4 <- irtItems[irtItems$parametercode == "D04", c(matchFields, dataFields)]
  d5 <- irtItems[irtItems$parametercode == "D05", c(matchFields, dataFields)]

  allParams <- unique(rbind(
    aparam[ , matchFields],
    bparam[ , matchFields],
    d1[ , matchFields],
    d2[ , matchFields],
    d3[ , matchFields],
    d4[ , matchFields],
    d5[ , matchFields]
  ))

  # slope
  allParams <- merge(allParams, aparam, by = matchFields, all.x = TRUE)
  allParams$slope <- allParams$IRTValue
  allParams[ , dataFields] <- NULL
  # item location
  allParams <- merge(allParams, bparam, by = matchFields, all.x = TRUE)
  allParams$itemLocation <- allParams$IRTValue
  allParams[ , dataFields] <- NULL
  # d1
  allParams <- merge(allParams, d1, by = matchFields, all.x = TRUE)
  allParams$d1 <- allParams$IRTValue
  allParams[ , dataFields] <- NULL
  # d2
  allParams <- merge(allParams, d2, by = matchFields, all.x = TRUE)
  allParams$d2 <- allParams$IRTValue
  allParams[ , dataFields] <- NULL
  # d3
  allParams <- merge(allParams, d3, by = matchFields, all.x = TRUE)
  allParams$d3 <- allParams$IRTValue
  allParams[ , dataFields] <- NULL
  # d4
  allParams <- merge(allParams, d4, by = matchFields, all.x = TRUE)
  allParams$d4 <- allParams$IRTValue
  allParams[ , dataFields] <- NULL
  # d5
  allParams <- merge(allParams, d5, by = matchFields, all.x = TRUE)
  allParams$d5 <- allParams$IRTValue
  allParams[ , dataFields] <- NULL

  # these fields are to match existing IRT 'dichotParamTab' value and will be what gets returned
  subIRT$ItemID <- tolower(subIRT$fieldname)
  subIRT$subtest <- scaleInfo$convertedCode[match(tolower(subIRT$scale), tolower(scaleInfo$scalecode))]
  if (nrow(subIRT) > 0) {
    subIRT$D <- 1.7
    subIRT$missingValue <- "c"
    subIRT$missingCode <- 8
  } else {
    subIRT$D <- numeric(0)
    subIRT$missingValue <- character(0)
    subIRT$missingCode <- numeric(0)
  }

  subIRT <- merge(subIRT, allParams, by = matchFields, all.x = TRUE)

  # to match existing IRT specs
  res <- subIRT[ , c("ItemID", "subtest", "slope", "itemLocation", "d1", "d2", "d3", "d4", "d5", "D", "missingValue", "missingCode")]
  return(res)
}

# retrieve the subscale test level information
parseNAEP_XML_IRT_TestData <- function(fileContents, fileFormat_pvvars) {
  irt <- fileContents$IRTScales # this includes the 'subtest' information needed
  res <- data.frame(
    subtest = parseNAEP_XML_CodeConvert(irt, fileFormat_pvvars), # convert codes to what IRTparams has
    location = irt$transform_b, # convert codes to what IRTparams has
    scale = irt$transform_a,
    subtestWeight = irt$compositeweight,
    scalecode = irt$scalecode, # these are on the XML irt scales and might be useful to include here, but not needed for mml.sdf
    scalelabel = irt$scalelabel,
    scalesample = irt$scalesample
  )
  # do we need to check if composite weights equal to 1 (100%) if they are present?
  # a file can have multiple samples (e.g. digital/paper)
  return(res)
}

parseNAEP_XML_IRT_AdjustedData <- function(fileContents) {
  df <- data.frame(
    NAEPid = character(0),
    from = character(0),
    to = character(0),
    stringsAsFactors = FALSE
  )

  # RETURN EMTPY DATA.FRAME. WE HAVE THE SCORING KEY MASK (e.g., 0010, 0123, 0100)
  return(df)
}

parseNAEP_XML_IRT_ScoreCard <- function(fileContents, scoreDict) {
  irtItems <- unique(fileContents$DataFields$IRTItems[ , c("fieldname", "score", "numcat", "model")])

  dataFields <- fileContents$DataFields$DataFields
  dataFields <- subset(dataFields, (tolower(dataFields$fieldname) %in% tolower(irtItems$fieldname)))
  dataFields <- merge(dataFields, irtItems, by = "fieldname", all.x = TRUE)

  formats <- fileContents$DataFormats
  dataFields <- merge(dataFields, formats, by = "formatname", all.x = TRUE) # datafields DF will now have all needed data (labels and scoring key)
  irtItems <- NULL
  formats <- NULL

  dfTemplate <- data.frame(
    key = character(0), # the variable name
    answer = character(0), # the value label defined
    score = numeric(0),
    stringsAsFactors = FALSE
  ) # the number of points its assigned for the 'scoringKey' index
  dfResult <- dfTemplate

  for (i in seq_len(nrow(dataFields))) { # loop for each row in dataFields data.frame

    var <- tolower(dataFields$fieldname[i])
    scr <- dataFields$score[i]
    scrArr1 <- unlist(strsplit(scr, "")) # split into it a character vector
    valLbls <- dataFields$formatdefinition[i]
    isDichot <- dataFields$numcat[i] == 1 # numcat==1 signifies it's a dichotomous item

    valTokens <- unlist(strsplit(valLbls, "^", fixed = TRUE))
    valTokens <- strsplit(valTokens, "=", fixed = TRUE)

    vals <- character(length(valTokens))
    lbls <- character(length(valTokens))

    for (ii in seq_along(valTokens)) {
      vals[ii] <- valTokens[[ii]][1]
      lbls[ii] <- paste(valTokens[[ii]][-1], sep = "", collapse = "")
    }

    vals1 <- vals[1:nchar(scr)] # only need the items in the scoring key, the rest can be discarded
    lbls1 <- lbls[1:nchar(scr)] # only need the items in the scoring key, the rest can be discarded
    vals2 <- vals[!vals %in% vals1] # these will be omitted, missing, etc.
    lbls2 <- lbls[!lbls %in% lbls1] # these will be omitted, missing, etc.

    tempDF <- data.frame(key = rep(var, times = length(lbls1)), answer = lbls1, score = as.numeric(scrArr1), stringsAsFactors = FALSE)
    dfResult <- rbind(dfResult, tempDF)
    tempDF <- NULL

    # scored differently depending on it's question type
    if (isDichot) {
      scrArr2 <- scoreDict$pointMult[match(tolower(lbls2), tolower(scoreDict$resCat))]
    } else { # poly item
      scrArr2 <- scoreDict$pointConst[match(tolower(lbls2), tolower(scoreDict$resCat))]
    }

    tempDF <- data.frame(key = rep(var, times = length(lbls2)), answer = lbls2, score = as.numeric(scrArr2), stringsAsFactors = FALSE)
    tempDF <- subset(tempDF, !is.na(tempDF$score)) # only add items with a scoring value
    dfResult <- rbind(dfResult, tempDF)
    tempDF <- NULL
  }

  return(dfResult)
}

# converts a NAEP subject or test code into the value used with NAEPirtparams to have them be consistent and match
parseNAEP_XML_CodeConvert <- function(irtScaleDF, fileFormat_pvvars) {
  compositePVRegex <- "(composite|univariate)" # regex for determining composite pvvars, they will have 'composite' or 'univariate' in the title

  samples <- unique(irtScaleDF$scalesample) # files can have multiple samples, generally for paper/digital variations
  resDF <- irtScaleDF[FALSE, ] # empty result data.frame
  resDF$pvvar <- character(0) # add blank column for our matching pvvar

  if (nrow(irtScaleDF) == 1 && length(fileFormat_pvvars) == 1) {
    return(fileFormat_pvvars)
  }

  # process each sample separately
  for (samp in samples) {
    irtSubDF <- irtScaleDF[irtScaleDF$scalesample == samp, ] # subset for specific sample
    irtSubDF$pvvar <- NA # add pvvar to match our resDF

    # is digital or paper based assessment sample
    isPBA <- all(grepl("paper", irtSubDF$scalelabel, ignore.case = TRUE))
    isDBA <- all(grepl("digital", irtSubDF$scalelabel, ignore.case = TRUE))

    # sometimes there is a separate IRT scale code for the composite/overall, other times there is not, and the entire sample is the composite
    hasComposite <- any(grepl(compositePVRegex, fileFormat_pvvars, ignore.case = TRUE))

    for (i in seq_len(nrow(irtSubDF))) {
      # Test 1 || check if this is a composite scale item or 'overall' item
      if (hasComposite && grepl("overall", irtSubDF$scalelabel[i], ignore.case = TRUE)) {
        irtSubDF$pvvar[i] <- fileFormat_pvvars[grepl(compositePVRegex, fileFormat_pvvars, ignore.case = TRUE)]
        next # skip to next
      }

      # get the first word in the scale label
      firstWrd <- tolower(unlist(regmatches(irtSubDF$scalelabel[i], regexpr("\\w+", irtSubDF$scalelabel[i])))) # regexpr for first match only

      # Test 2 - test first word matching sting entirely for pvvar name (e.g., physical science == physical)
      if (any(tolower(fileFormat_pvvars) == firstWrd)) {
        irtSubDF$pvvar[i] <- fileFormat_pvvars[tolower(fileFormat_pvvars) == firstWrd][1] # first TRUE match (should only be one!)
        next
      }

      # Test 3 - test if the first word is in the pvvar name at any point
      if (any(grepl(paste0("(", firstWrd, ")"), fileFormat_pvvars, ignore.case = TRUE))) {
        irtSubDF$pvvar[i] <- fileFormat_pvvars[grepl(paste0("(", firstWrd, ")"), fileFormat_pvvars, ignore.case = TRUE)][1] # first TRUE match (should only be one!)
        next
      }

      # Test 4 - grab tokens for each word, remove any 'ignore' tokens (e.g., 'and', 'for', 'from'), then test if any tokens match pvvar name
      tokens <- tolower(unlist(regmatches(irtSubDF$scalelabel[i], gregexpr("\\w+", irtSubDF$scalelabel[i])))) # gregexpr for all words
      ignoreTokens <- "^(and|&|to|from|for|long-term |long|term|trend|digital|paper)$"
      tokens <- tokens[!grepl(ignoreTokens, tokens, ignore.case = TRUE)]

      testpvvars <- fileFormat_pvvars[!grepl(compositePVRegex, fileFormat_pvvars, ignore.case = TRUE)] # start with all pvvars, filter out the composites
      testRes <- numeric(length = length(tokens))
      for (ii in seq_along(tokens)) {
        xRes <- grepl(paste0("(", tokens[ii], ")"), testpvvars, ignore.case = TRUE)

        if (any(xRes)) {
          testRes[ii] <- which(xRes)
        }
        # testRes[ii] <- which(grepl(paste0("(", tokens[ii], ")"), testpvvars, ignore.case = TRUE))
      }

      if (any(testRes > 0)) { # a match was found!
        testRes <- testRes[testRes > 0]
        irtSubDF$pvvar[i] <- testpvvars[testRes[1]]
        next
      }

      # Test 5 - grab first three characters of the tokens, check to see if the 3-character tokens are present in the pvvar name
      tokens3Char <- substr(tokens, 1, 3)

      testpvvars <- fileFormat_pvvars[!grepl(compositePVRegex, fileFormat_pvvars, ignore.case = TRUE)] # start with all pvvars, filter out the composites
      testRes <- numeric(length = length(tokens3Char))
      for (ii in seq_along(tokens3Char)) {
        xRes <- grepl(paste0("(", tokens3Char[ii], ")"), testpvvars, ignore.case = TRUE)

        if (any(xRes)) {
          testRes[ii] <- which(xRes)
        }
        # testRes[ii] <- which(grepl(paste0("(", tokens[ii], ")"), testpvvars, ignore.case = TRUE))
      }

      frq <- table(testRes)
      frq <- frq[names(frq) != "0"] # filter out no matches

      if (length(frq) > 0) { # match was found!
        testRes <- as.numeric(names(frq)[which.max(frq)]) # get the most common match

        irtSubDF$pvvar[i] <- testpvvars[testRes[1]]
        next
      }

      # could not be located, or the pvvars were not accurately located in the file throw warning for the user
      warning(paste0("Unable to locate matching pvvar variable for scale: ", paste0(irtSubDF$scalecode[i], " - ", irtSubDF$scalelabel[i])))
      irtSubDF$pvvar[i] <- ""
    } # end for(i in seq_len(nrow(irtSubDF)))

    if (isPBA) {
      irtSubDF$pvvar <- paste0(irtSubDF$pvvar, "_paper")
    }
    if (isDBA) {
      irtSubDF$pvvar <- paste0(irtSubDF$pvvar, "_digital")
    }

    resDF <- rbind(resDF, irtSubDF)
  } # end for (samp in samples)

  # reorder to match original order, in case it didn't stack as it was originally
  origOrder <- which(resDF$scalecode == irtScaleDF$scalecode)
  res <- resDF$pvvar[origOrder]

  return(res)
}
Any scripts or data that you put into this service are public.
EdSurvey documentation built on June 27, 2024, 5:10 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
EdSurvey
Analysis of NCES Education Survey and Assessment Data

R/readNAEP_XML.R
In EdSurvey: Analysis of NCES Education Survey and Assessment Data

Try the EdSurvey package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

EdSurvey Analysis of NCES Education Survey and Assessment Data

R/readNAEP_XML.R In EdSurvey: Analysis of NCES Education Survey and Assessment Data

Try the EdSurvey package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

EdSurvey
Analysis of NCES Education Survey and Assessment Data

R/readNAEP_XML.R
In EdSurvey: Analysis of NCES Education Survey and Assessment Data