R/extractSaveData.R

Defines functions l_getSavedata_readRawFile l_getSavedata_Bparams getSavedata_Bparams getSavedata_Data l_getSavedata_Fileinfo getSavedata_Fileinfo

Documented in getSavedata_Bparams getSavedata_Data getSavedata_Fileinfo l_getSavedata_Bparams l_getSavedata_Fileinfo l_getSavedata_readRawFile

#' Read Variable Names, Formats, and Widths from data generated by the SAVEDATA Command
#'
#' This function reads the SAVEDATA INFORMATION section from an Mplus output file that used
#' the SAVEDATA command, and it returns a list with the filename, variable names, variable formats,
#' and variable widths of the SAVEDATA file. If present, the function also parses information about the
#' Bayesian Parameters (BPARAMETERS) file.
#'
#' @param outfile required. The name of the Mplus output file to read. Can be an absolute or relative path.
#'   If \code{outfile} is a relative path or just the filename, then it is assumed that the file resides in
#'   the working directory \code{getwd()}.
#' @return Returns a list of SAVEDATA file information that includes:
#' \item{fileName}{The name of the file containing the analysis dataset created by the Mplus SAVEDATA command.}
#' \item{fileVarNames}{A character vector containing the names of variables in the dataset.}
#' \item{fileVarFormats}{A character vector containing the Fortran-style formats of variables in the dataset.}
#' \item{fileVarWidths}{A numeric vector containing the widths of variables in the dataset (which is stored in fixed-width format).}
#' \item{bayesFile}{The name of the BPARAMETERS file containing draws from the posterior distribution created by
#'   the Mplus SAVEDATA BPARAMETERS command.}
#' \item{bayesVarNames}{A character vector containing the names of variables in the BPARAMETERS dataset.}
#' \item{tech3File}{A character vector of the tech 3 output.}
#' \item{tech4File}{A character vector of the tech 4 output.}
#' @author Michael Hallquist
#' @seealso \code{\link{getSavedata_Data}}
#' @examples
#' \dontrun{
#'   fileInfo <- getSavedata_Fileinfo("C:/Program Files/Mplus/Test Output.out")
#' }
#' @keywords internal
getSavedata_Fileinfo <- function(outfile) {
  #wraps l_getSavedata_Fileinfo by checking for the outfile and reading it as a character vector.

  #helper function to parse output from savedata command
  #if returnData is true, the data file created will be read in as an R data frame
  #if returnData is false, just the variable names are returned
  #considering using addHeader to prepend a header row

  if(!file.exists(outfile)) {
    stop("Cannot locate outfile: ", outfile)
  }

  outfiletext <- scan(outfile, what="character", sep="\n", strip.white=FALSE, blank.lines.skip=FALSE, quiet=TRUE)

  if (length(outfiletext) == 0) {
    warning("Empty outfile")
    return(NULL)
  }

  return(l_getSavedata_Fileinfo(outfile, outfiletext))

}

#' local function that does the work of getSaveData_Fileinfo
#'
#' This function is split out so that \code{getSaveData_Fileinfo} is
#' exposed to the user, but the parsing function can be used by
#' \code{readModels}
#'
#' @param outfile A character string giving the name of the Mplus
#'   output file.
#' @param outfiletext The contents of the output file, for example as read by \code{scan}
#' @return A list that includes:
#' \item{fileName}{The name of the file containing the analysis dataset created by the Mplus SAVEDATA command.}
#' \item{fileVarNames}{A character vector containing the names of variables in the dataset.}
#' \item{fileVarFormats}{A character vector containing the Fortran-style formats of variables in the dataset.}
#' \item{fileVarWidths}{A numeric vector containing the widths of variables in the dataset (which is stored in fixed-width format).}
#' \item{bayesFile}{The name of the BPARAMETERS file containing draws from the posterior distribution created by
#'   the Mplus SAVEDATA BPARAMETERS command.}
#' \item{bayesVarNames}{A character vector containing the names of variables in the BPARAMETERS dataset.}
#' \item{tech3File}{A character vector of the tech 3 output.}
#' \item{tech4File}{A character vector of the tech 4 output.}
#' @importFrom gsubfn strapply
#' @seealso \code{\link{getSavedata_Data}}
#' @examples
#' # make me!
#' @keywords internal
l_getSavedata_Fileinfo <- function(outfile, outfiletext, summaries) {

  sectionStarts <- c("Estimates", #estimates
      "Estimated Covariance Matrix for the Parameter Estimates", #tech3
      "Estimated Means and Covariance Matrix for the Latent Variables", #tech4
      "Sample/H1/Pooled-Within Matrix", #sample
      "Bayesian Parameters", #bparameters
      "Within and between sample statistics with Weight matrix", #swmatrix
      "Model Estimated Covariance Matrix" #covariance
  )

  #extract entire savedata section
  savedataSection <- getSection("^\\s*SAVEDATA INFORMATION\\s*$", outfiletext)

  #need to have SAVEDATA section to proceed
  if (is.null(savedataSection)) {
    #omit warning -- probably more common for section to be missing
    #warning("Unable to locate a SAVEDATA section in output file: ", outfile)
    return(NULL)
  }

  #initialize these variables to empty character strings so that list return value is complete
  #important in cases where some savedata output available, but other sections unused
  listVars <- c("saveFile", "fileVarNames", "fileVarFormats", "fileVarWidths", "bayesFile", "bayesVarNames",
      "tech3File", "tech4File", "covFile", "sampleFile", "estimatesFile")
  l_ply(listVars, assign, value=NA_character_, envir=environment())
  
  # NULL bparameters iterations if not available (needs to be NULL instead of NA since the return is a data.frame)
  bayesIterDetails <- NULL

  #in Mplus v7, "Save file" comes before "Order and format of variables"
  #new approach: process the savedata output more sequentially

  #In general, the savedata section is formatted with the unlabeled stuff first.
  #TECH3, TECH4, and BPARAMATERS are all labeled below
  #Thus, the logic here is to parse all labeled sections, then remove these from consideration.

  linesParsed <- c()

  #model covariance matrix (COVARIANCE) -- only available when all DVs are continuous
  ###  Model Estimated Covariance Matrix
  ###
  ###  Save file
  ###    ex3.1.cov
  ###  Save format      Free

  covSection <- getSection("^\\s*Model Estimated Covariance Matrix\\s*$", savedataSection, sectionStarts)

  if (!is.null(covSection)) {
    covFileStart <- grep("^\\s*Save file\\s*$", covSection, ignore.case=TRUE, perl=TRUE)
    if (length(covFileStart) > 0L) {
      covFile <- trimSpace(covSection[covFileStart+1])
    }
    linesParsed <- c(linesParsed, attr(covSection, "lines"))
  }

  #parameter estimates (ESTIMATES)
  ###  Estimates
  ###
  ###  Save file
  ###    ex3.1.est
  ###  Save format      Free
  estSection <- getSection("^\\s*Estimates\\s*$", savedataSection, sectionStarts)

  if (!is.null(estSection)) {
    estFileStart <- grep("^\\s*Save file\\s*$", estSection, ignore.case=TRUE, perl=TRUE)
    if (length(estFileStart) > 0L) {
      estimatesFile <- trimSpace(estSection[estFileStart+1])
    }
    linesParsed <- c(linesParsed, attr(estSection, "lines"))
  }

  #sample/h1/pooled-within matrix (SAMPLE)
  ###  Sample/H1/Pooled-Within Matrix
  ###
  ###  Save file
  ###    ex3.12.sample
  ###  Save type        CORRELATION
  ###  Save format      Free

  sampleSection <- getSection("^\\s*Sample/H1/Pooled-Within Matrix\\s*$", savedataSection, sectionStarts)

  if (!is.null(sampleSection)) {
    sampleFileStart <- grep("^\\s*Save file\\s*$", sampleSection, ignore.case=TRUE, perl=TRUE)
    if (length(sampleFileStart) > 0L) {
      sampleFile <- trimSpace(sampleSection[sampleFileStart+1])
    }
    linesParsed <- c(linesParsed, attr(sampleSection, "lines"))
  }

  #tech3 output (TECH3)
  tech3Section <- getSection("^\\s*Estimated Covariance Matrix for the Parameter Estimates\\s*$", savedataSection, sectionStarts)

  if (!is.null(tech3Section)) {
    tech3FileStart <- grep("^\\s*Save file\\s*$", tech3Section, ignore.case=TRUE, perl=TRUE)
    if (length(tech3FileStart) > 0L) {
      tech3File <- trimSpace(tech3Section[tech3FileStart+1])
    }
    linesParsed <- c(linesParsed, attr(tech3Section, "lines"))
  }

  #tech4 output (TECH4)
  tech4Section <- getSection("^\\s*Estimated Means and Covariance Matrix for the Latent Variables\\s*$", savedataSection, sectionStarts)

  if (!is.null(tech4Section)) {
    tech4FileStart <- grep("^\\s*Save file\\s*$", tech4Section, ignore.case=TRUE, perl=TRUE)
    if (length(tech4FileStart) > 0L) {
      tech4File <- trimSpace(tech4Section[tech4FileStart+1])
    }
    linesParsed <- c(linesParsed, attr(tech4Section, "lines"))
  }

  #Bayesian parameters section (BPARAMETERS)
  bparametersSection <- getSection("^\\s*Bayesian Parameters\\s*$", savedataSection, sectionStarts)

  if (!is.null(bparametersSection)) {

    bayesFileStart <- grep("^\\s*Save file\\s*$", bparametersSection, ignore.case=TRUE, perl=TRUE)

    if (length(bayesFileStart > 0)) {
      bayesFile <- trimSpace(bparametersSection[bayesFileStart+1])
      paramOrderStart <- grep("^\\s*Order of parameters saved\\s*$", bparametersSection, ignore.case=TRUE, perl=TRUE)
      paramOrderSection <- sapply(bparametersSection[(paramOrderStart+2):length(bparametersSection)], trimSpace, USE.NAMES=FALSE) #trim leading/trailing spaces and skip "Order of parameters" line and the subsequent blank line

      #parameters list ends with a blank line, so truncate section at next newline
      #or in case where section ends after last param, then no blank line is present (just end of vector), so do nothing
      nextBlankLine <- which(paramOrderSection == "")
      if (length(nextBlankLine) > 0) paramOrderSection <- paramOrderSection[1:(nextBlankLine[1]-1)]

      #rather than split into two columns, concatenate so that these are workable as column names
      #paramOrderSection <- strsplit(paramOrderSection, "\\s*,\\s*", perl=TRUE)
      #bayesVarTypes <- gsub("\\s+", "\\.", sapply(paramOrderSection, "[", 1), perl=TRUE)
      #bayesVarNames <- gsub("\\s+", "\\.", sapply(paramOrderSection, "[", 2), perl=TRUE)

      #15Mar2012: Workaround for bug: means for categorical latent variables are in output file listing
      #but these are not actually present in bparameters. Filter these out.
      #12Oct2015: This appears to have been fixed in Mplus 7 and beyond, so check version if available.    
      if (missing(summaries) || is.null(summaries$Mplus.version) || as.numeric(summaries$Mplus.version) < 7) {
        paramOrderSection <- paramOrderSection[!grepl("Parameter\\s+\\d+, \\[ [^#]#\\d+ \\]", paramOrderSection, perl=TRUE)]
      }
      
      bayesVarNames <- gsub("\\s*,\\s*", "_", paramOrderSection, perl=TRUE)
      bayesVarNames <- gsub("\\[", "MEAN", bayesVarNames, perl=TRUE)
      bayesVarNames <- gsub("\\s*\\]\\s*", "", bayesVarNames, perl=TRUE)
      bayesVarNames <- gsub("\\s+", ".", bayesVarNames, perl=TRUE)
      
      # figure out if there additional draws from the posterior, typically for factor score estimation
      convergenceLine <- grep("Convergence iterations", bparametersSection)
      
      if (length(convergenceLine) == 1L) {
        nextBlankLine <- which(bparametersSection[convergenceLine:length(bparametersSection)] == "")
        iterLines <- bparametersSection[convergenceLine:(convergenceLine + nextBlankLine - 2)]
        bayesIterDetails <- data.frame()
        for (ii in seq_along(iterLines)) {
          iterType <- sub("\\s*(.*) iterations .*", "\\1", iterLines[ii])
          iterType <- tolower(gsub("\\s", ".", iterType))
          iterStart <- as.numeric(sub(".*iterations\\s*([0-9]+)-[0-9]+", "\\1", iterLines[ii]))
          iterEnd <- as.numeric(sub(".*iterations\\s*[0-9]+-([0-9]+)", "\\1", iterLines[ii]))
          bayesIterDetails <- rbind(bayesIterDetails, data.frame(type=iterType, start=iterStart, end=iterEnd))
        }
      }
    }

    linesParsed <- c(linesParsed, attr(bparametersSection, "lines"))
  }

  #remove all lines already parsed within named sections
  #this should just leave basic savedata output containing save file info, MC replication info, etc.
  if (length(linesParsed) > 0L) savedataSection <- savedataSection[linesParsed*-1]

  #process FILE= section. Also applies to Monte Carlo and multiple imputation output
  #savedata filename
  saveFile.text <- grep("^\\s*Save file\\s*$", savedataSection, perl=TRUE)

  if (length(saveFile.text) > 0L) {
    saveFile <- trimSpace(savedataSection[(saveFile.text[1L] + 1)])
  }

  #save file format (sometimes on same line, sometimes on next line)
#  saveFileFormat.text <- getSection("^\\s*Save file format", savedataSection, sectionStarts)
#
#  #actually, sometimes the format is "Free" and falls on the same line...
#  if (!is.null(saveFileFormat.text)) {
#    saveFileFormat <- trimSpace(saveFileFormat.text[1]) #first line contains format
#  }
#

#  #file record length
#  savefile.recordlength <- getSection("^\\s*Save file record length\\s+\\d+\\s*$", savedataSection, sectionStarts)
#  #Skip for now

  orderFormat.text <- getMultilineSection("Order and format of variables", savedataSection, outfile, allowMultiple=FALSE)

  if (!is.na(orderFormat.text[1L])) {
    variablesToParse <- orderFormat.text[orderFormat.text != ""]
    
    #Mplus v8: remove + sign in front of variables that plausible values from SAVE=FSCORES (XX) in BSEM.
    if (length(nimp_line <- grep("\\+ variables that have a value for each of the \\d+ imputations", savedataSection)) > 0L) {
      nimp <- as.numeric(sub(".*\\+ variables that have a value for each of the (\\d+) imputations.*", "\\1", savedataSection[nimp_line]))
      which_imp <- grep("^\\s*\\+.*", variablesToParse)
      variablesToParse <- sub("+", "", variablesToParse, fixed=TRUE)
    } else { which_imp <- integer(0) } #no imputations to unpack
    
    #Mplus v8: because variables are now allowed to have spaces in the output, switch to a string split on spaces.
    #In this approach, the last element will be the format.
    varsSplit <- strsplit(trimSpace(variablesToParse), "\\s+")
    
    # Use rlang::flatten to remove the nested lists that result from the inner lapply over imputations
    # 2024: removed rlang::flatten in favor of inner unlist. Cannot find case where this was important
    varsSplit <- lapply(1:length(varsSplit), function(x) {
        vname <- varsSplit[[x]]
        if (x %in% which_imp) {
          #replicate the variable for each imputation
          return(unlist(lapply(1:nimp, function(i) { c(vname[1:(length(vname)-1)], sprintf("I_%03d", i), vname[length(vname)]) })))
        } else {
          return(vname)
        }
      })
    
    fileVarNames <- sapply(varsSplit, function(x) { paste(x[1:(length(x) - 1)], collapse=".") })
    fileVarFormats <- sapply(varsSplit, function(x) { x[length(x)] }) #last element
    fileVarWidths <- strapply(fileVarFormats, "[IEFG]+(\\d+)(\\.\\d+)*", as.numeric, perl=TRUE, simplify=TRUE)

  }

  #Monte carlo and multiple imputation output: contains only order of variables, not their format
  order.text <- getMultilineSection("Order of variables", savedataSection, outfile, allowMultiple=FALSE)

  if (!is.na(order.text[1L])) {
	  #dump any blank fields because they will cause nulls in the names, formats, widths.
	  #This is handled by blank.lines.skip=TRUE in wrappers, but readModels needs to retain blank lines
	  #for other functions, so strip here.
	  variablesToParse <- order.text[order.text != ""]

	  #just have variable names, no formats
    fileVarNames <- trimSpace(variablesToParse)
  }


  #future: plausible values output from Bayesian runs
  #PLAUSIBLE VALUE MEAN, MEDIAN, SD, AND PERCENTILES FOR EACH OBSERVATION


  #return the file information as a list
  #N.B. Would like to shift return to "saveFile", but need to update everywhere and note deprecation in changelog

  #bayesVarTypes=bayesVarTypes,
  return(list(fileName=saveFile, fileVarNames=fileVarNames, fileVarFormats=fileVarFormats, fileVarWidths=fileVarWidths,
          bayesFile=bayesFile, bayesVarNames=bayesVarNames, bayesIterDetails=bayesIterDetails, tech3File=tech3File, tech4File=tech4File))
}

#' Load an analysis dataset from the SAVEDATA command into an R data.frame
#'
#' This function reads an analysis dataset generated by the Mplus SAVEDATA command
#' and returns an R \code{data.frame} object.
#'
#' @note Note that the \code{outfile} parameter should refer to the Mplus output file (.out extension), not the
#' actual dataset generated by SAVEDATA. This function reads information about the dataset from the .out file
#' and loads the dataset accordingly.
#'
#' @param outfile Required. The name of the Mplus output file to read. Can be an absolute or relative path.
#'   If \code{outfile} is a relative path or just the filename, then it is assumed that the file resides in
#'   the working directory \code{getwd()}.
#' @return A \code{data.frame} containing the analysis dataset generated by the SAVEDATA command.
#' @author Michael Hallquist
#' @seealso \code{\link{getSavedata_Fileinfo}}
#' @examples
#' \dontrun{
#'   savedat <- getSavedata_Data("C:/Program Files/Mplus/Test Output.out")
#' }
#' @keywords internal
getSavedata_Data <- function(outfile) {
  #exposed wrapper for l_getSavedata_Data, which pulls saveData data into data.frame
  
  message("getSavedata_Data has been deprecated. Please use readModels(\"nameofMplusoutfile.out\", what=\"savedata\")$savedata to replicate the old functionality.")
  return(invisible(NULL))
}

#' Load the draws from the Bayesian model posterior distribution (SAVEDATA BPARAMETERS) command into an R data.frame
#'
#' This function reads a the BPARAMETERS output file from the Mplus SAVEDATA BPARAMETERS command
#' and returns an R \code{data.frame} object.
#'
#' @note Note that the \code{outfile} parameter should refer to the Mplus output file (.out extension), not the
#' actual dataset generated by SAVEDATA. This function reads information about the dataset from the .out file
#' and loads the dataset accordingly.
#'
#' @param outfile Required. The name of the Mplus output file to read. Can be an absolute or relative path.
#'   If \code{outfile} is a relative path or just the filename, then it is assumed that the file resides in
#'   the working directory \code{getwd()}.
#' @param discardBurnin Optional. Whether to discard the burn-in phase of each MCMC chain (i.e., the first half).
#'
#' @return A \code{list} containing the draws from the MCMC chains for a Bayesian model that uses the
#' SAVEDATA BPARAMETERS command. Each list element corresponds to a single MCMC chain, as specified by
#' the ANALYSIS: CHAINS syntax in \code{Mplus}. If discardBurnin is \code{FALSE}, then a superordinate list is
#' provided that divides output in terms of burn-in versus valid draw halves of the MCMC chains. For documentation
#' of how \code{Mplus} implements chain convergence checks and MCMC draws, see here: \url{http://www.statmodel.com/download/Bayes3.pdf}.
#'
#' @author Michael Hallquist, Florian Boeing-Messing
#' @seealso \code{\link{getSavedata_Fileinfo}}, \code{\link{getSavedata_Data}}
#' @references \url{http://www.statmodel.com/download/Bayes3.pdf}
#' @examples
#' \dontrun{
#'   fileInfo <- getSavedata_Data("C:/Program Files/Mplus/Test Output.out")
#' }
#' @keywords internal
getSavedata_Bparams <- function(outfile, discardBurnin=TRUE) {
  #exposed wrapper for l_getSavedata_readRawFile, which pulls bayesian parameters into a data.frame
  
  message("getSavedata_Bparams has been deprecated. Please use readModels(\"nameofMplusoutfile.out\", what=\"bparameters\")$bparameters to replicate the old functionality.")
  return(invisible(NULL))
}


#' Internal function to load the draws from the Bayesian model posterior distribution
#'
#' To do: add details.
#'
#' @param outfile The Mplus output file to be used.
#' @param outfiletext The contents of the Mplus output file
#' @param fileInfo The file info
#' @param discardBurnin Logical value whether to discard the burnin iterations or not. Defaults to \code{TRUE}.
#' @return A list of class \code{mcmc} and \code{mplus.bparameters}
#' @import coda
#' @keywords internal
l_getSavedata_Bparams <- function(outfile, outfiletext, fileInfo, discardBurnin=TRUE) {

  #missing fileInfo
  if (is.null(fileInfo)) return(NULL)

  bp <- l_getSavedata_readRawFile(outfile, outfiletext, format="free", fileName=fileInfo[["bayesFile"]], varNames=fileInfo[["bayesVarNames"]])
  
  if (is.null(bp)) return(NULL)
  
  #for(i in 1:nrow(bp)) {
  #  if (i > 1 && bp$Chain.number[i] != bp$Chain.number[i-1]) cat(i, " ", bp$Chain.number[i], "\n")
  #}

  #logic and code adapted from Florian Boeing-Messing.
  #Mplus runs 100 iterations for each chain, and then computes R-hat for each parameter.
  #If R-hat drops below a critical value for Potential Scale Reduction ~ 1.1, the chains are treated as converged (the values generated among chains are indistinguishable)
  #Then draws from the converged chains are obtained.
  #Thus, the second half of each chain contains valid draws, whereas the first half is burn-in

  if (!"Chain.number" %in% names(bp)) {
    warning("Chain number not in bparameters file. Assuming 1 chain.")
    bp$Chain.number <- 1
  }

  #number of chains
  nchains <- max(bp[,"Chain.number"])

  # separate out the convergence iterations from other iterations (e.g., factor score computation)
  additional_iterations <- list() # tagged on after splitting and parsing convergence iterations
  if (!is.null(fileInfo[["bayesIterDetails"]])) {
    c_row <- which(fileInfo$bayesIterDetails == "convergence")
    if (length(c_row) == 1L) {
      # the length of convergence iterations will be nchains * ndraws
      offset <- fileInfo$bayesIterDetails$end[c_row]*(nchains - 1)
      
      # since we have nchains * ndraws for convergence, we need to add the offset to cover the difference
      other_iter <- fileInfo$bayesIterDetails[-c_row,,drop=FALSE]
      other_iter$start <- other_iter$start + offset
      other_iter$end <- other_iter$end + offset
      if (nrow(other_iter) > 0L) {
        for (ii in seq_len(nrow(other_iter))) {
          additional_iterations[[other_iter$type[ii]]] <- bp[other_iter$start[ii]:other_iter$end[ii],,drop=FALSE]
        }
      }
      
      # subset the main object down to just convergence draws
      bp <- bp[fileInfo$bayesIterDetails$start[c_row]:(fileInfo$bayesIterDetails$end[c_row]*nchains),,drop=FALSE]
    }
  }
  
  #sort by chain number for separate burn-in and valid draws
  bp <- bp[sort.list(bp[,"Chain.number"]),]
  
  #Each section contains two halves (burn-in and valid draws)
  nsections <- 2*nchains

  lengthsec <- nrow(bp)/nsections

  ind <- rep(rep(c(FALSE, TRUE), each=lengthsec), nchains)

  bp$Burnin <- factor(ind, levels=c(FALSE, TRUE), labels=c("burn_in", "valid_draw"))

  #convert to mcmc.list object to be usable with coda
  if (nchains > 1) {
    #only keep numeric columns (required by mcmc objects)
    #divide into burn-in versus draw
    bp.burnin <- split(bp[,sapply(bp, is.numeric)], list(bp$Burnin))

    #divide the draws by chain and convert to mcmc list
    bp.split <- lapply(bp.burnin, function(bpsub) {
          mcmc.list(lapply(split(bpsub[,sapply(bpsub, is.numeric)], list(bpsub$Chain.number)), mcmc))
        })
  } else {
    #just one chain, so divide into burn-in versus draw
    bp.split <- lapply(split(bp[,sapply(bp, is.numeric)], list(bp$Burnin)), mcmc)
  }

  # Small logical inconsistency introduced here with additional iterations (e.g., factor scores)
  # Even if we discard the burn-in, should we also discard any other iterations? The discard is intended
  # to drop the return to a data.frame/mcmc object, not a list, so I'm sticking with that for now.
  if (discardBurnin) {
    bp.split <- bp.split[["valid_draw"]]
  } else {
    bp.split <- c(bp.split, additional_iterations)
  }
  
  class(bp.split) <- c(class(bp.split), "mplus.bparameters")

  return(bp.split)

}

#' Internal function to load the draws from the Bayesian model posterior distribution
#'
#' This function is used by other extractor functions that read particular types of
#' saved data such as factor scores or iterations from an MCMC chain.
#'
#' @param outfile The Mplus output file to be used.
#' @param outfiletext The contents of the Mplus output file
#' @param format A character string indicating the format. Defaults to \dQuote{fixed}.
#' @param fileName The name of the file
#' @param varNames The names of the variables to extract, comes from the fileInfo
#' @param varWidths The widths of the variables to extract, comes from the fileInfo
#' @param input list of parsed Mplus input section extracted upstream in readModels
#' @return A data frame of the extracted data.
#' @keywords internal
#' @importFrom utils read.table read.fwf
l_getSavedata_readRawFile <- function(outfile, outfiletext, format="fixed", fileName, varNames, varWidths, input) {
  outfileDirectory <- splitFilePath(outfile)$directory

  #if file requested is missing, then abort data pull
  if (isEmpty(fileName) || isEmpty(varNames)) return(NULL)

  savedataSplit <- splitFilePath(fileName)

  #if outfile target directory is non-empty, but savedataFile is without directory, then append
  #outfile directory to savedataFile. This ensures that R need not be in the working directory
  #to read the savedataFile. But if savedataFile has an absolute directory, don't append

  #if savedata directory is present and absolute, or if no directory in outfile, just use filename as is
  if (!is.na(savedataSplit$directory) && savedataSplit$absolute) {
    savedataFile <- fileName #just use savedata filename if has absolute path
  } else if (is.na(outfileDirectory)) {
    savedataFile <- fileName #just use savedata filename if outfile is missing path (working dir)
  } else {
    savedataFile <- file.path(outfileDirectory, fileName) #savedata path is relative or absent and outfile dir is present: append
  }

  #cat("Outfile dir: ", outfileDirectory, "\n")
  #cat("Savedata directory: ", savedataSplit$directory, "\n")
  #cat("concat result: ", savedataFile, "\n")

  #need to read as fixed width format given the way Mplus left-aligns missing vals (*)
  #dataset <- read.table(file=file.path(path, fileInfo$fileName), header=FALSE,
  #    na.strings="*", col.names=fileInfo$varNames)

  if (format == "free") {
    #handle case where filename contains * indicating Monte Carlo (MC) or multiple imputation (MI) dataset with reps
    if (grepl("\\*", savedataSplit$filename, perl=TRUE)) {
      #N.B. Mplus does not output the directory to which the MC/MI replications are saved.
      #Thus, the file name for the "Save file" section in "SAVEDATA INFORMATION" is always just the filename alone.
      #Using MONTECARLO: SAVE = or DATA IMPUTATION: SAVE = in the input instructions, one can specify a relative or
      #absolute path to the saved files. If this is set, then the above logic for determining
      #where to look for saved data should be overridden. In particular, we should use
      #the path from MONTECARLO: SAVE or DATA IMPUTATION: SAVE = and ignore where the output file is located.

      #if save command is present in montecarlo section and there is a directory specified, then use that.
      #resplit now that outfileDir and savedataDir are assembled to setup wildcard matching using list.files.

      split.above <- splitFilePath(savedataFile) #get info the directory for the output file, as determined above

      if (!is.null(input$montecarlo$save)) {
        mcmi.savesplit <- splitFilePath(input$montecarlo$save)
      } else if (!is.null(input$data.imputation$save)) {
        mcmi.savesplit <- splitFilePath(input$data.imputation$save)
      } else {
        mcmi.savesplit <- NULL
      }

      if (!is.null(mcmi.savesplit) &&
          !is.na(mcmi.savesplit$directory)) {

        if (mcmi.savesplit$absolute) {
          resplit <- splitFilePath(file.path(mcmi.savesplit$directory, split.above$filename)) #concat absolute montecarlo: save dir with rep filename
        } else {
          #mc save directory is present, but relative
          #thus, need to combine it with directory above
          if (!is.na(split.above$directory)) {
            resplit <- splitFilePath(file.path(split.above$directory, mcmi.savesplit$directory, split.above$filename))
          }
        }
      } else {
        resplit <- splitFilePath(savedataFile)
      }


      #patch file pattern to match perl syntax
      pat <- gsub("\\.", "\\\\.", resplit$filename, perl=TRUE)
      pat <- gsub("\\*", "\\\\d+", pat, perl=TRUE)
      fileNames <- list.files(path=resplit$directory, pattern=pat, full.names=FALSE) #very klunky way to handle this
      fileList <- list.files(path=resplit$directory, pattern=pat, full.names=TRUE)

      dataset <- list()

      if (length(fileList) == 0L) {
        #function was generating error when files could not be found
        warning("Unable to locate SAVEDATA files for filename: ", resplit$filename)
      } else {
        for (f in 1:length(fileList)) {
          dataset[[make.names(fileNames[f])]] <- read.table(file=fileList[f], header=FALSE,
              na.strings="*", col.names=varNames, strip.white=TRUE)
        }
      }

    } else {
      dataset <- read.table(file=savedataFile, header=FALSE, na.strings="*", strip.white=TRUE)
      if (length(varNames) > ncol(dataset)) {
        warning("Number of variable names for Bayesian Parameters section exceeds number of columns in: ", savedataFile)
        varNames <- varNames[1:ncol(dataset)] #heuristically, just using columns names up to the last column in the data
      } 
      
      names(dataset) <- varNames
    }
  }
  else if (format == "fixed") {
    if (!length(varWidths) > 0) stop("Fixed width file specified, but no widths obtained.")
    #strip.white is necessary for na.strings to work effectively with fixed width fields
    #otherwise would need something like "*       " for na.strings

    dataset <- read.fwf(file=savedataFile, widths=varWidths, header=FALSE,
        na.strings="*", col.names=varNames, strip.white=TRUE)
  }

  return(dataset)
}
michaelhallquist/MplusAutomation documentation built on March 14, 2024, 11:03 a.m.