R/modelCreatoR.R

Defines functions splitDFByRow classifyTags getInitTags parseTags modelCreatoR lookupSimpleTags updateCurrentValues recurseReplace replaceInitTags replaceBodyTags lookupValue finalizeInitCollection evaluateConditional clipString processConditionalTags processInit createVarSyntax prepareMplusData_Mat .cleanHashData .hashifyFile .convertData prepareMplusData

Documented in classifyTags .cleanHashData clipString .convertData createVarSyntax evaluateConditional finalizeInitCollection getInitTags .hashifyFile lookupSimpleTags lookupValue modelCreatoR parseTags prepareMplusData prepareMplusData_Mat processConditionalTags processInit recurseReplace replaceBodyTags replaceInitTags splitDFByRow updateCurrentValues

# TODO: enforce use of #iterator syntax in init section for vars with length > 1
# TODO: Check array tags used in the init and body sections for validity.
# TODO: Make sure that classify tags accurately interprets all tags and errors if uninterpretable tag.
# TODO: Allow for conditional tags to use a list, such as [[nclass#class == 5]]

#note that there's a bit of trickery in interpreting list tags
#they varnames are stored as only the prefix in the initCollection (no #iterator)
#and they are referenced in the body as var#iterator
#At this point, doesn't enforce proper use of iterator with a list

#setwd("C:/Users/Michael Hallquist/Documents/Automation_Sandbox")
#createModels("C:/Users/Michael Hallquist/Documents/Automation_Sandbox/LSPD Covariate Template.txt")
#system.time(createModels("C:/Users/Michael Hallquist/Documents/Automation_Sandbox/LSPD Template.txt"))
#createModels("C:/Users/Michael Hallquist/Documents/Automation_Sandbox/LSPD Template New Init.txt")
#createModels("C:/Users/Michael Hallquist/Documents/Automation_Sandbox/L2 Multimodel Template No iter.txt")

#need to sort out why is.na is working for lookupValue in replaceBodyTags
#in particular, why isn't the current value carrying over from the previous looping iteration?

#SOME THOUGHTS RE DOCUMENTATION
#foreach tags may only be with respect to an iterator... could not have some random foreach var

#' Split a data frame into a list by rows
#'
#' Takes a data frame and returns a list with an element for each row of the data frame.
#' This is an internal function.
#'
#' @param df An object inheriting from class \code{data.frame}
#'
#' @return A list where each element is a one row data frame
#' @keywords internal
#' @examples
#' # small example using built in data
#' MplusAutomation:::splitDFByRow(mtcars)
splitDFByRow <- function(df) {
  stopifnot(inherits(df, "data.frame"))
  lapply(seq.int(nrow(df)), function(i) df[i, ])
}

#' Classifies Tags
#'
#' Accepts a vector of tags to be classified as well as the iterators.
#' Tags are classified as \sQuote{iterator}, \sQuote{array}, \sQuote{conditional}, or
#' \sQuote{simple}. This is an internal function.
#'
#' @param tagVector A vector of tags to be classified
#' @param iteratorsVector a vector of the iterators to correctly classify tags
#' @return A character vector the same length as the vectors to be tagged
#' @keywords internal
classifyTags <- function(tagVector, iteratorsVector) {
  #accepts a vector of tags to be classified
  #also needs a vector of the iterators to correctly classify tags
  #returns a vector of tag types

  #creates an empty character vector of the same length as tagVector (each element defaults to "")
  tagType <- vector(mode="character", length=length(tagVector))

  #default to missing for tag type (replaced below)
  #tagData$tagType <- NA_character_

  # named list of the regexs to match for
  # the names of each elements are used later to classify tags
  RegEx <- list(
    iterator = paste0("\\[\\[\\s*(", paste(iteratorsVector, collapse="|"), ")\\s*\\]\\]"),
    array = paste0("\\[\\[\\s*\\b([\\w\\.]+)#(", paste(iteratorsVector, collapse="|"), ")\\b\\s*\\]\\]"),

    #optional forward slash for closing tags
    #could the alternation syntax be problematic if variable names overlaps
    #(e.g., x matching xy)? Use word boundaries?
    #any reason to limit this to iterators?!
    conditional = paste0("\\[\\[\\s*/*(", paste(iteratorsVector, collapse="|"), ")\\s*[!><=]+\\s*\\d+\\s*\\]\\]"),

    #simple tags -- not wrt iterators, not conditional
    #use negative lookahead to skip tags that are iterators
    simple = paste0("\\[\\[\\s*(?!", paste(iteratorsVector, collapse="|"), ")[\\w+\\.]+\\s*\\]\\]"))

  Positions <- lapply(RegEx, grep, x = tagVector, perl = TRUE)

  # assert no duplicates, i.e., tag cannot match multiples classes
  stopifnot(!any(duplicated(unlist(Positions))))

  for (n in names(Positions)) {
    tagType[Positions[[n]]] <- n
  }

  return(tagType)
}

#' Get Initial Tags
#'
#' An internal function
#'
#' @param initCollection A list?
#' @return The initMatches
#' @keywords internal
getInitTags <- function(initCollection) {
  initMatches <- c()
  for (i in 1:length(initCollection)) {
    if (storage.mode(initCollection[[i]]) == "character") {
      matches <- friendlyGregexpr("\\[\\[\\s*[\\s\\w=><!#/]+\\s*\\]\\]", initCollection[[i]], perl=T)
      #if there are matches for this item, add its position in the list pos
      #the idea is that the list has elements and the elements can be vectors
      #thus, a match may occur for initCollection[[5]][3] if the fifth element of the list is a vector
      #and the match is the third element.
      if (!is.null(matches)) matches$listpos <- i
      initMatches <- rbind(initMatches, matches)
    }
  }

  #successfully creates a data.frame of the sort below.
  #   element start end                          tag listpos
  #1        1     1  11                  [[classes]]      14
  #2        1    19  38         [[groupnames#group]]      14
  #3        1    40  63     [[outcomenames#outcome]]      14
  #4        1    65  84         [[modelnames#model]]      14
  #5        1    85 112 [[zeroclassnames#zeroclass]]      14
  #6        1     6  29     [[outcomenames#outcome]]      15
  #7        1    31  50         [[groupnames#group]]      15
  #8        1    73  92         [[modelnames#model]]      15
  #9        1     1   9                    [[hello]]      17
  #10       2     1  10                   [[hello2]]      17

  #classify tags in terms of simple, array, iterator, conditional, foreach
  if (!is.null(initMatches) && nrow(initMatches) > 0) {
    initMatches$tagType <- classifyTags(initMatches$tag, initCollection$iterators)

    #chop off the [[ ]] portion of the tags, along with any leading or trailing space
    #this makes it easier to use the sub function to update current values
    initMatches$tag <- sapply(initMatches$tag, function(tag) {
      return(sub("\\[\\[\\s*([\\s\\w=><!#/]+)\\s*\\]\\]", "\\1", tag, perl=TRUE))
    })
  }

  #return empty data frame if no matches
  if (is.null(initMatches)) return(data.frame())
  else return(initMatches)
}

#' Parses tags in the body section
#'
#' Parses tags in the body section (character vector) and
#' init collection (list of vars defined in the init section).
#' This is an internal function.
#'
#' @param bodySection The body
#' @param initCollection The initial collection
#' @return A list with three elements, where each list represents the location,
#' start character, end character, tag type, etc. of each tag.
#' \describe{
#'   \item{initTags}{initMatches}
#'   \item{bodyTags}{bodyMatches}
#'   \item{bodyText}{bodySection}
#' }
#' @keywords internal
parseTags <- function(bodySection, initCollection) {
  #first handle init tags
  initMatches <- getInitTags(initCollection)

  initMatches$currentValue <- NA_character_

  bodyTagRegex <- "\\[\\[\\s*[\\s\\w=><!#/]+\\s*\\]\\]"
  bodyMatches <- friendlyGregexpr(bodyTagRegex, bodySection, perl=TRUE)

  if (is.null(bodyMatches)) stop("No tags found in body section of template file.")

  bodyMatches$tagType <- classifyTags(bodyMatches$tag, initCollection$iterators)
  #okay, now every tag is categorized
  #the notion here is to substitute in the running value for a given variable
  #then we'll do a mass substitute for each model
  bodyMatches$currentValue <- NA_character_

  #chop off the [[ ]] portion of the tags, along with any leading or trailing space
  bodyMatches$tag <- sapply(bodyMatches$tag, function(tag) {
    return(sub("\\[\\[\\s*([\\s\\w=><!#/]+)\\s*\\]\\]", "\\1", tag, perl=TRUE))
  })

  #return a three-element list with constituent data frames for init and body tags.
  return(list(initTags=initMatches, bodyTags=bodyMatches, bodyText=bodySection))
}

#' Create Mplus Input Files from Template
#'
#' The \code{createModels} function processes a single Mplus template file and creates a group of related
#' model input files. Definitions and examples for the template language are provided in the MplusAutomation
#' vignette and are not duplicated here at the moment. See this documentation: \code{vignette("Vignette", package="MplusAutomation")}
#'
#' @param templatefile The filename (absolute or relative path) of an Mplus template file to be processed. Example \dQuote{C:/MplusTemplate.txt}
#' @return No value is returned by this function. It is solely used to process an Mplus template file.
#' @author Michael Hallquist
#' @keywords interface
#' @export
#' @examples
#' \dontrun{
#'   createModels("L2 Multimodel Template No iter.txt")
#' }
modelCreatoR <- function(templatefile) {
  # should probably have the function cd to wherever the template file is located (if given as abs path)
  # todo: allow for direct runs?

  if (!file.exists(templatefile)) stop("Template file not found.")

  readfile <- scan(templatefile, what="character", sep="\n", strip.white=FALSE, blank.lines.skip=FALSE, quiet=TRUE)

  # divide into init versus body
  startinit <- grep("[[init]]", readfile, fixed=T)
  endinit <- grep("[[/init]]", readfile, fixed=T)

  if (length(startinit) != 1 || length(endinit) != 1) {
    stop("Unable to find init section in template file.")
  }

  # extract init section
  initSection <- readfile[(startinit+1):(endinit-1)]

  # extract body section
  bodySection <- readfile[(endinit+1):length(readfile)]

  # convert the init text into a list object containing parsed init instructions
  initCollection <- processInit(initSection)

  templateTags <- parseTags(bodySection, initCollection)

  # lookup values for simple tags, which won't vary by iterator
  templateTags <- lookupSimpleTags(templateTags, initCollection)

  # kick off the recursive replace
  if (length(initCollection$iterators) > 0) {
    recurseReplace(templateTags, initCollection)
  }
}

#' Simple tag lookup
#'
#' The purpose of this function is to set the currentValue column
#' for the bodyTags and initTags data.frames for simple tags only.
#' Most values will be replaced at the bottom level of recursion,
#' but simple tags do not change over iterations, so can be set one time.
#'
#' @param templateTags The template tags
#' @param initCollection The initial collection
#' @return A tag.
#' @keywords internal
lookupSimpleTags <- function(templateTags, initCollection) {
  #  #locate simple tags in body
  #  simpleBodyPositions <- which(templateTags$bodyTags$tagType=="simple")
  #
  #  #replace tag with value
  #  templateTags$bodyTags$currentValue[simpleBodyPositions] <- sapply(templateTags$bodyTags$tag[simpleBodyPositions],
  #      function(value) {
  #        currentValue <- eval(parse(text=paste("initCollection$", value, sep="")))
  #        if (regexpr("\\[\\[\\s*[\\s\\w=><!#/]+\\s*\\]\\]", currentValue, perl=TRUE) > 0) {
  #          #The replacement tag itself contains additional tags.
  #          #Thus, not a simple replacement. This replacement needs to be deferred until
  #          #we have iterated to the bottom of the tree and have all needed information
  #          #set a deferred value to be replace later
  #          currentValue <- "..deferred.."
  #        }
  #        return(currentValue)
  #      })

  #locate simple tags in init
  simpleInitPositions <- which(templateTags$initTags$tagType=="simple")

  templateTags$initTags$currentValue[simpleInitPositions] <- sapply(
    templateTags$initTags$tag[simpleInitPositions],
    function(value) {
      return(eval(parse(text=paste0("initCollection$", value))))
    })

  return(templateTags)
}

#' Updates current values
#'
#' Body tags currentValues are substituted at the bottom-most level
#' after init collection is finalized (recursively process any nested tags)
#'
#' @param templateTags The template tags
#' @param initCollection Initial collection
#' @return Updated current value or the original if no match.
#' @keywords internal
updateCurrentValues <- function(templateTags, initCollection) {
  #Better idea: only updateCurrentValues for init tags collection
  #And only update init collection for the respective iterator
  #Only need to update values for a given iterator....
  #The issue is that values for a given iterator shouldn't change when another iterator is active

  #need to replace array and iterator tags for this iterator

  #locate iterator tags in init
  initIteratorPositions <- which(
    templateTags$initTags$tagType=="iterator" &
      templateTags$initTags$tag == initCollection$curIteratorName)

  #set the current value to the position in the looping process for this iterator
  templateTags$initTags$currentValue[initIteratorPositions] <- initCollection$curItPos[initCollection$curIteratorDepth]

  #allow for iterator lookups here... just to an is.na check in the replaceBodyTags
  #locate iterator tags in body
  bodyIteratorPositions <- which(
    templateTags$bodyTags$tagType == "iterator" &
      templateTags$bodyTags$tag == initCollection$curIteratorName)

  templateTags$bodyTags$currentValue[bodyIteratorPositions] <- initCollection$curItPos[initCollection$curIteratorDepth]

  # Next, handle array tags
  # figure out the iterator for each array tag and only select
  # those that are relevant to the current iterator
  initArrayPositions <- which(templateTags$initTags$tagType=="array")

  # only update values if any array tags are found
  # (generates an error otherwise because of weird format from splitter_a
  if (length(initArrayPositions) > 0) {

    # use plyr's splitter_a function to divide dataset by row (builds a big list)
    # 20Jul2010: Had to call splitter function directly, ignoring namespace because plyr 1.0 hid this.
    # divideByRow <- plyr:::splitter_a(templateTags$initTags[initArrayPositions,], 1)
    # actually, splitter_a no longer has the same return type (it's now an environment)
    # would have to call row$data$tag... just replace with homespun function defined above.

    divideByRow <- splitDFByRow(templateTags$initTags[initArrayPositions,])


    #for each element of the list, check for a match with this iterator and return the value of interest
    #if the array tag is not for this iterator, return the current value unchanged
    templateTags$initTags$currentValue[initArrayPositions] <- unlist(sapply(divideByRow,
                                                                            function(row) {
                                                                              split <- strsplit(row$tag, split="#", fixed=TRUE)[[1]]
                                                                              if (length(split) != 2) stop("array tag missing iterator: ", row$tag)

                                                                              if (split[2] == initCollection$curIteratorName) {
                                                                                currentValue <- eval(parse(text =
                                                                                                             paste0("initCollection$", split[1], "[",
                                                                                                                    initCollection$curItPos[initCollection$curIteratorDepth], "]")))

                                                                                if (is.null(currentValue)) {
                                                                                  stop("When replacing tag: ", row$tag, ", could not find corresponding value.")
                                                                                }
                                                                                return(currentValue)
                                                                              } else {
                                                                                # return unchanged current value if not this iterator
                                                                                return(row$currentValue)
                                                                              }
                                                                            }))
  }

  # for now, we don't use any current values for body tags collection (handled at bottom)
  #  #conduct same process for body tags: locate array tags and update values for this iterator
  #  bodyArrayPositions <- which(templateTags$bodyTags$tagType=="array")
  #
  #  #use plyr's splitter_a function to divide dataset by row (builds a big list)
  #  divideByRow <- splitter_a(templateTags$bodyTags[bodyArrayPositions,], 1)
  #
  #  #for each element of the list, check for a match with this iterator and return the value of interest
  #  templateTags$bodyTags$currentValue[bodyArrayPositions] <- unlist(sapply(divideByRow,
  #      function(row) {
  #        split <- strsplit(row$tag, split="#", fixed=TRUE)[[1]]
  #        if (length(split) != 2) stop("array tag missing iterator: ", row$tag)
  #
  #        if (split[2] == initCollection$curIteratorName) {
  #          currentValue <- eval(parse(text=paste("initCollection$", split[1], "[", initCollection$curItPos[initCollection$curIteratorDepth], "]", sep="")))
  #          if (regexpr("\\[\\[\\s*[\\s\\w=><!#/]+\\s*\\]\\]", currentValue, perl=TRUE) > 0) {
  #            #The replacement tag itself contains additional tags.
  #            #Thus, not a simple replacement. This replacement needs to be deferred until
  #            #we have iterated to the bottom of the tree and have all needed information
  #            #set a deferred value to be replace later
  #            currentValue <- "..deferred.."
  #          }
  #          if (is.null(currentValue)) stop("When replacing tag: ", row$tag, ", could not find corresponding value.")
  #          return(currentValue)
  #        }
  #        else return(row$currentValue) #return unchanged current value if not this iterator
  #      }))

  return(templateTags)
}

#' Recursive replace
#'
#' To do: fill in some details
#'
#' @param templateTags The template tags
#' @param initCollection The list of all arguments parsed from the init section
#' @param curiterator An integer that tracks of the depth of recursion through the iterators. Defaults to 1.
#' @return Does not look like it returns anything
#' @keywords internal
recurseReplace <- function(templateTags, initCollection, curiterator=1L) {
  #bodySection is the character vector representing each line of the body section
  #bodyTags is a data.frame documenting the location and type of all tags in bodySection
  #initTags is a data.frame documenting the location and type of all tags in initCollection

  if (!is.list(initCollection)) {
    stop("Argument list passed to recurseReplace is not a list")
  }

  # check that curiterator is indeed a whole number
  stopifnot(curiterator %% 1 == 0)

  thisIterator <- initCollection$iterators[curiterator]

  #set the current iterator for the collection (used by replaceTags)
  initCollection$curIteratorName <- thisIterator
  initCollection$curIteratorDepth <- curiterator

  #would it work better to use a named array here?
  #like curItVals <- c(1, 3, 5, 2) for iterators a, b, c, d
  #then names(curItVals) <- c("a", "b", "c", "d")

  for (i in initCollection[[thisIterator]]) {

    #set the current position within this iterator for use in replace tags
    #create a vector of iterator positions for use in replaceTags
    #initCollection$curItPos[curiterator] <- i

    #add the iterator name to the vector of iterator positions
    #this has the same effect as above (appending as it recurses), but allows for name-based lookup
    initCollection$curItPos[thisIterator] <- i

    #print(paste("current iterator is:", thisIterator, ", position:", as.character(i)))

    #process foreach commands
    #For now, take this out
    #bodySection <- processForEachTags(bodySection, initCollection)

    #update the current values for this iterator and this iteration
    #this applies for every iterator and iteration, not just processing
    #at the deepest level. The function only updates array and iterator
    #tags that match this iterator, thus minimizing redundant work.
    #the latest is that only init collection tags will be updated
    #then body tags are replaced at the bottom level after init collection is finalized
    templateTags <- updateCurrentValues(templateTags, initCollection)

    if (curiterator < length(initCollection$iterators)) {
      #if not at deepest level, recurse to the next level by adding 1 to the iterator

      #NOTE to self: consider adding a "foreachReplacements" collection to templateTags
      #that contains the expansions of these tags (appended per iteration)
      #this avoids having to think about reparsing the tags based on new code created by foreach

      recurseReplace(templateTags, initCollection, curiterator = curiterator+1)
    } else {
      #we have reached the bottom of the iteration tree
      #simple, array, and iterator tags should be up to date in the templateTags collection

      #first delete conditional tags from the body section, reduce subsequent processing burden
      #need to return templateTags collection from processConditionalTags (including bodyText)

      #need to use a copy of templateTags to avoid it affecting subsequent loop iterations
      finalTemplateTags <- processConditionalTags(templateTags, initCollection)

      #the body section to write is stored in the templateTags collection
      toWrite <- finalTemplateTags$bodyText

      #create a separate initCollection with the appropriate values substituted.
      finalInitCollection <- replaceInitTags(finalTemplateTags$initTags, initCollection)

      #finalize init collection values (in cases of nested tags)
      #wades through init collection for any remaining tags and replaces them
      finalInitCollection <- finalizeInitCollection(finalInitCollection)

      #update bodySection with tag values from finalized init tags
      toWrite <- replaceBodyTags(toWrite, finalTemplateTags$bodyTags, finalInitCollection)

      filename <- finalInitCollection$filename

      cat(paste("writing file: ", filename, "\n", sep=""))
      curdir <- getwd()


      #figure out the output directory
      outputDir <- trimws(finalInitCollection$outputDirectory)
      # finalInitCollection$factorNames <- trimws(finalInitCollection$factorNames)
      # finalInitCollection$factorDefs <- trimws(finalInitCollection$factorDefs)

      if (!file.exists(outputDir)) {
        dir.create(outputDir, recursive=TRUE)
      }

      #setwd(outputDir)
      #options(browser())
      #make sure that no line is more than 90 chars
      toWrite <- unlist(lapply(toWrite, function(line) {
        if (nchar(line) > 90) {
          strwrap(line, width=85, exdent=5)
        } else {
          line
        }
      }))

      cat(toWrite, file = paste0(outputDir, '/', filename), sep = "\n")

      #on.exit(setwd(curdir))
    }
  }
}

#' Replace Init Tags
#'
#' To do: fill in some details
#'
#' @param initTags Init tags
#' @param initCollection The list of all arguments parsed from the init section
#' @return Returns updated initCollection
#' @keywords internal
replaceInitTags <- function(initTags, initCollection) {
  targetRows <- which(initTags$tagType %in% c("simple", "iterator", "array"))
  targetTags <- initTags[targetRows, ]
  targetTags$rownumber <- 1:nrow(targetTags)

  #going to re-use this chunk in finalizeSubstitutions, so functionalize...
  #consider the looping replacement here
  for (i in 1:nrow(targetTags)) {
    row <- targetTags[i, ]
    stringToChange <- initCollection[[row$listpos]][row$element]

    if(row$start > 1) {
      preTag <- substr(stringToChange, 1, row$start - 1)
    } else {
      preTag <- ""
    }

    if (row$end < nchar(stringToChange)) {
      postTag <- substr(stringToChange, row$end+1, nchar(stringToChange))
    } else {
      postTag <- ""
    }

    initCollection[[row$listpos]][row$element] <- paste0(preTag, row$currentValue, postTag)

    subsequentRows <- which(
      targetTags$rownumber > i &
        targetTags$listpos == row$listpos &
        targetTags$element == row$element)

    if (length(subsequentRows > 0)) {

      #need to offset subsequent start/stops by the difference
      #between the tag and replacement lengths
      diffLength <- nchar(row$currentValue) - (row$end - row$start + 1)

      #update rows in targetTags that have additional tags on the same row
      #need to offset by the diffLength
      targetTags[subsequentRows,"start"] <- targetTags[subsequentRows,"start"] + diffLength
      targetTags[subsequentRows,"end"] <- targetTags[subsequentRows,"end"] + diffLength
    }
  }

  #refresh the initTags collection with the replaced values
  #need to dump the rownumber to align the data.frames
  # (templateTags doesn't have a rownumber field)
  targetTags$rownumber <- NULL
  initTags[targetRows, ] <- targetTags

  #return(initTags)
  #browser()
  return(initCollection)
}

#' Replace Body Tags
#'
#' To do: fill in some details
#'
#' @param bodySection character vector of body section of Mplus syntax
#' @param bodyTags collection of tags used inside of the template body
#' @param initCollection The list of all arguments parsed from the init section
#' @return Returns updated bodySection
#' @keywords internal
replaceBodyTags <- function(bodySection, bodyTags, initCollection) {
  if (length(bodySection) <= 0) stop("Empty body section")

  #need to ponder issues where a replaced tag still contains another tag

  #hmm, actually seems futile to do a replacement in the init section
  #these are already set by update values.... won't affect the body section

  # so we need to finalize the tag substitutions...
  # the idea is that we need to convert all tags to literals in the initCollection
  # once this is done, then we replace all deferred tags in the body section


  #don't update current values if initcollection value contains any tag
  #if so, replace at the last minute (check this in Init)

  #set a "deferred" status in currentValue if replacement contains tags
  targetTags <- with(bodyTags, bodyTags[tagType %in% c("simple", "iterator", "array"), ])
  targetTags$rownumber <- 1:nrow(targetTags)

  #print(targetTags)
  #stop("test")

  #could improve this by replacing identical tags at once
  #like ddply by the tag

  for (i in 1:nrow(targetTags)) {
    row <- targetTags[i, ]
    stringToChange <- bodySection[row$element]

    if (row$start > 1) {
      preTag <- substr(stringToChange, 1, row$start-1)
    } else {
      preTag <- ""
    }

    if (row$end < nchar(stringToChange)) {
      postTag <- substr(stringToChange, row$end+1, nchar(stringToChange))
    } else {
      postTag <- ""
    }

    #lookup value as needed
    if (is.na(row$currentValue)) {
      row$currentValue <- lookupValue(row$tag, row$tagType, initCollection)
    }
    #row$currentValue <- lookupValue(row$tag, row$tagType, initCollection)

    bodySection[row$element] <- paste0(preTag, row$currentValue, postTag)

    #need to offset subsequent start/stops by the difference between the tag and replacement lengths
    diffLength <- nchar(row$currentValue) - (row$end - row$start + 1)

    subsequentRows <- which(
      targetTags$rownumber > i &
        targetTags$element == row$element)

    if (length(subsequentRows > 0)) {
      #update rows in targetTags that have additional tags on the same row
      #need to offset by the diffLength
      targetTags[subsequentRows,"start"] <- targetTags[subsequentRows,"start"] + diffLength
      targetTags[subsequentRows,"end"] <- targetTags[subsequentRows,"end"] + diffLength
    }
  }

  return(bodySection)
}

#' Lookup values
#'
#' To do: fill in some details
#'
#' @param tag name of tag for which we want to know the current value
#' @param tagType type of tag (simple, array, etc.) for the tag to lookup
#' @param initCollection The list of all arguments parsed from the init section
#' @return Current value
#' @keywords internal
lookupValue <- function(tag, tagType, initCollection) {
  #redundant with finalize code... re-use
  if (missing(tag)) stop("No tag provided")
  if (missing(tagType)) stop("No tag type provided")

  if (tagType == "simple") {
    return(eval(parse(text=paste0("initCollection$", tag))))
  }
  else if (tagType == "array") {
    split <- strsplit(tag, split="#", fixed=TRUE)[[1]]
    if (length(split) != 2) stop("array tag missing iterator: ", row$tag)

    #find where in the iterator depth this iterator lies
    #iteratorPosition <- grep(paste("\\b", split[2], "\\b", sep=""), initCollection$iterators, perl=T)

    #use named array look-up
    iteratorPosition <- initCollection$curItPos[split[2]]

    #note that the padding performed by processInit should handle non-contiguous iteratorPosition values here.
    currentValue <- eval(parse(text=paste0("initCollection$", split[1], "[", iteratorPosition, "]")))

    if (is.null(currentValue)) {
      stop("When replacing tag: ", row$tag, ", could not find corresponding value.")
    }

    return(currentValue)
  }
}

#' Finalize Init Collection
#'
#' this function should handle initTags that still contain tags
#' once the initCollection is finalized, then process the deferred body tags
#' the notion is that the substitutions will be handled in an inefficient manner -- using lots
#' of regular expression parsing, not using the matched tags data.frame
#'
#' we only need to handle simple and array tags
#' iterators should always be integers
#' foreach and conditional are not relevant
#'
#' iterate over init tags until no tags are left
#' here, the init collection should already have had most of its tags substituted by
#' replaceInitTags above.
#'
#' @param initCollection The list of all arguments parsed from the init section
#' @return Finalized initCollection
#' @keywords internal
finalizeInitCollection <- function(initCollection) {
  tagsRemain <- TRUE
  numIterations <- 1
  while(tagsRemain) {
    initTags <- getInitTags(initCollection)

    if (nrow(initTags) == 0) break #if no tags found, then substitution complete

    #update: iterator tags can be nested within other tag types and not updated until here.
    initTags <- with(initTags, initTags[tagType %in% c("simple", "iterator", "array"),])
    if (nrow(initTags) == 0) break #some tags, but none of the simple or array variety, which we want to replace

    #use plyr's splitter_a function to divide dataset by row (builds a big list)
    #divideByRow <- plyr:::splitter_a(initTags, 1)
    divideByRow <- splitDFByRow(initTags)

    #for each element of the list, check for a match with this iterator and return the value of interest
    initTags$currentValue <- unlist(sapply(divideByRow,
                                           function(row) {
                                             if (row$tagType == "simple") {
                                               return(eval(parse(text=paste0("initCollection$", row$tag))))
                                             }
                                             else if (row$tagType == "iterator") {
                                               #an iterator tag was nested
                                               return(initCollection$curItPos[row$tag])
                                             }
                                             else if (row$tagType == "array") {
                                               split <- strsplit(row$tag, split="#", fixed=TRUE)[[1]]
                                               if (length(split) != 2) stop("array tag missing iterator: ", row$tag)

                                               #find where in the iterator depth this iterator lies
                                               #iteratorPosition <- grep(paste("\\b", split[2], "\\b", sep=""), initCollection$iterators, perl=T)

                                               #use named array look-up
                                               iteratorPosition <- initCollection$curItPos[split[2]]

                                               currentValue <- eval(parse(text=paste0("initCollection$", split[1], "[", iteratorPosition, "]")))

                                               if (is.null(currentValue)) {
                                                 stop("When replacing tag: ", row$tag, ", could not find corresponding value.")
                                               }

                                               return(currentValue)
                                             }
                                           }
    ))

    #now we have a list of curent values for any init tags
    #and we want to update the init collection with their values... just as with above.
    initCollection <- replaceInitTags(initTags, initCollection)

    numIterations <- numIterations + 1
    if (numIterations > 20) stop("While replacing tags in init section, looped over variables 20 times without completing substitutions.\n  Check for circular definitions within init section.")
  }

  #browser()
  return(initCollection)
}

#' Evaluate Conditional
#'
#' Note that at thie point the comparator must be a number (not another variable).
#'
#' @param tag A tag
#' @param initCollection The list of all arguments parsed from the init section
#' @return A boolean value indicating whether the conditional is true
#' @keywords internal
evaluateConditional <- function(tag, initCollection) {
  #evaluate whether tag is true
  #first divide up into name, operator, and value
  regexp <- "(\\w+)\\s*([!><=]+)\\s*(\\w+)"
  conditional <- unlist(strapply(tag, regexp, c))

  if (length(conditional) < 3) {
    stop("Error in conditional tag: does not contain variable, operator, and value. Tag = ", tag)
  }

  #convert simple equals to logical equals
  if (conditional[2] == "=") conditional[2] <- "=="

  #obsolete b/c using named array
  #iteratorPosition <- grep(paste("\\b", conditional[1], "\\b", sep=""), initCollection$iterators, perl=T)

  #return a boolean value indicating whether the conditional is true
  return(eval(parse(text=paste0("initCollection$curItPos[conditional[1]]", conditional[2], conditional[3]))))
}

#' Clip String
#'
#' To do: add any details.
#'
#' @param string A string to be clipped
#' @param start The character position to start at
#' @param end  The character position to end at
#' @return A string from start to end
#' @keywords internal
clipString <- function(string, start, end) {
  #if the string is shorter than the length of the clip, then nothing remains
  if (nchar(string) <= end-start+1) return("")

  if(start > 1) preString <- substr(string, 1, start-1)
  else preString <- ""

  if(end < nchar(string)) postString <- substr(string, end+1, nchar(string))
  else postString <- ""

  return(paste0(preString, postString))
}

#' Process Conditional Tags
#'
#' To do: add details.
#'
#' @param templateTags A template tag
#' @param initCollection The list of all arguments parsed from the init section
#' @return Processed templateTags
#' @keywords internal
processConditionalTags <- function(templateTags, initCollection) {
  #require(gsubfn) #moving to import strategy
  #find all conditional tags in the body section and remove them from the templateTags and bodyText pieces...

  conditionalTagIndices <- which(templateTags$bodyTags$tagType=="conditional")

  #return templateTags unharmed if there are no conditional tags (creates error below otherwise)
  if (length(conditionalTagIndices) == 0) return(templateTags)

  openClose <- ifelse(substr(templateTags$bodyTags$tag[conditionalTagIndices], 1, 1)=="/", "close", "open")
  allOpen <- conditionalTagIndices[openClose=="open"]

  bodyTagsToDrop <- c()
  bodyLinesToDrop <- c()
  for (i in allOpen) {
    #should be able to decide whether to skip an iteration if the affected lines are already in bodyLinesToDrop
    thisTag <- templateTags$bodyTags$tag[i]

    #evaluate truth of conditional
    conditionalTrue <- evaluateConditional(thisTag, initCollection)

    #only look for closing tags after the opening and accept the first exact match
    close <- conditionalTagIndices[
      templateTags$bodyTags$tag[conditionalTagIndices] == paste0("/", thisTag) &
        templateTags$bodyTags$element[conditionalTagIndices] >= templateTags$bodyTags$element[i]][1]

    sameLine <- FALSE
    #in case of same line match, check to make sure close follows opening on that line
    #the conditions above could match when a closing tag precedes opening tag on the same line
    if (templateTags$bodyTags$element[close]==templateTags$bodyTags$element[i]) {
      sameLine <- TRUE

      close <- conditionalTagIndices[
        openClose == "close" &
          templateTags$bodyTags$tag[conditionalTagIndices] == paste0("/", thisTag) &
          templateTags$bodyTags$element[conditionalTagIndices] == templateTags$bodyTags$element[i] &
          templateTags$bodyTags$start[conditionalTagIndices] > templateTags$bodyTags$end[i]][1]

      if (!close > 0) stop("Could not find closing tag for conditional:", thisTag)
    }

    #skip this iteration if the opening and closing tags in question are already in the drop pile
    #these lines (and the lines between, if necessary) will already be dropped, so don't process
    if (templateTags$bodyTags$element[i] %in% bodyLinesToDrop &&
        templateTags$bodyTags$element[close] %in% bodyLinesToDrop) next

    #first check for tags to drop from the bodyTags collection (don't want these parsed later)
    if (conditionalTrue) {
      #only remove starting and ending tags
      bodyTagsToDrop <- c(bodyTagsToDrop, i, close)
    } else {
      #if conditional false, then remove all tags between conditional tags
      #first, dump all lines in the bodyTags section that fall between elements
      bodyTagsToDrop <- c(bodyTagsToDrop, i:close)

      #conditional is not true
      #so dump the tags and all space between
      #really, the only difference here from the calculation below is that
      #bodyLinesToDrop should encompass the space between opening and closing
      #and the clips below should dump the rest of the line when multiple tags on same line
      #no need to rewrite code for clipping out tags
      #don't clip the tag lines themselves because this is handled below (whole line goes if nchar <= 0)
      #print(bodyLinesToDrop)
      #browser()

      #only drop lines between matching open/close tags if not on the same line
      #otherwise, the clipping code below handles everything correctly
      #if on the same line, then element + 1:close - 1 will lead to something like 58:56, which is bad
      if (!sameLine) {
        bodyLinesToDrop <- c(bodyLinesToDrop,
                             (templateTags$bodyTags$element[i]+1):(templateTags$bodyTags$element[close]-1))
      }
    }

    #then dump lines from the syntax section itself
    #handle same line issues, then delete whole lines between tags
    #as with replaceTags substitution, need to handle situation where tag is on line with other stuff
    #thus, need to update bodyTags collection, too to reflect new start/stop positions

    #when the conditional is true, just remove the tags and leave the syntax
    #dump the opening tag on the line

    #if the conditional is true, just use the last pos of the opening tag for the clip
    if (conditionalTrue) endPos <- templateTags$bodyTags$end[i]
    #want to clip the rest of the line
    else if (!conditionalTrue && sameLine == FALSE) endPos <- nchar(templateTags$bodyText[templateTags$bodyTags$element[i]])
    #just clip anything between open tag and first element of close tag (close tag itself handled by code below)
    else if (!conditionalTrue && sameLine == TRUE) endPos <- templateTags$bodyTags$start[close] - 1

    templateTags$bodyText[templateTags$bodyTags$element[i]] <- clipString(
      templateTags$bodyText[templateTags$bodyTags$element[i]],
      templateTags$bodyTags$start[i], endPos)

    if (nchar(trimSpace(templateTags$bodyText[templateTags$bodyTags$element[i]])) <= 0) {
      #no characters remain, so dump line
      bodyLinesToDrop <- c(bodyLinesToDrop, templateTags$bodyTags$element[i])
    } else {
      #if there is other text on this line, it may contain tags that need to be adjusted given the clip
      subsequentTags <- which(
        templateTags$bodyTags$element == templateTags$bodyTags$element[i] &
          templateTags$bodyTags$start > endPos)

      if (length(subsequentTags > 0)) {
        #calculate length of opening tag
        openLength <- endPos - templateTags$bodyTags$start[i] + 1
        templateTags$bodyTags[subsequentTags,"start"] <- templateTags$bodyTags[subsequentTags,"start"] - openLength
        templateTags$bodyTags[subsequentTags,"end"] <- templateTags$bodyTags[subsequentTags,"end"] - openLength
        #print("openlength")
        #browser()
      }
    }

    #okay, we've handled issues related to the opening tag, now handle closing tag
    #for the closing tag, just need to clip the tag itself (spacing handled above)
    templateTags$bodyText[templateTags$bodyTags$element[close]] <- clipString(
      templateTags$bodyText[templateTags$bodyTags$element[close]],
      templateTags$bodyTags$start[close],
      templateTags$bodyTags$end[close])

    if (nchar(trimSpace(templateTags$bodyText[templateTags$bodyTags$element[close]])) <= 0) {
      #no characters remain, so dump line
      bodyLinesToDrop <- c(bodyLinesToDrop, templateTags$bodyTags$element[close])
    } else {
      #only look for additional tags if nchar > 0
      #redundant code with above... must be a way to consolidate
      #if there is other text on then end line, it may contain tags that need to be adjusted given the clip
      subsequentTags <- which(
        templateTags$bodyTags$element == templateTags$bodyTags$element[close] &
          templateTags$bodyTags$start > templateTags$bodyTags$end[close])

      if (length(subsequentTags > 0)) {
        closeLength <- templateTags$bodyTags$end[close] - templateTags$bodyTags$start[close] + 1
        templateTags$bodyTags[subsequentTags,"start"] <- templateTags$bodyTags[subsequentTags,"start"] - closeLength
        templateTags$bodyTags[subsequentTags,"end"] <- templateTags$bodyTags[subsequentTags,"end"] - closeLength
        #print("closelength")
        #browser()
      }
    }
  }


  #print(bodyLinesToDrop)
  #print(bodyTagsToDrop)

  #drop all bad body lines

  #only keep unique bodyTagsToDrop (and sort for clarity in debugging)
  #hard to imagine that bodyTagsToDrop could be NULL at this point (given the return when no conditional tags above)
  #but if it were NULL, the bodyTags collection would be dumped by the NULL*-1 evaluation

  if (!is.null(bodyTagsToDrop)) {
    bodyTagsToDrop <- sort(unique(bodyTagsToDrop))
    templateTags$bodyTags <- templateTags$bodyTags[bodyTagsToDrop*-1, ]
  }

  #need to check whether bodyLinesToDrop is NULL. If it is, then we must not attempt the subset
  #(it will delete the whole character vector)
  if (!is.null(bodyLinesToDrop)) {
    #only retain unique bodyLinesToDrop (in theory handled by the "next" code above, but good to be safe)
    bodyLinesToDrop <- sort(unique(bodyLinesToDrop))
    templateTags$bodyText <- templateTags$bodyText[bodyLinesToDrop*-1]

    #need to move up the line markers in the bodyTags collection based on the lines dropped
    templateTags$bodyTags <- ddply(templateTags$bodyTags, "element", function(subDF) {
      numMoveUp <- length(which(bodyLinesToDrop < subDF$element[1]))
      subDF$element <- subDF$element - numMoveUp
      return(subDF)
    })
  }
  return(templateTags)
}


#' Process the Init Section
#'
#' To do: add details.
#'
#' @param initsection The list of all arguments parsed from the init section
#' @return arglist
#' @importFrom gsubfn strapply
#' @keywords internal
processInit <- function(initsection) {

  #combine multi-line statements by searching for semi-colon
  assignments <- grep("^\\s*.+\\s*=", initsection, perl=TRUE)

  #check for valid variable names
  valid <- grep("^\\s*[A-Za-z\\.]+[\\w\\.#]*\\s*=", initsection[assignments], perl=TRUE)

  if (length(valid) < length(assignments)) {
    badvars <- initsection[assignments[which(!1:length(assignments) %in% valid)]]
    stop(paste(c("Invalid variable definitions in init section.",
                 "Variables must begin with a letter or a period.",
                 "Variables may contain only the following characters: letters, numbers, underscores, periods, and a single pound sign for list variables.",
                 "Problematic variable(s):", badvars), collapse="\n  "))
  }

  #preallocate vector of strings to process
  argstoprocess <- vector("character", length(assignments))

  #loop through each line containing an assignment
  for (i in 1:length(assignments)) {
    argstoprocess[i] = initsection[assignments[i]]

    #if line does not terminate in semicolon, then read subsequent lines until semicolon found
    #start file position at n+1 line
    filepos = assignments[i] + 1
    while (length(grep(";\\s*$", argstoprocess[i], perl=TRUE)) != 1) {
      #cat("multi-line: ", unlist(argstoprocess[i]), fill=T)
      argstoprocess[i] = paste(argstoprocess[i], initsection[filepos])
      filepos = filepos + 1
    }
  }

  #will return a list (one element per argstoprocess) with a three-element vector (name, iterator, value)
  #note that the regexp implicitly dumps the semicolon and any trailing spaces
  arglist <- strapply(argstoprocess, "^\\s*(\\w+[\\w\\.]*)(#[\\w\\.]+)?\\s*=\\s*(.+);\\s*$",
                      function(name, iterator, value) {
                        return(c(name, iterator, value))
                      }, perl=TRUE)

  #copy the first element (name) of each vector into the list names
  names(arglist) <- make.names(sapply(arglist, '[', 1))

  #1. parse values into vectors according to spaces and quotes
  #2. add iterator attribute to be processed after iterators are setup below
  #3. implicitly drop name by not including element[1]
  arglist <- lapply(arglist, function(element) {
    output <- friendlyGregexpr("(\"[^\"]*\"|[^\\s]+)", element[3])$tag
    output <- gsub("\"", "", output)

    #the regexp above matches the # itself.
    #need to trim off in cases where iterator defined
    if (nchar(element[2]) > 0) {
      element[2] <- substr(element[2], 2, nchar(element[2]))
    }
    attr(output, "iterator") <- element[2]

    return(output)
  })


  if (is.null(arglist$iterators)) {
    stop("No iterators in init section. Cannot process template.")
  }

  #convert iterators from string to list
  #arglist$iterators <- unlist(strsplit(as.character(arglist$iterate_wrt), "\\s*,\\s*", perl=T))

  #process sequence text for each iterator
  for (thisIt in arglist$iterators) {
    if (is.null(arglist[[thisIt]])) {
      stop("Variable specified in iterators list, but not defined: ", thisIt)
    }

    #expand colon notation as needed
    #use do.call to combine elements of list returned by lapply
    #if there are many elements (e.g., 1 3 5), then lapply returns an element for each
    #one, but we just want a combined array. In the case of colon expansion, want to c that together
    #with any other elements... Maybe in the future when we support non-contiguous iterators.

    arglist[[thisIt]] <- do.call("c", lapply(arglist[[thisIt]], function(x) {
      if (length(grep(":", x)) > 0) {
        return(strapply(x, "(\\d+)\\s*:\\s*(\\d+)", function(start, stop) return(start:stop))[[1]])
      } else {
        return(as.numeric(x))
      }
    }))

    #sort as ascending and only keep unique values
    if (length(unique(arglist[[thisIt]])) < length(arglist[[thisIt]])) {
      stop("Problem with iterator: ", thisIt, "\n  Non-unique values specified: ",
           paste(arglist[[thisIt]], collapse=", "))
    }

    arglist[[thisIt]] <- sort(unique(arglist[[thisIt]]))
  }

  #now that iterators are defined, ensure that list tags match
  #pad vectors accordingly

  arglist <- lapply(arglist, function(element) {
    #if the iterator is defined, then this is a list tag
    #need to make sure it is properly padded
    iteratorAttr <- attr(element, "iterator")
    if (!is.null(iteratorAttr) && nchar(iteratorAttr) > 0) {
      iteratorValues <- arglist[[iteratorAttr]]

      #make sure that the length of the values vector
      #matches the length of the iterator vector
      if (length(element) != length(iteratorValues)) {
        stop("Variable locked to iterator: ", iteratorAttr,
             ", but has different length.\n  Values: ",
             paste(element, collapse=", "),
             "\n  Should be length: ", length(iteratorValues))
      }

      if (length(element) < max(iteratorValues)) {
        #pad
        updatedElement <- c()
        listElement <- 1
        #build a vector of the same length as the max of the iterator
        #only insert list values for defined indices. Otherwise pad
        for (i in 1:max(iteratorValues)) {
          if (i %in% iteratorValues) {
            updatedElement[i] <- element[listElement]
            listElement <- listElement + 1
          } else {
            updatedElement[i] <- ""
          }
        }
        element <- updatedElement
        attr(element, "iterator") <- iteratorAttr #re-add attribute
      }
    }

    return(element)
  })


  #default output directory to the current directory
  if (is.null(arglist$outputDirectory)) {
    warning("No output directory specified. Defaulting to the current directory.")
    arglist$outputDirectory <- getwd()
  }
  if (is.null(arglist$filename)) {
    stop("No definition provided for the output filename. The filename definition is required.")
  }

  return(arglist)
}


#' Create Mplus syntax for variable names
#'
#' This is a simple function designed to take a dataset in \code{R}
#' and translate it into a set of variable names for Mplus.
#'
#' @param data An \code{R} dataset.
#' @return A character vector of the variable names for Mplus
#' @keywords internal
#' @seealso \code{\link{prepareMplusData}}
#' @examples
#' MplusAutomation:::createVarSyntax(mtcars)
createVarSyntax <- function(data) {
  #variable created for readability
  variableNames <- paste(gsub("\\.", "_", colnames(data)), collapse=" ")

  vnames <- paste(strwrap(paste(c("NAMES = ", variableNames, ";"), collapse = ""),
                          width=85, exdent=5), collapse="\n")
  vnames[length(vnames)] <- paste(vnames[length(vnames)], "\n", collapse="")

  return(vnames)
}

#' Prepare Mplus Data Matrix
#'
#' support writing of covariance or means + covariance matrix (future)
#'
#' @param covMatrix The covariance matrix
#' @param meansMatrix The means matrix
#' @param nobs Number of observations for the data
#' @return A dataset
#' @keywords internal
prepareMplusData_Mat <- function(covMatrix, meansMatrix, nobs) {

}

#' Clean data and calculate the md5 hash
#'
#' Internal utility function, primarily for \code{prepareMplusData}.
#'
#' @param df The R data.frame to be prepared for Mplus
#' @param keepCols A character vector specifying the variable names
#'   within \code{df} to be output to \code{filename} or a numeric
#'   vector of the column indices to be output or a logical vector
#'   corresponding to the same.
#' @param dropCols A character vector specifying the variable names
#'   within \code{df} to be omitted from the data output to \code{filename}
#'   or a numeric vector of the column indices not to be output
#'   or a logical vector corresponding to the same.
#' @param imputed A logical whether data are multiply imputed.  Defaults
#'   to \code{FALSE}.  If \code{TRUE}, the data should be a list,
#'   where each element of the list is a multiply imputed dataset.
#' @return A list of the data and the md5 hash.
#' @keywords internal
#' @importFrom digest digest
#' @rdname MplusAutomationUtils
#' @examples
#' \dontrun{
#'
#' ## basic example
#' MplusAutomation:::.cleanHashData(mtcars)
#'
#' ## has changes when data changes
#' MplusAutomation:::.cleanHashData(mtcars[-15,])
#'
#' ## example on a list (e.g., for multiply imputed data)
#'
#' MplusAutomation:::.cleanHashData(
#'  list(
#'    data.frame(a = 1:4),
#'    data.frame(a = c(2, 2, 3, 4))),
#'   imputed = TRUE)
#'
#' }
.cleanHashData <- function(df, keepCols, dropCols, imputed=FALSE) {
  if (imputed) {
    stopifnot(inherits(df, "list"))
  } else {
    stopifnot(inherits(df, "data.frame"))
  }

  ## only allow keep OR drop.
  if(!missing(keepCols) && !missing(dropCols) && length(keepCols) && length(dropCols)) {
    stop("keepCols and dropCols passed. You must choose one or the other, but not both.")
  }

  ## assert types allowed for keep and drop cols
  stopifnot(missing(keepCols) || is.character(keepCols) ||
              is.numeric(keepCols) || is.logical(keepCols))

  stopifnot(missing(dropCols) || is.character(dropCols) ||
              is.numeric(dropCols) || is.logical(dropCols))


  ## keep only columns specified by keepCols
  if (!missing(keepCols) && length(keepCols) > 0) {
    if (imputed) {
      df <- lapply(df, function(d) d[, keepCols, drop = FALSE])
    } else {
      df <- df[, keepCols, drop = FALSE] # works with all types
    }
  }

  ## drop columns specified by dropCols
  if (!missing(dropCols) && length(dropCols) > 0) {
    if (is.character(dropCols)) {
      if (imputed) {
        df <- lapply(df, function(d) {subset(d, select = -which(colnames(d) %in% dropCols))})
      } else {
        df <- subset(df, select = -which(colnames(df) %in% dropCols))
      }
    } else if (is.numeric(dropCols)) {
      if (imputed) {
        df <- lapply(df, function(d) {subset(d, select = -dropCols)})
      } else {
        df <- subset(df, select = -dropCols)
      }
    } else if (is.logical(dropCols)) {
      if (imputed) {
        df <- lapply(df, function(d) {subset(d, select = !dropCols)})
      } else {
        df <- subset(df, select = !dropCols)
      }
    }
  }

  f <- function(x) {
    as.vector(c(
      dim(x),
      unlist(lapply(x, class)),
      unlist(dimnames(x)),
      as.character(unlist(x[c(1, nrow(x)), ]))))
  }

  if (imputed) {
    hash <- lapply(seq_along(df), function(i) digest(f(df[[i]]), "md5"))
  } else {
    hash <- digest(f(df), "md5")
  }

  return(list(data = df, md5 = hash))
}

#' Check if a file exists with a given hash and add a hash to an existing filename
#'
#' Internal utility function, primarily for \code{prepareMplusData}.
#'
#' @param filename A character vector containing the filename
#' @param hash A character vector with the hash to use
#' @param useexisting A logical whether to use an existing file name
#'   if one is found containing the hash.  Defaults to \code{FALSE}
#'   in which case the hash is added to the user specified filename
#' @return A list of the filename (plus hash) and a logical value
#'   whether a filename with the hash already existed or not.
#' @keywords internal
#' @rdname MplusAutomationUtils
#' @examples
#' MplusAutomation:::.hashifyFile("testit.dat", "abc")
.hashifyFile <- function(filename, hash, useexisting = FALSE) {
  fileonly <- basename(filename)
  allfiles <- list.files(path = dirname(filename))
  existingfile <- grep(hash, allfiles, value=TRUE)[1]

  fileexists <- length(existingfile) && !isTRUE(is.na(existingfile))

  if (fileexists && useexisting) {
    filename <- gsub(basename(filename), basename(existingfile), filename)
  } else {
    filename <- gsub("\\.dat$", paste0("_", hash, ".dat"), filename)
  }
  list(filename = filename, fileexists = fileexists)
}




#' Convert a matrix or data frame to numeric or integer for Mplus
#'
#' Primarily an internal utility function, for \code{prepareMplusData}.
#'
#' @param df A matrix or data frame
#' @return An error if it cannot be converted or
#'   a matrix or data frame with all variables converted to
#'   numeric or integer classes
#' @importFrom data.table is.data.table
#' @keywords internal
#' @examples
#'
#' \dontrun{
#' df1 <- df2 <- df3 <- df4 <- mtcars
#'
#' df2$cyl <- factor(df2$cyl)
#' df2$am <- as.logical(df2$am)
#'
#' df3$mpg <- as.character(df3$mpg)
#'
#' df4$vs <- as.Date(df4$vs, origin = "1989-01-01")
#'
#' df5 <- as.matrix(cars)
#'
#' df6 <- matrix(c(TRUE, TRUE, FALSE, FALSE), ncol = 2)
#'
#' df7 <- as.list(mtcars)
#'
#'
#' MplusAutomation:::.convertData(df1)
#'
#' MplusAutomation:::.convertData(df2)
#'
#' MplusAutomation:::.convertData(df3)
#'
#' MplusAutomation:::.convertData(df4)
#'
#' MplusAutomation:::.convertData(df5)
#'
#' MplusAutomation:::.convertData(df6)
#'
#' MplusAutomation:::.convertData(df7)
#'
#' rm(df1, df2, df3, df4, df5, df6, df7)
#' }
.convertData <- function(df) {
  if (isTRUE(is.matrix(df))) {
    if (isTRUE(is.numeric(df)) || isTRUE(is.integer(df))) {
      NULL ## do nothing
    } else if (isTRUE(is.logical(df))) {
      message("Logical matrix converted to integer")
      storage.mode(df) <- "integer"
    } else {
      stop(paste(
        "\nIf data are passed as a matrix, must be of class",
        "numeric, integer, or logical, but data was of class",
        class(df[,1]),
        sep = "\n"))
    }

    if (isTRUE(is.null(colnames(df)[1]))) {
      message("no variable names, setting to V1, V2, etc")
      colnames(df) <- paste0("V", 1:ncol(df))
    }
  } else if (isTRUE(is.data.frame(df))) {
    if (isTRUE(is.data.table(df))) {
      df <- as.data.frame(df)
    }

    col_logical <- vapply(df, is.logical, FUN.VALUE = NA)
    col_numeric <- vapply(df, is.numeric, FUN.VALUE = NA) | vapply(df, is.integer, FUN.VALUE = NA)
    col_factor <- vapply(df, is.factor, FUN.VALUE = NA)

    col_class <- vapply(df, class, FUN.VALUE = NA_character_)

    ok_cols <- col_logical | col_numeric | col_factor

    if (!all(ok_cols)) {
      stop(paste("\nCurrently only variables of class: ",
                 "numeric, integer, logical, or factor",
                 "are allowed but found additional class types including: ",
                 paste(unique(col_class[!ok_cols]), collapse = ", "),
                 "\nto see which variables are problematic, try:",
                 "str(yourdata)",
                 sep = "\n"))
    }

    factor_cols <- which(col_factor)
    if (isTRUE(length(factor_cols) > 0)) {
      for (i in factor_cols) {
        message("Factor variable: ", names(df)[i], "; factor levels:",
                paste(levels(df[,i]), collapse=", "), "\nconverted to numbers: ",
                paste(seq_along(levels(df[,i])), collapse=", "), "\n")
        df[[i]] <- as.numeric(df[[i]])
      }
    }

    logical_cols <- which(col_logical)
    if (isTRUE(length(logical_cols) > 0)) {
      for (i in logical_cols) {
        message("Logical variable: ", names(df)[i], " converted to integer")
        df[[i]] <- as.integer(df[[i]])
      }
    }
  } else {
    stop (paste(
      "\nCan only convert matrix or data frame class ",
      "data but found data of class:",
      class(df),
      sep = "\n"))
  }
  return(df)
}





#' Create tab-delimited file and Mplus input syntax from R data.frame
#'
#' The \code{prepareMplusData} function converts an R data.frame
#' (or a list of data frames), into a tab-delimited
#' file (without header) to be used in an Mplus
#' input file. The corresponding Mplus syntax, including the
#' data file definition and variable names,
#' is printed to the console or optionally to an input file.
#'
#' The \code{writeData} argument is new and can be used to reduce overhead
#' from repeatedly writing the same data from R to the disk.  When using the
#' \sQuote{always} option, \code{prepareMplusData} behaves as before, always writing
#' data from R to the disk.  When \sQuote{ifmissing}, R generates an
#' md5 hash of the data prior to writing it out to the disk.  The md5 hash is based on:
#' (1) the dimensions of the dataset, (2) the variable names,
#' (3) the class of every variable, and (4) the raw data from the first and last rows.
#' This combination ensures that under most all circumstances, if the data changes,
#' the hash will change.  The hash is appended to the specified data file name
#' (which is controlled by the logical \code{hashfilename} argument).  Next R
#' checks in the directory where the data would normally be written.  If a data file
#' exists in that directory that matches the hash generated from the data, R will
#' use that existing data file instead of writing out the data again.
#' A final option is \sQuote{never}.  If this option is used, R will not write
#' the data out even if no file matching the hash is found.
#'
#' @param df The R data.frame to be prepared for Mplus
#' @param filename The path and filename for the tab-delimited data file
#'   for use with Mplus. Example: "C:/Mplusdata/data1.dat"
#' @param keepCols A character vector specifying the variable names
#'   within \code{df} to be output to \code{filename} or a numeric
#'   vector of the column indices to be output or a logical vector
#'   corresponding to the same.
#' @param dropCols A character vector specifying the variable names
#'   within \code{df} to be omitted from the data output to \code{filename}
#'   or a numeric vector of the column indices not to be output
#'   or a logical vector corresponding to the same.
#' @param inpfile Logical value whether the Mplus syntax should be written
#'   to the console or to an input file. Defaults to \code{FALSE}. If
#'   \code{TRUE}, the file name will be the same as \code{filename} with
#'   the extension changed to .inp.  Alternately, this can be a character
#'   string giving the file name to write the Mplus syntax to.
#' @param interactive Logical value indicating whether file names
#'   should be selected interactively. If \code{filename} is
#'   missing and \code{interative=TRUE}, then a dialogue box
#'   will pop up to select a file or a console prompt if in a
#'   non interactive context. Defaults to \code{TRUE}.
#' @param overwrite Logical value indicating whether
#'   data and input (if present) files should be overwritten.
#'   Defaults to \code{TRUE} to be consistent with prior behavior.
#'   If \code{FALSE} and the file to write the data to already exists,
#'   it will throw an error.
#' @param imputed A logical whether data are multiply imputed.  Defaults
#'   to \code{FALSE}.  If \code{TRUE}, the data should be a list,
#'   where each element of the list is a multiply imputed dataset.
#' @param writeData A character vector, one of \sQuote{always},
#'   \sQuote{ifmissing}, \sQuote{never} indicating whether the data files
#'   (*.dat) should be written to disk.  Defaults to
#'   \sQuote{always} for consistency with previous behavior.
#'   See details for further information.
#' @param hashfilename A logical whether or not to add a hash of the raw data to the
#'   data file name.  Defaults to \code{FALSE} for consistency with previous
#'   behavior where this feature was not available..
#' @return Invisibly returns a character vector of the Mplus input
#'   syntax. Primarily called for its side effect of creating Mplus
#'   data files and optionally input files.
#' @keywords interface
#' @importFrom utils write.table
#' @importFrom data.table fwrite
#' @author Michael Hallquist
#' @export
#' @examples
#' \dontrun{
#' library(foreign)
#'
#' study5 <- read.spss("reanalysis-study-5-mt-fall-08.sav", to.data.frame=TRUE)
#' ASData5 <- subset(study5, select=c("ppnum", paste("as", 1:33, sep="")))
#'
#' prepareMplusData(ASData5, "study5.dat")
#'
#'
#' # basic example
#' test01 <- prepareMplusData(mtcars, "test01.dat")
#'
#'
#'
#' # see that syntax was stored
#' test01
#'
#' # example when there is a factor and logical
#' tmpd <- mtcars
#' tmpd$cyl <- factor(tmpd$cyl)
#' tmpd$am <- as.logical(tmpd$am)
#' prepareMplusData(tmpd, "test_type.dat")
#' rm(tmpd)
#'
#' # by default, if re-run, data is re-written, with a note
#' test01b <- prepareMplusData(mtcars, "test01.dat")
#'
#' # if we turn on hashing in the filename the first time,
#' # we can avoid overwriting notes the second time
#' test01c <- prepareMplusData(mtcars, "test01c.dat", hashfilename=TRUE)
#'
#' # now that the filename was hashed in test01c, future calls do not re-write data
#' # as long as the hash matches
#' test01d <- prepareMplusData(mtcars, "test01c.dat",
#'   writeData = "ifmissing", hashfilename=TRUE)
#'
#' # now that the filename was hashed in test01c, future calls do not re-write data
#' # as long as the hash matches
#' test01db <- prepareMplusData(mtcars, "test01d.dat",
#'   writeData = "ifmissing", hashfilename=TRUE)
#'
#' # however, if the data change, then the file is re-written
#' test01e <- prepareMplusData(iris, "test01c.dat",
#'   writeData = "ifmissing", hashfilename=TRUE)
#'
#' # tests for keeping and dropping variables
#' prepareMplusData(mtcars, "test02.dat", keepCols = c("mpg", "hp"))
#' prepareMplusData(mtcars, "test03.dat", keepCols = c(1, 2))
#' prepareMplusData(mtcars, "test04.dat",
#'   keepCols = c(TRUE, FALSE, FALSE, TRUE, FALSE,
#'   FALSE, FALSE, FALSE, FALSE, FALSE, FALSE))
#'
#' prepareMplusData(mtcars, "test05.dat", dropCols = c("mpg", "hp"))
#' prepareMplusData(mtcars, "test06.dat", dropCols = c(1, 2))
#' prepareMplusData(mtcars, "test07.dat",
#'   dropCols = c(TRUE, FALSE, FALSE, TRUE, FALSE,
#'   FALSE, FALSE, FALSE, FALSE, FALSE, FALSE))
#'
#'
#' # interactive (test08.dat)
#' prepareMplusData(mtcars, interactive=TRUE)
#'
#' # write syntax to input file, not stdout
#' prepareMplusData(mtcars, "test09.dat", inpfile=TRUE)
#'
#' # write syntax to alternate input file, not stdout
#' prepareMplusData(mtcars, "test10.dat", inpfile="test10alt.inp")
#'
#' # should be error, no file
#' prepareMplusData(mtcars, interactive=FALSE)
#'
#' # new warnings if it is going to overwrite files
#' # (the default to be consistent with prior behavior)
#' prepareMplusData(mtcars, "test10.dat")
#'
#' # new warnings if it is going to overwrite files
#' # (the default to be consistent with prior behavior)
#' prepareMplusData(mtcars, "test11.dat", inpfile="test10alt.inp")
#'
#' # new errors if files exist and overwrite=FALSE
#' prepareMplusData(mtcars, "test10.dat",
#'   inpfile="test10alt.inp", overwrite=FALSE)
#'
#'
#' # can write multiply imputed data too
#' # here are three "imputed" datasets
#' idat <- list(
#'   data.frame(mpg = mtcars$mpg, hp = c(100, mtcars$hp[-1])),
#'   data.frame(mpg = mtcars$mpg, hp = c(110, mtcars$hp[-1])),
#'   data.frame(mpg = mtcars$mpg, hp = c(120, mtcars$hp[-1])))
#'
#' # if we turn on hashing in the filename the first time,
#' # we can avoid overwriting notes the second time
#' testimp1 <- prepareMplusData(idat, "testi1.dat",
#'   writeData = "ifmissing", hashfilename=TRUE,
#'   imputed = TRUE)
#'
#' # now that the filename was hashed, future calls do not re-write data
#' # as long as all the hashes match
#' testimp2 <- prepareMplusData(idat, "testi2.dat",
#'   writeData = "ifmissing", hashfilename=TRUE,
#'   imputed = TRUE)
#'
#' # in fact, the number of imputations can decrease
#' # and they still will not be re-written
#' testimp3 <- prepareMplusData(idat[-3], "testi3.dat",
#'   writeData = "ifmissing", hashfilename=TRUE,
#'   imputed = TRUE)
#'
#' # however, if the data changes, then all are re-written
#' # note that it warns for the two files that already exist
#' # as these two are overwritten
#'
#' idat2 <- list(
#'   data.frame(mpg = mtcars$mpg, hp = c(100, mtcars$hp[-1])),
#'   data.frame(mpg = mtcars$mpg, hp = c(109, mtcars$hp[-1])),
#'   data.frame(mpg = mtcars$mpg, hp = c(120, mtcars$hp[-1])))
#' testimp4 <- prepareMplusData(idat2, "testi4.dat",
#'   writeData = "ifmissing", hashfilename=TRUE,
#'   imputed = TRUE)
#'
#'
#' }
prepareMplusData <- function(df, filename, keepCols, dropCols, inpfile=FALSE,
                             interactive=TRUE, overwrite=TRUE, imputed=FALSE,
                             writeData=c("always", "ifmissing", "never"), hashfilename=FALSE) {

  writeData <- match.arg(writeData)

  ## message and then exit function if never write data
  if (identical(writeData, "never")) {
    message("No action taken as writeData = 'never'")
    return(invisible(""))
  }

  if (!hashfilename && identical(writeData, "ifmissing")) {
    writeData <- "always"
    message("When hashfilename = FALSE, writeData cannot be 'ifmissing', setting to 'always'")
  }

  if (missing(keepCols)) {
    if (missing(dropCols)) {
      cleand <- .cleanHashData(df = df, imputed = imputed)
    } else {
      cleand <- .cleanHashData(df = df, dropCols = dropCols, imputed = imputed)
    }
  } else {
    if (missing(dropCols)) {
      cleand <- .cleanHashData(df = df, keepCols = keepCols, imputed = imputed)
    } else {
      cleand <- .cleanHashData(df = df, keepCols = keepCols,
                               dropCols = dropCols, imputed = imputed)
    }
  }

  df <- cleand$data
  md5 <- cleand$md5
  rm(cleand)

  ## if filename is missing and interactive is TRUE
  ## interactively (through GUI or console)
  ## request filename from user
  if (missing(filename) && interactive) {
    filename <- file.choose()
  }

  ## if filename is still missing at this point, throw an error
  stopifnot(!missing(filename))

  origfilename <- filename

  ## impfilename <- gsub("\\.dat$", "_implist.dat", filename)
  impfilename <- filename

  if (isTRUE(imputed) && isTRUE(hashfilename)) {
    tmp <- lapply(1:length(md5), function(i) {
      .hashifyFile(filename, md5[[i]],
                   useexisting = identical(writeData, "ifmissing"))
    })
    filename <- unlist(lapply(tmp, function(x) x$filename))

    allfilesexist <- all(vapply(tmp, function(x) x$fileexists, FUN.VALUE = NA))
  } else if (isTRUE(imputed) && isFALSE(hashfilename)) {
    filename.base <- gsub("\\.dat", "", filename)
    filename <- unlist(lapply(1:length(df), function(i) {
      paste0(filename.base, "_imp_", i, ".dat")
    }))
  } else {
    tmp <- .hashifyFile(filename, md5,
                        useexisting = identical(writeData, "ifmissing"))
    allfilesexist <- tmp$fileexists
    if (isTRUE(hashfilename)) {
      filename <- tmp$filename
    }
  }

  if (isTRUE(imputed)) {
    message("writing implist to ", impfilename)
    cat(filename, file = impfilename, sep = "\n")
  }

  if (identical(writeData, "ifmissing") && isTRUE(allfilesexist)) {
    message(sprintf("File(s) with md5 hash matching data found, using \n%s",
                    paste(filename, collapse = "\n")))
  } else {
    ## even if writeData = 'ifmissing' if the data are missing, need to write out
    writeData <- "always"
  }

  if (identical(writeData, "always")) {
    ## convert factors to numbers
    if (isTRUE(imputed)) {
      df <- lapply(1:length(df), function(i) {
        if (i == 1) {
          .convertData(df[[i]])
        } else {
          suppressMessages(.convertData(df[[i]]))
        }
      })
    } else {
      df <- .convertData(df)
    }

    if (any(vapply(filename, file.exists, FUN.VALUE = NA))) {
      if (isTRUE(overwrite)) {
        message(paste("The file(s)\n", sQuote(
          paste(vapply(filename[vapply(filename, file.exists, FUN.VALUE = NA)], basename,
                       FUN.VALUE = NA_character_), collapse = ";\n")),
          "\ncurrently exist(s) and will be overwritten"))
      } else {
        stop(paste("The file(s)\n", sQuote(
          paste(vapply(filename[vapply(filename, file.exists, FUN.VALUE = NA)], basename,
                       FUN.VALUE = NA_character_), collapse = ";\n")),
          "\ncurrently exist(s). Specify a different filename or set overwrite=TRUE"))
      }
    }

    if (isTRUE(imputed)) {
      junk <- lapply(1:length(df), function(i) {
        fwrite(df[[i]], filename[[i]], sep = "\t",
               col.names = FALSE, row.names = FALSE, na=".")
      })
    } else {
      fwrite(df, filename, sep = "\t",
             col.names = FALSE, row.names = FALSE, na=".")
    }
  }

  if (isTRUE(imputed)) {
    syntax <- c(
      "TITLE: Your title goes here\n",
      DATA <- paste0("DATA: FILE = \"", impfilename, "\";\n", "TYPE = IMPUTATION;\n"),
      "VARIABLE: \n", createVarSyntax(df[[1]]), "MISSING=.;\n")

  } else {
    syntax <- c(
      "TITLE: Your title goes here\n",
      DATA <- paste0("DATA: FILE = \"", filename, "\";\n"),
      "VARIABLE: \n", createVarSyntax(df), "MISSING=.;\n")
  }

  # if inpfile is a logical value and is TRUE
  # then create the file using filename
  # changing the extension to .inp
  if (is.logical(inpfile) && inpfile) {
    inpfile <- gsub("(.*)\\..*$", "\\1.inp", origfilename)
  }

  # if the input file is not a character
  # either by user specification or automatically
  # by replacing extension of filename with .inp
  # then just use stdout
  if (isFALSE(is.character(inpfile))) {
    inpfile <- stdout()
  }

  if (isTRUE(is.character(inpfile)) && isTRUE(file.exists(inpfile))) {
    if (overwrite) {
      message(paste("The file", sQuote(basename(inpfile)),
                    "currently exists and will be overwritten"))
    } else {
      stop(paste("The file", sQuote(basename(inpfile)),
                 "currently exists. Specify a different filename or set overwrite=TRUE"))
    }
  }

  # write out syntax, either to stdout or to a file
  cat(syntax, file=inpfile, sep="")

  # return syntax invisibly for later use/reuse
  return(invisible(syntax))
}
enaY15/TabulationAutomation documentation built on March 18, 2020, 8:35 p.m.