psindex: Parameter Stability Index

# this code is written by Michael Hallquist
#First draft of parser to convert Mplus model syntax to psindex model syntax

#idea: build parTable and run model from mplus syntax
#then perhaps write export function: parTable2Mplus
#and/or parTable2psindex

trimSpace <- function(string) {
  stringTrim <- sapply(string, function(x) {
        x <- sub("^\\s*", "", x, perl=TRUE)
        x <- sub("\\s*$","", x, perl=TRUE)
        return(x)
      }, USE.NAMES=FALSE)
  return(stringTrim)
}

#small utility function to join strings in a regexp loop
joinRegexExpand <- function(cmd, argExpand, matches, iterator, matchLength="match.length") {
  if (iterator == 1 && matches[iterator] > 1) {
    pre <- substr(cmd, 1, matches[iterator] - 1)
  } else pre <- ""
  
  #if this is not the final match, then get sub-string between the end of this match and the beginning of the next
  #otherwise, match to the end of the command
  post.end <- ifelse(iterator < length(matches), matches[iterator+1] - 1, nchar(cmd))
  post <- substr(cmd, matches[iterator] + attr(matches, matchLength)[iterator], post.end)
  
  cmd.expand <- paste(pre, argExpand, post, sep="")
  return(cmd.expand)
}

#expand Mplus hyphen syntax (will also expand constraints with hyphens)
expandCmd <- function(cmd, alphaStart=TRUE) {
  #use negative lookahead and negative lookbehind to eliminate possibility of hyphen being used as a negative starting value (e.g., x*-1)
  #also avoid match of anything that includes a decimal point, such as a floating-point starting value -10.5*x1
  
  #if alphaStart==TRUE, then require that the matches before and after hyphens begin with alpha character
  #this is used for variable names, whereas the more generic expansion works for numeric constraints and such
  
  #need to do a better job of this so that u1-u20* is supported... I don't think the regexp below is general enough
  
  #if (alphaStart) {
  #  hyphens <- gregexpr("[_A-Za-z]+\\w*\\s*-\\s*[_A-Za-z]+\\w*", cmd, perl=TRUE)[[1]]  
  #} else {
  #  hyphens <- gregexpr("(?!<(\\*|\\.))\\w+(?!(\\*|\\.))\\s*-\\s*(?!<(\\*|\\.))\\w+(?!(\\*|\\.))", cmd, perl=TRUE)[[1]]
  #}

  hyphens <- gregexpr("(?!<(\\*|\\.))\\w+(?!(\\*|\\.))\\s*-\\s*(?!<(\\*|\\.))\\w+(?!(\\*|\\.))", cmd, perl=TRUE)[[1]]
    
  if (hyphens[1L] > 0) {
    
    cmd.expand <- c()
    ep <- 1
    
    for (v in 1:length(hyphens)) {
      #match one keyword before and after hyphen
      argsplit <- strsplit(substr(cmd, hyphens[v], hyphens[v] + attr(hyphens, "match.length")[v] - 1), "\\s*-\\s*", perl=TRUE)[[1]]
      
      v_pre <- argsplit[1]
      v_post <- argsplit[2]
      
      #the basic positive lookbehind blows up with pure numeric constraints (1 - 3) because no alpha char precedes digit
      #can use an non-capturing alternation grouping to allow for digits only or the final digits after alphas (as in v_post.num)
      v_pre.num <- as.integer(sub("\\w*(?<=[A-Za-z_])(\\d+)$", "\\1", v_pre, perl=TRUE)) #use positive lookbehind to avoid greedy \w+ match -- capture all digits
      
      v_post.match <- regexpr("^(?:\\w*(?<=[A-Za-z_])(\\d+)|(\\d+))$", v_post, perl=TRUE)
      stopifnot(v_post.match[1L] > 0)
      
      #match mat be under capture[1] or capture[2] because of alternation above
      whichCapture <- which(attr(v_post.match, "capture.start") > 0)
      
      v_post.num <- as.integer(substr(v_post, attr(v_post.match, "capture.start")[whichCapture], attr(v_post.match, "capture.start")[whichCapture] + attr(v_post.match, "capture.length")[whichCapture] - 1))
      v_post.prefix <- substr(v_post, 1, attr(v_post.match, "capture.start")[whichCapture] - 1) #just trusting that pre and post match
      
      if (is.na(v_pre.num) || is.na(v_post.num)) stop("Cannot expand variables: ", v_pre, ", ", v_post)
      v_expand <- paste(v_post.prefix, v_pre.num:v_post.num, sep="", collapse=" ")
      
      #for first hyphen, there may be non-hyphenated syntax preceding the initial match
      cmd.expand[ep] <- joinRegexExpand(cmd, v_expand, hyphens, v) 
      ep <- ep + 1
      
    }
    return(paste(cmd.expand, collapse=""))
  } else {
    return(cmd) #no hyphens to expand
  }
}


#handle starting values and fixed parameters on rhs
parseFixStart <- function(cmd) {
  cmd.parse <- c()
  ep <- 1L
  if ((fixed.starts <- gregexpr("[\\w\\.-]+\\s*([@*])\\s*[\\w\\.-]+", cmd, perl=TRUE)[[1]])[1L] > 0) { #shouldn't it be \\*, not * ?! Come back to this.
    for (f in 1:length(fixed.starts)) {
      
      #capture above obtains the fixed/start character (@ or *), whereas match obtains the full regex match
      opchar <- substr(cmd, attr(fixed.starts, "capture.start")[f], attr(fixed.starts, "capture.start")[f] + attr(fixed.starts, "capture.length")[f] - 1)
      
      #match arguments around asterisk/at symbol
      argsplit <- strsplit(substr(cmd, fixed.starts[f], fixed.starts[f] + attr(fixed.starts, "match.length")[f] - 1), paste0("\\s*", ifelse(opchar=="*", "\\*", opchar), "\\s*"), perl=TRUE)[[1]]
      v_pre <- argsplit[1]
      v_post <- argsplit[2]
      
      if (suppressWarnings(is.na(as.numeric(v_pre)))) { #fixed.starts value post-multiplier
        var <- v_pre 
        val <- v_post        
      } else if (suppressWarnings(is.na(as.numeric(v_post)))) { #starting value pre-multiplier
        var <- v_post
        val <- v_pre
      } else stop("Cannot parse Mplus fixed/starts values specification: ", v_pre, v_post)
      
      if (opchar == "@") {
        cmd.parse[ep] <- joinRegexExpand(cmd, paste0(val, "*", var, sep=""), fixed.starts, f)
        ep <- ep + 1L
      } else {
        cmd.parse[ep] <- joinRegexExpand(cmd, paste0("start(", val, ")*", var, sep=""), fixed.starts, f)
        ep <- ep + 1L
      }
      
    }
    return(paste(cmd.parse, collapse=""))  
    
  } else {
    return(cmd)
  }
  
}

parseConstraints <- function(cmd) {
  #Allow cmd to have newlines embedded. In this case, split on newlines, and loop over and parse each chunk
  #Dump leading and trailing newlines, which contain no information about constraints, but may add dummy elements to vector after strsplit
  #Maybe return LHS and RHS parsed command where constraints only appear on the RHS, whereas the LHS contains only parameters.
  #Example: LHS is v1 v2 v3 and RHS is con1*v1 con2*v2 con3*v3
  
  cmd.split <- strsplit(cmd, "\n")[[1]]
  
  #drop empty lines (especially leading newline)
  cmd.split <- if(length(emptyPos <- which(cmd.split == "")) > 0L) {cmd.split[-1*emptyPos]} else {cmd.split}
  
  #Create a version of the command with no constraint specifications.
  #This is useful for constraint specs that use the params on the LHS and RHS. Example: v1 ~~ conB*v1 
  cmd.noconstraints <- paste0(gsub("\\s*\\([^\\)]+\\)\\s*", "", cmd.split, perl=TRUE), collapse=" ")
  
  cmd.tojoin <- c() #will store all chunks divided by newlines, which will be joined at the end.
  
  #iterate over each newline segment
  for (n in 1:length(cmd.split)) {
    #in principle, now that we respect newlines, parens should only be of length 1, since Mplus syntax dictates newlines for each use of parentheses for constraints
    if ((parens <- gregexpr("(?<!start)\\(([^\\)]+)\\)", cmd.split[n], perl=TRUE)[[1L]])[1L] > 0) { #match parentheses, but not start()
      #the syntax chunk after all parentheses have been matched 
      cmd.expand <- c()
      
      for (p in 1:length(parens)) {
        #string within the constraint parentheses
        constraints <- substr(cmd.split[n], attr(parens, "capture.start")[p], attr(parens, "capture.start")[p] + attr(parens, "capture.length")[p] - 1)
        
        #Divide constraints on spaces to determine number of constraints to parse. Use trimSpace to avoid problem of user including leading/trailing spaces within parentheses.
        con.split <- strsplit(trimSpace(constraints), "\\s+", perl=TRUE)[[1]]
        
        #if Mplus uses a purely numeric constraint, then add ".con" prefix to be consistent with R naming.
        con.split <- sapply(con.split, function(x) {
              if (! suppressWarnings(is.na(as.numeric(x)))) {
                make.names(paste0(".con", x))
              } else { x }
            })
        
        #determine the parameters that precede the parentheses (either first character for p == 1 or character after preceding parentheses)
        prestrStart <- ifelse(p > 1, attr(parens, "capture.start")[p-1] + attr(parens, "capture.length")[p-1] + 1, 1)
        
        #obtain the parameters that precede the parentheses, divide into arguments on spaces
        #use trimSpace here because first char after prestrStart for p > 1 will probably be a space
        precmd.split <- strsplit(trimSpace(substr(cmd.split[n], prestrStart, parens[p] - 1)), "\\s+", perl=TRUE)[[1]]
        
        #peel off any potential LHS arguments, such as F1 BY
        precmdLHSOp <- which(tolower(precmd.split) %in% c("by", "with", "on"))
        if (any(precmdLHSOp)) {
          lhsop <- paste0(precmd.split[1:precmdLHSOp[1L]], " ", collapse=" ") #join lhs and op as a single string, add trailing space so that paste with expanded RHS is right.
          rhs <- precmd.split[(precmdLHSOp+1):length(precmd.split)]
        } else {
          lhsop <- ""
          rhs <- precmd.split 
        }
        
        if (length(con.split) > 1L) {
          #several constraints listed within parentheses. Example: F1 BY X1 X2 X3 X4 (C2 C3 C4)
          #thus, backwards match the constraints to parameters
          
          #restrict parameters to backwards match to be of the same length as number of constraints 
          rhs.backmatch <- rhs[(length(rhs)-length(con.split)+1):length(rhs)]
          
          rhs.expand <- c()
          
          #check that no mean or scale markers are part of the rhs param to expand
          if ((preMark.match <- regexpr("^\\s*[\\[\\{]", rhs.backmatch[1L], perl=TRUE))[1L] > 0) {
            preMark <- substr(rhs.backmatch[1L], preMark.match[1L], preMark.match[1L] + attr(preMark.match, "match.length")[1L] - 1)
            rhs.backmatch[1L] <- substr(rhs.backmatch[1L], preMark.match[1L] + attr(preMark.match, "match.length")[1L], nchar(rhs.backmatch[1L]))
          } else { preMark <- "" }
          
          if ((postMark.match <- regexpr("[\\]\\}]\\s*$", rhs.backmatch[length(rhs.backmatch)], perl=TRUE))[1L] > 0) {
            postMark <- substr(rhs.backmatch[length(rhs.backmatch)], postMark.match[1L], nchar(rhs.backmatch[length(rhs.backmatch)]))
            rhs.backmatch[length(rhs.backmatch)] <- substr(rhs.backmatch[length(rhs.backmatch)], 1, postMark.match[1L] - 1)
          } else { postMark <- "" }
          
          
          #pre-multiply each parameter with each corresponding constraint
          for (i in 1:length(rhs.backmatch)) {
            rhs.expand[i] <- paste0(con.split[i], "*", rhs.backmatch[i])            
          }
          
          #join rhs as string and add back in mean/scale operator, if present
          rhs.expand <- paste0(preMark, paste(rhs.expand, collapse=" "), postMark)
          
          #if there were params that preceded the backwards match, then add these back to the syntax
          #append this syntax to the parsed command, cmd.expand
          if (length(rhs) - length(con.split) > 0L) {
            cmd.expand <- c(cmd.expand, paste(lhsop, paste(rhs[1:(length(rhs)-length(con.split))], collapse=" "), rhs.expand))
          } else {
            cmd.expand <- c(cmd.expand, paste0(lhsop, rhs.expand))
          }
        } else {
          #should be able to reduce redundancy with above
          
          #all parameters on the right hand side are to be equated
          #thus, pre-multiply each parameter by the constraint
          
          #check that no mean or scale markers are part of the rhs param to expand
          #DUPE CODE FROM ABOVE. Make Function?!
          if ((preMark.match <- regexpr("^\\s*[\\[\\{]", rhs[1L], perl=TRUE))[1L] > 0) {
            preMark <- substr(rhs[1L], preMark.match[1L], preMark.match[1L] + attr(preMark.match, "match.length")[1L] - 1)
            rhs[1L] <- substr(rhs[1L], preMark.match[1L] + attr(preMark.match, "match.length")[1L], nchar(rhs[1L]))
          } else { preMark <- "" }
          
          if ((postMark.match <- regexpr("[\\]\\}]\\s*$", rhs[length(rhs)], perl=TRUE))[1L] > 0) {
            postMark <- substr(rhs[length(rhs)], postMark.match[1L], nchar(rhs[length(rhs)]))
            rhs[length(rhs)] <- substr(rhs[length(rhs)], 1, postMark.match[1L] - 1)
          } else { postMark <- "" }
          
          
          rhs.expand <- c()
          for (i in 1:length(rhs)) {
            rhs.expand[i] <- paste0(con.split[1L], "*", rhs[i])
          }
          
          #join rhs as string
          rhs.expand <- paste0(preMark, paste(rhs.expand, collapse=" "), postMark)
          
          cmd.expand <- c(cmd.expand, paste0(lhsop, rhs.expand))
          
        }
      }
      
      cmd.tojoin[n] <- paste(cmd.expand, collapse=" ")
    } else { cmd.tojoin[n] <- cmd.split[n] } #no parens
  }
  
  #eliminate newlines in this function so that they don't mess up \\s+ splits downstream
  toReturn <- paste(cmd.tojoin, collapse=" ")
  attr(toReturn, "noConstraints") <- cmd.noconstraints
  
  return(toReturn)
  
}

expandGrowthCmd <- function(cmd) {
  #can assume that any spaces between tscore and variable were stripped by parseFixStart
  
  #verify that this is not a random slope
  if (any(tolower(strsplit(cmd, "\\s+", perl=TRUE)[[1]]) %in% c("on", "at"))) {
    stop("psindex does not support random slopes or individually varying growth model time scores")
  }
  
  cmd.split <- strsplit(cmd, "\\s*\\|\\s*", perl=TRUE)[[1]]
  if (!length(cmd.split) == 2) stop("Unknown growth syntax: ", cmd)
  
  lhs <- cmd.split[1]
  lhs.split <- strsplit(lhs, "\\s+", perl=TRUE)[[1]]
  
  rhs <- cmd.split[2]
  rhs.split <- strsplit(rhs, "(\\*|\\s+)", perl=TRUE)[[1]]
  
  if (length(rhs.split) %% 2 != 0) stop("Number of variables and number of tscores does not match: ", rhs)
  tscores <- as.numeric(rhs.split[1:length(rhs.split) %% 2 != 0]) #pre-multipliers
  
  vars <- rhs.split[1:length(rhs.split) %% 2 == 0]
  
  cmd.expand <- c()
  
  for (p in 0:(length(lhs.split)-1)) {
    if (p == 0) {
      #intercept
      cmd.expand <- c(cmd.expand, paste(lhs.split[(p+1)], "=~", paste("1*", vars, sep="", collapse=" + ")))
    } else {
      cmd.expand <- c(cmd.expand, paste(lhs.split[(p+1)], "=~", paste(tscores^p, "*", vars, sep="", collapse=" + ")))
    }
  }
  
  return(cmd.expand)
  
}

#function to wrap long lines at a certain width, splitting on + symbols to be consistent with R syntax
wrapAfterPlus <- function(cmd, width=90, exdent=5) {
  result <- lapply(cmd, function(line) {
        if (nchar(line) > width) {
          split <- c()
          spos <- 1L
          
          plusMatch <- gregexpr("+", line, fixed=TRUE)[[1]]
          mpos <- 1L
          
          if (plusMatch[1L] > 0L) {
            #split after plus symbol
            charsRemain <- nchar(line)
            while(charsRemain > 0L) {
              toProcess <- substr(line, nchar(line) - charsRemain + 1, nchar(line))
              offset <- nchar(line) - charsRemain + 1
              
              if (nchar(remainder <- substr(line, offset, nchar(line))) <= (width - exdent)) {
                #remainder of line fits within width -- no need to continue wrapping
                split[spos] <- remainder
                charsRemain <- 0
              } else {
                
                wrapAt <- which(plusMatch < (width + offset - exdent))
                wrapAt <- wrapAt[length(wrapAt)] #at the final +
                
                split[spos] <- substr(line, offset, plusMatch[wrapAt])
                charsRemain <- charsRemain - nchar(split[spos])
                spos <- spos + 1
              }
            }
            
            #remove leading and trailing chars
            split <- trimSpace(split)
            
            #handle exdent
            split <- sapply(1:length(split), function(x) {
                  if (x > 1) paste0(paste(rep(" ", exdent), collapse=""), split[x])
                  else split[x]
                })
            
            return(split)
          } else {
            return(strwrap(line, width=width, exdent=exdent)) #convention strwrap when no + present
          }         
        } else {
          return(line)
        }
      })
  
  #bind together multi-line expansions into single vector
  return(unname(do.call(c, result)))
}

mplus2psindex.constraintSyntax <- function(syntax) {
  #should probably pass in model syntax along with some tracking of which parameter labels are defined.
  
  #convert MODEL CONSTRAINT section to psindex model syntax
  syntax <- paste(lapply(trimSpace(strsplit(syntax, "\n")), function(x) { if (length(x) == 0L && is.character(x)) "" else x}), collapse="\n")
  
  #replace ! with # for comment lines. Also strip newline and replace with semicolon 
  syntax <- gsub("(\\s*)!(.+)\n", "\\1#\\2;", syntax, perl=TRUE)
  
  #split into vector of strings 
  #need to peel off leading or trailing newlines -- leads to parsing confusion downstream otherwise
  syntax.split <- gsub("(^\n|\n$)", "", unlist( strsplit(syntax, ";") ), perl=TRUE) 
  
  constraint.out <- c()
  
  #TODO: Handle PLOT and LOOP syntax for model constraints.
  #TODO: Handle DO loop convention
  
  #first parse new parameters defined in MODEL CONSTRAINT into a vector
  new.parameters <- c() #parameters that are defined in constraint section
  if (length(new.con.lines <- grep("^\\s*NEW\\s*\\([^\\)]+\\)", syntax.split, perl=TRUE, ignore.case=TRUE)) > 0L) {
    for (cmd in syntax.split[new.con.lines]) {
      #process new constraint definition
      new.con <- regexpr("^\\s*NEW\\s*\\(([^\\)]+)\\)", cmd, perl=TRUE, ignore.case=TRUE)
      if (new.con[1L] == -1) stop("Unable to parse names of new contraints")
      new.con <- substr(cmd, attr(new.con, "capture.start"), attr(new.con, "capture.start") + attr(new.con, "capture.length") - 1L)
      new.con <- expandCmd(new.con) #allow for hyphen expansion
      new.parameters <- c(new.parameters, strsplit(trimSpace(new.con), "\\s+", perl=TRUE)[[1L]])
    }
    
    syntax.split <- syntax.split[-1L * new.con.lines] #drop out these lines
    parameters.undefined <- new.parameters #to be used below to handle ambiguity of equation versus definition
    
  }

  for (cmd in syntax.split) {
    if (grepl("^\\s*#", cmd, perl=TRUE)) { #comment line
      constraint.out <- c(constraint.out , gsub("\n", "", cmd, fixed=TRUE)) #drop any newlines
    } else if (grepl("^\\s+$", cmd, perl=TRUE)) {
      #do nothing, just a space line
    } else {
      #constraint proper
      cmd <- gsub("**", "^", cmd, fixed=TRUE) #handle exponent
      
      #lower case the math operations supported by Mplus to be consistent with R
      #match all math operators, then lower case each and rejoin string
      maths <- gregexpr("(SQRT|LOG|LOG10|EXP|ABS|SIN|COS|TAN|ASIN|ACOS|ATAN)\\s*\\(", cmd, perl=TRUE)[[1L]]
      if (maths[1L] > 0) {
        maths.replace <- c()
        ep <- 1
        
        for (i in 1:length(maths)) {
          operator <- tolower(substr(cmd, attr(maths, "capture.start")[i], attr(maths, "capture.start")[i] + attr(maths, "capture.length")[i] - 1))
          maths.replace[ep] <- joinRegexExpand(cmd, operator, maths, i, matchLength="capture.length") #only match operator, not opening (
          ep <- ep + 1
        }
        cmd <- paste(maths.replace, collapse="")
      }
      
      #equating some lhs and rhs: could reflect definition of new parameter
      if ((equals <- regexpr("=", cmd, fixed=TRUE))[1L] > 0) {
        lhs <- trimSpace(substr(cmd, 1, equals - 1))
        rhs <- trimSpace(substr(cmd, equals + attr(equals, "match.length"), nchar(cmd)))
                
        #possibility of lhs or rhs containing the single variable to be equated
        if (regexpr("\\s+", lhs, perl=TRUE)[1L] > 0L) {
          def <- rhs
          body <- lhs
        } else if (regexpr("\\s+", rhs, perl=TRUE)[1L] > 0L) {
          def <- lhs
          body <- rhs
        } else { 
          #warning("Can't figure out which side of constraint defines a parameter")
          #this would occur for simple rel5 = rel2 sort of syntax
          def <- lhs
          body <- rhs
        }
        
        #must decide whether this is a new parameter (:=) or equation of exising labels (==)
        #alternatively, could be zero, as in  0 = x + y
        #this is tricky, because mplus doesn't differentiate definition from equation
        #consequently, could confuse the issue as in ex5.20
        #NEW(rel2 rel5 stan3 stan6);
        #rel2 = lam2**2*vf1/(lam2**2*vf1 + ve2); 
        #rel5 = lam5**2*vf2/(lam5**2*vf2 + ve5); 
        #rel5 = rel2;
  
        #for now, only define a new constraint if it's not already defined
        #otherwise equate
        if (def %in% new.parameters && def %in% parameters.undefined) {
          constraint.out <- c(constraint.out, paste(def, ":=", body))
          parameters.undefined <- parameters.undefined[!parameters.undefined==def]
        } else {
          constraint.out <- c(constraint.out,  paste(def, "==", body))
        }
      
      } else {
        #inequality constraints -- paste as is
        constraint.out <- c(constraint.out, cmd)
      }
      
    }
    
  }
  
  wrap <- paste(wrapAfterPlus(constraint.out, width=90, exdent=5), collapse="\n")
  return(wrap)
  
}

mplus2psindex.modelSyntax <- function(syntax) {
  #initial strip of leading/trailing whitespace, which can interfere with splitting on spaces 
  #strsplit generates character(0) for empty strings, which causes problems in paste because paste actually includes it as a literal
  #example: paste(list(character(0), "asdf", character(0)), collapse=" ")
  #thus, use lapply to convert these to empty strings first
  syntax <- paste(lapply(trimSpace(strsplit(syntax, "\n")), function(x) { if (length(x) == 0L && is.character(x)) "" else x}), collapse="\n")
  
  #replace ! with # for comment lines. Also strip newline and replace with semicolon 
  syntax <- gsub("(\\s*)!(.+)\n*", "\\1#\\2;", syntax, perl=TRUE)
  
  #new direction: retain newlines in parsed syntax until after constraints have been parsed
  
  #delete newlines
  #syntax <- gsub("\n", "", syntax, fixed=TRUE)
  
  # replace semicolons with newlines prior to split (divide into commands)
  #syntax <- gsub(";", "\n", syntax, fixed=TRUE)
  
  #split into vector of strings 
  #syntax.split <- unlist( strsplit(syntax, "\n") )
  syntax.split <- trimSpace(unlist( strsplit(syntax, ";") ))
  
  #format of parTable to mimic.
# 'data.frame':	34 obs. of  12 variables:
#  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
#  $ lhs   : chr  "ind60" "ind60" "ind60" "dem60" ...
#  $ op    : chr  "=~" "=~" "=~" "=~" ...
#  $ rhs   : chr  "x1" "x2" "x3" "y1" ...
#  $ user  : int  1 1 1 1 1 1 1 1 1 1 ...
#  $ group : int  1 1 1 1 1 1 1 1 1 1 ...
#  $ free  : int  0 1 2 0 3 4 5 0 6 7 ...
#  $ ustart: num  1 NA NA 1 NA NA NA 1 NA NA ...
#  $ exo   : int  0 0 0 0 0 0 0 0 0 0 ...
#  $ label : chr  "" "" "" "" ...
#  $ eq.id : int  0 0 0 0 0 0 0 0 0 0 ...
#  $ unco  : int  0 1 2 0 3 4 5 0 6 7 ...
  
  #vector of psindex syntax
  psindex.out <- c()
  
  for (cmd in syntax.split) {
    if (grepl("^\\s*#", cmd, perl=TRUE)) { #comment line
      psindex.out <- c(psindex.out, gsub("\n", "", cmd, fixed=TRUE)) #drop any newlines (otherwise done by parseConstraints)
    } else if (grepl("^\\s*$", cmd, perl=TRUE)) {
      #do nothing, just a space or blank line
    } else {
      #hyphen expansion
      cmd <- expandCmd(cmd)
      
      #blow up on growth syntax for now
#      if (grepl("|", cmd, fixed=TRUE)) stop("Growth modeling syntax using | not supported at the moment.")
      
      #parse fixed parameters and starting values
      cmd <- parseFixStart(cmd)
      
      #parse any constraints here (avoid weird logic below)
      cmd <- parseConstraints(cmd)

      if ((op <- regexpr("\\s+(by|on|with|pwith)\\s+", cmd, ignore.case=TRUE, perl=TRUE))[1L] > 0) { #regressions, factors, covariances
        
        lhs <- substr(cmd, 1, op - 1) #using op takes match.start which will omit spaces before operator
        rhs <- substr(cmd, op + attr(op, "match.length"), nchar(cmd))
        operator <- tolower(substr(cmd, attr(op, "capture.start"), attr(op, "capture.start") + attr(op, "capture.length") - 1))
        
        if (operator == "by") { lav.operator <- "=~"
        } else if (operator == "with" || operator == "pwith") { lav.operator <- "~~"
        } else if (operator == "on") { lav.operator <- "~" }
        
        #handle parameter combinations
        lhs.split <- strsplit(lhs, "\\s+")[[1]] #trimSpace(
        
        #handle pwith syntax
        if (operator == "pwith") {
          #TODO: Figure out if pwith can be paired with constraints?
          
          rhs.split <- strsplit(rhs, "\\s+")[[1]] #trimSpace(
          if (length(lhs.split) != length(rhs.split)) { browser(); stop("PWITH command does not have the same number of arguments on the left and right sides.")}
          
          cmd <- sapply(1:length(lhs.split), function(i) paste(lhs.split[i], lav.operator, rhs.split[i])) 
        } else {
          
          #insert plus signs on the rhs
          rhs <- gsub("\\s+", " + ", rhs, perl=TRUE) 
          
          if (length(lhs.split) > 1L) {
            #expand using possible combinations
            cmd <- sapply(lhs.split, function(larg) {
                  pair <- paste(larg, lav.operator, rhs)
                  return(pair)
                })
          } else {
            cmd <- paste(lhs, lav.operator, rhs)
          }
          
        }
      } else if ((means.scales <- regexpr("^\\s*([\\[\\{])([^\\]\\}]+)[\\]\\}]\\s*$", cmd, ignore.case=TRUE, perl=TRUE))[1L] > 0) { #intercepts/means or scales
        #first capture is the operator: [ or {
        operator <- substr(cmd, attr(means.scales, "capture.start")[1L], attr(means.scales, "capture.start")[1L] + attr(means.scales, "capture.length")[1L] - 1)
        
        params <- substr(cmd, attr(means.scales, "capture.start")[2L], attr(means.scales, "capture.start")[2L] + attr(means.scales, "capture.length")[2L] - 1)
        
        #obtain parameters with no constraint specification for LHS
        params.noconstraints <- sub("^\\s*[\\[\\{]([^\\]\\}]+)[\\]\\}]\\s*$", "\\1", attr(cmd, "noConstraints"), perl=TRUE)
        
        means.scales.split <- strsplit(params, "\\s+")[[1]] #trimSpace(
        means.scales.noConstraints.split <- strsplit(params.noconstraints, "\\s+")[[1]] #trimSpace(
        
        if (operator == "[") {
          #Tricky syntax shift (and corresponding kludge). For means, need to put constraint on RHS as pre-multiplier of 1 (e.g., x1 ~ 5*1).
          #But parseConstraints returns constraints multiplied by parameters    
          cmd <- sapply(means.scales.split, function(v) {
                #shift pre-multiplier
                if ((premult <- regexpr("([^\\*]+)\\*([^\\*]+)", v, perl=TRUE))[1L] > 0) {
                  modifier <- substr(v, attr(premult, "capture.start")[1L], attr(premult, "capture.start")[1L] + attr(premult, "capture.length")[1L] - 1)
                  paramName <- substr(v, attr(premult, "capture.start")[2L], attr(premult, "capture.start")[2L] + attr(premult, "capture.length")[2L] - 1)
                  paste0(paramName, " ~ ", modifier, "*1")
                } else {
                  paste(v, "~ 1")
                }
              })
          
        } else if (operator == "{"){
          #only include constraints on RHS
          cmd <- sapply(1:length(means.scales.split), function(v) paste(means.scales.noConstraints.split[v], "~*~", means.scales.split[v]))
        } else { stop("What's the operator?!") }
      } else if (grepl("|", cmd, fixed=TRUE)) {
        #expand growth modeling language
        cmd <- expandGrowthCmd(cmd)
      } else { #no operator, no means, must be variance.
        #cat("assuming vars: ", cmd, "\n")
        vars.lhs <- strsplit(attr(cmd, "noConstraints"), "\\s+")[[1]] #trimSpace(
        vars.rhs <- strsplit(cmd, "\\s+")[[1]] #trimSpace(
        
        cmd <- sapply(1:length(vars.lhs), function(v) paste(vars.lhs[v], "~~", vars.rhs[v]))
      }
      
      #handle threshold substitution: $ -> |
      cmd <- gsub("$", "|", cmd, fixed=TRUE)
      
      psindex.out <- c(psindex.out, cmd)
      
    }
  }
  
  #for now, include a final trimSpace call since some arguments have leading/trailing space stripped.
  wrap <- paste(wrapAfterPlus(psindex.out, width=90, exdent=5), collapse="\n") #trimSpace(
  return(wrap)
  
}

mplus2psindex <- function(inpfile, run=TRUE) {
  stopifnot(length(inpfile) == 1L)
  stopifnot(grepl("\\.inp$", inpfile))
  if (!file.exists(inpfile)) { stop("Could not find file: ", inpfile) }
  
  #for future consideration. For now, require a .inp file
#  if (length(inpfile) == 1L && grepl("\\.inp$", inpfile)) {
#    if (!file.exists(inpfile)) { stop("Could not find file: ", inpfile) }
#    inpfile.text <- scan(inpfile, what="character", sep="\n", strip.white=FALSE, blank.lines.skip=FALSE, quiet=TRUE)
#  } else {
#    #assume that inpfile itself is syntax (e.g., in a character vector)
#    inpfile.text <- inpfile
#  }
  
  inpfile.text <- scan(inpfile, what="character", sep="\n", strip.white=FALSE, blank.lines.skip=FALSE, quiet=TRUE)
  sections <- divideInputIntoSections(inpfile.text, inpfile)
  
  mplus.inp <- list()
  
  mplus.inp$title <- trimSpace(paste(sections$title, collapse=" "))
  mplus.inp$data <- divideIntoFields(sections$data, required="file")
  mplus.inp$variable <- divideIntoFields(sections$variable, required="names")
  mplus.inp$analysis <- divideIntoFields(sections$analysis)
  
  meanstructure <- "default" #psindex default
  if(!is.null(mplus.inp$analysis$model)) {
    if (tolower(mplus.inp$analysis$model) == "nomeanstructure") { meanstructure=FALSE } #explicitly disable mean structure
  }
  
  information <- "default" #psindex default
  if(!is.null(mplus.inp$analysis$information)) {
    information <- tolower(mplus.inp$analysis$information)
  }
  
  
  estimator <- "default"
  if (!is.null(est <- mplus.inp$analysis$estimator)) {
    #no memory of what this is up to....
    if (toupper(est) == "MUML") warning("Mplus does not support MUML estimator. Using default instead.")
    estimator <- est
    
    #march 2013: handle case where categorical data are specified, but ML-based estimator requested.
    #use WLSMV instead
    if (!is.null(mplus.inp$variable$categorical) && toupper(substr(mplus.inp$analysis$estimator, 1, 2)) == "ML") {
      warning("Lavaan does not yet support ML-based estimation for categorical data. Reverting to WLSMV")
      estimator <- "WLSMV"
    }
  }
  
  #expand hyphens in variable names and split into vector that will be the names for read.table
  mplus.inp$variable$names <- strsplit(expandCmd(mplus.inp$variable$names), "\\s+", perl=TRUE)[[1]]
  
  #expand hyphens in categorical declaration
  if (!is.null(mplus.inp$variable$categorical)) mplus.inp$variable$categorical <- strsplit(expandCmd(mplus.inp$variable$categorical), "\\s+", perl=TRUE)[[1]]
  
  #convert mplus syntax to psindex syntax
  mplus.inp$model <- mplus2psindex.modelSyntax(sections$model)
  
  #handle model constraint
  if ("model.constraint" %in% names(sections)) {
    mplus.inp$model.constraint <- mplus2psindex.constraintSyntax(sections$model.constraint)
    mplus.inp$model <- paste(mplus.inp$model, mplus.inp$model.constraint, sep="\n")
  }
  
  #read mplus data (and handle missing spec)
  mplus.inp$data <- readMplusInputData(mplus.inp, inpfile)
  
  #handle bootstrapping specification
  se="default"
  bootstrap <- 1000L
  test <- "default"
  if (!is.null(mplus.inp$analysis$bootstrap)) {
    boot.type <- "standard"
    #check whether standard versus residual bootstrap is specified
    if ((boot.match <- regexpr("\\((\\w+)\\)", mplus.inp$analysis$bootstrap, perl=TRUE)) > 0L) {
      boot.type <- tolower(substr(mplus.inp$analysis$bootstrap, attr(boot.match, "capture.start"), attr(boot.match, "capture.start") + attr(boot.match, "capture.length") - 1L))
    }
    
    if (boot.type == "residual") test <- "Bollen.Stine"
    
    se <- "bootstrap"
    
    if ((nboot.match <- regexpr("^\\s*(\\d+)", mplus.inp$analysis$bootstrap, perl=TRUE)) > 0L) {
      bootstrap <- as.numeric(substr(mplus.inp$analysis$bootstrap, attr(nboot.match, "capture.start"), attr(nboot.match, "capture.start") + attr(nboot.match, "capture.length") - 1L))
    }    
  }
  
  if (run) {
    fit <- sem(mplus.inp$model, data=mplus.inp$data, meanstructure=meanstructure, mimic="Mplus", estimator=estimator, test=test, se=se, bootstrap=bootstrap, information=information)
    fit@external <- list(mplus.inp=mplus.inp)
  } else {
    fit <- mplus.inp #just return the syntax outside of a psindex object
  }
  
  return(fit)
}


divideIntoFields <- function(section.text, required) {
  
  if (is.null(section.text)) { return(NULL) }
  
  #The parser breaks down when there is a line with a trailing comment because then splitting on semicolon will combine it with the following line
  #Thus, trim off trailing comments before initial split
  section.text <- gsub("\\s*!.*$", "", section.text, perl=TRUE)
  section.split <- strsplit(paste(section.text, collapse=" "), ";", fixed=TRUE)[[1]] #split on semicolons
  section.divide <- list()
  
  for (cmd in section.split) {
    if (grepl("^\\s*!.*", cmd, perl=TRUE)) next #skip comment lines
    if (grepl("^\\s+$", cmd, perl=TRUE)) next #skip blank lines
    
    #mplus is apparently tolerant of specifications that don't include IS/ARE/=
    #example: usevariables x1-x10;
    #thus, split on spaces and assume that first element is lhs, drop second element if IS/ARE/=, and assume remainder is rhs
    
    #but if user uses equals sign, then spaces will not always be present (e.g., usevariables=x1-x10)
    if ( (leadingEquals <- regexpr("^\\s*[A-Za-z]+[A-Za-z_-]*\\s*(=)", cmd[1L], perl=TRUE))[1L] > 0) {
      cmdName <- trimSpace(substr(cmd[1L], 1, attr(leadingEquals, "capture.start") - 1))
      cmdArgs <- trimSpace(substr(cmd[1L], attr(leadingEquals, "capture.start") + 1, nchar(cmd[1L])))
    } else {
      cmd.spacesplit <- strsplit(trimSpace(cmd[1L]), "\\s+", perl=TRUE)[[1L]]
      
      if (length(cmd.spacesplit) < 2L) {
        #for future: make room for this function to prase things like just TECH13 (no rhs)
      } else {
        cmdName <- trimSpace(cmd.spacesplit[1L])
        if (length(cmd.spacesplit) > 2L && tolower(cmd.spacesplit[2L]) %in% c("is", "are")) {
          cmdArgs <- paste(cmd.spacesplit[3L:length(cmd.spacesplit)], collapse=" ") #remainder, removing is/are
        } else {
          cmdArgs <- paste(cmd.spacesplit[2L:length(cmd.spacesplit)], collapse=" ") #is/are not used, so just join rhs
        }
      }
      
    }
    
    section.divide[[make.names(tolower(cmdName))]] <- cmdArgs
    
  }
  
  if (!missing(required)) { stopifnot(all(required %in% names(section.divide))) }
  return(section.divide)
}

#helper function
splitFilePath <- function(abspath) {
  #function to split path into path and filename 
  #code adapted from R.utils filePath command
  if (!is.character(abspath)) stop("Path not a character string")
  if (nchar(abspath) < 1 || is.na(abspath)) stop("Path is missing or of zero length")
  
  components <- strsplit(abspath, split="[\\/]")[[1]]
  lcom <- length(components)
  
  stopifnot(lcom > 0)
  
  #the file is the last element in the list. In the case of length == 1, this will extract the only element.
  relFilename <- components[lcom]
  absolute <- FALSE
  
  if (lcom == 1) {
    dirpart <- NA_character_
  }
  else if (lcom > 1) {
    #drop the file from the list (the last element)
    components <- components[-lcom]
    dirpart <- do.call("file.path", as.list(components))
    
    #if path begins with C:, /, //, or \\, then treat as absolute
    if (grepl("^([A-Z]{1}:|/|//|\\\\)+.*$", dirpart, perl=TRUE)) absolute <- TRUE
  }
  
  return(list(directory=dirpart, filename=relFilename, absolute=absolute))
}

readMplusInputData <- function(mplus.inp, inpfile) {
  
  #handle issue of mplus2psindex being called with an absolute path, whereas mplus has only a local data file
  inpfile.split <- splitFilePath(inpfile)
  datfile.split <- splitFilePath(mplus.inp$data$file)
  
  #if inp file target directory is non-empty, but mplus data is without directory, then append
  #inp file directory to mplus data. This ensures that R need not be in the working directory
  #to read the dat file. But if mplus data has an absolute directory, don't append
  
  #if mplus data directory is present and absolute, or if no directory in input file, just use filename as is
  if (!is.na(datfile.split$directory) && datfile.split$absolute)
    datFile <- mplus.inp$data$file #just use mplus data filename if it has absolute path
  else if (is.na(inpfile.split$directory))
    datFile <- mplus.inp$data$file #just use mplus data filename if inp file is missing path (working dir)
  else
    datFile <- file.path(inpfile.split$directory, mplus.inp$data$file) #dat file path is relative or absent, and inp file directory is present
  
  if (!file.exists(datFile)) {
    warning("Cannot find data file: ", datFile)
    return(NULL)
  }
  
  #handle missing is/are:
  missList <- NULL
  if (!is.null(missSpec <- mplus.inp$variable$missing)) {
   
    expandMissVec <- function(missStr) {
      #sub-function to obtain a vector of all missing values within a set of parentheses
      missSplit <- strsplit(missStr, "\\s+")[[1L]]
      missVals <- c()
      for (f in missSplit) {
        if ((hyphenPos <- regexpr("\\d+(-)\\d+", f, perl=TRUE))[1L] > -1L) {
          #expand hyphen
          preHyphen <- substr(f, 1, attr(hyphenPos, "capture.start") - 1)
          postHyphen <- substr(f, attr(hyphenPos, "capture.start") + 1, nchar(f))
          missVals <- c(missVals, as.character(seq(preHyphen, postHyphen)))
        } else {
          #append to vector
          missVals <- c(missVals, f)
        }
      }
      return(as.numeric(missVals))
    }
    
    if (missSpec == "." || missSpec=="*") { #case 1: MISSING ARE|=|IS .;
      na.strings <- missSpec
    } else if ((allMatch <- regexpr("\\s*ALL\\s*\\(([^\\)]+)\\)", missSpec, perl=TRUE))[1L] > -1L) { #case 2: use of ALL with parens
      missStr <- trimSpace(substr(missSpec, attr(allMatch, "capture.start"), attr(allMatch, "capture.start") + attr(allMatch, "capture.length") - 1L))
      na.strings <- expandMissVec(missStr)
    } else { #case 3: specific missing values per variable
      #process each element
      missBlocks <- gregexpr("(?:(\\w+)\\s+\\(([^\\)]+)\\))+", missSpec, perl=TRUE)[[1]]
      missList <- list()
      
      if (missBlocks[1L] > -1L) {
        for (i in 1:length(missBlocks)) {
          vname <- substr(missSpec, attr(missBlocks, "capture.start")[i,1L], attr(missBlocks, "capture.start")[i,1L] + attr(missBlocks, "capture.length")[i,1L] - 1L)
          vmiss <- substr(missSpec, attr(missBlocks, "capture.start")[i,2L], attr(missBlocks, "capture.start")[i,2L] + attr(missBlocks, "capture.length")[i,2L] - 1L)
          
          vnameHyphen <- regexpr("(\\w+)-(\\w+)", vname, perl=TRUE)[1L]
          if (vnameHyphen > -1L) {
            #lookup against variable names
            vstart <- which(mplus.inp$variable$names == substr(vname, attr(vnameHyphen, "capture.start")[1L], attr(vnameHyphen, "capture.start")[1L] + attr(vnameHyphen, "capture.length")[1L] - 1L))
            vend <- which(mplus.inp$variable$names == substr(vname, attr(vnameHyphen, "capture.start")[2L], attr(vnameHyphen, "capture.start")[2L] + attr(vnameHyphen, "capture.length")[2L] - 1L))
            if (length(vstart) == 0L || length(vend) == 0L) { stop("Unable to lookup missing variable list: ", vname) }
            #I suppose start or finish could be mixed up
            if (vstart > vend) { vstart.orig <- vstart; vstart <- vend; vend <- vstart.orig }
            vname <- mplus.inp$variable$names[vstart:vend]
          }
          
          missVals <- expandMissVec(vmiss)
          
          for (j in 1:length(vname)) {
            missList[[ vname[j] ]] <- missVals
          }
          
        }
      } else { stop("I don't understand this missing specification: ", missSpec) }
    }
  } else { na.strings <- "NA" }

  if (!is.null(missList)) {
    dat <- read.table(datFile, header=FALSE, col.names=mplus.inp$variable$names, colClasses="numeric")
    #loop over variables in missList and set missing values to NA
    dat[,names(missList)] <- lapply(names(missList), function(vmiss) {
            dat[which(dat[,vmiss] %in% missList[[vmiss]]), vmiss] <- NA
            return(dat[,vmiss])
        })
    
    names(dat) <- mplus.inp$variable$names #loses these from the lapply
  
  } else {
    dat <- read.table(datFile, header=FALSE, col.names=mplus.inp$variable$names, na.strings=na.strings, colClasses="numeric")
  }
  
  
  #TODO: support covariance/mean+cov inputs
  
  #store categorical variables as ordered factors
  if (!is.null(mplus.inp$variable$categorical)) {
    dat[,c(mplus.inp$variable$categorical)] <- lapply(dat[,c(mplus.inp$variable$categorical), drop=FALSE], ordered) 
  }
  
  return(dat)
}


divideInputIntoSections <- function(inpfile.text, filename) {
  inputHeaders <- grep("^\\s*(title:|data.*:|variable:|define:|analysis:|model.*:|output:|savedata:|plot:|montecarlo:)", inpfile.text, ignore.case=TRUE, perl=TRUE)
  
  stopifnot(length(inputHeaders) > 0L)
  
  mplus.sections <- list()
  
  for (h in 1:length(inputHeaders)) {
    sectionEnd <- ifelse(h < length(inputHeaders), inputHeaders[h+1] - 1, length(inpfile.text))
    section <- inpfile.text[inputHeaders[h]:sectionEnd]
    sectionName <- trimSpace(sub("^([^:]+):.*$", "\\1", section[1L], perl=TRUE)) #obtain text before the colon
    
    #dump section name from input syntax
    section[1L] <- sub("^[^:]+:(.*)$", "\\1", section[1L], perl=TRUE)
    
    mplus.sections[[make.names(tolower(sectionName))]] <- section
  }
  
  return(mplus.sections)
}