R/lav_syntax_parser_r.R

Defines functions lav_parse_model_string_r lav_parse_get_modifier lav_parse_adapt_vector_type lav_parse_evaluate_r_expression lav_parse_unpaste lav_parse_num_modifier lav_parse_check_valid_name lav_parse_step2 lav_parse_step1 lav_local_msgcode lav_parse_sublist lav_create_enum

# ----------------------- lav_create_enum ------------------------------------ #
# function to create an Enumerable like structure in R
#  usage example  mycolors <- lav_create_enum(c("black", "white",
#                                     "orange", "green", "red", "blue"))
#                 xyz <- mycolors$red
# values are default 1L, ..., number of names, but can be user specified
# ---------------------------------------------------------------------------- #
lav_create_enum <- function(names, values = seq_along(names)) {
  stopifnot(identical(unique(names), names), is.character(names))
  stopifnot(length(names) == length(values))
  res <- as.list(setNames(values, names))
  res$enum.names <- names
  res$enum.values <- values
  res$enum.size <- length(values)
  res <- as.environment(res)
  lockEnvironment(res, bindings = TRUE)
  res
}

# ------------------------ lav_parse_sublist --------------------------------- #
# function to create a list with only some indexes for all members
# ---------------------------------------------------------------------------- #
lav_parse_sublist <- function(inlist, indexes) {
  for (j in seq_along(inlist)) {
    inlist[[j]] <- inlist[[j]][indexes]
  }
  inlist
}

# ------------------------ lav_local_msgcode  -------------------------------- #
# function which stores a warning or error message code + position in a list
# ---------------------------------------------------------------------------- #
lav_local_msgcode <- function(isError, msgcode, msgpos, msgenv) {
    if (isError) {
        assign("error", c(msgcode, msgpos), msgenv)
    } else {
        wrnnum <- 1L + get0("warncount", envir = msgenv, ifnotfound = 0L)
        assign(paste0("warn", sprintf("%03d", wrnnum)),
               c(msgcode, msgpos - 1L), msgenv)  # msgpos - 1L to align with C
        assign("warncount", wrnnum, msgenv)
    }
    return(invisible(NULL))
}

# ------------------------ lav_parse_step1 ----------------------------------- #
# function to split the model source in tokens.
# Returns a list with tokens with their attributes
#   elem.pos  : position in source
#   elem.type : type of token (cf. definition of types
#               in lav_parse_model_string)
#   elem.text : the text of the token
#   elem.formule.number : sequence number of the 'logical'
#                         formula where the token occurs
# the function returns the stored tokens in a list
# ---------------------------------------------------------------------------- #
lav_parse_step1 <- function(modelsrc, types, msgenv) {
  nmax <- nchar(modelsrc)
  elem.pos <- vector("integer", nmax)
  elem.type <- elem.pos
  elem.text <- vector("character", nmax)
  elem.i <- 1L
  modelsrcw <- paste0(modelsrc, "\n") # working model, must end
  # with a newline for tests via regexpr
  stringliterals <- gregexpr("\"[^\"]*?[\"\n]", modelsrcw)[[1L]]
  if (stringliterals[1L] > -1L) {
    stringliteral.lengths <- attr(stringliterals, "match.length")
    for (i in seq_along(stringliterals)) {
      pfpos <- stringliterals[i]
      pflen <- stringliteral.lengths[i]
      substr(modelsrcw, pfpos + 1L, pfpos + pflen - 2L) <-
        strrep(" ", pflen - 2L)
      elem.pos[elem.i] <- pfpos
      elem.text[elem.i] <- substr(modelsrc, pfpos + 1L, pfpos + pflen - 2L)
      elem.type[elem.i] <- types$stringliteral
      elem.i <- elem.i + 1L
    }
  }
  comments <- gregexpr("[#!].*?\n", modelsrcw)[[1L]]
  if (comments[1] > -1L) {
    comment.lengths <- attr(comments, "match.length")
    for (i in seq_along(comments)) {
      substr(modelsrcw, comments[i], comments[i] + comment.lengths[i] - 1L) <-
        strrep(" ", comment.lengths[i] - 1L)
      # check for stringliterals in comment
      str.in.comment <- (elem.pos > comments[i] &
        elem.pos < comments[i] + comment.lengths[i])
      if (any(str.in.comment)) {
        elem.type[str.in.comment] <- 0
      }
    }
  }
  modelsrcw <- gsub("\t", " ", modelsrcw)
  newlines <- gregexpr("[;\n]", modelsrcw)[[1L]]
  if (newlines[1L] > -1L) {
    for (i in seq_along(newlines)) {
      pfpos <- newlines[i]
      substr(modelsrcw, pfpos, pfpos) <- "\n"
      elem.pos[elem.i] <- pfpos
      elem.text[elem.i] <- "\n"
      elem.type[elem.i] <- types$newline
      elem.i <- elem.i + 1L
    }
  }
  # --------------------- handling spaces in operators ------------------------ #
  if (grepl("= +~", modelsrcw)) {
    waar <- regexpr("= +~", modelsrcw)[1]
    modelsrcw <- gsub("=( +)~", "=~\\1", modelsrcw)
    lav_local_msgcode(FALSE, 101L, waar, msgenv)
  }
  if (grepl("[^=~]~ +~", modelsrcw)) {
    waar <- regexpr("[^=~]~ +~", modelsrcw)[1]
    modelsrcw <- gsub("([^=~])~( +)~", "\\1~~\\2", modelsrcw)
    lav_local_msgcode(FALSE, 101L, waar, msgenv)
  }
  # -------------------------------------------------------------------------- #
  lavops <- gregexpr("=~|<~|~\\*~|~~|~|==|<|>|:=|:|\\||%", modelsrcw)[[1]]
  if (lavops[1L] > -1L) {
    lavop.lengths <- attr(lavops, "match.length")
    for (i in seq_along(lavops)) {
      pfpos <- lavops[i]
      pflen <- lavop.lengths[i]
      elem.pos[elem.i] <- pfpos
      elem.text[elem.i] <- substr(modelsrcw, pfpos, pfpos + pflen - 1L)
      elem.type[elem.i] <- types$lavaanoperator
      substr(modelsrcw, pfpos, pfpos + pflen - 1L) <- strrep(" ", pflen)
      elem.i <- elem.i + 1L
    }
  }
  symbols <- gregexpr("[,()/*?^']", modelsrcw)[[1L]] # f1=~x2 + 0.5 ? x3
  symbols1 <- gregexpr("[-+][^.0-9]", modelsrcw)[[1L]] # f1=~x2+x3
  symbols2 <- gregexpr("[._0-9a-df-zA-DF-Z)] *[-+][.0-9]", modelsrcw)[[1L]]
  # f1=~x2+2*x3, len-2 !
  symbols3 <- gregexpr("[^.0-9][eE] *[-+][.0-9]", modelsrcw)[[1L]]
  # f1=~xe+2*x3, len-2 !
  if (symbols1[1L] > -1L) {
    if (symbols[1L] == -1L) {
      symbols <- symbols1
    } else {
      symbols <- c(symbols, symbols1)
    }
  }
  if (symbols2[1L] > -1L) {
    symbols2.lengths <- attr(symbols2, "match.length")
    symbols2 <- symbols2 + symbols2.lengths - 2L
    if (symbols[1L] == -1L) {
      symbols <- symbols2
    } else {
      symbols <- c(symbols, symbols2)
    }
  }
  if (symbols3[1L] > -1L) {
    symbols3.lengths <- attr(symbols3, "match.length")
    symbols3 <- symbols3 + symbols3.lengths - 2L
    if (symbols[1L] == -1L) {
      symbols <- symbols3
    } else {
      symbols <- c(symbols, symbols3)
    }
  }
  if (symbols[1L] > -1L) {
    for (i in seq_along(symbols)) {
      pfpos <- symbols[i]
      substr(modelsrcw, pfpos, pfpos) <- " "
      elem.pos[elem.i] <- pfpos
      elem.text[elem.i] <- substr(modelsrc, pfpos, pfpos)
      elem.type[elem.i] <- types$symbol
      elem.i <- elem.i + 1L
    }
  }

  numliterals <- gregexpr(
    "([ \n][-+][.0-9]|[ \n]\\.[0-9]|[ \n][0-9])[-+\\.0-9eE]*",
    paste0(" ",modelsrcw)
  )[[1]]
  if (numliterals[1L] > -1L) {
    numliteral.lengths <- attr(numliterals, "match.length") - 1L
    for (i in seq_along(numliterals)) {
      pfpos <- numliterals[i]
      pflen <- numliteral.lengths[i]
      substr(modelsrcw, pfpos, pfpos + pflen - 1L) <- strrep(" ", pflen)
      elem.pos[elem.i] <- pfpos
      elem.text[elem.i] <- substr(modelsrc, pfpos, pfpos + pflen - 1L)
      elem.type[elem.i] <- types$numliteral
      elem.i <- elem.i + 1L
    }
  }
  identifiers <- gregexpr(
    "[ \n][_.[:alpha:]][_.[:alnum:]]*",
    paste0(" ", modelsrcw)
  )[[1]]
  identifier.lengths <- attr(identifiers, "match.length") - 1L
  for (i in seq_along(identifiers)) {
    pfpos <- identifiers[i]
    pflen <- identifier.lengths[i]
    substr(modelsrcw, pfpos, pfpos + pflen - 1L) <- strrep(" ", pflen)
    elem.pos[elem.i] <- pfpos
    elem.text[elem.i] <- substr(modelsrc, pfpos, pfpos + pflen - 1L)
    elem.type[elem.i] <- types$identifier
    elem.i <- elem.i + 1L
  }
  # check for uninterpreted chars
  wrong <- regexpr("[^\"\n ]", modelsrcw)
  if (wrong != -1L) {
      lav_local_msgcode(TRUE, 24L, wrong, msgenv)
      return(msgenv)
  }
  # remove unused elements from vectors
  elements <- which(elem.type > 0L)
  elem.pos <- elem.pos[elements]
  elem.type <- elem.type[elements]
  elem.text <- elem.text[elements]
  # order tokens
  token.order <- order(elem.pos)
  elem.pos <- elem.pos[token.order]
  elem.type <- elem.type[token.order]
  elem.text <- elem.text[token.order]

  # concatenate identifiers with only spaces in between - LDW 22/4/2024
  elem.i <- length(elem.pos)
  concatenated <- FALSE
  while (elem.i > 1L) {
    if (any(elem.type[elem.i] == c(types$identifier, types$numliteral)) &&
      elem.type[elem.i - 1L] == types$identifier) {
      spaces.between <- elem.pos[elem.i] - elem.pos[elem.i - 1L] -
        length(elem.text[elem.i - 1L])
      elem.text[elem.i - 1L] <- paste0(
        elem.text[elem.i - 1L],
        strrep(" ", spaces.between),
        elem.text[elem.i]
      )
      elem.type[elem.i] <- 0L
      concatenated <- TRUE
    }
    elem.i <- elem.i - 1L
  }
  if (concatenated) { # remove items with type 0
    elements <- which(elem.type > 0L)
    elem.pos <- elem.pos[elements]
    elem.type <- elem.type[elements]
    elem.text <- elem.text[elements]
  }

  # to set formula number
  elem.formula.number <- rep(0L, length(elem.type))
  frm.number <- 1L
  frm.hasefa <- FALSE
  frm.lastplus <- FALSE
  frm.incremented <- FALSE
  for (i in seq_along(elem.type)) {
    elem.formula.number[i] <- frm.number
    if (elem.type[i] == types$identifier && elem.text[i] == "efa") {
      frm.hasefa <- TRUE
    }
    if (any(elem.text[i] == c("+", "*", "=~", "-"))) {
      if (frm.incremented) {
        frm.number <- frm.number - 1L
        elem.formula.number[i] <- frm.number
        frm.incremented <- FALSE
      }
      frm.lastplus <- TRUE
    } else {
      if (any(elem.type[i] == c(
        types$stringliteral, types$identifier, types$numliteral,
        types$stringliteral, types$symbol
      ))) {
        frm.lastplus <- FALSE
      }
      if (i > 1 && elem.type[i] != types$newline &&
        elem.type[i - 1L] == types$lavaanoperator) {
        frm.hasefa <- FALSE
      }
    }
    if (elem.type[i] == types$newline) {
      if (i > 1 && elem.type[i - 1L] != types$newline) { # ignore multiple nl's
        if (!frm.hasefa && !frm.lastplus) {
          frm.number <- frm.number + 1L
          frm.incremented <- TRUE
        } else {
          frm.hasefa <- FALSE
        }
      }
    } else {
      frm.incremented <- FALSE
    }
  }
  return(list(
    elem.pos = elem.pos, elem.type = elem.type,
    elem.text = elem.text, elem.formula.number = elem.formula.number
  ))
}

# ------------------------ lav_parse_step2 ----------------------------------- #
# function to group the modellist tokens in 'mono' formulas.
# mono means that the terms (for formulas other then blocks and constraints)
#   are split in seperate formula's, e.g.
#   a1 + a2 =~ b1 + b2  becomes
#     /  a1 =~ b1
#     |  a1 =~ b2
#     |  a2 =~ b1
#     \  a2 =~ b2
# newlines are removed
# the function returns a list of formulas
# ---------------------------------------------------------------------------- #
lav_parse_step2 <- function(modellist, modelsrc, types, msgenv) {
  real.operators <- c("=~", "<~", "~*~", "~~", "~", "|", "%")
  welke <- modellist$elem.type != types$newline
  formula.numbers <- unique(modellist$elem.formula.number[welke])
  formulas <- lapply(formula.numbers, function(s) {
    welkenu <- modellist$elem.formula.number == s & welke
    list(
      elem.pos = modellist$elem.pos[welkenu],
      elem.type = modellist$elem.type[welkenu],
      elem.text = modellist$elem.text[welkenu]
    )
  })
  maxnum <- length(formula.numbers) + sum(modellist$elem.text == "+")
  outval <- vector(mode = "list", length = maxnum)
  realnum <- 0L
  for (i in seq_along(formulas)) {
    formul1 <- formulas[[i]]
    opi <- which(formul1$elem.type == types$lavaanoperator)
    nelem <- length(formul1$elem.type)
    if (length(opi) == 0L) {
        lav_local_msgcode(TRUE, 31L, formul1$elem.pos[1], msgenv)
        return(msgenv)
    }
    if (length(opi) > 1L) opi <- opi[1] # only first operator taken
    if (any(formul1$elem.text[opi] == real.operators) &&
      sum(formul1$elem.text == "+") > 0) {
      # check + symbols outside parentheses in left and right hand side
      lhplusjes <- integer(0)
      openparentheses <- 0L
      for (jj in seq.int(1L, opi - 1L)) {
        if (formul1$elem.text[jj] == "(") {
          openparentheses <- openparentheses + 1L
          next
        }
        if (formul1$elem.text[jj] == ")") {
          openparentheses <- openparentheses - 1L
          next
        }
        if (formul1$elem.text[jj] == "+" && openparentheses == 0L) {
          lhplusjes <- c(lhplusjes, jj)
        }
      }
      lhplusjes <- c(lhplusjes, opi)
      plusjes <- integer(0)
      openparentheses <- 0L
      for (jj in seq.int(opi + 1L, nelem)) {
        if (formul1$elem.text[jj] == "(") {
          openparentheses <- openparentheses + 1L
          next
        }
        if (formul1$elem.text[jj] == ")") {
          openparentheses <- openparentheses - 1L
          next
        }
        if (formul1$elem.text[jj] == "+" && openparentheses == 0L) {
          plusjes <- c(plusjes, jj)
        }
      }
      plusjes <- c(plusjes, nelem + 1)
      # splitting lhs and rhs on '+' signs
      for (j in seq_along(lhplusjes)) {
        j0 <- 1L
        if (j > 1L) j0 <- lhplusjes[j - 1L] + 1L
        j1 <- lhplusjes[j] - 1L
        if (j1 < j0) next # skip empty parts
        for (k in seq_along(plusjes)) {
          k0 <- opi + 1L
          k1 <- plusjes[k] - 1L
          if (k > 1L) k0 <- plusjes[k - 1L] + 1L
          if (k1 < k0) next # skip empty parts
          welke <- c(seq.int(j0, j1), opi, seq.int(k0, k1))
          realnum <- realnum + 1L
          outval[[realnum]] <- lav_parse_sublist(formul1, welke)
        }
      }
    } else {
      realnum <- realnum + 1L
      outval[[realnum]] <- formul1
    }
  }
  outval[seq_len(realnum)]
}
# ------------------------ lav_parse_check_valid_name ------------------------ #
# checks if an element of the elem.text member in a list is a valid r-name
# ---------------------------------------------------------------------------- #
lav_parse_check_valid_name <- function(formul1, ind, modelsrc, msgenv) {
  # allow spaces, LDW 22/4/2024
  testitem <- gsub(" ", "_", formul1$elem.text[ind], fixed = TRUE)

  if (make.names(testitem) != testitem) {
    lav_local_msgcode(TRUE, 41L, formul1$elem.pos[ind], msgenv)
    return(msgenv)
  }
  return(invisible(NULL))
}
# ------------------------ lav_parse_num_modifier  --------------------------------- #
# function for transforming string with numeric values separated by semicolons
# in a numeric vector (used in lav_parse_get_modifier)
# ---------------------------------------------------------------------------- #
lav_parse_num_modifier <- function(txt) {
  # help function
  vapply(strsplit(txt, ";")[[1]], function(x) {
    if (x == "NA") NA_real_ else as.numeric(x)
  }, 1.0, USE.NAMES = FALSE)
}
# ------------------------ lav_parse_unpaste  -------------------------------------- #
# function for transforming string with string values separated by semicolons
# in a vector (used in lav_parse_get_modifier)
# ---------------------------------------------------------------------------- #
lav_parse_unpaste <- function(text) {
  out <- strsplit(text, ";(NA;)*")[[1]]
  if (grepl(";$", text)) out <- c(out, "")
  out
}
# ------------------------ lav_parse_evaluate_r_expression ------------------------- #
# help function to evaluate the value of an r expression formed by the elements
# with index 'from' to 'to' of a formula 'formul1'
# returns "_error_" if evaluation failed
# used only in lav_parse_get_modifier
# ---------------------------------------------------------------------------- #
lav_parse_evaluate_r_expression <- function(formul1, from, to, types) {
  strings <- vapply(seq.int(from, to), function(x) {
    if (formul1$elem.type[x] == types$stringliteral) {
      paste0('"', formul1$elem.text[x], '"')
    } else {
      formul1$elem.text[x]
    }
  }, "")
  txt <- paste(strings, collapse = "")
  result <- try(eval(parse(text = txt),
    envir = NULL,
    enclos = baseenv()
  ), silent = TRUE)
  if (inherits(result, "try-error")) {
    return("_error_")
  }
  return(result)
}
# ------------------------ lav_parse_adapt_vector_type ----------------------------- #
# help function to dynamically adapt the type of a vector in a c(...) sequence
# used only in lav_parse_get_modifier
# ---------------------------------------------------------------------------- #
lav_parse_adapt_vector_type <- function(typenu, typetoadd, texttoadd, types) {
  if (texttoadd != "NA") {
    if (typenu == 0) {
      typenu <- typetoadd
    } else {
      if (typenu != typetoadd) typenu <- types$stringliteral
    }
  } else if (typenu == 0) {
    typenu <- types$numliteral
  }
  return(typenu)
}
# ------------------------ lav_parse_get_modifier ---------------------------- #
# The function takes a list with tokens belonging to a single 'mono' lavaan
# formula as input. The other arguments are:
#       lhs : check for lhs or rhs modifier
#       opi : index of the lavaan operator in the list-items
#  modelsrc : the model source string (for error messages and warnings)
#     types : the types of tokens
#       rme : index of last element of modifier in formula (*)
#   rmeprev : index of first element of modifier in formula - 1L (*)
# The function return the modifier detected as element of a list
#  with name the modifier type (efa, fixed, start, label, lower, upper, prior or
#   rv) and value an array of values (length > 1 if vector via c(...)) for the
#   modifier value.
# (*) if rme > remprev the rhs is limited to the elements with index
#     rmeprev+1:rme, this is to support multiple modifiers for the same element.
# An error message is produced when no modifier can be determined.
# ---------------------------------------------------------------------------- #
lav_parse_get_modifier <- function(formul1, lhs, opi, modelsrc, types,
                                   rme = 0L, rmeprev = 0L, msgenv) {
  if (rme > rmeprev) {
    welke <- c(seq.int(1L, opi), seq.int(rmeprev + 1L, rme), length(formul1))
    formul1 <- lav_parse_sublist(formul1, welke)
  }
  nelem <- length(formul1$elem.type)
  # remove unnecessary parentheses (one element between parentheses, previous
  # no identifier)
  check.more <- TRUE
  while (check.more && nelem > 4L) {
    check.more <- FALSE
    for (par.i in seq.int(3L, nelem - 1L)) {
      if (formul1$elem.text[par.i - 1L] == "(" &&
        formul1$elem.text[par.i + 1L] == ")" &&
        formul1$elem.type[par.i - 2L] != types$identifier) {
        formul1$elem.type[par.i - 1L] <- 0L
        formul1$elem.type[par.i + 1L] <- 0L
        check.more <- TRUE
      }
    }
    if (check.more) {
      formul1 <- lav_parse_sublist(formul1, which(formul1$elem.type > 0))
      nelem <- length(formul1$elem.type)
    }
  }
  if (lhs) {
    # modifier on left hand side
    # only 1 possibility : efa ( expression-resulting-in-char ) *
    #                                        identifier operator ... (rhs) ...
    if (formul1$elem.text[1L] == "efa" &&
      formul1$elem.text[2L] == "(" &&
      formul1$elem.text[opi - 3L] == ")" &&
      formul1$elem.text[opi - 2L] == "*") {
      temp <- lav_parse_evaluate_r_expression(formul1, 3L, opi - 4L, types)
      if (is.character(temp) && temp[1] != "_error_") {
        return(list(efa = temp))
      }
    }
    lav_local_msgcode(TRUE, 42L, formul1$elem.pos[1], msgenv)
    return(msgenv)
  } else {
    # modifier on right hand side
    # check for vectors c(...), start(...), fixed(...), ...
    for (j in (opi + 1L):(nelem - 2L)) {
      if (formul1$elem.text[j + 1L] == "(") {
        if (formul1$elem.text[j] == "c") {
          vector.type <- 0
          labnu <- j + 2L
          lab <- formul1$elem.text[labnu]
          vector.type <- lav_parse_adapt_vector_type(
            vector.type, formul1$elem.type[labnu],
            formul1$elem.text[labnu], types
          )
          while (formul1$elem.text[labnu + 1L] == ",") {
            labnu <- labnu + 2L
            lab <- c(lab, formul1$elem.text[labnu])
            vector.type <- lav_parse_adapt_vector_type(
              vector.type, formul1$elem.type[labnu],
              formul1$elem.text[labnu], types
            )
          }
          if (vector.type == 0) vector.type <- types$stringliteral
          if (formul1$elem.text[labnu + 1L] == ")") {
            formul1$elem.type[seq.int(j, labnu)] <- 0
            formul1$elem.type[labnu + 1L] <- vector.type
            formul1$elem.text[labnu + 1L] <- paste(lab, collapse = ";")
            formul1 <- lav_parse_sublist(formul1, which(formul1$elem.type > 0))
            nelem <- length(formul1$elem.type)
            break
          } else {
            lav_local_msgcode(TRUE, 43L, formul1$elem.pos[j], msgenv)
            return(msgenv)
          }
        }
        if (j + 3L < nelem && formul1$elem.text[j + 3L] == "," &&
          any(formul1$elem.text[j] == c(
            "start", "fixed", "label",
            "upper", "lower", "rv", "prior"
          ))) {
          vector.type <- 0
          labnu <- j + 2L
          lab <- formul1$elem.text[labnu]
          vector.type <- lav_parse_adapt_vector_type(
            vector.type, formul1$elem.type[labnu],
            formul1$elem.text[labnu], types
          )
          while (formul1$elem.text[labnu + 1L] == ",") {
            labnu <- labnu + 2L
            lab <- c(lab, formul1$elem.text[labnu])
            vector.type <- lav_parse_adapt_vector_type(
              vector.type, formul1$elem.type[labnu],
              formul1$elem.text[labnu], types
            )
          }
          if (vector.type == 0) vector.type <- types$stringliteral
          if (formul1$elem.text[labnu + 1L] == ")") {
            formul1$elem.type[seq.int(j + 3L, labnu)] <- 0
            formul1$elem.type[j + 2L] <- vector.type
            formul1$elem.text[j + 2L] <- paste(lab, collapse = ";")
            formul1 <- lav_parse_sublist(formul1, which(formul1$elem.type > 0))
            nelem <- length(formul1$elem.type)
            break
          } else {
            lav_local_msgcode(TRUE, 43L, formul1$elem.pos[j], msgenv)
            return(msgenv)
          }
        }
      }
    }
    # possibilities
    # stringliteral|identifier * identifier|numliteral
    # numliteral * identifier|numliteral
    # numliteral ? identifier|numliteral
    # fixed|start|upper|lower|rv|prior(numliteral) * identifier|numliteral
    # label|equal (stringliteral|identifier) * identifier|numliteral
    # ==> literals before * or ? can be replaced by R-expression
    #     resulting in correct type
    # check on last element being a numliteral or identifier
    #                       already done in calling function
    if (all(formul1$elem.text[nelem - 1L] != c("*", "?"))) {
        lav_local_msgcode(TRUE, 49L, formul1$elem.pos[nelem - 1L], msgenv)
        return(msgenv)
    }
    if (formul1$elem.text[nelem - 1L] == "?") {
      temp <- lav_parse_evaluate_r_expression(formul1, opi + 1L, nelem - 2L, types)
      if (is.numeric(temp)) {
        return(list(start = temp))
      }
      lav_local_msgcode(TRUE, 45L, formul1$elem.pos[opi + 1L], msgenv)
      return(msgenv)
    }
    if (nelem == opi + 3) {
      if (formul1$elem.text[opi + 1L] == "NA") {
        formul1$elem.type[opi + 1L] <- types$numliteral
      }
      if (any(formul1$elem.type[opi + 1L] ==
        c(types$identifier, types$stringliteral))) {
        return(list(label = lav_parse_unpaste(formul1$elem.text[opi + 1L])))
      } else {
        if (formul1$elem.type[opi + 1L] == types$numliteral) {
          return(list(fixed = lav_parse_num_modifier(formul1$elem.text[opi + 1L])))
        } else {
          lav_local_msgcode(TRUE, 44L, formul1$elem.pos[opi + 1L], msgenv)
          return(msgenv)
        }
      }
    }
    if (formul1$elem.text[opi + 2L] == "(" &&
      formul1$elem.text[nelem - 2L] == ")") {
      if (any(formul1$elem.text[opi + 1L] ==
        c("fixed", "start", "upper", "lower", "prior"))) {
        if (nelem == opi + 6L) {
          if (formul1$elem.text[opi + 3L] == "NA") {
            formul1$elem.type[opi + 3L] <- types$numliteral
          }
          if (formul1$elem.type[opi + 3L] == types$numliteral) {
            outje <- list()
            outje[[formul1$elem.text[opi + 1L]]] <-
              lav_parse_num_modifier(formul1$elem.text[opi + 3L])
            return(outje)
          }
          lav_local_msgcode(TRUE, 45L, formul1$elem.pos[opi + 3L], msgenv)
          return(msgenv)
        }
        temp <- lav_parse_evaluate_r_expression(formul1, opi + 3L, nelem - 3L, types)
        if (is.numeric(temp)) {
          outje <- list()
          outje[[formul1$elem.text[opi + 1L]]] <- temp
          return(outje)
        }
        lav_local_msgcode(TRUE, 45L, formul1$elem.pos[opi + 3L], msgenv)
        return(msgenv)
      }
      if (any(formul1$elem.text[opi + 1L] == c("equal", "rv", "label"))) {
        modname <- formul1$elem.text[opi + 1L]
        if (modname == "equal") modname <- "label"
        if (nelem == opi + 6L) {
          if (formul1$elem.type[opi + 3L] == types$stringliteral) {
            outje <- list()
            outje[[modname]] <-
              lav_parse_unpaste(formul1$elem.text[opi + 3L])
            return(outje)
          }
          lav_local_msgcode(TRUE, 46L, formul1$elem.pos[opi + 3L], msgenv)
          return(msgenv)
        }
        temp <- lav_parse_evaluate_r_expression(formul1, opi + 3L, nelem - 3L, types)
        if (is.character(temp) && temp[1] != "_error_") {
          outje <- list()
          outje[[modname]] <- temp
          return(outje)
        }
        lav_local_msgcode(TRUE, 46L, formul1$elem.pos[opi + 3L], msgenv)
        return(msgenv)
      }
    }
    temp <- lav_parse_evaluate_r_expression(formul1, opi + 1L, nelem - 2L, types)
    if (is.numeric(temp)) {
      return(list(fixed = temp))
    }
    if (is.character(temp) && temp[1] != "_error_") {
      return(list(label = temp))
    }
    lav_local_msgcode(TRUE, 50L, formul1$elem.pos[opi + 1L], msgenv)
    return(msgenv)
  }
}
# -------------------- main parsing function --------------------------------- #
lav_parse_model_string_r <- function(model.syntax = "", as.data.frame. = FALSE) {
  stopifnot(length(model.syntax) > 0L)
  # replace 'strange' tildes (in some locales) (new in 0.6-6)
  modelsrc <- gsub(
    pattern = "\u02dc",
    replacement = "~",
    paste(unlist(model.syntax), "", collapse = "\n")
  )
  msgenv <- new.env(hash = FALSE, parent = emptyenv(), size = 5L)
  types <- lav_create_enum(c(
    "identifier", "numliteral", "stringliteral",
    "symbol", "lavaanoperator", "newline"
  ))
  modellist <- lav_parse_step1(modelsrc, types, msgenv)
  if (exists("error", envir = msgenv)) return(msgenv$error);
  if (lav_debug()) {
    print(data.frame(
      pos = modellist$elem.pos,
      type = types$enum.names[modellist$elem.type],
      text = modellist$elem.text,
      formula = modellist$elem.formula.number
    ))
  }
  formulalist <- lav_parse_step2(modellist, modelsrc, types, msgenv)
  #---- analyse syntax formulas and put in flat.-----
  max.mono.formulas <- length(formulalist)
  flat.lhs <- character(max.mono.formulas)
  flat.op <- character(max.mono.formulas)
  flat.rhs <- character(max.mono.formulas)
  flat.rhs.mod.idx <- integer(max.mono.formulas)
  flat.block <- integer(max.mono.formulas) # keep track of groups using ":" opr
  flat.fixed <- character(max.mono.formulas) # only for display purposes!
  flat.start <- character(max.mono.formulas) # only for display purposes!
  flat.lower <- character(max.mono.formulas) # only for display purposes!
  flat.upper <- character(max.mono.formulas) # only for display purposes!
  flat.label <- character(max.mono.formulas) # only for display purposes!
  flat.prior <- character(max.mono.formulas)
  flat.efa <- character(max.mono.formulas)
  flat.rv <- character(max.mono.formulas)
  flat.idx <- 0L
  mod.idx <- 0L
  constraints <- list()
  mod <- list()
  block <- 1L
  block.op <- FALSE
  if (lav_debug()) {
    cat("formula to analyse:\n")
  }
  #  operators <- c("=~", "<~", "~*~", "~~", "~", "==", "<", ">", ":=",
  #                 ":", "\\|", "%")
  constraint_operators <- c("==", "<", ">", ":=")
  for (s in seq_along(formulalist)) {
    formul1 <- formulalist[[s]]
    if (lav_debug()) {
      cat(vapply(seq_along(formul1$elem.type), function(j) {
        if (formul1$elem.type[j] == types$stringliteral) {
          return(dQuote(formul1$elem.text[j], FALSE))
        }
        return(formul1$elem.text[j])
      }, ""), "\n")
    }
    nelem <- length(formul1$elem.type)
    # where is the operator
    opi <- match(types$lavaanoperator, formul1$elem.type)
    # opi <- which(formul1$elem.type == types$lavaanoperator)
    # if (length(opi) > 1L) opi <- opi[1L]
    op <- formul1$elem.text[opi]
    if (any(op == constraint_operators)) { # ----- constraints -------
      lhs <- paste(formul1$elem.text[seq.int(1L, opi - 1L)], collapse = "")
      rhs <- paste(formul1$elem.text[seq.int(opi + 1L, nelem)], collapse = "")
      constraints <- c(
        constraints,
        list(list(
          op = op,
          lhs = lhs,
          rhs = rhs,
          user = 1L
        ))
      )
      next
    }
    if (op == ":") { # ------------------------- block start ----------------- #
      if (opi == 1L) {
        return(c(47L, formul1$elem.pos[1] - 1L))
      }
      if (opi > 2L || all(tolower(formul1$elem.text[1]) !=
        c("group", "level", "block", "class"))) {
        return(c(47L, formul1$elem.pos[1] - 1L))
      }
      if (nelem != 3 || all(formul1$elem.type[3] !=
        c(types$stringliteral, types$identifier, types$numliteral))) {
        return(c(47L, formul1$elem.pos[1] - 1L))
      }
      flat.idx <- flat.idx + 1L
      flat.lhs[flat.idx] <- formul1$elem.text[1]
      flat.op[flat.idx] <- op
      flat.rhs[flat.idx] <- formul1$elem.text[3]
      flat.rhs.mod.idx[flat.idx] <- 0L
      if (block.op) {
        block <- block + 1L
      } else {
        if (flat.idx != 1) {
          lav_local_msgcode(FALSE, 103L, formul1$elem.pos[1], msgenv)
        }
      }
      flat.block[flat.idx] <- block
      block.op <- TRUE
      next
    }
    # ------------------ relational operators -------------------------------- #
    # warn if some identifiers contain spaces
    contsp <- which(formul1$elem.type == types$identifier &
      grepl(" ", formul1$elem.text, fixed = TRUE))
    if (length(contsp) > 0L) {
      lav_local_msgcode(FALSE, 102L, formul1$elem.pos[contsp[1L]], msgenv)
    }
    # checks for valid names in lhs and rhs
    lav_parse_check_valid_name(formul1, opi - 1L, modelsrc, msgenv) # valid name lhs
    if (exists("error", envir = msgenv)) return(msgenv$error);
    for (j in seq.int(opi + 1L, nelem)) { # valid names rhs
      if (formul1$elem.type[j] == types$identifier &&
        formul1$elem.text[j] != "NA") {
        lav_parse_check_valid_name(formul1, j, modelsrc, msgenv)
        if (exists("error", envir = msgenv)) return(msgenv$error);
      }
    }
    if (formul1$elem.type[nelem] != types$identifier &&
        (formul1$elem.type[nelem] != types$numliteral ||
         all(op != c("~", "=~")))) {
            return(c(48L, formul1$elem.pos[nelem] - 1L))
    }
    # intercept fixed on 0
    # replace 'lhs ~ 0' => 'lhs ~ 0 * 1' - intercept fixed on zero
    if (formul1$elem.text[nelem] == "0" && op == "~" && opi == nelem - 1L) {
      formul1$elem.type <- c(formul1$elem.type, types$symbol, types$numliteral)
      formul1$elem.text <- c(formul1$elem.text, "*", "1")
      formul1$elem.pos <- c(formul1$elem.pos, rep(formul1$elem.pos[nelem], 2))
      nelem <- length(formul1$elem.type)
    }
    # phantom latent variable
    # replace 'lhs =~ 0' => 'lhs =~ fixed(0)*lhs', 0 can be other numliteral
    #          also, lhs is last element before '=~'
    if (formul1$elem.type[nelem] == types$numliteral && op == "=~") {
      formul1$elem.type <- c(
        formul1$elem.type[seq.int(1L, nelem - 1L)], types$identifier,
        types$symbol, types$numliteral, types$symbol, types$symbol,
        types$identifier
      )
      formul1$elem.text <- c(
        formul1$elem.text[seq.int(1L, nelem - 1L)], "fixed", "(",
        formul1$elem.text[nelem], ")", "*", formul1$elem.text[opi - 1L]
      )
      formul1$elem.pos <- c(
        formul1$elem.pos[seq.int(1L, nelem - 1L)],
        rep(formul1$elem.pos[nelem], 6)
      )
      nelem <- length(formul1$elem.type)
    }
    # handling interaction variable types
    colons <- which(formul1$elem.text[seq.int(1L, nelem - 1L)] == ":" &
      formul1$elem.type[seq.int(2L, nelem)] == types$identifier)
    # check at most 1 colon
    if (length(colons) > 1) {
      return(c(33L, formul1$elem.pos[colons[2]] - 1L))
    }
    if (length(colons) == 1) {
      # collapse items around colon "a" ":" "b" => "a:b"
      formul1$elem.text[colons - 1L] <-
        paste(formul1$elem.text[seq.int(colons - 1L, colons + 1L)],
          collapse = ""
        )
      formul1 <- lav_parse_sublist(formul1, seq.int(1L, colons - 1L))
      nelem <- length(formul1$elem.type)
    }
    # modifiers
    rhsmodelems <- which(seq_along(formul1$elem.type) > opi &
      formul1$elem.type == types$symbol &
      (formul1$elem.text == "*" | formul1$elem.text == "?"))
    for (j in seq_along(rhsmodelems)) {
      if (sum(formul1$elem.text[seq.int(opi, rhsmodelems[j])] == "(") !=
          sum(formul1$elem.text[seq.int(opi, rhsmodelems[j])] == ")"))
        rhsmodelems[j] = 0L
    }
    rhsmodelems <- rhsmodelems[rhsmodelems != 0L]
    if (length(rhsmodelems) == 0L) rhsmodelems <- opi
    lhs <- formul1$elem.text[opi - 1L]
    rhs <- formul1$elem.text[nelem]
    for (rmei in seq_along(rhsmodelems)) {
      rme <- rhsmodelems[rmei]
      rmeprev <- if (rmei == 1L) opi else rhsmodelems[rmei - 1L]
      already <- which(flat.lhs == lhs & flat.op == op & flat.block == block &
        (flat.rhs == rhs | (flat.rhs == "" & op == "~" &
          formul1$elem.type[nelem] == types$numliteral)))
      if (length(already) == 1L) {
        idx <- already
      } else {
        flat.idx <- flat.idx + 1L
        idx <- flat.idx
        flat.lhs[idx] <- lhs
        flat.op[idx] <- op
        flat.rhs[idx] <- rhs
        flat.block[idx] <- block
        if (formul1$elem.type[nelem] == types$numliteral) {
          if (op == "~") flat.rhs[idx] <- ""
        }
      }
      lhsmod <- list()
      if (opi > 2 && rmei == 1L) {
        lhsmod <- lav_parse_get_modifier(
          formul1,
          TRUE, opi, modelsrc, types, 0L, 0L, msgenv
        )
      }
      rhsmod <- list()
      if (nelem - opi > 1) {
        rhsmod <- lav_parse_get_modifier(
          formul1,
          FALSE, opi, modelsrc, types, rme, rmeprev, msgenv
        )
      }
      flat.fixed[idx] <- if (is.null(rhsmod$fixed)) {
        flat.fixed[idx]
      } else {
        paste(rhsmod$fixed, collapse = ";")
      }
      flat.start[idx] <- if (is.null(rhsmod$start)) {
        flat.start[idx]
      } else {
        paste(rhsmod$start, collapse = ";")
      }
      flat.label[idx] <- if (is.null(rhsmod$label)) {
        flat.label[idx]
      } else {
        paste(rhsmod$label, collapse = ";")
      }
      flat.lower[idx] <- if (is.null(rhsmod$lower)) {
        flat.lower[idx]
      } else {
        paste(rhsmod$lower, collapse = ";")
      }
      flat.upper[idx] <- if (is.null(rhsmod$upper)) {
        flat.upper[idx]
      } else {
        paste(rhsmod$upper, collapse = ";")
      }
      flat.prior[idx] <- if (is.null(rhsmod$prior)) {
        flat.prior[idx]
      } else {
        paste(rhsmod$prior, collapse = ";")
      }
      flat.efa[idx] <- if (is.null(lhsmod$efa)) {
        flat.efa[idx]
      } else {
        paste(lhsmod$efa, collapse = ";")
      }
      flat.rv[idx] <- if (is.null(rhsmod$rv)) {
        flat.rv[idx]
      } else {
        paste(rhsmod$rv, collapse = ";")
      }
      modnu <- c(lhsmod, rhsmod)
      if (length(modnu) > 0L) { # there is a modifier here
        if (length(already) == 0) { # unknown element
          mod.idx <- mod.idx + 1L
          cur.mod.idx <- mod.idx
          mod[[cur.mod.idx]] <- modnu
          flat.rhs.mod.idx[idx] <- cur.mod.idx
        } else { # known element
          if (flat.rhs.mod.idx[idx] == 0) { # not yet modifier
            mod.idx <- mod.idx + 1L
            cur.mod.idx <- mod.idx
            mod[[cur.mod.idx]] <- modnu
            flat.rhs.mod.idx[idx] <- cur.mod.idx
          } else { # use existing modifier index
            cur.mod.idx <- flat.rhs.mod.idx[idx]
            overwrite <- names(modnu)[names(modnu) %in%
              names(mod[[cur.mod.idx]])]
            if (length(overwrite) > 0) {
               lav_local_msgcode(FALSE, 104L, formul1$elem.pos[rmeprev + 1L], msgenv)
            }
            mod[[cur.mod.idx]] <- modifyList(mod[[cur.mod.idx]], modnu)
          }
        }
      }
    }
    # check for variable regressed on itself
    if (formul1$elem.text[opi] == "~" &&
        formul1$elem.text[opi - 1L] == formul1$elem.text[nelem])
      if (!grepl("^0\\.?0*$", flat.fixed[idx])) return(c(34L, formul1$elem.pos[opi] - 1L))
  }
  # create flat (omit items without operator)
  filled.ones <- which(flat.op != "")
  flat <- list(
    lhs = flat.lhs[filled.ones],
    op = flat.op[filled.ones],
    rhs = flat.rhs[filled.ones],
    mod.idx = flat.rhs.mod.idx[filled.ones],
    block = flat.block[filled.ones],
    fixed = flat.fixed[filled.ones],
    start = flat.start[filled.ones],
    lower = flat.lower[filled.ones],
    upper = flat.upper[filled.ones],
    label = flat.label[filled.ones],
    prior = flat.prior[filled.ones],
    efa = flat.efa[filled.ones],
    rv = flat.rv[filled.ones]
  )
  # change op for intercepts (for convenience only)
  int.idx <- which(flat.op == "~" & flat.rhs == "")
  if (length(int.idx) > 0L) {
    flat$op[int.idx] <- "~1"
  }
  # if there are constraints that are simple lower or upper limits, put
  # them in these members, add a modifier and remove constraint
  aantal <- length(constraints)
  if (aantal > 0) {
    for (j in aantal:1) {
      if (any(flat$label == constraints[[j]]$lhs) &&
          any(constraints[[j]]$op == c("<", ">"))) {
          rhslang <- str2lang(constraints[[j]]$rhs)
          numbound <- NA_real_
          if (mode(rhslang) == "numeric") {
            numbound <- as.numeric(constraints[[j]]$rhs)
          } else {
            if (mode(rhslang) == "call") {
              if (is.numeric(tryCatch(eval(rhslang),
                                      error = function(e) "error"))) {
                numbound <- eval(rhslang)
              }
            }
          }
          if (!is.na(numbound)) {
            nrs <- which(flat$label == constraints[[j]]$lhs)
            for (nr in nrs) {
              nrm <- length(mod) + 1L
              if (flat$mod.idx[nr] > 0L) {
                nrm <- flat$mod.idx[nr]
              } else {
                flat$mod.idx[nr] <- nrm
                mod <- c(mod, list(label = constraints[[j]]$lhs))
              }
              if (constraints[[j]]$op == "<") {
                flat$upper[nr] <- as.character(numbound)
                mod[[nrm]]$upper <- numbound
              } else {
                flat$lower[nr] <- as.character(numbound)
                mod[[nrm]]$lower <- numbound
              }
            }
            constraints <- constraints[-j]
          }
        }
    }
  }
  # new in 0.6-4: check for 'group' within 'level'
  if (any(flat$op == ":")) {
    op.idx <- which(flat$op == ":")
    if (length(op.idx) < 2L) {
      # only 1 block identifier? this is weird -> give warning
      lav_local_msgcode(FALSE, 105L, 1L, msgenv)
    } else {
      first.block <- flat$lhs[op.idx[1L]]
      second.block <- flat$lhs[op.idx[2L]]
      if (first.block == "level" && second.block == "group") {
        return(c(52L, 1L))
      }
    }
  }
  # new in 0.6, reorder covariances here!
  flat <- lav_partable_covariance_reorder(flat)
  if (as.data.frame.) {
    flat <- as.data.frame(flat, stringsAsFactors = FALSE)
  }
  # create output
  if (exists("error", envir = msgenv)) return(msgenv$error);
  if (exists("warncount", envir = msgenv)) {
       warns <- list();
       warncount <- get("warncount", envir = msgenv)
       for (jj in seq.int(warncount)) {
            warns = c(warns,
            list(get(paste0("warn", sprintf("%03d", jj)), envir = msgenv)))
       }
       attr(flat, "warns") <- warns
   }
  attr(flat, "modifiers") <- mod
  attr(flat, "constraints") <- constraints

  flat
}
yrosseel/lavaan documentation built on April 14, 2025, 1:26 a.m.